1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it was manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134    Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141    Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145    Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148    machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
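
/* As a quick illustration of this pass in action (the exact dump-file
   name and message wording vary between GCC versions):

     gcc -O3 -fdump-tree-vect-details -c example.c

   then look for "LOOP VECTORIZED" and the cost-model notes in the
   generated example.c.*t.vect dump.  -O3 enables -ftree-vectorize;
   -fopt-info-vec-missed is another way to see why a loop was
   rejected.  */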
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166                               vec<stmt_vec_info> *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
183
184 if (stmt_vectype)
185 {
186 if (STMT_VINFO_VECTYPE (stmt_info))
187 /* The only case when a vectype had been already set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return true;
203 }
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
211
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214                             vec<stmt_vec_info> *mask_producers)
215 {
216 vec_info *vinfo = stmt_info->vinfo;
217 if (dump_enabled_p ())
218 {
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
220 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
221 }
222 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
223 return false;
224
225 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
226 && STMT_VINFO_RELATED_STMT (stmt_info))
227 {
228 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
229 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
230
231 /* If a pattern statement has def stmts, analyze them too. */
232 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
233 !gsi_end_p (si); gsi_next (&si))
234 {
235 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
236 if (dump_enabled_p ())
237 {
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: ");
240 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
241 def_stmt_info->stmt, 0);
242 }
243 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
244 vf, mask_producers))
245 return false;
246 }
247
248 if (dump_enabled_p ())
249 {
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: ");
252 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
253 }
254 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
255 return false;
256 }
257
258 return true;
259 }
260
261 /* Function vect_determine_vectorization_factor
262
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265    loop. For example, when vectorizing a loop that operates on 4-byte elements,
266    on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
267 elements can fit in a single vector register.
268
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
273
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
278 }
279
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
283 }
284 */
285
286 static bool
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 {
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
299
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301
302 for (i = 0; i < nbbs; i++)
303 {
304 basic_block bb = bbs[i];
305
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
308 {
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 {
313 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
314 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
315 }
316
317 gcc_assert (stmt_info);
318
319 if (STMT_VINFO_RELEVANT_P (stmt_info)
320 || STMT_VINFO_LIVE_P (stmt_info))
321 {
322 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
323 scalar_type = TREE_TYPE (PHI_RESULT (phi));
324
325 if (dump_enabled_p ())
326 {
327 dump_printf_loc (MSG_NOTE, vect_location,
328 "get vectype for scalar type: ");
329 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
330 dump_printf (MSG_NOTE, "\n");
331 }
332
333 vectype = get_vectype_for_scalar_type (scalar_type);
334 if (!vectype)
335 {
336 if (dump_enabled_p ())
337 {
338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
339 "not vectorized: unsupported "
340 "data-type ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
342 scalar_type);
343 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
344 }
345 return false;
346 }
347 STMT_VINFO_VECTYPE (stmt_info) = vectype;
348
349 if (dump_enabled_p ())
350 {
351 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
353 dump_printf (MSG_NOTE, "\n");
354 }
355
356 if (dump_enabled_p ())
357 {
358 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
359 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
360 dump_printf (MSG_NOTE, "\n");
361 }
362
363 vect_update_max_nunits (&vectorization_factor, vectype);
364 }
365 }
366
367 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
368 gsi_next (&si))
369 {
370 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
371 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
372 &mask_producers))
373 return false;
374 }
375 }
376
377   /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
378 if (dump_enabled_p ())
379 {
380 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
381 dump_dec (MSG_NOTE, vectorization_factor);
382 dump_printf (MSG_NOTE, "\n");
383 }
384
385 if (known_le (vectorization_factor, 1U))
386 {
387 if (dump_enabled_p ())
388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
389 "not vectorized: unsupported data-type\n");
390 return false;
391 }
392 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
393
394 for (i = 0; i < mask_producers.length (); i++)
395 {
396 stmt_info = mask_producers[i];
397 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
398 if (!mask_type)
399 return false;
400 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
401 }
402
403 return true;
404 }
405
406
407 /* Function vect_is_simple_iv_evolution.
408
409    FORNOW: A simple evolution of an induction variable in the loop is
410 considered a polynomial evolution. */
411
412 static bool
413 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
414 tree * step)
415 {
416 tree init_expr;
417 tree step_expr;
418 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
419 basic_block bb;
420
421 /* When there is no evolution in this loop, the evolution function
422 is not "simple". */
423 if (evolution_part == NULL_TREE)
424 return false;
425
426 /* When the evolution is a polynomial of degree >= 2
427 the evolution function is not "simple". */
428 if (tree_is_chrec (evolution_part))
429 return false;
430
431 step_expr = evolution_part;
432 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
433
434 if (dump_enabled_p ())
435 {
436 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
437 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
438 dump_printf (MSG_NOTE, ", init: ");
439 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
440 dump_printf (MSG_NOTE, "\n");
441 }
442
443 *init = init_expr;
444 *step = step_expr;
445
446 if (TREE_CODE (step_expr) != INTEGER_CST
447 && (TREE_CODE (step_expr) != SSA_NAME
448 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
449 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
450 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
451 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
452 || !flag_associative_math)))
453 && (TREE_CODE (step_expr) != REAL_CST
454 || !flag_associative_math))
455 {
456 if (dump_enabled_p ())
457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
458 "step unknown.\n");
459 return false;
460 }
461
462 return true;
463 }
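
/* For illustration (a sketch, not tied to a particular testcase): for a
   loop like

     for (i = 0; i < n; i += 3)
       ...

   the access function of i is the chrec {0, +, 3}_1, so the function
   above succeeds with *INIT == 0 and *STEP == 3.  An evolution whose
   step is itself a chrec, e.g. {0, +, {1, +, 1}_1}_1, has degree >= 2
   and is rejected.  */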
464
465 /* Function vect_analyze_scalar_cycles_1.
466
467 Examine the cross iteration def-use cycles of scalar variables
468 in LOOP. LOOP_VINFO represents the loop that is now being
469 considered for vectorization (can be LOOP, or an outer-loop
470 enclosing LOOP). */
471
472 static void
473 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
474 {
475 basic_block bb = loop->header;
476 tree init, step;
477 auto_vec<gimple *, 64> worklist;
478 gphi_iterator gsi;
479 bool double_reduc;
480
481 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
482
483 /* First - identify all inductions. Reduction detection assumes that all the
484 inductions have been identified, therefore, this order must not be
485 changed. */
486 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
487 {
488 gphi *phi = gsi.phi ();
489 tree access_fn = NULL;
490 tree def = PHI_RESULT (phi);
491 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
492
493 if (dump_enabled_p ())
494 {
495 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
496 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
497 }
498
499       /* Skip virtual phis. The data dependences that are associated with
500 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
501 if (virtual_operand_p (def))
502 continue;
503
504 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
505
506 /* Analyze the evolution function. */
507 access_fn = analyze_scalar_evolution (loop, def);
508 if (access_fn)
509 {
510 STRIP_NOPS (access_fn);
511 if (dump_enabled_p ())
512 {
513 dump_printf_loc (MSG_NOTE, vect_location,
514 "Access function of PHI: ");
515 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
516 dump_printf (MSG_NOTE, "\n");
517 }
518 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
519 = initial_condition_in_loop_num (access_fn, loop->num);
520 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
521 = evolution_part_in_loop_num (access_fn, loop->num);
522 }
523
524 if (!access_fn
525 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
526 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
527 && TREE_CODE (step) != INTEGER_CST))
528 {
529 worklist.safe_push (phi);
530 continue;
531 }
532
533 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
534 != NULL_TREE);
535 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
536
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
539 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
540 }
541
542
543 /* Second - identify all reductions and nested cycles. */
544 while (worklist.length () > 0)
545 {
546 gimple *phi = worklist.pop ();
547 tree def = PHI_RESULT (phi);
548 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
549
550 if (dump_enabled_p ())
551 {
552 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
553 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
554 }
555
556 gcc_assert (!virtual_operand_p (def)
557 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
558
559 stmt_vec_info reduc_stmt_info
560 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
561 &double_reduc, false);
562 if (reduc_stmt_info)
563 {
564 if (double_reduc)
565 {
566 if (dump_enabled_p ())
567 dump_printf_loc (MSG_NOTE, vect_location,
568 "Detected double reduction.\n");
569
570 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
571 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
572 = vect_double_reduction_def;
573 }
574 else
575 {
576 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
577 {
578 if (dump_enabled_p ())
579 dump_printf_loc (MSG_NOTE, vect_location,
580 "Detected vectorizable nested cycle.\n");
581
582 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
583 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
584 }
585 else
586 {
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_NOTE, vect_location,
589 "Detected reduction.\n");
590
591 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
592 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
593 /* Store the reduction cycles for possible vectorization in
594 loop-aware SLP if it was not detected as reduction
595 chain. */
596 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
597 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
598 (reduc_stmt_info);
599 }
600 }
601 }
602 else
603 if (dump_enabled_p ())
604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
605 "Unknown def-use cycle pattern.\n");
606 }
607 }
608
609
610 /* Function vect_analyze_scalar_cycles.
611
612 Examine the cross iteration def-use cycles of scalar variables, by
613 analyzing the loop-header PHIs of scalar variables. Classify each
614 cycle as one of the following: invariant, induction, reduction, unknown.
615    We do that for the loop represented by LOOP_VINFO, and also for its
616    inner-loop, if it exists.
617 Examples for scalar cycles:
618
619 Example1: reduction:
620
621 loop1:
622 for (i=0; i<N; i++)
623 sum += a[i];
624
625 Example2: induction:
626
627 loop2:
628 for (i=0; i<N; i++)
629 a[i] = i; */
630
631 static void
632 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
633 {
634 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
635
636 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
637
638 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
639 Reductions in such inner-loop therefore have different properties than
640 the reductions in the nest that gets vectorized:
641 1. When vectorized, they are executed in the same order as in the original
642 scalar loop, so we can't change the order of computation when
643 vectorizing them.
644 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
645 current checks are too strict. */
646
647 if (loop->inner)
648 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
649 }
650
651 /* Transfer group and reduction information from STMT to its pattern stmt. */
652
653 static void
654 vect_fixup_reduc_chain (gimple *stmt)
655 {
656 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
657 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
658 stmt_vec_info stmtp;
659 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
660 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
661 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
662 do
663 {
664 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
665 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
666 stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
667 if (stmt)
668 REDUC_GROUP_NEXT_ELEMENT (stmtp)
669 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
670 }
671 while (stmt);
672 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
673 }
674
675 /* Fixup scalar cycles that now have their stmts detected as patterns. */
676
677 static void
678 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
679 {
680 gimple *first;
681 unsigned i;
682
683 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
684 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
685 {
686 gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
687 while (next)
688 {
689 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
690 break;
691 next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
692 }
693       /* If not all stmts in the chain are patterns, try to handle
694 the chain without patterns. */
695 if (! next)
696 {
697 vect_fixup_reduc_chain (first);
698 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
699 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
700 }
701 }
702 }
703
704 /* Function vect_get_loop_niters.
705
706 Determine how many iterations the loop is executed and place it
707 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
708 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
709 niter information holds in ASSUMPTIONS.
710
711 Return the loop exit condition. */
712
713
714 static gcond *
715 vect_get_loop_niters (struct loop *loop, tree *assumptions,
716 tree *number_of_iterations, tree *number_of_iterationsm1)
717 {
718 edge exit = single_exit (loop);
719 struct tree_niter_desc niter_desc;
720 tree niter_assumptions, niter, may_be_zero;
721 gcond *cond = get_loop_exit_condition (loop);
722
723 *assumptions = boolean_true_node;
724 *number_of_iterationsm1 = chrec_dont_know;
725 *number_of_iterations = chrec_dont_know;
726 DUMP_VECT_SCOPE ("get_loop_niters");
727
728 if (!exit)
729 return cond;
730
731 niter = chrec_dont_know;
732 may_be_zero = NULL_TREE;
733 niter_assumptions = boolean_true_node;
734 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
735 || chrec_contains_undetermined (niter_desc.niter))
736 return cond;
737
738 niter_assumptions = niter_desc.assumptions;
739 may_be_zero = niter_desc.may_be_zero;
740 niter = niter_desc.niter;
741
742 if (may_be_zero && integer_zerop (may_be_zero))
743 may_be_zero = NULL_TREE;
744
745 if (may_be_zero)
746 {
747 if (COMPARISON_CLASS_P (may_be_zero))
748 {
749 /* Try to combine may_be_zero with assumptions, this can simplify
750 computation of niter expression. */
751 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
752 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
753 niter_assumptions,
754 fold_build1 (TRUTH_NOT_EXPR,
755 boolean_type_node,
756 may_be_zero));
757 else
758 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
759 build_int_cst (TREE_TYPE (niter), 0),
760 rewrite_to_non_trapping_overflow (niter));
761
762 may_be_zero = NULL_TREE;
763 }
764 else if (integer_nonzerop (may_be_zero))
765 {
766 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
767 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
768 return cond;
769 }
770 else
771 return cond;
772 }
773
774 *assumptions = niter_assumptions;
775 *number_of_iterationsm1 = niter;
776
777 /* We want the number of loop header executions which is the number
778 of latch executions plus one.
779 ??? For UINT_MAX latch executions this number overflows to zero
780 for loops like do { n++; } while (n != 0); */
781 if (niter && !chrec_contains_undetermined (niter))
782 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
783 build_int_cst (TREE_TYPE (niter), 1));
784 *number_of_iterations = niter;
785
786 return cond;
787 }
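
/* For example (illustrative only): for a counted loop such as
   "for (i = 0; i < n; i++)" the niter analyzer reports n - 1 latch
   executions, so NUMBER_OF_ITERATIONSM1 is n - 1 and
   NUMBER_OF_ITERATIONS is n; the possibility that n is zero is folded
   either into ASSUMPTIONS or into the niter expression above.  */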
788
789 /* Function bb_in_loop_p
790
791 Used as predicate for dfs order traversal of the loop bbs. */
792
793 static bool
794 bb_in_loop_p (const_basic_block bb, const void *data)
795 {
796 const struct loop *const loop = (const struct loop *)data;
797 if (flow_bb_inside_loop_p (loop, bb))
798 return true;
799 return false;
800 }
801
802
803 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
804 stmt_vec_info structs for all the stmts in LOOP_IN. */
805
806 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
807 : vec_info (vec_info::loop, init_cost (loop_in), shared),
808 loop (loop_in),
809 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
810 num_itersm1 (NULL_TREE),
811 num_iters (NULL_TREE),
812 num_iters_unchanged (NULL_TREE),
813 num_iters_assumptions (NULL_TREE),
814 th (0),
815 versioning_threshold (0),
816 vectorization_factor (0),
817 max_vectorization_factor (0),
818 mask_skip_niters (NULL_TREE),
819 mask_compare_type (NULL_TREE),
820 unaligned_dr (NULL),
821 peeling_for_alignment (0),
822 ptr_mask (0),
823 ivexpr_map (NULL),
824 slp_unrolling_factor (1),
825 single_scalar_iteration_cost (0),
826 vectorizable (false),
827 can_fully_mask_p (true),
828 fully_masked_p (false),
829 peeling_for_gaps (false),
830 peeling_for_niter (false),
831 operands_swapped (false),
832 no_data_dependencies (false),
833 has_mask_store (false),
834 scalar_loop (NULL),
835 orig_loop_info (NULL)
836 {
837 /* Create/Update stmt_info for all stmts in the loop. */
838 basic_block *body = get_loop_body (loop);
839 for (unsigned int i = 0; i < loop->num_nodes; i++)
840 {
841 basic_block bb = body[i];
842 gimple_stmt_iterator si;
843
844 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
845 {
846 gimple *phi = gsi_stmt (si);
847 gimple_set_uid (phi, 0);
848 add_stmt (phi);
849 }
850
851 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
852 {
853 gimple *stmt = gsi_stmt (si);
854 gimple_set_uid (stmt, 0);
855 add_stmt (stmt);
856 }
857 }
858 free (body);
859
860 /* CHECKME: We want to visit all BBs before their successors (except for
861 latch blocks, for which this assertion wouldn't hold). In the simple
862      case of the loop forms we allow, a dfs order of the BBs would be the same
863 as reversed postorder traversal, so we are safe. */
864
865 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
866 bbs, loop->num_nodes, loop);
867 gcc_assert (nbbs == loop->num_nodes);
868 }
869
870 /* Free all levels of MASKS. */
871
872 void
873 release_vec_loop_masks (vec_loop_masks *masks)
874 {
875 rgroup_masks *rgm;
876 unsigned int i;
877 FOR_EACH_VEC_ELT (*masks, i, rgm)
878 rgm->masks.release ();
879 masks->release ();
880 }
881
882 /* Free all memory used by the _loop_vec_info, as well as all the
883 stmt_vec_info structs of all the stmts in the loop. */
884
885 _loop_vec_info::~_loop_vec_info ()
886 {
887 int nbbs;
888 gimple_stmt_iterator si;
889 int j;
890
891   /* ??? We're releasing loop_vinfos en bloc. */
892 set_stmt_vec_info_vec (&stmt_vec_infos);
893 nbbs = loop->num_nodes;
894 for (j = 0; j < nbbs; j++)
895 {
896 basic_block bb = bbs[j];
897 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
898 free_stmt_vec_info (gsi_stmt (si));
899
900 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
901 {
902 gimple *stmt = gsi_stmt (si);
903
904 /* We may have broken canonical form by moving a constant
905 into RHS1 of a commutative op. Fix such occurrences. */
906 if (operands_swapped && is_gimple_assign (stmt))
907 {
908 enum tree_code code = gimple_assign_rhs_code (stmt);
909
910 if ((code == PLUS_EXPR
911 || code == POINTER_PLUS_EXPR
912 || code == MULT_EXPR)
913 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
914 swap_ssa_operands (stmt,
915 gimple_assign_rhs1_ptr (stmt),
916 gimple_assign_rhs2_ptr (stmt));
917 else if (code == COND_EXPR
918 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
919 {
920 tree cond_expr = gimple_assign_rhs1 (stmt);
921 enum tree_code cond_code = TREE_CODE (cond_expr);
922
923 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
924 {
925 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
926 0));
927 cond_code = invert_tree_comparison (cond_code,
928 honor_nans);
929 if (cond_code != ERROR_MARK)
930 {
931 TREE_SET_CODE (cond_expr, cond_code);
932 swap_ssa_operands (stmt,
933 gimple_assign_rhs2_ptr (stmt),
934 gimple_assign_rhs3_ptr (stmt));
935 }
936 }
937 }
938 }
939
940 /* Free stmt_vec_info. */
941 free_stmt_vec_info (stmt);
942 gsi_next (&si);
943 }
944 }
945
946 free (bbs);
947
948 release_vec_loop_masks (&masks);
949 delete ivexpr_map;
950
951 loop->aux = NULL;
952 }
953
954 /* Return an invariant or register for EXPR and emit necessary
955 computations in the LOOP_VINFO loop preheader. */
956
957 tree
958 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
959 {
960 if (is_gimple_reg (expr)
961 || is_gimple_min_invariant (expr))
962 return expr;
963
964 if (! loop_vinfo->ivexpr_map)
965 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
966 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
967 if (! cached)
968 {
969 gimple_seq stmts = NULL;
970 cached = force_gimple_operand (unshare_expr (expr),
971 &stmts, true, NULL_TREE);
972 if (stmts)
973 {
974 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
975 gsi_insert_seq_on_edge_immediate (e, stmts);
976 }
977 }
978 return cached;
979 }
980
981 /* Return true if we can use CMP_TYPE as the comparison type to produce
982 all masks required to mask LOOP_VINFO. */
983
984 static bool
985 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
986 {
987 rgroup_masks *rgm;
988 unsigned int i;
989 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
990 if (rgm->mask_type != NULL_TREE
991 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
992 cmp_type, rgm->mask_type,
993 OPTIMIZE_FOR_SPEED))
994 return false;
995 return true;
996 }
997
998 /* Calculate the maximum number of scalars per iteration for every
999 rgroup in LOOP_VINFO. */
1000
1001 static unsigned int
1002 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1003 {
1004 unsigned int res = 1;
1005 unsigned int i;
1006 rgroup_masks *rgm;
1007 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1008 res = MAX (res, rgm->max_nscalars_per_iter);
1009 return res;
1010 }
1011
1012 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1013 whether we can actually generate the masks required. Return true if so,
1014 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1015
1016 static bool
1017 vect_verify_full_masking (loop_vec_info loop_vinfo)
1018 {
1019 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1020 unsigned int min_ni_width;
1021
1022 /* Use a normal loop if there are no statements that need masking.
1023 This only happens in rare degenerate cases: it means that the loop
1024 has no loads, no stores, and no live-out values. */
1025 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1026 return false;
1027
1028 /* Get the maximum number of iterations that is representable
1029 in the counter type. */
1030 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1031 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1032
1033 /* Get a more refined estimate for the number of iterations. */
1034 widest_int max_back_edges;
1035 if (max_loop_iterations (loop, &max_back_edges))
1036 max_ni = wi::smin (max_ni, max_back_edges + 1);
1037
1038 /* Account for rgroup masks, in which each bit is replicated N times. */
1039 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1040
1041 /* Work out how many bits we need to represent the limit. */
1042 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1043
1044 /* Find a scalar mode for which WHILE_ULT is supported. */
1045 opt_scalar_int_mode cmp_mode_iter;
1046 tree cmp_type = NULL_TREE;
1047 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1048 {
1049 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1050 if (cmp_bits >= min_ni_width
1051 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1052 {
1053 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1054 if (this_type
1055 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1056 {
1057 /* Although we could stop as soon as we find a valid mode,
1058 it's often better to continue until we hit Pmode, since the
1059 operands to the WHILE are more likely to be reusable in
1060 address calculations. */
1061 cmp_type = this_type;
1062 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1063 break;
1064 }
1065 }
1066 }
1067
1068 if (!cmp_type)
1069 return false;
1070
1071 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1072 return true;
1073 }
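
/* A worked example of the width computation above, with made-up
   numbers: if the loop runs at most 1000 iterations and the largest
   rgroup has two scalars per iteration, MAX_NI is 2000 and
   MIN_NI_WIDTH is 11 bits, so the first candidate is typically HImode;
   the search still continues up to Pmode because wider IVs tend to be
   reusable in address computations.  */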
1074
1075 /* Calculate the cost of one scalar iteration of the loop. */
1076 static void
1077 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1078 {
1079 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1080 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1081 int nbbs = loop->num_nodes, factor;
1082 int innerloop_iters, i;
1083
1084 /* Gather costs for statements in the scalar loop. */
1085
1086 /* FORNOW. */
1087 innerloop_iters = 1;
1088 if (loop->inner)
1089 innerloop_iters = 50; /* FIXME */
1090
1091 for (i = 0; i < nbbs; i++)
1092 {
1093 gimple_stmt_iterator si;
1094 basic_block bb = bbs[i];
1095
1096 if (bb->loop_father == loop->inner)
1097 factor = innerloop_iters;
1098 else
1099 factor = 1;
1100
1101 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1102 {
1103 gimple *stmt = gsi_stmt (si);
1104 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1105
1106 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1107 continue;
1108
1109 /* Skip stmts that are not vectorized inside the loop. */
1110 if (stmt_info
1111 && !STMT_VINFO_RELEVANT_P (stmt_info)
1112 && (!STMT_VINFO_LIVE_P (stmt_info)
1113 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1114 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1115 continue;
1116
1117 vect_cost_for_stmt kind;
1118 if (STMT_VINFO_DATA_REF (stmt_info))
1119 {
1120 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1121 kind = scalar_load;
1122 else
1123 kind = scalar_store;
1124 }
1125 else
1126 kind = scalar_stmt;
1127
1128 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1129 factor, kind, stmt_info, 0, vect_prologue);
1130 }
1131 }
1132
1133 /* Now accumulate cost. */
1134 void *target_cost_data = init_cost (loop);
1135 stmt_info_for_cost *si;
1136 int j;
1137 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1138 j, si)
1139 {
1140 struct _stmt_vec_info *stmt_info
1141 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
1142 (void) add_stmt_cost (target_cost_data, si->count,
1143 si->kind, stmt_info, si->misalign,
1144 vect_body);
1145 }
1146 unsigned dummy, body_cost = 0;
1147 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1148 destroy_cost_data (target_cost_data);
1149 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1150 }
1151
1152
1153 /* Function vect_analyze_loop_form_1.
1154
1155 Verify that certain CFG restrictions hold, including:
1156 - the loop has a pre-header
1157 - the loop has a single entry and exit
1158 - the loop exit condition is simple enough
1159    - the number of iterations can be analyzed, i.e., a countable loop. The
1160 niter could be analyzed under some assumptions. */
1161
1162 bool
1163 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1164 tree *assumptions, tree *number_of_iterationsm1,
1165 tree *number_of_iterations, gcond **inner_loop_cond)
1166 {
1167 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1168
1169 /* Different restrictions apply when we are considering an inner-most loop,
1170 vs. an outer (nested) loop.
1171 (FORNOW. May want to relax some of these restrictions in the future). */
1172
1173 if (!loop->inner)
1174 {
1175 /* Inner-most loop. We currently require that the number of BBs is
1176 exactly 2 (the header and latch). Vectorizable inner-most loops
1177 look like this:
1178
1179 (pre-header)
1180 |
1181 header <--------+
1182 | | |
1183 | +--> latch --+
1184 |
1185 (exit-bb) */
1186
1187 if (loop->num_nodes != 2)
1188 {
1189 if (dump_enabled_p ())
1190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1191 "not vectorized: control flow in loop.\n");
1192 return false;
1193 }
1194
1195 if (empty_block_p (loop->header))
1196 {
1197 if (dump_enabled_p ())
1198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199 "not vectorized: empty loop.\n");
1200 return false;
1201 }
1202 }
1203 else
1204 {
1205 struct loop *innerloop = loop->inner;
1206 edge entryedge;
1207
1208 /* Nested loop. We currently require that the loop is doubly-nested,
1209 contains a single inner loop, and the number of BBs is exactly 5.
1210 Vectorizable outer-loops look like this:
1211
1212 (pre-header)
1213 |
1214 header <---+
1215 | |
1216 inner-loop |
1217 | |
1218 tail ------+
1219 |
1220 (exit-bb)
1221
1222 The inner-loop has the properties expected of inner-most loops
1223 as described above. */
1224
1225 if ((loop->inner)->inner || (loop->inner)->next)
1226 {
1227 if (dump_enabled_p ())
1228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1229 "not vectorized: multiple nested loops.\n");
1230 return false;
1231 }
1232
1233 if (loop->num_nodes != 5)
1234 {
1235 if (dump_enabled_p ())
1236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1237 "not vectorized: control flow in loop.\n");
1238 return false;
1239 }
1240
1241 entryedge = loop_preheader_edge (innerloop);
1242 if (entryedge->src != loop->header
1243 || !single_exit (innerloop)
1244 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1245 {
1246 if (dump_enabled_p ())
1247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1248 "not vectorized: unsupported outerloop form.\n");
1249 return false;
1250 }
1251
1252 /* Analyze the inner-loop. */
1253 tree inner_niterm1, inner_niter, inner_assumptions;
1254 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1255 &inner_assumptions, &inner_niterm1,
1256 &inner_niter, NULL)
1257 /* Don't support analyzing niter under assumptions for inner
1258 loop. */
1259 || !integer_onep (inner_assumptions))
1260 {
1261 if (dump_enabled_p ())
1262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1263 "not vectorized: Bad inner loop.\n");
1264 return false;
1265 }
1266
1267 if (!expr_invariant_in_loop_p (loop, inner_niter))
1268 {
1269 if (dump_enabled_p ())
1270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1271 "not vectorized: inner-loop count not"
1272 " invariant.\n");
1273 return false;
1274 }
1275
1276 if (dump_enabled_p ())
1277 dump_printf_loc (MSG_NOTE, vect_location,
1278 "Considering outer-loop vectorization.\n");
1279 }
1280
1281 if (!single_exit (loop)
1282 || EDGE_COUNT (loop->header->preds) != 2)
1283 {
1284 if (dump_enabled_p ())
1285 {
1286 if (!single_exit (loop))
1287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1288 "not vectorized: multiple exits.\n");
1289 else if (EDGE_COUNT (loop->header->preds) != 2)
1290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291 "not vectorized: too many incoming edges.\n");
1292 }
1293 return false;
1294 }
1295
1296   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1297 that the loop is represented as a do-while (with a proper if-guard
1298 before the loop if needed), where the loop header contains all the
1299 executable statements, and the latch is empty. */
1300 if (!empty_block_p (loop->latch)
1301 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1302 {
1303 if (dump_enabled_p ())
1304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1305 "not vectorized: latch block not empty.\n");
1306 return false;
1307 }
1308
1309 /* Make sure the exit is not abnormal. */
1310 edge e = single_exit (loop);
1311 if (e->flags & EDGE_ABNORMAL)
1312 {
1313 if (dump_enabled_p ())
1314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1315 "not vectorized: abnormal loop exit edge.\n");
1316 return false;
1317 }
1318
1319 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1320 number_of_iterationsm1);
1321 if (!*loop_cond)
1322 {
1323 if (dump_enabled_p ())
1324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1325 "not vectorized: complicated exit condition.\n");
1326 return false;
1327 }
1328
1329 if (integer_zerop (*assumptions)
1330 || !*number_of_iterations
1331 || chrec_contains_undetermined (*number_of_iterations))
1332 {
1333 if (dump_enabled_p ())
1334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1335 "not vectorized: number of iterations cannot be "
1336 "computed.\n");
1337 return false;
1338 }
1339
1340 if (integer_zerop (*number_of_iterations))
1341 {
1342 if (dump_enabled_p ())
1343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1344 "not vectorized: number of iterations = 0.\n");
1345 return false;
1346 }
1347
1348 return true;
1349 }
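
/* A source-level sketch of the kind of loop nest the outer-loop checks
   above are aimed at (whether the final CFG matches the required form
   depends on how earlier passes shape it):

     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
         a[i][j] = b[i][j] + c[i][j];
*/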
1350
1351 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1352
1353 loop_vec_info
1354 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1355 {
1356 tree assumptions, number_of_iterations, number_of_iterationsm1;
1357 gcond *loop_cond, *inner_loop_cond = NULL;
1358
1359 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1360 &assumptions, &number_of_iterationsm1,
1361 &number_of_iterations, &inner_loop_cond))
1362 return NULL;
1363
1364 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1365 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1366 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1367 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1368 if (!integer_onep (assumptions))
1369 {
1370 /* We consider to vectorize this loop by versioning it under
1371 some assumptions. In order to do this, we need to clear
1372 existing information computed by scev and niter analyzer. */
1373 scev_reset_htab ();
1374 free_numbers_of_iterations_estimates (loop);
1375 /* Also set flag for this loop so that following scev and niter
1376 analysis are done under the assumptions. */
1377 loop_constraint_set (loop, LOOP_C_FINITE);
1378 /* Also record the assumptions for versioning. */
1379 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1380 }
1381
1382 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1383 {
1384 if (dump_enabled_p ())
1385 {
1386 dump_printf_loc (MSG_NOTE, vect_location,
1387 "Symbolic number of iterations is ");
1388 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1389 dump_printf (MSG_NOTE, "\n");
1390 }
1391 }
1392
1393 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1394 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1395 if (inner_loop_cond)
1396 {
1397 stmt_vec_info inner_loop_cond_info
1398 = loop_vinfo->lookup_stmt (inner_loop_cond);
1399 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1400 }
1401
1402 gcc_assert (!loop->aux);
1403 loop->aux = loop_vinfo;
1404 return loop_vinfo;
1405 }
1406
1407
1408
1409 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1410    statements, update the vectorization factor.  */
1411
1412 static void
1413 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1414 {
1415 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1416 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1417 int nbbs = loop->num_nodes;
1418 poly_uint64 vectorization_factor;
1419 int i;
1420
1421 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1422
1423 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1424 gcc_assert (known_ne (vectorization_factor, 0U));
1425
1426 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1427 vectorization factor of the loop is the unrolling factor required by
1428    the SLP instances. If that unrolling factor is 1, we say that we
1429    perform pure SLP on the loop; cross-iteration parallelism is not
1430 exploited. */
1431 bool only_slp_in_loop = true;
1432 for (i = 0; i < nbbs; i++)
1433 {
1434 basic_block bb = bbs[i];
1435 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1436 gsi_next (&si))
1437 {
1438 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1439 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1440 && STMT_VINFO_RELATED_STMT (stmt_info))
1441 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
1442 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1443 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1444 && !PURE_SLP_STMT (stmt_info))
1445 /* STMT needs both SLP and loop-based vectorization. */
1446 only_slp_in_loop = false;
1447 }
1448 }
1449
1450 if (only_slp_in_loop)
1451 {
1452 dump_printf_loc (MSG_NOTE, vect_location,
1453 "Loop contains only SLP stmts\n");
1454 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1455 }
1456 else
1457 {
1458 dump_printf_loc (MSG_NOTE, vect_location,
1459 "Loop contains SLP and non-SLP stmts\n");
1460 /* Both the vectorization factor and unroll factor have the form
1461 current_vector_size * X for some rational X, so they must have
1462 a common multiple. */
1463 vectorization_factor
1464 = force_common_multiple (vectorization_factor,
1465 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1466 }
1467
1468 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1469 if (dump_enabled_p ())
1470 {
1471 dump_printf_loc (MSG_NOTE, vect_location,
1472 "Updating vectorization factor to ");
1473 dump_dec (MSG_NOTE, vectorization_factor);
1474 dump_printf (MSG_NOTE, ".\n");
1475 }
1476 }
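
/* For example (a sketch): if the non-SLP statements in the loop need a
   vectorization factor of 2 while the SLP instances need an unrolling
   factor of 4, force_common_multiple yields 4 and the loop is
   vectorized with VF 4.  */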
1477
1478 /* Return true if STMT_INFO describes a double reduction phi and if
1479 the other phi in the reduction is also relevant for vectorization.
1480 This rejects cases such as:
1481
1482 outer1:
1483 x_1 = PHI <x_3(outer2), ...>;
1484 ...
1485
1486 inner:
1487 x_2 = ...;
1488 ...
1489
1490 outer2:
1491 x_3 = PHI <x_2(inner)>;
1492
1493 if nothing in x_2 or elsewhere makes x_1 relevant. */
1494
1495 static bool
1496 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1497 {
1498 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1499 return false;
1500
1501 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1502 }
1503
1504 /* Function vect_analyze_loop_operations.
1505
1506 Scan the loop stmts and make sure they are all vectorizable. */
1507
1508 static bool
1509 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1510 {
1511 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1512 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1513 int nbbs = loop->num_nodes;
1514 int i;
1515 stmt_vec_info stmt_info;
1516 bool need_to_vectorize = false;
1517 bool ok;
1518
1519 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1520
1521 stmt_vector_for_cost cost_vec;
1522 cost_vec.create (2);
1523
1524 for (i = 0; i < nbbs; i++)
1525 {
1526 basic_block bb = bbs[i];
1527
1528 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1529 gsi_next (&si))
1530 {
1531 gphi *phi = si.phi ();
1532 ok = true;
1533
1534 stmt_info = loop_vinfo->lookup_stmt (phi);
1535 if (dump_enabled_p ())
1536 {
1537 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1538 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1539 }
1540 if (virtual_operand_p (gimple_phi_result (phi)))
1541 continue;
1542
1543 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1544 (i.e., a phi in the tail of the outer-loop). */
1545 if (! is_loop_header_bb_p (bb))
1546 {
1547 /* FORNOW: we currently don't support the case that these phis
1548 are not used in the outerloop (unless it is double reduction,
1549              i.e., this phi is vect_reduction_def), because this case
1550              would require us to actually do something here. */
1551 if (STMT_VINFO_LIVE_P (stmt_info)
1552 && !vect_active_double_reduction_p (stmt_info))
1553 {
1554 if (dump_enabled_p ())
1555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556 "Unsupported loop-closed phi in "
1557 "outer-loop.\n");
1558 return false;
1559 }
1560
1561 /* If PHI is used in the outer loop, we check that its operand
1562 is defined in the inner loop. */
1563 if (STMT_VINFO_RELEVANT_P (stmt_info))
1564 {
1565 tree phi_op;
1566
1567 if (gimple_phi_num_args (phi) != 1)
1568 return false;
1569
1570 phi_op = PHI_ARG_DEF (phi, 0);
1571 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1572 if (!op_def_info)
1573 return false;
1574
1575 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1576 && (STMT_VINFO_RELEVANT (op_def_info)
1577 != vect_used_in_outer_by_reduction))
1578 return false;
1579 }
1580
1581 continue;
1582 }
1583
1584 gcc_assert (stmt_info);
1585
1586 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1587 || STMT_VINFO_LIVE_P (stmt_info))
1588 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1589 {
1590 /* A scalar-dependence cycle that we don't support. */
1591 if (dump_enabled_p ())
1592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1593 "not vectorized: scalar dependence cycle.\n");
1594 return false;
1595 }
1596
1597 if (STMT_VINFO_RELEVANT_P (stmt_info))
1598 {
1599 need_to_vectorize = true;
1600 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1601 && ! PURE_SLP_STMT (stmt_info))
1602 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1603 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1604 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1605 && ! PURE_SLP_STMT (stmt_info))
1606 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1607 &cost_vec);
1608 }
1609
1610 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1611 if (ok
1612 && STMT_VINFO_LIVE_P (stmt_info)
1613 && !PURE_SLP_STMT (stmt_info))
1614 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1615 &cost_vec);
1616
1617 if (!ok)
1618 {
1619 if (dump_enabled_p ())
1620 {
1621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1622 "not vectorized: relevant phi not "
1623 "supported: ");
1624 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1625 }
1626 return false;
1627 }
1628 }
1629
1630 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1631 gsi_next (&si))
1632 {
1633 gimple *stmt = gsi_stmt (si);
1634 if (!gimple_clobber_p (stmt)
1635 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1636 &cost_vec))
1637 return false;
1638 }
1639 } /* bbs */
1640
1641 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1642 cost_vec.release ();
1643
1644   /* All operations in the loop are either irrelevant (they deal with loop
1645      control or are dead), or are only used outside the loop and can be moved
1646 out of the loop (e.g. invariants, inductions). The loop can be
1647 optimized away by scalar optimizations. We're better off not
1648 touching this loop. */
1649 if (!need_to_vectorize)
1650 {
1651 if (dump_enabled_p ())
1652 dump_printf_loc (MSG_NOTE, vect_location,
1653 "All the computation can be taken out of the loop.\n");
1654 if (dump_enabled_p ())
1655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1656 "not vectorized: redundant loop. no profit to "
1657 "vectorize.\n");
1658 return false;
1659 }
1660
1661 return true;
1662 }
1663
1664 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1665 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1666 definitely no, or -1 if it's worth retrying. */
1667
1668 static int
1669 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1670 {
1671 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1672 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1673
1674 /* Only fully-masked loops can have iteration counts less than the
1675 vectorization factor. */
1676 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1677 {
1678 HOST_WIDE_INT max_niter;
1679
1680 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1681 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1682 else
1683 max_niter = max_stmt_executions_int (loop);
1684
1685 if (max_niter != -1
1686 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1687 {
1688 if (dump_enabled_p ())
1689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1690 "not vectorized: iteration count smaller than "
1691 "vectorization factor.\n");
1692 return 0;
1693 }
1694 }
1695
1696 int min_profitable_iters, min_profitable_estimate;
1697 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1698 &min_profitable_estimate);
1699
1700 if (min_profitable_iters < 0)
1701 {
1702 if (dump_enabled_p ())
1703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1704 "not vectorized: vectorization not profitable.\n");
1705 if (dump_enabled_p ())
1706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1707 "not vectorized: vector version will never be "
1708 "profitable.\n");
1709 return -1;
1710 }
1711
1712 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1713 * assumed_vf);
1714
1715 /* Use the cost model only if it is more conservative than user specified
1716 threshold. */
1717 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1718 min_profitable_iters);
1719
1720 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1721
1722 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1723 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1724 {
1725 if (dump_enabled_p ())
1726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1727 "not vectorized: vectorization not profitable.\n");
1728 if (dump_enabled_p ())
1729 dump_printf_loc (MSG_NOTE, vect_location,
1730 "not vectorized: iteration count smaller than user "
1731 "specified loop bound parameter or minimum profitable "
1732 "iterations (whichever is more conservative).\n");
1733 return 0;
1734 }
1735
1736 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1737 if (estimated_niter == -1)
1738 estimated_niter = likely_max_stmt_executions_int (loop);
1739 if (estimated_niter != -1
1740 && ((unsigned HOST_WIDE_INT) estimated_niter
1741 < MAX (th, (unsigned) min_profitable_estimate)))
1742 {
1743 if (dump_enabled_p ())
1744 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1745 "not vectorized: estimated iteration count too "
1746 "small.\n");
1747 if (dump_enabled_p ())
1748 dump_printf_loc (MSG_NOTE, vect_location,
1749 "not vectorized: estimated iteration count smaller "
1750 "than specified loop bound parameter or minimum "
1751 "profitable iterations (whichever is more "
1752 "conservative).\n");
1753 return -1;
1754 }
1755
1756 return 1;
1757 }
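
/* A worked example of the threshold logic above, with made-up numbers:
   with --param min-vect-loop-bound=2 and an assumed VF of 4,
   MIN_SCALAR_LOOP_BOUND is 8; if the cost model reports a
   MIN_PROFITABLE_ITERS of 11, TH becomes 11 and a loop with a known
   trip count of 10 is rejected as not profitable.  */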
1758
1759 static bool
1760 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1761 vec<data_reference_p> *datarefs,
1762 unsigned int *n_stmts)
1763 {
1764 *n_stmts = 0;
1765 for (unsigned i = 0; i < loop->num_nodes; i++)
1766 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1767 !gsi_end_p (gsi); gsi_next (&gsi))
1768 {
1769 gimple *stmt = gsi_stmt (gsi);
1770 if (is_gimple_debug (stmt))
1771 continue;
1772 ++(*n_stmts);
1773 if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1774 {
1775 if (is_gimple_call (stmt) && loop->safelen)
1776 {
1777 tree fndecl = gimple_call_fndecl (stmt), op;
1778 if (fndecl != NULL_TREE)
1779 {
1780 cgraph_node *node = cgraph_node::get (fndecl);
1781 if (node != NULL && node->simd_clones != NULL)
1782 {
1783 unsigned int j, n = gimple_call_num_args (stmt);
1784 for (j = 0; j < n; j++)
1785 {
1786 op = gimple_call_arg (stmt, j);
1787 if (DECL_P (op)
1788 || (REFERENCE_CLASS_P (op)
1789 && get_base_address (op)))
1790 break;
1791 }
1792 op = gimple_call_lhs (stmt);
1793 /* Ignore #pragma omp declare simd functions
1794 if they don't have data references in the
1795 call stmt itself. */
1796 if (j == n
1797 && !(op
1798 && (DECL_P (op)
1799 || (REFERENCE_CLASS_P (op)
1800 && get_base_address (op)))))
1801 continue;
1802 }
1803 }
1804 }
1805 return false;
1806 }
1807 /* If dependence analysis will give up due to the limit on the
1808 number of datarefs, stop here and fail fatally. */
1809 if (datarefs->length ()
1810 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1811 return false;
1812 }
1813 return true;
1814 }
1815
1816 /* Function vect_analyze_loop_2.
1817
1818 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1819 for it. The different analyses will record information in the
1820 loop_vec_info struct. */
1821 static bool
1822 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1823 {
1824 bool ok;
1825 int res;
1826 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1827 poly_uint64 min_vf = 2;
1828
1829 /* The first group of checks is independent of the vector size. */
1830 fatal = true;
1831
1832 /* Find all data references in the loop (which correspond to vdefs/vuses)
1833 and analyze their evolution in the loop. */
1834
1835 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1836
1837 /* Gather the data references and count stmts in the loop. */
1838 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1839 {
1840 if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1841 &LOOP_VINFO_DATAREFS (loop_vinfo),
1842 n_stmts))
1843 {
1844 if (dump_enabled_p ())
1845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1846 "not vectorized: loop contains function "
1847 "calls or data references that cannot "
1848 "be analyzed\n");
1849 return false;
1850 }
1851 loop_vinfo->shared->save_datarefs ();
1852 }
1853 else
1854 loop_vinfo->shared->check_datarefs ();
1855
1856 /* Analyze the data references and also adjust the minimal
1857 vectorization factor according to the loads and stores. */
1858
1859 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1860 if (!ok)
1861 {
1862 if (dump_enabled_p ())
1863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1864 "bad data references.\n");
1865 return false;
1866 }
1867
1868 /* Classify all cross-iteration scalar data-flow cycles.
1869 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1870 vect_analyze_scalar_cycles (loop_vinfo);
1871
1872 vect_pattern_recog (loop_vinfo);
1873
1874 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1875
1876 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1877 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1878
1879 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1880 if (!ok)
1881 {
1882 if (dump_enabled_p ())
1883 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1884 "bad data access.\n");
1885 return false;
1886 }
1887
1888 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1889
1890 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1891 if (!ok)
1892 {
1893 if (dump_enabled_p ())
1894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1895 "unexpected pattern.\n");
1896 return false;
1897 }
1898
1899 /* The rest of the analysis below depends on the vector size in some way, so from here on failures are no longer fatal. */
1900 fatal = false;
1901
1902 /* Analyze data dependences between the data-refs in the loop
1903 and adjust the maximum vectorization factor according to
1904 the dependences.
1905 FORNOW: fail at the first data dependence that we encounter. */
1906
1907 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1908 if (!ok
1909 || (max_vf != MAX_VECTORIZATION_FACTOR
1910 && maybe_lt (max_vf, min_vf)))
1911 {
1912 if (dump_enabled_p ())
1913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1914 "bad data dependence.\n");
1915 return false;
1916 }
1917 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1918
1919 ok = vect_determine_vectorization_factor (loop_vinfo);
1920 if (!ok)
1921 {
1922 if (dump_enabled_p ())
1923 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1924 "can't determine vectorization factor.\n");
1925 return false;
1926 }
1927 if (max_vf != MAX_VECTORIZATION_FACTOR
1928 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1929 {
1930 if (dump_enabled_p ())
1931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1932 "bad data dependence.\n");
1933 return false;
1934 }
1935
1936 /* Compute the scalar iteration cost. */
1937 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1938
1939 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1940 unsigned th;
1941
1942 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1943 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1944 if (!ok)
1945 return false;
1946
1947 /* If there are any SLP instances mark them as pure_slp. */
1948 bool slp = vect_make_slp_decision (loop_vinfo);
1949 if (slp)
1950 {
1951 /* Find stmts that need to be both vectorized and SLPed. */
1952 vect_detect_hybrid_slp (loop_vinfo);
1953
1954 /* Update the vectorization factor based on the SLP decision. */
1955 vect_update_vf_for_slp (loop_vinfo);
1956 }
1957
1958 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1959
1960 /* We don't expect to have to roll back to anything other than an empty
1961 set of rgroups. */
1962 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1963
1964 /* This is the point where we can re-start analysis with SLP forced off. */
1965 start_over:
1966
1967 /* Now the vectorization factor is final. */
1968 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1969 gcc_assert (known_ne (vectorization_factor, 0U));
1970
1971 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1972 {
1973 dump_printf_loc (MSG_NOTE, vect_location,
1974 "vectorization_factor = ");
1975 dump_dec (MSG_NOTE, vectorization_factor);
1976 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1977 LOOP_VINFO_INT_NITERS (loop_vinfo));
1978 }
1979
1980 HOST_WIDE_INT max_niter
1981 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1982
1983 /* Analyze the alignment of the data-refs in the loop.
1984 Fail if a data reference is found that cannot be vectorized. */
1985
1986 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1987 if (!ok)
1988 {
1989 if (dump_enabled_p ())
1990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1991 "bad data alignment.\n");
1992 return false;
1993 }
1994
1995 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1996 It is important to call pruning after vect_analyze_data_ref_accesses,
1997 since we use grouping information gathered by interleaving analysis. */
1998 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1999 if (!ok)
2000 return false;
2001
2002 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2003 vectorization. */
2004 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2005 {
2006 /* This pass will decide on using loop versioning and/or loop peeling in
2007 order to enhance the alignment of data references in the loop. */
2008 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2009 if (!ok)
2010 {
2011 if (dump_enabled_p ())
2012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2013 "bad data alignment.\n");
2014 return false;
2015 }
2016 }
2017
2018 if (slp)
2019 {
2020 /* Analyze operations in the SLP instances. Note this may
2021 remove unsupported SLP instances which makes the above
2022 SLP kind detection invalid. */
2023 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2024 vect_slp_analyze_operations (loop_vinfo);
2025 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2026 goto again;
2027 }
2028
2029 /* Scan all the remaining operations in the loop that are not subject
2030 to SLP and make sure they are vectorizable. */
2031 ok = vect_analyze_loop_operations (loop_vinfo);
2032 if (!ok)
2033 {
2034 if (dump_enabled_p ())
2035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2036 "bad operation or unsupported loop bound.\n");
2037 return false;
2038 }
2039
2040 /* Decide whether to use a fully-masked loop for this vectorization
2041 factor. */
2042 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2043 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2044 && vect_verify_full_masking (loop_vinfo));
2045 if (dump_enabled_p ())
2046 {
2047 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2048 dump_printf_loc (MSG_NOTE, vect_location,
2049 "using a fully-masked loop.\n");
2050 else
2051 dump_printf_loc (MSG_NOTE, vect_location,
2052 "not using a fully-masked loop.\n");
2053 }
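/* A rough, target-independent sketch of what a fully-masked loop means;
   this is only an illustration, not the code that is generated:

       for (i = 0; i < niters; i += vf)
         {
           mask = { i + 0 < niters, i + 1 < niters, ..., i + vf - 1 < niters };
           ... loads, operations and stores predicated on MASK ...
         }

   so the final, partial vector iteration is handled by the mask instead
   of by a scalar epilogue loop.  */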
2054
2055 /* If epilog loop is required because of data accesses with gaps,
2056 one additional iteration needs to be peeled. Check if there are
2057 enough iterations for vectorization. */
2058 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2059 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2060 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2061 {
2062 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2063 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2064
2065 if (known_lt (wi::to_widest (scalar_niters), vf))
2066 {
2067 if (dump_enabled_p ())
2068 dump_printf_loc (MSG_NOTE, vect_location,
2069 "loop has no enough iterations to support"
2070 " peeling for gaps.\n");
2071 return false;
2072 }
2073 }
2074
2075 /* Check that the cost of the loop makes vectorizing worthwhile. */
2076 res = vect_analyze_loop_costing (loop_vinfo);
2077 if (res < 0)
2078 goto again;
2079 if (!res)
2080 {
2081 if (dump_enabled_p ())
2082 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2083 "Loop costings not worthwhile.\n");
2084 return false;
2085 }
2086
2087 /* Decide whether we need to create an epilogue loop to handle
2088 remaining scalar iterations. */
2089 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2090
2091 unsigned HOST_WIDE_INT const_vf;
2092 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2093 /* The main loop handles all iterations. */
2094 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2095 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2096 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2097 {
2098 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2099 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2100 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2101 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2102 }
2103 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2104 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2105 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2106 < (unsigned) exact_log2 (const_vf))
2107 /* In case of versioning, check if the maximum number of
2108 iterations is greater than th. If they are identical,
2109 the epilogue is unnecessary. */
2110 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2111 || ((unsigned HOST_WIDE_INT) max_niter
2112 > (th / const_vf) * const_vf))))
2113 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
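/* A worked example of the decision above, using hypothetical numbers:
   with niters = 103, a constant vf of 8, no peeling for alignment and no
   versioning, tree_ctz (103) = 0 is smaller than exact_log2 (8) = 3, so
   the vector loop executes 12 full iterations and
   LOOP_VINFO_PEELING_FOR_NITER is set to cover the remaining
   103 - 12 * 8 = 7 scalar iterations in the epilogue.  */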
2114
2115 /* If an epilogue loop is required make sure we can create one. */
2116 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2117 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2118 {
2119 if (dump_enabled_p ())
2120 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2121 if (!vect_can_advance_ivs_p (loop_vinfo)
2122 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2123 single_exit (LOOP_VINFO_LOOP
2124 (loop_vinfo))))
2125 {
2126 if (dump_enabled_p ())
2127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2128 "not vectorized: can't create required "
2129 "epilog loop\n");
2130 goto again;
2131 }
2132 }
2133
2134 /* During peeling, we need to check whether the number of loop iterations
2135 is enough for both the peeled prolog loop and the vector loop. This
2136 check can be merged with the threshold check of loop versioning, so
2137 increase the threshold for this case if necessary. */
2138 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2139 {
2140 poly_uint64 niters_th = 0;
2141
2142 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2143 {
2144 /* Niters for peeled prolog loop. */
2145 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2146 {
2147 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2148 tree vectype
2149 = STMT_VINFO_VECTYPE (vinfo_for_stmt (vect_dr_stmt (dr)));
2150 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2151 }
2152 else
2153 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2154 }
2155
2156 /* Niters for at least one iteration of vectorized loop. */
2157 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2158 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2159 /* One additional iteration because of peeling for gap. */
2160 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2161 niters_th += 1;
2162 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2163 }
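/* For example, with hypothetical values: a known prologue peel of 3
   iterations for alignment, vf = 8, peeling for gaps and no full masking
   give niters_th = 3 + 8 + 1 = 12, so, roughly speaking, the versioned
   vector path is only worth entering at run time when at least 12
   iterations are available.  */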
2164
2165 gcc_assert (known_eq (vectorization_factor,
2166 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2167
2168 /* Ok to vectorize! */
2169 return true;
2170
2171 again:
2172 /* Try again with SLP forced off but if we didn't do any SLP there is
2173 no point in re-trying. */
2174 if (!slp)
2175 return false;
2176
2177 /* If there are reduction chains re-trying will fail anyway. */
2178 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2179 return false;
2180
2181 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2182 via interleaving or lane instructions. */
2183 slp_instance instance;
2184 slp_tree node;
2185 unsigned i, j;
2186 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2187 {
2188 stmt_vec_info vinfo;
2189 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2190 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2191 continue;
2192 vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2193 unsigned int size = DR_GROUP_SIZE (vinfo);
2194 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2195 if (! vect_store_lanes_supported (vectype, size, false)
2196 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2197 && ! vect_grouped_store_supported (vectype, size))
2198 return false;
2199 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2200 {
2201 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2202 vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2203 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2204 size = DR_GROUP_SIZE (vinfo);
2205 vectype = STMT_VINFO_VECTYPE (vinfo);
2206 if (! vect_load_lanes_supported (vectype, size, false)
2207 && ! vect_grouped_load_supported (vectype, single_element_p,
2208 size))
2209 return false;
2210 }
2211 }
2212
2213 if (dump_enabled_p ())
2214 dump_printf_loc (MSG_NOTE, vect_location,
2215 "re-trying with SLP disabled\n");
2216
2217 /* Roll back state appropriately. No SLP this time. */
2218 slp = false;
2219 /* Restore the vectorization factor as it was without SLP. */
2220 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2221 /* Free the SLP instances. */
2222 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2223 vect_free_slp_instance (instance, false);
2224 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2225 /* Reset SLP type to loop_vect on all stmts. */
2226 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2227 {
2228 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2229 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2230 !gsi_end_p (si); gsi_next (&si))
2231 {
2232 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2233 STMT_SLP_TYPE (stmt_info) = loop_vect;
2234 }
2235 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2236 !gsi_end_p (si); gsi_next (&si))
2237 {
2238 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2239 STMT_SLP_TYPE (stmt_info) = loop_vect;
2240 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2241 {
2242 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2243 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2244 STMT_SLP_TYPE (stmt_info) = loop_vect;
2245 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2246 !gsi_end_p (pi); gsi_next (&pi))
2247 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2248 = loop_vect;
2249 }
2250 }
2251 }
2252 /* Free optimized alias test DDRS. */
2253 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2254 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2255 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2256 /* Reset target cost data. */
2257 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2258 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2259 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2260 /* Reset accumulated rgroup information. */
2261 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2262 /* Reset assorted flags. */
2263 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2264 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2265 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2266 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2267 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2268
2269 goto start_over;
2270 }
2271
2272 /* Function vect_analyze_loop.
2273
2274 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2275 for it. The different analyses will record information in the
2276 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, LOOP is an
2277 epilogue of the loop described by ORIG_LOOP_VINFO and must be vectorized. */
2278 loop_vec_info
2279 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2280 vec_info_shared *shared)
2281 {
2282 loop_vec_info loop_vinfo;
2283 auto_vector_sizes vector_sizes;
2284
2285 /* Autodetect first vector size we try. */
2286 current_vector_size = 0;
2287 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2288 unsigned int next_size = 0;
2289
2290 DUMP_VECT_SCOPE ("analyze_loop_nest");
2291
2292 if (loop_outer (loop)
2293 && loop_vec_info_for_loop (loop_outer (loop))
2294 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2295 {
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_NOTE, vect_location,
2298 "outer-loop already vectorized.\n");
2299 return NULL;
2300 }
2301
2302 if (!find_loop_nest (loop, &shared->loop_nest))
2303 {
2304 if (dump_enabled_p ())
2305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2306 "not vectorized: loop nest containing two "
2307 "or more consecutive inner loops cannot be "
2308 "vectorized\n");
2309 return NULL;
2310 }
2311
2312 unsigned n_stmts = 0;
2313 poly_uint64 autodetected_vector_size = 0;
2314 while (1)
2315 {
2316 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2317 loop_vinfo = vect_analyze_loop_form (loop, shared);
2318 if (!loop_vinfo)
2319 {
2320 if (dump_enabled_p ())
2321 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2322 "bad loop form.\n");
2323 return NULL;
2324 }
2325
2326 bool fatal = false;
2327
2328 if (orig_loop_vinfo)
2329 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2330
2331 if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2332 {
2333 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2334
2335 return loop_vinfo;
2336 }
2337
2338 delete loop_vinfo;
2339
2340 if (next_size == 0)
2341 autodetected_vector_size = current_vector_size;
2342
2343 if (next_size < vector_sizes.length ()
2344 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2345 next_size += 1;
2346
2347 if (fatal
2348 || next_size == vector_sizes.length ()
2349 || known_eq (current_vector_size, 0U))
2350 return NULL;
2351
2352 /* Try the next biggest vector size. */
2353 current_vector_size = vector_sizes[next_size++];
2354 if (dump_enabled_p ())
2355 {
2356 dump_printf_loc (MSG_NOTE, vect_location,
2357 "***** Re-trying analysis with "
2358 "vector size ");
2359 dump_dec (MSG_NOTE, current_vector_size);
2360 dump_printf (MSG_NOTE, "\n");
2361 }
2362 }
2363 }
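/* As an illustration of the retry loop above, suppose (hypothetically)
   that the target advertises the vector sizes { 32, 16 } bytes and that
   autodetection picks 32 bytes first.  A non-fatal analysis failure is
   then retried with current_vector_size = 16, while the list entry equal
   to the autodetected size is skipped so the same size is not analyzed
   twice.  A fatal failure, or running out of sizes, gives up and
   returns NULL.  */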
2364
2365 /* Return true if there is an in-order reduction function for CODE, storing
2366 it in *REDUC_FN if so. */
2367
2368 static bool
2369 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2370 {
2371 switch (code)
2372 {
2373 case PLUS_EXPR:
2374 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2375 return true;
2376
2377 default:
2378 return false;
2379 }
2380 }
2381
2382 /* Function reduction_fn_for_scalar_code
2383
2384 Input:
2385 CODE - the tree_code of a reduction operation.
2386
2387 Output:
2388 REDUC_FN - the corresponding internal function to be used to reduce the
2389 vector of partial results into a single scalar result, or IFN_LAST
2390 if the operation is a supported reduction operation, but does not have
2391 such an internal function.
2392
2393 Return FALSE if CODE currently cannot be vectorized as reduction. */
2394
2395 static bool
2396 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2397 {
2398 switch (code)
2399 {
2400 case MAX_EXPR:
2401 *reduc_fn = IFN_REDUC_MAX;
2402 return true;
2403
2404 case MIN_EXPR:
2405 *reduc_fn = IFN_REDUC_MIN;
2406 return true;
2407
2408 case PLUS_EXPR:
2409 *reduc_fn = IFN_REDUC_PLUS;
2410 return true;
2411
2412 case BIT_AND_EXPR:
2413 *reduc_fn = IFN_REDUC_AND;
2414 return true;
2415
2416 case BIT_IOR_EXPR:
2417 *reduc_fn = IFN_REDUC_IOR;
2418 return true;
2419
2420 case BIT_XOR_EXPR:
2421 *reduc_fn = IFN_REDUC_XOR;
2422 return true;
2423
2424 case MULT_EXPR:
2425 case MINUS_EXPR:
2426 *reduc_fn = IFN_LAST;
2427 return true;
2428
2429 default:
2430 return false;
2431 }
2432 }
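/* For instance, the scalar max reduction

       s = s > a[i] ? s : a[i];

   is represented by MAX_EXPR, so the vector of partial maxima produced by
   the loop is reduced to a single scalar with IFN_REDUC_MAX in the
   epilogue.  A product reduction (MULT_EXPR) is still accepted but gets
   IFN_LAST, meaning no single internal function performs the final
   reduction and a generic epilogue sequence is used instead.  */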
2433
2434 /* If there is a neutral value X such that SLP reduction NODE would not
2435 be affected by the introduction of additional X elements, return that X,
2436 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2437 is true if the SLP statements perform a single reduction, false if each
2438 statement performs an independent reduction. */
2439
2440 static tree
2441 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2442 bool reduc_chain)
2443 {
2444 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2445 stmt_vec_info stmt_vinfo = stmts[0];
2446 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2447 tree scalar_type = TREE_TYPE (vector_type);
2448 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2449 gcc_assert (loop);
2450
2451 switch (code)
2452 {
2453 case WIDEN_SUM_EXPR:
2454 case DOT_PROD_EXPR:
2455 case SAD_EXPR:
2456 case PLUS_EXPR:
2457 case MINUS_EXPR:
2458 case BIT_IOR_EXPR:
2459 case BIT_XOR_EXPR:
2460 return build_zero_cst (scalar_type);
2461
2462 case MULT_EXPR:
2463 return build_one_cst (scalar_type);
2464
2465 case BIT_AND_EXPR:
2466 return build_all_ones_cst (scalar_type);
2467
2468 case MAX_EXPR:
2469 case MIN_EXPR:
2470 /* For MIN/MAX the initial values are neutral. A reduction chain
2471 has only a single initial value, so that value is neutral for
2472 all statements. */
2473 if (reduc_chain)
2474 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2475 loop_preheader_edge (loop));
2476 return NULL_TREE;
2477
2478 default:
2479 return NULL_TREE;
2480 }
2481 }
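/* For example, if a PLUS_EXPR SLP reduction group of three statements is
   vectorized with four lanes, the unused lane can be filled with the
   neutral value 0 without changing the result; a MULT_EXPR group would
   use 1 and a BIT_AND_EXPR group all-ones.  */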
2482
2483 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2484 STMT is printed with a message MSG. */
2485
2486 static void
2487 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2488 {
2489 dump_printf_loc (msg_type, vect_location, "%s", msg);
2490 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2491 }
2492
2493 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2494 operation. Return true if the results of DEF_STMT_INFO are something
2495 that can be accumulated by such a reduction. */
2496
2497 static bool
2498 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2499 {
2500 return (is_gimple_assign (def_stmt_info->stmt)
2501 || is_gimple_call (def_stmt_info->stmt)
2502 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2503 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2504 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2505 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2506 }
2507
2508 /* Detect SLP reduction of the form:
2509
2510 #a1 = phi <a5, a0>
2511 a2 = operation (a1)
2512 a3 = operation (a2)
2513 a4 = operation (a3)
2514 a5 = operation (a4)
2515
2516 #a = phi <a5>
2517
2518 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2519 FIRST_STMT is the first reduction stmt in the chain
2520 (a2 = operation (a1)).
2521
2522 Return TRUE if a reduction chain was detected. */
2523
2524 static bool
2525 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2526 gimple *first_stmt)
2527 {
2528 struct loop *loop = (gimple_bb (phi))->loop_father;
2529 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2530 enum tree_code code;
2531 gimple *loop_use_stmt = NULL, *first, *next_stmt;
2532 stmt_vec_info use_stmt_info, current_stmt_info = NULL;
2533 tree lhs;
2534 imm_use_iterator imm_iter;
2535 use_operand_p use_p;
2536 int nloop_uses, size = 0, n_out_of_loop_uses;
2537 bool found = false;
2538
2539 if (loop != vect_loop)
2540 return false;
2541
2542 lhs = PHI_RESULT (phi);
2543 code = gimple_assign_rhs_code (first_stmt);
2544 while (1)
2545 {
2546 nloop_uses = 0;
2547 n_out_of_loop_uses = 0;
2548 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2549 {
2550 gimple *use_stmt = USE_STMT (use_p);
2551 if (is_gimple_debug (use_stmt))
2552 continue;
2553
2554 /* Check if we got back to the reduction phi. */
2555 if (use_stmt == phi)
2556 {
2557 loop_use_stmt = use_stmt;
2558 found = true;
2559 break;
2560 }
2561
2562 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2563 {
2564 loop_use_stmt = use_stmt;
2565 nloop_uses++;
2566 }
2567 else
2568 n_out_of_loop_uses++;
2569
2570 /* There can be either a single use in the loop or two uses in
2571 phi nodes. */
2572 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2573 return false;
2574 }
2575
2576 if (found)
2577 break;
2578
2579 /* We reached a statement with no loop uses. */
2580 if (nloop_uses == 0)
2581 return false;
2582
2583 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2584 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2585 return false;
2586
2587 if (!is_gimple_assign (loop_use_stmt)
2588 || code != gimple_assign_rhs_code (loop_use_stmt)
2589 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2590 return false;
2591
2592 /* Insert USE_STMT into reduction chain. */
2593 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2594 if (current_stmt_info)
2595 {
2596 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2597 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2598 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2599 }
2600 else
2601 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2602
2603 lhs = gimple_assign_lhs (loop_use_stmt);
2604 current_stmt_info = use_stmt_info;
2605 size++;
2606 }
2607
2608 if (!found || loop_use_stmt != phi || size < 2)
2609 return false;
2610
2611 /* Swap the operands, if needed, to make the reduction operand be the second
2612 operand. */
2613 lhs = PHI_RESULT (phi);
2614 next_stmt = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2615 while (next_stmt)
2616 {
2617 if (gimple_assign_rhs2 (next_stmt) == lhs)
2618 {
2619 tree op = gimple_assign_rhs1 (next_stmt);
2620 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2621
2622 /* Check that the other def is either defined in the loop
2623 ("vect_internal_def"), or it's an induction (defined by a
2624 loop-header phi-node). */
2625 if (def_stmt_info
2626 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2627 && vect_valid_reduction_input_p (def_stmt_info))
2628 {
2629 lhs = gimple_assign_lhs (next_stmt);
2630 next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2631 continue;
2632 }
2633
2634 return false;
2635 }
2636 else
2637 {
2638 tree op = gimple_assign_rhs2 (next_stmt);
2639 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2640
2641 /* Check that the other def is either defined in the loop
2642 ("vect_internal_def"), or it's an induction (defined by a
2643 loop-header phi-node). */
2644 if (def_stmt_info
2645 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2646 && vect_valid_reduction_input_p (def_stmt_info))
2647 {
2648 if (dump_enabled_p ())
2649 {
2650 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2651 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2652 }
2653
2654 swap_ssa_operands (next_stmt,
2655 gimple_assign_rhs1_ptr (next_stmt),
2656 gimple_assign_rhs2_ptr (next_stmt));
2657 update_stmt (next_stmt);
2658
2659 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2660 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2661 }
2662 else
2663 return false;
2664 }
2665
2666 lhs = gimple_assign_lhs (next_stmt);
2667 next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2668 }
2669
2670 /* Save the chain for further analysis in SLP detection. */
2671 first = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2672 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2673 REDUC_GROUP_SIZE (vinfo_for_stmt (first)) = size;
2674
2675 return true;
2676 }
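/* For example, a chain statement written as

       a2 = a1 + b[i];

   is rewritten above as "a2 = b[i] + a1" so that later code can rely on
   the reduction operand always being the second operand.  */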
2677
2678 /* Return true if we need an in-order reduction for operation CODE
2679 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2680 overflow must wrap. */
2681
2682 static bool
2683 needs_fold_left_reduction_p (tree type, tree_code code,
2684 bool need_wrapping_integral_overflow)
2685 {
2686 /* CHECKME: check for !flag_finite_math_only too? */
2687 if (SCALAR_FLOAT_TYPE_P (type))
2688 switch (code)
2689 {
2690 case MIN_EXPR:
2691 case MAX_EXPR:
2692 return false;
2693
2694 default:
2695 return !flag_associative_math;
2696 }
2697
2698 if (INTEGRAL_TYPE_P (type))
2699 {
2700 if (!operation_no_trapping_overflow (type, code))
2701 return true;
2702 if (need_wrapping_integral_overflow
2703 && !TYPE_OVERFLOW_WRAPS (type)
2704 && operation_can_overflow (code))
2705 return true;
2706 return false;
2707 }
2708
2709 if (SAT_FIXED_POINT_TYPE_P (type))
2710 return true;
2711
2712 return false;
2713 }
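/* A concrete illustration of the rules above:

       double s = 0;
       for (int i = 0; i < n; ++i)
         s += a[i];

   compiled without -fassociative-math must use an in-order (fold-left)
   reduction, because reordering floating-point additions can change the
   rounded result.  An integer accumulation compiled with -ftrapv likewise
   needs the original order so that overflow traps exactly where the
   scalar code would trap.  */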
2714
2715 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2716 reduction operation CODE has a handled computation expression. */
2717
2718 bool
2719 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2720 tree loop_arg, enum tree_code code)
2721 {
2722 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2723 auto_bitmap visited;
2724 tree lookfor = PHI_RESULT (phi);
2725 ssa_op_iter curri;
2726 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2727 while (USE_FROM_PTR (curr) != loop_arg)
2728 curr = op_iter_next_use (&curri);
2729 curri.i = curri.numops;
2730 do
2731 {
2732 path.safe_push (std::make_pair (curri, curr));
2733 tree use = USE_FROM_PTR (curr);
2734 if (use == lookfor)
2735 break;
2736 gimple *def = SSA_NAME_DEF_STMT (use);
2737 if (gimple_nop_p (def)
2738 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2739 {
2740 pop:
2741 do
2742 {
2743 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2744 curri = x.first;
2745 curr = x.second;
2746 do
2747 curr = op_iter_next_use (&curri);
2748 /* Skip already visited or non-SSA operands (from iterating
2749 over PHI args). */
2750 while (curr != NULL_USE_OPERAND_P
2751 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2752 || ! bitmap_set_bit (visited,
2753 SSA_NAME_VERSION
2754 (USE_FROM_PTR (curr)))));
2755 }
2756 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2757 if (curr == NULL_USE_OPERAND_P)
2758 break;
2759 }
2760 else
2761 {
2762 if (gimple_code (def) == GIMPLE_PHI)
2763 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2764 else
2765 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2766 while (curr != NULL_USE_OPERAND_P
2767 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2768 || ! bitmap_set_bit (visited,
2769 SSA_NAME_VERSION
2770 (USE_FROM_PTR (curr)))))
2771 curr = op_iter_next_use (&curri);
2772 if (curr == NULL_USE_OPERAND_P)
2773 goto pop;
2774 }
2775 }
2776 while (1);
2777 if (dump_file && (dump_flags & TDF_DETAILS))
2778 {
2779 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2780 unsigned i;
2781 std::pair<ssa_op_iter, use_operand_p> *x;
2782 FOR_EACH_VEC_ELT (path, i, x)
2783 {
2784 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2785 dump_printf (MSG_NOTE, " ");
2786 }
2787 dump_printf (MSG_NOTE, "\n");
2788 }
2789
2790 /* Check whether the reduction path detected is valid. */
2791 bool fail = path.length () == 0;
2792 bool neg = false;
2793 for (unsigned i = 1; i < path.length (); ++i)
2794 {
2795 gimple *use_stmt = USE_STMT (path[i].second);
2796 tree op = USE_FROM_PTR (path[i].second);
2797 if (! has_single_use (op)
2798 || ! is_gimple_assign (use_stmt))
2799 {
2800 fail = true;
2801 break;
2802 }
2803 if (gimple_assign_rhs_code (use_stmt) != code)
2804 {
2805 if (code == PLUS_EXPR
2806 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2807 {
2808 /* Track whether we negate the reduction value each iteration. */
2809 if (gimple_assign_rhs2 (use_stmt) == op)
2810 neg = ! neg;
2811 }
2812 else
2813 {
2814 fail = true;
2815 break;
2816 }
2817 }
2818 }
2819 return ! fail && ! neg;
2820 }
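/* For example, with CODE == PLUS_EXPR the path

       x_1 = PHI <x_0, x_3>
       x_2 = x_1 + a[i];
       x_3 = x_2 - b[i];

   is accepted: the MINUS_EXPR link keeps the reduction value as its first
   operand and merely adds -b[i].  If the statement were instead
   "x_3 = b[i] - x_2", the running value would be negated each iteration,
   NEG would be toggled and the path rejected.  */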
2821
2822
2823 /* Function vect_is_simple_reduction
2824
2825 (1) Detect a cross-iteration def-use cycle that represents a simple
2826 reduction computation. We look for the following pattern:
2827
2828 loop_header:
2829 a1 = phi < a0, a2 >
2830 a3 = ...
2831 a2 = operation (a3, a1)
2832
2833 or
2834
2835 a3 = ...
2836 loop_header:
2837 a1 = phi < a0, a2 >
2838 a2 = operation (a3, a1)
2839
2840 such that:
2841 1. operation is commutative and associative and it is safe to
2842 change the order of the computation
2843 2. no uses for a2 in the loop (a2 is used out of the loop)
2844 3. no uses of a1 in the loop besides the reduction operation
2845 4. no uses of a1 outside the loop.
2846
2847 Conditions 1,4 are tested here.
2848 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2849
2850 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2851 nested cycles.
2852
2853 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2854 reductions:
2855
2856 a1 = phi < a0, a2 >
2857 inner loop (def of a3)
2858 a2 = phi < a3 >
2859
2860 (4) Detect condition expressions, i.e.:
2861 for (int i = 0; i < N; i++)
2862 if (a[i] < val)
2863 ret_val = a[i];
2864
2865 */
2866
2867 static stmt_vec_info
2868 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2869 bool *double_reduc,
2870 bool need_wrapping_integral_overflow,
2871 enum vect_reduction_type *v_reduc_type)
2872 {
2873 gphi *phi = as_a <gphi *> (phi_info->stmt);
2874 struct loop *loop = (gimple_bb (phi))->loop_father;
2875 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2876 gimple *phi_use_stmt = NULL;
2877 enum tree_code orig_code, code;
2878 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2879 tree type;
2880 int nloop_uses;
2881 tree name;
2882 imm_use_iterator imm_iter;
2883 use_operand_p use_p;
2884 bool phi_def;
2885
2886 *double_reduc = false;
2887 *v_reduc_type = TREE_CODE_REDUCTION;
2888
2889 tree phi_name = PHI_RESULT (phi);
2890 /* ??? If there are no uses of the PHI result the inner loop reduction
2891 won't be detected as possibly double-reduction by vectorizable_reduction
2892 because that tries to walk the PHI arg from the preheader edge which
2893 can be constant. See PR60382. */
2894 if (has_zero_uses (phi_name))
2895 return NULL;
2896 nloop_uses = 0;
2897 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2898 {
2899 gimple *use_stmt = USE_STMT (use_p);
2900 if (is_gimple_debug (use_stmt))
2901 continue;
2902
2903 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2904 {
2905 if (dump_enabled_p ())
2906 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2907 "intermediate value used outside loop.\n");
2908
2909 return NULL;
2910 }
2911
2912 nloop_uses++;
2913 if (nloop_uses > 1)
2914 {
2915 if (dump_enabled_p ())
2916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2917 "reduction value used in loop.\n");
2918 return NULL;
2919 }
2920
2921 phi_use_stmt = use_stmt;
2922 }
2923
2924 edge latch_e = loop_latch_edge (loop);
2925 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2926 if (TREE_CODE (loop_arg) != SSA_NAME)
2927 {
2928 if (dump_enabled_p ())
2929 {
2930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2931 "reduction: not ssa_name: ");
2932 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2933 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2934 }
2935 return NULL;
2936 }
2937
2938 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2939 if (!def_stmt_info)
2940 return NULL;
2941
2942 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2943 {
2944 name = gimple_assign_lhs (def_stmt);
2945 phi_def = false;
2946 }
2947 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2948 {
2949 name = PHI_RESULT (def_stmt);
2950 phi_def = true;
2951 }
2952 else
2953 {
2954 if (dump_enabled_p ())
2955 {
2956 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2957 "reduction: unhandled reduction operation: ");
2958 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2959 def_stmt_info->stmt, 0);
2960 }
2961 return NULL;
2962 }
2963
2964 nloop_uses = 0;
2965 auto_vec<gphi *, 3> lcphis;
2966 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2967 {
2968 gimple *use_stmt = USE_STMT (use_p);
2969 if (is_gimple_debug (use_stmt))
2970 continue;
2971 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2972 nloop_uses++;
2973 else
2974 /* We can have more than one loop-closed PHI. */
2975 lcphis.safe_push (as_a <gphi *> (use_stmt));
2976 if (nloop_uses > 1)
2977 {
2978 if (dump_enabled_p ())
2979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2980 "reduction used in loop.\n");
2981 return NULL;
2982 }
2983 }
2984
2985 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2986 defined in the inner loop. */
2987 if (phi_def)
2988 {
2989 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2990 op1 = PHI_ARG_DEF (def_stmt, 0);
2991
2992 if (gimple_phi_num_args (def_stmt) != 1
2993 || TREE_CODE (op1) != SSA_NAME)
2994 {
2995 if (dump_enabled_p ())
2996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2997 "unsupported phi node definition.\n");
2998
2999 return NULL;
3000 }
3001
3002 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3003 if (gimple_bb (def1)
3004 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3005 && loop->inner
3006 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3007 && is_gimple_assign (def1)
3008 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3009 {
3010 if (dump_enabled_p ())
3011 report_vect_op (MSG_NOTE, def_stmt,
3012 "detected double reduction: ");
3013
3014 *double_reduc = true;
3015 return def_stmt_info;
3016 }
3017
3018 return NULL;
3019 }
3020
3021 /* If we are vectorizing an inner reduction, it is executed in its
3022 original order only when we are not dealing with a double
3023 reduction. */
3024 bool check_reduction = true;
3025 if (flow_loop_nested_p (vect_loop, loop))
3026 {
3027 gphi *lcphi;
3028 unsigned i;
3029 check_reduction = false;
3030 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3031 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3032 {
3033 gimple *use_stmt = USE_STMT (use_p);
3034 if (is_gimple_debug (use_stmt))
3035 continue;
3036 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3037 check_reduction = true;
3038 }
3039 }
3040
3041 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3042 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3043 code = orig_code = gimple_assign_rhs_code (def_stmt);
3044
3045 /* We can handle "res -= x[i]", which is non-associative, by
3046 simply rewriting it as "res += -x[i]". Avoid changing the
3047 gimple instruction for the first simple tests and only do this
3048 if we're allowed to change code at all. */
3049 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3050 code = PLUS_EXPR;
3051
3052 if (code == COND_EXPR)
3053 {
3054 if (! nested_in_vect_loop)
3055 *v_reduc_type = COND_REDUCTION;
3056
3057 op3 = gimple_assign_rhs1 (def_stmt);
3058 if (COMPARISON_CLASS_P (op3))
3059 {
3060 op4 = TREE_OPERAND (op3, 1);
3061 op3 = TREE_OPERAND (op3, 0);
3062 }
3063 if (op3 == phi_name || op4 == phi_name)
3064 {
3065 if (dump_enabled_p ())
3066 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3067 "reduction: condition depends on previous"
3068 " iteration: ");
3069 return NULL;
3070 }
3071
3072 op1 = gimple_assign_rhs2 (def_stmt);
3073 op2 = gimple_assign_rhs3 (def_stmt);
3074 }
3075 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3076 {
3077 if (dump_enabled_p ())
3078 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3079 "reduction: not commutative/associative: ");
3080 return NULL;
3081 }
3082 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3083 {
3084 op1 = gimple_assign_rhs1 (def_stmt);
3085 op2 = gimple_assign_rhs2 (def_stmt);
3086 }
3087 else
3088 {
3089 if (dump_enabled_p ())
3090 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3091 "reduction: not handled operation: ");
3092 return NULL;
3093 }
3094
3095 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3096 {
3097 if (dump_enabled_p ())
3098 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3099 "reduction: both uses not ssa_names: ");
3100
3101 return NULL;
3102 }
3103
3104 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3105 if ((TREE_CODE (op1) == SSA_NAME
3106 && !types_compatible_p (type,TREE_TYPE (op1)))
3107 || (TREE_CODE (op2) == SSA_NAME
3108 && !types_compatible_p (type, TREE_TYPE (op2)))
3109 || (op3 && TREE_CODE (op3) == SSA_NAME
3110 && !types_compatible_p (type, TREE_TYPE (op3)))
3111 || (op4 && TREE_CODE (op4) == SSA_NAME
3112 && !types_compatible_p (type, TREE_TYPE (op4))))
3113 {
3114 if (dump_enabled_p ())
3115 {
3116 dump_printf_loc (MSG_NOTE, vect_location,
3117 "reduction: multiple types: operation type: ");
3118 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3119 dump_printf (MSG_NOTE, ", operands types: ");
3120 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3121 TREE_TYPE (op1));
3122 dump_printf (MSG_NOTE, ",");
3123 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3124 TREE_TYPE (op2));
3125 if (op3)
3126 {
3127 dump_printf (MSG_NOTE, ",");
3128 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3129 TREE_TYPE (op3));
3130 }
3131
3132 if (op4)
3133 {
3134 dump_printf (MSG_NOTE, ",");
3135 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3136 TREE_TYPE (op4));
3137 }
3138 dump_printf (MSG_NOTE, "\n");
3139 }
3140
3141 return NULL;
3142 }
3143
3144 /* Check whether it's ok to change the order of the computation.
3145 Generally, when vectorizing a reduction we change the order of the
3146 computation. This may change the behavior of the program in some
3147 cases, so we need to check that this is ok. One exception is when
3148 vectorizing an outer-loop: the inner-loop is executed sequentially,
3149 and therefore vectorizing reductions in the inner-loop during
3150 outer-loop vectorization is safe. */
3151 if (check_reduction
3152 && *v_reduc_type == TREE_CODE_REDUCTION
3153 && needs_fold_left_reduction_p (type, code,
3154 need_wrapping_integral_overflow))
3155 *v_reduc_type = FOLD_LEFT_REDUCTION;
3156
3157 /* Reduction is safe. We're dealing with one of the following:
3158 1) integer arithmetic and no trapv
3159 2) floating point arithmetic, and special flags permit this optimization
3160 3) nested cycle (i.e., outer loop vectorization). */
3161 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3162 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3163 if (code != COND_EXPR && !def1_info && !def2_info)
3164 {
3165 if (dump_enabled_p ())
3166 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3167 return NULL;
3168 }
3169
3170 /* Check that one def is the reduction def, defined by PHI,
3171 the other def is either defined in the loop ("vect_internal_def"),
3172 or it's an induction (defined by a loop-header phi-node). */
3173
3174 if (def2_info
3175 && def2_info->stmt == phi
3176 && (code == COND_EXPR
3177 || !def1_info
3178 || vect_valid_reduction_input_p (def1_info)))
3179 {
3180 if (dump_enabled_p ())
3181 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3182 return def_stmt_info;
3183 }
3184
3185 if (def1_info
3186 && def1_info->stmt == phi
3187 && (code == COND_EXPR
3188 || !def2_info
3189 || vect_valid_reduction_input_p (def2_info)))
3190 {
3191 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3192 {
3193 /* Check if we can swap operands (just for simplicity - so that
3194 the rest of the code can assume that the reduction variable
3195 is always the last (second) argument). */
3196 if (code == COND_EXPR)
3197 {
3198 /* Swap cond_expr by inverting the condition. */
3199 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3200 enum tree_code invert_code = ERROR_MARK;
3201 enum tree_code cond_code = TREE_CODE (cond_expr);
3202
3203 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3204 {
3205 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3206 invert_code = invert_tree_comparison (cond_code, honor_nans);
3207 }
3208 if (invert_code != ERROR_MARK)
3209 {
3210 TREE_SET_CODE (cond_expr, invert_code);
3211 swap_ssa_operands (def_stmt,
3212 gimple_assign_rhs2_ptr (def_stmt),
3213 gimple_assign_rhs3_ptr (def_stmt));
3214 }
3215 else
3216 {
3217 if (dump_enabled_p ())
3218 report_vect_op (MSG_NOTE, def_stmt,
3219 "detected reduction: cannot swap operands "
3220 "for cond_expr");
3221 return NULL;
3222 }
3223 }
3224 else
3225 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3226 gimple_assign_rhs2_ptr (def_stmt));
3227
3228 if (dump_enabled_p ())
3229 report_vect_op (MSG_NOTE, def_stmt,
3230 "detected reduction: need to swap operands: ");
3231
3232 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3233 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3234 }
3235 else
3236 {
3237 if (dump_enabled_p ())
3238 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3239 }
3240
3241 return def_stmt_info;
3242 }
3243
3244 /* Try to find SLP reduction chain. */
3245 if (! nested_in_vect_loop
3246 && code != COND_EXPR
3247 && orig_code != MINUS_EXPR
3248 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3249 {
3250 if (dump_enabled_p ())
3251 report_vect_op (MSG_NOTE, def_stmt,
3252 "reduction: detected reduction chain: ");
3253
3254 return def_stmt_info;
3255 }
3256
3257 /* Dissolve any group half-built by vect_is_slp_reduction. */
3258 gimple *first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3259 while (first)
3260 {
3261 gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3262 REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3263 REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3264 first = next;
3265 }
3266
3267 /* Look for the expression computing loop_arg from loop PHI result. */
3268 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3269 return def_stmt_info;
3270
3271 if (dump_enabled_p ())
3272 {
3273 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3274 "reduction: unknown pattern: ");
3275 }
3276
3277 return NULL;
3278 }
3279
3280 /* Wrapper around vect_is_simple_reduction, which will modify code
3281 in-place if doing so enables detection of more reductions. The
3282 arguments are the same as for vect_is_simple_reduction. */
3283
3284 stmt_vec_info
3285 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3286 bool *double_reduc,
3287 bool need_wrapping_integral_overflow)
3288 {
3289 enum vect_reduction_type v_reduc_type;
3290 stmt_vec_info def_info
3291 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3292 need_wrapping_integral_overflow,
3293 &v_reduc_type);
3294 if (def_info)
3295 {
3296 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3297 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3298 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3299 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3300 }
3301 return def_info;
3302 }
3303
3304 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3305 int
3306 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3307 int *peel_iters_epilogue,
3308 stmt_vector_for_cost *scalar_cost_vec,
3309 stmt_vector_for_cost *prologue_cost_vec,
3310 stmt_vector_for_cost *epilogue_cost_vec)
3311 {
3312 int retval = 0;
3313 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3314
3315 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3316 {
3317 *peel_iters_epilogue = assumed_vf / 2;
3318 if (dump_enabled_p ())
3319 dump_printf_loc (MSG_NOTE, vect_location,
3320 "cost model: epilogue peel iters set to vf/2 "
3321 "because loop iterations are unknown .\n");
3322
3323 /* If peeled iterations are known but the number of scalar loop
3324 iterations is unknown, count a taken branch per peeled loop. */
3325 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3326 NULL, 0, vect_prologue);
3327 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3328 NULL, 0, vect_epilogue);
3329 }
3330 else
3331 {
3332 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3333 peel_iters_prologue = niters < peel_iters_prologue ?
3334 niters : peel_iters_prologue;
3335 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3336 /* If we need to peel for gaps but no epilogue peeling would otherwise
3337 be required, we have to peel VF iterations. */
3338 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3339 *peel_iters_epilogue = assumed_vf;
3340 }
3341
3342 stmt_info_for_cost *si;
3343 int j;
3344 if (peel_iters_prologue)
3345 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3346 {
3347 stmt_vec_info stmt_info
3348 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3349 retval += record_stmt_cost (prologue_cost_vec,
3350 si->count * peel_iters_prologue,
3351 si->kind, stmt_info, si->misalign,
3352 vect_prologue);
3353 }
3354 if (*peel_iters_epilogue)
3355 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3356 {
3357 stmt_vec_info stmt_info
3358 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3359 retval += record_stmt_cost (epilogue_cost_vec,
3360 si->count * *peel_iters_epilogue,
3361 si->kind, stmt_info, si->misalign,
3362 vect_epilogue);
3363 }
3364
3365 return retval;
3366 }
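/* A small worked example, using hypothetical numbers: with a known niters
   of 100, assumed_vf = 8 and a prologue peel of 3 iterations,
   peel_iters_epilogue = (100 - 3) % 8 = 1, so the scalar per-iteration
   costs are charged three times to the prologue and once to the epilogue.
   If niters is unknown, the epilogue peel count is assumed to be
   vf / 2 = 4 and a taken branch is charged to both the prologue and the
   epilogue.  */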
3367
3368 /* Function vect_estimate_min_profitable_iters
3369
3370 Return the number of iterations required for the vector version of the
3371 loop to be profitable relative to the cost of the scalar version of the
3372 loop.
3373
3374 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3375 of iterations for vectorization. A value of -1 means loop
3376 vectorization is not profitable. This returned value may be used
3377 for a dynamic profitability check.
3378
3379 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3380 for static check against estimated number of iterations. */
3381
3382 static void
3383 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3384 int *ret_min_profitable_niters,
3385 int *ret_min_profitable_estimate)
3386 {
3387 int min_profitable_iters;
3388 int min_profitable_estimate;
3389 int peel_iters_prologue;
3390 int peel_iters_epilogue;
3391 unsigned vec_inside_cost = 0;
3392 int vec_outside_cost = 0;
3393 unsigned vec_prologue_cost = 0;
3394 unsigned vec_epilogue_cost = 0;
3395 int scalar_single_iter_cost = 0;
3396 int scalar_outside_cost = 0;
3397 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3398 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3399 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3400
3401 /* Cost model disabled. */
3402 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3403 {
3404 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3405 *ret_min_profitable_niters = 0;
3406 *ret_min_profitable_estimate = 0;
3407 return;
3408 }
3409
3410 /* Requires loop versioning tests to handle misalignment. */
3411 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3412 {
3413 /* FIXME: Make cost depend on complexity of individual check. */
3414 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3415 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3416 vect_prologue);
3417 dump_printf (MSG_NOTE,
3418 "cost model: Adding cost of checks for loop "
3419 "versioning to treat misalignment.\n");
3420 }
3421
3422 /* Requires loop versioning with alias checks. */
3423 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3424 {
3425 /* FIXME: Make cost depend on complexity of individual check. */
3426 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3427 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3428 vect_prologue);
3429 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3430 if (len)
3431 /* Count LEN - 1 ANDs and LEN comparisons. */
3432 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3433 NULL, 0, vect_prologue);
3434 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3435 if (len)
3436 {
3437 /* Count LEN - 1 ANDs and LEN comparisons. */
3438 unsigned int nstmts = len * 2 - 1;
3439 /* +1 for each bias that needs adding. */
3440 for (unsigned int i = 0; i < len; ++i)
3441 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3442 nstmts += 1;
3443 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3444 NULL, 0, vect_prologue);
3445 }
3446 dump_printf (MSG_NOTE,
3447 "cost model: Adding cost of checks for loop "
3448 "versioning aliasing.\n");
3449 }
3450
3451 /* Requires loop versioning with niter checks. */
3452 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3453 {
3454 /* FIXME: Make cost depend on complexity of individual check. */
3455 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3456 vect_prologue);
3457 dump_printf (MSG_NOTE,
3458 "cost model: Adding cost of checks for loop "
3459 "versioning niters.\n");
3460 }
3461
3462 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3463 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3464 vect_prologue);
3465
3466 /* Count statements in scalar loop. Using this as scalar cost for a single
3467 iteration for now.
3468
3469 TODO: Add outer loop support.
3470
3471 TODO: Consider assigning different costs to different scalar
3472 statements. */
3473
3474 scalar_single_iter_cost
3475 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3476
3477 /* Add additional cost for the peeled instructions in prologue and epilogue
3478 loop. (For fully-masked loops there will be no peeling.)
3479
3480 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3481 at compile time, we assume it's vf/2 (the worst would be vf-1).
3482
3483 TODO: Build an expression that represents peel_iters for prologue and
3484 epilogue to be used in a run-time test. */
3485
3486 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3487 {
3488 peel_iters_prologue = 0;
3489 peel_iters_epilogue = 0;
3490
3491 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3492 {
3493 /* We need to peel exactly one iteration. */
3494 peel_iters_epilogue += 1;
3495 stmt_info_for_cost *si;
3496 int j;
3497 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3498 j, si)
3499 {
3500 struct _stmt_vec_info *stmt_info
3501 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3502 (void) add_stmt_cost (target_cost_data, si->count,
3503 si->kind, stmt_info, si->misalign,
3504 vect_epilogue);
3505 }
3506 }
3507 }
3508 else if (npeel < 0)
3509 {
3510 peel_iters_prologue = assumed_vf / 2;
3511 dump_printf (MSG_NOTE, "cost model: "
3512 "prologue peel iters set to vf/2.\n");
3513
3514 /* If peeling for alignment is unknown, the loop bound of the main loop
3515 becomes unknown. */
3516 peel_iters_epilogue = assumed_vf / 2;
3517 dump_printf (MSG_NOTE, "cost model: "
3518 "epilogue peel iters set to vf/2 because "
3519 "peeling for alignment is unknown.\n");
3520
3521 /* If peeled iterations are unknown, count a taken branch and a not taken
3522 branch per peeled loop. Even if scalar loop iterations are known,
3523 vector iterations are not known since peeled prologue iterations are
3524 not known. Hence guards remain the same. */
3525 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3526 NULL, 0, vect_prologue);
3527 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3528 NULL, 0, vect_prologue);
3529 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3530 NULL, 0, vect_epilogue);
3531 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3532 NULL, 0, vect_epilogue);
3533 stmt_info_for_cost *si;
3534 int j;
3535 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3536 {
3537 struct _stmt_vec_info *stmt_info
3538 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3539 (void) add_stmt_cost (target_cost_data,
3540 si->count * peel_iters_prologue,
3541 si->kind, stmt_info, si->misalign,
3542 vect_prologue);
3543 (void) add_stmt_cost (target_cost_data,
3544 si->count * peel_iters_epilogue,
3545 si->kind, stmt_info, si->misalign,
3546 vect_epilogue);
3547 }
3548 }
3549 else
3550 {
3551 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3552 stmt_info_for_cost *si;
3553 int j;
3554 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3555
3556 prologue_cost_vec.create (2);
3557 epilogue_cost_vec.create (2);
3558 peel_iters_prologue = npeel;
3559
3560 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3561 &peel_iters_epilogue,
3562 &LOOP_VINFO_SCALAR_ITERATION_COST
3563 (loop_vinfo),
3564 &prologue_cost_vec,
3565 &epilogue_cost_vec);
3566
3567 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3568 {
3569 struct _stmt_vec_info *stmt_info
3570 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3571 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3572 si->misalign, vect_prologue);
3573 }
3574
3575 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3576 {
3577 struct _stmt_vec_info *stmt_info
3578 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3579 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3580 si->misalign, vect_epilogue);
3581 }
3582
3583 prologue_cost_vec.release ();
3584 epilogue_cost_vec.release ();
3585 }
3586
3587 /* FORNOW: The scalar outside cost is incremented in one of the
3588 following ways:
3589
3590 1. The vectorizer checks for alignment and aliasing and generates
3591 a condition that allows dynamic vectorization. A cost model
3592 check is ANDED with the versioning condition. Hence scalar code
3593 path now has the added cost of the versioning check.
3594
3595 if (cost > th & versioning_check)
3596 jmp to vector code
3597
3598 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3599
3600 2. The vectorizer then checks if a prologue is required. If the
3601 cost model check was not done before during versioning, it has to
3602 be done before the prologue check.
3603
3604 if (cost <= th)
3605 prologue = scalar_iters
3606 if (prologue == 0)
3607 jmp to vector code
3608 else
3609 execute prologue
3610 if (prologue == num_iters)
3611 go to exit
3612
3613 Hence the run-time scalar cost is incremented by a taken branch,
3614 plus a not-taken branch, plus a taken branch cost.
3615
3616 3. The vectorizer then checks if an epilogue is required. If the
3617 cost model check was not done before during prologue check, it
3618 has to be done with the epilogue check.
3619
3620 if (prologue == 0)
3621 jmp to vector code
3622 else
3623 execute prologue
3624 if (prologue == num_iters)
3625 go to exit
3626 vector code:
3627 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3628 jmp to epilogue
3629
3630 Hence the run-time scalar cost should be incremented by 2 taken
3631 branches.
3632
3633 TODO: The back end may reorder the BBs differently and reverse
3634 conditions/branch directions. Change the estimates below to
3635 something more reasonable. */
3636
3637 /* If the number of iterations is known and we do not do versioning, we can
3638 decide whether to vectorize at compile time. Hence the scalar version
3639 does not carry cost model guard costs. */
3640 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3641 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3642 {
3643 /* Cost model check occurs at versioning. */
3644 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3645 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3646 else
3647 {
3648 /* Cost model check occurs at prologue generation. */
3649 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3650 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3651 + vect_get_stmt_cost (cond_branch_not_taken);
3652 /* Cost model check occurs at epilogue generation. */
3653 else
3654 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3655 }
3656 }
3657
3658 /* Complete the target-specific cost calculations. */
3659 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3660 &vec_inside_cost, &vec_epilogue_cost);
3661
3662 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3663
3664 if (dump_enabled_p ())
3665 {
3666 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3667 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3668 vec_inside_cost);
3669 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3670 vec_prologue_cost);
3671 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3672 vec_epilogue_cost);
3673 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3674 scalar_single_iter_cost);
3675 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3676 scalar_outside_cost);
3677 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3678 vec_outside_cost);
3679 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3680 peel_iters_prologue);
3681 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3682 peel_iters_epilogue);
3683 }
3684
3685 /* Calculate number of iterations required to make the vector version
3686 profitable, relative to the loop bodies only. The following condition
3687 must hold true:
3688 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3689 where
3690 SIC = scalar iteration cost, VIC = vector iteration cost,
3691 VOC = vector outside cost, VF = vectorization factor,
3692 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations
3693 SOC = scalar outside cost for run time cost model check. */
3694
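/* Illustrative example with hypothetical costs (not taken from any real
target): SIC = 4, VIC = 8, VF = 4, VOC = 20, SOC = 6 and no peeling.
The division below gives ((20 - 6) * 4) / (4 * 4 - 8) = 7 and the
follow-up check bumps that to 8, the smallest niters for which
4 * niters + 6 strictly exceeds 8 * (niters / 4) + 20.  */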
3695 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3696 {
3697 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3698 * assumed_vf
3699 - vec_inside_cost * peel_iters_prologue
3700 - vec_inside_cost * peel_iters_epilogue);
3701 if (min_profitable_iters <= 0)
3702 min_profitable_iters = 0;
3703 else
3704 {
3705 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3706 - vec_inside_cost);
3707
3708 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3709 <= (((int) vec_inside_cost * min_profitable_iters)
3710 + (((int) vec_outside_cost - scalar_outside_cost)
3711 * assumed_vf)))
3712 min_profitable_iters++;
3713 }
3714 }
3715 /* vector version will never be profitable. */
3716 else
3717 {
3718 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3719 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3720 "vectorization did not happen for a simd loop");
3721
3722 if (dump_enabled_p ())
3723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3724 "cost model: the vector iteration cost = %d "
3725 "divided by the scalar iteration cost = %d "
3726 "is greater or equal to the vectorization factor = %d"
3727 ".\n",
3728 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3729 *ret_min_profitable_niters = -1;
3730 *ret_min_profitable_estimate = -1;
3731 return;
3732 }
3733
3734 dump_printf (MSG_NOTE,
3735 " Calculated minimum iters for profitability: %d\n",
3736 min_profitable_iters);
3737
3738 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3739 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3740 /* We want the vectorized loop to execute at least once. */
3741 min_profitable_iters = assumed_vf + peel_iters_prologue;
3742
3743 if (dump_enabled_p ())
3744 dump_printf_loc (MSG_NOTE, vect_location,
3745 " Runtime profitability threshold = %d\n",
3746 min_profitable_iters);
3747
3748 *ret_min_profitable_niters = min_profitable_iters;
3749
3750 /* Calculate number of iterations required to make the vector version
3751 profitable, relative to the loop bodies only.
3752
3753 Non-vectorized variant is SIC * niters and it must win over vector
3754 variant on the expected loop trip count. The following must hold:
3755 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3756
3757 if (vec_outside_cost <= 0)
3758 min_profitable_estimate = 0;
3759 else
3760 {
3761 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3762 * assumed_vf
3763 - vec_inside_cost * peel_iters_prologue
3764 - vec_inside_cost * peel_iters_epilogue)
3765 / ((scalar_single_iter_cost * assumed_vf)
3766 - vec_inside_cost);
3767 }
3768 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3769 if (dump_enabled_p ())
3770 dump_printf_loc (MSG_NOTE, vect_location,
3771 " Static estimate profitability threshold = %d\n",
3772 min_profitable_estimate);
3773
3774 *ret_min_profitable_estimate = min_profitable_estimate;
3775 }
3776
3777 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3778 vector elements (not bits) for a vector with NELT elements. */
3779 static void
3780 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3781 vec_perm_builder *sel)
3782 {
3783 /* The encoding is a single stepped pattern. Any wrap-around is handled
3784 by vec_perm_indices. */
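/* Illustration (hypothetical values): OFFSET == 2 and NELT == 8 encode
the series 2, 3, 4, which vec_perm_indices extends to
{ 2, 3, 4, 5, 6, 7, 8, 9 }; indices of NELT and above select from the
second vector operand of the permutation.  */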
3785 sel->new_vector (nelt, 1, 3);
3786 for (unsigned int i = 0; i < 3; i++)
3787 sel->quick_push (i + offset);
3788 }
3789
3790 /* Checks whether the target supports whole-vector shifts for vectors of mode
3791 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3792 it supports vec_perm_const with masks for all necessary shift amounts. */
3793 static bool
3794 have_whole_vector_shift (machine_mode mode)
3795 {
3796 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3797 return true;
3798
3799 /* Variable-length vectors should be handled via the optab. */
3800 unsigned int nelt;
3801 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3802 return false;
3803
3804 vec_perm_builder sel;
3805 vec_perm_indices indices;
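/* The amounts checked below (NELT / 2, NELT / 4, ..., 1 element) are
the shifts a log2-style final reduction would need.  */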
3806 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3807 {
3808 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3809 indices.new_vector (sel, 2, nelt);
3810 if (!can_vec_perm_const_p (mode, indices, false))
3811 return false;
3812 }
3813 return true;
3814 }
3815
3816 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3817 functions. Design better to avoid maintenance issues. */
3818
3819 /* Function vect_model_reduction_cost.
3820
3821 Models cost for a reduction operation, including the vector ops
3822 generated within the strip-mine loop, the initial definition before
3823 the loop, and the epilogue code that must be generated. */
3824
3825 static void
3826 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3827 int ncopies, stmt_vector_for_cost *cost_vec)
3828 {
3829 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3830 enum tree_code code;
3831 optab optab;
3832 tree vectype;
3833 machine_mode mode;
3834 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3835 struct loop *loop = NULL;
3836
3837 if (loop_vinfo)
3838 loop = LOOP_VINFO_LOOP (loop_vinfo);
3839
3840 /* Condition reductions generate two reductions in the loop. */
3841 vect_reduction_type reduction_type
3842 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3843 if (reduction_type == COND_REDUCTION)
3844 ncopies *= 2;
3845
3846 vectype = STMT_VINFO_VECTYPE (stmt_info);
3847 mode = TYPE_MODE (vectype);
3848 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3849
3850 if (!orig_stmt_info)
3851 orig_stmt_info = stmt_info;
3852
3853 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3854
3855 if (reduction_type == EXTRACT_LAST_REDUCTION
3856 || reduction_type == FOLD_LEFT_REDUCTION)
3857 {
3858 /* No extra instructions needed in the prologue. */
3859 prologue_cost = 0;
3860
3861 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3862 /* Count one reduction-like operation per vector. */
3863 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3864 stmt_info, 0, vect_body);
3865 else
3866 {
3867 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3868 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3869 inside_cost = record_stmt_cost (cost_vec, nelements,
3870 vec_to_scalar, stmt_info, 0,
3871 vect_body);
3872 inside_cost += record_stmt_cost (cost_vec, nelements,
3873 scalar_stmt, stmt_info, 0,
3874 vect_body);
3875 }
3876 }
3877 else
3878 {
3879 /* Add in cost for initial definition.
3880 For cond reduction we have four vectors: initial index, step,
3881 initial result of the data reduction, initial value of the index
3882 reduction. */
3883 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3884 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3885 scalar_to_vec, stmt_info, 0,
3886 vect_prologue);
3887
3888 /* Cost of reduction op inside loop. */
3889 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3890 stmt_info, 0, vect_body);
3891 }
3892
3893 /* Determine cost of epilogue code.
3894
3895 We have a reduction operator that will reduce the vector in one statement.
3896 Also requires scalar extract. */
3897
3898 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3899 {
3900 if (reduc_fn != IFN_LAST)
3901 {
3902 if (reduction_type == COND_REDUCTION)
3903 {
3904 /* An EQ stmt and a COND_EXPR stmt. */
3905 epilogue_cost += record_stmt_cost (cost_vec, 2,
3906 vector_stmt, stmt_info, 0,
3907 vect_epilogue);
3908 /* Reduction of the max index and a reduction of the found
3909 values. */
3910 epilogue_cost += record_stmt_cost (cost_vec, 2,
3911 vec_to_scalar, stmt_info, 0,
3912 vect_epilogue);
3913 /* A broadcast of the max value. */
3914 epilogue_cost += record_stmt_cost (cost_vec, 1,
3915 scalar_to_vec, stmt_info, 0,
3916 vect_epilogue);
3917 }
3918 else
3919 {
3920 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3921 stmt_info, 0, vect_epilogue);
3922 epilogue_cost += record_stmt_cost (cost_vec, 1,
3923 vec_to_scalar, stmt_info, 0,
3924 vect_epilogue);
3925 }
3926 }
3927 else if (reduction_type == COND_REDUCTION)
3928 {
3929 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3930 /* Extraction of scalar elements. */
3931 epilogue_cost += record_stmt_cost (cost_vec,
3932 2 * estimated_nunits,
3933 vec_to_scalar, stmt_info, 0,
3934 vect_epilogue);
3935 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
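/* (With N elements this is roughly N - 1 COND_EXPRs for the data values
plus N - 2 MAX_EXPRs for the indexes, mirroring the scalar epilogue that
vect_create_epilog_for_reduction emits when IFN_REDUC_MAX is not
available.)  */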
3936 epilogue_cost += record_stmt_cost (cost_vec,
3937 2 * estimated_nunits - 3,
3938 scalar_stmt, stmt_info, 0,
3939 vect_epilogue);
3940 }
3941 else if (reduction_type == EXTRACT_LAST_REDUCTION
3942 || reduction_type == FOLD_LEFT_REDUCTION)
3943 /* No extra instructions needed in the epilogue. */
3944 ;
3945 else
3946 {
3947 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3948 tree bitsize =
3949 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3950 int element_bitsize = tree_to_uhwi (bitsize);
3951 int nelements = vec_size_in_bits / element_bitsize;
3952
3953 if (code == COND_EXPR)
3954 code = MAX_EXPR;
3955
3956 optab = optab_for_tree_code (code, vectype, optab_default);
3957
3958 /* We have a whole vector shift available. */
3959 if (optab != unknown_optab
3960 && VECTOR_MODE_P (mode)
3961 && optab_handler (optab, mode) != CODE_FOR_nothing
3962 && have_whole_vector_shift (mode))
3963 {
3964 /* Final reduction via vector shifts and the reduction operator.
3965 Also requires scalar extract. */
3966 epilogue_cost += record_stmt_cost (cost_vec,
3967 exact_log2 (nelements) * 2,
3968 vector_stmt, stmt_info, 0,
3969 vect_epilogue);
3970 epilogue_cost += record_stmt_cost (cost_vec, 1,
3971 vec_to_scalar, stmt_info, 0,
3972 vect_epilogue);
3973 }
3974 else
3975 /* Use extracts and reduction op for final reduction. For N
3976 elements, we have N extracts and N-1 reduction ops. */
3977 epilogue_cost += record_stmt_cost (cost_vec,
3978 nelements + nelements - 1,
3979 vector_stmt, stmt_info, 0,
3980 vect_epilogue);
3981 }
3982 }
3983
3984 if (dump_enabled_p ())
3985 dump_printf (MSG_NOTE,
3986 "vect_model_reduction_cost: inside_cost = %d, "
3987 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3988 prologue_cost, epilogue_cost);
3989 }
3990
3991
3992 /* Function vect_model_induction_cost.
3993
3994 Models cost for induction operations. */
3995
3996 static void
3997 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3998 stmt_vector_for_cost *cost_vec)
3999 {
4000 unsigned inside_cost, prologue_cost;
4001
4002 if (PURE_SLP_STMT (stmt_info))
4003 return;
4004
4005 /* loop cost for vec_loop. */
4006 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4007 stmt_info, 0, vect_body);
4008
4009 /* prologue cost for vec_init and vec_step. */
4010 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4011 stmt_info, 0, vect_prologue);
4012
4013 if (dump_enabled_p ())
4014 dump_printf_loc (MSG_NOTE, vect_location,
4015 "vect_model_induction_cost: inside_cost = %d, "
4016 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4017 }
4018
4019
4020
4021 /* Function get_initial_def_for_reduction
4022
4023 Input:
4024 STMT - a stmt that performs a reduction operation in the loop.
4025 INIT_VAL - the initial value of the reduction variable
4026
4027 Output:
4028 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4029 of the reduction (used for adjusting the epilog - see below).
4030 Return a vector variable, initialized according to the operation that STMT
4031 performs. This vector will be used as the initial value of the
4032 vector of partial results.
4033
4034 Option1 (adjust in epilog): Initialize the vector as follows:
4035 add/bit or/xor: [0,0,...,0,0]
4036 mult/bit and: [1,1,...,1,1]
4037 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4038 and when necessary (e.g. add/mult case) let the caller know
4039 that it needs to adjust the result by init_val.
4040
4041 Option2: Initialize the vector as follows:
4042 add/bit or/xor: [init_val,0,0,...,0]
4043 mult/bit and: [init_val,1,1,...,1]
4044 min/max/cond_expr: [init_val,init_val,...,init_val]
4045 and no adjustments are needed.
4046
4047 For example, for the following code:
4048
4049 s = init_val;
4050 for (i=0;i<n;i++)
4051 s = s + a[i];
4052
4053 STMT is 's = s + a[i]', and the reduction variable is 's'.
4054 For a vector of 4 units, we want to return either [0,0,0,init_val],
4055 or [0,0,0,0] and let the caller know that it needs to adjust
4056 the result at the end by 'init_val'.
4057
4058 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4059 is not NULL, because its initialization vector is simpler (the same
4060 element in all entries), and Option2 otherwise.
4061
4062 A cost model should help decide between these two schemes. */
4063
4064 tree
4065 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4066 tree *adjustment_def)
4067 {
4068 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4069 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4070 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4071 tree scalar_type = TREE_TYPE (init_val);
4072 tree vectype = get_vectype_for_scalar_type (scalar_type);
4073 enum tree_code code = gimple_assign_rhs_code (stmt);
4074 tree def_for_init;
4075 tree init_def;
4076 REAL_VALUE_TYPE real_init_val = dconst0;
4077 int int_init_val = 0;
4078 gimple_seq stmts = NULL;
4079
4080 gcc_assert (vectype);
4081
4082 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4083 || SCALAR_FLOAT_TYPE_P (scalar_type));
4084
4085 gcc_assert (nested_in_vect_loop_p (loop, stmt)
4086 || loop == (gimple_bb (stmt))->loop_father);
4087
4088 vect_reduction_type reduction_type
4089 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4090
4091 switch (code)
4092 {
4093 case WIDEN_SUM_EXPR:
4094 case DOT_PROD_EXPR:
4095 case SAD_EXPR:
4096 case PLUS_EXPR:
4097 case MINUS_EXPR:
4098 case BIT_IOR_EXPR:
4099 case BIT_XOR_EXPR:
4100 case MULT_EXPR:
4101 case BIT_AND_EXPR:
4102 {
4103 /* ADJUSTMENT_DEF is NULL when called from
4104 vect_create_epilog_for_reduction to vectorize double reduction. */
4105 if (adjustment_def)
4106 *adjustment_def = init_val;
4107
4108 if (code == MULT_EXPR)
4109 {
4110 real_init_val = dconst1;
4111 int_init_val = 1;
4112 }
4113
4114 if (code == BIT_AND_EXPR)
4115 int_init_val = -1;
4116
4117 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4118 def_for_init = build_real (scalar_type, real_init_val);
4119 else
4120 def_for_init = build_int_cst (scalar_type, int_init_val);
4121
4122 if (adjustment_def)
4123 /* Option1: the first element is '0' or '1' as well. */
4124 init_def = gimple_build_vector_from_val (&stmts, vectype,
4125 def_for_init);
4126 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4127 {
4128 /* Option2 (variable length): the first element is INIT_VAL. */
4129 init_def = gimple_build_vector_from_val (&stmts, vectype,
4130 def_for_init);
4131 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4132 vectype, init_def, init_val);
4133 }
4134 else
4135 {
4136 /* Option2: the first element is INIT_VAL. */
4137 tree_vector_builder elts (vectype, 1, 2);
4138 elts.quick_push (init_val);
4139 elts.quick_push (def_for_init);
4140 init_def = gimple_build_vector (&stmts, &elts);
4141 }
4142 }
4143 break;
4144
4145 case MIN_EXPR:
4146 case MAX_EXPR:
4147 case COND_EXPR:
4148 {
4149 if (adjustment_def)
4150 {
4151 *adjustment_def = NULL_TREE;
4152 if (reduction_type != COND_REDUCTION
4153 && reduction_type != EXTRACT_LAST_REDUCTION)
4154 {
4155 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4156 break;
4157 }
4158 }
4159 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4160 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4161 }
4162 break;
4163
4164 default:
4165 gcc_unreachable ();
4166 }
4167
4168 if (stmts)
4169 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4170 return init_def;
4171 }
4172
4173 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4174 NUMBER_OF_VECTORS is the number of vector defs to create.
4175 If NEUTRAL_OP is nonnull, introducing extra elements of that
4176 value will not change the result. */
4177
4178 static void
4179 get_initial_defs_for_reduction (slp_tree slp_node,
4180 vec<tree> *vec_oprnds,
4181 unsigned int number_of_vectors,
4182 bool reduc_chain, tree neutral_op)
4183 {
4184 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4185 stmt_vec_info stmt_vinfo = stmts[0];
4186 unsigned HOST_WIDE_INT nunits;
4187 unsigned j, number_of_places_left_in_vector;
4188 tree vector_type;
4189 tree vop;
4190 int group_size = stmts.length ();
4191 unsigned int vec_num, i;
4192 unsigned number_of_copies = 1;
4193 vec<tree> voprnds;
4194 voprnds.create (number_of_vectors);
4195 struct loop *loop;
4196 auto_vec<tree, 16> permute_results;
4197
4198 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4199
4200 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4201
4202 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4203 gcc_assert (loop);
4204 edge pe = loop_preheader_edge (loop);
4205
4206 gcc_assert (!reduc_chain || neutral_op);
4207
4208 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4209 created vectors. It is greater than 1 if unrolling is performed.
4210
4211 For example, we have two scalar operands, s1 and s2 (e.g., group of
4212 strided accesses of size two), while NUNITS is four (i.e., four scalars
4213 of this type can be packed in a vector). The output vector will contain
4214 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4215 will be 2).
4216
4217 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4218 vectors containing the operands.
4219
4220 For example, NUNITS is four as before, and the group size is 8
4221 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4222 {s5, s6, s7, s8}. */
4223
4224 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4225 nunits = group_size;
4226
4227 number_of_copies = nunits * number_of_vectors / group_size;
4228
4229 number_of_places_left_in_vector = nunits;
4230 bool constant_p = true;
4231 tree_vector_builder elts (vector_type, nunits, 1);
4232 elts.quick_grow (nunits);
4233 for (j = 0; j < number_of_copies; j++)
4234 {
4235 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
4236 {
4237 tree op;
4238 /* Get the def before the loop. In reduction chain we have only
4239 one initial value. */
4240 if ((j != (number_of_copies - 1)
4241 || (reduc_chain && i != 0))
4242 && neutral_op)
4243 op = neutral_op;
4244 else
4245 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4246
4247 /* Create 'vect_ = {op0,op1,...,opn}'. */
4248 number_of_places_left_in_vector--;
4249 elts[number_of_places_left_in_vector] = op;
4250 if (!CONSTANT_CLASS_P (op))
4251 constant_p = false;
4252
4253 if (number_of_places_left_in_vector == 0)
4254 {
4255 gimple_seq ctor_seq = NULL;
4256 tree init;
4257 if (constant_p && !neutral_op
4258 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4259 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4260 /* Build the vector directly from ELTS. */
4261 init = gimple_build_vector (&ctor_seq, &elts);
4262 else if (neutral_op)
4263 {
4264 /* Build a vector of the neutral value and shift the
4265 other elements into place. */
4266 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4267 neutral_op);
4268 int k = nunits;
4269 while (k > 0 && elts[k - 1] == neutral_op)
4270 k -= 1;
4271 while (k > 0)
4272 {
4273 k -= 1;
4274 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4275 vector_type, init, elts[k]);
4276 }
4277 }
4278 else
4279 {
4280 /* First time round, duplicate ELTS to fill the
4281 required number of vectors, then cherry pick the
4282 appropriate result for each iteration. */
4283 if (vec_oprnds->is_empty ())
4284 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4285 number_of_vectors,
4286 permute_results);
4287 init = permute_results[number_of_vectors - j - 1];
4288 }
4289 if (ctor_seq != NULL)
4290 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4291 voprnds.quick_push (init);
4292
4293 number_of_places_left_in_vector = nunits;
4294 elts.new_vector (vector_type, nunits, 1);
4295 elts.quick_grow (nunits);
4296 constant_p = true;
4297 }
4298 }
4299 }
4300
4301 /* Since the vectors are created in the reverse order, we should invert
4302 them. */
4303 vec_num = voprnds.length ();
4304 for (j = vec_num; j != 0; j--)
4305 {
4306 vop = voprnds[j - 1];
4307 vec_oprnds->quick_push (vop);
4308 }
4309
4310 voprnds.release ();
4311
4312 /* In case that VF is greater than the unrolling factor needed for the SLP
4313 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4314 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4315 to replicate the vectors. */
4316 tree neutral_vec = NULL;
4317 while (number_of_vectors > vec_oprnds->length ())
4318 {
4319 if (neutral_op)
4320 {
4321 if (!neutral_vec)
4322 {
4323 gimple_seq ctor_seq = NULL;
4324 neutral_vec = gimple_build_vector_from_val
4325 (&ctor_seq, vector_type, neutral_op);
4326 if (ctor_seq != NULL)
4327 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4328 }
4329 vec_oprnds->quick_push (neutral_vec);
4330 }
4331 else
4332 {
4333 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4334 vec_oprnds->quick_push (vop);
4335 }
4336 }
4337 }
4338
4339
4340 /* Function vect_create_epilog_for_reduction
4341
4342 Create code at the loop-epilog to finalize the result of a reduction
4343 computation.
4344
4345 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4346 reduction statements.
4347 STMT is the scalar reduction stmt that is being vectorized.
4348 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4349 number of elements that we can fit in a vectype (nunits). In this case
4350 we have to generate more than one vector stmt - i.e - we need to "unroll"
4351 the vector stmt by a factor VF/nunits. For more details see documentation
4352 in vectorizable_operation.
4353 REDUC_FN is the internal function for the epilog reduction.
4354 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4355 computation.
4356 REDUC_INDEX is the index of the operand in the right hand side of the
4357 statement that is defined by REDUCTION_PHI.
4358 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4359 SLP_NODE is an SLP node containing a group of reduction statements. The
4360 first one in this group is STMT.
4361 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4362 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4363 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4364 any value of the IV in the loop.
4365 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4366 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4367 null if this is not an SLP reduction.
4368
4369 This function:
4370 1. Creates the reduction def-use cycles: sets the arguments for
4371 REDUCTION_PHIS:
4372 The loop-entry argument is the vectorized initial-value of the reduction.
4373 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4374 sums.
4375 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4376 by calling the function specified by REDUC_FN if available, or by
4377 other means (whole-vector shifts or a scalar loop).
4378 The function also creates a new phi node at the loop exit to preserve
4379 loop-closed form, as illustrated below.
4380
4381 The flow at the entry to this function:
4382
4383 loop:
4384 vec_def = phi <null, null> # REDUCTION_PHI
4385 VECT_DEF = vector_stmt # vectorized form of STMT
4386 s_loop = scalar_stmt # (scalar) STMT
4387 loop_exit:
4388 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4389 use <s_out0>
4390 use <s_out0>
4391
4392 The above is transformed by this function into:
4393
4394 loop:
4395 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4396 VECT_DEF = vector_stmt # vectorized form of STMT
4397 s_loop = scalar_stmt # (scalar) STMT
4398 loop_exit:
4399 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4400 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4401 v_out2 = reduce <v_out1>
4402 s_out3 = extract_field <v_out2, 0>
4403 s_out4 = adjust_result <s_out3>
4404 use <s_out4>
4405 use <s_out4>
4406 */
4407
4408 static void
4409 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4410 gimple *reduc_def_stmt,
4411 int ncopies, internal_fn reduc_fn,
4412 vec<stmt_vec_info> reduction_phis,
4413 bool double_reduc,
4414 slp_tree slp_node,
4415 slp_instance slp_node_instance,
4416 tree induc_val, enum tree_code induc_code,
4417 tree neutral_op)
4418 {
4419 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4420 stmt_vec_info prev_phi_info;
4421 tree vectype;
4422 machine_mode mode;
4423 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4424 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4425 basic_block exit_bb;
4426 tree scalar_dest;
4427 tree scalar_type;
4428 gimple *new_phi = NULL, *phi;
4429 stmt_vec_info phi_info;
4430 gimple_stmt_iterator exit_gsi;
4431 tree vec_dest;
4432 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4433 gimple *epilog_stmt = NULL;
4434 enum tree_code code = gimple_assign_rhs_code (stmt);
4435 gimple *exit_phi;
4436 tree bitsize;
4437 tree adjustment_def = NULL;
4438 tree vec_initial_def = NULL;
4439 tree expr, def, initial_def = NULL;
4440 tree orig_name, scalar_result;
4441 imm_use_iterator imm_iter, phi_imm_iter;
4442 use_operand_p use_p, phi_use_p;
4443 gimple *use_stmt;
4444 stmt_vec_info reduction_phi_info = NULL;
4445 bool nested_in_vect_loop = false;
4446 auto_vec<gimple *> new_phis;
4447 auto_vec<stmt_vec_info> inner_phis;
4448 enum vect_def_type dt = vect_unknown_def_type;
4449 int j, i;
4450 auto_vec<tree> scalar_results;
4451 unsigned int group_size = 1, k, ratio;
4452 auto_vec<tree> vec_initial_defs;
4453 auto_vec<gimple *> phis;
4454 bool slp_reduc = false;
4455 bool direct_slp_reduc;
4456 tree new_phi_result;
4457 stmt_vec_info inner_phi = NULL;
4458 tree induction_index = NULL_TREE;
4459
4460 if (slp_node)
4461 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4462
4463 if (nested_in_vect_loop_p (loop, stmt))
4464 {
4465 outer_loop = loop;
4466 loop = loop->inner;
4467 nested_in_vect_loop = true;
4468 gcc_assert (!slp_node);
4469 }
4470
4471 vectype = STMT_VINFO_VECTYPE (stmt_info);
4472 gcc_assert (vectype);
4473 mode = TYPE_MODE (vectype);
4474
4475 /* 1. Create the reduction def-use cycle:
4476 Set the arguments of REDUCTION_PHIS, i.e., transform
4477
4478 loop:
4479 vec_def = phi <null, null> # REDUCTION_PHI
4480 VECT_DEF = vector_stmt # vectorized form of STMT
4481 ...
4482
4483 into:
4484
4485 loop:
4486 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4487 VECT_DEF = vector_stmt # vectorized form of STMT
4488 ...
4489
4490 (in case of SLP, do it for all the phis). */
4491
4492 /* Get the loop-entry arguments. */
4493 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4494 if (slp_node)
4495 {
4496 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4497 vec_initial_defs.reserve (vec_num);
4498 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4499 &vec_initial_defs, vec_num,
4500 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4501 neutral_op);
4502 }
4503 else
4504 {
4505 /* Get at the scalar def before the loop, that defines the initial value
4506 of the reduction variable. */
4507 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4508 loop_preheader_edge (loop));
4509 /* Optimize: if, for REDUC_MAX, initial_def is smaller than the base
4510 and we can't use zero for induc_val, use initial_def instead. Similarly
4511 for REDUC_MIN when initial_def is larger than the base. */
4512 if (TREE_CODE (initial_def) == INTEGER_CST
4513 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4514 == INTEGER_INDUC_COND_REDUCTION)
4515 && !integer_zerop (induc_val)
4516 && ((induc_code == MAX_EXPR
4517 && tree_int_cst_lt (initial_def, induc_val))
4518 || (induc_code == MIN_EXPR
4519 && tree_int_cst_lt (induc_val, initial_def))))
4520 induc_val = initial_def;
4521
4522 if (double_reduc)
4523 /* In case of double reduction we only create a vector variable
4524 to be put in the reduction phi node. The actual statement
4525 creation is done later in this function. */
4526 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4527 else if (nested_in_vect_loop)
4528 {
4529 /* Do not use an adjustment def as that case is not supported
4530 correctly if ncopies is not one. */
4531 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4532 vec_initial_def = vect_get_vec_def_for_operand (initial_def, stmt);
4533 }
4534 else
4535 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4536 &adjustment_def);
4537 vec_initial_defs.create (1);
4538 vec_initial_defs.quick_push (vec_initial_def);
4539 }
4540
4541 /* Set phi nodes arguments. */
4542 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4543 {
4544 tree vec_init_def = vec_initial_defs[i];
4545 tree def = vect_defs[i];
4546 for (j = 0; j < ncopies; j++)
4547 {
4548 if (j != 0)
4549 {
4550 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4551 if (nested_in_vect_loop)
4552 vec_init_def
4553 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4554 vec_init_def);
4555 }
4556
4557 /* Set the loop-entry arg of the reduction-phi. */
4558
4559 gphi *phi = as_a <gphi *> (phi_info->stmt);
4560 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4561 == INTEGER_INDUC_COND_REDUCTION)
4562 {
4563 /* Initialise the reduction phi to zero. This prevents non-zero
4564 initial values from interfering with the reduction op. */
4565 gcc_assert (ncopies == 1);
4566 gcc_assert (i == 0);
4567
4568 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4569 tree induc_val_vec
4570 = build_vector_from_val (vec_init_def_type, induc_val);
4571
4572 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4573 UNKNOWN_LOCATION);
4574 }
4575 else
4576 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4577 UNKNOWN_LOCATION);
4578
4579 /* Set the loop-latch arg for the reduction-phi. */
4580 if (j > 0)
4581 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4582
4583 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4584
4585 if (dump_enabled_p ())
4586 {
4587 dump_printf_loc (MSG_NOTE, vect_location,
4588 "transform reduction: created def-use cycle: ");
4589 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4590 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4591 }
4592 }
4593 }
4594
4595 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4596 which is updated with the current index of the loop for every match of
4597 the original loop's cond_expr (VEC_STMT). This results in a vector
4598 containing the last time the condition passed for that vector lane.
4599 The first match will be a 1 to allow 0 to be used for non-matching
4600 indexes. If there are no matches at all then the vector will be all
4601 zeroes. */
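/* For example (hypothetical lane values), with four lanes and two vector
iterations the induction variable supplies { 1, 2, 3, 4 } and then
{ 5, 6, 7, 8 }; a lane whose condition matched only in the first
iteration keeps its value from { 1, 2, 3, 4 }, and a lane that never
matched stays 0.  */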
4602 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4603 {
4604 tree indx_before_incr, indx_after_incr;
4605 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4606
4607 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4608 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4609
4610 int scalar_precision
4611 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4612 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4613 tree cr_index_vector_type = build_vector_type
4614 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4615
4616 /* First we create a simple vector induction variable which starts
4617 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4618 vector size (STEP). */
4619
4620 /* Create a {1,2,3,...} vector. */
4621 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4622
4623 /* Create a vector of the step value. */
4624 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4625 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4626
4627 /* Create an induction variable. */
4628 gimple_stmt_iterator incr_gsi;
4629 bool insert_after;
4630 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4631 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4632 insert_after, &indx_before_incr, &indx_after_incr);
4633
4634 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4635 filled with zeros (VEC_ZERO). */
4636
4637 /* Create a vector of 0s. */
4638 tree zero = build_zero_cst (cr_index_scalar_type);
4639 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4640
4641 /* Create a vector phi node. */
4642 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4643 new_phi = create_phi_node (new_phi_tree, loop->header);
4644 loop_vinfo->add_stmt (new_phi);
4645 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4646 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4647
4648 /* Now take the condition from the loop's original cond_expr
4649 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4650 every match uses values from the induction variable
4651 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4652 (NEW_PHI_TREE).
4653 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4654 the new cond_expr (INDEX_COND_EXPR). */
4655
4656 /* Duplicate the condition from vec_stmt. */
4657 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4658
4659 /* Create a conditional, where the condition is taken from vec_stmt
4660 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4661 else is the phi (NEW_PHI_TREE). */
4662 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4663 ccompare, indx_before_incr,
4664 new_phi_tree);
4665 induction_index = make_ssa_name (cr_index_vector_type);
4666 gimple *index_condition = gimple_build_assign (induction_index,
4667 index_cond_expr);
4668 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4669 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4670 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4671
4672 /* Update the phi with the vec cond. */
4673 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4674 loop_latch_edge (loop), UNKNOWN_LOCATION);
4675 }
4676
4677 /* 2. Create epilog code.
4678 The reduction epilog code operates across the elements of the vector
4679 of partial results computed by the vectorized loop.
4680 The reduction epilog code consists of:
4681
4682 step 1: compute the scalar result in a vector (v_out2)
4683 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4684 step 3: adjust the scalar result (s_out3) if needed.
4685
4686 Step 1 can be accomplished using one of the following three schemes:
4687 (scheme 1) using reduc_fn, if available.
4688 (scheme 2) using whole-vector shifts, if available.
4689 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4690 combined.
4691
4692 The overall epilog code looks like this:
4693
4694 s_out0 = phi <s_loop> # original EXIT_PHI
4695 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4696 v_out2 = reduce <v_out1> # step 1
4697 s_out3 = extract_field <v_out2, 0> # step 2
4698 s_out4 = adjust_result <s_out3> # step 3
4699
4700 (step 3 is optional, and steps 1 and 2 may be combined).
4701 Lastly, the uses of s_out0 are replaced by s_out4. */
4702
4703
4704 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4705 v_out1 = phi <VECT_DEF>
4706 Store them in NEW_PHIS. */
4707
4708 exit_bb = single_exit (loop)->dest;
4709 prev_phi_info = NULL;
4710 new_phis.create (vect_defs.length ());
4711 FOR_EACH_VEC_ELT (vect_defs, i, def)
4712 {
4713 for (j = 0; j < ncopies; j++)
4714 {
4715 tree new_def = copy_ssa_name (def);
4716 phi = create_phi_node (new_def, exit_bb);
4717 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4718 if (j == 0)
4719 new_phis.quick_push (phi);
4720 else
4721 {
4722 def = vect_get_vec_def_for_stmt_copy (dt, def);
4723 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4724 }
4725
4726 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4727 prev_phi_info = phi_info;
4728 }
4729 }
4730
4731 /* The epilogue is created for the outer-loop, i.e., for the loop being
4732 vectorized. Create exit phis for the outer loop. */
4733 if (double_reduc)
4734 {
4735 loop = outer_loop;
4736 exit_bb = single_exit (loop)->dest;
4737 inner_phis.create (vect_defs.length ());
4738 FOR_EACH_VEC_ELT (new_phis, i, phi)
4739 {
4740 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4741 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4742 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4743 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4744 PHI_RESULT (phi));
4745 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4746 inner_phis.quick_push (phi_info);
4747 new_phis[i] = outer_phi;
4748 while (STMT_VINFO_RELATED_STMT (phi_info))
4749 {
4750 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4751 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4752 outer_phi = create_phi_node (new_result, exit_bb);
4753 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4754 PHI_RESULT (phi_info->stmt));
4755 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4756 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4757 prev_phi_info = outer_phi_info;
4758 }
4759 }
4760 }
4761
4762 exit_gsi = gsi_after_labels (exit_bb);
4763
4764 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4765 (i.e. when reduc_fn is not available) and in the final adjustment
4766 code (if needed). Also get the original scalar reduction variable as
4767 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4768 represents a reduction pattern), the tree-code and scalar-def are
4769 taken from the original stmt that the pattern-stmt (STMT) replaces.
4770 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4771 are taken from STMT. */
4772
4773 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4774 if (!orig_stmt_info)
4775 {
4776 /* Regular reduction */
4777 orig_stmt_info = stmt_info;
4778 }
4779 else
4780 {
4781 /* Reduction pattern */
4782 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4783 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4784 }
4785
4786 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4787 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4788 partial results are added and not subtracted. */
4789 if (code == MINUS_EXPR)
4790 code = PLUS_EXPR;
4791
4792 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4793 scalar_type = TREE_TYPE (scalar_dest);
4794 scalar_results.create (group_size);
4795 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4796 bitsize = TYPE_SIZE (scalar_type);
4797
4798 /* In case this is a reduction in an inner-loop while vectorizing an outer
4799 loop - we don't need to extract a single scalar result at the end of the
4800 inner-loop (unless it is double reduction, i.e., the use of reduction is
4801 outside the outer-loop). The final vector of partial results will be used
4802 in the vectorized outer-loop, or reduced to a scalar result at the end of
4803 the outer-loop. */
4804 if (nested_in_vect_loop && !double_reduc)
4805 goto vect_finalize_reduction;
4806
4807 /* SLP reduction without reduction chain, e.g.,
4808 # a1 = phi <a2, a0>
4809 # b1 = phi <b2, b0>
4810 a2 = operation (a1)
4811 b2 = operation (b1) */
4812 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4813
4814 /* True if we should implement SLP_REDUC using native reduction operations
4815 instead of scalar operations. */
4816 direct_slp_reduc = (reduc_fn != IFN_LAST
4817 && slp_reduc
4818 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4819
4820 /* In case of reduction chain, e.g.,
4821 # a1 = phi <a3, a0>
4822 a2 = operation (a1)
4823 a3 = operation (a2),
4824
4825 we may end up with more than one vector result. Here we reduce them to
4826 one vector. */
4827 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4828 {
4829 tree first_vect = PHI_RESULT (new_phis[0]);
4830 gassign *new_vec_stmt = NULL;
4831 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4832 for (k = 1; k < new_phis.length (); k++)
4833 {
4834 gimple *next_phi = new_phis[k];
4835 tree second_vect = PHI_RESULT (next_phi);
4836 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4837 new_vec_stmt = gimple_build_assign (tem, code,
4838 first_vect, second_vect);
4839 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4840 first_vect = tem;
4841 }
4842
4843 new_phi_result = first_vect;
4844 if (new_vec_stmt)
4845 {
4846 new_phis.truncate (0);
4847 new_phis.safe_push (new_vec_stmt);
4848 }
4849 }
4850 /* Likewise if we couldn't use a single defuse cycle. */
4851 else if (ncopies > 1)
4852 {
4853 gcc_assert (new_phis.length () == 1);
4854 tree first_vect = PHI_RESULT (new_phis[0]);
4855 gassign *new_vec_stmt = NULL;
4856 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4857 gimple *next_phi = new_phis[0];
4858 for (int k = 1; k < ncopies; ++k)
4859 {
4860 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4861 tree second_vect = PHI_RESULT (next_phi);
4862 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4863 new_vec_stmt = gimple_build_assign (tem, code,
4864 first_vect, second_vect);
4865 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4866 first_vect = tem;
4867 }
4868 new_phi_result = first_vect;
4869 new_phis.truncate (0);
4870 new_phis.safe_push (new_vec_stmt);
4871 }
4872 else
4873 new_phi_result = PHI_RESULT (new_phis[0]);
4874
4875 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4876 && reduc_fn != IFN_LAST)
4877 {
4878 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4879 various data values where the condition matched and another vector
4880 (INDUCTION_INDEX) containing all the indexes of those matches. We
4881 need to extract the last matching index (which will be the index with
4882 the highest value) and use this to index into the data vector.
4883 For the case where there were no matches, the data vector will contain
4884 all default values and the index vector will be all zeros. */
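/* For instance (hypothetical contents), if NEW_PHI_RESULT is
{ d0, d1, d2, d3 } and INDUCTION_INDEX is { 0, 7, 0, 3 }, the maximum
index 7 identifies lane 1, so d1 is the value to extract.  */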
4885
4886 /* Get various versions of the type of the vector of indexes. */
4887 tree index_vec_type = TREE_TYPE (induction_index);
4888 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4889 tree index_scalar_type = TREE_TYPE (index_vec_type);
4890 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4891 (index_vec_type);
4892
4893 /* Get an unsigned integer version of the type of the data vector. */
4894 int scalar_precision
4895 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4896 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4897 tree vectype_unsigned = build_vector_type
4898 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4899
4900 /* First we need to create a vector (ZERO_VEC) of zeros and another
4901 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4902 can create using a MAX reduction and then expanding.
4903 In the case where the loop never made any matches, the max index will
4904 be zero. */
4905
4906 /* Vector of {0, 0, 0,...}. */
4907 tree zero_vec = make_ssa_name (vectype);
4908 tree zero_vec_rhs = build_zero_cst (vectype);
4909 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4910 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4911
4912 /* Find maximum value from the vector of found indexes. */
4913 tree max_index = make_ssa_name (index_scalar_type);
4914 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4915 1, induction_index);
4916 gimple_call_set_lhs (max_index_stmt, max_index);
4917 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4918
4919 /* Vector of {max_index, max_index, max_index,...}. */
4920 tree max_index_vec = make_ssa_name (index_vec_type);
4921 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4922 max_index);
4923 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4924 max_index_vec_rhs);
4925 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4926
4927 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4928 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4929 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4930 otherwise. Only one value should match, resulting in a vector
4931 (VEC_COND) with one data value and the rest zeros.
4932 In the case where the loop never made any matches, every index will
4933 match, resulting in a vector with all data values (which will all be
4934 the default value). */
4935
4936 /* Compare the max index vector to the vector of found indexes to find
4937 the position of the max value. */
4938 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4939 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4940 induction_index,
4941 max_index_vec);
4942 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4943
4944 /* Use the compare to choose either values from the data vector or
4945 zero. */
4946 tree vec_cond = make_ssa_name (vectype);
4947 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4948 vec_compare, new_phi_result,
4949 zero_vec);
4950 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4951
4952 /* Finally we need to extract the data value from the vector (VEC_COND)
4953 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4954 reduction, but because this doesn't exist, we can use a MAX reduction
4955 instead. The data value might be signed or a float so we need to cast
4956 it first.
4957 In the case where the loop never made any matches, the data values are
4958 all identical, and so will reduce down correctly. */
4959
4960 /* Make the matched data values unsigned. */
4961 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4962 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4963 vec_cond);
4964 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4965 VIEW_CONVERT_EXPR,
4966 vec_cond_cast_rhs);
4967 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4968
4969 /* Reduce down to a scalar value. */
4970 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4971 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4972 1, vec_cond_cast);
4973 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4974 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4975
4976 /* Convert the reduced value back to the result type and set as the
4977 result. */
4978 gimple_seq stmts = NULL;
4979 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4980 data_reduc);
4981 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4982 scalar_results.safe_push (new_temp);
4983 }
4984 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4985 && reduc_fn == IFN_LAST)
4986 {
4987 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4988 idx = 0;
4989 idx_val = induction_index[0];
4990 val = data_reduc[0];
4991 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4992 if (induction_index[i] > idx_val)
4993 val = data_reduc[i], idx_val = induction_index[i];
4994 return val; */
4995
4996 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4997 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4998 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4999 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5000 /* Enforced by vectorizable_reduction, which ensures we have target
5001 support before allowing a conditional reduction on variable-length
5002 vectors. */
5003 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5004 tree idx_val = NULL_TREE, val = NULL_TREE;
5005 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5006 {
5007 tree old_idx_val = idx_val;
5008 tree old_val = val;
5009 idx_val = make_ssa_name (idx_eltype);
5010 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5011 build3 (BIT_FIELD_REF, idx_eltype,
5012 induction_index,
5013 bitsize_int (el_size),
5014 bitsize_int (off)));
5015 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5016 val = make_ssa_name (data_eltype);
5017 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5018 build3 (BIT_FIELD_REF,
5019 data_eltype,
5020 new_phi_result,
5021 bitsize_int (el_size),
5022 bitsize_int (off)));
5023 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5024 if (off != 0)
5025 {
5026 tree new_idx_val = idx_val;
5027 tree new_val = val;
5028 if (off != v_size - el_size)
5029 {
5030 new_idx_val = make_ssa_name (idx_eltype);
5031 epilog_stmt = gimple_build_assign (new_idx_val,
5032 MAX_EXPR, idx_val,
5033 old_idx_val);
5034 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5035 }
5036 new_val = make_ssa_name (data_eltype);
5037 epilog_stmt = gimple_build_assign (new_val,
5038 COND_EXPR,
5039 build2 (GT_EXPR,
5040 boolean_type_node,
5041 idx_val,
5042 old_idx_val),
5043 val, old_val);
5044 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5045 idx_val = new_idx_val;
5046 val = new_val;
5047 }
5048 }
5049 /* Convert the reduced value back to the result type and set as the
5050 result. */
5051 gimple_seq stmts = NULL;
5052 val = gimple_convert (&stmts, scalar_type, val);
5053 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5054 scalar_results.safe_push (val);
5055 }
5056
5057 /* 2.3 Create the reduction code, using one of the three schemes described
5058 above. In SLP we simply need to extract all the elements from the
5059 vector (without reducing them), so we use scalar shifts. */
5060 else if (reduc_fn != IFN_LAST && !slp_reduc)
5061 {
5062 tree tmp;
5063 tree vec_elem_type;
5064
5065 /* Case 1: Create:
5066 v_out2 = reduc_expr <v_out1> */
5067
5068 if (dump_enabled_p ())
5069 dump_printf_loc (MSG_NOTE, vect_location,
5070 "Reduce using direct vector reduction.\n");
5071
5072 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5073 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5074 {
5075 tree tmp_dest
5076 = vect_create_destination_var (scalar_dest, vec_elem_type);
5077 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5078 new_phi_result);
5079 gimple_set_lhs (epilog_stmt, tmp_dest);
5080 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5081 gimple_set_lhs (epilog_stmt, new_temp);
5082 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5083
5084 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5085 new_temp);
5086 }
5087 else
5088 {
5089 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5090 new_phi_result);
5091 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5092 }
5093
5094 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5095 gimple_set_lhs (epilog_stmt, new_temp);
5096 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5097
5098 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5099 == INTEGER_INDUC_COND_REDUCTION)
5100 && !operand_equal_p (initial_def, induc_val, 0))
5101 {
5102 /* Earlier we set the initial value to be a vector of induc_val
5103 values. Check the result and if it is induc_val then replace
5104 it with the original initial value, unless induc_val is
5105 the same as initial_def already. */
5106 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5107 induc_val);
5108
5109 tmp = make_ssa_name (new_scalar_dest);
5110 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5111 initial_def, new_temp);
5112 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5113 new_temp = tmp;
5114 }
5115
5116 scalar_results.safe_push (new_temp);
5117 }
5118 else if (direct_slp_reduc)
5119 {
5120 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5121 with the elements for other SLP statements replaced with the
5122 neutral value. We can then do a normal reduction on each vector. */
5123
5124 /* Enforced by vectorizable_reduction. */
5125 gcc_assert (new_phis.length () == 1);
5126 gcc_assert (pow2p_hwi (group_size));
5127
5128 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5129 vec<stmt_vec_info> orig_phis
5130 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5131 gimple_seq seq = NULL;
5132
5133 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5134 and the same element size as VECTYPE. */
5135 tree index = build_index_vector (vectype, 0, 1);
5136 tree index_type = TREE_TYPE (index);
5137 tree index_elt_type = TREE_TYPE (index_type);
5138 tree mask_type = build_same_sized_truth_vector_type (index_type);
5139
5140 /* Create a vector that, for each element, identifies which of
5141 the REDUC_GROUP_SIZE results should use it. */
5142 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5143 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5144 build_vector_from_val (index_type, index_mask));
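5145 /* For example, with REDUC_GROUP_SIZE == 2 and a four-element vector,
5145 INDEX is {0, 1, 2, 3} & {1, 1, 1, 1} = {0, 1, 0, 1}; the iteration
5145 below for i == 0 then keeps elements 0 and 2 of NEW_PHI_RESULT
5145 (replacing the others with the identity) and reduces them into the
5145 scalar result for the first SLP statement. */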
5145
5146 /* Get a neutral vector value. This is simply a splat of the neutral
5147 scalar value if we have one, otherwise the initial scalar value
5148 is itself a neutral value. */
5149 tree vector_identity = NULL_TREE;
5150 if (neutral_op)
5151 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5152 neutral_op);
5153 for (unsigned int i = 0; i < group_size; ++i)
5154 {
5155 /* If there's no universal neutral value, we can use the
5156 initial scalar value from the original PHI. This is used
5157 for MIN and MAX reduction, for example. */
5158 if (!neutral_op)
5159 {
5160 tree scalar_value
5161 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5162 loop_preheader_edge (loop));
5163 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5164 scalar_value);
5165 }
5166
5167 /* Calculate the equivalent of:
5168
5169 sel[j] = (index[j] == i);
5170
5171 which selects the elements of NEW_PHI_RESULT that should
5172 be included in the result. */
5173 tree compare_val = build_int_cst (index_elt_type, i);
5174 compare_val = build_vector_from_val (index_type, compare_val);
5175 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5176 index, compare_val);
5177
5178 /* Calculate the equivalent of:
5179
5180 vec = sel ? new_phi_result : vector_identity;
5181
5182 VEC is now suitable for a full vector reduction. */
5183 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5184 sel, new_phi_result, vector_identity);
5185
5186 /* Do the reduction and convert it to the appropriate type. */
5187 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5188 TREE_TYPE (vectype), vec);
5189 scalar = gimple_convert (&seq, scalar_type, scalar);
5190 scalar_results.safe_push (scalar);
5191 }
5192 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5193 }
5194 else
5195 {
5196 bool reduce_with_shift;
5197 tree vec_temp;
5198
5199 /* COND reductions all do the final reduction with MAX_EXPR
5200 or MIN_EXPR. */
5201 if (code == COND_EXPR)
5202 {
5203 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5204 == INTEGER_INDUC_COND_REDUCTION)
5205 code = induc_code;
5206 else
5207 code = MAX_EXPR;
5208 }
5209
5210 /* See if the target wants to do the final (shift) reduction
5211 in a vector mode of smaller size and first reduce upper/lower
5212 halves against each other. */
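5213 /* For example, a V8SI accumulator may first be split into two V4SI
5213 halves that are combined with CODE, with the rest of the reduction
5213 then done on the V4SI result (see the loop below). */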
5213 enum machine_mode mode1 = mode;
5214 tree vectype1 = vectype;
5215 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5216 unsigned sz1 = sz;
5217 if (!slp_reduc
5218 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5219 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5220
5221 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5222 reduce_with_shift = have_whole_vector_shift (mode1);
5223 if (!VECTOR_MODE_P (mode1))
5224 reduce_with_shift = false;
5225 else
5226 {
5227 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5228 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5229 reduce_with_shift = false;
5230 }
5231
5232 /* First reduce the vector to the vector size we should do the shift
5233 reduction on, by combining upper and lower halves. */
5234 new_temp = new_phi_result;
5235 while (sz > sz1)
5236 {
5237 gcc_assert (!slp_reduc);
5238 sz /= 2;
5239 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5240
5241 /* The target has to make sure we support lowpart/highpart
5242 extraction, either via direct vector extract or through
5243 punning to an integer mode. */
5244 tree dst1, dst2;
5245 if (convert_optab_handler (vec_extract_optab,
5246 TYPE_MODE (TREE_TYPE (new_temp)),
5247 TYPE_MODE (vectype1))
5248 != CODE_FOR_nothing)
5249 {
5250 /* Extract sub-vectors directly once vec_extract becomes
5251 a conversion optab. */
5252 dst1 = make_ssa_name (vectype1);
5253 epilog_stmt
5254 = gimple_build_assign (dst1, BIT_FIELD_REF,
5255 build3 (BIT_FIELD_REF, vectype1,
5256 new_temp, TYPE_SIZE (vectype1),
5257 bitsize_int (0)));
5258 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5259 dst2 = make_ssa_name (vectype1);
5260 epilog_stmt
5261 = gimple_build_assign (dst2, BIT_FIELD_REF,
5262 build3 (BIT_FIELD_REF, vectype1,
5263 new_temp, TYPE_SIZE (vectype1),
5264 bitsize_int (sz * BITS_PER_UNIT)));
5265 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5266 }
5267 else
5268 {
5269 /* Extract via punning to an appropriately sized integer mode
5270 vector. */
5271 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5272 1);
5273 tree etype = build_vector_type (eltype, 2);
5274 gcc_assert (convert_optab_handler (vec_extract_optab,
5275 TYPE_MODE (etype),
5276 TYPE_MODE (eltype))
5277 != CODE_FOR_nothing);
5278 tree tem = make_ssa_name (etype);
5279 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5280 build1 (VIEW_CONVERT_EXPR,
5281 etype, new_temp));
5282 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5283 new_temp = tem;
5284 tem = make_ssa_name (eltype);
5285 epilog_stmt
5286 = gimple_build_assign (tem, BIT_FIELD_REF,
5287 build3 (BIT_FIELD_REF, eltype,
5288 new_temp, TYPE_SIZE (eltype),
5289 bitsize_int (0)));
5290 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5291 dst1 = make_ssa_name (vectype1);
5292 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5293 build1 (VIEW_CONVERT_EXPR,
5294 vectype1, tem));
5295 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5296 tem = make_ssa_name (eltype);
5297 epilog_stmt
5298 = gimple_build_assign (tem, BIT_FIELD_REF,
5299 build3 (BIT_FIELD_REF, eltype,
5300 new_temp, TYPE_SIZE (eltype),
5301 bitsize_int (sz * BITS_PER_UNIT)));
5302 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5303 dst2 = make_ssa_name (vectype1);
5304 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5305 build1 (VIEW_CONVERT_EXPR,
5306 vectype1, tem));
5307 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5308 }
5309
5310 new_temp = make_ssa_name (vectype1);
5311 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5312 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5313 }
5314
5315 if (reduce_with_shift && !slp_reduc)
5316 {
5317 int element_bitsize = tree_to_uhwi (bitsize);
5318 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5319 for variable-length vectors and also requires direct target support
5320 for loop reductions. */
5321 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5322 int nelements = vec_size_in_bits / element_bitsize;
5323 vec_perm_builder sel;
5324 vec_perm_indices indices;
5325
5326 int elt_offset;
5327
5328 tree zero_vec = build_zero_cst (vectype1);
5329 /* Case 2: Create:
5330 for (offset = nelements/2; offset >= 1; offset/=2)
5331 {
5332 Create: va' = vec_shift <va, offset>
5333 Create: va = vop <va, va'>
5334 } */
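5335 /* For example, for a PLUS reduction of a V4SI accumulator
5335 <a0, a1, a2, a3>: shifting by two elements and adding gives
5335 <a0+a2, a1+a3, a2, a3>; shifting by one element and adding then
5335 leaves the complete sum in element 0 (shifted-in elements are
5335 zero). */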
5335
5336 tree rhs;
5337
5338 if (dump_enabled_p ())
5339 dump_printf_loc (MSG_NOTE, vect_location,
5340 "Reduce using vector shifts\n");
5341
5342 mode1 = TYPE_MODE (vectype1);
5343 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5344 for (elt_offset = nelements / 2;
5345 elt_offset >= 1;
5346 elt_offset /= 2)
5347 {
5348 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5349 indices.new_vector (sel, 2, nelements);
5350 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5351 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5352 new_temp, zero_vec, mask);
5353 new_name = make_ssa_name (vec_dest, epilog_stmt);
5354 gimple_assign_set_lhs (epilog_stmt, new_name);
5355 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5356
5357 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5358 new_temp);
5359 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5360 gimple_assign_set_lhs (epilog_stmt, new_temp);
5361 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5362 }
5363
5364 /* 2.4 Extract the final scalar result. Create:
5365 s_out3 = extract_field <v_out2, bitpos> */
5366
5367 if (dump_enabled_p ())
5368 dump_printf_loc (MSG_NOTE, vect_location,
5369 "extract scalar result\n");
5370
5371 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5372 bitsize, bitsize_zero_node);
5373 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5374 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5375 gimple_assign_set_lhs (epilog_stmt, new_temp);
5376 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5377 scalar_results.safe_push (new_temp);
5378 }
5379 else
5380 {
5381 /* Case 3: Create:
5382 s = extract_field <v_out2, 0>
5383 for (offset = element_size;
5384 offset < vector_size;
5385 offset += element_size;)
5386 {
5387 Create: s' = extract_field <v_out2, offset>
5388 Create: s = op <s, s'> // For non SLP cases
5389 } */
5390
5391 if (dump_enabled_p ())
5392 dump_printf_loc (MSG_NOTE, vect_location,
5393 "Reduce using scalar code.\n");
5394
5395 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5396 int element_bitsize = tree_to_uhwi (bitsize);
5397 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5398 {
5399 int bit_offset;
5400 if (gimple_code (new_phi) == GIMPLE_PHI)
5401 vec_temp = PHI_RESULT (new_phi);
5402 else
5403 vec_temp = gimple_assign_lhs (new_phi);
5404 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5405 bitsize_zero_node);
5406 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5407 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5408 gimple_assign_set_lhs (epilog_stmt, new_temp);
5409 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5410
5411 /* In SLP we don't need to apply the reduction operation, so we just
5412 collect s' values in SCALAR_RESULTS. */
5413 if (slp_reduc)
5414 scalar_results.safe_push (new_temp);
5415
5416 for (bit_offset = element_bitsize;
5417 bit_offset < vec_size_in_bits;
5418 bit_offset += element_bitsize)
5419 {
5420 tree bitpos = bitsize_int (bit_offset);
5421 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5422 bitsize, bitpos);
5423
5424 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5425 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5426 gimple_assign_set_lhs (epilog_stmt, new_name);
5427 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5428
5429 if (slp_reduc)
5430 {
5431 /* In SLP we don't need to apply the reduction operation, so
5432 we just collect s' values in SCALAR_RESULTS. */
5433 new_temp = new_name;
5434 scalar_results.safe_push (new_name);
5435 }
5436 else
5437 {
5438 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5439 new_name, new_temp);
5440 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5441 gimple_assign_set_lhs (epilog_stmt, new_temp);
5442 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5443 }
5444 }
5445 }
5446
5447 /* The only case where we need to reduce scalar results in SLP is
5448 unrolling. If the size of SCALAR_RESULTS is greater than
5449 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5450 REDUC_GROUP_SIZE. */
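5451 /* For example, with REDUC_GROUP_SIZE == 2 and four collected scalar
5451 results {r0, r1, r2, r3}, r2 is combined into r0 and r3 into r1,
5451 leaving one result per SLP statement. */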
5451 if (slp_reduc)
5452 {
5453 tree res, first_res, new_res;
5454 gimple *new_stmt;
5455
5456 /* Reduce multiple scalar results in case of SLP unrolling. */
5457 for (j = group_size; scalar_results.iterate (j, &res);
5458 j++)
5459 {
5460 first_res = scalar_results[j % group_size];
5461 new_stmt = gimple_build_assign (new_scalar_dest, code,
5462 first_res, res);
5463 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5464 gimple_assign_set_lhs (new_stmt, new_res);
5465 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5466 scalar_results[j % group_size] = new_res;
5467 }
5468 }
5469 else
5470 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5471 scalar_results.safe_push (new_temp);
5472 }
5473
5474 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5475 == INTEGER_INDUC_COND_REDUCTION)
5476 && !operand_equal_p (initial_def, induc_val, 0))
5477 {
5478 /* Earlier we set the initial value to be a vector of induc_val
5479 values. Check the result and if it is induc_val then replace
5480 it with the original initial value, unless induc_val is
5481 the same as initial_def already. */
5482 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5483 induc_val);
5484
5485 tree tmp = make_ssa_name (new_scalar_dest);
5486 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5487 initial_def, new_temp);
5488 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5489 scalar_results[0] = tmp;
5490 }
5491 }
5492
5493 vect_finalize_reduction:
5494
5495 if (double_reduc)
5496 loop = loop->inner;
5497
5498 /* 2.5 Adjust the final result by the initial value of the reduction
5499 variable. (When no such adjustment is needed,
5500 'adjustment_def' is zero.) For example, if CODE is PLUS we create:
5501 new_temp = loop_exit_def + adjustment_def */
5502
5503 if (adjustment_def)
5504 {
5505 gcc_assert (!slp_reduc);
5506 if (nested_in_vect_loop)
5507 {
5508 new_phi = new_phis[0];
5509 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5510 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5511 new_dest = vect_create_destination_var (scalar_dest, vectype);
5512 }
5513 else
5514 {
5515 new_temp = scalar_results[0];
5516 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5517 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5518 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5519 }
5520
5521 epilog_stmt = gimple_build_assign (new_dest, expr);
5522 new_temp = make_ssa_name (new_dest, epilog_stmt);
5523 gimple_assign_set_lhs (epilog_stmt, new_temp);
5524 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5525 if (nested_in_vect_loop)
5526 {
5527 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5528 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5529 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5530
5531 if (!double_reduc)
5532 scalar_results.quick_push (new_temp);
5533 else
5534 scalar_results[0] = new_temp;
5535 }
5536 else
5537 scalar_results[0] = new_temp;
5538
5539 new_phis[0] = epilog_stmt;
5540 }
5541
5542 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5543 phis with new adjusted scalar results, i.e., replace use <s_out0>
5544 with use <s_out4>.
5545
5546 Transform:
5547 loop_exit:
5548 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5549 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5550 v_out2 = reduce <v_out1>
5551 s_out3 = extract_field <v_out2, 0>
5552 s_out4 = adjust_result <s_out3>
5553 use <s_out0>
5554 use <s_out0>
5555
5556 into:
5557
5558 loop_exit:
5559 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5560 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5561 v_out2 = reduce <v_out1>
5562 s_out3 = extract_field <v_out2, 0>
5563 s_out4 = adjust_result <s_out3>
5564 use <s_out4>
5565 use <s_out4> */
5566
5567
5568 /* In an SLP reduction chain we reduce vector results into one vector if
5569 necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5570 LHS of the last stmt in the reduction chain, since we are looking for
5571 the loop exit phi node. */
5572 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5573 {
5574 stmt_vec_info dest_stmt_info
5575 = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5576 /* Handle reduction patterns. */
5577 if (STMT_VINFO_RELATED_STMT (dest_stmt_info))
5578 dest_stmt_info = STMT_VINFO_RELATED_STMT (dest_stmt_info);
5579
5580 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5581 group_size = 1;
5582 }
5583
5584 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5585 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5586 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5587 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5588 correspond to the first vector stmt, etc.
5589 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
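5590 /* For example, with REDUC_GROUP_SIZE == 4 and two new vector stmts,
5590 RATIO is 2: scalar results 0-1 belong to the first vector stmt and
5590 results 2-3 to the second. */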
5590 if (group_size > new_phis.length ())
5591 {
5592 ratio = group_size / new_phis.length ();
5593 gcc_assert (!(group_size % new_phis.length ()));
5594 }
5595 else
5596 ratio = 1;
5597
5598 for (k = 0; k < group_size; k++)
5599 {
5600 if (k % ratio == 0)
5601 {
5602 epilog_stmt = new_phis[k / ratio];
5603 reduction_phi_info = reduction_phis[k / ratio];
5604 if (double_reduc)
5605 inner_phi = inner_phis[k / ratio];
5606 }
5607
5608 if (slp_reduc)
5609 {
5610 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5611
5612 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5613 /* SLP statements can't participate in patterns. */
5614 gcc_assert (!orig_stmt_info);
5615 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5616 }
5617
5618 phis.create (3);
5619 /* Find the loop-closed-use at the loop exit of the original scalar
5620 result. (The reduction result is expected to have two immediate uses -
5621 one at the latch block, and one at the loop exit). */
5622 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5623 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5624 && !is_gimple_debug (USE_STMT (use_p)))
5625 phis.safe_push (USE_STMT (use_p));
5626
5627 /* While we expect to have found an exit_phi because of loop-closed-ssa
5628 form, we can end up without one if the scalar cycle is dead. */
5629
5630 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5631 {
5632 if (outer_loop)
5633 {
5634 stmt_vec_info exit_phi_vinfo
5635 = loop_vinfo->lookup_stmt (exit_phi);
5636 gphi *vect_phi;
5637
5638 /* FORNOW. We do not currently support the case where an inner-loop
5639 reduction is not used in the outer-loop (but only outside the
5640 outer-loop), unless it is a double reduction. */
5641 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5642 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5643 || double_reduc);
5644
5645 if (double_reduc)
5646 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5647 else
5648 STMT_VINFO_VEC_STMT (exit_phi_vinfo)
5649 = vinfo_for_stmt (epilog_stmt);
5650 if (!double_reduc
5651 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5652 != vect_double_reduction_def)
5653 continue;
5654
5655 /* Handle double reduction:
5656
5657 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5658 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5659 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5660 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5661
5662 At that point the regular reduction (stmt2 and stmt3) is
5663 already vectorized, as well as the exit phi node, stmt4.
5664 Here we vectorize the phi node of double reduction, stmt1, and
5665 update all relevant statements. */
5666
5667 /* Go through all the uses of s2 to find double reduction phi
5668 node, i.e., stmt1 above. */
5669 orig_name = PHI_RESULT (exit_phi);
5670 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5671 {
5672 stmt_vec_info use_stmt_vinfo;
5673 tree vect_phi_init, preheader_arg, vect_phi_res;
5674 basic_block bb = gimple_bb (use_stmt);
5675
5676 /* Check that USE_STMT is really a double reduction phi
5677 node. */
5678 if (gimple_code (use_stmt) != GIMPLE_PHI
5679 || gimple_phi_num_args (use_stmt) != 2
5680 || bb->loop_father != outer_loop)
5681 continue;
5682 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5683 if (!use_stmt_vinfo
5684 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5685 != vect_double_reduction_def)
5686 continue;
5687
5688 /* Create vector phi node for double reduction:
5689 vs1 = phi <vs0, vs2>
5690 vs1 was created previously in this function by a call to
5691 vect_get_vec_def_for_operand and is stored in
5692 vec_initial_def;
5693 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5694 vs0 is created here. */
5695
5696 /* Create vector phi node. */
5697 vect_phi = create_phi_node (vec_initial_def, bb);
5698 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5699
5700 /* Create vs0 - initial def of the double reduction phi. */
5701 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5702 loop_preheader_edge (outer_loop));
5703 vect_phi_init = get_initial_def_for_reduction
5704 (stmt, preheader_arg, NULL);
5705
5706 /* Update phi node arguments with vs0 and vs2. */
5707 add_phi_arg (vect_phi, vect_phi_init,
5708 loop_preheader_edge (outer_loop),
5709 UNKNOWN_LOCATION);
5710 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5711 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5712 if (dump_enabled_p ())
5713 {
5714 dump_printf_loc (MSG_NOTE, vect_location,
5715 "created double reduction phi node: ");
5716 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5717 }
5718
5719 vect_phi_res = PHI_RESULT (vect_phi);
5720
5721 /* Replace the use, i.e., set the correct vs1 in the regular
5722 reduction phi node. FORNOW, NCOPIES is always 1, so the
5723 loop is redundant. */
5724 stmt_vec_info use_info = reduction_phi_info;
5725 for (j = 0; j < ncopies; j++)
5726 {
5727 edge pr_edge = loop_preheader_edge (loop);
5728 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5729 pr_edge->dest_idx, vect_phi_res);
5730 use_info = STMT_VINFO_RELATED_STMT (use_info);
5731 }
5732 }
5733 }
5734 }
5735
5736 phis.release ();
5737 if (nested_in_vect_loop)
5738 {
5739 if (double_reduc)
5740 loop = outer_loop;
5741 else
5742 continue;
5743 }
5744
5745 phis.create (3);
5746 /* Find the loop-closed-use at the loop exit of the original scalar
5747 result. (The reduction result is expected to have two immediate uses,
5748 one at the latch block, and one at the loop exit). For double
5749 reductions we are looking for exit phis of the outer loop. */
5750 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5751 {
5752 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5753 {
5754 if (!is_gimple_debug (USE_STMT (use_p)))
5755 phis.safe_push (USE_STMT (use_p));
5756 }
5757 else
5758 {
5759 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5760 {
5761 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5762
5763 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5764 {
5765 if (!flow_bb_inside_loop_p (loop,
5766 gimple_bb (USE_STMT (phi_use_p)))
5767 && !is_gimple_debug (USE_STMT (phi_use_p)))
5768 phis.safe_push (USE_STMT (phi_use_p));
5769 }
5770 }
5771 }
5772 }
5773
5774 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5775 {
5776 /* Replace the uses: */
5777 orig_name = PHI_RESULT (exit_phi);
5778 scalar_result = scalar_results[k];
5779 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5780 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5781 SET_USE (use_p, scalar_result);
5782 }
5783
5784 phis.release ();
5785 }
5786 }
5787
5788 /* Return a vector of type VECTYPE that is equal to the vector select
5789 operation "MASK ? VEC : IDENTITY". Insert the select statements
5790 before GSI. */
5791
5792 static tree
5793 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5794 tree vec, tree identity)
5795 {
5796 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5797 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5798 mask, vec, identity);
5799 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5800 return cond;
5801 }
5802
5803 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5804 order, starting with LHS. Insert the extraction statements before GSI and
5805 associate the new scalar SSA names with variable SCALAR_DEST.
5806 Return the SSA name for the result. */
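5807 /* For a four-element VECTOR_RHS this expands to roughly:
5807 s_0 = BIT_FIELD_REF <vector_rhs, bitsize, 0>;
5807 tmp_0 = lhs CODE s_0;
5807 s_1 = BIT_FIELD_REF <vector_rhs, bitsize, bitsize>;
5807 tmp_1 = tmp_0 CODE s_1;
5807 ...
5807 and the final accumulator is returned. */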
5807
5808 static tree
5809 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5810 tree_code code, tree lhs, tree vector_rhs)
5811 {
5812 tree vectype = TREE_TYPE (vector_rhs);
5813 tree scalar_type = TREE_TYPE (vectype);
5814 tree bitsize = TYPE_SIZE (scalar_type);
5815 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5816 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5817
5818 for (unsigned HOST_WIDE_INT bit_offset = 0;
5819 bit_offset < vec_size_in_bits;
5820 bit_offset += element_bitsize)
5821 {
5822 tree bitpos = bitsize_int (bit_offset);
5823 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5824 bitsize, bitpos);
5825
5826 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5827 rhs = make_ssa_name (scalar_dest, stmt);
5828 gimple_assign_set_lhs (stmt, rhs);
5829 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5830
5831 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5832 tree new_name = make_ssa_name (scalar_dest, stmt);
5833 gimple_assign_set_lhs (stmt, new_name);
5834 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5835 lhs = new_name;
5836 }
5837 return lhs;
5838 }
5839
5840 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
5841 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5842 statement. CODE is the operation performed by STMT and OPS are
5843 its scalar operands. REDUC_INDEX is the index of the operand in
5844 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5845 implements in-order reduction, or IFN_LAST if we should open-code it.
5846 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5847 that should be used to control the operation in a fully-masked loop. */
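5848 /* For example, an in-order sum "res = init; for (...) res += a[i];"
5848 is emitted here as, per vector VA of A,
5848 res = IFN_FOLD_LEFT_PLUS (res, va);
5848 when the target provides the IFN, or as an element-by-element
5848 expansion via vect_expand_fold_left otherwise, so the scalar
5848 association order is preserved. */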
5848
5849 static bool
5850 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5851 stmt_vec_info *vec_stmt, slp_tree slp_node,
5852 gimple *reduc_def_stmt,
5853 tree_code code, internal_fn reduc_fn,
5854 tree ops[3], tree vectype_in,
5855 int reduc_index, vec_loop_masks *masks)
5856 {
5857 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5858 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5859 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5860 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5861 stmt_vec_info new_stmt_info = NULL;
5862
5863 int ncopies;
5864 if (slp_node)
5865 ncopies = 1;
5866 else
5867 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5868
5869 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5870 gcc_assert (ncopies == 1);
5871 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5872 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5873 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5874 == FOLD_LEFT_REDUCTION);
5875
5876 if (slp_node)
5877 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5878 TYPE_VECTOR_SUBPARTS (vectype_in)));
5879
5880 tree op0 = ops[1 - reduc_index];
5881
5882 int group_size = 1;
5883 stmt_vec_info scalar_dest_def_info;
5884 auto_vec<tree> vec_oprnds0;
5885 if (slp_node)
5886 {
5887 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5888 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5889 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5890 }
5891 else
5892 {
5893 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5894 vec_oprnds0.create (1);
5895 vec_oprnds0.quick_push (loop_vec_def0);
5896 scalar_dest_def_info = stmt_info;
5897 }
5898
5899 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5900 tree scalar_type = TREE_TYPE (scalar_dest);
5901 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5902
5903 int vec_num = vec_oprnds0.length ();
5904 gcc_assert (vec_num == 1 || slp_node);
5905 tree vec_elem_type = TREE_TYPE (vectype_out);
5906 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5907
5908 tree vector_identity = NULL_TREE;
5909 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5910 vector_identity = build_zero_cst (vectype_out);
5911
5912 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5913 int i;
5914 tree def0;
5915 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5916 {
5917 gimple *new_stmt;
5918 tree mask = NULL_TREE;
5919 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5920 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5921
5922 /* Handle MINUS by adding the negative. */
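5923 /* (I.e. x - a[i] is computed as x + (-a[i]), matching the PLUS_EXPR
5923 form that ORIG_CODE is mapped to in vectorizable_reduction.) */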
5923 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5924 {
5925 tree negated = make_ssa_name (vectype_out);
5926 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5927 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5928 def0 = negated;
5929 }
5930
5931 if (mask)
5932 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5933 vector_identity);
5934
5935 /* On the first iteration the input is simply the scalar phi
5936 result, and for subsequent iterations it is the output of
5937 the preceding operation. */
5938 if (reduc_fn != IFN_LAST)
5939 {
5940 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5941 /* For chained SLP reductions the output of the previous reduction
5942 operation serves as the input of the next. For the final statement
5943 the output cannot be a temporary - we reuse the original
5944 scalar destination of the last statement. */
5945 if (i != vec_num - 1)
5946 {
5947 gimple_set_lhs (new_stmt, scalar_dest_var);
5948 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5949 gimple_set_lhs (new_stmt, reduc_var);
5950 }
5951 }
5952 else
5953 {
5954 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5955 reduc_var, def0);
5956 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5957 /* Remove the statement, so that we can use the same code paths
5958 as for statements that we've just created. */
5959 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5960 gsi_remove (&tmp_gsi, false);
5961 }
5962
5963 if (i == vec_num - 1)
5964 {
5965 gimple_set_lhs (new_stmt, scalar_dest);
5966 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5967 new_stmt);
5968 }
5969 else
5970 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5971 new_stmt, gsi);
5972
5973 if (slp_node)
5974 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5975 }
5976
5977 if (!slp_node)
5978 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5979
5980 return true;
5981 }
5982
5983 /* Function is_nonwrapping_integer_induction.
5984
5985 Check if STMT (which is part of loop LOOP) describes an induction
5986 that increments and does not cause overflow. */
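5987 /* Roughly: unless overflow is already undefined for the result type,
5987 require that
5987 min_precision (base + step * max_niters) <= precision of the result,
5987 with MAX_NITERS taken from max_stmt_executions. */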
5987
5988 static bool
5989 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5990 {
5991 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5992 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5993 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5994 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5995 widest_int ni, max_loop_value, lhs_max;
5996 wi::overflow_type overflow = wi::OVF_NONE;
5997
5998 /* Make sure the loop is integer based. */
5999 if (TREE_CODE (base) != INTEGER_CST
6000 || TREE_CODE (step) != INTEGER_CST)
6001 return false;
6002
6003 /* Check that the max size of the loop will not wrap. */
6004
6005 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6006 return true;
6007
6008 if (! max_stmt_executions (loop, &ni))
6009 return false;
6010
6011 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6012 &overflow);
6013 if (overflow)
6014 return false;
6015
6016 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6017 TYPE_SIGN (lhs_type), &overflow);
6018 if (overflow)
6019 return false;
6020
6021 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6022 <= TYPE_PRECISION (lhs_type));
6023 }
6024
6025 /* Function vectorizable_reduction.
6026
6027 Check if STMT performs a reduction operation that can be vectorized.
6028 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6029 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6030 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6031
6032 This function also handles reduction idioms (patterns) that have been
6033 recognized in advance during vect_pattern_recog. In this case, STMT may be
6034 of this form:
6035 X = pattern_expr (arg0, arg1, ..., X)
6036 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6037 sequence that had been detected and replaced by the pattern-stmt (STMT).
6038
6039 This function also handles reduction of condition expressions, for example:
6040 for (int i = 0; i < N; i++)
6041 if (a[i] < value)
6042 last = a[i];
6043 This is handled by vectorising the loop and creating an additional vector
6044 containing the loop indexes for which "a[i] < value" was true. In the
6045 function epilogue this is reduced to a single max value and then used to
6046 index into the vector of results.
6047
6048 In some cases of reduction patterns, the type of the reduction variable X is
6049 different than the type of the other arguments of STMT.
6050 In such cases, the vectype that is used when transforming STMT into a vector
6051 stmt is different than the vectype that is used to determine the
6052 vectorization factor, because it consists of a different number of elements
6053 than the actual number of elements that are being operated upon in parallel.
6054
6055 For example, consider an accumulation of shorts into an int accumulator.
6056 On some targets it's possible to vectorize this pattern operating on 8
6057 shorts at a time (hence, the vectype for purposes of determining the
6058 vectorization factor should be V8HI); on the other hand, the vectype that
6059 is used to create the vector form is actually V4SI (the type of the result).
6060
6061 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6062 indicates the actual level of parallelism (V8HI in the example), so
6063 that the right vectorization factor would be derived. This vectype
6064 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6065 be used to create the vectorized stmt. The right vectype for the vectorized
6066 stmt is obtained from the type of the result X:
6067 get_vectype_for_scalar_type (TREE_TYPE (X))
6068
6069 This means that, contrary to "regular" reductions (or "regular" stmts in
6070 general), the following equation:
6071 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6072 does *NOT* necessarily hold for reduction patterns. */
6073
6074 bool
6075 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6076 stmt_vec_info *vec_stmt, slp_tree slp_node,
6077 slp_instance slp_node_instance,
6078 stmt_vector_for_cost *cost_vec)
6079 {
6080 tree vec_dest;
6081 tree scalar_dest;
6082 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6083 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6084 tree vectype_in = NULL_TREE;
6085 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6086 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6087 enum tree_code code, orig_code;
6088 internal_fn reduc_fn;
6089 machine_mode vec_mode;
6090 int op_type;
6091 optab optab;
6092 tree new_temp = NULL_TREE;
6093 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6094 gimple *cond_reduc_def_stmt = NULL;
6095 enum tree_code cond_reduc_op_code = ERROR_MARK;
6096 tree scalar_type;
6097 bool is_simple_use;
6098 int i;
6099 int ncopies;
6100 int epilog_copies;
6101 stmt_vec_info prev_stmt_info, prev_phi_info;
6102 bool single_defuse_cycle = false;
6103 stmt_vec_info new_stmt_info = NULL;
6104 int j;
6105 tree ops[3];
6106 enum vect_def_type dts[3];
6107 bool nested_cycle = false, found_nested_cycle_def = false;
6108 bool double_reduc = false;
6109 basic_block def_bb;
6110 struct loop * def_stmt_loop;
6111 tree def_arg;
6112 auto_vec<tree> vec_oprnds0;
6113 auto_vec<tree> vec_oprnds1;
6114 auto_vec<tree> vec_oprnds2;
6115 auto_vec<tree> vect_defs;
6116 auto_vec<stmt_vec_info> phis;
6117 int vec_num;
6118 tree def0, tem;
6119 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6120 tree cond_reduc_val = NULL_TREE;
6121
6122 /* Make sure it was already recognized as a reduction computation. */
6123 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6124 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6125 return false;
6126
6127 if (nested_in_vect_loop_p (loop, stmt))
6128 {
6129 loop = loop->inner;
6130 nested_cycle = true;
6131 }
6132
6133 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6134 gcc_assert (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt);
6135
6136 if (gimple_code (stmt) == GIMPLE_PHI)
6137 {
6138 tree phi_result = gimple_phi_result (stmt);
6139 /* Analysis is fully done on the reduction stmt invocation. */
6140 if (! vec_stmt)
6141 {
6142 if (slp_node)
6143 slp_node_instance->reduc_phis = slp_node;
6144
6145 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6146 return true;
6147 }
6148
6149 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6150 /* Leave the scalar phi in place. Note that checking
6151 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6152 for reductions involving a single statement. */
6153 return true;
6154
6155 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6156 if (STMT_VINFO_IN_PATTERN_P (reduc_stmt_info))
6157 reduc_stmt_info = STMT_VINFO_RELATED_STMT (reduc_stmt_info);
6158
6159 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6160 == EXTRACT_LAST_REDUCTION)
6161 /* Leave the scalar phi in place. */
6162 return true;
6163
6164 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6165 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6166 {
6167 tree op = gimple_op (reduc_stmt, k);
6168 if (op == gimple_phi_result (stmt))
6169 continue;
6170 if (k == 1
6171 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6172 continue;
6173 if (!vectype_in
6174 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6175 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6176 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6177 break;
6178 }
6179 gcc_assert (vectype_in);
6180
6181 if (slp_node)
6182 ncopies = 1;
6183 else
6184 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6185
6186 stmt_vec_info use_stmt_info;
6187 if (ncopies > 1
6188 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6189 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6190 && (use_stmt_info == reduc_stmt_info
6191 || STMT_VINFO_RELATED_STMT (use_stmt_info) == reduc_stmt))
6192 single_defuse_cycle = true;
6193
6194 /* Create the destination vector */
6195 scalar_dest = gimple_assign_lhs (reduc_stmt);
6196 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6197
6198 if (slp_node)
6199 /* The size vect_schedule_slp_instance computes is off for us. */
6200 vec_num = vect_get_num_vectors
6201 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6202 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6203 vectype_in);
6204 else
6205 vec_num = 1;
6206
6207 /* Generate the reduction PHIs upfront. */
6208 prev_phi_info = NULL;
6209 for (j = 0; j < ncopies; j++)
6210 {
6211 if (j == 0 || !single_defuse_cycle)
6212 {
6213 for (i = 0; i < vec_num; i++)
6214 {
6215 /* Create the reduction-phi that defines the reduction
6216 operand. */
6217 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6218 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6219
6220 if (slp_node)
6221 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6222 else
6223 {
6224 if (j == 0)
6225 STMT_VINFO_VEC_STMT (stmt_info)
6226 = *vec_stmt = new_phi_info;
6227 else
6228 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6229 prev_phi_info = new_phi_info;
6230 }
6231 }
6232 }
6233 }
6234
6235 return true;
6236 }
6237
6238 /* 1. Is vectorizable reduction? */
6239 /* Not supportable if the reduction variable is used in the loop, unless
6240 it's a reduction chain. */
6241 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6242 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6243 return false;
6244
6245 /* Reductions that are not used even in an enclosing outer-loop
6246 are expected to be "live" (used out of the loop). */
6247 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6248 && !STMT_VINFO_LIVE_P (stmt_info))
6249 return false;
6250
6251 /* 2. Has this been recognized as a reduction pattern?
6252
6253 Check if STMT represents a pattern that has been recognized
6254 in earlier analysis stages. For stmts that represent a pattern,
6255 the STMT_VINFO_RELATED_STMT field records the last stmt in
6256 the original sequence that constitutes the pattern. */
6257
6258 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6259 if (orig_stmt_info)
6260 {
6261 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6262 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6263 }
6264
6265 /* 3. Check the operands of the operation. The first operands are defined
6266 inside the loop body. The last operand is the reduction variable,
6267 which is defined by the loop-header-phi. */
6268
6269 gcc_assert (is_gimple_assign (stmt));
6270
6271 /* Flatten RHS. */
6272 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6273 {
6274 case GIMPLE_BINARY_RHS:
6275 code = gimple_assign_rhs_code (stmt);
6276 op_type = TREE_CODE_LENGTH (code);
6277 gcc_assert (op_type == binary_op);
6278 ops[0] = gimple_assign_rhs1 (stmt);
6279 ops[1] = gimple_assign_rhs2 (stmt);
6280 break;
6281
6282 case GIMPLE_TERNARY_RHS:
6283 code = gimple_assign_rhs_code (stmt);
6284 op_type = TREE_CODE_LENGTH (code);
6285 gcc_assert (op_type == ternary_op);
6286 ops[0] = gimple_assign_rhs1 (stmt);
6287 ops[1] = gimple_assign_rhs2 (stmt);
6288 ops[2] = gimple_assign_rhs3 (stmt);
6289 break;
6290
6291 case GIMPLE_UNARY_RHS:
6292 return false;
6293
6294 default:
6295 gcc_unreachable ();
6296 }
6297
6298 if (code == COND_EXPR && slp_node)
6299 return false;
6300
6301 scalar_dest = gimple_assign_lhs (stmt);
6302 scalar_type = TREE_TYPE (scalar_dest);
6303 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6304 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6305 return false;
6306
6307 /* Do not try to vectorize bit-precision reductions. */
6308 if (!type_has_mode_precision_p (scalar_type))
6309 return false;
6310
6311 /* All uses but the last are expected to be defined in the loop.
6312 The last use is the reduction variable. In case of a nested cycle this
6313 assumption is not true: we use reduc_index to record the index of the
6314 reduction variable. */
6315 stmt_vec_info reduc_def_info = NULL;
6316 int reduc_index = -1;
6317 for (i = 0; i < op_type; i++)
6318 {
6319 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6320 if (i == 0 && code == COND_EXPR)
6321 continue;
6322
6323 stmt_vec_info def_stmt_info;
6324 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6325 &def_stmt_info);
6326 dt = dts[i];
6327 gcc_assert (is_simple_use);
6328 if (dt == vect_reduction_def)
6329 {
6330 reduc_def_info = def_stmt_info;
6331 reduc_index = i;
6332 continue;
6333 }
6334 else if (tem)
6335 {
6336 /* To properly compute ncopies we are interested in the widest
6337 input type in case we're looking at a widening accumulation. */
6338 if (!vectype_in
6339 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6340 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6341 vectype_in = tem;
6342 }
6343
6344 if (dt != vect_internal_def
6345 && dt != vect_external_def
6346 && dt != vect_constant_def
6347 && dt != vect_induction_def
6348 && !(dt == vect_nested_cycle && nested_cycle))
6349 return false;
6350
6351 if (dt == vect_nested_cycle)
6352 {
6353 found_nested_cycle_def = true;
6354 reduc_def_info = def_stmt_info;
6355 reduc_index = i;
6356 }
6357
6358 if (i == 1 && code == COND_EXPR)
6359 {
6360 /* Record how value of COND_EXPR is defined. */
6361 if (dt == vect_constant_def)
6362 {
6363 cond_reduc_dt = dt;
6364 cond_reduc_val = ops[i];
6365 }
6366 if (dt == vect_induction_def
6367 && def_stmt_info
6368 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6369 {
6370 cond_reduc_dt = dt;
6371 cond_reduc_def_stmt = def_stmt_info;
6372 }
6373 }
6374 }
6375
6376 if (!vectype_in)
6377 vectype_in = vectype_out;
6378
6379 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6380 directly used in stmt. */
6381 if (reduc_index == -1)
6382 {
6383 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6384 {
6385 if (dump_enabled_p ())
6386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6387 "in-order reduction chain without SLP.\n");
6388 return false;
6389 }
6390
6391 if (orig_stmt_info)
6392 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6393 else
6394 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6395 }
6396
6397 if (! reduc_def_info)
6398 return false;
6399
6400 gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt);
6401 if (!reduc_def_phi)
6402 return false;
6403
6404 if (!(reduc_index == -1
6405 || dts[reduc_index] == vect_reduction_def
6406 || dts[reduc_index] == vect_nested_cycle
6407 || ((dts[reduc_index] == vect_internal_def
6408 || dts[reduc_index] == vect_external_def
6409 || dts[reduc_index] == vect_constant_def
6410 || dts[reduc_index] == vect_induction_def)
6411 && nested_cycle && found_nested_cycle_def)))
6412 {
6413 /* For pattern recognized stmts, orig_stmt might be a reduction,
6414 but some helper statements for the pattern might not, or
6415 might be COND_EXPRs with reduction uses in the condition. */
6416 gcc_assert (orig_stmt_info);
6417 return false;
6418 }
6419
6420 /* PHIs should not participate in patterns. */
6421 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6422 enum vect_reduction_type v_reduc_type
6423 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6424 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6425
6426 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6427 /* If we have a condition reduction, see if we can simplify it further. */
6428 if (v_reduc_type == COND_REDUCTION)
6429 {
6430 /* TODO: We can't yet handle reduction chains, since we need to treat
6431 each COND_EXPR in the chain specially, not just the last one.
6432 E.g. for:
6433
6434 x_1 = PHI <x_3, ...>
6435 x_2 = a_2 ? ... : x_1;
6436 x_3 = a_3 ? ... : x_2;
6437
6438 we're interested in the last element in x_3 for which a_2 || a_3
6439 is true, whereas the current reduction chain handling would
6440 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6441 as a reduction operation. */
6442 if (reduc_index == -1)
6443 {
6444 if (dump_enabled_p ())
6445 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6446 "conditional reduction chains not supported\n");
6447 return false;
6448 }
6449
6450 /* vect_is_simple_reduction ensured that operand 2 is the
6451 loop-carried operand. */
6452 gcc_assert (reduc_index == 2);
6453
6454 /* Loop peeling modifies the initial value of the reduction PHI, which
6455 makes the reduction stmt to be transformed differ from the
6456 original stmt analyzed. We need to record the reduction code for
6457 a CONST_COND_REDUCTION type reduction at the analysis stage, so that
6458 it can be used directly at the transform stage. */
6459 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6460 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6461 {
6462 /* Also set the reduction type to CONST_COND_REDUCTION. */
6463 gcc_assert (cond_reduc_dt == vect_constant_def);
6464 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6465 }
6466 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6467 vectype_in, OPTIMIZE_FOR_SPEED))
6468 {
6469 if (dump_enabled_p ())
6470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6471 "optimizing condition reduction with"
6472 " FOLD_EXTRACT_LAST.\n");
6473 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6474 }
6475 else if (cond_reduc_dt == vect_induction_def)
6476 {
6477 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6478 tree base
6479 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6480 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6481
6482 gcc_assert (TREE_CODE (base) == INTEGER_CST
6483 && TREE_CODE (step) == INTEGER_CST);
6484 cond_reduc_val = NULL_TREE;
6485 /* Find a suitable value: below BASE for MAX_EXPR, above BASE for
6486 MIN_EXPR; for now, punt if BASE is the minimum value of the type
6487 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
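6488 /* For example, for an increasing induction with base 16 the code below
6488 picks MAX_EXPR and a cond_reduc_val of 0; for a decreasing induction
6488 with base 10 it picks MIN_EXPR and a cond_reduc_val of 11. */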
6488 if (tree_int_cst_sgn (step) == -1)
6489 {
6490 cond_reduc_op_code = MIN_EXPR;
6491 if (tree_int_cst_sgn (base) == -1)
6492 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6493 else if (tree_int_cst_lt (base,
6494 TYPE_MAX_VALUE (TREE_TYPE (base))))
6495 cond_reduc_val
6496 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6497 }
6498 else
6499 {
6500 cond_reduc_op_code = MAX_EXPR;
6501 if (tree_int_cst_sgn (base) == 1)
6502 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6503 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6504 base))
6505 cond_reduc_val
6506 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6507 }
6508 if (cond_reduc_val)
6509 {
6510 if (dump_enabled_p ())
6511 dump_printf_loc (MSG_NOTE, vect_location,
6512 "condition expression based on "
6513 "integer induction.\n");
6514 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6515 = INTEGER_INDUC_COND_REDUCTION;
6516 }
6517 }
6518 else if (cond_reduc_dt == vect_constant_def)
6519 {
6520 enum vect_def_type cond_initial_dt;
6521 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6522 tree cond_initial_val
6523 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6524
6525 gcc_assert (cond_reduc_val != NULL_TREE);
6526 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6527 if (cond_initial_dt == vect_constant_def
6528 && types_compatible_p (TREE_TYPE (cond_initial_val),
6529 TREE_TYPE (cond_reduc_val)))
6530 {
6531 tree e = fold_binary (LE_EXPR, boolean_type_node,
6532 cond_initial_val, cond_reduc_val);
6533 if (e && (integer_onep (e) || integer_zerop (e)))
6534 {
6535 if (dump_enabled_p ())
6536 dump_printf_loc (MSG_NOTE, vect_location,
6537 "condition expression based on "
6538 "compile time constant.\n");
6539 /* Record reduction code at analysis stage. */
6540 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6541 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6542 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6543 = CONST_COND_REDUCTION;
6544 }
6545 }
6546 }
6547 }
6548
6549 if (orig_stmt_info)
6550 gcc_assert (tmp == orig_stmt_info
6551 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6552 else
6553 /* We changed STMT to be the first stmt in the reduction chain, hence we
6554 check that in this case the first element in the chain is STMT. */
6555 gcc_assert (tmp == stmt_info
6556 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6557
6558 if (STMT_VINFO_LIVE_P (reduc_def_info))
6559 return false;
6560
6561 if (slp_node)
6562 ncopies = 1;
6563 else
6564 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6565
6566 gcc_assert (ncopies >= 1);
6567
6568 vec_mode = TYPE_MODE (vectype_in);
6569 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6570
6571 if (code == COND_EXPR)
6572 {
6573 /* Only call during the analysis stage, otherwise we'll lose
6574 STMT_VINFO_TYPE. */
6575 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6576 ops[reduc_index], 0, NULL,
6577 cost_vec))
6578 {
6579 if (dump_enabled_p ())
6580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6581 "unsupported condition in reduction\n");
6582 return false;
6583 }
6584 }
6585 else
6586 {
6587 /* 4. Supportable by target? */
6588
6589 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6590 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6591 {
6592 /* Shifts and rotates are only supported by vectorizable_shifts,
6593 not vectorizable_reduction. */
6594 if (dump_enabled_p ())
6595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6596 "unsupported shift or rotation.\n");
6597 return false;
6598 }
6599
6600 /* 4.1. check support for the operation in the loop */
6601 optab = optab_for_tree_code (code, vectype_in, optab_default);
6602 if (!optab)
6603 {
6604 if (dump_enabled_p ())
6605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6606 "no optab.\n");
6607
6608 return false;
6609 }
6610
6611 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6612 {
6613 if (dump_enabled_p ())
6614 dump_printf (MSG_NOTE, "op not supported by target.\n");
6615
6616 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6617 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6618 return false;
6619
6620 if (dump_enabled_p ())
6621 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6622 }
6623
6624 /* Worthwhile without SIMD support? */
6625 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6626 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6627 {
6628 if (dump_enabled_p ())
6629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6630 "not worthwhile without SIMD support.\n");
6631
6632 return false;
6633 }
6634 }
6635
6636 /* 4.2. Check support for the epilog operation.
6637
6638 If STMT represents a reduction pattern, then the type of the
6639 reduction variable may be different than the type of the rest
6640 of the arguments. For example, consider the case of accumulation
6641 of shorts into an int accumulator. The original code:
6642 S1: int_a = (int) short_a;
6643 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6644
6645 was replaced with:
6646 STMT: int_acc = widen_sum <short_a, int_acc>
6647
6648 This means that:
6649 1. The tree-code that is used to create the vector operation in the
6650 epilog code (that reduces the partial results) is not the
6651 tree-code of STMT, but is rather the tree-code of the original
6652 stmt from the pattern that STMT is replacing. I.e, in the example
6653 above we want to use 'widen_sum' in the loop, but 'plus' in the
6654 epilog.
6655 2. The type (mode) we use to check available target support
6656 for the vector operation to be created in the *epilog*, is
6657 determined by the type of the reduction variable (in the example
6658 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6659 However the type (mode) we use to check available target support
6660 for the vector operation to be created *inside the loop*, is
6661 determined by the type of the other arguments to STMT (in the
6662 example we'd check this: optab_handler (widen_sum_optab,
6663 vect_short_mode)).
6664
6665 This is contrary to "regular" reductions, in which the types of all
6666 the arguments are the same as the type of the reduction variable.
6667 For "regular" reductions we can therefore use the same vector type
6668 (and also the same tree-code) when generating the epilog code and
6669 when generating the code inside the loop. */
6670
6671 vect_reduction_type reduction_type
6672 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6673 if (orig_stmt_info
6674 && (reduction_type == TREE_CODE_REDUCTION
6675 || reduction_type == FOLD_LEFT_REDUCTION))
6676 {
6677 /* This is a reduction pattern: get the vectype from the type of the
6678 reduction variable, and get the tree-code from orig_stmt. */
6679 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6680 gcc_assert (vectype_out);
6681 vec_mode = TYPE_MODE (vectype_out);
6682 }
6683 else
6684 {
6685 /* Regular reduction: the same vectype and tree-code used for
6686 the vector code inside the loop can be used for the epilog code. */
6687 orig_code = code;
6688
6689 if (code == MINUS_EXPR)
6690 orig_code = PLUS_EXPR;
6691
6692 /* For simple condition reductions, replace with the actual expression
6693 we want to base our reduction around. */
6694 if (reduction_type == CONST_COND_REDUCTION)
6695 {
6696 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6697 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6698 }
6699 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6700 orig_code = cond_reduc_op_code;
6701 }
6702
6703 if (nested_cycle)
6704 {
6705 def_bb = gimple_bb (reduc_def_phi);
6706 def_stmt_loop = def_bb->loop_father;
6707 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6708 loop_preheader_edge (def_stmt_loop));
6709 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6710 if (def_arg_stmt_info
6711 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6712 == vect_double_reduction_def))
6713 double_reduc = true;
6714 }
6715
6716 reduc_fn = IFN_LAST;
6717
6718 if (reduction_type == TREE_CODE_REDUCTION
6719 || reduction_type == FOLD_LEFT_REDUCTION
6720 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6721 || reduction_type == CONST_COND_REDUCTION)
6722 {
6723 if (reduction_type == FOLD_LEFT_REDUCTION
6724 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6725 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6726 {
6727 if (reduc_fn != IFN_LAST
6728 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6729 OPTIMIZE_FOR_SPEED))
6730 {
6731 if (dump_enabled_p ())
6732 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6733 "reduc op not supported by target.\n");
6734
6735 reduc_fn = IFN_LAST;
6736 }
6737 }
6738 else
6739 {
6740 if (!nested_cycle || double_reduc)
6741 {
6742 if (dump_enabled_p ())
6743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6744 "no reduc code for scalar code.\n");
6745
6746 return false;
6747 }
6748 }
6749 }
6750 else if (reduction_type == COND_REDUCTION)
6751 {
6752 int scalar_precision
6753 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6754 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6755 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6756 nunits_out);
6757
6758 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6759 OPTIMIZE_FOR_SPEED))
6760 reduc_fn = IFN_REDUC_MAX;
6761 }
6762
6763 if (reduction_type != EXTRACT_LAST_REDUCTION
6764 && reduc_fn == IFN_LAST
6765 && !nunits_out.is_constant ())
6766 {
6767 if (dump_enabled_p ())
6768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6769 "missing target support for reduction on"
6770 " variable-length vectors.\n");
6771 return false;
6772 }
6773
6774 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6775 && ncopies > 1)
6776 {
6777 if (dump_enabled_p ())
6778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6779 "multiple types in double reduction or condition "
6780 "reduction.\n");
6781 return false;
6782 }
6783
6784 /* For SLP reductions, see if there is a neutral value we can use. */
6785 tree neutral_op = NULL_TREE;
6786 if (slp_node)
6787 neutral_op = neutral_op_for_slp_reduction
6788 (slp_node_instance->reduc_phis, code,
6789 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
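      /* A neutral value is one that leaves the reduction unchanged,
         e.g. 0 for PLUS_EXPR, BIT_IOR_EXPR or BIT_XOR_EXPR and 1 for
         MULT_EXPR; when one exists, a variable-length initial vector
         can be built from it (see the IFN_VEC_SHL_INSERT check below).  */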
6790
6791 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6792 {
6793 /* We can't support in-order reductions of code such as this:
6794
6795 for (int i = 0; i < n1; ++i)
6796 for (int j = 0; j < n2; ++j)
6797 l += a[j];
6798
6799 since GCC effectively transforms the loop when vectorizing:
6800
6801 for (int i = 0; i < n1 / VF; ++i)
6802 for (int j = 0; j < n2; ++j)
6803 for (int k = 0; k < VF; ++k)
6804 l += a[j];
6805
6806 which is a reassociation of the original operation. */
6807 if (dump_enabled_p ())
6808 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6809 "in-order double reduction not supported.\n");
6810
6811 return false;
6812 }
6813
6814 if (reduction_type == FOLD_LEFT_REDUCTION
6815 && slp_node
6816 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6817 {
6818 /* We cannot use in-order reductions in this case because there is
6819 an implicit reassociation of the operations involved. */
6820 if (dump_enabled_p ())
6821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6822 "in-order unchained SLP reductions not supported.\n");
6823 return false;
6824 }
6825
6826 /* For double reductions, and for SLP reductions with a neutral value,
6827 we construct a variable-length initial vector by loading a vector
6828 full of the neutral value and then shift-and-inserting the start
6829 values into the low-numbered elements. */
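      /* For example, for a PLUS_EXPR reduction with start value INIT the
         initial vector would be a splat of the neutral value 0 with INIT
         shift-and-inserted into the low element, e.g. { INIT, 0, ..., 0 },
         so that reducing all lanes at the end still adds INIT exactly once.
         This is why IFN_VEC_SHL_INSERT support is required here.  */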
6830 if ((double_reduc || neutral_op)
6831 && !nunits_out.is_constant ()
6832 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6833 vectype_out, OPTIMIZE_FOR_SPEED))
6834 {
6835 if (dump_enabled_p ())
6836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6837 "reduction on variable-length vectors requires"
6838 " target support for a vector-shift-and-insert"
6839 " operation.\n");
6840 return false;
6841 }
6842
6843 /* Check extra constraints for variable-length unchained SLP reductions. */
6844 if (STMT_SLP_TYPE (stmt_info)
6845 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6846 && !nunits_out.is_constant ())
6847 {
6848 /* We checked above that we could build the initial vector when
6849 there's a neutral element value. Check here for the case in
6850 which each SLP statement has its own initial value and in which
6851 that value needs to be repeated for every instance of the
6852 statement within the initial vector. */
6853 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6854 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6855 if (!neutral_op
6856 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6857 {
6858 if (dump_enabled_p ())
6859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6860 "unsupported form of SLP reduction for"
6861 " variable-length vectors: cannot build"
6862 " initial vector.\n");
6863 return false;
6864 }
6865 /* The epilogue code relies on the number of elements being a multiple
6866 of the group size. The duplicate-and-interleave approach to setting
6867 up the initial vector does too. */
6868 if (!multiple_p (nunits_out, group_size))
6869 {
6870 if (dump_enabled_p ())
6871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6872 "unsupported form of SLP reduction for"
6873 " variable-length vectors: the vector size"
6874 " is not a multiple of the number of results.\n");
6875 return false;
6876 }
6877 }
6878
6879 /* In case of widening multiplication by a constant, we update the type
6880 of the constant to be the type of the other operand. We check that the
6881 constant fits the type in the pattern recognition pass. */
6882 if (code == DOT_PROD_EXPR
6883 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6884 {
6885 if (TREE_CODE (ops[0]) == INTEGER_CST)
6886 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6887 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6888 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6889 else
6890 {
6891 if (dump_enabled_p ())
6892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6893 "invalid types in dot-prod\n");
6894
6895 return false;
6896 }
6897 }
6898
6899 if (reduction_type == COND_REDUCTION)
6900 {
6901 widest_int ni;
6902
6903 if (! max_loop_iterations (loop, &ni))
6904 {
6905 if (dump_enabled_p ())
6906 dump_printf_loc (MSG_NOTE, vect_location,
6907 "loop count not known, cannot create cond "
6908 "reduction.\n");
6909 return false;
6910 }
6911 /* Convert backedges to iterations. */
6912 ni += 1;
6913
6914 /* The additional index will be the same type as the condition. Check
6915 that the loop count fits into this type less one (because we'll use up
6916 the zero slot for when there are no matches). */
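      /* E.g. if scalar_type is 8 bits wide, cr_index_scalar_type is an
         8-bit unsigned type and max_index is 255, so only loops known to
         run fewer than 255 iterations can use this scheme.  */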
6917 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6918 if (wi::geu_p (ni, wi::to_widest (max_index)))
6919 {
6920 if (dump_enabled_p ())
6921 dump_printf_loc (MSG_NOTE, vect_location,
6922 "loop size is greater than data size.\n");
6923 return false;
6924 }
6925 }
6926
6927 /* In case the vectorization factor (VF) is bigger than the number
6928 of elements that we can fit in a vectype (nunits), we have to generate
6929 more than one vector stmt - i.e. we need to "unroll" the
6930 vector stmt by a factor VF/nunits. For more details see documentation
6931 in vectorizable_operation. */
6932
6933 /* If the reduction is used in an outer loop we need to generate
6934 VF intermediate results, like so (e.g. for ncopies=2):
6935 r0 = phi (init, r0)
6936 r1 = phi (init, r1)
6937 r0 = x0 + r0;
6938 r1 = x1 + r1;
6939 (i.e. we generate VF results in 2 registers).
6940 In this case we have a separate def-use cycle for each copy, and therefore
6941 for each copy we get the vector def for the reduction variable from the
6942 respective phi node created for this copy.
6943
6944 Otherwise (the reduction is unused in the loop nest), we can combine
6945 together intermediate results, like so (e.g. for ncopies=2):
6946 r = phi (init, r)
6947 r = x0 + r;
6948 r = x1 + r;
6949 (i.e. we generate VF/2 results in a single register).
6950 In this case for each copy we get the vector def for the reduction variable
6951 from the vectorized reduction operation generated in the previous iteration.
6952
6953 This only works when we see both the reduction PHI and its only consumer
6954 in vectorizable_reduction and there are no intermediate stmts
6955 participating. */
6956 stmt_vec_info use_stmt_info;
6957 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6958 if (ncopies > 1
6959 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6960 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6961 && (use_stmt_info == stmt_info
6962 || STMT_VINFO_RELATED_STMT (use_stmt_info) == stmt))
6963 {
6964 single_defuse_cycle = true;
6965 epilog_copies = 1;
6966 }
6967 else
6968 epilog_copies = ncopies;
6969
6970 /* If the reduction stmt is one of the patterns that have a lane-reducing
6971 operation embedded, we cannot handle the case of ! single_defuse_cycle. */
6972 if ((ncopies > 1
6973 && ! single_defuse_cycle)
6974 && (code == DOT_PROD_EXPR
6975 || code == WIDEN_SUM_EXPR
6976 || code == SAD_EXPR))
6977 {
6978 if (dump_enabled_p ())
6979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6980 "multi def-use cycle not possible for lane-reducing "
6981 "reduction operation\n");
6982 return false;
6983 }
6984
6985 if (slp_node)
6986 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6987 else
6988 vec_num = 1;
6989
6990 internal_fn cond_fn = get_conditional_internal_fn (code);
6991 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6992
6993 if (!vec_stmt) /* transformation not required. */
6994 {
6995 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6996 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6997 {
6998 if (reduction_type != FOLD_LEFT_REDUCTION
6999 && (cond_fn == IFN_LAST
7000 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7001 OPTIMIZE_FOR_SPEED)))
7002 {
7003 if (dump_enabled_p ())
7004 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7005 "can't use a fully-masked loop because no"
7006 " conditional operation is available.\n");
7007 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7008 }
7009 else if (reduc_index == -1)
7010 {
7011 if (dump_enabled_p ())
7012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7013 "can't use a fully-masked loop for chained"
7014 " reductions.\n");
7015 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7016 }
7017 else
7018 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7019 vectype_in);
7020 }
7021 if (dump_enabled_p ()
7022 && reduction_type == FOLD_LEFT_REDUCTION)
7023 dump_printf_loc (MSG_NOTE, vect_location,
7024 "using an in-order (fold-left) reduction.\n");
7025 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7026 return true;
7027 }
7028
7029 /* Transform. */
7030
7031 if (dump_enabled_p ())
7032 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7033
7034 /* FORNOW: Multiple types are not supported for condition. */
7035 if (code == COND_EXPR)
7036 gcc_assert (ncopies == 1);
7037
7038 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7039
7040 if (reduction_type == FOLD_LEFT_REDUCTION)
7041 return vectorize_fold_left_reduction
7042 (stmt, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7043 reduc_fn, ops, vectype_in, reduc_index, masks);
7044
7045 if (reduction_type == EXTRACT_LAST_REDUCTION)
7046 {
7047 gcc_assert (!slp_node);
7048 return vectorizable_condition (stmt, gsi, vec_stmt,
7049 NULL, reduc_index, NULL, NULL);
7050 }
7051
7052 /* Create the destination vector */
7053 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7054
7055 prev_stmt_info = NULL;
7056 prev_phi_info = NULL;
7057 if (!slp_node)
7058 {
7059 vec_oprnds0.create (1);
7060 vec_oprnds1.create (1);
7061 if (op_type == ternary_op)
7062 vec_oprnds2.create (1);
7063 }
7064
7065 phis.create (vec_num);
7066 vect_defs.create (vec_num);
7067 if (!slp_node)
7068 vect_defs.quick_push (NULL_TREE);
7069
7070 if (slp_node)
7071 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7072 else
7073 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7074
7075 for (j = 0; j < ncopies; j++)
7076 {
7077 if (code == COND_EXPR)
7078 {
7079 gcc_assert (!slp_node);
7080 vectorizable_condition (stmt, gsi, vec_stmt,
7081 PHI_RESULT (phis[0]->stmt),
7082 reduc_index, NULL, NULL);
7083 /* Multiple types are not supported for condition. */
7084 break;
7085 }
7086
7087 /* Handle uses. */
7088 if (j == 0)
7089 {
7090 if (slp_node)
7091 {
7092 /* Get vec defs for all the operands except the reduction index,
7093 ensuring the ordering of the ops in the vector is kept. */
7094 auto_vec<tree, 3> slp_ops;
7095 auto_vec<vec<tree>, 3> vec_defs;
7096
7097 slp_ops.quick_push (ops[0]);
7098 slp_ops.quick_push (ops[1]);
7099 if (op_type == ternary_op)
7100 slp_ops.quick_push (ops[2]);
7101
7102 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7103
7104 vec_oprnds0.safe_splice (vec_defs[0]);
7105 vec_defs[0].release ();
7106 vec_oprnds1.safe_splice (vec_defs[1]);
7107 vec_defs[1].release ();
7108 if (op_type == ternary_op)
7109 {
7110 vec_oprnds2.safe_splice (vec_defs[2]);
7111 vec_defs[2].release ();
7112 }
7113 }
7114 else
7115 {
7116 vec_oprnds0.quick_push
7117 (vect_get_vec_def_for_operand (ops[0], stmt));
7118 vec_oprnds1.quick_push
7119 (vect_get_vec_def_for_operand (ops[1], stmt));
7120 if (op_type == ternary_op)
7121 vec_oprnds2.quick_push
7122 (vect_get_vec_def_for_operand (ops[2], stmt));
7123 }
7124 }
7125 else
7126 {
7127 if (!slp_node)
7128 {
7129 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7130
7131 if (single_defuse_cycle && reduc_index == 0)
7132 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7133 else
7134 vec_oprnds0[0]
7135 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7136 if (single_defuse_cycle && reduc_index == 1)
7137 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7138 else
7139 vec_oprnds1[0]
7140 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7141 if (op_type == ternary_op)
7142 {
7143 if (single_defuse_cycle && reduc_index == 2)
7144 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7145 else
7146 vec_oprnds2[0]
7147 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7148 }
7149 }
7150 }
7151
7152 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7153 {
7154 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7155 if (masked_loop_p)
7156 {
7157 /* Make sure that the reduction accumulator is vop[0]. */
7158 if (reduc_index == 1)
7159 {
7160 gcc_assert (commutative_tree_code (code));
7161 std::swap (vop[0], vop[1]);
7162 }
7163 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7164 vectype_in, i * ncopies + j);
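          /* E.g. for a PLUS_EXPR reduction cond_fn is expected to be
             IFN_COND_ADD, giving roughly
               new_temp = COND_ADD <mask, vop[0], vop[1], vop[0]>
             so that inactive lanes simply keep the accumulator vop[0].  */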
7165 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7166 vop[0], vop[1],
7167 vop[0]);
7168 new_temp = make_ssa_name (vec_dest, call);
7169 gimple_call_set_lhs (call, new_temp);
7170 gimple_call_set_nothrow (call, true);
7171 new_stmt_info = vect_finish_stmt_generation (stmt, call, gsi);
7172 }
7173 else
7174 {
7175 if (op_type == ternary_op)
7176 vop[2] = vec_oprnds2[i];
7177
7178 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7179 vop[0], vop[1], vop[2]);
7180 new_temp = make_ssa_name (vec_dest, new_stmt);
7181 gimple_assign_set_lhs (new_stmt, new_temp);
7182 new_stmt_info
7183 = vect_finish_stmt_generation (stmt, new_stmt, gsi);
7184 }
7185
7186 if (slp_node)
7187 {
7188 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7189 vect_defs.quick_push (new_temp);
7190 }
7191 else
7192 vect_defs[0] = new_temp;
7193 }
7194
7195 if (slp_node)
7196 continue;
7197
7198 if (j == 0)
7199 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7200 else
7201 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7202
7203 prev_stmt_info = new_stmt_info;
7204 }
7205
7206 /* Finalize the reduction-phi (set its arguments) and create the
7207 epilog reduction code. */
7208 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7209 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7210
7211 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_phi,
7212 epilog_copies, reduc_fn, phis,
7213 double_reduc, slp_node, slp_node_instance,
7214 cond_reduc_val, cond_reduc_op_code,
7215 neutral_op);
7216
7217 return true;
7218 }
7219
7220 /* Function vect_min_worthwhile_factor.
7221
7222 For a loop where we could vectorize the operation indicated by CODE,
7223 return the minimum vectorization factor that makes it worthwhile
7224 to use generic vectors. */
7225 static unsigned int
7226 vect_min_worthwhile_factor (enum tree_code code)
7227 {
7228 switch (code)
7229 {
7230 case PLUS_EXPR:
7231 case MINUS_EXPR:
7232 case NEGATE_EXPR:
7233 return 4;
7234
7235 case BIT_AND_EXPR:
7236 case BIT_IOR_EXPR:
7237 case BIT_XOR_EXPR:
7238 case BIT_NOT_EXPR:
7239 return 2;
7240
7241 default:
7242 return INT_MAX;
7243 }
7244 }
7245
7246 /* Return true if VINFO indicates we are doing loop vectorization and if
7247 it is worth decomposing CODE operations into scalar operations for
7248 that loop's vectorization factor. */
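/* For example, with a constant vectorization factor of 4 a PLUS_EXPR
   meets vect_min_worthwhile_factor, whereas with a factor of 2 only the
   bitwise operations would be considered worthwhile.  */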
7249
7250 bool
7251 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7252 {
7253 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7254 unsigned HOST_WIDE_INT value;
7255 return (loop_vinfo
7256 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7257 && value >= vect_min_worthwhile_factor (code));
7258 }
7259
7260 /* Function vectorizable_induction
7261
7262 Check if PHI performs an induction computation that can be vectorized.
7263 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7264 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7265 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7266
7267 bool
7268 vectorizable_induction (gimple *phi,
7269 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7270 stmt_vec_info *vec_stmt, slp_tree slp_node,
7271 stmt_vector_for_cost *cost_vec)
7272 {
7273 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7274 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7275 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7276 unsigned ncopies;
7277 bool nested_in_vect_loop = false;
7278 struct loop *iv_loop;
7279 tree vec_def;
7280 edge pe = loop_preheader_edge (loop);
7281 basic_block new_bb;
7282 tree new_vec, vec_init, vec_step, t;
7283 tree new_name;
7284 gimple *new_stmt;
7285 gphi *induction_phi;
7286 tree induc_def, vec_dest;
7287 tree init_expr, step_expr;
7288 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7289 unsigned i;
7290 tree expr;
7291 gimple_seq stmts;
7292 imm_use_iterator imm_iter;
7293 use_operand_p use_p;
7294 gimple *exit_phi;
7295 edge latch_e;
7296 tree loop_arg;
7297 gimple_stmt_iterator si;
7298 basic_block bb = gimple_bb (phi);
7299
7300 if (gimple_code (phi) != GIMPLE_PHI)
7301 return false;
7302
7303 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7304 return false;
7305
7306 /* Make sure it was recognized as induction computation. */
7307 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7308 return false;
7309
7310 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7311 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7312
7313 if (slp_node)
7314 ncopies = 1;
7315 else
7316 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7317 gcc_assert (ncopies >= 1);
7318
7319 /* FORNOW. These restrictions should be relaxed. */
7320 if (nested_in_vect_loop_p (loop, phi))
7321 {
7322 imm_use_iterator imm_iter;
7323 use_operand_p use_p;
7324 gimple *exit_phi;
7325 edge latch_e;
7326 tree loop_arg;
7327
7328 if (ncopies > 1)
7329 {
7330 if (dump_enabled_p ())
7331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7332 "multiple types in nested loop.\n");
7333 return false;
7334 }
7335
7336 /* FORNOW: outer loop induction with SLP not supported. */
7337 if (STMT_SLP_TYPE (stmt_info))
7338 return false;
7339
7340 exit_phi = NULL;
7341 latch_e = loop_latch_edge (loop->inner);
7342 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7343 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7344 {
7345 gimple *use_stmt = USE_STMT (use_p);
7346 if (is_gimple_debug (use_stmt))
7347 continue;
7348
7349 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7350 {
7351 exit_phi = use_stmt;
7352 break;
7353 }
7354 }
7355 if (exit_phi)
7356 {
7357 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7358 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7359 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7360 {
7361 if (dump_enabled_p ())
7362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7363 "inner-loop induction only used outside "
7364 "of the outer vectorized loop.\n");
7365 return false;
7366 }
7367 }
7368
7369 nested_in_vect_loop = true;
7370 iv_loop = loop->inner;
7371 }
7372 else
7373 iv_loop = loop;
7374 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7375
7376 if (slp_node && !nunits.is_constant ())
7377 {
7378 /* The current SLP code creates the initial value element-by-element. */
7379 if (dump_enabled_p ())
7380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7381 "SLP induction not supported for variable-length"
7382 " vectors.\n");
7383 return false;
7384 }
7385
7386 if (!vec_stmt) /* transformation not required. */
7387 {
7388 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7389 DUMP_VECT_SCOPE ("vectorizable_induction");
7390 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7391 return true;
7392 }
7393
7394 /* Transform. */
7395
7396 /* Compute a vector variable, initialized with the first VF values of
7397 the induction variable. E.g., for an iv with IV_PHI='X' and
7398 evolution S, for a vector of 4 units, we want to compute:
7399 [X, X + S, X + 2*S, X + 3*S]. */
7400
7401 if (dump_enabled_p ())
7402 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7403
7404 latch_e = loop_latch_edge (iv_loop);
7405 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7406
7407 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7408 gcc_assert (step_expr != NULL_TREE);
7409
7410 pe = loop_preheader_edge (iv_loop);
7411 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7412 loop_preheader_edge (iv_loop));
7413
7414 stmts = NULL;
7415 if (!nested_in_vect_loop)
7416 {
7417 /* Convert the initial value to the desired type. */
7418 tree new_type = TREE_TYPE (vectype);
7419 init_expr = gimple_convert (&stmts, new_type, init_expr);
7420
7421 /* If we are using the loop mask to "peel" for alignment then we need
7422 to adjust the start value here. */
7423 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7424 if (skip_niters != NULL_TREE)
7425 {
7426 if (FLOAT_TYPE_P (vectype))
7427 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7428 skip_niters);
7429 else
7430 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7431 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7432 skip_niters, step_expr);
7433 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7434 init_expr, skip_step);
7435 }
7436 }
7437
7438 /* Convert the step to the desired type. */
7439 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7440
7441 if (stmts)
7442 {
7443 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7444 gcc_assert (!new_bb);
7445 }
7446
7447 /* Find the first insertion point in the BB. */
7448 si = gsi_after_labels (bb);
7449
7450 /* For SLP induction we have to generate several IVs as for example
7451 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7452 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7453 [VF*S, VF*S, VF*S, VF*S] for all. */
7454 if (slp_node)
7455 {
7456 /* Enforced above. */
7457 unsigned int const_nunits = nunits.to_constant ();
7458
7459 /* Generate [VF*S, VF*S, ... ]. */
7460 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7461 {
7462 expr = build_int_cst (integer_type_node, vf);
7463 expr = fold_convert (TREE_TYPE (step_expr), expr);
7464 }
7465 else
7466 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7467 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7468 expr, step_expr);
7469 if (! CONSTANT_CLASS_P (new_name))
7470 new_name = vect_init_vector (phi, new_name,
7471 TREE_TYPE (step_expr), NULL);
7472 new_vec = build_vector_from_val (vectype, new_name);
7473 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7474
7475 /* Now generate the IVs. */
7476 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7477 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7478 unsigned elts = const_nunits * nvects;
7479 unsigned nivs = least_common_multiple (group_size,
7480 const_nunits) / const_nunits;
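      /* E.g. for the group size 3, nunits 4 example above this gives
         nivs = lcm (3, 4) / 4 = 3 distinct IV vectors before the
         per-lane pattern repeats.  */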
7481 gcc_assert (elts % group_size == 0);
7482 tree elt = init_expr;
7483 unsigned ivn;
7484 for (ivn = 0; ivn < nivs; ++ivn)
7485 {
7486 tree_vector_builder elts (vectype, const_nunits, 1);
7487 stmts = NULL;
7488 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7489 {
7490 if (ivn*const_nunits + eltn >= group_size
7491 && (ivn * const_nunits + eltn) % group_size == 0)
7492 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7493 elt, step_expr);
7494 elts.quick_push (elt);
7495 }
7496 vec_init = gimple_build_vector (&stmts, &elts);
7497 if (stmts)
7498 {
7499 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7500 gcc_assert (!new_bb);
7501 }
7502
7503 /* Create the induction-phi that defines the induction-operand. */
7504 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7505 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7506 stmt_vec_info induction_phi_info
7507 = loop_vinfo->add_stmt (induction_phi);
7508 induc_def = PHI_RESULT (induction_phi);
7509
7510 /* Create the iv update inside the loop */
7511 vec_def = make_ssa_name (vec_dest);
7512 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7513 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7514 loop_vinfo->add_stmt (new_stmt);
7515
7516 /* Set the arguments of the phi node: */
7517 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7518 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7519 UNKNOWN_LOCATION);
7520
7521 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7522 }
7523
7524 /* Re-use IVs when we can. */
7525 if (ivn < nvects)
7526 {
7527 unsigned vfp
7528 = least_common_multiple (group_size, const_nunits) / group_size;
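          /* E.g. for group size 3 and nunits 4, vfp = lcm (3, 4) / 3 = 4:
             each reused IV is the IV from nivs vectors earlier advanced by
             4 * S, since every group member has moved on 4 iterations.  */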
7529 /* Generate [VF'*S, VF'*S, ... ]. */
7530 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7531 {
7532 expr = build_int_cst (integer_type_node, vfp);
7533 expr = fold_convert (TREE_TYPE (step_expr), expr);
7534 }
7535 else
7536 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7537 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7538 expr, step_expr);
7539 if (! CONSTANT_CLASS_P (new_name))
7540 new_name = vect_init_vector (phi, new_name,
7541 TREE_TYPE (step_expr), NULL);
7542 new_vec = build_vector_from_val (vectype, new_name);
7543 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7544 for (; ivn < nvects; ++ivn)
7545 {
7546 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7547 tree def;
7548 if (gimple_code (iv) == GIMPLE_PHI)
7549 def = gimple_phi_result (iv);
7550 else
7551 def = gimple_assign_lhs (iv);
7552 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7553 PLUS_EXPR,
7554 def, vec_step);
7555 if (gimple_code (iv) == GIMPLE_PHI)
7556 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7557 else
7558 {
7559 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7560 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7561 }
7562 SLP_TREE_VEC_STMTS (slp_node).quick_push
7563 (loop_vinfo->add_stmt (new_stmt));
7564 }
7565 }
7566
7567 return true;
7568 }
7569
7570 /* Create the vector that holds the initial_value of the induction. */
7571 if (nested_in_vect_loop)
7572 {
7573 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7574 been created during vectorization of previous stmts. We obtain it
7575 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7576 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7577 /* If the initial value is not of proper type, convert it. */
7578 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7579 {
7580 new_stmt
7581 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7582 vect_simple_var,
7583 "vec_iv_"),
7584 VIEW_CONVERT_EXPR,
7585 build1 (VIEW_CONVERT_EXPR, vectype,
7586 vec_init));
7587 vec_init = gimple_assign_lhs (new_stmt);
7588 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7589 new_stmt);
7590 gcc_assert (!new_bb);
7591 loop_vinfo->add_stmt (new_stmt);
7592 }
7593 }
7594 else
7595 {
7596 /* iv_loop is the loop to be vectorized. Create:
7597 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7598 stmts = NULL;
7599 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7600
7601 unsigned HOST_WIDE_INT const_nunits;
7602 if (nunits.is_constant (&const_nunits))
7603 {
7604 tree_vector_builder elts (vectype, const_nunits, 1);
7605 elts.quick_push (new_name);
7606 for (i = 1; i < const_nunits; i++)
7607 {
7608 /* Create: new_name_i = new_name + step_expr */
7609 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7610 new_name, step_expr);
7611 elts.quick_push (new_name);
7612 }
7613 /* Create a vector from [new_name_0, new_name_1, ...,
7614 new_name_nunits-1] */
7615 vec_init = gimple_build_vector (&stmts, &elts);
7616 }
7617 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7618 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7619 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7620 new_name, step_expr);
7621 else
7622 {
7623 /* Build:
7624 [base, base, base, ...]
7625 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7626 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7627 gcc_assert (flag_associative_math);
7628 tree index = build_index_vector (vectype, 0, 1);
7629 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7630 new_name);
7631 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7632 step_expr);
7633 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7634 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7635 vec_init, step_vec);
7636 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7637 vec_init, base_vec);
7638 }
7639
7640 if (stmts)
7641 {
7642 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7643 gcc_assert (!new_bb);
7644 }
7645 }
7646
7647
7648 /* Create the vector that holds the step of the induction. */
7649 if (nested_in_vect_loop)
7650 /* iv_loop is nested in the loop to be vectorized. Generate:
7651 vec_step = [S, S, S, S] */
7652 new_name = step_expr;
7653 else
7654 {
7655 /* iv_loop is the loop to be vectorized. Generate:
7656 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7657 gimple_seq seq = NULL;
7658 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7659 {
7660 expr = build_int_cst (integer_type_node, vf);
7661 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7662 }
7663 else
7664 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7665 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7666 expr, step_expr);
7667 if (seq)
7668 {
7669 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7670 gcc_assert (!new_bb);
7671 }
7672 }
7673
7674 t = unshare_expr (new_name);
7675 gcc_assert (CONSTANT_CLASS_P (new_name)
7676 || TREE_CODE (new_name) == SSA_NAME);
7677 new_vec = build_vector_from_val (vectype, t);
7678 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7679
7680
7681 /* Create the following def-use cycle:
7682 loop prolog:
7683 vec_init = ...
7684 vec_step = ...
7685 loop:
7686 vec_iv = PHI <vec_init, vec_loop>
7687 ...
7688 STMT
7689 ...
7690 vec_loop = vec_iv + vec_step; */
7691
7692 /* Create the induction-phi that defines the induction-operand. */
7693 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7694 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7695 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7696 induc_def = PHI_RESULT (induction_phi);
7697
7698 /* Create the iv update inside the loop */
7699 vec_def = make_ssa_name (vec_dest);
7700 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7701 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7702 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7703
7704 /* Set the arguments of the phi node: */
7705 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7706 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7707 UNKNOWN_LOCATION);
7708
7709 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7710
7711 /* In case that the vectorization factor (VF) is bigger than the number
7712 of elements that we can fit in a vectype (nunits), we have to generate
7713 more than one vector stmt - i.e. we need to "unroll" the
7714 vector stmt by a factor VF/nunits. For more details see documentation
7715 in vectorizable_operation. */
7716
7717 if (ncopies > 1)
7718 {
7719 gimple_seq seq = NULL;
7720 stmt_vec_info prev_stmt_vinfo;
7721 /* FORNOW. This restriction should be relaxed. */
7722 gcc_assert (!nested_in_vect_loop);
7723
7724 /* Create the vector that holds the step of the induction. */
7725 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7726 {
7727 expr = build_int_cst (integer_type_node, nunits);
7728 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7729 }
7730 else
7731 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7732 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7733 expr, step_expr);
7734 if (seq)
7735 {
7736 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7737 gcc_assert (!new_bb);
7738 }
7739
7740 t = unshare_expr (new_name);
7741 gcc_assert (CONSTANT_CLASS_P (new_name)
7742 || TREE_CODE (new_name) == SSA_NAME);
7743 new_vec = build_vector_from_val (vectype, t);
7744 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7745
7746 vec_def = induc_def;
7747 prev_stmt_vinfo = induction_phi_info;
7748 for (i = 1; i < ncopies; i++)
7749 {
7750 /* vec_i = vec_prev + vec_step */
7751 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7752 vec_def, vec_step);
7753 vec_def = make_ssa_name (vec_dest, new_stmt);
7754 gimple_assign_set_lhs (new_stmt, vec_def);
7755
7756 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7757 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7758 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7759 prev_stmt_vinfo = new_stmt_info;
7760 }
7761 }
7762
7763 if (nested_in_vect_loop)
7764 {
7765 /* Find the loop-closed exit-phi of the induction, and record
7766 the final vector of induction results: */
7767 exit_phi = NULL;
7768 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7769 {
7770 gimple *use_stmt = USE_STMT (use_p);
7771 if (is_gimple_debug (use_stmt))
7772 continue;
7773
7774 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7775 {
7776 exit_phi = use_stmt;
7777 break;
7778 }
7779 }
7780 if (exit_phi)
7781 {
7782 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7783 /* FORNOW. We do not yet support the case in which an inner-loop induction
7784 is used only outside the outer-loop (i.e. not in the outer-loop itself). */
7785 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7786 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7787
7788 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7789 if (dump_enabled_p ())
7790 {
7791 dump_printf_loc (MSG_NOTE, vect_location,
7792 "vector of inductions after inner-loop:");
7793 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7794 }
7795 }
7796 }
7797
7798
7799 if (dump_enabled_p ())
7800 {
7801 dump_printf_loc (MSG_NOTE, vect_location,
7802 "transform induction: created def-use cycle: ");
7803 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7804 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7805 SSA_NAME_DEF_STMT (vec_def), 0);
7806 }
7807
7808 return true;
7809 }
7810
7811 /* Function vectorizable_live_operation.
7812
7813 STMT computes a value that is used outside the loop. Check if
7814 it can be supported. */
7815
7816 bool
7817 vectorizable_live_operation (gimple *stmt,
7818 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7819 slp_tree slp_node, int slp_index,
7820 stmt_vec_info *vec_stmt,
7821 stmt_vector_for_cost *)
7822 {
7823 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7824 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7825 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7826 imm_use_iterator imm_iter;
7827 tree lhs, lhs_type, bitsize, vec_bitsize;
7828 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7829 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7830 int ncopies;
7831 gimple *use_stmt;
7832 auto_vec<tree> vec_oprnds;
7833 int vec_entry = 0;
7834 poly_uint64 vec_index = 0;
7835
7836 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7837
7838 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7839 return false;
7840
7841 /* FORNOW. CHECKME. */
7842 if (nested_in_vect_loop_p (loop, stmt))
7843 return false;
7844
7845 /* If STMT is not relevant and it is a simple assignment and its inputs are
7846 invariant then it can remain in place, unvectorized. The original last
7847 scalar value that it computes will be used. */
7848 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7849 {
7850 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7851 if (dump_enabled_p ())
7852 dump_printf_loc (MSG_NOTE, vect_location,
7853 "statement is simple and uses invariant. Leaving in "
7854 "place.\n");
7855 return true;
7856 }
7857
7858 if (slp_node)
7859 ncopies = 1;
7860 else
7861 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7862
7863 if (slp_node)
7864 {
7865 gcc_assert (slp_index >= 0);
7866
7867 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7868 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7869
7870 /* Get the last occurrence of the scalar index from the concatenation of
7871 all the slp vectors. Calculate which slp vector it is and the index
7872 within. */
7873 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
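      /* E.g. with one vector of 4 lanes holding a group of 2 scalars
         ({ a0, b0, a1, b1 }), slp_index 1 gives pos = 4 - 2 + 1 = 3,
         i.e. vec_entry 0 and vec_index 3 below.  */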
7874
7875 /* Calculate which vector contains the result, and which lane of
7876 that vector we need. */
7877 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7878 {
7879 if (dump_enabled_p ())
7880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7881 "Cannot determine which vector holds the"
7882 " final result.\n");
7883 return false;
7884 }
7885 }
7886
7887 if (!vec_stmt)
7888 {
7889 /* No transformation required. */
7890 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7891 {
7892 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7893 OPTIMIZE_FOR_SPEED))
7894 {
7895 if (dump_enabled_p ())
7896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7897 "can't use a fully-masked loop because "
7898 "the target doesn't support extract last "
7899 "reduction.\n");
7900 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7901 }
7902 else if (slp_node)
7903 {
7904 if (dump_enabled_p ())
7905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7906 "can't use a fully-masked loop because an "
7907 "SLP statement is live after the loop.\n");
7908 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7909 }
7910 else if (ncopies > 1)
7911 {
7912 if (dump_enabled_p ())
7913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7914 "can't use a fully-masked loop because"
7915 " ncopies is greater than 1.\n");
7916 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7917 }
7918 else
7919 {
7920 gcc_assert (ncopies == 1 && !slp_node);
7921 vect_record_loop_mask (loop_vinfo,
7922 &LOOP_VINFO_MASKS (loop_vinfo),
7923 1, vectype);
7924 }
7925 }
7926 return true;
7927 }
7928
7929 /* If stmt has a related stmt, then use that for getting the lhs. */
7930 if (is_pattern_stmt_p (stmt_info))
7931 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7932
7933 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7934 : gimple_get_lhs (stmt);
7935 lhs_type = TREE_TYPE (lhs);
7936
7937 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7938 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7939 : TYPE_SIZE (TREE_TYPE (vectype)));
7940 vec_bitsize = TYPE_SIZE (vectype);
7941
7942 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7943 tree vec_lhs, bitstart;
7944 if (slp_node)
7945 {
7946 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7947
7948 /* Get the correct slp vectorized stmt. */
7949 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7950 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7951 vec_lhs = gimple_phi_result (phi);
7952 else
7953 vec_lhs = gimple_get_lhs (vec_stmt);
7954
7955 /* Get entry to use. */
7956 bitstart = bitsize_int (vec_index);
7957 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7958 }
7959 else
7960 {
7961 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7962 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7963 gcc_checking_assert (ncopies == 1
7964 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7965
7966 /* For multiple copies, get the last copy. */
7967 for (int i = 1; i < ncopies; ++i)
7968 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7969 vec_lhs);
7970
7971 /* Get the last lane in the vector. */
7972 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7973 }
7974
7975 gimple_seq stmts = NULL;
7976 tree new_tree;
7977 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7978 {
7979 /* Emit:
7980
7981 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7982
7983 where VEC_LHS is the vectorized live-out result and MASK is
7984 the loop mask for the final iteration. */
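      /* EXTRACT_LAST picks the element of VEC_LHS in the last active lane
         of MASK, which for the final loop mask is the value computed by
         the last scalar iteration.  */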
7985 gcc_assert (ncopies == 1 && !slp_node);
7986 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7987 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7988 1, vectype, 0);
7989 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7990 scalar_type, mask, vec_lhs);
7991
7992 /* Convert the extracted vector element to the required scalar type. */
7993 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7994 }
7995 else
7996 {
7997 tree bftype = TREE_TYPE (vectype);
7998 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7999 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8000 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8001 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8002 &stmts, true, NULL_TREE);
8003 }
8004
8005 if (stmts)
8006 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8007
8008 /* Replace use of lhs with newly computed result. If the use stmt is a
8009 single arg PHI, just replace all uses of the PHI result. This is necessary
8010 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
8011 use_operand_p use_p;
8012 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8013 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8014 && !is_gimple_debug (use_stmt))
8015 {
8016 if (gimple_code (use_stmt) == GIMPLE_PHI
8017 && gimple_phi_num_args (use_stmt) == 1)
8018 {
8019 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8020 }
8021 else
8022 {
8023 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8024 SET_USE (use_p, new_tree);
8025 }
8026 update_stmt (use_stmt);
8027 }
8028
8029 return true;
8030 }
8031
8032 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8033
8034 static void
8035 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8036 {
8037 ssa_op_iter op_iter;
8038 imm_use_iterator imm_iter;
8039 def_operand_p def_p;
8040 gimple *ustmt;
8041
8042 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8043 {
8044 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8045 {
8046 basic_block bb;
8047
8048 if (!is_gimple_debug (ustmt))
8049 continue;
8050
8051 bb = gimple_bb (ustmt);
8052
8053 if (!flow_bb_inside_loop_p (loop, bb))
8054 {
8055 if (gimple_debug_bind_p (ustmt))
8056 {
8057 if (dump_enabled_p ())
8058 dump_printf_loc (MSG_NOTE, vect_location,
8059 "killing debug use\n");
8060
8061 gimple_debug_bind_reset_value (ustmt);
8062 update_stmt (ustmt);
8063 }
8064 else
8065 gcc_unreachable ();
8066 }
8067 }
8068 }
8069 }
8070
8071 /* Given the loop represented by LOOP_VINFO, return true if computation of
8072 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8073 otherwise. */
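/* E.g. if NITERS has a 32-bit unsigned type, the computation is safe
   whenever the latch is known to execute at most 0xfffffffe times, since
   NITERSM1 + 1 then still fits in that type.  */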
8074
8075 static bool
8076 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8077 {
8078 /* Constant case. */
8079 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8080 {
8081 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8082 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8083
8084 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8085 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8086 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8087 return true;
8088 }
8089
8090 widest_int max;
8091 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8092 /* Check the upper bound of loop niters. */
8093 if (get_max_loop_iterations (loop, &max))
8094 {
8095 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8096 signop sgn = TYPE_SIGN (type);
8097 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8098 if (max < type_max)
8099 return true;
8100 }
8101 return false;
8102 }
8103
8104 /* Return a mask type with half the number of elements as TYPE. */
8105
8106 tree
8107 vect_halve_mask_nunits (tree type)
8108 {
8109 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8110 return build_truth_vector_type (nunits, current_vector_size);
8111 }
8112
8113 /* Return a mask type with twice as many elements as TYPE. */
8114
8115 tree
8116 vect_double_mask_nunits (tree type)
8117 {
8118 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8119 return build_truth_vector_type (nunits, current_vector_size);
8120 }
8121
8122 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8123 contain a sequence of NVECTORS masks that each control a vector of type
8124 VECTYPE. */
8125
8126 void
8127 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8128 unsigned int nvectors, tree vectype)
8129 {
8130 gcc_assert (nvectors != 0);
8131 if (masks->length () < nvectors)
8132 masks->safe_grow_cleared (nvectors);
8133 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8134 /* The number of scalars per iteration and the number of vectors are
8135 both compile-time constants. */
8136 unsigned int nscalars_per_iter
8137 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8138 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
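  /* E.g. recording 2 masks for a vector type with 8 elements in a loop
     with vectorization factor 8 gives nscalars_per_iter = 2 * 8 / 8 = 2,
     i.e. this rgroup controls two scalar values per original iteration.  */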
8139 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8140 {
8141 rgm->max_nscalars_per_iter = nscalars_per_iter;
8142 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8143 }
8144 }
8145
8146 /* Given a complete set of masks MASKS, extract mask number INDEX
8147 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8148 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8149
8150 See the comment above vec_loop_masks for more details about the mask
8151 arrangement. */
8152
8153 tree
8154 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8155 unsigned int nvectors, tree vectype, unsigned int index)
8156 {
8157 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8158 tree mask_type = rgm->mask_type;
8159
8160 /* Populate the rgroup's mask array, if this is the first time we've
8161 used it. */
8162 if (rgm->masks.is_empty ())
8163 {
8164 rgm->masks.safe_grow_cleared (nvectors);
8165 for (unsigned int i = 0; i < nvectors; ++i)
8166 {
8167 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8168 /* Provide a dummy definition until the real one is available. */
8169 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8170 rgm->masks[i] = mask;
8171 }
8172 }
8173
8174 tree mask = rgm->masks[index];
8175 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8176 TYPE_VECTOR_SUBPARTS (vectype)))
8177 {
8178 /* A loop mask for data type X can be reused for data type Y
8179 if X has N times more elements than Y and if Y's elements
8180 are N times bigger than X's. In this case each sequence
8181 of N elements in the loop mask will be all-zero or all-one.
8182 We can then view-convert the mask so that each sequence of
8183 N elements is replaced by a single element. */
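      /* For example, a 16-element mask created for a vector of 16 bytes can
         be reused for a vector of 8 halfwords: each pair of mask elements is
         known to be all-zero or all-one, so the VIEW_CONVERT_EXPR below turns
         each pair into a single element of the 8-element mask.  */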
8184 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8185 TYPE_VECTOR_SUBPARTS (vectype)));
8186 gimple_seq seq = NULL;
8187 mask_type = build_same_sized_truth_vector_type (vectype);
8188 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8189 if (seq)
8190 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8191 }
8192 return mask;
8193 }
8194
8195 /* Scale profiling counters by estimation for LOOP which is vectorized
8196 by factor VF. */
8197
8198 static void
8199 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8200 {
8201 edge preheader = loop_preheader_edge (loop);
8202 /* Reduce loop iterations by the vectorization factor. */
8203 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8204 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8205
8206 if (freq_h.nonzero_p ())
8207 {
8208 profile_probability p;
8209
8210 /* Avoid dropping loop body profile counter to 0 because of zero count
8211 in loop's preheader. */
8212 if (!(freq_e == profile_count::zero ()))
8213 freq_e = freq_e.force_nonzero ();
8214 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8215 scale_loop_frequencies (loop, p);
8216 }
8217
8218 edge exit_e = single_exit (loop);
8219 exit_e->probability = profile_probability::always ()
8220 .apply_scale (1, new_est_niter + 1);
8221
8222 edge exit_l = single_pred_edge (loop->latch);
8223 profile_probability prob = exit_l->probability;
8224 exit_l->probability = exit_e->probability.invert ();
8225 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8226 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8227 }
8228
8229 /* Vectorize STMT if relevant, inserting any new instructions before GSI.
8230 When vectorizing STMT as a store, set *SEEN_STORE to its stmt_vec_info.
8231 *SLP_SCHEDULED is a running record of whether we have called
8232 vect_schedule_slp. */
8233
8234 static void
8235 vect_transform_loop_stmt (loop_vec_info loop_vinfo, gimple *stmt,
8236 gimple_stmt_iterator *gsi,
8237 stmt_vec_info *seen_store, bool *slp_scheduled)
8238 {
8239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8240 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8241 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
8242 if (!stmt_info)
8243 return;
8244
8245 if (dump_enabled_p ())
8246 {
8247 dump_printf_loc (MSG_NOTE, vect_location,
8248 "------>vectorizing statement: ");
8249 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8250 }
8251
8252 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8253 vect_loop_kill_debug_uses (loop, stmt);
8254
8255 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8256 && !STMT_VINFO_LIVE_P (stmt_info))
8257 return;
8258
8259 if (STMT_VINFO_VECTYPE (stmt_info))
8260 {
8261 poly_uint64 nunits
8262 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8263 if (!STMT_SLP_TYPE (stmt_info)
8264 && maybe_ne (nunits, vf)
8265 && dump_enabled_p ())
8266 /* For SLP, VF is set according to the unrolling factor and not
8267 to the vector size, hence for SLP this print is not valid. */
8268 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8269 }
8270
8271 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8272 reached. */
8273 if (slp_vect_type slptype = STMT_SLP_TYPE (stmt_info))
8274 {
8275
8276 if (!*slp_scheduled)
8277 {
8278 *slp_scheduled = true;
8279
8280 DUMP_VECT_SCOPE ("scheduling SLP instances");
8281
8282 vect_schedule_slp (loop_vinfo);
8283 }
8284
8285 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8286 if (slptype == pure_slp)
8287 return;
8288 }
8289
8290 if (dump_enabled_p ())
8291 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8292
8293 bool grouped_store = false;
8294 if (vect_transform_stmt (stmt, gsi, &grouped_store, NULL, NULL))
8295 *seen_store = stmt_info;
8296 }
8297
8298 /* Function vect_transform_loop.
8299
8300 The analysis phase has determined that the loop is vectorizable.
8301 Vectorize the loop - create vectorized stmts to replace the scalar
8302 stmts in the loop, and update the loop exit condition.
8303 Returns the scalar epilogue loop, if any. */
8304
8305 struct loop *
8306 vect_transform_loop (loop_vec_info loop_vinfo)
8307 {
8308 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8309 struct loop *epilogue = NULL;
8310 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8311 int nbbs = loop->num_nodes;
8312 int i;
8313 tree niters_vector = NULL_TREE;
8314 tree step_vector = NULL_TREE;
8315 tree niters_vector_mult_vf = NULL_TREE;
8316 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8317 unsigned int lowest_vf = constant_lower_bound (vf);
8318 bool slp_scheduled = false;
8319 gimple *stmt;
8320 bool check_profitability = false;
8321 unsigned int th;
8322
8323 DUMP_VECT_SCOPE ("vec_transform_loop");
8324
8325 loop_vinfo->shared->check_datarefs ();
8326
8327 /* Use the more conservative vectorization threshold. If the number
8328 of iterations is constant, assume the cost check has been performed
8329 by our caller. If the threshold makes all loops profitable that
8330 run at least the (estimated) vectorization factor number of times,
8331 checking is pointless, too. */
8332 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8333 if (th >= vect_vf_for_cost (loop_vinfo)
8334 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8335 {
8336 if (dump_enabled_p ())
8337 dump_printf_loc (MSG_NOTE, vect_location,
8338 "Profitability threshold is %d loop iterations.\n",
8339 th);
8340 check_profitability = true;
8341 }
8342
8343 /* Make sure there exists a single-predecessor exit bb. Do this before
8344 versioning. */
8345 edge e = single_exit (loop);
8346 if (! single_pred_p (e->dest))
8347 {
8348 split_loop_exit_edge (e);
8349 if (dump_enabled_p ())
8350 dump_printf (MSG_NOTE, "split exit edge\n");
8351 }
8352
8353 /* Version the loop first, if required, so the profitability check
8354 comes first. */
8355
8356 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8357 {
8358 poly_uint64 versioning_threshold
8359 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8360 if (check_profitability
8361 && ordered_p (poly_uint64 (th), versioning_threshold))
8362 {
8363 versioning_threshold = ordered_max (poly_uint64 (th),
8364 versioning_threshold);
8365 check_profitability = false;
8366 }
8367 vect_loop_versioning (loop_vinfo, th, check_profitability,
8368 versioning_threshold);
8369 check_profitability = false;
8370 }
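/* For example (hypothetical constants, hence ordered): with TH = 7 and a
   versioning threshold of 5, the two runtime checks are folded into a
   single guard on max (7, 5) = 7 iterations inside the versioning
   condition, and no separate profitability check is emitted later.  */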
8371
8372 /* Make sure there exists a single-predecessor exit bb also on the
8373 scalar loop copy. Do this after versioning but before peeling
8374 so the CFG structure is fine for both the scalar and the
8375 if-converted loop, and slpeel_duplicate_current_defs_from_edges
8376 sees matched loop-closed PHI nodes on the exit. */
8377 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8378 {
8379 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8380 if (! single_pred_p (e->dest))
8381 {
8382 split_loop_exit_edge (e);
8383 if (dump_enabled_p ())
8384 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8385 }
8386 }
8387
8388 tree niters = vect_build_loop_niters (loop_vinfo);
8389 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8390 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8391 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8392 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8393 &step_vector, &niters_vector_mult_vf, th,
8394 check_profitability, niters_no_overflow);
8395
8396 if (niters_vector == NULL_TREE)
8397 {
8398 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8399 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8400 && known_eq (lowest_vf, vf))
8401 {
8402 niters_vector
8403 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8404 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8405 step_vector = build_one_cst (TREE_TYPE (niters));
8406 }
8407 else
8408 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8409 &step_vector, niters_no_overflow);
8410 }
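/* Worked example with hypothetical numbers: for a loop with a known
   NITERS of 100, a constant VF = 8 and no masking, this path sets
   niters_vector = 100 / 8 = 12 and step_vector = 1; the remaining
   100 - 96 = 4 scalar iterations are handled by the epilogue created
   by vect_do_peeling above.  */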
8411
8412 /* 1) Make sure the loop header has exactly two entries
8413 2) Make sure we have a preheader basic block. */
8414
8415 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8416
8417 split_edge (loop_preheader_edge (loop));
8418
8419 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8420 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8421 /* This will deal with any possible peeling. */
8422 vect_prepare_for_masked_peels (loop_vinfo);
8423
8424 /* FORNOW: the vectorizer supports only loops whose body consists
8425 of one basic block (header + empty latch). When the vectorizer
8426 supports more involved loop forms, the order in which the BBs are
8427 traversed needs to be reconsidered. */
8428
8429 for (i = 0; i < nbbs; i++)
8430 {
8431 basic_block bb = bbs[i];
8432 stmt_vec_info stmt_info;
8433
8434 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8435 gsi_next (&si))
8436 {
8437 gphi *phi = si.phi ();
8438 if (dump_enabled_p ())
8439 {
8440 dump_printf_loc (MSG_NOTE, vect_location,
8441 "------>vectorizing phi: ");
8442 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8443 }
8444 stmt_info = loop_vinfo->lookup_stmt (phi);
8445 if (!stmt_info)
8446 continue;
8447
8448 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8449 vect_loop_kill_debug_uses (loop, phi);
8450
8451 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8452 && !STMT_VINFO_LIVE_P (stmt_info))
8453 continue;
8454
8455 if (STMT_VINFO_VECTYPE (stmt_info)
8456 && (maybe_ne
8457 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8458 && dump_enabled_p ())
8459 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8460
8461 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8462 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8463 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8464 && ! PURE_SLP_STMT (stmt_info))
8465 {
8466 if (dump_enabled_p ())
8467 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8468 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8469 }
8470 }
8471
8472 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8473 !gsi_end_p (si);)
8474 {
8475 stmt = gsi_stmt (si);
8476 /* During vectorization remove existing clobber stmts. */
8477 if (gimple_clobber_p (stmt))
8478 {
8479 unlink_stmt_vdef (stmt);
8480 gsi_remove (&si, true);
8481 release_defs (stmt);
8482 }
8483 else
8484 {
8485 stmt_info = loop_vinfo->lookup_stmt (stmt);
8486
8487 /* Vector stmts created in the outer loop during vectorization of
8488 stmts in an inner loop may not have a stmt_info and do not
8489 need to be vectorized. */
8490 stmt_vec_info seen_store = NULL;
8491 if (stmt_info)
8492 {
8493 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8494 {
8495 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8496 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8497 !gsi_end_p (subsi); gsi_next (&subsi))
8498 vect_transform_loop_stmt (loop_vinfo,
8499 gsi_stmt (subsi), &si,
8500 &seen_store,
8501 &slp_scheduled);
8502 gimple *pat_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8503 vect_transform_loop_stmt (loop_vinfo, pat_stmt, &si,
8504 &seen_store, &slp_scheduled);
8505 }
8506 vect_transform_loop_stmt (loop_vinfo, stmt, &si,
8507 &seen_store, &slp_scheduled);
8508 }
8509 if (seen_store)
8510 {
8511 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8512 {
8513 /* Interleaving. The vectorization of the
8514 interleaving chain was completed; free all
8515 the stores in the chain. */
8516 gsi_next (&si);
8517 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8518 }
8519 else
8520 {
8521 /* Free the attached stmt_vec_info and remove the
8522 stmt. */
8523 free_stmt_vec_info (stmt);
8524 unlink_stmt_vdef (stmt);
8525 gsi_remove (&si, true);
8526 release_defs (stmt);
8527 }
8528 }
8529 else
8530 gsi_next (&si);
8531 }
8532 }
8533
8534 /* Stub out scalar statements that must not survive vectorization.
8535 Doing this here helps with grouped statements, or statements that
8536 are involved in patterns. */
8537 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8538 !gsi_end_p (gsi); gsi_next (&gsi))
8539 {
8540 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8541 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8542 {
8543 tree lhs = gimple_get_lhs (call);
8544 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8545 {
8546 tree zero = build_zero_cst (TREE_TYPE (lhs));
8547 gimple *new_stmt = gimple_build_assign (lhs, zero);
8548 gsi_replace (&gsi, new_stmt, true);
8549 }
8550 }
8551 }
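/* For illustration (hypothetical GIMPLE, names invented): a leftover
   scalar masked load such as
     _5 = MASK_LOAD (ptr_3, 0B, mask_7);
   whose LHS is still scalar (the vector form of the access has been
   emitted separately above) is replaced here by
     _5 = 0;
   so that no scalar masked load survives the transformation.  */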
8552 } /* BBs in loop */
8553
8554 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8555 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8556 if (integer_onep (step_vector))
8557 niters_no_overflow = true;
8558 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8559 niters_vector_mult_vf, !niters_no_overflow);
8560
8561 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8562 scale_profile_for_vect_loop (loop, assumed_vf);
8563
8564 /* True if the final iteration might not handle a full vector's
8565 worth of scalar iterations. */
8566 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8567 /* The minimum number of iterations performed by the epilogue. This
8568 is 1 when peeling for gaps because we always need a final scalar
8569 iteration. */
8570 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8571 /* +1 to convert latch counts to loop iteration counts,
8572 -min_epilogue_iters to remove iterations that cannot be performed
8573 by the vector code. */
8574 int bias_for_lowest = 1 - min_epilogue_iters;
8575 int bias_for_assumed = bias_for_lowest;
8576 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8577 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8578 {
8579 /* When the amount of peeling is known at compile time, the first
8580 iteration will have exactly alignment_npeels active elements.
8581 In the worst case it will have at least one. */
8582 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8583 bias_for_lowest += lowest_vf - min_first_active;
8584 bias_for_assumed += assumed_vf - min_first_active;
8585 }
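/* Worked example with hypothetical numbers: no peeling for gaps (so the
   initial bias is 1 - 0 = 1), a fully-masked loop with
   lowest_vf = assumed_vf = 8 and alignment_npeels = 3.  The first masked
   iteration then covers only 3 scalar iterations, so both biases become
   1 + (8 - 3) = 6 for the bound computations below.  */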
8586 /* In these calculations the "- 1" converts loop iteration counts
8587 back to latch counts. */
8588 if (loop->any_upper_bound)
8589 loop->nb_iterations_upper_bound
8590 = (final_iter_may_be_partial
8591 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8592 lowest_vf) - 1
8593 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8594 lowest_vf) - 1);
8595 if (loop->any_likely_upper_bound)
8596 loop->nb_iterations_likely_upper_bound
8597 = (final_iter_may_be_partial
8598 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8599 + bias_for_lowest, lowest_vf) - 1
8600 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8601 + bias_for_lowest, lowest_vf) - 1);
8602 if (loop->any_estimate)
8603 loop->nb_iterations_estimate
8604 = (final_iter_may_be_partial
8605 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8606 assumed_vf) - 1
8607 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8608 assumed_vf) - 1);
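/* For instance (hypothetical numbers): a scalar latch-count upper bound
   of 99 means at most 100 iterations; with bias_for_lowest = 1,
   lowest_vf = 8 and a loop that is not fully masked, the new bound is
   udiv_floor (99 + 1, 8) - 1 = 11, i.e. at most 12 vector iterations.  */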
8609
8610 if (dump_enabled_p ())
8611 {
8612 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8613 {
8614 dump_printf_loc (MSG_NOTE, vect_location,
8615 "LOOP VECTORIZED\n");
8616 if (loop->inner)
8617 dump_printf_loc (MSG_NOTE, vect_location,
8618 "OUTER LOOP VECTORIZED\n");
8619 dump_printf (MSG_NOTE, "\n");
8620 }
8621 else
8622 {
8623 dump_printf_loc (MSG_NOTE, vect_location,
8624 "LOOP EPILOGUE VECTORIZED (VS=");
8625 dump_dec (MSG_NOTE, current_vector_size);
8626 dump_printf (MSG_NOTE, ")\n");
8627 }
8628 }
8629
8630 /* Free SLP instances here because otherwise stmt reference counting
8631 won't work. */
8632 slp_instance instance;
8633 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8634 vect_free_slp_instance (instance, true);
8635 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8636 /* Clear the safelen field since its value is invalid after vectorization:
8637 the vectorized loop can have loop-carried dependencies. */
8638 loop->safelen = 0;
8639
8640 /* Don't vectorize the epilogue of an epilogue loop. */
8641 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8642 epilogue = NULL;
8643
8644 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8645 epilogue = NULL;
8646
8647 if (epilogue)
8648 {
8649 auto_vector_sizes vector_sizes;
8650 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8651 unsigned int next_size = 0;
8652
8653 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8654 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8655 && known_eq (vf, lowest_vf))
8656 {
8657 unsigned int eiters
8658 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8659 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8660 eiters = eiters % lowest_vf;
8661 epilogue->nb_iterations_upper_bound = eiters - 1;
8662
8663 unsigned int ratio;
8664 while (next_size < vector_sizes.length ()
8665 && !(constant_multiple_p (current_vector_size,
8666 vector_sizes[next_size], &ratio)
8667 && eiters >= lowest_vf / ratio))
8668 next_size += 1;
8669 }
8670 else
8671 while (next_size < vector_sizes.length ()
8672 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8673 next_size += 1;
8674
8675 if (next_size == vector_sizes.length ())
8676 epilogue = NULL;
8677 }
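/* Illustrative example with hypothetical target numbers: with
   current_vector_size = 32 bytes, candidate sizes {64, 32, 16},
   lowest_vf = 8 and eiters = 5 leftover scalar iterations, the 16-byte
   candidate gives ratio = 2 and 5 >= 8 / 2 holds, so a smaller vector
   size is still available and the epilogue is kept; if no candidate
   qualifies, the epilogue is dropped here.  */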
8678
8679 if (epilogue)
8680 {
8681 epilogue->force_vectorize = loop->force_vectorize;
8682 epilogue->safelen = loop->safelen;
8683 epilogue->dont_vectorize = false;
8684
8685 /* We may need to if-convert epilogue to vectorize it. */
8686 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8687 tree_if_conversion (epilogue);
8688 }
8689
8690 return epilogue;
8691 }
8692
8693 /* The code below performs a simple optimization: it reverts
8694 if-conversion for masked stores, i.e. if the mask of a store is zero,
8695 the store is skipped and, if possible, so are the producers of the
8696 stored values. For example,
8697 for (i=0; i<n; i++)
8698 if (c[i])
8699 {
8700 p1[i] += 1;
8701 p2[i] = p3[i] +2;
8702 }
8703 this transformation will produce the following semi-hammock:
8704
8705 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8706 {
8707 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8708 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8709 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8710 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8711 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8712 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8713 }
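
   Before this optimization the vectorized loop body is straight-line
   masked code (same hypothetical SSA names as above):

     vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
     vect__12.22_172 = vect__11.19_170 + vect_cst__171;
     MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
     vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
     vect__19.28_184 = vect__18.25_182 + vect_cst__183;
     MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);

   i.e. the zero-mask guard shown above is introduced here, not by
   if-conversion.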
8714 */
8715
8716 void
8717 optimize_mask_stores (struct loop *loop)
8718 {
8719 basic_block *bbs = get_loop_body (loop);
8720 unsigned nbbs = loop->num_nodes;
8721 unsigned i;
8722 basic_block bb;
8723 struct loop *bb_loop;
8724 gimple_stmt_iterator gsi;
8725 gimple *stmt;
8726 auto_vec<gimple *> worklist;
8727
8728 vect_location = find_loop_location (loop);
8729 /* Pick up all masked stores in loop if any. */
8730 for (i = 0; i < nbbs; i++)
8731 {
8732 bb = bbs[i];
8733 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8734 gsi_next (&gsi))
8735 {
8736 stmt = gsi_stmt (gsi);
8737 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8738 worklist.safe_push (stmt);
8739 }
8740 }
8741
8742 free (bbs);
8743 if (worklist.is_empty ())
8744 return;
8745
8746 /* Loop has masked stores. */
8747 while (!worklist.is_empty ())
8748 {
8749 gimple *last, *last_store;
8750 edge e, efalse;
8751 tree mask;
8752 basic_block store_bb, join_bb;
8753 gimple_stmt_iterator gsi_to;
8754 tree vdef, new_vdef;
8755 gphi *phi;
8756 tree vectype;
8757 tree zero;
8758
8759 last = worklist.pop ();
8760 mask = gimple_call_arg (last, 2);
8761 bb = gimple_bb (last);
8762 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
8763 to the same loop as if_bb. It can be different from LOOP when a
8764 two-level loop nest is vectorized and the mask_store belongs to the
8765 inner one. */
8766 e = split_block (bb, last);
8767 bb_loop = bb->loop_father;
8768 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8769 join_bb = e->dest;
8770 store_bb = create_empty_bb (bb);
8771 add_bb_to_loop (store_bb, bb_loop);
8772 e->flags = EDGE_TRUE_VALUE;
8773 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8774 /* Put STORE_BB on the unlikely path. */
8775 efalse->probability = profile_probability::unlikely ();
8776 store_bb->count = efalse->count ();
8777 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8778 if (dom_info_available_p (CDI_DOMINATORS))
8779 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8780 if (dump_enabled_p ())
8781 dump_printf_loc (MSG_NOTE, vect_location,
8782 "Create new block %d to sink mask stores.",
8783 store_bb->index);
8784 /* Create vector comparison with boolean result. */
8785 vectype = TREE_TYPE (mask);
8786 zero = build_zero_cst (vectype);
8787 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8788 gsi = gsi_last_bb (bb);
8789 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8790 /* Create new PHI node for vdef of the last masked store:
8791 .MEM_2 = VDEF <.MEM_1>
8792 will be converted to
8793 .MEM.3 = VDEF <.MEM_1>
8794 and new PHI node will be created in join bb
8795 .MEM_2 = PHI <.MEM_1, .MEM_3>
8796 */
8797 vdef = gimple_vdef (last);
8798 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8799 gimple_set_vdef (last, new_vdef);
8800 phi = create_phi_node (vdef, join_bb);
8801 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8802
8803 /* Put all masked stores with the same mask to STORE_BB if possible. */
8804 while (true)
8805 {
8806 gimple_stmt_iterator gsi_from;
8807 gimple *stmt1 = NULL;
8808
8809 /* Move masked store to STORE_BB. */
8810 last_store = last;
8811 gsi = gsi_for_stmt (last);
8812 gsi_from = gsi;
8813 /* Shift GSI to the previous stmt for further traversal. */
8814 gsi_prev (&gsi);
8815 gsi_to = gsi_start_bb (store_bb);
8816 gsi_move_before (&gsi_from, &gsi_to);
8817 /* Set GSI_TO to the start of the now non-empty block. */
8818 gsi_to = gsi_start_bb (store_bb);
8819 if (dump_enabled_p ())
8820 {
8821 dump_printf_loc (MSG_NOTE, vect_location,
8822 "Move stmt to created bb\n");
8823 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8824 }
8825 /* Move all stored value producers if possible. */
8826 while (!gsi_end_p (gsi))
8827 {
8828 tree lhs;
8829 imm_use_iterator imm_iter;
8830 use_operand_p use_p;
8831 bool res;
8832
8833 /* Skip debug statements. */
8834 if (is_gimple_debug (gsi_stmt (gsi)))
8835 {
8836 gsi_prev (&gsi);
8837 continue;
8838 }
8839 stmt1 = gsi_stmt (gsi);
8840 /* Do not consider statements writing to memory or having
8841 volatile operands. */
8842 if (gimple_vdef (stmt1)
8843 || gimple_has_volatile_ops (stmt1))
8844 break;
8845 gsi_from = gsi;
8846 gsi_prev (&gsi);
8847 lhs = gimple_get_lhs (stmt1);
8848 if (!lhs)
8849 break;
8850
8851 /* LHS of vectorized stmt must be SSA_NAME. */
8852 if (TREE_CODE (lhs) != SSA_NAME)
8853 break;
8854
8855 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8856 {
8857 /* Remove dead scalar statement. */
8858 if (has_zero_uses (lhs))
8859 {
8860 gsi_remove (&gsi_from, true);
8861 continue;
8862 }
8863 }
8864
8865 /* Check that LHS does not have uses outside of STORE_BB. */
8866 res = true;
8867 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8868 {
8869 gimple *use_stmt;
8870 use_stmt = USE_STMT (use_p);
8871 if (is_gimple_debug (use_stmt))
8872 continue;
8873 if (gimple_bb (use_stmt) != store_bb)
8874 {
8875 res = false;
8876 break;
8877 }
8878 }
8879 if (!res)
8880 break;
8881
8882 if (gimple_vuse (stmt1)
8883 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8884 break;
8885
8886 /* Can move STMT1 to STORE_BB. */
8887 if (dump_enabled_p ())
8888 {
8889 dump_printf_loc (MSG_NOTE, vect_location,
8890 "Move stmt to created bb\n");
8891 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8892 }
8893 gsi_move_before (&gsi_from, &gsi_to);
8894 /* Shift GSI_TO for further insertion. */
8895 gsi_prev (&gsi_to);
8896 }
8897 /* Put other masked stores with the same mask to STORE_BB. */
8898 if (worklist.is_empty ()
8899 || gimple_call_arg (worklist.last (), 2) != mask
8900 || worklist.last () != stmt1)
8901 break;
8902 last = worklist.pop ();
8903 }
8904 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8905 }
8906 }