1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70    as if it were manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
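      For illustration (not an exhaustive list): the access 'a[i]' in the
      example above is a consecutive (stride-1) data-ref and is supported,
      whereas an access such as 'a[2*i]' does not have a simple consecutive
      access pattern in the sense used here.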
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134    Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139    Currently the only target-specific information that is used is the
140    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141    Targets that can support several different vector sizes will, for now,
142    need to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145    Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148    machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
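/* As an illustrative sketch of the optab check described above, asking
   whether V8HI addition is supported amounts to something like:

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;     <-- no target support, so the stmt isn't vectorized

   optab_handler and CODE_FOR_nothing are the interfaces named in the
   comment above; the snippet itself is only an illustration.  */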
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
183
184 if (stmt_vectype)
185 {
186 if (STMT_VINFO_VECTYPE (stmt_info))
187       /* The only case in which a vectype has already been set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return true;
203 }
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
211
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info > *mask_producers)
215 {
216 vec_info *vinfo = stmt_info->vinfo;
217 if (dump_enabled_p ())
218 {
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
220 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
221 }
222 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
223 return false;
224
225 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
226 && STMT_VINFO_RELATED_STMT (stmt_info))
227 {
228 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
229 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
230
231 /* If a pattern statement has def stmts, analyze them too. */
232 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
233 !gsi_end_p (si); gsi_next (&si))
234 {
235 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
236 if (dump_enabled_p ())
237 {
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: ");
240 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
241 def_stmt_info->stmt, 0);
242 }
243 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
244 vf, mask_producers))
245 return false;
246 }
247
248 if (dump_enabled_p ())
249 {
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: ");
252 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
253 }
254 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
255 return false;
256 }
257
258 return true;
259 }
260
261 /* Function vect_determine_vectorization_factor
262
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265    loop.  For example, when vectorizing a loop that operates on 4-byte elements,
266    on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
267 elements can fit in a single vector register.
268
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
273
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
278 }
279
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
283 }
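
   As an illustrative worked example (assuming a 16-byte vector size):
   a loop operating on 2-byte shorts uses an 8-element vector type such
   as V8HI, so VF = 8, while a loop operating on 4-byte ints uses V4SI
   and gets VF = 4.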
284 */
285
286 static bool
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 {
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
299
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301
302 for (i = 0; i < nbbs; i++)
303 {
304 basic_block bb = bbs[i];
305
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
308 {
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 {
313 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
314 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
315 }
316
317 gcc_assert (stmt_info);
318
319 if (STMT_VINFO_RELEVANT_P (stmt_info)
320 || STMT_VINFO_LIVE_P (stmt_info))
321 {
322 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
323 scalar_type = TREE_TYPE (PHI_RESULT (phi));
324
325 if (dump_enabled_p ())
326 {
327 dump_printf_loc (MSG_NOTE, vect_location,
328 "get vectype for scalar type: ");
329 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
330 dump_printf (MSG_NOTE, "\n");
331 }
332
333 vectype = get_vectype_for_scalar_type (scalar_type);
334 if (!vectype)
335 {
336 if (dump_enabled_p ())
337 {
338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
339 "not vectorized: unsupported "
340 "data-type ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
342 scalar_type);
343 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
344 }
345 return false;
346 }
347 STMT_VINFO_VECTYPE (stmt_info) = vectype;
348
349 if (dump_enabled_p ())
350 {
351 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
353 dump_printf (MSG_NOTE, "\n");
354 }
355
356 if (dump_enabled_p ())
357 {
358 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
359 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
360 dump_printf (MSG_NOTE, "\n");
361 }
362
363 vect_update_max_nunits (&vectorization_factor, vectype);
364 }
365 }
366
367 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
368 gsi_next (&si))
369 {
370 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
371 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
372 &mask_producers))
373 return false;
374 }
375 }
376
377 /* TODO: Analyze cost. Decide if worth while to vectorize. */
378 if (dump_enabled_p ())
379 {
380 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
381 dump_dec (MSG_NOTE, vectorization_factor);
382 dump_printf (MSG_NOTE, "\n");
383 }
384
385 if (known_le (vectorization_factor, 1U))
386 {
387 if (dump_enabled_p ())
388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
389 "not vectorized: unsupported data-type\n");
390 return false;
391 }
392 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
393
394 for (i = 0; i < mask_producers.length (); i++)
395 {
396 stmt_info = mask_producers[i];
397 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
398 if (!mask_type)
399 return false;
400 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
401 }
402
403 return true;
404 }
405
406
407 /* Function vect_is_simple_iv_evolution.
408
409    FORNOW: A simple evolution of an induction variable in the loop is
410 considered a polynomial evolution. */
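/* For illustration: the access function of a simple induction variable
   such as 'i' in 'for (i = init; ...; i += step)' is the scalar-evolution
   chrec {init, +, step}_loop, whose evolution part is STEP and whose
   initial condition is INIT.  */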
411
412 static bool
413 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
414 tree * step)
415 {
416 tree init_expr;
417 tree step_expr;
418 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
419 basic_block bb;
420
421 /* When there is no evolution in this loop, the evolution function
422 is not "simple". */
423 if (evolution_part == NULL_TREE)
424 return false;
425
426 /* When the evolution is a polynomial of degree >= 2
427 the evolution function is not "simple". */
428 if (tree_is_chrec (evolution_part))
429 return false;
430
431 step_expr = evolution_part;
432 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
433
434 if (dump_enabled_p ())
435 {
436 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
437 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
438 dump_printf (MSG_NOTE, ", init: ");
439 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
440 dump_printf (MSG_NOTE, "\n");
441 }
442
443 *init = init_expr;
444 *step = step_expr;
445
446 if (TREE_CODE (step_expr) != INTEGER_CST
447 && (TREE_CODE (step_expr) != SSA_NAME
448 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
449 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
450 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
451 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
452 || !flag_associative_math)))
453 && (TREE_CODE (step_expr) != REAL_CST
454 || !flag_associative_math))
455 {
456 if (dump_enabled_p ())
457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
458 "step unknown.\n");
459 return false;
460 }
461
462 return true;
463 }
464
465 /* Function vect_analyze_scalar_cycles_1.
466
467 Examine the cross iteration def-use cycles of scalar variables
468 in LOOP. LOOP_VINFO represents the loop that is now being
469 considered for vectorization (can be LOOP, or an outer-loop
470 enclosing LOOP). */
471
472 static void
473 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
474 {
475 basic_block bb = loop->header;
476 tree init, step;
477 auto_vec<gimple *, 64> worklist;
478 gphi_iterator gsi;
479 bool double_reduc;
480
481 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
482
483 /* First - identify all inductions. Reduction detection assumes that all the
484      inductions have been identified; therefore, this order must not be
485 changed. */
486 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
487 {
488 gphi *phi = gsi.phi ();
489 tree access_fn = NULL;
490 tree def = PHI_RESULT (phi);
491 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
492
493 if (dump_enabled_p ())
494 {
495 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
496 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
497 }
498
499      /* Skip virtual PHIs.  The data dependences that are associated with
500 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
501 if (virtual_operand_p (def))
502 continue;
503
504 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
505
506 /* Analyze the evolution function. */
507 access_fn = analyze_scalar_evolution (loop, def);
508 if (access_fn)
509 {
510 STRIP_NOPS (access_fn);
511 if (dump_enabled_p ())
512 {
513 dump_printf_loc (MSG_NOTE, vect_location,
514 "Access function of PHI: ");
515 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
516 dump_printf (MSG_NOTE, "\n");
517 }
518 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
519 = initial_condition_in_loop_num (access_fn, loop->num);
520 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
521 = evolution_part_in_loop_num (access_fn, loop->num);
522 }
523
524 if (!access_fn
525 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
526 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
527 && TREE_CODE (step) != INTEGER_CST))
528 {
529 worklist.safe_push (phi);
530 continue;
531 }
532
533 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
534 != NULL_TREE);
535 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
536
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
539 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
540 }
541
542
543 /* Second - identify all reductions and nested cycles. */
544 while (worklist.length () > 0)
545 {
546 gimple *phi = worklist.pop ();
547 tree def = PHI_RESULT (phi);
548 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
549
550 if (dump_enabled_p ())
551 {
552 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
553 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
554 }
555
556 gcc_assert (!virtual_operand_p (def)
557 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
558
559 stmt_vec_info reduc_stmt_info
560 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
561 &double_reduc, false);
562 if (reduc_stmt_info)
563 {
564 if (double_reduc)
565 {
566 if (dump_enabled_p ())
567 dump_printf_loc (MSG_NOTE, vect_location,
568 "Detected double reduction.\n");
569
570 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
571 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
572 = vect_double_reduction_def;
573 }
574 else
575 {
576 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
577 {
578 if (dump_enabled_p ())
579 dump_printf_loc (MSG_NOTE, vect_location,
580 "Detected vectorizable nested cycle.\n");
581
582 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
583 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
584 }
585 else
586 {
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_NOTE, vect_location,
589 "Detected reduction.\n");
590
591 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
592 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
593 /* Store the reduction cycles for possible vectorization in
594 loop-aware SLP if it was not detected as reduction
595 chain. */
596 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
597 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
598 (reduc_stmt_info);
599 }
600 }
601 }
602 else
603 if (dump_enabled_p ())
604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
605 "Unknown def-use cycle pattern.\n");
606 }
607 }
608
609
610 /* Function vect_analyze_scalar_cycles.
611
612 Examine the cross iteration def-use cycles of scalar variables, by
613 analyzing the loop-header PHIs of scalar variables. Classify each
614 cycle as one of the following: invariant, induction, reduction, unknown.
615    We do that for the loop represented by LOOP_VINFO, and also for its
616    inner-loop, if it exists.
617 Examples for scalar cycles:
618
619 Example1: reduction:
620
621 loop1:
622 for (i=0; i<N; i++)
623 sum += a[i];
624
625 Example2: induction:
626
627 loop2:
628 for (i=0; i<N; i++)
629 a[i] = i; */
630
631 static void
632 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
633 {
634 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
635
636 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
637
638 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
639      Reductions in such an inner-loop therefore have different properties than
640 the reductions in the nest that gets vectorized:
641 1. When vectorized, they are executed in the same order as in the original
642 scalar loop, so we can't change the order of computation when
643 vectorizing them.
644 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
645 current checks are too strict. */
646
647 if (loop->inner)
648 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
649 }
650
651 /* Transfer group and reduction information from STMT to its pattern stmt. */
652
653 static void
654 vect_fixup_reduc_chain (gimple *stmt)
655 {
656 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
657 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
658 stmt_vec_info stmtp;
659 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
660 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
661 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
662 do
663 {
664 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
665 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
666 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
667 if (stmt_info)
668 REDUC_GROUP_NEXT_ELEMENT (stmtp)
669 = STMT_VINFO_RELATED_STMT (stmt_info);
670 }
671 while (stmt_info);
672 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
673 }
674
675 /* Fixup scalar cycles that now have their stmts detected as patterns. */
676
677 static void
678 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
679 {
680 stmt_vec_info first;
681 unsigned i;
682
683 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
684 if (STMT_VINFO_IN_PATTERN_P (first))
685 {
686 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
687 while (next)
688 {
689 if (! STMT_VINFO_IN_PATTERN_P (next))
690 break;
691 next = REDUC_GROUP_NEXT_ELEMENT (next);
692 }
693        /* If not all stmts in the chain are patterns, try to handle
694 the chain without patterns. */
695 if (! next)
696 {
697 vect_fixup_reduc_chain (first);
698 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
699 = STMT_VINFO_RELATED_STMT (first);
700 }
701 }
702 }
703
704 /* Function vect_get_loop_niters.
705
706    Determine the number of iterations the loop executes and place it
707 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
708 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
709 niter information holds in ASSUMPTIONS.
710
711 Return the loop exit condition. */
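/* Illustrative example: for a countable loop whose body executes N times
   (N >= 1), the latch executes N - 1 times, so NUMBER_OF_ITERATIONSM1 is
   N - 1 and NUMBER_OF_ITERATIONS is N.  */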
712
713
714 static gcond *
715 vect_get_loop_niters (struct loop *loop, tree *assumptions,
716 tree *number_of_iterations, tree *number_of_iterationsm1)
717 {
718 edge exit = single_exit (loop);
719 struct tree_niter_desc niter_desc;
720 tree niter_assumptions, niter, may_be_zero;
721 gcond *cond = get_loop_exit_condition (loop);
722
723 *assumptions = boolean_true_node;
724 *number_of_iterationsm1 = chrec_dont_know;
725 *number_of_iterations = chrec_dont_know;
726 DUMP_VECT_SCOPE ("get_loop_niters");
727
728 if (!exit)
729 return cond;
730
731 niter = chrec_dont_know;
732 may_be_zero = NULL_TREE;
733 niter_assumptions = boolean_true_node;
734 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
735 || chrec_contains_undetermined (niter_desc.niter))
736 return cond;
737
738 niter_assumptions = niter_desc.assumptions;
739 may_be_zero = niter_desc.may_be_zero;
740 niter = niter_desc.niter;
741
742 if (may_be_zero && integer_zerop (may_be_zero))
743 may_be_zero = NULL_TREE;
744
745 if (may_be_zero)
746 {
747 if (COMPARISON_CLASS_P (may_be_zero))
748 {
749          /* Try to combine may_be_zero with assumptions; this can simplify
750             the computation of the niter expression.  */
751 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
752 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
753 niter_assumptions,
754 fold_build1 (TRUTH_NOT_EXPR,
755 boolean_type_node,
756 may_be_zero));
757 else
758 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
759 build_int_cst (TREE_TYPE (niter), 0),
760 rewrite_to_non_trapping_overflow (niter));
761
762 may_be_zero = NULL_TREE;
763 }
764 else if (integer_nonzerop (may_be_zero))
765 {
766 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
767 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
768 return cond;
769 }
770 else
771 return cond;
772 }
773
774 *assumptions = niter_assumptions;
775 *number_of_iterationsm1 = niter;
776
777    /* We want the number of loop header executions, which is the number
778 of latch executions plus one.
779 ??? For UINT_MAX latch executions this number overflows to zero
780 for loops like do { n++; } while (n != 0); */
781 if (niter && !chrec_contains_undetermined (niter))
782 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
783 build_int_cst (TREE_TYPE (niter), 1));
784 *number_of_iterations = niter;
785
786 return cond;
787 }
788
789 /* Function bb_in_loop_p
790
791 Used as predicate for dfs order traversal of the loop bbs. */
792
793 static bool
794 bb_in_loop_p (const_basic_block bb, const void *data)
795 {
796 const struct loop *const loop = (const struct loop *)data;
797 if (flow_bb_inside_loop_p (loop, bb))
798 return true;
799 return false;
800 }
801
802
803 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
804 stmt_vec_info structs for all the stmts in LOOP_IN. */
805
806 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
807 : vec_info (vec_info::loop, init_cost (loop_in), shared),
808 loop (loop_in),
809 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
810 num_itersm1 (NULL_TREE),
811 num_iters (NULL_TREE),
812 num_iters_unchanged (NULL_TREE),
813 num_iters_assumptions (NULL_TREE),
814 th (0),
815 versioning_threshold (0),
816 vectorization_factor (0),
817 max_vectorization_factor (0),
818 mask_skip_niters (NULL_TREE),
819 mask_compare_type (NULL_TREE),
820 unaligned_dr (NULL),
821 peeling_for_alignment (0),
822 ptr_mask (0),
823 ivexpr_map (NULL),
824 slp_unrolling_factor (1),
825 single_scalar_iteration_cost (0),
826 vectorizable (false),
827 can_fully_mask_p (true),
828 fully_masked_p (false),
829 peeling_for_gaps (false),
830 peeling_for_niter (false),
831 operands_swapped (false),
832 no_data_dependencies (false),
833 has_mask_store (false),
834 scalar_loop (NULL),
835 orig_loop_info (NULL)
836 {
837 /* Create/Update stmt_info for all stmts in the loop. */
838 basic_block *body = get_loop_body (loop);
839 for (unsigned int i = 0; i < loop->num_nodes; i++)
840 {
841 basic_block bb = body[i];
842 gimple_stmt_iterator si;
843
844 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
845 {
846 gimple *phi = gsi_stmt (si);
847 gimple_set_uid (phi, 0);
848 add_stmt (phi);
849 }
850
851 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
852 {
853 gimple *stmt = gsi_stmt (si);
854 gimple_set_uid (stmt, 0);
855 add_stmt (stmt);
856 }
857 }
858 free (body);
859
860 /* CHECKME: We want to visit all BBs before their successors (except for
861 latch blocks, for which this assertion wouldn't hold). In the simple
862      case of the loop forms we allow, a dfs order of the BBs would be the same
863 as reversed postorder traversal, so we are safe. */
864
865 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
866 bbs, loop->num_nodes, loop);
867 gcc_assert (nbbs == loop->num_nodes);
868 }
869
870 /* Free all levels of MASKS. */
871
872 void
873 release_vec_loop_masks (vec_loop_masks *masks)
874 {
875 rgroup_masks *rgm;
876 unsigned int i;
877 FOR_EACH_VEC_ELT (*masks, i, rgm)
878 rgm->masks.release ();
879 masks->release ();
880 }
881
882 /* Free all memory used by the _loop_vec_info, as well as all the
883 stmt_vec_info structs of all the stmts in the loop. */
884
885 _loop_vec_info::~_loop_vec_info ()
886 {
887 int nbbs;
888 gimple_stmt_iterator si;
889 int j;
890
891    /* ??? We're releasing loop_vinfos en bloc.  */
892 set_stmt_vec_info_vec (&stmt_vec_infos);
893 nbbs = loop->num_nodes;
894 for (j = 0; j < nbbs; j++)
895 {
896 basic_block bb = bbs[j];
897 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
898 free_stmt_vec_info (gsi_stmt (si));
899
900 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
901 {
902 gimple *stmt = gsi_stmt (si);
903
904 /* We may have broken canonical form by moving a constant
905 into RHS1 of a commutative op. Fix such occurrences. */
906 if (operands_swapped && is_gimple_assign (stmt))
907 {
908 enum tree_code code = gimple_assign_rhs_code (stmt);
909
910 if ((code == PLUS_EXPR
911 || code == POINTER_PLUS_EXPR
912 || code == MULT_EXPR)
913 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
914 swap_ssa_operands (stmt,
915 gimple_assign_rhs1_ptr (stmt),
916 gimple_assign_rhs2_ptr (stmt));
917 else if (code == COND_EXPR
918 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
919 {
920 tree cond_expr = gimple_assign_rhs1 (stmt);
921 enum tree_code cond_code = TREE_CODE (cond_expr);
922
923 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
924 {
925 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
926 0));
927 cond_code = invert_tree_comparison (cond_code,
928 honor_nans);
929 if (cond_code != ERROR_MARK)
930 {
931 TREE_SET_CODE (cond_expr, cond_code);
932 swap_ssa_operands (stmt,
933 gimple_assign_rhs2_ptr (stmt),
934 gimple_assign_rhs3_ptr (stmt));
935 }
936 }
937 }
938 }
939
940 /* Free stmt_vec_info. */
941 free_stmt_vec_info (stmt);
942 gsi_next (&si);
943 }
944 }
945
946 free (bbs);
947
948 release_vec_loop_masks (&masks);
949 delete ivexpr_map;
950
951 loop->aux = NULL;
952 }
953
954 /* Return an invariant or register for EXPR and emit necessary
955 computations in the LOOP_VINFO loop preheader. */
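/* Illustrative usage (EXPR here stands for a hypothetical loop-invariant
   tree, e.g. an address computation built during analysis):

     tree t = cse_and_gimplify_to_preheader (loop_vinfo, expr);

   T is then either the invariant itself or an SSA name defined on the
   preheader edge; repeated calls with an equal EXPR reuse the cached
   result via LOOP_VINFO's ivexpr_map.  */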
956
957 tree
958 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
959 {
960 if (is_gimple_reg (expr)
961 || is_gimple_min_invariant (expr))
962 return expr;
963
964 if (! loop_vinfo->ivexpr_map)
965 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
966 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
967 if (! cached)
968 {
969 gimple_seq stmts = NULL;
970 cached = force_gimple_operand (unshare_expr (expr),
971 &stmts, true, NULL_TREE);
972 if (stmts)
973 {
974 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
975 gsi_insert_seq_on_edge_immediate (e, stmts);
976 }
977 }
978 return cached;
979 }
980
981 /* Return true if we can use CMP_TYPE as the comparison type to produce
982 all masks required to mask LOOP_VINFO. */
983
984 static bool
985 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
986 {
987 rgroup_masks *rgm;
988 unsigned int i;
989 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
990 if (rgm->mask_type != NULL_TREE
991 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
992 cmp_type, rgm->mask_type,
993 OPTIMIZE_FOR_SPEED))
994 return false;
995 return true;
996 }
997
998 /* Calculate the maximum number of scalars per iteration for every
999 rgroup in LOOP_VINFO. */
1000
1001 static unsigned int
1002 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1003 {
1004 unsigned int res = 1;
1005 unsigned int i;
1006 rgroup_masks *rgm;
1007 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1008 res = MAX (res, rgm->max_nscalars_per_iter);
1009 return res;
1010 }
1011
1012 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1013 whether we can actually generate the masks required. Return true if so,
1014 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
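/* A worked example of the computation below (illustrative, assuming the
   standard integer modes): with a 32-bit unsigned niters type, max_ni
   starts as 2^32; if some rgroup has 2 scalars per iteration this becomes
   2^33, so min_ni_width is 34 and the first integer mode wide enough for
   the WHILE_ULT comparison is a 64-bit one.  */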
1015
1016 static bool
1017 vect_verify_full_masking (loop_vec_info loop_vinfo)
1018 {
1019 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1020 unsigned int min_ni_width;
1021
1022 /* Use a normal loop if there are no statements that need masking.
1023 This only happens in rare degenerate cases: it means that the loop
1024 has no loads, no stores, and no live-out values. */
1025 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1026 return false;
1027
1028 /* Get the maximum number of iterations that is representable
1029 in the counter type. */
1030 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1031 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1032
1033 /* Get a more refined estimate for the number of iterations. */
1034 widest_int max_back_edges;
1035 if (max_loop_iterations (loop, &max_back_edges))
1036 max_ni = wi::smin (max_ni, max_back_edges + 1);
1037
1038 /* Account for rgroup masks, in which each bit is replicated N times. */
1039 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1040
1041 /* Work out how many bits we need to represent the limit. */
1042 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1043
1044 /* Find a scalar mode for which WHILE_ULT is supported. */
1045 opt_scalar_int_mode cmp_mode_iter;
1046 tree cmp_type = NULL_TREE;
1047 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1048 {
1049 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1050 if (cmp_bits >= min_ni_width
1051 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1052 {
1053 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1054 if (this_type
1055 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1056 {
1057 /* Although we could stop as soon as we find a valid mode,
1058 it's often better to continue until we hit Pmode, since the
1059 operands to the WHILE are more likely to be reusable in
1060 address calculations. */
1061 cmp_type = this_type;
1062 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1063 break;
1064 }
1065 }
1066 }
1067
1068 if (!cmp_type)
1069 return false;
1070
1071 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1072 return true;
1073 }
1074
1075 /* Calculate the cost of one scalar iteration of the loop. */
1076 static void
1077 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1078 {
1079 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1080 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1081 int nbbs = loop->num_nodes, factor;
1082 int innerloop_iters, i;
1083
1084 /* Gather costs for statements in the scalar loop. */
1085
1086 /* FORNOW. */
1087 innerloop_iters = 1;
1088 if (loop->inner)
1089 innerloop_iters = 50; /* FIXME */
1090
1091 for (i = 0; i < nbbs; i++)
1092 {
1093 gimple_stmt_iterator si;
1094 basic_block bb = bbs[i];
1095
1096 if (bb->loop_father == loop->inner)
1097 factor = innerloop_iters;
1098 else
1099 factor = 1;
1100
1101 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1102 {
1103 gimple *stmt = gsi_stmt (si);
1104 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1105
1106 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1107 continue;
1108
1109 /* Skip stmts that are not vectorized inside the loop. */
1110 if (stmt_info
1111 && !STMT_VINFO_RELEVANT_P (stmt_info)
1112 && (!STMT_VINFO_LIVE_P (stmt_info)
1113 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1114 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1115 continue;
1116
1117 vect_cost_for_stmt kind;
1118 if (STMT_VINFO_DATA_REF (stmt_info))
1119 {
1120 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1121 kind = scalar_load;
1122 else
1123 kind = scalar_store;
1124 }
1125 else
1126 kind = scalar_stmt;
1127
1128 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1129 factor, kind, stmt_info, 0, vect_prologue);
1130 }
1131 }
1132
1133 /* Now accumulate cost. */
1134 void *target_cost_data = init_cost (loop);
1135 stmt_info_for_cost *si;
1136 int j;
1137 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1138 j, si)
1139 {
1140 struct _stmt_vec_info *stmt_info
1141 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
1142 (void) add_stmt_cost (target_cost_data, si->count,
1143 si->kind, stmt_info, si->misalign,
1144 vect_body);
1145 }
1146 unsigned dummy, body_cost = 0;
1147 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1148 destroy_cost_data (target_cost_data);
1149 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1150 }
1151
1152
1153 /* Function vect_analyze_loop_form_1.
1154
1155 Verify that certain CFG restrictions hold, including:
1156 - the loop has a pre-header
1157 - the loop has a single entry and exit
1158 - the loop exit condition is simple enough
1159    - the number of iterations can be analyzed, i.e., a countable loop.  The
1160 niter could be analyzed under some assumptions. */
1161
1162 bool
1163 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1164 tree *assumptions, tree *number_of_iterationsm1,
1165 tree *number_of_iterations, gcond **inner_loop_cond)
1166 {
1167 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1168
1169 /* Different restrictions apply when we are considering an inner-most loop,
1170 vs. an outer (nested) loop.
1171      (FORNOW; we may want to relax some of these restrictions in the future).  */
1172
1173 if (!loop->inner)
1174 {
1175 /* Inner-most loop. We currently require that the number of BBs is
1176 exactly 2 (the header and latch). Vectorizable inner-most loops
1177 look like this:
1178
1179 (pre-header)
1180 |
1181 header <--------+
1182 | | |
1183 | +--> latch --+
1184 |
1185 (exit-bb) */
1186
1187 if (loop->num_nodes != 2)
1188 {
1189 if (dump_enabled_p ())
1190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1191 "not vectorized: control flow in loop.\n");
1192 return false;
1193 }
1194
1195 if (empty_block_p (loop->header))
1196 {
1197 if (dump_enabled_p ())
1198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199 "not vectorized: empty loop.\n");
1200 return false;
1201 }
1202 }
1203 else
1204 {
1205 struct loop *innerloop = loop->inner;
1206 edge entryedge;
1207
1208 /* Nested loop. We currently require that the loop is doubly-nested,
1209 contains a single inner loop, and the number of BBs is exactly 5.
1210 Vectorizable outer-loops look like this:
1211
1212 (pre-header)
1213 |
1214 header <---+
1215 | |
1216 inner-loop |
1217 | |
1218 tail ------+
1219 |
1220 (exit-bb)
1221
1222 The inner-loop has the properties expected of inner-most loops
1223 as described above. */
1224
1225 if ((loop->inner)->inner || (loop->inner)->next)
1226 {
1227 if (dump_enabled_p ())
1228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1229 "not vectorized: multiple nested loops.\n");
1230 return false;
1231 }
1232
1233 if (loop->num_nodes != 5)
1234 {
1235 if (dump_enabled_p ())
1236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1237 "not vectorized: control flow in loop.\n");
1238 return false;
1239 }
1240
1241 entryedge = loop_preheader_edge (innerloop);
1242 if (entryedge->src != loop->header
1243 || !single_exit (innerloop)
1244 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1245 {
1246 if (dump_enabled_p ())
1247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1248 "not vectorized: unsupported outerloop form.\n");
1249 return false;
1250 }
1251
1252 /* Analyze the inner-loop. */
1253 tree inner_niterm1, inner_niter, inner_assumptions;
1254 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1255 &inner_assumptions, &inner_niterm1,
1256 &inner_niter, NULL)
1257 /* Don't support analyzing niter under assumptions for inner
1258 loop. */
1259 || !integer_onep (inner_assumptions))
1260 {
1261 if (dump_enabled_p ())
1262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1263 "not vectorized: Bad inner loop.\n");
1264 return false;
1265 }
1266
1267 if (!expr_invariant_in_loop_p (loop, inner_niter))
1268 {
1269 if (dump_enabled_p ())
1270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1271 "not vectorized: inner-loop count not"
1272 " invariant.\n");
1273 return false;
1274 }
1275
1276 if (dump_enabled_p ())
1277 dump_printf_loc (MSG_NOTE, vect_location,
1278 "Considering outer-loop vectorization.\n");
1279 }
1280
1281 if (!single_exit (loop)
1282 || EDGE_COUNT (loop->header->preds) != 2)
1283 {
1284 if (dump_enabled_p ())
1285 {
1286 if (!single_exit (loop))
1287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1288 "not vectorized: multiple exits.\n");
1289 else if (EDGE_COUNT (loop->header->preds) != 2)
1290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291 "not vectorized: too many incoming edges.\n");
1292 }
1293 return false;
1294 }
1295
1296    /* We assume that the loop exit condition is at the end of the loop, i.e.,
1297 that the loop is represented as a do-while (with a proper if-guard
1298 before the loop if needed), where the loop header contains all the
1299 executable statements, and the latch is empty. */
1300 if (!empty_block_p (loop->latch)
1301 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1302 {
1303 if (dump_enabled_p ())
1304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1305 "not vectorized: latch block not empty.\n");
1306 return false;
1307 }
1308
1309 /* Make sure the exit is not abnormal. */
1310 edge e = single_exit (loop);
1311 if (e->flags & EDGE_ABNORMAL)
1312 {
1313 if (dump_enabled_p ())
1314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1315 "not vectorized: abnormal loop exit edge.\n");
1316 return false;
1317 }
1318
1319 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1320 number_of_iterationsm1);
1321 if (!*loop_cond)
1322 {
1323 if (dump_enabled_p ())
1324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1325 "not vectorized: complicated exit condition.\n");
1326 return false;
1327 }
1328
1329 if (integer_zerop (*assumptions)
1330 || !*number_of_iterations
1331 || chrec_contains_undetermined (*number_of_iterations))
1332 {
1333 if (dump_enabled_p ())
1334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1335 "not vectorized: number of iterations cannot be "
1336 "computed.\n");
1337 return false;
1338 }
1339
1340 if (integer_zerop (*number_of_iterations))
1341 {
1342 if (dump_enabled_p ())
1343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1344 "not vectorized: number of iterations = 0.\n");
1345 return false;
1346 }
1347
1348 return true;
1349 }
1350
1351 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1352
1353 loop_vec_info
1354 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1355 {
1356 tree assumptions, number_of_iterations, number_of_iterationsm1;
1357 gcond *loop_cond, *inner_loop_cond = NULL;
1358
1359 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1360 &assumptions, &number_of_iterationsm1,
1361 &number_of_iterations, &inner_loop_cond))
1362 return NULL;
1363
1364 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1365 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1366 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1367 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1368 if (!integer_onep (assumptions))
1369 {
1370        /* We consider vectorizing this loop by versioning it under
1371           some assumptions.  In order to do this, we need to clear
1372           existing information computed by the scev and niter analyzers.  */
1373 scev_reset_htab ();
1374 free_numbers_of_iterations_estimates (loop);
1375        /* Also set a flag for this loop so that subsequent scev and niter
1376           analyses are done under the assumptions.  */
1377 loop_constraint_set (loop, LOOP_C_FINITE);
1378 /* Also record the assumptions for versioning. */
1379 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1380 }
1381
1382 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1383 {
1384 if (dump_enabled_p ())
1385 {
1386 dump_printf_loc (MSG_NOTE, vect_location,
1387 "Symbolic number of iterations is ");
1388 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1389 dump_printf (MSG_NOTE, "\n");
1390 }
1391 }
1392
1393 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1394 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1395 if (inner_loop_cond)
1396 {
1397 stmt_vec_info inner_loop_cond_info
1398 = loop_vinfo->lookup_stmt (inner_loop_cond);
1399 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1400 }
1401
1402 gcc_assert (!loop->aux);
1403 loop->aux = loop_vinfo;
1404 return loop_vinfo;
1405 }
1406
1407
1408
1409 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1410    statements, update the vectorization factor.  */
1411
1412 static void
1413 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1414 {
1415 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1416 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1417 int nbbs = loop->num_nodes;
1418 poly_uint64 vectorization_factor;
1419 int i;
1420
1421 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1422
1423 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1424 gcc_assert (known_ne (vectorization_factor, 0U));
1425
1426    /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1427       the vectorization factor of the loop is the unrolling factor required by
1428       the SLP instances.  If that unrolling factor is 1, we say that we
1429       perform pure SLP on the loop: cross-iteration parallelism is not
1430       exploited.  */
1431 bool only_slp_in_loop = true;
1432 for (i = 0; i < nbbs; i++)
1433 {
1434 basic_block bb = bbs[i];
1435 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1436 gsi_next (&si))
1437 {
1438 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1439 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1440 && STMT_VINFO_RELATED_STMT (stmt_info))
1441 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
1442 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1443 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1444 && !PURE_SLP_STMT (stmt_info))
1445 /* STMT needs both SLP and loop-based vectorization. */
1446 only_slp_in_loop = false;
1447 }
1448 }
1449
1450 if (only_slp_in_loop)
1451 {
1452 dump_printf_loc (MSG_NOTE, vect_location,
1453 "Loop contains only SLP stmts\n");
1454 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1455 }
1456 else
1457 {
1458 dump_printf_loc (MSG_NOTE, vect_location,
1459 "Loop contains SLP and non-SLP stmts\n");
1460 /* Both the vectorization factor and unroll factor have the form
1461 current_vector_size * X for some rational X, so they must have
1462 a common multiple. */
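      /* Illustrative numbers only: a current vectorization factor of 4
         combined with an SLP unrolling factor of 6 yields 12, their least
         common multiple.  */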
1463 vectorization_factor
1464 = force_common_multiple (vectorization_factor,
1465 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1466 }
1467
1468 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1469 if (dump_enabled_p ())
1470 {
1471 dump_printf_loc (MSG_NOTE, vect_location,
1472 "Updating vectorization factor to ");
1473 dump_dec (MSG_NOTE, vectorization_factor);
1474 dump_printf (MSG_NOTE, ".\n");
1475 }
1476 }
1477
1478 /* Return true if STMT_INFO describes a double reduction phi and if
1479 the other phi in the reduction is also relevant for vectorization.
1480 This rejects cases such as:
1481
1482 outer1:
1483 x_1 = PHI <x_3(outer2), ...>;
1484 ...
1485
1486 inner:
1487 x_2 = ...;
1488 ...
1489
1490 outer2:
1491 x_3 = PHI <x_2(inner)>;
1492
1493 if nothing in x_2 or elsewhere makes x_1 relevant. */
1494
1495 static bool
1496 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1497 {
1498 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1499 return false;
1500
1501 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1502 }
1503
1504 /* Function vect_analyze_loop_operations.
1505
1506 Scan the loop stmts and make sure they are all vectorizable. */
1507
1508 static bool
1509 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1510 {
1511 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1512 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1513 int nbbs = loop->num_nodes;
1514 int i;
1515 stmt_vec_info stmt_info;
1516 bool need_to_vectorize = false;
1517 bool ok;
1518
1519 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1520
1521 stmt_vector_for_cost cost_vec;
1522 cost_vec.create (2);
1523
1524 for (i = 0; i < nbbs; i++)
1525 {
1526 basic_block bb = bbs[i];
1527
1528 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1529 gsi_next (&si))
1530 {
1531 gphi *phi = si.phi ();
1532 ok = true;
1533
1534 stmt_info = loop_vinfo->lookup_stmt (phi);
1535 if (dump_enabled_p ())
1536 {
1537 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1538 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1539 }
1540 if (virtual_operand_p (gimple_phi_result (phi)))
1541 continue;
1542
1543 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1544 (i.e., a phi in the tail of the outer-loop). */
1545 if (! is_loop_header_bb_p (bb))
1546 {
1547          /* FORNOW: we currently don't support the case in which these phis
1548             are not used in the outer loop (unless it is a double reduction,
1549             i.e., this phi is vect_reduction_def), because that case
1550             requires us to actually do something here.  */
1551 if (STMT_VINFO_LIVE_P (stmt_info)
1552 && !vect_active_double_reduction_p (stmt_info))
1553 {
1554 if (dump_enabled_p ())
1555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556 "Unsupported loop-closed phi in "
1557 "outer-loop.\n");
1558 return false;
1559 }
1560
1561 /* If PHI is used in the outer loop, we check that its operand
1562 is defined in the inner loop. */
1563 if (STMT_VINFO_RELEVANT_P (stmt_info))
1564 {
1565 tree phi_op;
1566
1567 if (gimple_phi_num_args (phi) != 1)
1568 return false;
1569
1570 phi_op = PHI_ARG_DEF (phi, 0);
1571 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1572 if (!op_def_info)
1573 return false;
1574
1575 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1576 && (STMT_VINFO_RELEVANT (op_def_info)
1577 != vect_used_in_outer_by_reduction))
1578 return false;
1579 }
1580
1581 continue;
1582 }
1583
1584 gcc_assert (stmt_info);
1585
1586 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1587 || STMT_VINFO_LIVE_P (stmt_info))
1588 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1589 {
1590 /* A scalar-dependence cycle that we don't support. */
1591 if (dump_enabled_p ())
1592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1593 "not vectorized: scalar dependence cycle.\n");
1594 return false;
1595 }
1596
1597 if (STMT_VINFO_RELEVANT_P (stmt_info))
1598 {
1599 need_to_vectorize = true;
1600 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1601 && ! PURE_SLP_STMT (stmt_info))
1602 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1603 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1604 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1605 && ! PURE_SLP_STMT (stmt_info))
1606 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1607 &cost_vec);
1608 }
1609
1610 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1611 if (ok
1612 && STMT_VINFO_LIVE_P (stmt_info)
1613 && !PURE_SLP_STMT (stmt_info))
1614 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1615 &cost_vec);
1616
1617 if (!ok)
1618 {
1619 if (dump_enabled_p ())
1620 {
1621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1622 "not vectorized: relevant phi not "
1623 "supported: ");
1624 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1625 }
1626 return false;
1627 }
1628 }
1629
1630 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1631 gsi_next (&si))
1632 {
1633 gimple *stmt = gsi_stmt (si);
1634 if (!gimple_clobber_p (stmt)
1635 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1636 &cost_vec))
1637 return false;
1638 }
1639 } /* bbs */
1640
1641 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1642 cost_vec.release ();
1643
1644    /* All operations in the loop are either irrelevant (they deal with loop
1645       control, or are dead), or only used outside the loop and can be moved
1646 out of the loop (e.g. invariants, inductions). The loop can be
1647 optimized away by scalar optimizations. We're better off not
1648 touching this loop. */
1649 if (!need_to_vectorize)
1650 {
1651 if (dump_enabled_p ())
1652 dump_printf_loc (MSG_NOTE, vect_location,
1653 "All the computation can be taken out of the loop.\n");
1654 if (dump_enabled_p ())
1655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1656 "not vectorized: redundant loop. no profit to "
1657 "vectorize.\n");
1658 return false;
1659 }
1660
1661 return true;
1662 }
1663
1664 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1665 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1666 definitely no, or -1 if it's worth retrying. */
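/* A worked example of the threshold computation below (illustrative
   parameter values): with --param min-vect-loop-bound=2 and an assumed
   vectorization factor of 4, min_scalar_loop_bound is 8; if the cost model
   reports a minimum of 6 profitable iterations, the threshold becomes
   MAX (8, 6) = 8 iterations.  */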
1667
1668 static int
1669 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1670 {
1671 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1672 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1673
1674 /* Only fully-masked loops can have iteration counts less than the
1675 vectorization factor. */
1676 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1677 {
1678 HOST_WIDE_INT max_niter;
1679
1680 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1681 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1682 else
1683 max_niter = max_stmt_executions_int (loop);
1684
1685 if (max_niter != -1
1686 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1687 {
1688 if (dump_enabled_p ())
1689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1690 "not vectorized: iteration count smaller than "
1691 "vectorization factor.\n");
1692 return 0;
1693 }
1694 }
1695
1696 int min_profitable_iters, min_profitable_estimate;
1697 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1698 &min_profitable_estimate);
1699
1700 if (min_profitable_iters < 0)
1701 {
1702 if (dump_enabled_p ())
1703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1704 "not vectorized: vectorization not profitable.\n");
1705 if (dump_enabled_p ())
1706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1707 "not vectorized: vector version will never be "
1708 "profitable.\n");
1709 return -1;
1710 }
1711
1712 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1713 * assumed_vf);
1714
1715    /* Use the cost model only if it is more conservative than the user-specified
1716 threshold. */
1717 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1718 min_profitable_iters);
1719
1720 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1721
1722 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1723 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1724 {
1725 if (dump_enabled_p ())
1726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1727 "not vectorized: vectorization not profitable.\n");
1728 if (dump_enabled_p ())
1729 dump_printf_loc (MSG_NOTE, vect_location,
1730 "not vectorized: iteration count smaller than user "
1731 "specified loop bound parameter or minimum profitable "
1732 "iterations (whichever is more conservative).\n");
1733 return 0;
1734 }
1735
1736 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1737 if (estimated_niter == -1)
1738 estimated_niter = likely_max_stmt_executions_int (loop);
1739 if (estimated_niter != -1
1740 && ((unsigned HOST_WIDE_INT) estimated_niter
1741 < MAX (th, (unsigned) min_profitable_estimate)))
1742 {
1743 if (dump_enabled_p ())
1744 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1745 "not vectorized: estimated iteration count too "
1746 "small.\n");
1747 if (dump_enabled_p ())
1748 dump_printf_loc (MSG_NOTE, vect_location,
1749 "not vectorized: estimated iteration count smaller "
1750 "than specified loop bound parameter or minimum "
1751 "profitable iterations (whichever is more "
1752 "conservative).\n");
1753 return -1;
1754 }
1755
1756 return 1;
1757 }
1758
1759 static bool
1760 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1761 vec<data_reference_p> *datarefs,
1762 unsigned int *n_stmts)
1763 {
1764 *n_stmts = 0;
1765 for (unsigned i = 0; i < loop->num_nodes; i++)
1766 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1767 !gsi_end_p (gsi); gsi_next (&gsi))
1768 {
1769 gimple *stmt = gsi_stmt (gsi);
1770 if (is_gimple_debug (stmt))
1771 continue;
1772 ++(*n_stmts);
1773 if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1774 {
1775 if (is_gimple_call (stmt) && loop->safelen)
1776 {
1777 tree fndecl = gimple_call_fndecl (stmt), op;
1778 if (fndecl != NULL_TREE)
1779 {
1780 cgraph_node *node = cgraph_node::get (fndecl);
1781 if (node != NULL && node->simd_clones != NULL)
1782 {
1783 unsigned int j, n = gimple_call_num_args (stmt);
1784 for (j = 0; j < n; j++)
1785 {
1786 op = gimple_call_arg (stmt, j);
1787 if (DECL_P (op)
1788 || (REFERENCE_CLASS_P (op)
1789 && get_base_address (op)))
1790 break;
1791 }
1792 op = gimple_call_lhs (stmt);
1793 /* Ignore #pragma omp declare simd functions
1794 if they don't have data references in the
1795 call stmt itself. */
1796 if (j == n
1797 && !(op
1798 && (DECL_P (op)
1799 || (REFERENCE_CLASS_P (op)
1800 && get_base_address (op)))))
1801 continue;
1802 }
1803 }
1804 }
1805 return false;
1806 }
1807 /* If dependence analysis will give up due to the limit on the
1808 number of datarefs, stop here and fail fatally. */
1809 if (datarefs->length ()
1810 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1811 return false;
1812 }
1813 return true;
1814 }
1815
1816 /* Function vect_analyze_loop_2.
1817
1818 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1819 for it. The different analyses will record information in the
1820 loop_vec_info struct. */
1821 static bool
1822 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1823 {
1824 bool ok;
1825 int res;
1826 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1827 poly_uint64 min_vf = 2;
1828
1829 /* The first group of checks is independent of the vector size. */
1830 fatal = true;
1831
1832 /* Find all data references in the loop (which correspond to vdefs/vuses)
1833 and analyze their evolution in the loop. */
1834
1835 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1836
1837 /* Gather the data references and count stmts in the loop. */
1838 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1839 {
1840 if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1841 &LOOP_VINFO_DATAREFS (loop_vinfo),
1842 n_stmts))
1843 {
1844 if (dump_enabled_p ())
1845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1846 "not vectorized: loop contains function "
1847 "calls or data references that cannot "
1848 "be analyzed\n");
1849 return false;
1850 }
1851 loop_vinfo->shared->save_datarefs ();
1852 }
1853 else
1854 loop_vinfo->shared->check_datarefs ();
1855
1856 /* Analyze the data references and also adjust the minimal
1857 vectorization factor according to the loads and stores. */
1858
1859 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1860 if (!ok)
1861 {
1862 if (dump_enabled_p ())
1863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1864 "bad data references.\n");
1865 return false;
1866 }
1867
1868 /* Classify all cross-iteration scalar data-flow cycles.
1869 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1870 vect_analyze_scalar_cycles (loop_vinfo);
1871
1872 vect_pattern_recog (loop_vinfo);
1873
1874 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1875
1876 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1877 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1878
1879 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1880 if (!ok)
1881 {
1882 if (dump_enabled_p ())
1883 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1884 "bad data access.\n");
1885 return false;
1886 }
1887
1888 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1889
1890 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1891 if (!ok)
1892 {
1893 if (dump_enabled_p ())
1894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1895 "unexpected pattern.\n");
1896 return false;
1897 }
1898
1899 /* The rest of the analysis below depends on the vector size, so failures from here on are not fatal. */
1900 fatal = false;
1901
1902 /* Analyze data dependences between the data-refs in the loop
1903 and adjust the maximum vectorization factor according to
1904 the dependences.
1905 FORNOW: fail at the first data dependence that we encounter. */
1906
1907 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1908 if (!ok
1909 || (max_vf != MAX_VECTORIZATION_FACTOR
1910 && maybe_lt (max_vf, min_vf)))
1911 {
1912 if (dump_enabled_p ())
1913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1914 "bad data dependence.\n");
1915 return false;
1916 }
1917 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1918
1919 ok = vect_determine_vectorization_factor (loop_vinfo);
1920 if (!ok)
1921 {
1922 if (dump_enabled_p ())
1923 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1924 "can't determine vectorization factor.\n");
1925 return false;
1926 }
1927 if (max_vf != MAX_VECTORIZATION_FACTOR
1928 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1929 {
1930 if (dump_enabled_p ())
1931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1932 "bad data dependence.\n");
1933 return false;
1934 }
1935
1936 /* Compute the scalar iteration cost. */
1937 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1938
1939 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1940 unsigned th;
1941
1942 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1943 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1944 if (!ok)
1945 return false;
1946
1947 /* If there are any SLP instances mark them as pure_slp. */
1948 bool slp = vect_make_slp_decision (loop_vinfo);
1949 if (slp)
1950 {
1951 /* Find stmts that need to be both vectorized and SLPed. */
1952 vect_detect_hybrid_slp (loop_vinfo);
1953
1954 /* Update the vectorization factor based on the SLP decision. */
1955 vect_update_vf_for_slp (loop_vinfo);
1956 }
1957
1958 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1959
1960 /* We don't expect to have to roll back to anything other than an empty
1961 set of rgroups. */
1962 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1963
1964 /* This is the point where we can re-start analysis with SLP forced off. */
1965 start_over:
1966
1967 /* Now the vectorization factor is final. */
1968 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1969 gcc_assert (known_ne (vectorization_factor, 0U));
1970
1971 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1972 {
1973 dump_printf_loc (MSG_NOTE, vect_location,
1974 "vectorization_factor = ");
1975 dump_dec (MSG_NOTE, vectorization_factor);
1976 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1977 LOOP_VINFO_INT_NITERS (loop_vinfo));
1978 }
1979
1980 HOST_WIDE_INT max_niter
1981 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1982
1983 /* Analyze the alignment of the data-refs in the loop.
1984 Fail if a data reference is found that cannot be vectorized. */
1985
1986 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1987 if (!ok)
1988 {
1989 if (dump_enabled_p ())
1990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1991 "bad data alignment.\n");
1992 return false;
1993 }
1994
1995 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1996 It is important to call pruning after vect_analyze_data_ref_accesses,
1997 since we use grouping information gathered by interleaving analysis. */
1998 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1999 if (!ok)
2000 return false;
2001
2002 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2003 vectorization. */
2004 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2005 {
2006 /* This pass will decide on using loop versioning and/or loop peeling in
2007 order to enhance the alignment of data references in the loop. */
2008 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2009 if (!ok)
2010 {
2011 if (dump_enabled_p ())
2012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2013 "bad data alignment.\n");
2014 return false;
2015 }
2016 }
2017
2018 if (slp)
2019 {
2020 /* Analyze operations in the SLP instances. Note this may
2021 remove unsupported SLP instances which makes the above
2022 SLP kind detection invalid. */
2023 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2024 vect_slp_analyze_operations (loop_vinfo);
2025 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2026 goto again;
2027 }
2028
2029 /* Scan all the remaining operations in the loop that are not subject
2030 to SLP and make sure they are vectorizable. */
2031 ok = vect_analyze_loop_operations (loop_vinfo);
2032 if (!ok)
2033 {
2034 if (dump_enabled_p ())
2035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2036 "bad operation or unsupported loop bound.\n");
2037 return false;
2038 }
2039
2040 /* Decide whether to use a fully-masked loop for this vectorization
2041 factor. */
2042 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2043 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2044 && vect_verify_full_masking (loop_vinfo));
2045 if (dump_enabled_p ())
2046 {
2047 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2048 dump_printf_loc (MSG_NOTE, vect_location,
2049 "using a fully-masked loop.\n");
2050 else
2051 dump_printf_loc (MSG_NOTE, vect_location,
2052 "not using a fully-masked loop.\n");
2053 }
2054
2055 /* If an epilogue loop is required because of data accesses with gaps,
2056 one additional iteration needs to be peeled. Check whether there are
2057 enough iterations for vectorization. */
2058 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2059 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2060 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2061 {
2062 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2063 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2064
2065 if (known_lt (wi::to_widest (scalar_niters), vf))
2066 {
2067 if (dump_enabled_p ())
2068 dump_printf_loc (MSG_NOTE, vect_location,
2069 "loop has no enough iterations to support"
2070 " peeling for gaps.\n");
2071 return false;
2072 }
2073 }
2074
2075 /* Check that the costings of the loop make vectorizing worthwhile. */
2076 res = vect_analyze_loop_costing (loop_vinfo);
2077 if (res < 0)
2078 goto again;
2079 if (!res)
2080 {
2081 if (dump_enabled_p ())
2082 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2083 "Loop costings not worthwhile.\n");
2084 return false;
2085 }
2086
2087 /* Decide whether we need to create an epilogue loop to handle
2088 remaining scalar iterations. */
2089 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2090
2091 unsigned HOST_WIDE_INT const_vf;
2092 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2093 /* The main loop handles all iterations. */
2094 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2095 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2096 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2097 {
2098 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2099 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2100 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2101 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2102 }
2103 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2104 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2105 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2106 < (unsigned) exact_log2 (const_vf))
2107 /* In case of versioning, check if the maximum number of
2108 iterations is greater than th. If they are identical,
2109 the epilogue is unnecessary. */
2110 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2111 || ((unsigned HOST_WIDE_INT) max_niter
2112 > (th / const_vf) * const_vf))))
2113 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2114
2115 /* If an epilogue loop is required make sure we can create one. */
2116 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2117 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2118 {
2119 if (dump_enabled_p ())
2120 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2121 if (!vect_can_advance_ivs_p (loop_vinfo)
2122 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2123 single_exit (LOOP_VINFO_LOOP
2124 (loop_vinfo))))
2125 {
2126 if (dump_enabled_p ())
2127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2128 "not vectorized: can't create required "
2129 "epilog loop\n");
2130 goto again;
2131 }
2132 }
2133
2134 /* During peeling, we need to check whether the number of loop iterations
2135 is enough for both the peeled prolog loop and the vector loop. This
2136 check can be merged with the threshold check of loop versioning, so
2137 increase the threshold for this case if necessary. */
2138 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2139 {
2140 poly_uint64 niters_th = 0;
2141
2142 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2143 {
2144 /* Niters for peeled prolog loop. */
2145 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2146 {
2147 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2148 tree vectype = STMT_VINFO_VECTYPE (vect_dr_stmt (dr));
2149 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2150 }
2151 else
2152 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2153 }
2154
2155 /* Niters for at least one iteration of vectorized loop. */
2156 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2157 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2158 /* One additional iteration because of peeling for gap. */
2159 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2160 niters_th += 1;
2161 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2162 }
2163
2164 gcc_assert (known_eq (vectorization_factor,
2165 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2166
2167 /* Ok to vectorize! */
2168 return true;
2169
2170 again:
2171 /* Try again with SLP forced off, but if we didn't do any SLP there is
2172 no point in re-trying. */
2173 if (!slp)
2174 return false;
2175
2176 /* If there are reduction chains, re-trying will fail anyway. */
2177 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2178 return false;
2179
2180 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2181 via interleaving or lane instructions. */
2182 slp_instance instance;
2183 slp_tree node;
2184 unsigned i, j;
2185 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2186 {
2187 stmt_vec_info vinfo;
2188 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2189 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2190 continue;
2191 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2192 unsigned int size = DR_GROUP_SIZE (vinfo);
2193 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2194 if (! vect_store_lanes_supported (vectype, size, false)
2195 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2196 && ! vect_grouped_store_supported (vectype, size))
2197 return false;
2198 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2199 {
2200 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2201 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2202 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2203 size = DR_GROUP_SIZE (vinfo);
2204 vectype = STMT_VINFO_VECTYPE (vinfo);
2205 if (! vect_load_lanes_supported (vectype, size, false)
2206 && ! vect_grouped_load_supported (vectype, single_element_p,
2207 size))
2208 return false;
2209 }
2210 }
2211
2212 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_NOTE, vect_location,
2214 "re-trying with SLP disabled\n");
2215
2216 /* Roll back state appropriately. No SLP this time. */
2217 slp = false;
2218 /* Restore the vectorization factor as it was without SLP. */
2219 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2220 /* Free the SLP instances. */
2221 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2222 vect_free_slp_instance (instance, false);
2223 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2224 /* Reset SLP type to loop_vect on all stmts. */
2225 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2226 {
2227 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2228 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2229 !gsi_end_p (si); gsi_next (&si))
2230 {
2231 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2232 STMT_SLP_TYPE (stmt_info) = loop_vect;
2233 }
2234 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2235 !gsi_end_p (si); gsi_next (&si))
2236 {
2237 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2238 STMT_SLP_TYPE (stmt_info) = loop_vect;
2239 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2240 {
2241 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2242 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2243 STMT_SLP_TYPE (stmt_info) = loop_vect;
2244 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2245 !gsi_end_p (pi); gsi_next (&pi))
2246 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2247 = loop_vect;
2248 }
2249 }
2250 }
2251 /* Free optimized alias test DDRS. */
2252 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2253 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2254 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2255 /* Reset target cost data. */
2256 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2257 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2258 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2259 /* Reset accumulated rgroup information. */
2260 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2261 /* Reset assorted flags. */
2262 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2263 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2264 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2265 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2266 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2267
2268 goto start_over;
2269 }
2270
2271 /* Function vect_analyze_loop.
2272
2273 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2274 for it. The different analyses will record information in the
2275 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, the epilogue
2276 must be vectorized. */
2277 loop_vec_info
2278 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2279 vec_info_shared *shared)
2280 {
2281 loop_vec_info loop_vinfo;
2282 auto_vector_sizes vector_sizes;
2283
2284 /* Autodetect first vector size we try. */
2285 current_vector_size = 0;
2286 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2287 unsigned int next_size = 0;
2288
2289 DUMP_VECT_SCOPE ("analyze_loop_nest");
2290
2291 if (loop_outer (loop)
2292 && loop_vec_info_for_loop (loop_outer (loop))
2293 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2294 {
2295 if (dump_enabled_p ())
2296 dump_printf_loc (MSG_NOTE, vect_location,
2297 "outer-loop already vectorized.\n");
2298 return NULL;
2299 }
2300
2301 if (!find_loop_nest (loop, &shared->loop_nest))
2302 {
2303 if (dump_enabled_p ())
2304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2305 "not vectorized: loop nest containing two "
2306 "or more consecutive inner loops cannot be "
2307 "vectorized\n");
2308 return NULL;
2309 }
2310
2311 unsigned n_stmts = 0;
2312 poly_uint64 autodetected_vector_size = 0;
2313 while (1)
2314 {
2315 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2316 loop_vinfo = vect_analyze_loop_form (loop, shared);
2317 if (!loop_vinfo)
2318 {
2319 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2321 "bad loop form.\n");
2322 return NULL;
2323 }
2324
2325 bool fatal = false;
2326
2327 if (orig_loop_vinfo)
2328 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2329
2330 if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2331 {
2332 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2333
2334 return loop_vinfo;
2335 }
2336
2337 delete loop_vinfo;
2338
2339 if (next_size == 0)
2340 autodetected_vector_size = current_vector_size;
2341
2342 if (next_size < vector_sizes.length ()
2343 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2344 next_size += 1;
2345
2346 if (fatal
2347 || next_size == vector_sizes.length ()
2348 || known_eq (current_vector_size, 0U))
2349 return NULL;
2350
2351 /* Try the next biggest vector size. */
2352 current_vector_size = vector_sizes[next_size++];
2353 if (dump_enabled_p ())
2354 {
2355 dump_printf_loc (MSG_NOTE, vect_location,
2356 "***** Re-trying analysis with "
2357 "vector size ");
2358 dump_dec (MSG_NOTE, current_vector_size);
2359 dump_printf (MSG_NOTE, "\n");
2360 }
2361 }
2362 }
2363
2364 /* Return true if there is an in-order reduction function for CODE, storing
2365 it in *REDUC_FN if so. */
2366
2367 static bool
2368 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2369 {
2370 switch (code)
2371 {
2372 case PLUS_EXPR:
2373 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2374 return true;
2375
2376 default:
2377 return false;
2378 }
2379 }
2380
2381 /* Function reduction_fn_for_scalar_code
2382
2383 Input:
2384 CODE - tree_code of a reduction operation.
2385
2386 Output:
2387 REDUC_FN - the corresponding internal function to be used to reduce the
2388 vector of partial results into a single scalar result, or IFN_LAST
2389 if the operation is a supported reduction operation, but does not have
2390 such an internal function.
2391
2392 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2393
2394 static bool
2395 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2396 {
2397 switch (code)
2398 {
2399 case MAX_EXPR:
2400 *reduc_fn = IFN_REDUC_MAX;
2401 return true;
2402
2403 case MIN_EXPR:
2404 *reduc_fn = IFN_REDUC_MIN;
2405 return true;
2406
2407 case PLUS_EXPR:
2408 *reduc_fn = IFN_REDUC_PLUS;
2409 return true;
2410
2411 case BIT_AND_EXPR:
2412 *reduc_fn = IFN_REDUC_AND;
2413 return true;
2414
2415 case BIT_IOR_EXPR:
2416 *reduc_fn = IFN_REDUC_IOR;
2417 return true;
2418
2419 case BIT_XOR_EXPR:
2420 *reduc_fn = IFN_REDUC_XOR;
2421 return true;
2422
2423 case MULT_EXPR:
2424 case MINUS_EXPR:
2425 *reduc_fn = IFN_LAST;
2426 return true;
2427
2428 default:
2429 return false;
2430 }
2431 }
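
To make the "reduce the vector of partial results into a single scalar result" step concrete, here is a minimal scalar sketch of a PLUS_EXPR reduction performed four lanes at a time; the final lane[0] + lane[1] + lane[2] + lane[3] step is what a target instruction selected through IFN_REDUC_PLUS would do in one go when the target provides it (the array size and lane count are illustrative assumptions):

#include <stdio.h>

#define N 16

int
main (void)
{
  int a[N], lane[4] = { 0, 0, 0, 0 };
  for (int i = 0; i < N; i++)
    a[i] = i + 1;

  /* Vectorized body: each lane accumulates a partial sum, as if a 4-wide
     vector register held four independent accumulators.  */
  for (int i = 0; i < N; i += 4)
    for (int j = 0; j < 4; j++)
      lane[j] += a[i + j];

  /* Final reduction: collapse the partial sums into one scalar.  */
  int sum = lane[0] + lane[1] + lane[2] + lane[3];
  printf ("%d\n", sum);                /* prints 136 = 1 + 2 + ... + 16 */
  return 0;
}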
2432
2433 /* If there is a neutral value X such that SLP reduction NODE would not
2434 be affected by the introduction of additional X elements, return that X,
2435 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2436 is true if the SLP statements perform a single reduction, false if each
2437 statement performs an independent reduction. */
2438
2439 static tree
2440 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2441 bool reduc_chain)
2442 {
2443 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2444 stmt_vec_info stmt_vinfo = stmts[0];
2445 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2446 tree scalar_type = TREE_TYPE (vector_type);
2447 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2448 gcc_assert (loop);
2449
2450 switch (code)
2451 {
2452 case WIDEN_SUM_EXPR:
2453 case DOT_PROD_EXPR:
2454 case SAD_EXPR:
2455 case PLUS_EXPR:
2456 case MINUS_EXPR:
2457 case BIT_IOR_EXPR:
2458 case BIT_XOR_EXPR:
2459 return build_zero_cst (scalar_type);
2460
2461 case MULT_EXPR:
2462 return build_one_cst (scalar_type);
2463
2464 case BIT_AND_EXPR:
2465 return build_all_ones_cst (scalar_type);
2466
2467 case MAX_EXPR:
2468 case MIN_EXPR:
2469 /* For MIN/MAX the initial values are neutral. A reduction chain
2470 has only a single initial value, so that value is neutral for
2471 all statements. */
2472 if (reduc_chain)
2473 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2474 loop_preheader_edge (loop));
2475 return NULL_TREE;
2476
2477 default:
2478 return NULL_TREE;
2479 }
2480 }
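
A scalar sketch of what "neutral value" means in practice: padding a reduction with extra elements equal to the neutral value leaves the result unchanged, which is what allows partially filled vectors to be handled. The array contents and the padded width of 8 below are illustrative assumptions:

#include <stdio.h>

int
main (void)
{
  int a[5] = { 3, 7, 2, 9, 4 };

  /* Pretend the vector width forces 8 lanes: the three padding lanes get
     the neutral value of each operation, so every result below is the
     same as reducing just a[0..4].  */
  int sum = 0, prod = 1, all = -1;
  for (int i = 0; i < 8; i++)
    {
      int x_add = i < 5 ? a[i] : 0;    /* neutral for PLUS_EXPR */
      int x_mul = i < 5 ? a[i] : 1;    /* neutral for MULT_EXPR */
      int x_and = i < 5 ? a[i] : -1;   /* neutral for BIT_AND_EXPR (all ones) */
      sum += x_add;
      prod *= x_mul;
      all &= x_and;
    }
  printf ("%d %d %d\n", sum, prod, all);
  return 0;
}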
2481
2482 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2483 STMT is printed with a message MSG. */
2484
2485 static void
2486 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2487 {
2488 dump_printf_loc (msg_type, vect_location, "%s", msg);
2489 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2490 }
2491
2492 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2493 operation. Return true if the results of DEF_STMT_INFO are something
2494 that can be accumulated by such a reduction. */
2495
2496 static bool
2497 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2498 {
2499 return (is_gimple_assign (def_stmt_info->stmt)
2500 || is_gimple_call (def_stmt_info->stmt)
2501 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2502 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2503 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2504 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2505 }
2506
2507 /* Detect SLP reduction of the form:
2508
2509 #a1 = phi <a5, a0>
2510 a2 = operation (a1)
2511 a3 = operation (a2)
2512 a4 = operation (a3)
2513 a5 = operation (a4)
2514
2515 #a = phi <a5>
2516
2517 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2518 FIRST_STMT is the first reduction stmt in the chain
2519 (a2 = operation (a1)).
2520
2521 Return TRUE if a reduction chain was detected. */
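
A source-level loop that typically gives rise to the chain above feeds the accumulator through the same operation several times per iteration, for example (a sketch; the function name and the grouping by four are illustrative):

int
chain_sum (const int *a, int n)
{
  int s = 0;                     /* a0; the loop-carried s is the a1 phi */
  for (int i = 0; i < n; i++)
    {
      s = s + a[4 * i + 0];      /* a2 = operation (a1) */
      s = s + a[4 * i + 1];      /* a3 = operation (a2) */
      s = s + a[4 * i + 2];      /* a4 = operation (a3) */
      s = s + a[4 * i + 3];      /* a5 = operation (a4) */
    }
  return s;
}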
2522
2523 static bool
2524 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2525 gimple *first_stmt)
2526 {
2527 struct loop *loop = (gimple_bb (phi))->loop_father;
2528 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2529 enum tree_code code;
2530 gimple *loop_use_stmt = NULL;
2531 stmt_vec_info use_stmt_info, current_stmt_info = NULL;
2532 tree lhs;
2533 imm_use_iterator imm_iter;
2534 use_operand_p use_p;
2535 int nloop_uses, size = 0, n_out_of_loop_uses;
2536 bool found = false;
2537
2538 if (loop != vect_loop)
2539 return false;
2540
2541 lhs = PHI_RESULT (phi);
2542 code = gimple_assign_rhs_code (first_stmt);
2543 while (1)
2544 {
2545 nloop_uses = 0;
2546 n_out_of_loop_uses = 0;
2547 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2548 {
2549 gimple *use_stmt = USE_STMT (use_p);
2550 if (is_gimple_debug (use_stmt))
2551 continue;
2552
2553 /* Check if we got back to the reduction phi. */
2554 if (use_stmt == phi)
2555 {
2556 loop_use_stmt = use_stmt;
2557 found = true;
2558 break;
2559 }
2560
2561 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2562 {
2563 loop_use_stmt = use_stmt;
2564 nloop_uses++;
2565 }
2566 else
2567 n_out_of_loop_uses++;
2568
2569 /* There can be either a single use in the loop or two uses in
2570 phi nodes. */
2571 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2572 return false;
2573 }
2574
2575 if (found)
2576 break;
2577
2578 /* We reached a statement with no loop uses. */
2579 if (nloop_uses == 0)
2580 return false;
2581
2582 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2583 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2584 return false;
2585
2586 if (!is_gimple_assign (loop_use_stmt)
2587 || code != gimple_assign_rhs_code (loop_use_stmt)
2588 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2589 return false;
2590
2591 /* Insert USE_STMT into reduction chain. */
2592 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2593 if (current_stmt_info)
2594 {
2595 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt_info;
2596 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2597 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2598 }
2599 else
2600 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt_info;
2601
2602 lhs = gimple_assign_lhs (loop_use_stmt);
2603 current_stmt_info = use_stmt_info;
2604 size++;
2605 }
2606
2607 if (!found || loop_use_stmt != phi || size < 2)
2608 return false;
2609
2610 /* Swap the operands, if needed, to make the reduction operand be the second
2611 operand. */
2612 lhs = PHI_RESULT (phi);
2613 stmt_vec_info next_stmt_info = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2614 while (next_stmt_info)
2615 {
2616 gassign *next_stmt = as_a <gassign *> (next_stmt_info->stmt);
2617 if (gimple_assign_rhs2 (next_stmt) == lhs)
2618 {
2619 tree op = gimple_assign_rhs1 (next_stmt);
2620 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2621
2622 /* Check that the other def is either defined in the loop
2623 ("vect_internal_def"), or it's an induction (defined by a
2624 loop-header phi-node). */
2625 if (def_stmt_info
2626 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2627 && vect_valid_reduction_input_p (def_stmt_info))
2628 {
2629 lhs = gimple_assign_lhs (next_stmt);
2630 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2631 continue;
2632 }
2633
2634 return false;
2635 }
2636 else
2637 {
2638 tree op = gimple_assign_rhs2 (next_stmt);
2639 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2640
2641 /* Check that the other def is either defined in the loop
2642 ("vect_internal_def"), or it's an induction (defined by a
2643 loop-header phi-node). */
2644 if (def_stmt_info
2645 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2646 && vect_valid_reduction_input_p (def_stmt_info))
2647 {
2648 if (dump_enabled_p ())
2649 {
2650 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2651 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2652 }
2653
2654 swap_ssa_operands (next_stmt,
2655 gimple_assign_rhs1_ptr (next_stmt),
2656 gimple_assign_rhs2_ptr (next_stmt));
2657 update_stmt (next_stmt);
2658
2659 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2660 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2661 }
2662 else
2663 return false;
2664 }
2665
2666 lhs = gimple_assign_lhs (next_stmt);
2667 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2668 }
2669
2670 /* Save the chain for further analysis in SLP detection. */
2671 stmt_vec_info first_stmt_info
2672 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2673 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first_stmt_info);
2674 REDUC_GROUP_SIZE (first_stmt_info) = size;
2675
2676 return true;
2677 }
2678
2679 /* Return true if we need an in-order reduction for operation CODE
2680 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2681 overflow must wrap. */
2682
2683 static bool
2684 needs_fold_left_reduction_p (tree type, tree_code code,
2685 bool need_wrapping_integral_overflow)
2686 {
2687 /* CHECKME: check for !flag_finite_math_only too? */
2688 if (SCALAR_FLOAT_TYPE_P (type))
2689 switch (code)
2690 {
2691 case MIN_EXPR:
2692 case MAX_EXPR:
2693 return false;
2694
2695 default:
2696 return !flag_associative_math;
2697 }
2698
2699 if (INTEGRAL_TYPE_P (type))
2700 {
2701 if (!operation_no_trapping_overflow (type, code))
2702 return true;
2703 if (need_wrapping_integral_overflow
2704 && !TYPE_OVERFLOW_WRAPS (type)
2705 && operation_can_overflow (code))
2706 return true;
2707 return false;
2708 }
2709
2710 if (SAT_FIXED_POINT_TYPE_P (type))
2711 return true;
2712
2713 return false;
2714 }
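
A small example of why floating-point types fall into the !flag_associative_math case above: reassociating a float sum can change the rounded result, so without -fassociative-math the sum must be accumulated in source order, i.e. as a fold-left reduction. The constants below are chosen only to make the rounding difference visible:

#include <stdio.h>

int
main (void)
{
  float big = 1.0e8f, tiny = 1.0f;
  /* In-order accumulation absorbs the small addends into the large one.  */
  float in_order = ((big + tiny) + tiny) - big;   /* prints 0 */
  /* Reassociated accumulation keeps them.  */
  float reassoc = (tiny + tiny) + (big - big);    /* prints 2 */
  printf ("%g %g\n", in_order, reassoc);
  return 0;
}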
2715
2716 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2717 reduction operation CODE has a handled computation expression. */
2718
2719 bool
2720 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2721 tree loop_arg, enum tree_code code)
2722 {
2723 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2724 auto_bitmap visited;
2725 tree lookfor = PHI_RESULT (phi);
2726 ssa_op_iter curri;
2727 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2728 while (USE_FROM_PTR (curr) != loop_arg)
2729 curr = op_iter_next_use (&curri);
2730 curri.i = curri.numops;
2731 do
2732 {
2733 path.safe_push (std::make_pair (curri, curr));
2734 tree use = USE_FROM_PTR (curr);
2735 if (use == lookfor)
2736 break;
2737 gimple *def = SSA_NAME_DEF_STMT (use);
2738 if (gimple_nop_p (def)
2739 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2740 {
2741 pop:
2742 do
2743 {
2744 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2745 curri = x.first;
2746 curr = x.second;
2747 do
2748 curr = op_iter_next_use (&curri);
2749 /* Skip already visited or non-SSA operands (from iterating
2750 over PHI args). */
2751 while (curr != NULL_USE_OPERAND_P
2752 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2753 || ! bitmap_set_bit (visited,
2754 SSA_NAME_VERSION
2755 (USE_FROM_PTR (curr)))));
2756 }
2757 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2758 if (curr == NULL_USE_OPERAND_P)
2759 break;
2760 }
2761 else
2762 {
2763 if (gimple_code (def) == GIMPLE_PHI)
2764 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2765 else
2766 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2767 while (curr != NULL_USE_OPERAND_P
2768 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2769 || ! bitmap_set_bit (visited,
2770 SSA_NAME_VERSION
2771 (USE_FROM_PTR (curr)))))
2772 curr = op_iter_next_use (&curri);
2773 if (curr == NULL_USE_OPERAND_P)
2774 goto pop;
2775 }
2776 }
2777 while (1);
2778 if (dump_file && (dump_flags & TDF_DETAILS))
2779 {
2780 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2781 unsigned i;
2782 std::pair<ssa_op_iter, use_operand_p> *x;
2783 FOR_EACH_VEC_ELT (path, i, x)
2784 {
2785 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2786 dump_printf (MSG_NOTE, " ");
2787 }
2788 dump_printf (MSG_NOTE, "\n");
2789 }
2790
2791 /* Check whether the reduction path detected is valid. */
2792 bool fail = path.length () == 0;
2793 bool neg = false;
2794 for (unsigned i = 1; i < path.length (); ++i)
2795 {
2796 gimple *use_stmt = USE_STMT (path[i].second);
2797 tree op = USE_FROM_PTR (path[i].second);
2798 if (! has_single_use (op)
2799 || ! is_gimple_assign (use_stmt))
2800 {
2801 fail = true;
2802 break;
2803 }
2804 if (gimple_assign_rhs_code (use_stmt) != code)
2805 {
2806 if (code == PLUS_EXPR
2807 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2808 {
2809 /* Track whether we negate the reduction value each iteration. */
2810 if (gimple_assign_rhs2 (use_stmt) == op)
2811 neg = ! neg;
2812 }
2813 else
2814 {
2815 fail = true;
2816 break;
2817 }
2818 }
2819 }
2820 return ! fail && ! neg;
2821 }
2822
2823
2824 /* Function vect_is_simple_reduction
2825
2826 (1) Detect a cross-iteration def-use cycle that represents a simple
2827 reduction computation. We look for the following pattern:
2828
2829 loop_header:
2830 a1 = phi < a0, a2 >
2831 a3 = ...
2832 a2 = operation (a3, a1)
2833
2834 or
2835
2836 a3 = ...
2837 loop_header:
2838 a1 = phi < a0, a2 >
2839 a2 = operation (a3, a1)
2840
2841 such that:
2842 1. operation is commutative and associative and it is safe to
2843 change the order of the computation
2844 2. no uses for a2 in the loop (a2 is used out of the loop)
2845 3. no uses of a1 in the loop besides the reduction operation
2846 4. no uses of a1 outside the loop.
2847
2848 Conditions 1,4 are tested here.
2849 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2850
2851 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2852 nested cycles.
2853
2854 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2855 reductions:
2856
2857 a1 = phi < a0, a2 >
2858 inner loop (def of a3)
2859 a2 = phi < a3 >
2860
2861 (4) Detect condition expressions, i.e.:
2862 for (int i = 0; i < N; i++)
2863 if (a[i] < val)
2864 ret_val = a[i];
2865
2866 */
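
At the source level, cases (1) and (3) above typically come from loops like the following sketches (function names and bounds are illustrative):

int
simple_reduction (const int *a, int n)          /* case (1) */
{
  int s = 0;                    /* a1 = phi <a0, a2>       */
  for (int i = 0; i < n; i++)
    s = s + a[i];               /* a2 = operation (a3, a1) */
  return s;
}

int
double_reduction (int a[8][8])                  /* case (3) */
{
  int s = 0;                    /* outer-loop reduction phi */
  for (int i = 0; i < 8; i++)
    for (int j = 0; j < 8; j++) /* inner loop defines the accumulated value */
      s += a[i][j];
  return s;
}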
2867
2868 static stmt_vec_info
2869 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2870 bool *double_reduc,
2871 bool need_wrapping_integral_overflow,
2872 enum vect_reduction_type *v_reduc_type)
2873 {
2874 gphi *phi = as_a <gphi *> (phi_info->stmt);
2875 struct loop *loop = (gimple_bb (phi))->loop_father;
2876 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2877 gimple *phi_use_stmt = NULL;
2878 enum tree_code orig_code, code;
2879 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2880 tree type;
2881 int nloop_uses;
2882 tree name;
2883 imm_use_iterator imm_iter;
2884 use_operand_p use_p;
2885 bool phi_def;
2886
2887 *double_reduc = false;
2888 *v_reduc_type = TREE_CODE_REDUCTION;
2889
2890 tree phi_name = PHI_RESULT (phi);
2891 /* ??? If there are no uses of the PHI result the inner loop reduction
2892 won't be detected as possibly double-reduction by vectorizable_reduction
2893 because that tries to walk the PHI arg from the preheader edge which
2894 can be constant. See PR60382. */
2895 if (has_zero_uses (phi_name))
2896 return NULL;
2897 nloop_uses = 0;
2898 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2899 {
2900 gimple *use_stmt = USE_STMT (use_p);
2901 if (is_gimple_debug (use_stmt))
2902 continue;
2903
2904 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2905 {
2906 if (dump_enabled_p ())
2907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2908 "intermediate value used outside loop.\n");
2909
2910 return NULL;
2911 }
2912
2913 nloop_uses++;
2914 if (nloop_uses > 1)
2915 {
2916 if (dump_enabled_p ())
2917 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2918 "reduction value used in loop.\n");
2919 return NULL;
2920 }
2921
2922 phi_use_stmt = use_stmt;
2923 }
2924
2925 edge latch_e = loop_latch_edge (loop);
2926 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2927 if (TREE_CODE (loop_arg) != SSA_NAME)
2928 {
2929 if (dump_enabled_p ())
2930 {
2931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2932 "reduction: not ssa_name: ");
2933 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2934 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2935 }
2936 return NULL;
2937 }
2938
2939 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2940 if (!def_stmt_info)
2941 return NULL;
2942
2943 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2944 {
2945 name = gimple_assign_lhs (def_stmt);
2946 phi_def = false;
2947 }
2948 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2949 {
2950 name = PHI_RESULT (def_stmt);
2951 phi_def = true;
2952 }
2953 else
2954 {
2955 if (dump_enabled_p ())
2956 {
2957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2958 "reduction: unhandled reduction operation: ");
2959 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2960 def_stmt_info->stmt, 0);
2961 }
2962 return NULL;
2963 }
2964
2965 nloop_uses = 0;
2966 auto_vec<gphi *, 3> lcphis;
2967 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2968 {
2969 gimple *use_stmt = USE_STMT (use_p);
2970 if (is_gimple_debug (use_stmt))
2971 continue;
2972 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2973 nloop_uses++;
2974 else
2975 /* We can have more than one loop-closed PHI. */
2976 lcphis.safe_push (as_a <gphi *> (use_stmt));
2977 if (nloop_uses > 1)
2978 {
2979 if (dump_enabled_p ())
2980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2981 "reduction used in loop.\n");
2982 return NULL;
2983 }
2984 }
2985
2986 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2987 defined in the inner loop. */
2988 if (phi_def)
2989 {
2990 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2991 op1 = PHI_ARG_DEF (def_stmt, 0);
2992
2993 if (gimple_phi_num_args (def_stmt) != 1
2994 || TREE_CODE (op1) != SSA_NAME)
2995 {
2996 if (dump_enabled_p ())
2997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2998 "unsupported phi node definition.\n");
2999
3000 return NULL;
3001 }
3002
3003 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3004 if (gimple_bb (def1)
3005 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3006 && loop->inner
3007 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3008 && is_gimple_assign (def1)
3009 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3010 {
3011 if (dump_enabled_p ())
3012 report_vect_op (MSG_NOTE, def_stmt,
3013 "detected double reduction: ");
3014
3015 *double_reduc = true;
3016 return def_stmt_info;
3017 }
3018
3019 return NULL;
3020 }
3021
3022 /* If we are vectorizing an inner reduction, we execute it in the
3023 original order only when we are not dealing with a double
3024 reduction. */
3025 bool check_reduction = true;
3026 if (flow_loop_nested_p (vect_loop, loop))
3027 {
3028 gphi *lcphi;
3029 unsigned i;
3030 check_reduction = false;
3031 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3032 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3033 {
3034 gimple *use_stmt = USE_STMT (use_p);
3035 if (is_gimple_debug (use_stmt))
3036 continue;
3037 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3038 check_reduction = true;
3039 }
3040 }
3041
3042 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3043 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3044 code = orig_code = gimple_assign_rhs_code (def_stmt);
3045
3046 /* We can handle "res -= x[i]", which is non-associative, by
3047 simply rewriting it into "res += -x[i]". Avoid changing the
3048 gimple instruction for the first simple tests and only do this
3049 if we're allowed to change code at all. */
3050 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3051 code = PLUS_EXPR;
3052
3053 if (code == COND_EXPR)
3054 {
3055 if (! nested_in_vect_loop)
3056 *v_reduc_type = COND_REDUCTION;
3057
3058 op3 = gimple_assign_rhs1 (def_stmt);
3059 if (COMPARISON_CLASS_P (op3))
3060 {
3061 op4 = TREE_OPERAND (op3, 1);
3062 op3 = TREE_OPERAND (op3, 0);
3063 }
3064 if (op3 == phi_name || op4 == phi_name)
3065 {
3066 if (dump_enabled_p ())
3067 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3068 "reduction: condition depends on previous"
3069 " iteration: ");
3070 return NULL;
3071 }
3072
3073 op1 = gimple_assign_rhs2 (def_stmt);
3074 op2 = gimple_assign_rhs3 (def_stmt);
3075 }
3076 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3077 {
3078 if (dump_enabled_p ())
3079 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3080 "reduction: not commutative/associative: ");
3081 return NULL;
3082 }
3083 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3084 {
3085 op1 = gimple_assign_rhs1 (def_stmt);
3086 op2 = gimple_assign_rhs2 (def_stmt);
3087 }
3088 else
3089 {
3090 if (dump_enabled_p ())
3091 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3092 "reduction: not handled operation: ");
3093 return NULL;
3094 }
3095
3096 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3097 {
3098 if (dump_enabled_p ())
3099 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3100 "reduction: both uses not ssa_names: ");
3101
3102 return NULL;
3103 }
3104
3105 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3106 if ((TREE_CODE (op1) == SSA_NAME
3107 && !types_compatible_p (type, TREE_TYPE (op1)))
3108 || (TREE_CODE (op2) == SSA_NAME
3109 && !types_compatible_p (type, TREE_TYPE (op2)))
3110 || (op3 && TREE_CODE (op3) == SSA_NAME
3111 && !types_compatible_p (type, TREE_TYPE (op3)))
3112 || (op4 && TREE_CODE (op4) == SSA_NAME
3113 && !types_compatible_p (type, TREE_TYPE (op4))))
3114 {
3115 if (dump_enabled_p ())
3116 {
3117 dump_printf_loc (MSG_NOTE, vect_location,
3118 "reduction: multiple types: operation type: ");
3119 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3120 dump_printf (MSG_NOTE, ", operands types: ");
3121 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3122 TREE_TYPE (op1));
3123 dump_printf (MSG_NOTE, ",");
3124 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3125 TREE_TYPE (op2));
3126 if (op3)
3127 {
3128 dump_printf (MSG_NOTE, ",");
3129 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3130 TREE_TYPE (op3));
3131 }
3132
3133 if (op4)
3134 {
3135 dump_printf (MSG_NOTE, ",");
3136 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3137 TREE_TYPE (op4));
3138 }
3139 dump_printf (MSG_NOTE, "\n");
3140 }
3141
3142 return NULL;
3143 }
3144
3145 /* Check whether it's ok to change the order of the computation.
3146 Generally, when vectorizing a reduction we change the order of the
3147 computation. This may change the behavior of the program in some
3148 cases, so we need to check that this is ok. One exception is when
3149 vectorizing an outer-loop: the inner-loop is executed sequentially,
3150 and therefore vectorizing reductions in the inner-loop during
3151 outer-loop vectorization is safe. */
3152 if (check_reduction
3153 && *v_reduc_type == TREE_CODE_REDUCTION
3154 && needs_fold_left_reduction_p (type, code,
3155 need_wrapping_integral_overflow))
3156 *v_reduc_type = FOLD_LEFT_REDUCTION;
3157
3158 /* Reduction is safe. We're dealing with one of the following:
3159 1) integer arithmetic and no trapv
3160 2) floating point arithmetic, and special flags permit this optimization
3161 3) nested cycle (i.e., outer loop vectorization). */
3162 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3163 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3164 if (code != COND_EXPR && !def1_info && !def2_info)
3165 {
3166 if (dump_enabled_p ())
3167 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3168 return NULL;
3169 }
3170
3171 /* Check that one def is the reduction def, defined by PHI,
3172 the other def is either defined in the loop ("vect_internal_def"),
3173 or it's an induction (defined by a loop-header phi-node). */
3174
3175 if (def2_info
3176 && def2_info->stmt == phi
3177 && (code == COND_EXPR
3178 || !def1_info
3179 || vect_valid_reduction_input_p (def1_info)))
3180 {
3181 if (dump_enabled_p ())
3182 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3183 return def_stmt_info;
3184 }
3185
3186 if (def1_info
3187 && def1_info->stmt == phi
3188 && (code == COND_EXPR
3189 || !def2_info
3190 || vect_valid_reduction_input_p (def2_info)))
3191 {
3192 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3193 {
3194 /* Check if we can swap operands (just for simplicity - so that
3195 the rest of the code can assume that the reduction variable
3196 is always the last (second) argument). */
3197 if (code == COND_EXPR)
3198 {
3199 /* Swap cond_expr by inverting the condition. */
3200 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3201 enum tree_code invert_code = ERROR_MARK;
3202 enum tree_code cond_code = TREE_CODE (cond_expr);
3203
3204 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3205 {
3206 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3207 invert_code = invert_tree_comparison (cond_code, honor_nans);
3208 }
3209 if (invert_code != ERROR_MARK)
3210 {
3211 TREE_SET_CODE (cond_expr, invert_code);
3212 swap_ssa_operands (def_stmt,
3213 gimple_assign_rhs2_ptr (def_stmt),
3214 gimple_assign_rhs3_ptr (def_stmt));
3215 }
3216 else
3217 {
3218 if (dump_enabled_p ())
3219 report_vect_op (MSG_NOTE, def_stmt,
3220 "detected reduction: cannot swap operands "
3221 "for cond_expr");
3222 return NULL;
3223 }
3224 }
3225 else
3226 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3227 gimple_assign_rhs2_ptr (def_stmt));
3228
3229 if (dump_enabled_p ())
3230 report_vect_op (MSG_NOTE, def_stmt,
3231 "detected reduction: need to swap operands: ");
3232
3233 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3234 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3235 }
3236 else
3237 {
3238 if (dump_enabled_p ())
3239 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3240 }
3241
3242 return def_stmt_info;
3243 }
3244
3245 /* Try to find SLP reduction chain. */
3246 if (! nested_in_vect_loop
3247 && code != COND_EXPR
3248 && orig_code != MINUS_EXPR
3249 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3250 {
3251 if (dump_enabled_p ())
3252 report_vect_op (MSG_NOTE, def_stmt,
3253 "reduction: detected reduction chain: ");
3254
3255 return def_stmt_info;
3256 }
3257
3258 /* Dissolve any group that vect_is_slp_reduction may have half-built. */
3259 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3260 while (first)
3261 {
3262 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
3263 REDUC_GROUP_FIRST_ELEMENT (first) = NULL;
3264 REDUC_GROUP_NEXT_ELEMENT (first) = NULL;
3265 first = next;
3266 }
3267
3268 /* Look for the expression computing loop_arg from loop PHI result. */
3269 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3270 return def_stmt_info;
3271
3272 if (dump_enabled_p ())
3273 {
3274 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3275 "reduction: unknown pattern: ");
3276 }
3277
3278 return NULL;
3279 }
3280
3281 /* Wrapper around vect_is_simple_reduction, which will modify code
3282 in-place if it enables detection of more reductions. The arguments
3283 are the same as for vect_is_simple_reduction. */
3284
3285 stmt_vec_info
3286 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3287 bool *double_reduc,
3288 bool need_wrapping_integral_overflow)
3289 {
3290 enum vect_reduction_type v_reduc_type;
3291 stmt_vec_info def_info
3292 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3293 need_wrapping_integral_overflow,
3294 &v_reduc_type);
3295 if (def_info)
3296 {
3297 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3298 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3299 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3300 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3301 }
3302 return def_info;
3303 }
3304
3305 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3306 int
3307 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3308 int *peel_iters_epilogue,
3309 stmt_vector_for_cost *scalar_cost_vec,
3310 stmt_vector_for_cost *prologue_cost_vec,
3311 stmt_vector_for_cost *epilogue_cost_vec)
3312 {
3313 int retval = 0;
3314 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3315
3316 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3317 {
3318 *peel_iters_epilogue = assumed_vf / 2;
3319 if (dump_enabled_p ())
3320 dump_printf_loc (MSG_NOTE, vect_location,
3321 "cost model: epilogue peel iters set to vf/2 "
3322 "because loop iterations are unknown .\n");
3323
3324 /* If peeled iterations are known but the number of scalar loop
3325 iterations is unknown, count a taken branch per peeled loop. */
3326 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3327 NULL, 0, vect_prologue);
3328 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3329 NULL, 0, vect_epilogue);
3330 }
3331 else
3332 {
3333 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3334 peel_iters_prologue = niters < peel_iters_prologue ?
3335 niters : peel_iters_prologue;
3336 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3337 /* If we need to peel for gaps but the epilogue would otherwise need no
3338 iterations, we have to peel VF iterations. */
3339 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3340 *peel_iters_epilogue = assumed_vf;
3341 }
3342
3343 stmt_info_for_cost *si;
3344 int j;
3345 if (peel_iters_prologue)
3346 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3347 {
3348 stmt_vec_info stmt_info
3349 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3350 retval += record_stmt_cost (prologue_cost_vec,
3351 si->count * peel_iters_prologue,
3352 si->kind, stmt_info, si->misalign,
3353 vect_prologue);
3354 }
3355 if (*peel_iters_epilogue)
3356 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3357 {
3358 stmt_vec_info stmt_info
3359 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3360 retval += record_stmt_cost (epilogue_cost_vec,
3361 si->count * *peel_iters_epilogue,
3362 si->kind, stmt_info, si->misalign,
3363 vect_epilogue);
3364 }
3365
3366 return retval;
3367 }
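
A worked example of the peel-count arithmetic above, using illustrative numbers: with niters = 100, an assumed vf of 8 and 3 prologue iterations, the epilogue gets (100 - 3) % 8 = 1 iteration and (100 - 3 - 1) / 8 = 12 full vector iterations remain; only when peeling for gaps is needed and the remainder is zero does the epilogue get a full vf iterations instead. A standalone sketch of just that computation:

#include <stdio.h>

int
main (void)
{
  int niters = 100, assumed_vf = 8, peel_iters_prologue = 3;
  int peeling_for_gaps = 0;     /* illustrative values throughout */

  if (niters < peel_iters_prologue)
    peel_iters_prologue = niters;
  int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
  if (peeling_for_gaps && peel_iters_epilogue == 0)
    peel_iters_epilogue = assumed_vf;

  /* 3 prologue iterations, 97 % 8 = 1 epilogue iteration, and 12 full
     vector iterations in between.  */
  printf ("prologue %d, epilogue %d\n", peel_iters_prologue,
          peel_iters_epilogue);
  return 0;
}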
3368
3369 /* Function vect_estimate_min_profitable_iters
3370
3371 Return the number of iterations required for the vector version of the
3372 loop to be profitable relative to the cost of the scalar version of the
3373 loop.
3374
3375 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3376 of iterations for vectorization. -1 value means loop vectorization
3377 is not profitable. This returned value may be used for dynamic
3378 profitability check.
3379
3380 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3381 for static check against estimated number of iterations. */
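
As a rough picture of what these two thresholds mean, the sketch below finds the break-even point of a deliberately simplified linear cost model (all costs are made-up numbers); the actual computation in this function additionally accounts for peeling, versioning checks and the scalar outside cost:

#include <stdio.h>

int
main (void)
{
  int scalar_iter_cost = 4;     /* cost of one scalar iteration (made up) */
  int vec_inside_cost = 6;      /* cost of one vector iteration (made up) */
  int vec_outside_cost = 20;    /* prologue/epilogue/versioning (made up) */
  int vf = 4;

  /* Break-even: the smallest n for which
       scalar_iter_cost * n > vec_outside_cost
                              + vec_inside_cost * ceil (n / vf).  */
  int n = 1;
  while (scalar_iter_cost * n
         <= vec_outside_cost + vec_inside_cost * ((n + vf - 1) / vf))
    n++;
  printf ("min profitable iterations ~ %d\n", n);
  return 0;
}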
3382
3383 static void
3384 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3385 int *ret_min_profitable_niters,
3386 int *ret_min_profitable_estimate)
3387 {
3388 int min_profitable_iters;
3389 int min_profitable_estimate;
3390 int peel_iters_prologue;
3391 int peel_iters_epilogue;
3392 unsigned vec_inside_cost = 0;
3393 int vec_outside_cost = 0;
3394 unsigned vec_prologue_cost = 0;
3395 unsigned vec_epilogue_cost = 0;
3396 int scalar_single_iter_cost = 0;
3397 int scalar_outside_cost = 0;
3398 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3399 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3400 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3401
3402 /* Cost model disabled. */
3403 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3404 {
3405 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3406 *ret_min_profitable_niters = 0;
3407 *ret_min_profitable_estimate = 0;
3408 return;
3409 }
3410
3411 /* Requires loop versioning tests to handle misalignment. */
3412 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3413 {
3414 /* FIXME: Make cost depend on complexity of individual check. */
3415 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3416 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3417 vect_prologue);
3418 dump_printf (MSG_NOTE,
3419 "cost model: Adding cost of checks for loop "
3420 "versioning to treat misalignment.\n");
3421 }
3422
3423 /* Requires loop versioning with alias checks. */
3424 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3425 {
3426 /* FIXME: Make cost depend on complexity of individual check. */
3427 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3428 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3429 vect_prologue);
3430 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3431 if (len)
3432 /* Count LEN - 1 ANDs and LEN comparisons. */
3433 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3434 NULL, 0, vect_prologue);
3435 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3436 if (len)
3437 {
3438 /* Count LEN - 1 ANDs and LEN comparisons. */
3439 unsigned int nstmts = len * 2 - 1;
3440 /* +1 for each bias that needs adding. */
3441 for (unsigned int i = 0; i < len; ++i)
3442 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3443 nstmts += 1;
3444 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3445 NULL, 0, vect_prologue);
3446 }
3447 dump_printf (MSG_NOTE,
3448 "cost model: Adding cost of checks for loop "
3449 "versioning aliasing.\n");
3450 }
3451
3452 /* Requires loop versioning with niter checks. */
3453 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3454 {
3455 /* FIXME: Make cost depend on complexity of individual check. */
3456 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3457 vect_prologue);
3458 dump_printf (MSG_NOTE,
3459 "cost model: Adding cost of checks for loop "
3460 "versioning niters.\n");
3461 }
3462
3463 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3464 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3465 vect_prologue);
3466
3467 /* Count statements in scalar loop. Using this as scalar cost for a single
3468 iteration for now.
3469
3470 TODO: Add outer loop support.
3471
3472 TODO: Consider assigning different costs to different scalar
3473 statements. */
3474
3475 scalar_single_iter_cost
3476 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3477
3478 /* Add additional cost for the peeled instructions in prologue and epilogue
3479 loop. (For fully-masked loops there will be no peeling.)
3480
3481 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3482 at compile time, we assume it's vf/2 (the worst case would be vf-1).
3483
3484 TODO: Build an expression that represents peel_iters for prologue and
3485 epilogue to be used in a run-time test. */
3486
3487 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3488 {
3489 peel_iters_prologue = 0;
3490 peel_iters_epilogue = 0;
3491
3492 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3493 {
3494 /* We need to peel exactly one iteration. */
3495 peel_iters_epilogue += 1;
3496 stmt_info_for_cost *si;
3497 int j;
3498 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3499 j, si)
3500 {
3501 struct _stmt_vec_info *stmt_info
3502 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3503 (void) add_stmt_cost (target_cost_data, si->count,
3504 si->kind, stmt_info, si->misalign,
3505 vect_epilogue);
3506 }
3507 }
3508 }
3509 else if (npeel < 0)
3510 {
3511 peel_iters_prologue = assumed_vf / 2;
3512 dump_printf (MSG_NOTE, "cost model: "
3513 "prologue peel iters set to vf/2.\n");
3514
3515 /* If peeling for alignment is unknown, the loop bound of the main loop
3516 becomes unknown. */
3517 peel_iters_epilogue = assumed_vf / 2;
3518 dump_printf (MSG_NOTE, "cost model: "
3519 "epilogue peel iters set to vf/2 because "
3520 "peeling for alignment is unknown.\n");
3521
3522 /* If peeled iterations are unknown, count a taken branch and a not taken
3523 branch per peeled loop. Even if scalar loop iterations are known,
3524 vector iterations are not known since peeled prologue iterations are
3525 not known. Hence guards remain the same. */
3526 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3527 NULL, 0, vect_prologue);
3528 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3529 NULL, 0, vect_prologue);
3530 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3531 NULL, 0, vect_epilogue);
3532 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3533 NULL, 0, vect_epilogue);
3534 stmt_info_for_cost *si;
3535 int j;
3536 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3537 {
3538 struct _stmt_vec_info *stmt_info
3539 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3540 (void) add_stmt_cost (target_cost_data,
3541 si->count * peel_iters_prologue,
3542 si->kind, stmt_info, si->misalign,
3543 vect_prologue);
3544 (void) add_stmt_cost (target_cost_data,
3545 si->count * peel_iters_epilogue,
3546 si->kind, stmt_info, si->misalign,
3547 vect_epilogue);
3548 }
3549 }
3550 else
3551 {
3552 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3553 stmt_info_for_cost *si;
3554 int j;
3555 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3556
3557 prologue_cost_vec.create (2);
3558 epilogue_cost_vec.create (2);
3559 peel_iters_prologue = npeel;
3560
3561 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3562 &peel_iters_epilogue,
3563 &LOOP_VINFO_SCALAR_ITERATION_COST
3564 (loop_vinfo),
3565 &prologue_cost_vec,
3566 &epilogue_cost_vec);
3567
3568 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3569 {
3570 struct _stmt_vec_info *stmt_info
3571 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3572 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3573 si->misalign, vect_prologue);
3574 }
3575
3576 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3577 {
3578 struct _stmt_vec_info *stmt_info
3579 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3580 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3581 si->misalign, vect_epilogue);
3582 }
3583
3584 prologue_cost_vec.release ();
3585 epilogue_cost_vec.release ();
3586 }
3587
3588 /* FORNOW: The scalar outside cost is incremented in one of the
3589 following ways:
3590
3591 1. The vectorizer checks for alignment and aliasing and generates
3592 a condition that allows dynamic vectorization. A cost model
3593 check is ANDed with the versioning condition. Hence the scalar code
3594 path now carries the added cost of the versioning check.
3595
3596 if (cost > th & versioning_check)
3597 jmp to vector code
3598
3599 Hence the run-time scalar cost is incremented by the not-taken branch cost.
3600
3601 2. The vectorizer then checks if a prologue is required. If the
3602 cost model check was not done before during versioning, it has to
3603 be done before the prologue check.
3604
3605 if (cost <= th)
3606 prologue = scalar_iters
3607 if (prologue == 0)
3608 jmp to vector code
3609 else
3610 execute prologue
3611 if (prologue == num_iters)
3612 go to exit
3613
3614 Hence the run-time scalar cost is incremented by a taken branch,
3615 plus a not-taken branch, plus a taken branch cost.
3616
3617 3. The vectorizer then checks if an epilogue is required. If the
3618 cost model check was not done before during prologue check, it
3619 has to be done with the epilogue check.
3620
3621 if (prologue == 0)
3622 jmp to vector code
3623 else
3624 execute prologue
3625 if (prologue == num_iters)
3626 go to exit
3627 vector code:
3628 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3629 jmp to epilogue
3630
3631 Hence the run-time scalar cost should be incremented by 2 taken
3632 branches.
3633
3634 TODO: The back end may reorder the BBs differently and reverse
3635 conditions/branch directions. Change the estimates below to
3636 something more reasonable. */
3637
3638 /* If the number of iterations is known and we do not do versioning, we can
3639 decide whether to vectorize at compile time. Hence the scalar version
3640 does not carry cost model guard costs.
3641 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3642 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3643 {
3644 /* Cost model check occurs at versioning. */
3645 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3646 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3647 else
3648 {
3649 /* Cost model check occurs at prologue generation. */
3650 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3651 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3652 + vect_get_stmt_cost (cond_branch_not_taken);
3653 /* Cost model check occurs at epilogue generation. */
3654 else
3655 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3656 }
3657 }
3658
3659 /* Complete the target-specific cost calculations. */
3660 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3661 &vec_inside_cost, &vec_epilogue_cost);
3662
3663 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3664
3665 if (dump_enabled_p ())
3666 {
3667 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3668 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3669 vec_inside_cost);
3670 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3671 vec_prologue_cost);
3672 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3673 vec_epilogue_cost);
3674 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3675 scalar_single_iter_cost);
3676 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3677 scalar_outside_cost);
3678 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3679 vec_outside_cost);
3680 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3681 peel_iters_prologue);
3682 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3683 peel_iters_epilogue);
3684 }
3685
3686 /* Calculate number of iterations required to make the vector version
3687 profitable, relative to the loop bodies only. The following condition
3688 must hold true:
3689 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3690 where
3691 SIC = scalar iteration cost, VIC = vector iteration cost,
3692 VOC = vector outside cost, VF = vectorization factor,
3693 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3694 SOC = scalar outside cost for run time cost model check. */
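   /* Solving the condition above for NITERS gives the closed form used
      below:

        niters > ((VOC - SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
                 / (SIC * VF - VIC)

      As an illustration with made-up numbers: SIC = 4, VIC = 6, VF = 4,
      VOC = 20, SOC = 2 and PL_ITERS = EP_ITERS = 2 give a numerator of
      (20 - 2) * 4 - 6 * 4 = 48 and a denominator of 4 * 4 - 6 = 10, so
      integer division yields 4, which the rounding adjustment below
      bumps to 5.  */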
3695
3696 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3697 {
3698 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3699 * assumed_vf
3700 - vec_inside_cost * peel_iters_prologue
3701 - vec_inside_cost * peel_iters_epilogue);
3702 if (min_profitable_iters <= 0)
3703 min_profitable_iters = 0;
3704 else
3705 {
3706 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3707 - vec_inside_cost);
3708
3709 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3710 <= (((int) vec_inside_cost * min_profitable_iters)
3711 + (((int) vec_outside_cost - scalar_outside_cost)
3712 * assumed_vf)))
3713 min_profitable_iters++;
3714 }
3715 }
3716 /* Vector version will never be profitable. */
3717 else
3718 {
3719 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3720 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3721 "vectorization did not happen for a simd loop");
3722
3723 if (dump_enabled_p ())
3724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3725 "cost model: the vector iteration cost = %d "
3726 "divided by the scalar iteration cost = %d "
3727 "is greater or equal to the vectorization factor = %d"
3728 ".\n",
3729 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3730 *ret_min_profitable_niters = -1;
3731 *ret_min_profitable_estimate = -1;
3732 return;
3733 }
3734
3735 dump_printf (MSG_NOTE,
3736 " Calculated minimum iters for profitability: %d\n",
3737 min_profitable_iters);
3738
3739 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3740 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3741 /* We want the vectorized loop to execute at least once. */
3742 min_profitable_iters = assumed_vf + peel_iters_prologue;
3743
3744 if (dump_enabled_p ())
3745 dump_printf_loc (MSG_NOTE, vect_location,
3746 " Runtime profitability threshold = %d\n",
3747 min_profitable_iters);
3748
3749 *ret_min_profitable_niters = min_profitable_iters;
3750
3751 /* Calculate number of iterations required to make the vector version
3752 profitable, relative to the loop bodies only.
3753
3754 The non-vectorized variant costs SIC * niters and it must win over the
3755 vector variant for the expected loop trip count. The following condition must hold true:
3756 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
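   /* This is the same rearrangement as for the runtime threshold above,
      except that SOC is now charged against the vector version rather
      than the scalar one:

        niters > ((VOC + SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
                 / (SIC * VF - VIC)  */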
3757
3758 if (vec_outside_cost <= 0)
3759 min_profitable_estimate = 0;
3760 else
3761 {
3762 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3763 * assumed_vf
3764 - vec_inside_cost * peel_iters_prologue
3765 - vec_inside_cost * peel_iters_epilogue)
3766 / ((scalar_single_iter_cost * assumed_vf)
3767 - vec_inside_cost);
3768 }
3769 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3770 if (dump_enabled_p ())
3771 dump_printf_loc (MSG_NOTE, vect_location,
3772 " Static estimate profitability threshold = %d\n",
3773 min_profitable_estimate);
3774
3775 *ret_min_profitable_estimate = min_profitable_estimate;
3776 }
3777
3778 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3779 vector elements (not bits) for a vector with NELT elements. */
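/* For example (purely illustrative): for OFFSET == 2 the three elements
   pushed below are {2, 3, 4}; vec_perm_indices extends this single
   stepped pattern to {2, 3, 4, 5, ...}, so output element I selects
   input element I + 2, with selectors beyond the first vector referring
   to the second operand of the permutation. */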
3780 static void
3781 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3782 vec_perm_builder *sel)
3783 {
3784 /* The encoding is a single stepped pattern. Any wrap-around is handled
3785 by vec_perm_indices. */
3786 sel->new_vector (nelt, 1, 3);
3787 for (unsigned int i = 0; i < 3; i++)
3788 sel->quick_push (i + offset);
3789 }
3790
3791 /* Checks whether the target supports whole-vector shifts for vectors of mode
3792 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3793 it supports vec_perm_const with masks for all necessary shift amounts. */
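/* For example, for an 8-element mode the loop below checks shifts by
   4, 2 and 1 elements - the halving steps that the shift-based
   reduction epilogue in this file relies on. */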
3794 static bool
3795 have_whole_vector_shift (machine_mode mode)
3796 {
3797 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3798 return true;
3799
3800 /* Variable-length vectors should be handled via the optab. */
3801 unsigned int nelt;
3802 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3803 return false;
3804
3805 vec_perm_builder sel;
3806 vec_perm_indices indices;
3807 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3808 {
3809 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3810 indices.new_vector (sel, 2, nelt);
3811 if (!can_vec_perm_const_p (mode, indices, false))
3812 return false;
3813 }
3814 return true;
3815 }
3816
3817 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3818 functions. Design better to avoid maintenance issues. */
3819
3820 /* Function vect_model_reduction_cost.
3821
3822 Models cost for a reduction operation, including the vector ops
3823 generated within the strip-mine loop, the initial definition before
3824 the loop, and the epilogue code that must be generated. */
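/* For a plain sum reduction handled by a direct REDUC_FN this amounts,
   for example, to one scalar_to_vec in the prologue, NCOPIES
   vector_stmts in the body and one vector_stmt plus one vec_to_scalar
   in the epilogue (see the corresponding branches below). */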
3825
3826 static void
3827 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3828 int ncopies, stmt_vector_for_cost *cost_vec)
3829 {
3830 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3831 enum tree_code code;
3832 optab optab;
3833 tree vectype;
3834 machine_mode mode;
3835 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3836 struct loop *loop = NULL;
3837
3838 if (loop_vinfo)
3839 loop = LOOP_VINFO_LOOP (loop_vinfo);
3840
3841 /* Condition reductions generate two reductions in the loop. */
3842 vect_reduction_type reduction_type
3843 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3844 if (reduction_type == COND_REDUCTION)
3845 ncopies *= 2;
3846
3847 vectype = STMT_VINFO_VECTYPE (stmt_info);
3848 mode = TYPE_MODE (vectype);
3849 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3850
3851 if (!orig_stmt_info)
3852 orig_stmt_info = stmt_info;
3853
3854 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3855
3856 if (reduction_type == EXTRACT_LAST_REDUCTION
3857 || reduction_type == FOLD_LEFT_REDUCTION)
3858 {
3859 /* No extra instructions needed in the prologue. */
3860 prologue_cost = 0;
3861
3862 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3863 /* Count one reduction-like operation per vector. */
3864 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3865 stmt_info, 0, vect_body);
3866 else
3867 {
3868 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3869 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3870 inside_cost = record_stmt_cost (cost_vec, nelements,
3871 vec_to_scalar, stmt_info, 0,
3872 vect_body);
3873 inside_cost += record_stmt_cost (cost_vec, nelements,
3874 scalar_stmt, stmt_info, 0,
3875 vect_body);
3876 }
3877 }
3878 else
3879 {
3880 /* Add in cost for initial definition.
3881 For cond reduction we have four vectors: initial index, step,
3882 initial result of the data reduction, initial value of the index
3883 reduction. */
3884 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3885 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3886 scalar_to_vec, stmt_info, 0,
3887 vect_prologue);
3888
3889 /* Cost of reduction op inside loop. */
3890 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3891 stmt_info, 0, vect_body);
3892 }
3893
3894 /* Determine cost of epilogue code.
3895
3896 We have a reduction operator that will reduce the vector in one statement.
3897 Also requires scalar extract. */
3898
3899 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3900 {
3901 if (reduc_fn != IFN_LAST)
3902 {
3903 if (reduction_type == COND_REDUCTION)
3904 {
3905 /* An EQ stmt and a COND_EXPR stmt. */
3906 epilogue_cost += record_stmt_cost (cost_vec, 2,
3907 vector_stmt, stmt_info, 0,
3908 vect_epilogue);
3909 /* Reduction of the max index and a reduction of the found
3910 values. */
3911 epilogue_cost += record_stmt_cost (cost_vec, 2,
3912 vec_to_scalar, stmt_info, 0,
3913 vect_epilogue);
3914 /* A broadcast of the max value. */
3915 epilogue_cost += record_stmt_cost (cost_vec, 1,
3916 scalar_to_vec, stmt_info, 0,
3917 vect_epilogue);
3918 }
3919 else
3920 {
3921 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3922 stmt_info, 0, vect_epilogue);
3923 epilogue_cost += record_stmt_cost (cost_vec, 1,
3924 vec_to_scalar, stmt_info, 0,
3925 vect_epilogue);
3926 }
3927 }
3928 else if (reduction_type == COND_REDUCTION)
3929 {
3930 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3931 /* Extraction of scalar elements. */
3932 epilogue_cost += record_stmt_cost (cost_vec,
3933 2 * estimated_nunits,
3934 vec_to_scalar, stmt_info, 0,
3935 vect_epilogue);
3936 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3937 epilogue_cost += record_stmt_cost (cost_vec,
3938 2 * estimated_nunits - 3,
3939 scalar_stmt, stmt_info, 0,
3940 vect_epilogue);
3941 }
3942 else if (reduction_type == EXTRACT_LAST_REDUCTION
3943 || reduction_type == FOLD_LEFT_REDUCTION)
3944 /* No extra instructions needed in the epilogue. */
3945 ;
3946 else
3947 {
3948 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3949 tree bitsize =
3950 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3951 int element_bitsize = tree_to_uhwi (bitsize);
3952 int nelements = vec_size_in_bits / element_bitsize;
3953
3954 if (code == COND_EXPR)
3955 code = MAX_EXPR;
3956
3957 optab = optab_for_tree_code (code, vectype, optab_default);
3958
3959 /* We have a whole vector shift available. */
3960 if (optab != unknown_optab
3961 && VECTOR_MODE_P (mode)
3962 && optab_handler (optab, mode) != CODE_FOR_nothing
3963 && have_whole_vector_shift (mode))
3964 {
3965 /* Final reduction via vector shifts and the reduction operator.
3966 Also requires scalar extract. */
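              /* Illustration: for 8 elements this charges
                 exact_log2 (8) * 2 == 6 vector statements (one shift and
                 one reduction op per halving step) plus one vec_to_scalar
                 extract below.  */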
3967 epilogue_cost += record_stmt_cost (cost_vec,
3968 exact_log2 (nelements) * 2,
3969 vector_stmt, stmt_info, 0,
3970 vect_epilogue);
3971 epilogue_cost += record_stmt_cost (cost_vec, 1,
3972 vec_to_scalar, stmt_info, 0,
3973 vect_epilogue);
3974 }
3975 else
3976 /* Use extracts and reduction op for final reduction. For N
3977 elements, we have N extracts and N-1 reduction ops. */
3978 epilogue_cost += record_stmt_cost (cost_vec,
3979 nelements + nelements - 1,
3980 vector_stmt, stmt_info, 0,
3981 vect_epilogue);
3982 }
3983 }
3984
3985 if (dump_enabled_p ())
3986 dump_printf (MSG_NOTE,
3987 "vect_model_reduction_cost: inside_cost = %d, "
3988 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3989 prologue_cost, epilogue_cost);
3990 }
3991
3992
3993 /* Function vect_model_induction_cost.
3994
3995 Models cost for induction operations. */
3996
3997 static void
3998 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3999 stmt_vector_for_cost *cost_vec)
4000 {
4001 unsigned inside_cost, prologue_cost;
4002
4003 if (PURE_SLP_STMT (stmt_info))
4004 return;
4005
4006 /* loop cost for vec_loop. */
4007 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4008 stmt_info, 0, vect_body);
4009
4010 /* prologue cost for vec_init and vec_step. */
4011 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4012 stmt_info, 0, vect_prologue);
4013
4014 if (dump_enabled_p ())
4015 dump_printf_loc (MSG_NOTE, vect_location,
4016 "vect_model_induction_cost: inside_cost = %d, "
4017 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4018 }
4019
4020
4021
4022 /* Function get_initial_def_for_reduction
4023
4024 Input:
4025 STMT - a stmt that performs a reduction operation in the loop.
4026 INIT_VAL - the initial value of the reduction variable
4027
4028 Output:
4029 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4030 of the reduction (used for adjusting the epilog - see below).
4031 Return a vector variable, initialized according to the operation that STMT
4032 performs. This vector will be used as the initial value of the
4033 vector of partial results.
4034
4035 Option1 (adjust in epilog): Initialize the vector as follows:
4036 add/bit or/xor: [0,0,...,0,0]
4037 mult/bit and: [1,1,...,1,1]
4038 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4039 and when necessary (e.g. add/mult case) let the caller know
4040 that it needs to adjust the result by init_val.
4041
4042 Option2: Initialize the vector as follows:
4043 add/bit or/xor: [init_val,0,0,...,0]
4044 mult/bit and: [init_val,1,1,...,1]
4045 min/max/cond_expr: [init_val,init_val,...,init_val]
4046 and no adjustments are needed.
4047
4048 For example, for the following code:
4049
4050 s = init_val;
4051 for (i=0;i<n;i++)
4052 s = s + a[i];
4053
4054 STMT is 's = s + a[i]', and the reduction variable is 's'.
4055 For a vector of 4 units, we want to return either [0,0,0,init_val],
4056 or [0,0,0,0] and let the caller know that it needs to adjust
4057 the result at the end by 'init_val'.
4058
4059 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4060 is not NULL, because this way the initialization vector is simpler (same
4061 element in all entries), and Option2 otherwise.
4062
4063 A cost model should help decide between these two schemes. */
4064
4065 tree
4066 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4067 tree *adjustment_def)
4068 {
4069 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4070 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4071 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4072 tree scalar_type = TREE_TYPE (init_val);
4073 tree vectype = get_vectype_for_scalar_type (scalar_type);
4074 enum tree_code code = gimple_assign_rhs_code (stmt);
4075 tree def_for_init;
4076 tree init_def;
4077 REAL_VALUE_TYPE real_init_val = dconst0;
4078 int int_init_val = 0;
4079 gimple_seq stmts = NULL;
4080
4081 gcc_assert (vectype);
4082
4083 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4084 || SCALAR_FLOAT_TYPE_P (scalar_type));
4085
4086 gcc_assert (nested_in_vect_loop_p (loop, stmt)
4087 || loop == (gimple_bb (stmt))->loop_father);
4088
4089 vect_reduction_type reduction_type
4090 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4091
4092 switch (code)
4093 {
4094 case WIDEN_SUM_EXPR:
4095 case DOT_PROD_EXPR:
4096 case SAD_EXPR:
4097 case PLUS_EXPR:
4098 case MINUS_EXPR:
4099 case BIT_IOR_EXPR:
4100 case BIT_XOR_EXPR:
4101 case MULT_EXPR:
4102 case BIT_AND_EXPR:
4103 {
4104 /* ADJUSTMENT_DEF is NULL when called from
4105 vect_create_epilog_for_reduction to vectorize double reduction. */
4106 if (adjustment_def)
4107 *adjustment_def = init_val;
4108
4109 if (code == MULT_EXPR)
4110 {
4111 real_init_val = dconst1;
4112 int_init_val = 1;
4113 }
4114
4115 if (code == BIT_AND_EXPR)
4116 int_init_val = -1;
4117
4118 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4119 def_for_init = build_real (scalar_type, real_init_val);
4120 else
4121 def_for_init = build_int_cst (scalar_type, int_init_val);
4122
4123 if (adjustment_def)
4124 /* Option1: the first element is '0' or '1' as well. */
4125 init_def = gimple_build_vector_from_val (&stmts, vectype,
4126 def_for_init);
4127 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4128 {
4129 /* Option2 (variable length): the first element is INIT_VAL. */
4130 init_def = gimple_build_vector_from_val (&stmts, vectype,
4131 def_for_init);
4132 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4133 vectype, init_def, init_val);
4134 }
4135 else
4136 {
4137 /* Option2: the first element is INIT_VAL. */
4138 tree_vector_builder elts (vectype, 1, 2);
4139 elts.quick_push (init_val);
4140 elts.quick_push (def_for_init);
4141 init_def = gimple_build_vector (&stmts, &elts);
4142 }
4143 }
4144 break;
4145
4146 case MIN_EXPR:
4147 case MAX_EXPR:
4148 case COND_EXPR:
4149 {
4150 if (adjustment_def)
4151 {
4152 *adjustment_def = NULL_TREE;
4153 if (reduction_type != COND_REDUCTION
4154 && reduction_type != EXTRACT_LAST_REDUCTION)
4155 {
4156 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4157 break;
4158 }
4159 }
4160 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4161 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4162 }
4163 break;
4164
4165 default:
4166 gcc_unreachable ();
4167 }
4168
4169 if (stmts)
4170 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4171 return init_def;
4172 }
4173
4174 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4175 NUMBER_OF_VECTORS is the number of vector defs to create.
4176 If NEUTRAL_OP is nonnull, introducing extra elements of that
4177 value will not change the result. */
4178
4179 static void
4180 get_initial_defs_for_reduction (slp_tree slp_node,
4181 vec<tree> *vec_oprnds,
4182 unsigned int number_of_vectors,
4183 bool reduc_chain, tree neutral_op)
4184 {
4185 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4186 stmt_vec_info stmt_vinfo = stmts[0];
4187 unsigned HOST_WIDE_INT nunits;
4188 unsigned j, number_of_places_left_in_vector;
4189 tree vector_type;
4190 tree vop;
4191 int group_size = stmts.length ();
4192 unsigned int vec_num, i;
4193 unsigned number_of_copies = 1;
4194 vec<tree> voprnds;
4195 voprnds.create (number_of_vectors);
4196 struct loop *loop;
4197 auto_vec<tree, 16> permute_results;
4198
4199 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4200
4201 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4202
4203 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4204 gcc_assert (loop);
4205 edge pe = loop_preheader_edge (loop);
4206
4207 gcc_assert (!reduc_chain || neutral_op);
4208
4209 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4210 created vectors. It is greater than 1 if unrolling is performed.
4211
4212 For example, we have two scalar operands, s1 and s2 (e.g., group of
4213 strided accesses of size two), while NUNITS is four (i.e., four scalars
4214 of this type can be packed in a vector). The output vector will contain
4215 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4216 will be 2).
4217
4218 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4219 vectors containing the operands.
4220
4221 For example, NUNITS is four as before, and the group size is 8
4222 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4223 {s5, s6, s7, s8}. */
4224
4225 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4226 nunits = group_size;
4227
4228 number_of_copies = nunits * number_of_vectors / group_size;
4229
4230 number_of_places_left_in_vector = nunits;
4231 bool constant_p = true;
4232 tree_vector_builder elts (vector_type, nunits, 1);
4233 elts.quick_grow (nunits);
4234 for (j = 0; j < number_of_copies; j++)
4235 {
4236 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
4237 {
4238 tree op;
4239 /* Get the def before the loop. In reduction chain we have only
4240 one initial value. */
4241 if ((j != (number_of_copies - 1)
4242 || (reduc_chain && i != 0))
4243 && neutral_op)
4244 op = neutral_op;
4245 else
4246 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4247
4248 /* Create 'vect_ = {op0,op1,...,opn}'. */
4249 number_of_places_left_in_vector--;
4250 elts[number_of_places_left_in_vector] = op;
4251 if (!CONSTANT_CLASS_P (op))
4252 constant_p = false;
4253
4254 if (number_of_places_left_in_vector == 0)
4255 {
4256 gimple_seq ctor_seq = NULL;
4257 tree init;
4258 if (constant_p && !neutral_op
4259 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4260 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4261 /* Build the vector directly from ELTS. */
4262 init = gimple_build_vector (&ctor_seq, &elts);
4263 else if (neutral_op)
4264 {
4265 /* Build a vector of the neutral value and shift the
4266 other elements into place. */
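                  /* Illustration with invented values: for ELTS == {a, b, 0, 0}
                     and a neutral value of 0, this starts from {0, 0, 0, 0} and
                     inserts b and then a at element 0, ending with {a, b, 0, 0}.  */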
4267 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4268 neutral_op);
4269 int k = nunits;
4270 while (k > 0 && elts[k - 1] == neutral_op)
4271 k -= 1;
4272 while (k > 0)
4273 {
4274 k -= 1;
4275 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4276 vector_type, init, elts[k]);
4277 }
4278 }
4279 else
4280 {
4281 /* First time round, duplicate ELTS to fill the
4282 required number of vectors, then cherry pick the
4283 appropriate result for each iteration. */
4284 if (vec_oprnds->is_empty ())
4285 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4286 number_of_vectors,
4287 permute_results);
4288 init = permute_results[number_of_vectors - j - 1];
4289 }
4290 if (ctor_seq != NULL)
4291 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4292 voprnds.quick_push (init);
4293
4294 number_of_places_left_in_vector = nunits;
4295 elts.new_vector (vector_type, nunits, 1);
4296 elts.quick_grow (nunits);
4297 constant_p = true;
4298 }
4299 }
4300 }
4301
4302 /* Since the vectors are created in the reverse order, we should invert
4303 them. */
4304 vec_num = voprnds.length ();
4305 for (j = vec_num; j != 0; j--)
4306 {
4307 vop = voprnds[j - 1];
4308 vec_oprnds->quick_push (vop);
4309 }
4310
4311 voprnds.release ();
4312
4313 /* In case that VF is greater than the unrolling factor needed for the SLP
4314 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4315 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4316 to replicate the vectors. */
4317 tree neutral_vec = NULL;
4318 while (number_of_vectors > vec_oprnds->length ())
4319 {
4320 if (neutral_op)
4321 {
4322 if (!neutral_vec)
4323 {
4324 gimple_seq ctor_seq = NULL;
4325 neutral_vec = gimple_build_vector_from_val
4326 (&ctor_seq, vector_type, neutral_op);
4327 if (ctor_seq != NULL)
4328 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4329 }
4330 vec_oprnds->quick_push (neutral_vec);
4331 }
4332 else
4333 {
4334 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4335 vec_oprnds->quick_push (vop);
4336 }
4337 }
4338 }
4339
4340
4341 /* Function vect_create_epilog_for_reduction
4342
4343 Create code at the loop-epilog to finalize the result of a reduction
4344 computation.
4345
4346 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4347 reduction statements.
4348 STMT is the scalar reduction stmt that is being vectorized.
4349 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4350 number of elements that we can fit in a vectype (nunits). In this case
4351 we have to generate more than one vector stmt - i.e - we need to "unroll"
4352 the vector stmt by a factor VF/nunits. For more details see documentation
4353 in vectorizable_operation.
4354 REDUC_FN is the internal function for the epilog reduction.
4355 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4356 computation.
4357 REDUC_INDEX is the index of the operand in the right hand side of the
4358 statement that is defined by REDUCTION_PHI.
4359 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4360 SLP_NODE is an SLP node containing a group of reduction statements. The
4361 first one in this group is STMT.
4362 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4363 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4364 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4365 any value of the IV in the loop.
4366 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4367 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4368 null if this is not an SLP reduction
4369
4370 This function:
4371 1. Creates the reduction def-use cycles: sets the arguments for
4372 REDUCTION_PHIS:
4373 The loop-entry argument is the vectorized initial-value of the reduction.
4374 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4375 sums.
4376 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4377 by calling the function specified by REDUC_FN if available, or by
4378 other means (whole-vector shifts or a scalar loop).
4379 The function also creates a new phi node at the loop exit to preserve
4380 loop-closed form, as illustrated below.
4381
4382 The flow at the entry to this function:
4383
4384 loop:
4385 vec_def = phi <null, null> # REDUCTION_PHI
4386 VECT_DEF = vector_stmt # vectorized form of STMT
4387 s_loop = scalar_stmt # (scalar) STMT
4388 loop_exit:
4389 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4390 use <s_out0>
4391 use <s_out0>
4392
4393 The above is transformed by this function into:
4394
4395 loop:
4396 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4397 VECT_DEF = vector_stmt # vectorized form of STMT
4398 s_loop = scalar_stmt # (scalar) STMT
4399 loop_exit:
4400 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4401 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4402 v_out2 = reduce <v_out1>
4403 s_out3 = extract_field <v_out2, 0>
4404 s_out4 = adjust_result <s_out3>
4405 use <s_out4>
4406 use <s_out4>
4407 */
4408
4409 static void
4410 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4411 gimple *reduc_def_stmt,
4412 int ncopies, internal_fn reduc_fn,
4413 vec<stmt_vec_info> reduction_phis,
4414 bool double_reduc,
4415 slp_tree slp_node,
4416 slp_instance slp_node_instance,
4417 tree induc_val, enum tree_code induc_code,
4418 tree neutral_op)
4419 {
4420 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4421 stmt_vec_info prev_phi_info;
4422 tree vectype;
4423 machine_mode mode;
4424 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4425 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4426 basic_block exit_bb;
4427 tree scalar_dest;
4428 tree scalar_type;
4429 gimple *new_phi = NULL, *phi;
4430 stmt_vec_info phi_info;
4431 gimple_stmt_iterator exit_gsi;
4432 tree vec_dest;
4433 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4434 gimple *epilog_stmt = NULL;
4435 enum tree_code code = gimple_assign_rhs_code (stmt);
4436 gimple *exit_phi;
4437 tree bitsize;
4438 tree adjustment_def = NULL;
4439 tree vec_initial_def = NULL;
4440 tree expr, def, initial_def = NULL;
4441 tree orig_name, scalar_result;
4442 imm_use_iterator imm_iter, phi_imm_iter;
4443 use_operand_p use_p, phi_use_p;
4444 gimple *use_stmt;
4445 stmt_vec_info reduction_phi_info = NULL;
4446 bool nested_in_vect_loop = false;
4447 auto_vec<gimple *> new_phis;
4448 auto_vec<stmt_vec_info> inner_phis;
4449 enum vect_def_type dt = vect_unknown_def_type;
4450 int j, i;
4451 auto_vec<tree> scalar_results;
4452 unsigned int group_size = 1, k, ratio;
4453 auto_vec<tree> vec_initial_defs;
4454 auto_vec<gimple *> phis;
4455 bool slp_reduc = false;
4456 bool direct_slp_reduc;
4457 tree new_phi_result;
4458 stmt_vec_info inner_phi = NULL;
4459 tree induction_index = NULL_TREE;
4460
4461 if (slp_node)
4462 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4463
4464 if (nested_in_vect_loop_p (loop, stmt))
4465 {
4466 outer_loop = loop;
4467 loop = loop->inner;
4468 nested_in_vect_loop = true;
4469 gcc_assert (!slp_node);
4470 }
4471
4472 vectype = STMT_VINFO_VECTYPE (stmt_info);
4473 gcc_assert (vectype);
4474 mode = TYPE_MODE (vectype);
4475
4476 /* 1. Create the reduction def-use cycle:
4477 Set the arguments of REDUCTION_PHIS, i.e., transform
4478
4479 loop:
4480 vec_def = phi <null, null> # REDUCTION_PHI
4481 VECT_DEF = vector_stmt # vectorized form of STMT
4482 ...
4483
4484 into:
4485
4486 loop:
4487 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4488 VECT_DEF = vector_stmt # vectorized form of STMT
4489 ...
4490
4491 (in case of SLP, do it for all the phis). */
4492
4493 /* Get the loop-entry arguments. */
4494 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4495 if (slp_node)
4496 {
4497 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4498 vec_initial_defs.reserve (vec_num);
4499 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4500 &vec_initial_defs, vec_num,
4501 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4502 neutral_op);
4503 }
4504 else
4505 {
4506 /* Get at the scalar def before the loop, that defines the initial value
4507 of the reduction variable. */
4508 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4509 loop_preheader_edge (loop));
4510 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4511 and we can't use zero for induc_val, use initial_def. Similarly
4512 for REDUC_MIN and initial_def larger than the base. */
4513 if (TREE_CODE (initial_def) == INTEGER_CST
4514 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4515 == INTEGER_INDUC_COND_REDUCTION)
4516 && !integer_zerop (induc_val)
4517 && ((induc_code == MAX_EXPR
4518 && tree_int_cst_lt (initial_def, induc_val))
4519 || (induc_code == MIN_EXPR
4520 && tree_int_cst_lt (induc_val, initial_def))))
4521 induc_val = initial_def;
4522
4523 if (double_reduc)
4524 /* In case of double reduction we only create a vector variable
4525 to be put in the reduction phi node. The actual statement
4526 creation is done later in this function. */
4527 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4528 else if (nested_in_vect_loop)
4529 {
4530 /* Do not use an adjustment def as that case is not supported
4531 correctly if ncopies is not one. */
4532 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4533 vec_initial_def = vect_get_vec_def_for_operand (initial_def, stmt);
4534 }
4535 else
4536 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4537 &adjustment_def);
4538 vec_initial_defs.create (1);
4539 vec_initial_defs.quick_push (vec_initial_def);
4540 }
4541
4542 /* Set phi nodes arguments. */
4543 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4544 {
4545 tree vec_init_def = vec_initial_defs[i];
4546 tree def = vect_defs[i];
4547 for (j = 0; j < ncopies; j++)
4548 {
4549 if (j != 0)
4550 {
4551 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4552 if (nested_in_vect_loop)
4553 vec_init_def
4554 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4555 vec_init_def);
4556 }
4557
4558 /* Set the loop-entry arg of the reduction-phi. */
4559
4560 gphi *phi = as_a <gphi *> (phi_info->stmt);
4561 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4562 == INTEGER_INDUC_COND_REDUCTION)
4563 {
4564 /* Initialise the reduction phi to zero. This prevents non-zero
4565 initial values from interfering with the reduction op. */
4566 gcc_assert (ncopies == 1);
4567 gcc_assert (i == 0);
4568
4569 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4570 tree induc_val_vec
4571 = build_vector_from_val (vec_init_def_type, induc_val);
4572
4573 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4574 UNKNOWN_LOCATION);
4575 }
4576 else
4577 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4578 UNKNOWN_LOCATION);
4579
4580 /* Set the loop-latch arg for the reduction-phi. */
4581 if (j > 0)
4582 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4583
4584 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4585
4586 if (dump_enabled_p ())
4587 {
4588 dump_printf_loc (MSG_NOTE, vect_location,
4589 "transform reduction: created def-use cycle: ");
4590 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4591 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4592 }
4593 }
4594 }
4595
4596 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4597 which is updated with the current index of the loop for every match of
4598 the original loop's cond_expr (VEC_STMT). This results in a vector
4599 containing the last time the condition passed for that vector lane.
4600 The first match will be a 1 to allow 0 to be used for non-matching
4601 indexes. If there are no matches at all then the vector will be all
4602 zeroes. */
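  /* Illustrative example (invented values): with a 4-lane vector and a
     loop whose condition holds only in scalar iterations 2 and 5
     (counting from 0), the vector of last-match indexes evolves as
     {0,0,0,0} -> {0,0,3,0} -> {0,6,3,0}: each lane records the 1-based
     position of its last match and stays 0 if it never matched.  */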
4603 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4604 {
4605 tree indx_before_incr, indx_after_incr;
4606 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4607
4608 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4609 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4610
4611 int scalar_precision
4612 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4613 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4614 tree cr_index_vector_type = build_vector_type
4615 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4616
4617 /* First we create a simple vector induction variable which starts
4618 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4619 vector size (STEP). */
4620
4621 /* Create a {1,2,3,...} vector. */
4622 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4623
4624 /* Create a vector of the step value. */
4625 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4626 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4627
4628 /* Create an induction variable. */
4629 gimple_stmt_iterator incr_gsi;
4630 bool insert_after;
4631 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4632 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4633 insert_after, &indx_before_incr, &indx_after_incr);
4634
4635 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4636 filled with zeros (VEC_ZERO). */
4637
4638 /* Create a vector of 0s. */
4639 tree zero = build_zero_cst (cr_index_scalar_type);
4640 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4641
4642 /* Create a vector phi node. */
4643 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4644 new_phi = create_phi_node (new_phi_tree, loop->header);
4645 loop_vinfo->add_stmt (new_phi);
4646 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4647 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4648
4649 /* Now take the condition from the loop's original cond_expr
4650 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4651 every match uses values from the induction variable
4652 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4653 (NEW_PHI_TREE).
4654 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4655 the new cond_expr (INDEX_COND_EXPR). */
4656
4657 /* Duplicate the condition from vec_stmt. */
4658 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4659
4660 /* Create a conditional, where the condition is taken from vec_stmt
4661 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4662 else is the phi (NEW_PHI_TREE). */
4663 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4664 ccompare, indx_before_incr,
4665 new_phi_tree);
4666 induction_index = make_ssa_name (cr_index_vector_type);
4667 gimple *index_condition = gimple_build_assign (induction_index,
4668 index_cond_expr);
4669 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4670 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4671 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4672
4673 /* Update the phi with the vec cond. */
4674 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4675 loop_latch_edge (loop), UNKNOWN_LOCATION);
4676 }
4677
4678 /* 2. Create epilog code.
4679 The reduction epilog code operates across the elements of the vector
4680 of partial results computed by the vectorized loop.
4681 The reduction epilog code consists of:
4682
4683 step 1: compute the scalar result in a vector (v_out2)
4684 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4685 step 3: adjust the scalar result (s_out3) if needed.
4686
4687 Step 1 can be accomplished using one of the following three schemes:
4688 (scheme 1) using reduc_fn, if available.
4689 (scheme 2) using whole-vector shifts, if available.
4690 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4691 combined.
4692
4693 The overall epilog code looks like this:
4694
4695 s_out0 = phi <s_loop> # original EXIT_PHI
4696 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4697 v_out2 = reduce <v_out1> # step 1
4698 s_out3 = extract_field <v_out2, 0> # step 2
4699 s_out4 = adjust_result <s_out3> # step 3
4700
4701 (step 3 is optional, and steps 1 and 2 may be combined).
4702 Lastly, the uses of s_out0 are replaced by s_out4. */
4703
4704
4705 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4706 v_out1 = phi <VECT_DEF>
4707 Store them in NEW_PHIS. */
4708
4709 exit_bb = single_exit (loop)->dest;
4710 prev_phi_info = NULL;
4711 new_phis.create (vect_defs.length ());
4712 FOR_EACH_VEC_ELT (vect_defs, i, def)
4713 {
4714 for (j = 0; j < ncopies; j++)
4715 {
4716 tree new_def = copy_ssa_name (def);
4717 phi = create_phi_node (new_def, exit_bb);
4718 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4719 if (j == 0)
4720 new_phis.quick_push (phi);
4721 else
4722 {
4723 def = vect_get_vec_def_for_stmt_copy (dt, def);
4724 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4725 }
4726
4727 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4728 prev_phi_info = phi_info;
4729 }
4730 }
4731
4732 /* The epilogue is created for the outer-loop, i.e., for the loop being
4733 vectorized. Create exit phis for the outer loop. */
4734 if (double_reduc)
4735 {
4736 loop = outer_loop;
4737 exit_bb = single_exit (loop)->dest;
4738 inner_phis.create (vect_defs.length ());
4739 FOR_EACH_VEC_ELT (new_phis, i, phi)
4740 {
4741 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4742 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4743 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4744 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4745 PHI_RESULT (phi));
4746 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4747 inner_phis.quick_push (phi_info);
4748 new_phis[i] = outer_phi;
4749 while (STMT_VINFO_RELATED_STMT (phi_info))
4750 {
4751 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4752 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4753 outer_phi = create_phi_node (new_result, exit_bb);
4754 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4755 PHI_RESULT (phi_info->stmt));
4756 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4757 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4758 prev_phi_info = outer_phi_info;
4759 }
4760 }
4761 }
4762
4763 exit_gsi = gsi_after_labels (exit_bb);
4764
4765 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4766 (i.e. when reduc_fn is not available) and in the final adjustment
4767 code (if needed). Also get the original scalar reduction variable as
4768 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4769 represents a reduction pattern), the tree-code and scalar-def are
4770 taken from the original stmt that the pattern-stmt (STMT) replaces.
4771 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4772 are taken from STMT. */
4773
4774 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4775 if (!orig_stmt_info)
4776 {
4777 /* Regular reduction */
4778 orig_stmt_info = stmt_info;
4779 }
4780 else
4781 {
4782 /* Reduction pattern */
4783 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4784 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4785 }
4786
4787 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4788 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4789 partial results are added and not subtracted. */
4790 if (code == MINUS_EXPR)
4791 code = PLUS_EXPR;
4792
4793 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4794 scalar_type = TREE_TYPE (scalar_dest);
4795 scalar_results.create (group_size);
4796 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4797 bitsize = TYPE_SIZE (scalar_type);
4798
4799 /* In case this is a reduction in an inner-loop while vectorizing an outer
4800 loop - we don't need to extract a single scalar result at the end of the
4801 inner-loop (unless it is a double reduction, i.e., the use of the reduction is
4802 outside the outer-loop). The final vector of partial results will be used
4803 in the vectorized outer-loop, or reduced to a scalar result at the end of
4804 the outer-loop. */
4805 if (nested_in_vect_loop && !double_reduc)
4806 goto vect_finalize_reduction;
4807
4808 /* SLP reduction without reduction chain, e.g.,
4809 # a1 = phi <a2, a0>
4810 # b1 = phi <b2, b0>
4811 a2 = operation (a1)
4812 b2 = operation (b1) */
4813 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4814
4815 /* True if we should implement SLP_REDUC using native reduction operations
4816 instead of scalar operations. */
4817 direct_slp_reduc = (reduc_fn != IFN_LAST
4818 && slp_reduc
4819 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4820
4821 /* In case of reduction chain, e.g.,
4822 # a1 = phi <a3, a0>
4823 a2 = operation (a1)
4824 a3 = operation (a2),
4825
4826 we may end up with more than one vector result. Here we reduce them to
4827 one vector. */
4828 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4829 {
4830 tree first_vect = PHI_RESULT (new_phis[0]);
4831 gassign *new_vec_stmt = NULL;
4832 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4833 for (k = 1; k < new_phis.length (); k++)
4834 {
4835 gimple *next_phi = new_phis[k];
4836 tree second_vect = PHI_RESULT (next_phi);
4837 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4838 new_vec_stmt = gimple_build_assign (tem, code,
4839 first_vect, second_vect);
4840 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4841 first_vect = tem;
4842 }
4843
4844 new_phi_result = first_vect;
4845 if (new_vec_stmt)
4846 {
4847 new_phis.truncate (0);
4848 new_phis.safe_push (new_vec_stmt);
4849 }
4850 }
4851 /* Likewise if we couldn't use a single defuse cycle. */
4852 else if (ncopies > 1)
4853 {
4854 gcc_assert (new_phis.length () == 1);
4855 tree first_vect = PHI_RESULT (new_phis[0]);
4856 gassign *new_vec_stmt = NULL;
4857 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4858 gimple *next_phi = new_phis[0];
4859 for (int k = 1; k < ncopies; ++k)
4860 {
4861 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4862 tree second_vect = PHI_RESULT (next_phi);
4863 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4864 new_vec_stmt = gimple_build_assign (tem, code,
4865 first_vect, second_vect);
4866 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4867 first_vect = tem;
4868 }
4869 new_phi_result = first_vect;
4870 new_phis.truncate (0);
4871 new_phis.safe_push (new_vec_stmt);
4872 }
4873 else
4874 new_phi_result = PHI_RESULT (new_phis[0]);
4875
4876 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4877 && reduc_fn != IFN_LAST)
4878 {
4879 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4880 various data values where the condition matched and another vector
4881 (INDUCTION_INDEX) containing all the indexes of those matches. We
4882 need to extract the last matching index (which will be the index with
4883 highest value) and use this to index into the data vector.
4884 For the case where there were no matches, the data vector will contain
4885 all default values and the index vector will be all zeros. */
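      /* As an illustration with invented values: if NEW_PHI_RESULT is
         {d0, d1, d2, d3} and INDUCTION_INDEX is {0, 6, 3, 0}, the code
         below computes the max index 6, builds {6, 6, 6, 6}, selects
         {0, d1, 0, 0} with a VEC_COND_EXPR and finally reduces that to
         d1 via IFN_REDUC_MAX on an unsigned view of the data.  */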
4886
4887 /* Get various versions of the type of the vector of indexes. */
4888 tree index_vec_type = TREE_TYPE (induction_index);
4889 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4890 tree index_scalar_type = TREE_TYPE (index_vec_type);
4891 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4892 (index_vec_type);
4893
4894 /* Get an unsigned integer version of the type of the data vector. */
4895 int scalar_precision
4896 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4897 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4898 tree vectype_unsigned = build_vector_type
4899 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4900
4901 /* First we need to create a vector (ZERO_VEC) of zeros and another
4902 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4903 can create using a MAX reduction and then expanding.
4904 In the case where the loop never made any matches, the max index will
4905 be zero. */
4906
4907 /* Vector of {0, 0, 0,...}. */
4908 tree zero_vec = make_ssa_name (vectype);
4909 tree zero_vec_rhs = build_zero_cst (vectype);
4910 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4911 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4912
4913 /* Find maximum value from the vector of found indexes. */
4914 tree max_index = make_ssa_name (index_scalar_type);
4915 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4916 1, induction_index);
4917 gimple_call_set_lhs (max_index_stmt, max_index);
4918 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4919
4920 /* Vector of {max_index, max_index, max_index,...}. */
4921 tree max_index_vec = make_ssa_name (index_vec_type);
4922 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4923 max_index);
4924 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4925 max_index_vec_rhs);
4926 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4927
4928 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4929 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4930 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4931 otherwise. Only one value should match, resulting in a vector
4932 (VEC_COND) with one data value and the rest zeros.
4933 In the case where the loop never made any matches, every index will
4934 match, resulting in a vector with all data values (which will all be
4935 the default value). */
4936
4937 /* Compare the max index vector to the vector of found indexes to find
4938 the position of the max value. */
4939 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4940 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4941 induction_index,
4942 max_index_vec);
4943 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4944
4945 /* Use the compare to choose either values from the data vector or
4946 zero. */
4947 tree vec_cond = make_ssa_name (vectype);
4948 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4949 vec_compare, new_phi_result,
4950 zero_vec);
4951 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4952
4953 /* Finally we need to extract the data value from the vector (VEC_COND)
4954 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4955 reduction, but because this doesn't exist, we can use a MAX reduction
4956 instead. The data value might be signed or a float so we need to cast
4957 it first.
4958 In the case where the loop never made any matches, the data values are
4959 all identical, and so will reduce down correctly. */
4960
4961 /* Make the matched data values unsigned. */
4962 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4963 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4964 vec_cond);
4965 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4966 VIEW_CONVERT_EXPR,
4967 vec_cond_cast_rhs);
4968 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4969
4970 /* Reduce down to a scalar value. */
4971 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4972 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4973 1, vec_cond_cast);
4974 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4975 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4976
4977 /* Convert the reduced value back to the result type and set as the
4978 result. */
4979 gimple_seq stmts = NULL;
4980 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4981 data_reduc);
4982 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4983 scalar_results.safe_push (new_temp);
4984 }
4985 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4986 && reduc_fn == IFN_LAST)
4987 {
4988 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4989 idx = 0;
4990 idx_val = induction_index[0];
4991 val = data_reduc[0];
4992 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4993 if (induction_index[i] > idx_val)
4994 val = data_reduc[i], idx_val = induction_index[i];
4995 return val; */
4996
4997 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4998 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4999 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5000 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5001 /* Enforced by vectorizable_reduction, which ensures we have target
5002 support before allowing a conditional reduction on variable-length
5003 vectors. */
5004 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5005 tree idx_val = NULL_TREE, val = NULL_TREE;
5006 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5007 {
5008 tree old_idx_val = idx_val;
5009 tree old_val = val;
5010 idx_val = make_ssa_name (idx_eltype);
5011 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5012 build3 (BIT_FIELD_REF, idx_eltype,
5013 induction_index,
5014 bitsize_int (el_size),
5015 bitsize_int (off)));
5016 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5017 val = make_ssa_name (data_eltype);
5018 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5019 build3 (BIT_FIELD_REF,
5020 data_eltype,
5021 new_phi_result,
5022 bitsize_int (el_size),
5023 bitsize_int (off)));
5024 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5025 if (off != 0)
5026 {
5027 tree new_idx_val = idx_val;
5028 tree new_val = val;
5029 if (off != v_size - el_size)
5030 {
5031 new_idx_val = make_ssa_name (idx_eltype);
5032 epilog_stmt = gimple_build_assign (new_idx_val,
5033 MAX_EXPR, idx_val,
5034 old_idx_val);
5035 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5036 }
5037 new_val = make_ssa_name (data_eltype);
5038 epilog_stmt = gimple_build_assign (new_val,
5039 COND_EXPR,
5040 build2 (GT_EXPR,
5041 boolean_type_node,
5042 idx_val,
5043 old_idx_val),
5044 val, old_val);
5045 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5046 idx_val = new_idx_val;
5047 val = new_val;
5048 }
5049 }
5050 /* Convert the reduced value back to the result type and set as the
5051 result. */
5052 gimple_seq stmts = NULL;
5053 val = gimple_convert (&stmts, scalar_type, val);
5054 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5055 scalar_results.safe_push (val);
5056 }
5057
5058 /* 2.3 Create the reduction code, using one of the three schemes described
5059 above. In SLP we simply need to extract all the elements from the
5060 vector (without reducing them), so we use scalar shifts. */
5061 else if (reduc_fn != IFN_LAST && !slp_reduc)
5062 {
5063 tree tmp;
5064 tree vec_elem_type;
5065
5066 /* Case 1: Create:
5067 v_out2 = reduc_expr <v_out1> */
5068
5069 if (dump_enabled_p ())
5070 dump_printf_loc (MSG_NOTE, vect_location,
5071 "Reduce using direct vector reduction.\n");
5072
5073 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5074 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5075 {
5076 tree tmp_dest
5077 = vect_create_destination_var (scalar_dest, vec_elem_type);
5078 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5079 new_phi_result);
5080 gimple_set_lhs (epilog_stmt, tmp_dest);
5081 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5082 gimple_set_lhs (epilog_stmt, new_temp);
5083 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5084
5085 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5086 new_temp);
5087 }
5088 else
5089 {
5090 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5091 new_phi_result);
5092 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5093 }
5094
5095 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5096 gimple_set_lhs (epilog_stmt, new_temp);
5097 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5098
5099 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5100 == INTEGER_INDUC_COND_REDUCTION)
5101 && !operand_equal_p (initial_def, induc_val, 0))
5102 {
5103 /* Earlier we set the initial value to be a vector of induc_val
5104 values. Check the result and if it is induc_val then replace
5105 with the original initial value, unless induc_val is
5106 the same as initial_def already. */
5107 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5108 induc_val);
5109
5110 tmp = make_ssa_name (new_scalar_dest);
5111 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5112 initial_def, new_temp);
5113 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5114 new_temp = tmp;
5115 }
5116
5117 scalar_results.safe_push (new_temp);
5118 }
5119 else if (direct_slp_reduc)
5120 {
5121 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5122 with the elements for other SLP statements replaced with the
5123 neutral value. We can then do a normal reduction on each vector. */
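/* Worked example (illustrative values): for a group of two PLUS
   reductions whose lanes are interleaved as {a0, b0, a1, b1}, the code
   below builds {a0, 0, a1, 0} for the first result and {0, b0, 0, b1}
   for the second, 0 being the neutral value for PLUS, and then applies
   an ordinary full-vector reduction to each of them.  */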
5124
5125 /* Enforced by vectorizable_reduction. */
5126 gcc_assert (new_phis.length () == 1);
5127 gcc_assert (pow2p_hwi (group_size));
5128
5129 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5130 vec<stmt_vec_info> orig_phis
5131 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5132 gimple_seq seq = NULL;
5133
5134 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5135 and the same element size as VECTYPE. */
5136 tree index = build_index_vector (vectype, 0, 1);
5137 tree index_type = TREE_TYPE (index);
5138 tree index_elt_type = TREE_TYPE (index_type);
5139 tree mask_type = build_same_sized_truth_vector_type (index_type);
5140
5141 /* Create a vector that, for each element, identifies which of
5142 the REDUC_GROUP_SIZE results should use it. */
5143 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5144 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5145 build_vector_from_val (index_type, index_mask));
5146
5147 /* Get a neutral vector value. This is simply a splat of the neutral
5148 scalar value if we have one, otherwise the initial scalar value
5149 is itself a neutral value. */
5150 tree vector_identity = NULL_TREE;
5151 if (neutral_op)
5152 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5153 neutral_op);
5154 for (unsigned int i = 0; i < group_size; ++i)
5155 {
5156 /* If there's no universal neutral value, we can use the
5157 initial scalar value from the original PHI. This is used
5158 for MIN and MAX reduction, for example. */
5159 if (!neutral_op)
5160 {
5161 tree scalar_value
5162 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5163 loop_preheader_edge (loop));
5164 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5165 scalar_value);
5166 }
5167
5168 /* Calculate the equivalent of:
5169
5170 sel[j] = (index[j] == i);
5171
5172 which selects the elements of NEW_PHI_RESULT that should
5173 be included in the result. */
5174 tree compare_val = build_int_cst (index_elt_type, i);
5175 compare_val = build_vector_from_val (index_type, compare_val);
5176 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5177 index, compare_val);
5178
5179 /* Calculate the equivalent of:
5180
5181 vec = sel ? new_phi_result : vector_identity;
5182
5183 VEC is now suitable for a full vector reduction. */
5184 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5185 sel, new_phi_result, vector_identity);
5186
5187 /* Do the reduction and convert it to the appropriate type. */
5188 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5189 TREE_TYPE (vectype), vec);
5190 scalar = gimple_convert (&seq, scalar_type, scalar);
5191 scalar_results.safe_push (scalar);
5192 }
5193 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5194 }
5195 else
5196 {
5197 bool reduce_with_shift;
5198 tree vec_temp;
5199
5200 /* COND reductions all do the final reduction with MAX_EXPR
5201 or MIN_EXPR. */
5202 if (code == COND_EXPR)
5203 {
5204 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5205 == INTEGER_INDUC_COND_REDUCTION)
5206 code = induc_code;
5207 else
5208 code = MAX_EXPR;
5209 }
5210
5211 /* See if the target wants to do the final (shift) reduction
5212 in a vector mode of smaller size and first reduce upper/lower
5213 halves against each other. */
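/* For instance, the hook may report that a 256-bit accumulator is best
   reduced by first splitting it into two 128-bit halves combined with
   CODE, after which the shift-based reduction operates on the narrower
   mode (sizes here are only an example; the choice comes from
   targetm.vectorize.split_reduction).  */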
5214 enum machine_mode mode1 = mode;
5215 tree vectype1 = vectype;
5216 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5217 unsigned sz1 = sz;
5218 if (!slp_reduc
5219 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5220 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5221
5222 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5223 reduce_with_shift = have_whole_vector_shift (mode1);
5224 if (!VECTOR_MODE_P (mode1))
5225 reduce_with_shift = false;
5226 else
5227 {
5228 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5229 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5230 reduce_with_shift = false;
5231 }
5232
5233 /* First reduce the vector to the desired vector size on which we
5234 should do the shift reduction, by combining upper and lower halves. */
5235 new_temp = new_phi_result;
5236 while (sz > sz1)
5237 {
5238 gcc_assert (!slp_reduc);
5239 sz /= 2;
5240 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5241
5242 /* The target has to make sure we support lowpart/highpart
5243 extraction, either via direct vector extract or through
5244 an integer mode punning. */
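/* E.g. when halving a 32-byte vector (sizes for illustration only):
   if vec_extract can extract the 16-byte subvector type directly, the
   two BIT_FIELD_REFs below are used; otherwise the value is first
   view-converted to a two-element vector of 128-bit integers, each
   element is extracted, and the pieces are view-converted back to the
   16-byte vector type.  */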
5245 tree dst1, dst2;
5246 if (convert_optab_handler (vec_extract_optab,
5247 TYPE_MODE (TREE_TYPE (new_temp)),
5248 TYPE_MODE (vectype1))
5249 != CODE_FOR_nothing)
5250 {
5251 /* Extract sub-vectors directly once vec_extract becomes
5252 a conversion optab. */
5253 dst1 = make_ssa_name (vectype1);
5254 epilog_stmt
5255 = gimple_build_assign (dst1, BIT_FIELD_REF,
5256 build3 (BIT_FIELD_REF, vectype1,
5257 new_temp, TYPE_SIZE (vectype1),
5258 bitsize_int (0)));
5259 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5260 dst2 = make_ssa_name (vectype1);
5261 epilog_stmt
5262 = gimple_build_assign (dst2, BIT_FIELD_REF,
5263 build3 (BIT_FIELD_REF, vectype1,
5264 new_temp, TYPE_SIZE (vectype1),
5265 bitsize_int (sz * BITS_PER_UNIT)));
5266 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5267 }
5268 else
5269 {
5270 /* Extract via punning to appropriately sized integer mode
5271 vector. */
5272 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5273 1);
5274 tree etype = build_vector_type (eltype, 2);
5275 gcc_assert (convert_optab_handler (vec_extract_optab,
5276 TYPE_MODE (etype),
5277 TYPE_MODE (eltype))
5278 != CODE_FOR_nothing);
5279 tree tem = make_ssa_name (etype);
5280 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5281 build1 (VIEW_CONVERT_EXPR,
5282 etype, new_temp));
5283 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5284 new_temp = tem;
5285 tem = make_ssa_name (eltype);
5286 epilog_stmt
5287 = gimple_build_assign (tem, BIT_FIELD_REF,
5288 build3 (BIT_FIELD_REF, eltype,
5289 new_temp, TYPE_SIZE (eltype),
5290 bitsize_int (0)));
5291 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5292 dst1 = make_ssa_name (vectype1);
5293 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5294 build1 (VIEW_CONVERT_EXPR,
5295 vectype1, tem));
5296 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5297 tem = make_ssa_name (eltype);
5298 epilog_stmt
5299 = gimple_build_assign (tem, BIT_FIELD_REF,
5300 build3 (BIT_FIELD_REF, eltype,
5301 new_temp, TYPE_SIZE (eltype),
5302 bitsize_int (sz * BITS_PER_UNIT)));
5303 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5304 dst2 = make_ssa_name (vectype1);
5305 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5306 build1 (VIEW_CONVERT_EXPR,
5307 vectype1, tem));
5308 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5309 }
5310
5311 new_temp = make_ssa_name (vectype1);
5312 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5313 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5314 }
5315
5316 if (reduce_with_shift && !slp_reduc)
5317 {
5318 int element_bitsize = tree_to_uhwi (bitsize);
5319 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5320 for variable-length vectors and also requires direct target support
5321 for loop reductions. */
5322 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5323 int nelements = vec_size_in_bits / element_bitsize;
5324 vec_perm_builder sel;
5325 vec_perm_indices indices;
5326
5327 int elt_offset;
5328
5329 tree zero_vec = build_zero_cst (vectype1);
5330 /* Case 2: Create:
5331 for (offset = nelements/2; offset >= 1; offset/=2)
5332 {
5333 Create: va' = vec_shift <va, offset>
5334 Create: va = vop <va, va'>
5335 } */
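/* Worked example for a PLUS reduction of a 4-element vector
   (illustrative values): starting from va = {a, b, c, d}, offset 2
   gives va' = {c, d, 0, 0} and va = {a+c, b+d, ...}; offset 1 then
   gives va' = {b+d, ...} and va = {a+b+c+d, ...}.  Only element 0 is
   meaningful and is extracted below as the scalar result.  */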
5336
5337 tree rhs;
5338
5339 if (dump_enabled_p ())
5340 dump_printf_loc (MSG_NOTE, vect_location,
5341 "Reduce using vector shifts\n");
5342
5343 mode1 = TYPE_MODE (vectype1);
5344 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5345 for (elt_offset = nelements / 2;
5346 elt_offset >= 1;
5347 elt_offset /= 2)
5348 {
5349 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5350 indices.new_vector (sel, 2, nelements);
5351 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5352 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5353 new_temp, zero_vec, mask);
5354 new_name = make_ssa_name (vec_dest, epilog_stmt);
5355 gimple_assign_set_lhs (epilog_stmt, new_name);
5356 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5357
5358 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5359 new_temp);
5360 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5361 gimple_assign_set_lhs (epilog_stmt, new_temp);
5362 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5363 }
5364
5365 /* 2.4 Extract the final scalar result. Create:
5366 s_out3 = extract_field <v_out2, bitpos> */
5367
5368 if (dump_enabled_p ())
5369 dump_printf_loc (MSG_NOTE, vect_location,
5370 "extract scalar result\n");
5371
5372 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5373 bitsize, bitsize_zero_node);
5374 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5375 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5376 gimple_assign_set_lhs (epilog_stmt, new_temp);
5377 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5378 scalar_results.safe_push (new_temp);
5379 }
5380 else
5381 {
5382 /* Case 3: Create:
5383 s = extract_field <v_out2, 0>
5384 for (offset = element_size;
5385 offset < vector_size;
5386 offset += element_size;)
5387 {
5388 Create: s' = extract_field <v_out2, offset>
5389 Create: s = op <s, s'> // For non SLP cases
5390 } */
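/* E.g. for a 4-element vector {a, b, c, d} (illustrative values) the
   non-SLP path folds the extracted elements together one at a time,
   ending with s = a op b op c op d, while the SLP path skips the op
   and simply pushes a, b, c and d into SCALAR_RESULTS.  */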
5391
5392 if (dump_enabled_p ())
5393 dump_printf_loc (MSG_NOTE, vect_location,
5394 "Reduce using scalar code.\n");
5395
5396 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5397 int element_bitsize = tree_to_uhwi (bitsize);
5398 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5399 {
5400 int bit_offset;
5401 if (gimple_code (new_phi) == GIMPLE_PHI)
5402 vec_temp = PHI_RESULT (new_phi);
5403 else
5404 vec_temp = gimple_assign_lhs (new_phi);
5405 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5406 bitsize_zero_node);
5407 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5408 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5409 gimple_assign_set_lhs (epilog_stmt, new_temp);
5410 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5411
5412 /* In SLP we don't need to apply reduction operation, so we just
5413 collect s' values in SCALAR_RESULTS. */
5414 if (slp_reduc)
5415 scalar_results.safe_push (new_temp);
5416
5417 for (bit_offset = element_bitsize;
5418 bit_offset < vec_size_in_bits;
5419 bit_offset += element_bitsize)
5420 {
5421 tree bitpos = bitsize_int (bit_offset);
5422 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5423 bitsize, bitpos);
5424
5425 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5426 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5427 gimple_assign_set_lhs (epilog_stmt, new_name);
5428 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5429
5430 if (slp_reduc)
5431 {
5432 /* In SLP we don't need to apply reduction operation, so
5433 we just collect s' values in SCALAR_RESULTS. */
5434 new_temp = new_name;
5435 scalar_results.safe_push (new_name);
5436 }
5437 else
5438 {
5439 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5440 new_name, new_temp);
5441 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5442 gimple_assign_set_lhs (epilog_stmt, new_temp);
5443 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5444 }
5445 }
5446 }
5447
5448 /* The only case where we need to reduce scalar results in SLP is
5449 unrolling. If the size of SCALAR_RESULTS is greater than
5450 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5451 REDUC_GROUP_SIZE. */
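/* Example (group size 2, unrolled twice, illustrative names):
   SCALAR_RESULTS = {r0, r1, r2, r3} becomes {r0 op r2, r1 op r3},
   i.e. entry J is folded into entry J % REDUC_GROUP_SIZE.  */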
5452 if (slp_reduc)
5453 {
5454 tree res, first_res, new_res;
5455 gimple *new_stmt;
5456
5457 /* Reduce multiple scalar results in case of SLP unrolling. */
5458 for (j = group_size; scalar_results.iterate (j, &res);
5459 j++)
5460 {
5461 first_res = scalar_results[j % group_size];
5462 new_stmt = gimple_build_assign (new_scalar_dest, code,
5463 first_res, res);
5464 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5465 gimple_assign_set_lhs (new_stmt, new_res);
5466 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5467 scalar_results[j % group_size] = new_res;
5468 }
5469 }
5470 else
5471 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5472 scalar_results.safe_push (new_temp);
5473 }
5474
5475 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5476 == INTEGER_INDUC_COND_REDUCTION)
5477 && !operand_equal_p (initial_def, induc_val, 0))
5478 {
5479 /* Earlier we set the initial value to be a vector of induc_val
5480 values. Check the result and if it is induc_val then replace
5481 with the original initial value, unless induc_val is
5482 the same as initial_def already. */
5483 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5484 induc_val);
5485
5486 tree tmp = make_ssa_name (new_scalar_dest);
5487 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5488 initial_def, new_temp);
5489 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5490 scalar_results[0] = tmp;
5491 }
5492 }
5493
5494 vect_finalize_reduction:
5495
5496 if (double_reduc)
5497 loop = loop->inner;
5498
5499 /* 2.5 Adjust the final result by the initial value of the reduction
5500 variable. (When such adjustment is not needed, then
5501 'adjustment_def' is zero). For example, if code is PLUS we create:
5502 new_temp = loop_exit_def + adjustment_def */
5503
5504 if (adjustment_def)
5505 {
5506 gcc_assert (!slp_reduc);
5507 if (nested_in_vect_loop)
5508 {
5509 new_phi = new_phis[0];
5510 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5511 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5512 new_dest = vect_create_destination_var (scalar_dest, vectype);
5513 }
5514 else
5515 {
5516 new_temp = scalar_results[0];
5517 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5518 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5519 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5520 }
5521
5522 epilog_stmt = gimple_build_assign (new_dest, expr);
5523 new_temp = make_ssa_name (new_dest, epilog_stmt);
5524 gimple_assign_set_lhs (epilog_stmt, new_temp);
5525 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5526 if (nested_in_vect_loop)
5527 {
5528 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5529 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5530 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5531
5532 if (!double_reduc)
5533 scalar_results.quick_push (new_temp);
5534 else
5535 scalar_results[0] = new_temp;
5536 }
5537 else
5538 scalar_results[0] = new_temp;
5539
5540 new_phis[0] = epilog_stmt;
5541 }
5542
5543 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5544 phis with new adjusted scalar results, i.e., replace use <s_out0>
5545 with use <s_out4>.
5546
5547 Transform:
5548 loop_exit:
5549 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5550 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5551 v_out2 = reduce <v_out1>
5552 s_out3 = extract_field <v_out2, 0>
5553 s_out4 = adjust_result <s_out3>
5554 use <s_out0>
5555 use <s_out0>
5556
5557 into:
5558
5559 loop_exit:
5560 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5561 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5562 v_out2 = reduce <v_out1>
5563 s_out3 = extract_field <v_out2, 0>
5564 s_out4 = adjust_result <s_out3>
5565 use <s_out4>
5566 use <s_out4> */
5567
5568
5569 /* In an SLP reduction chain we reduce vector results into one vector if
5570 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5571 LHS of the last stmt in the reduction chain, since we are looking for
5572 the loop exit phi node. */
5573 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5574 {
5575 stmt_vec_info dest_stmt_info
5576 = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5577 /* Handle reduction patterns. */
5578 if (STMT_VINFO_RELATED_STMT (dest_stmt_info))
5579 dest_stmt_info = STMT_VINFO_RELATED_STMT (dest_stmt_info);
5580
5581 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5582 group_size = 1;
5583 }
5584
5585 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5586 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5587 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5588 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5589 correspond to the first vector stmt, etc.
5590 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
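/* For instance (numbers purely illustrative): with REDUC_GROUP_SIZE 4
   and two new vector stmts, RATIO is 2, so scalar results 0 and 1
   belong to the first vector stmt and results 2 and 3 to the second.  */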
5591 if (group_size > new_phis.length ())
5592 {
5593 ratio = group_size / new_phis.length ();
5594 gcc_assert (!(group_size % new_phis.length ()));
5595 }
5596 else
5597 ratio = 1;
5598
5599 for (k = 0; k < group_size; k++)
5600 {
5601 if (k % ratio == 0)
5602 {
5603 epilog_stmt = new_phis[k / ratio];
5604 reduction_phi_info = reduction_phis[k / ratio];
5605 if (double_reduc)
5606 inner_phi = inner_phis[k / ratio];
5607 }
5608
5609 if (slp_reduc)
5610 {
5611 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5612
5613 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5614 /* SLP statements can't participate in patterns. */
5615 gcc_assert (!orig_stmt_info);
5616 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5617 }
5618
5619 phis.create (3);
5620 /* Find the loop-closed-use at the loop exit of the original scalar
5621 result. (The reduction result is expected to have two immediate uses -
5622 one at the latch block, and one at the loop exit). */
5623 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5624 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5625 && !is_gimple_debug (USE_STMT (use_p)))
5626 phis.safe_push (USE_STMT (use_p));
5627
5628 /* While we expect to have found an exit_phi because of loop-closed-ssa
5629 form we can end up without one if the scalar cycle is dead. */
5630
5631 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5632 {
5633 if (outer_loop)
5634 {
5635 stmt_vec_info exit_phi_vinfo
5636 = loop_vinfo->lookup_stmt (exit_phi);
5637 gphi *vect_phi;
5638
5639 /* FORNOW. Currently not supporting the case that an inner-loop
5640 reduction is not used in the outer-loop (but only outside the
5641 outer-loop), unless it is a double reduction. */
5642 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5643 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5644 || double_reduc);
5645
5646 if (double_reduc)
5647 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5648 else
5649 STMT_VINFO_VEC_STMT (exit_phi_vinfo)
5650 = vinfo_for_stmt (epilog_stmt);
5651 if (!double_reduc
5652 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5653 != vect_double_reduction_def)
5654 continue;
5655
5656 /* Handle double reduction:
5657
5658 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5659 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5660 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5661 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5662
5663 At that point the regular reduction (stmt2 and stmt3) is
5664 already vectorized, as well as the exit phi node, stmt4.
5665 Here we vectorize the phi node of double reduction, stmt1, and
5666 update all relevant statements. */
5667
5668 /* Go through all the uses of s2 to find double reduction phi
5669 node, i.e., stmt1 above. */
5670 orig_name = PHI_RESULT (exit_phi);
5671 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5672 {
5673 stmt_vec_info use_stmt_vinfo;
5674 tree vect_phi_init, preheader_arg, vect_phi_res;
5675 basic_block bb = gimple_bb (use_stmt);
5676
5677 /* Check that USE_STMT is really a double reduction phi
5678 node. */
5679 if (gimple_code (use_stmt) != GIMPLE_PHI
5680 || gimple_phi_num_args (use_stmt) != 2
5681 || bb->loop_father != outer_loop)
5682 continue;
5683 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5684 if (!use_stmt_vinfo
5685 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5686 != vect_double_reduction_def)
5687 continue;
5688
5689 /* Create vector phi node for double reduction:
5690 vs1 = phi <vs0, vs2>
5691 vs1 was created previously in this function by a call to
5692 vect_get_vec_def_for_operand and is stored in
5693 vec_initial_def;
5694 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5695 vs0 is created here. */
5696
5697 /* Create vector phi node. */
5698 vect_phi = create_phi_node (vec_initial_def, bb);
5699 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5700
5701 /* Create vs0 - initial def of the double reduction phi. */
5702 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5703 loop_preheader_edge (outer_loop));
5704 vect_phi_init = get_initial_def_for_reduction
5705 (stmt, preheader_arg, NULL);
5706
5707 /* Update phi node arguments with vs0 and vs2. */
5708 add_phi_arg (vect_phi, vect_phi_init,
5709 loop_preheader_edge (outer_loop),
5710 UNKNOWN_LOCATION);
5711 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5712 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5713 if (dump_enabled_p ())
5714 {
5715 dump_printf_loc (MSG_NOTE, vect_location,
5716 "created double reduction phi node: ");
5717 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5718 }
5719
5720 vect_phi_res = PHI_RESULT (vect_phi);
5721
5722 /* Replace the use, i.e., set the correct vs1 in the regular
5723 reduction phi node. FORNOW, NCOPIES is always 1, so the
5724 loop is redundant. */
5725 stmt_vec_info use_info = reduction_phi_info;
5726 for (j = 0; j < ncopies; j++)
5727 {
5728 edge pr_edge = loop_preheader_edge (loop);
5729 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5730 pr_edge->dest_idx, vect_phi_res);
5731 use_info = STMT_VINFO_RELATED_STMT (use_info);
5732 }
5733 }
5734 }
5735 }
5736
5737 phis.release ();
5738 if (nested_in_vect_loop)
5739 {
5740 if (double_reduc)
5741 loop = outer_loop;
5742 else
5743 continue;
5744 }
5745
5746 phis.create (3);
5747 /* Find the loop-closed-use at the loop exit of the original scalar
5748 result. (The reduction result is expected to have two immediate uses,
5749 one at the latch block, and one at the loop exit). For double
5750 reductions we are looking for exit phis of the outer loop. */
5751 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5752 {
5753 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5754 {
5755 if (!is_gimple_debug (USE_STMT (use_p)))
5756 phis.safe_push (USE_STMT (use_p));
5757 }
5758 else
5759 {
5760 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5761 {
5762 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5763
5764 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5765 {
5766 if (!flow_bb_inside_loop_p (loop,
5767 gimple_bb (USE_STMT (phi_use_p)))
5768 && !is_gimple_debug (USE_STMT (phi_use_p)))
5769 phis.safe_push (USE_STMT (phi_use_p));
5770 }
5771 }
5772 }
5773 }
5774
5775 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5776 {
5777 /* Replace the uses: */
5778 orig_name = PHI_RESULT (exit_phi);
5779 scalar_result = scalar_results[k];
5780 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5781 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5782 SET_USE (use_p, scalar_result);
5783 }
5784
5785 phis.release ();
5786 }
5787 }
5788
5789 /* Return a vector of type VECTYPE that is equal to the vector select
5790 operation "MASK ? VEC : IDENTITY". Insert the select statements
5791 before GSI. */
5792
5793 static tree
5794 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5795 tree vec, tree identity)
5796 {
5797 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5798 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5799 mask, vec, identity);
5800 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5801 return cond;
5802 }
5803
5804 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5805 order, starting with LHS. Insert the extraction statements before GSI and
5806 associate the new scalar SSA names with variable SCALAR_DEST.
5807 Return the SSA name for the result. */
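/* For example (purely illustrative), with a 4-element VECTOR_RHS the
   result is (((LHS code v[0]) code v[1]) code v[2]) code v[3], each
   element being extracted with a BIT_FIELD_REF before it is folded
   in.  */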
5808
5809 static tree
5810 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5811 tree_code code, tree lhs, tree vector_rhs)
5812 {
5813 tree vectype = TREE_TYPE (vector_rhs);
5814 tree scalar_type = TREE_TYPE (vectype);
5815 tree bitsize = TYPE_SIZE (scalar_type);
5816 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5817 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5818
5819 for (unsigned HOST_WIDE_INT bit_offset = 0;
5820 bit_offset < vec_size_in_bits;
5821 bit_offset += element_bitsize)
5822 {
5823 tree bitpos = bitsize_int (bit_offset);
5824 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5825 bitsize, bitpos);
5826
5827 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5828 rhs = make_ssa_name (scalar_dest, stmt);
5829 gimple_assign_set_lhs (stmt, rhs);
5830 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5831
5832 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5833 tree new_name = make_ssa_name (scalar_dest, stmt);
5834 gimple_assign_set_lhs (stmt, new_name);
5835 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5836 lhs = new_name;
5837 }
5838 return lhs;
5839 }
5840
5841 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
5842 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5843 statement. CODE is the operation performed by STMT and OPS are
5844 its scalar operands. REDUC_INDEX is the index of the operand in
5845 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5846 implements in-order reduction, or IFN_LAST if we should open-code it.
5847 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5848 that should be used to control the operation in a fully-masked loop. */
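/* An in-order reduction keeps the scalar evaluation order, which
   matters e.g. for strict floating-point addition: conceptually each
   vector iteration computes
   reduc_var = (...(reduc_var op v[0]) op v[1] ...) op v[n-1],
   either through a single fold-left internal function call or, when
   REDUC_FN is IFN_LAST, by expanding the per-element operations
   explicitly.  */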
5849
5850 static bool
5851 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5852 stmt_vec_info *vec_stmt, slp_tree slp_node,
5853 gimple *reduc_def_stmt,
5854 tree_code code, internal_fn reduc_fn,
5855 tree ops[3], tree vectype_in,
5856 int reduc_index, vec_loop_masks *masks)
5857 {
5858 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5859 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5860 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5861 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5862 stmt_vec_info new_stmt_info = NULL;
5863
5864 int ncopies;
5865 if (slp_node)
5866 ncopies = 1;
5867 else
5868 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5869
5870 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5871 gcc_assert (ncopies == 1);
5872 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5873 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5874 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5875 == FOLD_LEFT_REDUCTION);
5876
5877 if (slp_node)
5878 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5879 TYPE_VECTOR_SUBPARTS (vectype_in)));
5880
5881 tree op0 = ops[1 - reduc_index];
5882
5883 int group_size = 1;
5884 stmt_vec_info scalar_dest_def_info;
5885 auto_vec<tree> vec_oprnds0;
5886 if (slp_node)
5887 {
5888 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5889 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5890 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5891 }
5892 else
5893 {
5894 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5895 vec_oprnds0.create (1);
5896 vec_oprnds0.quick_push (loop_vec_def0);
5897 scalar_dest_def_info = stmt_info;
5898 }
5899
5900 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5901 tree scalar_type = TREE_TYPE (scalar_dest);
5902 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5903
5904 int vec_num = vec_oprnds0.length ();
5905 gcc_assert (vec_num == 1 || slp_node);
5906 tree vec_elem_type = TREE_TYPE (vectype_out);
5907 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5908
5909 tree vector_identity = NULL_TREE;
5910 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5911 vector_identity = build_zero_cst (vectype_out);
5912
5913 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5914 int i;
5915 tree def0;
5916 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5917 {
5918 gimple *new_stmt;
5919 tree mask = NULL_TREE;
5920 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5921 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5922
5923 /* Handle MINUS by adding the negative. */
5924 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5925 {
5926 tree negated = make_ssa_name (vectype_out);
5927 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5928 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5929 def0 = negated;
5930 }
5931
5932 if (mask)
5933 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5934 vector_identity);
5935
5936 /* On the first iteration the input is simply the scalar phi
5937 result, and for subsequent iterations it is the output of
5938 the preceding operation. */
5939 if (reduc_fn != IFN_LAST)
5940 {
5941 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5942 /* For chained SLP reductions the output of the previous reduction
5943 operation serves as the input of the next. For the final statement
5944 the output cannot be a temporary - we reuse the original
5945 scalar destination of the last statement. */
5946 if (i != vec_num - 1)
5947 {
5948 gimple_set_lhs (new_stmt, scalar_dest_var);
5949 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5950 gimple_set_lhs (new_stmt, reduc_var);
5951 }
5952 }
5953 else
5954 {
5955 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5956 reduc_var, def0);
5957 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5958 /* Remove the statement, so that we can use the same code paths
5959 as for statements that we've just created. */
5960 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5961 gsi_remove (&tmp_gsi, false);
5962 }
5963
5964 if (i == vec_num - 1)
5965 {
5966 gimple_set_lhs (new_stmt, scalar_dest);
5967 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5968 new_stmt);
5969 }
5970 else
5971 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5972 new_stmt, gsi);
5973
5974 if (slp_node)
5975 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5976 }
5977
5978 if (!slp_node)
5979 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5980
5981 return true;
5982 }
5983
5984 /* Function is_nonwrapping_integer_induction.
5985
5986 Check if STMT (which is part of loop LOOP) describes an induction
5987 that increments and does not cause overflow. */
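/* A minimal sketch of the check, assuming an induction with constant
   base B, constant step S and at most N loop iterations: the largest
   value the induction can reach is B + S * N computed in infinite
   precision, and the induction is considered non-wrapping only if that
   value still fits in the precision of the PHI result type (or if
   overflow in that type is undefined anyway).  */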
5988
5989 static bool
5990 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5991 {
5992 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5993 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5994 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5995 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5996 widest_int ni, max_loop_value, lhs_max;
5997 wi::overflow_type overflow = wi::OVF_NONE;
5998
5999 /* Make sure the loop is integer based. */
6000 if (TREE_CODE (base) != INTEGER_CST
6001 || TREE_CODE (step) != INTEGER_CST)
6002 return false;
6003
6004 /* Check that the max size of the loop will not wrap. */
6005
6006 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6007 return true;
6008
6009 if (! max_stmt_executions (loop, &ni))
6010 return false;
6011
6012 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6013 &overflow);
6014 if (overflow)
6015 return false;
6016
6017 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6018 TYPE_SIGN (lhs_type), &overflow);
6019 if (overflow)
6020 return false;
6021
6022 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6023 <= TYPE_PRECISION (lhs_type));
6024 }
6025
6026 /* Function vectorizable_reduction.
6027
6028 Check if STMT performs a reduction operation that can be vectorized.
6029 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6030 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6031 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6032
6033 This function also handles reduction idioms (patterns) that have been
6034 recognized in advance during vect_pattern_recog. In this case, STMT may be
6035 of this form:
6036 X = pattern_expr (arg0, arg1, ..., X)
6037 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6038 sequence that had been detected and replaced by the pattern-stmt (STMT).
6039
6040 This function also handles reduction of condition expressions, for example:
6041 for (int i = 0; i < N; i++)
6042 if (a[i] < value)
6043 last = a[i];
6044 This is handled by vectorising the loop and creating an additional vector
6045 containing the loop indexes for which "a[i] < value" was true. In the
6046 function epilogue this is reduced to a single max value and then used to
6047 index into the vector of results.
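
   As an illustration (the values are only an example): with
   a[] = {9, 1, 8, 2} and value == 5, the vector of recorded indexes
   might end up as {0, 2, 0, 4}, 0 meaning the condition never held in
   that lane; the epilogue reduces this to the maximum, 4, and uses it
   to select last = a[3] = 2, the value from the final iteration for
   which the condition was true.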
6048
6049 In some cases of reduction patterns, the type of the reduction variable X is
6050 different than the type of the other arguments of STMT.
6051 In such cases, the vectype that is used when transforming STMT into a vector
6052 stmt is different than the vectype that is used to determine the
6053 vectorization factor, because it consists of a different number of elements
6054 than the actual number of elements that are being operated upon in parallel.
6055
6056 For example, consider an accumulation of shorts into an int accumulator.
6057 On some targets it's possible to vectorize this pattern operating on 8
6058 shorts at a time (hence, the vectype for purposes of determining the
6059 vectorization factor should be V8HI); on the other hand, the vectype that
6060 is used to create the vector form is actually V4SI (the type of the result).
6061
6062 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6063 indicates what is the actual level of parallelism (V8HI in the example), so
6064 that the right vectorization factor would be derived. This vectype
6065 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6066 be used to create the vectorized stmt. The right vectype for the vectorized
6067 stmt is obtained from the type of the result X:
6068 get_vectype_for_scalar_type (TREE_TYPE (X))
6069
6070 This means that, contrary to "regular" reductions (or "regular" stmts in
6071 general), the following equation:
6072 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6073 does *NOT* necessarily hold for reduction patterns. */
6074
6075 bool
6076 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6077 stmt_vec_info *vec_stmt, slp_tree slp_node,
6078 slp_instance slp_node_instance,
6079 stmt_vector_for_cost *cost_vec)
6080 {
6081 tree vec_dest;
6082 tree scalar_dest;
6083 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6084 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6085 tree vectype_in = NULL_TREE;
6086 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6087 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6088 enum tree_code code, orig_code;
6089 internal_fn reduc_fn;
6090 machine_mode vec_mode;
6091 int op_type;
6092 optab optab;
6093 tree new_temp = NULL_TREE;
6094 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6095 gimple *cond_reduc_def_stmt = NULL;
6096 enum tree_code cond_reduc_op_code = ERROR_MARK;
6097 tree scalar_type;
6098 bool is_simple_use;
6099 int i;
6100 int ncopies;
6101 int epilog_copies;
6102 stmt_vec_info prev_stmt_info, prev_phi_info;
6103 bool single_defuse_cycle = false;
6104 stmt_vec_info new_stmt_info = NULL;
6105 int j;
6106 tree ops[3];
6107 enum vect_def_type dts[3];
6108 bool nested_cycle = false, found_nested_cycle_def = false;
6109 bool double_reduc = false;
6110 basic_block def_bb;
6111 struct loop * def_stmt_loop;
6112 tree def_arg;
6113 auto_vec<tree> vec_oprnds0;
6114 auto_vec<tree> vec_oprnds1;
6115 auto_vec<tree> vec_oprnds2;
6116 auto_vec<tree> vect_defs;
6117 auto_vec<stmt_vec_info> phis;
6118 int vec_num;
6119 tree def0, tem;
6120 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6121 tree cond_reduc_val = NULL_TREE;
6122
6123 /* Make sure it was already recognized as a reduction computation. */
6124 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6125 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6126 return false;
6127
6128 if (nested_in_vect_loop_p (loop, stmt))
6129 {
6130 loop = loop->inner;
6131 nested_cycle = true;
6132 }
6133
6134 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6135 gcc_assert (slp_node
6136 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6137
6138 if (gimple_code (stmt) == GIMPLE_PHI)
6139 {
6140 tree phi_result = gimple_phi_result (stmt);
6141 /* Analysis is fully done on the reduction stmt invocation. */
6142 if (! vec_stmt)
6143 {
6144 if (slp_node)
6145 slp_node_instance->reduc_phis = slp_node;
6146
6147 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6148 return true;
6149 }
6150
6151 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6152 /* Leave the scalar phi in place. Note that checking
6153 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6154 for reductions involving a single statement. */
6155 return true;
6156
6157 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6158 if (STMT_VINFO_IN_PATTERN_P (reduc_stmt_info))
6159 reduc_stmt_info = STMT_VINFO_RELATED_STMT (reduc_stmt_info);
6160
6161 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6162 == EXTRACT_LAST_REDUCTION)
6163 /* Leave the scalar phi in place. */
6164 return true;
6165
6166 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6167 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6168 {
6169 tree op = gimple_op (reduc_stmt, k);
6170 if (op == gimple_phi_result (stmt))
6171 continue;
6172 if (k == 1
6173 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6174 continue;
6175 if (!vectype_in
6176 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6177 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6178 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6179 break;
6180 }
6181 gcc_assert (vectype_in);
6182
6183 if (slp_node)
6184 ncopies = 1;
6185 else
6186 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6187
6188 stmt_vec_info use_stmt_info;
6189 if (ncopies > 1
6190 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6191 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6192 && (use_stmt_info == reduc_stmt_info
6193 || STMT_VINFO_RELATED_STMT (use_stmt_info) == reduc_stmt))
6194 single_defuse_cycle = true;
6195
6196 /* Create the destination vector */
6197 scalar_dest = gimple_assign_lhs (reduc_stmt);
6198 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6199
6200 if (slp_node)
6201 /* The size vect_schedule_slp_instance computes is off for us. */
6202 vec_num = vect_get_num_vectors
6203 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6204 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6205 vectype_in);
6206 else
6207 vec_num = 1;
6208
6209 /* Generate the reduction PHIs upfront. */
6210 prev_phi_info = NULL;
6211 for (j = 0; j < ncopies; j++)
6212 {
6213 if (j == 0 || !single_defuse_cycle)
6214 {
6215 for (i = 0; i < vec_num; i++)
6216 {
6217 /* Create the reduction-phi that defines the reduction
6218 operand. */
6219 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6220 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6221
6222 if (slp_node)
6223 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6224 else
6225 {
6226 if (j == 0)
6227 STMT_VINFO_VEC_STMT (stmt_info)
6228 = *vec_stmt = new_phi_info;
6229 else
6230 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6231 prev_phi_info = new_phi_info;
6232 }
6233 }
6234 }
6235 }
6236
6237 return true;
6238 }
6239
6240 /* 1. Is vectorizable reduction? */
6241 /* Not supportable if the reduction variable is used in the loop, unless
6242 it's a reduction chain. */
6243 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6244 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6245 return false;
6246
6247 /* Reductions that are not used even in an enclosing outer-loop
6248 are expected to be "live" (used out of the loop). */
6249 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6250 && !STMT_VINFO_LIVE_P (stmt_info))
6251 return false;
6252
6253 /* 2. Has this been recognized as a reduction pattern?
6254
6255 Check if STMT represents a pattern that has been recognized
6256 in earlier analysis stages. For stmts that represent a pattern,
6257 the STMT_VINFO_RELATED_STMT field records the last stmt in
6258 the original sequence that constitutes the pattern. */
6259
6260 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6261 if (orig_stmt_info)
6262 {
6263 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6264 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6265 }
6266
6267 /* 3. Check the operands of the operation. The first operands are defined
6268 inside the loop body. The last operand is the reduction variable,
6269 which is defined by the loop-header-phi. */
6270
6271 gcc_assert (is_gimple_assign (stmt));
6272
6273 /* Flatten RHS. */
6274 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6275 {
6276 case GIMPLE_BINARY_RHS:
6277 code = gimple_assign_rhs_code (stmt);
6278 op_type = TREE_CODE_LENGTH (code);
6279 gcc_assert (op_type == binary_op);
6280 ops[0] = gimple_assign_rhs1 (stmt);
6281 ops[1] = gimple_assign_rhs2 (stmt);
6282 break;
6283
6284 case GIMPLE_TERNARY_RHS:
6285 code = gimple_assign_rhs_code (stmt);
6286 op_type = TREE_CODE_LENGTH (code);
6287 gcc_assert (op_type == ternary_op);
6288 ops[0] = gimple_assign_rhs1 (stmt);
6289 ops[1] = gimple_assign_rhs2 (stmt);
6290 ops[2] = gimple_assign_rhs3 (stmt);
6291 break;
6292
6293 case GIMPLE_UNARY_RHS:
6294 return false;
6295
6296 default:
6297 gcc_unreachable ();
6298 }
6299
6300 if (code == COND_EXPR && slp_node)
6301 return false;
6302
6303 scalar_dest = gimple_assign_lhs (stmt);
6304 scalar_type = TREE_TYPE (scalar_dest);
6305 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6306 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6307 return false;
6308
6309 /* Do not try to vectorize bit-precision reductions. */
6310 if (!type_has_mode_precision_p (scalar_type))
6311 return false;
6312
6313 /* All uses but the last are expected to be defined in the loop.
6314 The last use is the reduction variable. In case of nested cycle this
6315 assumption is not true: we use reduc_index to record the index of the
6316 reduction variable. */
6317 stmt_vec_info reduc_def_info = NULL;
6318 int reduc_index = -1;
6319 for (i = 0; i < op_type; i++)
6320 {
6321 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6322 if (i == 0 && code == COND_EXPR)
6323 continue;
6324
6325 stmt_vec_info def_stmt_info;
6326 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6327 &def_stmt_info);
6328 dt = dts[i];
6329 gcc_assert (is_simple_use);
6330 if (dt == vect_reduction_def)
6331 {
6332 reduc_def_info = def_stmt_info;
6333 reduc_index = i;
6334 continue;
6335 }
6336 else if (tem)
6337 {
6338 /* To properly compute ncopies we are interested in the widest
6339 input type in case we're looking at a widening accumulation. */
6340 if (!vectype_in
6341 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6342 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6343 vectype_in = tem;
6344 }
6345
6346 if (dt != vect_internal_def
6347 && dt != vect_external_def
6348 && dt != vect_constant_def
6349 && dt != vect_induction_def
6350 && !(dt == vect_nested_cycle && nested_cycle))
6351 return false;
6352
6353 if (dt == vect_nested_cycle)
6354 {
6355 found_nested_cycle_def = true;
6356 reduc_def_info = def_stmt_info;
6357 reduc_index = i;
6358 }
6359
6360 if (i == 1 && code == COND_EXPR)
6361 {
6362 /* Record how value of COND_EXPR is defined. */
6363 if (dt == vect_constant_def)
6364 {
6365 cond_reduc_dt = dt;
6366 cond_reduc_val = ops[i];
6367 }
6368 if (dt == vect_induction_def
6369 && def_stmt_info
6370 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6371 {
6372 cond_reduc_dt = dt;
6373 cond_reduc_def_stmt = def_stmt_info;
6374 }
6375 }
6376 }
6377
6378 if (!vectype_in)
6379 vectype_in = vectype_out;
6380
6381 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6382 directly used in stmt. */
6383 if (reduc_index == -1)
6384 {
6385 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6386 {
6387 if (dump_enabled_p ())
6388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6389 "in-order reduction chain without SLP.\n");
6390 return false;
6391 }
6392
6393 if (orig_stmt_info)
6394 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6395 else
6396 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6397 }
6398
6399 if (! reduc_def_info)
6400 return false;
6401
6402 gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt);
6403 if (!reduc_def_phi)
6404 return false;
6405
6406 if (!(reduc_index == -1
6407 || dts[reduc_index] == vect_reduction_def
6408 || dts[reduc_index] == vect_nested_cycle
6409 || ((dts[reduc_index] == vect_internal_def
6410 || dts[reduc_index] == vect_external_def
6411 || dts[reduc_index] == vect_constant_def
6412 || dts[reduc_index] == vect_induction_def)
6413 && nested_cycle && found_nested_cycle_def)))
6414 {
6415 /* For pattern recognized stmts, orig_stmt might be a reduction,
6416 but some helper statements for the pattern might not, or
6417 might be COND_EXPRs with reduction uses in the condition. */
6418 gcc_assert (orig_stmt_info);
6419 return false;
6420 }
6421
6422 /* PHIs should not participate in patterns. */
6423 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6424 enum vect_reduction_type v_reduc_type
6425 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6426 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6427
6428 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6429 /* If we have a condition reduction, see if we can simplify it further. */
6430 if (v_reduc_type == COND_REDUCTION)
6431 {
6432 /* TODO: We can't yet handle reduction chains, since we need to treat
6433 each COND_EXPR in the chain specially, not just the last one.
6434 E.g. for:
6435
6436 x_1 = PHI <x_3, ...>
6437 x_2 = a_2 ? ... : x_1;
6438 x_3 = a_3 ? ... : x_2;
6439
6440 we're interested in the last element in x_3 for which a_2 || a_3
6441 is true, whereas the current reduction chain handling would
6442 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6443 as a reduction operation. */
6444 if (reduc_index == -1)
6445 {
6446 if (dump_enabled_p ())
6447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6448 "conditional reduction chains not supported\n");
6449 return false;
6450 }
6451
6452 /* vect_is_simple_reduction ensured that operand 2 is the
6453 loop-carried operand. */
6454 gcc_assert (reduc_index == 2);
6455
6456 /* Loop peeling modifies the initial value of the reduction PHI, which
6457 makes the reduction stmt to be transformed different from the
6458 original stmt analyzed. We need to record the reduction code for
6459 a CONST_COND_REDUCTION type reduction at the analysis stage, so that
6460 it can be used directly at the transform stage. */
6461 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6462 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6463 {
6464 /* Also set the reduction type to CONST_COND_REDUCTION. */
6465 gcc_assert (cond_reduc_dt == vect_constant_def);
6466 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6467 }
6468 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6469 vectype_in, OPTIMIZE_FOR_SPEED))
6470 {
6471 if (dump_enabled_p ())
6472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6473 "optimizing condition reduction with"
6474 " FOLD_EXTRACT_LAST.\n");
6475 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6476 }
6477 else if (cond_reduc_dt == vect_induction_def)
6478 {
6479 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6480 tree base
6481 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6482 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6483
6484 gcc_assert (TREE_CODE (base) == INTEGER_CST
6485 && TREE_CODE (step) == INTEGER_CST);
6486 cond_reduc_val = NULL_TREE;
6487 /* Find a suitable value: below base for MAX_EXPR, above base for
6488 MIN_EXPR; for now punt if base is the minimum value of the type
6489 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
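/* Example (illustrative constants): for base 10 and step -1 we use
   MIN_EXPR with cond_reduc_val 11, i.e. a value strictly above the
   base that the decreasing induction can never produce, so it is safe
   to use as the "condition never held" marker.  */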
6490 if (tree_int_cst_sgn (step) == -1)
6491 {
6492 cond_reduc_op_code = MIN_EXPR;
6493 if (tree_int_cst_sgn (base) == -1)
6494 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6495 else if (tree_int_cst_lt (base,
6496 TYPE_MAX_VALUE (TREE_TYPE (base))))
6497 cond_reduc_val
6498 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6499 }
6500 else
6501 {
6502 cond_reduc_op_code = MAX_EXPR;
6503 if (tree_int_cst_sgn (base) == 1)
6504 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6505 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6506 base))
6507 cond_reduc_val
6508 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6509 }
6510 if (cond_reduc_val)
6511 {
6512 if (dump_enabled_p ())
6513 dump_printf_loc (MSG_NOTE, vect_location,
6514 "condition expression based on "
6515 "integer induction.\n");
6516 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6517 = INTEGER_INDUC_COND_REDUCTION;
6518 }
6519 }
6520 else if (cond_reduc_dt == vect_constant_def)
6521 {
6522 enum vect_def_type cond_initial_dt;
6523 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6524 tree cond_initial_val
6525 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6526
6527 gcc_assert (cond_reduc_val != NULL_TREE);
6528 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6529 if (cond_initial_dt == vect_constant_def
6530 && types_compatible_p (TREE_TYPE (cond_initial_val),
6531 TREE_TYPE (cond_reduc_val)))
6532 {
6533 tree e = fold_binary (LE_EXPR, boolean_type_node,
6534 cond_initial_val, cond_reduc_val);
6535 if (e && (integer_onep (e) || integer_zerop (e)))
6536 {
6537 if (dump_enabled_p ())
6538 dump_printf_loc (MSG_NOTE, vect_location,
6539 "condition expression based on "
6540 "compile time constant.\n");
6541 /* Record reduction code at analysis stage. */
6542 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6543 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6544 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6545 = CONST_COND_REDUCTION;
6546 }
6547 }
6548 }
6549 }
6550
6551 if (orig_stmt_info)
6552 gcc_assert (tmp == orig_stmt_info
6553 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6554 else
6555 /* We changed STMT to be the first stmt in reduction chain, hence we
6556 check that in this case the first element in the chain is STMT. */
6557 gcc_assert (tmp == stmt_info
6558 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6559
6560 if (STMT_VINFO_LIVE_P (reduc_def_info))
6561 return false;
6562
6563 if (slp_node)
6564 ncopies = 1;
6565 else
6566 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6567
6568 gcc_assert (ncopies >= 1);
6569
6570 vec_mode = TYPE_MODE (vectype_in);
6571 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6572
6573 if (code == COND_EXPR)
6574 {
6575 /* Only call during the analysis stage, otherwise we'll lose
6576 STMT_VINFO_TYPE. */
6577 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6578 ops[reduc_index], 0, NULL,
6579 cost_vec))
6580 {
6581 if (dump_enabled_p ())
6582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6583 "unsupported condition in reduction\n");
6584 return false;
6585 }
6586 }
6587 else
6588 {
6589 /* 4. Supportable by target? */
6590
6591 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6592 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6593 {
6594 /* Shifts and rotates are only supported by vectorizable_shift,
6595 not vectorizable_reduction. */
6596 if (dump_enabled_p ())
6597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6598 "unsupported shift or rotation.\n");
6599 return false;
6600 }
6601
6602 /* 4.1. check support for the operation in the loop */
6603 optab = optab_for_tree_code (code, vectype_in, optab_default);
6604 if (!optab)
6605 {
6606 if (dump_enabled_p ())
6607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6608 "no optab.\n");
6609
6610 return false;
6611 }
6612
6613 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6614 {
6615 if (dump_enabled_p ())
6616 dump_printf (MSG_NOTE, "op not supported by target.\n");
6617
6618 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6619 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6620 return false;
6621
6622 if (dump_enabled_p ())
6623 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6624 }
6625
6626 /* Worthwhile without SIMD support? */
6627 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6628 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6629 {
6630 if (dump_enabled_p ())
6631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6632 "not worthwhile without SIMD support.\n");
6633
6634 return false;
6635 }
6636 }
6637
6638 /* 4.2. Check support for the epilog operation.
6639
6640 If STMT represents a reduction pattern, then the type of the
6641 reduction variable may be different than the type of the rest
6642 of the arguments. For example, consider the case of accumulation
6643 of shorts into an int accumulator. The original code:
6644 S1: int_a = (int) short_a;
6645 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6646
6647 was replaced with:
6648 STMT: int_acc = widen_sum <short_a, int_acc>
6649
6650 This means that:
6651 1. The tree-code that is used to create the vector operation in the
6652 epilog code (that reduces the partial results) is not the
6653 tree-code of STMT, but is rather the tree-code of the original
6654 stmt from the pattern that STMT is replacing. I.e, in the example
6655 above we want to use 'widen_sum' in the loop, but 'plus' in the
6656 epilog.
6657 2. The type (mode) we use to check available target support
6658 for the vector operation to be created in the *epilog*, is
6659 determined by the type of the reduction variable (in the example
6660 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6661 However the type (mode) we use to check available target support
6662 for the vector operation to be created *inside the loop*, is
6663 determined by the type of the other arguments to STMT (in the
6664 example we'd check this: optab_handler (widen_sum_optab,
6665 vect_short_mode)).
6666
6667 This is contrary to "regular" reductions, in which the types of all
6668 the arguments are the same as the type of the reduction variable.
6669 For "regular" reductions we can therefore use the same vector type
6670 (and also the same tree-code) when generating the epilog code and
6671 when generating the code inside the loop. */
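 /* Illustrative source-level example of such a pattern (a sketch added
 for exposition, not taken from the original sources):

 short b[N]; int acc = 0;
 for (i = 0; i < N; i++)
 acc += b[i]; /* S1+S2 above, recognized as a widen_sum. */

 Here the loop body is checked using the short-based vector mode while
 the epilog reduction is checked using the int-based vector mode. */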
6672
6673 vect_reduction_type reduction_type
6674 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6675 if (orig_stmt_info
6676 && (reduction_type == TREE_CODE_REDUCTION
6677 || reduction_type == FOLD_LEFT_REDUCTION))
6678 {
6679 /* This is a reduction pattern: get the vectype from the type of the
6680 reduction variable, and get the tree-code from orig_stmt. */
6681 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6682 gcc_assert (vectype_out);
6683 vec_mode = TYPE_MODE (vectype_out);
6684 }
6685 else
6686 {
6687 /* Regular reduction: the same vectype and tree-code that are used for
6688 the vector code inside the loop can also be used for the epilog code. */
6689 orig_code = code;
6690
6691 if (code == MINUS_EXPR)
6692 orig_code = PLUS_EXPR;
6693
6694 /* For simple condition reductions, replace with the actual expression
6695 we want to base our reduction around. */
6696 if (reduction_type == CONST_COND_REDUCTION)
6697 {
6698 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6699 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6700 }
6701 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6702 orig_code = cond_reduc_op_code;
6703 }
6704
6705 if (nested_cycle)
6706 {
6707 def_bb = gimple_bb (reduc_def_phi);
6708 def_stmt_loop = def_bb->loop_father;
6709 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6710 loop_preheader_edge (def_stmt_loop));
6711 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6712 if (def_arg_stmt_info
6713 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6714 == vect_double_reduction_def))
6715 double_reduc = true;
6716 }
6717
6718 reduc_fn = IFN_LAST;
6719
6720 if (reduction_type == TREE_CODE_REDUCTION
6721 || reduction_type == FOLD_LEFT_REDUCTION
6722 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6723 || reduction_type == CONST_COND_REDUCTION)
6724 {
6725 if (reduction_type == FOLD_LEFT_REDUCTION
6726 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6727 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6728 {
6729 if (reduc_fn != IFN_LAST
6730 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6731 OPTIMIZE_FOR_SPEED))
6732 {
6733 if (dump_enabled_p ())
6734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6735 "reduc op not supported by target.\n");
6736
6737 reduc_fn = IFN_LAST;
6738 }
6739 }
6740 else
6741 {
6742 if (!nested_cycle || double_reduc)
6743 {
6744 if (dump_enabled_p ())
6745 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6746 "no reduc code for scalar code.\n");
6747
6748 return false;
6749 }
6750 }
6751 }
6752 else if (reduction_type == COND_REDUCTION)
6753 {
6754 int scalar_precision
6755 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6756 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6757 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6758 nunits_out);
6759
6760 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6761 OPTIMIZE_FOR_SPEED))
6762 reduc_fn = IFN_REDUC_MAX;
6763 }
6764
6765 if (reduction_type != EXTRACT_LAST_REDUCTION
6766 && reduc_fn == IFN_LAST
6767 && !nunits_out.is_constant ())
6768 {
6769 if (dump_enabled_p ())
6770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6771 "missing target support for reduction on"
6772 " variable-length vectors.\n");
6773 return false;
6774 }
6775
6776 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6777 && ncopies > 1)
6778 {
6779 if (dump_enabled_p ())
6780 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6781 "multiple types in double reduction or condition "
6782 "reduction.\n");
6783 return false;
6784 }
6785
6786 /* For SLP reductions, see if there is a neutral value we can use. */
6787 tree neutral_op = NULL_TREE;
6788 if (slp_node)
6789 neutral_op = neutral_op_for_slp_reduction
6790 (slp_node_instance->reduc_phis, code,
6791 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL_STMT_VEC_INFO);
6792
6793 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6794 {
6795 /* We can't support in-order reductions of code such as this:
6796
6797 for (int i = 0; i < n1; ++i)
6798 for (int j = 0; j < n2; ++j)
6799 l += a[j];
6800
6801 since GCC effectively transforms the loop when vectorizing:
6802
6803 for (int i = 0; i < n1 / VF; ++i)
6804 for (int j = 0; j < n2; ++j)
6805 for (int k = 0; k < VF; ++k)
6806 l += a[j];
6807
6808 which is a reassociation of the original operation. */
6809 if (dump_enabled_p ())
6810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6811 "in-order double reduction not supported.\n");
6812
6813 return false;
6814 }
6815
6816 if (reduction_type == FOLD_LEFT_REDUCTION
6817 && slp_node
6818 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6819 {
6820 /* We cannot use in-order reductions in this case because there is
6821 an implicit reassociation of the operations involved. */
6822 if (dump_enabled_p ())
6823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6824 "in-order unchained SLP reductions not supported.\n");
6825 return false;
6826 }
6827
6828 /* For double reductions, and for SLP reductions with a neutral value,
6829 we construct a variable-length initial vector by loading a vector
6830 full of the neutral value and then shift-and-inserting the start
6831 values into the low-numbered elements. */
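 /* Illustrative sketch (added for exposition): for an SLP sum reduction
 with start values {s0, s1} and neutral value 0, the variable-length
 initial vector is built roughly as
 init = VEC_SHL_INSERT (VEC_SHL_INSERT ({0, ..., 0}, s1), s0)
 i.e. { s0, s1, 0, ..., 0 }. */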
6832 if ((double_reduc || neutral_op)
6833 && !nunits_out.is_constant ()
6834 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6835 vectype_out, OPTIMIZE_FOR_SPEED))
6836 {
6837 if (dump_enabled_p ())
6838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6839 "reduction on variable-length vectors requires"
6840 " target support for a vector-shift-and-insert"
6841 " operation.\n");
6842 return false;
6843 }
6844
6845 /* Check extra constraints for variable-length unchained SLP reductions. */
6846 if (STMT_SLP_TYPE (stmt_info)
6847 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6848 && !nunits_out.is_constant ())
6849 {
6850 /* We checked above that we could build the initial vector when
6851 there's a neutral element value. Check here for the case in
6852 which each SLP statement has its own initial value and in which
6853 that value needs to be repeated for every instance of the
6854 statement within the initial vector. */
6855 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6856 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6857 if (!neutral_op
6858 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6859 {
6860 if (dump_enabled_p ())
6861 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6862 "unsupported form of SLP reduction for"
6863 " variable-length vectors: cannot build"
6864 " initial vector.\n");
6865 return false;
6866 }
6867 /* The epilogue code relies on the number of elements being a multiple
6868 of the group size. The duplicate-and-interleave approach to setting
6869 up the initial vector does too. */
6870 if (!multiple_p (nunits_out, group_size))
6871 {
6872 if (dump_enabled_p ())
6873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6874 "unsupported form of SLP reduction for"
6875 " variable-length vectors: the vector size"
6876 " is not a multiple of the number of results.\n");
6877 return false;
6878 }
6879 }
6880
6881 /* In case of widening multiplication by a constant, we update the type
6882 of the constant to be the type of the other operand. We check that the
6883 constant fits the type in the pattern recognition pass. */
6884 if (code == DOT_PROD_EXPR
6885 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6886 {
6887 if (TREE_CODE (ops[0]) == INTEGER_CST)
6888 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6889 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6890 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6891 else
6892 {
6893 if (dump_enabled_p ())
6894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6895 "invalid types in dot-prod\n");
6896
6897 return false;
6898 }
6899 }
6900
6901 if (reduction_type == COND_REDUCTION)
6902 {
6903 widest_int ni;
6904
6905 if (! max_loop_iterations (loop, &ni))
6906 {
6907 if (dump_enabled_p ())
6908 dump_printf_loc (MSG_NOTE, vect_location,
6909 "loop count not known, cannot create cond "
6910 "reduction.\n");
6911 return false;
6912 }
6913 /* Convert backedges to iterations. */
6914 ni += 1;
6915
6916 /* The additional index will be the same type as the condition. Check
6917 that the loop iteration count fits into that type less one (because
6918 we'll use up the zero slot for when there are no matches). */
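 /* For example (illustrative): with an unsigned char index type
 (maximum value 255) the loop may run at most 254 times, since match
 indices start at 1 and index 0 is reserved for "no match". */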
6919 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6920 if (wi::geu_p (ni, wi::to_widest (max_index)))
6921 {
6922 if (dump_enabled_p ())
6923 dump_printf_loc (MSG_NOTE, vect_location,
6924 "loop size is greater than data size.\n");
6925 return false;
6926 }
6927 }
6928
6929 /* In case the vectorization factor (VF) is bigger than the number
6930 of elements that we can fit in a vectype (nunits), we have to generate
6931 more than one vector stmt - i.e. - we need to "unroll" the
6932 vector stmt by a factor VF/nunits. For more details see documentation
6933 in vectorizable_operation. */
6934
6935 /* If the reduction is used in an outer loop we need to generate
6936 VF intermediate results, like so (e.g. for ncopies=2):
6937 r0 = phi (init, r0)
6938 r1 = phi (init, r1)
6939 r0 = x0 + r0;
6940 r1 = x1 + r1;
6941 (i.e. we generate VF results in 2 registers).
6942 In this case we have a separate def-use cycle for each copy, and therefore
6943 for each copy we get the vector def for the reduction variable from the
6944 respective phi node created for this copy.
6945
6946 Otherwise (the reduction is unused in the loop nest), we can combine
6947 together intermediate results, like so (e.g. for ncopies=2):
6948 r = phi (init, r)
6949 r = x0 + r;
6950 r = x1 + r;
6951 (i.e. we generate VF/2 results in a single register).
6952 In this case for each copy we get the vector def for the reduction variable
6953 from the vectorized reduction operation generated in the previous iteration.
6954
6955 This only works when we see both the reduction PHI and its only consumer
6956 in vectorizable_reduction and there are no intermediate stmts
6957 participating. */
6958 stmt_vec_info use_stmt_info;
6959 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6960 if (ncopies > 1
6961 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6962 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6963 && (use_stmt_info == stmt_info
6964 || STMT_VINFO_RELATED_STMT (use_stmt_info) == stmt))
6965 {
6966 single_defuse_cycle = true;
6967 epilog_copies = 1;
6968 }
6969 else
6970 epilog_copies = ncopies;
6971
6972 /* If the reduction stmt is one of the patterns that have a lane-reducing
6973 operation embedded, we cannot handle the case of !single_defuse_cycle. */
6974 if ((ncopies > 1
6975 && ! single_defuse_cycle)
6976 && (code == DOT_PROD_EXPR
6977 || code == WIDEN_SUM_EXPR
6978 || code == SAD_EXPR))
6979 {
6980 if (dump_enabled_p ())
6981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6982 "multi def-use cycle not possible for lane-reducing "
6983 "reduction operation\n");
6984 return false;
6985 }
6986
6987 if (slp_node)
6988 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6989 else
6990 vec_num = 1;
6991
6992 internal_fn cond_fn = get_conditional_internal_fn (code);
6993 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6994
6995 if (!vec_stmt) /* transformation not required. */
6996 {
6997 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6998 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6999 {
7000 if (reduction_type != FOLD_LEFT_REDUCTION
7001 && (cond_fn == IFN_LAST
7002 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7003 OPTIMIZE_FOR_SPEED)))
7004 {
7005 if (dump_enabled_p ())
7006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7007 "can't use a fully-masked loop because no"
7008 " conditional operation is available.\n");
7009 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7010 }
7011 else if (reduc_index == -1)
7012 {
7013 if (dump_enabled_p ())
7014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7015 "can't use a fully-masked loop for chained"
7016 " reductions.\n");
7017 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7018 }
7019 else
7020 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7021 vectype_in);
7022 }
7023 if (dump_enabled_p ()
7024 && reduction_type == FOLD_LEFT_REDUCTION)
7025 dump_printf_loc (MSG_NOTE, vect_location,
7026 "using an in-order (fold-left) reduction.\n");
7027 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7028 return true;
7029 }
7030
7031 /* Transform. */
7032
7033 if (dump_enabled_p ())
7034 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7035
7036 /* FORNOW: Multiple types are not supported for condition. */
7037 if (code == COND_EXPR)
7038 gcc_assert (ncopies == 1);
7039
7040 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7041
7042 if (reduction_type == FOLD_LEFT_REDUCTION)
7043 return vectorize_fold_left_reduction
7044 (stmt, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7045 reduc_fn, ops, vectype_in, reduc_index, masks);
7046
7047 if (reduction_type == EXTRACT_LAST_REDUCTION)
7048 {
7049 gcc_assert (!slp_node);
7050 return vectorizable_condition (stmt, gsi, vec_stmt,
7051 NULL, reduc_index, NULL, NULL);
7052 }
7053
7054 /* Create the destination vector */
7055 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7056
7057 prev_stmt_info = NULL;
7058 prev_phi_info = NULL;
7059 if (!slp_node)
7060 {
7061 vec_oprnds0.create (1);
7062 vec_oprnds1.create (1);
7063 if (op_type == ternary_op)
7064 vec_oprnds2.create (1);
7065 }
7066
7067 phis.create (vec_num);
7068 vect_defs.create (vec_num);
7069 if (!slp_node)
7070 vect_defs.quick_push (NULL_TREE);
7071
7072 if (slp_node)
7073 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7074 else
7075 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7076
7077 for (j = 0; j < ncopies; j++)
7078 {
7079 if (code == COND_EXPR)
7080 {
7081 gcc_assert (!slp_node);
7082 vectorizable_condition (stmt, gsi, vec_stmt,
7083 PHI_RESULT (phis[0]->stmt),
7084 reduc_index, NULL, NULL);
7085 /* Multiple types are not supported for condition. */
7086 break;
7087 }
7088
7089 /* Handle uses. */
7090 if (j == 0)
7091 {
7092 if (slp_node)
7093 {
7094 /* Get vec defs for all the operands except the reduction index,
7095 ensuring the ordering of the ops in the vector is kept. */
7096 auto_vec<tree, 3> slp_ops;
7097 auto_vec<vec<tree>, 3> vec_defs;
7098
7099 slp_ops.quick_push (ops[0]);
7100 slp_ops.quick_push (ops[1]);
7101 if (op_type == ternary_op)
7102 slp_ops.quick_push (ops[2]);
7103
7104 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7105
7106 vec_oprnds0.safe_splice (vec_defs[0]);
7107 vec_defs[0].release ();
7108 vec_oprnds1.safe_splice (vec_defs[1]);
7109 vec_defs[1].release ();
7110 if (op_type == ternary_op)
7111 {
7112 vec_oprnds2.safe_splice (vec_defs[2]);
7113 vec_defs[2].release ();
7114 }
7115 }
7116 else
7117 {
7118 vec_oprnds0.quick_push
7119 (vect_get_vec_def_for_operand (ops[0], stmt));
7120 vec_oprnds1.quick_push
7121 (vect_get_vec_def_for_operand (ops[1], stmt));
7122 if (op_type == ternary_op)
7123 vec_oprnds2.quick_push
7124 (vect_get_vec_def_for_operand (ops[2], stmt));
7125 }
7126 }
7127 else
7128 {
7129 if (!slp_node)
7130 {
7131 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7132
7133 if (single_defuse_cycle && reduc_index == 0)
7134 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7135 else
7136 vec_oprnds0[0]
7137 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7138 if (single_defuse_cycle && reduc_index == 1)
7139 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7140 else
7141 vec_oprnds1[0]
7142 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7143 if (op_type == ternary_op)
7144 {
7145 if (single_defuse_cycle && reduc_index == 2)
7146 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7147 else
7148 vec_oprnds2[0]
7149 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7150 }
7151 }
7152 }
7153
7154 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7155 {
7156 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7157 if (masked_loop_p)
7158 {
7159 /* Make sure that the reduction accumulator is vop[0]. */
7160 if (reduc_index == 1)
7161 {
7162 gcc_assert (commutative_tree_code (code));
7163 std::swap (vop[0], vop[1]);
7164 }
7165 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7166 vectype_in, i * ncopies + j);
7167 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7168 vop[0], vop[1],
7169 vop[0]);
7170 new_temp = make_ssa_name (vec_dest, call);
7171 gimple_call_set_lhs (call, new_temp);
7172 gimple_call_set_nothrow (call, true);
7173 new_stmt_info = vect_finish_stmt_generation (stmt, call, gsi);
7174 }
7175 else
7176 {
7177 if (op_type == ternary_op)
7178 vop[2] = vec_oprnds2[i];
7179
7180 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7181 vop[0], vop[1], vop[2]);
7182 new_temp = make_ssa_name (vec_dest, new_stmt);
7183 gimple_assign_set_lhs (new_stmt, new_temp);
7184 new_stmt_info
7185 = vect_finish_stmt_generation (stmt, new_stmt, gsi);
7186 }
7187
7188 if (slp_node)
7189 {
7190 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7191 vect_defs.quick_push (new_temp);
7192 }
7193 else
7194 vect_defs[0] = new_temp;
7195 }
7196
7197 if (slp_node)
7198 continue;
7199
7200 if (j == 0)
7201 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7202 else
7203 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7204
7205 prev_stmt_info = new_stmt_info;
7206 }
7207
7208 /* Finalize the reduction-phi (set its arguments) and create the
7209 epilog reduction code. */
7210 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7211 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7212
7213 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_phi,
7214 epilog_copies, reduc_fn, phis,
7215 double_reduc, slp_node, slp_node_instance,
7216 cond_reduc_val, cond_reduc_op_code,
7217 neutral_op);
7218
7219 return true;
7220 }
7221
7222 /* Function vect_min_worthwhile_factor.
7223
7224 For a loop where we could vectorize the operation indicated by CODE,
7225 return the minimum vectorization factor that makes it worthwhile
7226 to use generic vectors. */
7227 static unsigned int
7228 vect_min_worthwhile_factor (enum tree_code code)
7229 {
7230 switch (code)
7231 {
7232 case PLUS_EXPR:
7233 case MINUS_EXPR:
7234 case NEGATE_EXPR:
7235 return 4;
7236
7237 case BIT_AND_EXPR:
7238 case BIT_IOR_EXPR:
7239 case BIT_XOR_EXPR:
7240 case BIT_NOT_EXPR:
7241 return 2;
7242
7243 default:
7244 return INT_MAX;
7245 }
7246 }
7247
7248 /* Return true if VINFO indicates we are doing loop vectorization and if
7249 it is worth decomposing CODE operations into scalar operations for
7250 that loop's vectorization factor. */
7251
7252 bool
7253 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7254 {
7255 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7256 unsigned HOST_WIDE_INT value;
7257 return (loop_vinfo
7258 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7259 && value >= vect_min_worthwhile_factor (code));
7260 }
7261
7262 /* Function vectorizable_induction
7263
7264 Check if PHI performs an induction computation that can be vectorized.
7265 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7266 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7267 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7268
7269 bool
7270 vectorizable_induction (gimple *phi,
7271 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7272 stmt_vec_info *vec_stmt, slp_tree slp_node,
7273 stmt_vector_for_cost *cost_vec)
7274 {
7275 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7276 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7277 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7278 unsigned ncopies;
7279 bool nested_in_vect_loop = false;
7280 struct loop *iv_loop;
7281 tree vec_def;
7282 edge pe = loop_preheader_edge (loop);
7283 basic_block new_bb;
7284 tree new_vec, vec_init, vec_step, t;
7285 tree new_name;
7286 gimple *new_stmt;
7287 gphi *induction_phi;
7288 tree induc_def, vec_dest;
7289 tree init_expr, step_expr;
7290 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7291 unsigned i;
7292 tree expr;
7293 gimple_seq stmts;
7294 imm_use_iterator imm_iter;
7295 use_operand_p use_p;
7296 gimple *exit_phi;
7297 edge latch_e;
7298 tree loop_arg;
7299 gimple_stmt_iterator si;
7300 basic_block bb = gimple_bb (phi);
7301
7302 if (gimple_code (phi) != GIMPLE_PHI)
7303 return false;
7304
7305 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7306 return false;
7307
7308 /* Make sure it was recognized as induction computation. */
7309 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7310 return false;
7311
7312 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7313 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7314
7315 if (slp_node)
7316 ncopies = 1;
7317 else
7318 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7319 gcc_assert (ncopies >= 1);
7320
7321 /* FORNOW. These restrictions should be relaxed. */
7322 if (nested_in_vect_loop_p (loop, phi))
7323 {
7324 imm_use_iterator imm_iter;
7325 use_operand_p use_p;
7326 gimple *exit_phi;
7327 edge latch_e;
7328 tree loop_arg;
7329
7330 if (ncopies > 1)
7331 {
7332 if (dump_enabled_p ())
7333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7334 "multiple types in nested loop.\n");
7335 return false;
7336 }
7337
7338 /* FORNOW: outer loop induction with SLP not supported. */
7339 if (STMT_SLP_TYPE (stmt_info))
7340 return false;
7341
7342 exit_phi = NULL;
7343 latch_e = loop_latch_edge (loop->inner);
7344 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7345 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7346 {
7347 gimple *use_stmt = USE_STMT (use_p);
7348 if (is_gimple_debug (use_stmt))
7349 continue;
7350
7351 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7352 {
7353 exit_phi = use_stmt;
7354 break;
7355 }
7356 }
7357 if (exit_phi)
7358 {
7359 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7360 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7361 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7362 {
7363 if (dump_enabled_p ())
7364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7365 "inner-loop induction only used outside "
7366 "of the outer vectorized loop.\n");
7367 return false;
7368 }
7369 }
7370
7371 nested_in_vect_loop = true;
7372 iv_loop = loop->inner;
7373 }
7374 else
7375 iv_loop = loop;
7376 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7377
7378 if (slp_node && !nunits.is_constant ())
7379 {
7380 /* The current SLP code creates the initial value element-by-element. */
7381 if (dump_enabled_p ())
7382 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7383 "SLP induction not supported for variable-length"
7384 " vectors.\n");
7385 return false;
7386 }
7387
7388 if (!vec_stmt) /* transformation not required. */
7389 {
7390 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7391 DUMP_VECT_SCOPE ("vectorizable_induction");
7392 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7393 return true;
7394 }
7395
7396 /* Transform. */
7397
7398 /* Compute a vector variable, initialized with the first VF values of
7399 the induction variable. E.g., for an iv with IV_PHI='X' and
7400 evolution S, for a vector of 4 units, we want to compute:
7401 [X, X + S, X + 2*S, X + 3*S]. */
7402
7403 if (dump_enabled_p ())
7404 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7405
7406 latch_e = loop_latch_edge (iv_loop);
7407 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7408
7409 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7410 gcc_assert (step_expr != NULL_TREE);
7411
7412 pe = loop_preheader_edge (iv_loop);
7413 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7414 loop_preheader_edge (iv_loop));
7415
7416 stmts = NULL;
7417 if (!nested_in_vect_loop)
7418 {
7419 /* Convert the initial value to the desired type. */
7420 tree new_type = TREE_TYPE (vectype);
7421 init_expr = gimple_convert (&stmts, new_type, init_expr);
7422
7423 /* If we are using the loop mask to "peel" for alignment then we need
7424 to adjust the start value here. */
7425 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7426 if (skip_niters != NULL_TREE)
7427 {
7428 if (FLOAT_TYPE_P (vectype))
7429 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7430 skip_niters);
7431 else
7432 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7433 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7434 skip_niters, step_expr);
7435 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7436 init_expr, skip_step);
7437 }
7438 }
7439
7440 /* Convert the step to the desired type. */
7441 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7442
7443 if (stmts)
7444 {
7445 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7446 gcc_assert (!new_bb);
7447 }
7448
7449 /* Find the first insertion point in the BB. */
7450 si = gsi_after_labels (bb);
7451
7452 /* For SLP induction we have to generate several IVs. For example, with
7453 group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7454 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7455 vector [VF*S, VF*S, VF*S, VF*S] for all of them. */
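 /* (Added note: in the group-size-3 example above, with 4-element
 vectors, nivs = lcm (3, 4) / 4 = 3 distinct initial IVs are built;
 any remaining vectors are derived from them by adding a suitable
 multiple of the step, see the re-use code below.) */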
7456 if (slp_node)
7457 {
7458 /* Enforced above. */
7459 unsigned int const_nunits = nunits.to_constant ();
7460
7461 /* Generate [VF*S, VF*S, ... ]. */
7462 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7463 {
7464 expr = build_int_cst (integer_type_node, vf);
7465 expr = fold_convert (TREE_TYPE (step_expr), expr);
7466 }
7467 else
7468 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7469 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7470 expr, step_expr);
7471 if (! CONSTANT_CLASS_P (new_name))
7472 new_name = vect_init_vector (phi, new_name,
7473 TREE_TYPE (step_expr), NULL);
7474 new_vec = build_vector_from_val (vectype, new_name);
7475 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7476
7477 /* Now generate the IVs. */
7478 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7479 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7480 unsigned elts = const_nunits * nvects;
7481 unsigned nivs = least_common_multiple (group_size,
7482 const_nunits) / const_nunits;
7483 gcc_assert (elts % group_size == 0);
7484 tree elt = init_expr;
7485 unsigned ivn;
7486 for (ivn = 0; ivn < nivs; ++ivn)
7487 {
7488 tree_vector_builder elts (vectype, const_nunits, 1);
7489 stmts = NULL;
7490 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7491 {
7492 if (ivn*const_nunits + eltn >= group_size
7493 && (ivn * const_nunits + eltn) % group_size == 0)
7494 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7495 elt, step_expr);
7496 elts.quick_push (elt);
7497 }
7498 vec_init = gimple_build_vector (&stmts, &elts);
7499 if (stmts)
7500 {
7501 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7502 gcc_assert (!new_bb);
7503 }
7504
7505 /* Create the induction-phi that defines the induction-operand. */
7506 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7507 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7508 stmt_vec_info induction_phi_info
7509 = loop_vinfo->add_stmt (induction_phi);
7510 induc_def = PHI_RESULT (induction_phi);
7511
7512 /* Create the iv update inside the loop */
7513 vec_def = make_ssa_name (vec_dest);
7514 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7515 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7516 loop_vinfo->add_stmt (new_stmt);
7517
7518 /* Set the arguments of the phi node: */
7519 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7520 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7521 UNKNOWN_LOCATION);
7522
7523 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7524 }
7525
7526 /* Re-use IVs when we can. */
7527 if (ivn < nvects)
7528 {
7529 unsigned vfp
7530 = least_common_multiple (group_size, const_nunits) / group_size;
7531 /* Generate [VF'*S, VF'*S, ... ]. */
7532 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7533 {
7534 expr = build_int_cst (integer_type_node, vfp);
7535 expr = fold_convert (TREE_TYPE (step_expr), expr);
7536 }
7537 else
7538 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7539 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7540 expr, step_expr);
7541 if (! CONSTANT_CLASS_P (new_name))
7542 new_name = vect_init_vector (phi, new_name,
7543 TREE_TYPE (step_expr), NULL);
7544 new_vec = build_vector_from_val (vectype, new_name);
7545 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7546 for (; ivn < nvects; ++ivn)
7547 {
7548 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7549 tree def;
7550 if (gimple_code (iv) == GIMPLE_PHI)
7551 def = gimple_phi_result (iv);
7552 else
7553 def = gimple_assign_lhs (iv);
7554 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7555 PLUS_EXPR,
7556 def, vec_step);
7557 if (gimple_code (iv) == GIMPLE_PHI)
7558 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7559 else
7560 {
7561 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7562 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7563 }
7564 SLP_TREE_VEC_STMTS (slp_node).quick_push
7565 (loop_vinfo->add_stmt (new_stmt));
7566 }
7567 }
7568
7569 return true;
7570 }
7571
7572 /* Create the vector that holds the initial_value of the induction. */
7573 if (nested_in_vect_loop)
7574 {
7575 /* iv_loop is nested in the loop to be vectorized. init_expr has already
7576 been created during vectorization of previous stmts. We obtain it
7577 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7578 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7579 /* If the initial value is not of proper type, convert it. */
7580 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7581 {
7582 new_stmt
7583 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7584 vect_simple_var,
7585 "vec_iv_"),
7586 VIEW_CONVERT_EXPR,
7587 build1 (VIEW_CONVERT_EXPR, vectype,
7588 vec_init));
7589 vec_init = gimple_assign_lhs (new_stmt);
7590 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7591 new_stmt);
7592 gcc_assert (!new_bb);
7593 loop_vinfo->add_stmt (new_stmt);
7594 }
7595 }
7596 else
7597 {
7598 /* iv_loop is the loop to be vectorized. Create:
7599 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7600 stmts = NULL;
7601 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7602
7603 unsigned HOST_WIDE_INT const_nunits;
7604 if (nunits.is_constant (&const_nunits))
7605 {
7606 tree_vector_builder elts (vectype, const_nunits, 1);
7607 elts.quick_push (new_name);
7608 for (i = 1; i < const_nunits; i++)
7609 {
7610 /* Create: new_name_i = new_name + step_expr */
7611 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7612 new_name, step_expr);
7613 elts.quick_push (new_name);
7614 }
7615 /* Create a vector from [new_name_0, new_name_1, ...,
7616 new_name_nunits-1] */
7617 vec_init = gimple_build_vector (&stmts, &elts);
7618 }
7619 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7620 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7621 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7622 new_name, step_expr);
7623 else
7624 {
7625 /* Build:
7626 [base, base, base, ...]
7627 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7628 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7629 gcc_assert (flag_associative_math);
7630 tree index = build_index_vector (vectype, 0, 1);
7631 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7632 new_name);
7633 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7634 step_expr);
7635 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7636 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7637 vec_init, step_vec);
7638 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7639 vec_init, base_vec);
7640 }
7641
7642 if (stmts)
7643 {
7644 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7645 gcc_assert (!new_bb);
7646 }
7647 }
7648
7649
7650 /* Create the vector that holds the step of the induction. */
7651 if (nested_in_vect_loop)
7652 /* iv_loop is nested in the loop to be vectorized. Generate:
7653 vec_step = [S, S, S, S] */
7654 new_name = step_expr;
7655 else
7656 {
7657 /* iv_loop is the loop to be vectorized. Generate:
7658 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7659 gimple_seq seq = NULL;
7660 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7661 {
7662 expr = build_int_cst (integer_type_node, vf);
7663 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7664 }
7665 else
7666 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7667 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7668 expr, step_expr);
7669 if (seq)
7670 {
7671 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7672 gcc_assert (!new_bb);
7673 }
7674 }
7675
7676 t = unshare_expr (new_name);
7677 gcc_assert (CONSTANT_CLASS_P (new_name)
7678 || TREE_CODE (new_name) == SSA_NAME);
7679 new_vec = build_vector_from_val (vectype, t);
7680 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7681
7682
7683 /* Create the following def-use cycle:
7684 loop prolog:
7685 vec_init = ...
7686 vec_step = ...
7687 loop:
7688 vec_iv = PHI <vec_init, vec_loop>
7689 ...
7690 STMT
7691 ...
7692 vec_loop = vec_iv + vec_step; */
7693
7694 /* Create the induction-phi that defines the induction-operand. */
7695 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7696 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7697 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7698 induc_def = PHI_RESULT (induction_phi);
7699
7700 /* Create the iv update inside the loop */
7701 vec_def = make_ssa_name (vec_dest);
7702 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7703 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7704 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7705
7706 /* Set the arguments of the phi node: */
7707 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7708 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7709 UNKNOWN_LOCATION);
7710
7711 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7712
7713 /* In case the vectorization factor (VF) is bigger than the number
7714 of elements that we can fit in a vectype (nunits), we have to generate
7715 more than one vector stmt - i.e. - we need to "unroll" the
7716 vector stmt by a factor VF/nunits. For more details see documentation
7717 in vectorizable_operation. */
7718
7719 if (ncopies > 1)
7720 {
7721 gimple_seq seq = NULL;
7722 stmt_vec_info prev_stmt_vinfo;
7723 /* FORNOW. This restriction should be relaxed. */
7724 gcc_assert (!nested_in_vect_loop);
7725
7726 /* Create the vector that holds the step of the induction. */
7727 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7728 {
7729 expr = build_int_cst (integer_type_node, nunits);
7730 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7731 }
7732 else
7733 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7734 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7735 expr, step_expr);
7736 if (seq)
7737 {
7738 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7739 gcc_assert (!new_bb);
7740 }
7741
7742 t = unshare_expr (new_name);
7743 gcc_assert (CONSTANT_CLASS_P (new_name)
7744 || TREE_CODE (new_name) == SSA_NAME);
7745 new_vec = build_vector_from_val (vectype, t);
7746 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7747
7748 vec_def = induc_def;
7749 prev_stmt_vinfo = induction_phi_info;
7750 for (i = 1; i < ncopies; i++)
7751 {
7752 /* vec_i = vec_prev + vec_step */
7753 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7754 vec_def, vec_step);
7755 vec_def = make_ssa_name (vec_dest, new_stmt);
7756 gimple_assign_set_lhs (new_stmt, vec_def);
7757
7758 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7759 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7760 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7761 prev_stmt_vinfo = new_stmt_info;
7762 }
7763 }
7764
7765 if (nested_in_vect_loop)
7766 {
7767 /* Find the loop-closed exit-phi of the induction, and record
7768 the final vector of induction results: */
7769 exit_phi = NULL;
7770 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7771 {
7772 gimple *use_stmt = USE_STMT (use_p);
7773 if (is_gimple_debug (use_stmt))
7774 continue;
7775
7776 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7777 {
7778 exit_phi = use_stmt;
7779 break;
7780 }
7781 }
7782 if (exit_phi)
7783 {
7784 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7785 /* FORNOW. Currently not supporting the case that an inner-loop induction
7786 is not used in the outer-loop (i.e. only outside the outer-loop). */
7787 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7788 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7789
7790 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7791 if (dump_enabled_p ())
7792 {
7793 dump_printf_loc (MSG_NOTE, vect_location,
7794 "vector of inductions after inner-loop:");
7795 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7796 }
7797 }
7798 }
7799
7800
7801 if (dump_enabled_p ())
7802 {
7803 dump_printf_loc (MSG_NOTE, vect_location,
7804 "transform induction: created def-use cycle: ");
7805 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7806 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7807 SSA_NAME_DEF_STMT (vec_def), 0);
7808 }
7809
7810 return true;
7811 }
7812
7813 /* Function vectorizable_live_operation.
7814
7815 STMT computes a value that is used outside the loop. Check if
7816 it can be supported. */
7817
7818 bool
7819 vectorizable_live_operation (gimple *stmt,
7820 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7821 slp_tree slp_node, int slp_index,
7822 stmt_vec_info *vec_stmt,
7823 stmt_vector_for_cost *)
7824 {
7825 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7826 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7827 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7828 imm_use_iterator imm_iter;
7829 tree lhs, lhs_type, bitsize, vec_bitsize;
7830 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7831 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7832 int ncopies;
7833 gimple *use_stmt;
7834 auto_vec<tree> vec_oprnds;
7835 int vec_entry = 0;
7836 poly_uint64 vec_index = 0;
7837
7838 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7839
7840 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7841 return false;
7842
7843 /* FORNOW. CHECKME. */
7844 if (nested_in_vect_loop_p (loop, stmt))
7845 return false;
7846
7847 /* If STMT is not relevant and it is a simple assignment and its inputs are
7848 invariant then it can remain in place, unvectorized. The original last
7849 scalar value that it computes will be used. */
7850 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7851 {
7852 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7853 if (dump_enabled_p ())
7854 dump_printf_loc (MSG_NOTE, vect_location,
7855 "statement is simple and uses invariant. Leaving in "
7856 "place.\n");
7857 return true;
7858 }
7859
7860 if (slp_node)
7861 ncopies = 1;
7862 else
7863 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7864
7865 if (slp_node)
7866 {
7867 gcc_assert (slp_index >= 0);
7868
7869 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7870 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7871
7872 /* Get the last occurrence of the scalar index from the concatenation of
7873 all the slp vectors. Calculate which slp vector it is and the index
7874 within. */
7875 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7876
7877 /* Calculate which vector contains the result, and which lane of
7878 that vector we need. */
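 /* Worked example (illustrative): with 2 vectors of 4 lanes, 6 scalar
 stmts and slp_index 1, pos = 2*4 - 6 + 1 = 3, giving vec_entry = 0
 and vec_index = 3. */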
7879 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7880 {
7881 if (dump_enabled_p ())
7882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7883 "Cannot determine which vector holds the"
7884 " final result.\n");
7885 return false;
7886 }
7887 }
7888
7889 if (!vec_stmt)
7890 {
7891 /* No transformation required. */
7892 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7893 {
7894 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7895 OPTIMIZE_FOR_SPEED))
7896 {
7897 if (dump_enabled_p ())
7898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7899 "can't use a fully-masked loop because "
7900 "the target doesn't support extract last "
7901 "reduction.\n");
7902 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7903 }
7904 else if (slp_node)
7905 {
7906 if (dump_enabled_p ())
7907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7908 "can't use a fully-masked loop because an "
7909 "SLP statement is live after the loop.\n");
7910 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7911 }
7912 else if (ncopies > 1)
7913 {
7914 if (dump_enabled_p ())
7915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7916 "can't use a fully-masked loop because"
7917 " ncopies is greater than 1.\n");
7918 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7919 }
7920 else
7921 {
7922 gcc_assert (ncopies == 1 && !slp_node);
7923 vect_record_loop_mask (loop_vinfo,
7924 &LOOP_VINFO_MASKS (loop_vinfo),
7925 1, vectype);
7926 }
7927 }
7928 return true;
7929 }
7930
7931 /* If stmt has a related stmt, then use that for getting the lhs. */
7932 if (is_pattern_stmt_p (stmt_info))
7933 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7934
7935 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7936 : gimple_get_lhs (stmt);
7937 lhs_type = TREE_TYPE (lhs);
7938
7939 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7940 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7941 : TYPE_SIZE (TREE_TYPE (vectype)));
7942 vec_bitsize = TYPE_SIZE (vectype);
7943
7944 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7945 tree vec_lhs, bitstart;
7946 if (slp_node)
7947 {
7948 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7949
7950 /* Get the correct slp vectorized stmt. */
7951 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7952 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7953 vec_lhs = gimple_phi_result (phi);
7954 else
7955 vec_lhs = gimple_get_lhs (vec_stmt);
7956
7957 /* Get entry to use. */
7958 bitstart = bitsize_int (vec_index);
7959 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7960 }
7961 else
7962 {
7963 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7964 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7965 gcc_checking_assert (ncopies == 1
7966 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7967
7968 /* For multiple copies, get the last copy. */
7969 for (int i = 1; i < ncopies; ++i)
7970 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7971 vec_lhs);
7972
7973 /* Get the last lane in the vector. */
7974 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7975 }
7976
7977 gimple_seq stmts = NULL;
7978 tree new_tree;
7979 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7980 {
7981 /* Emit:
7982
7983 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7984
7985 where VEC_LHS is the vectorized live-out result and MASK is
7986 the loop mask for the final iteration. */
7987 gcc_assert (ncopies == 1 && !slp_node);
7988 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7989 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7990 1, vectype, 0);
7991 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7992 scalar_type, mask, vec_lhs);
7993
7994 /* Convert the extracted vector element to the required scalar type. */
7995 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7996 }
7997 else
7998 {
7999 tree bftype = TREE_TYPE (vectype);
8000 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8001 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8002 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8003 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8004 &stmts, true, NULL_TREE);
8005 }
8006
8007 if (stmts)
8008 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8009
8010 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8011 single-arg PHI, just replace all uses of the PHI result. This is necessary
8012 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
8013 use_operand_p use_p;
8014 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8015 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8016 && !is_gimple_debug (use_stmt))
8017 {
8018 if (gimple_code (use_stmt) == GIMPLE_PHI
8019 && gimple_phi_num_args (use_stmt) == 1)
8020 {
8021 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8022 }
8023 else
8024 {
8025 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8026 SET_USE (use_p, new_tree);
8027 }
8028 update_stmt (use_stmt);
8029 }
8030
8031 return true;
8032 }
8033
8034 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8035
8036 static void
8037 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8038 {
8039 ssa_op_iter op_iter;
8040 imm_use_iterator imm_iter;
8041 def_operand_p def_p;
8042 gimple *ustmt;
8043
8044 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8045 {
8046 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8047 {
8048 basic_block bb;
8049
8050 if (!is_gimple_debug (ustmt))
8051 continue;
8052
8053 bb = gimple_bb (ustmt);
8054
8055 if (!flow_bb_inside_loop_p (loop, bb))
8056 {
8057 if (gimple_debug_bind_p (ustmt))
8058 {
8059 if (dump_enabled_p ())
8060 dump_printf_loc (MSG_NOTE, vect_location,
8061 "killing debug use\n");
8062
8063 gimple_debug_bind_reset_value (ustmt);
8064 update_stmt (ustmt);
8065 }
8066 else
8067 gcc_unreachable ();
8068 }
8069 }
8070 }
8071 }
8072
8073 /* Given loop represented by LOOP_VINFO, return true if computation of
8074 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8075 otherwise. */
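 /* For example (illustrative): if the latch runs UINT_MAX times in an
 unsigned 32-bit IV type, NITERSM1 is UINT_MAX but NITERS wraps to 0,
 so this function returns false. */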
8076
8077 static bool
8078 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8079 {
8080 /* Constant case. */
8081 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8082 {
8083 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8084 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8085
8086 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8087 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8088 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8089 return true;
8090 }
8091
8092 widest_int max;
8093 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8094 /* Check the upper bound of loop niters. */
8095 if (get_max_loop_iterations (loop, &max))
8096 {
8097 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8098 signop sgn = TYPE_SIGN (type);
8099 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8100 if (max < type_max)
8101 return true;
8102 }
8103 return false;
8104 }
8105
8106 /* Return a mask type with half the number of elements as TYPE. */
8107
8108 tree
8109 vect_halve_mask_nunits (tree type)
8110 {
8111 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8112 return build_truth_vector_type (nunits, current_vector_size);
8113 }
8114
8115 /* Return a mask type with twice as many elements as TYPE. */
8116
8117 tree
8118 vect_double_mask_nunits (tree type)
8119 {
8120 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8121 return build_truth_vector_type (nunits, current_vector_size);
8122 }
8123
8124 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8125 contain a sequence of NVECTORS masks that each control a vector of type
8126 VECTYPE. */
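 /* Illustrative example (added): with a vectorization factor of 16,
 recording NVECTORS = 2 masks for 8-element vectors gives
 nscalars_per_iter = 2 * 8 / 16 = 1 for that rgroup. */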
8127
8128 void
8129 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8130 unsigned int nvectors, tree vectype)
8131 {
8132 gcc_assert (nvectors != 0);
8133 if (masks->length () < nvectors)
8134 masks->safe_grow_cleared (nvectors);
8135 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8136 /* The number of scalars per iteration and the number of vectors are
8137 both compile-time constants. */
8138 unsigned int nscalars_per_iter
8139 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8140 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8141 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8142 {
8143 rgm->max_nscalars_per_iter = nscalars_per_iter;
8144 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8145 }
8146 }
8147
8148 /* Given a complete set of masks MASKS, extract mask number INDEX
8149 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8150 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8151
8152 See the comment above vec_loop_masks for more details about the mask
8153 arrangement. */
8154
8155 tree
8156 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8157 unsigned int nvectors, tree vectype, unsigned int index)
8158 {
8159 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8160 tree mask_type = rgm->mask_type;
8161
8162 /* Populate the rgroup's mask array, if this is the first time we've
8163 used it. */
8164 if (rgm->masks.is_empty ())
8165 {
8166 rgm->masks.safe_grow_cleared (nvectors);
8167 for (unsigned int i = 0; i < nvectors; ++i)
8168 {
8169 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8170 /* Provide a dummy definition until the real one is available. */
8171 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8172 rgm->masks[i] = mask;
8173 }
8174 }
8175
8176 tree mask = rgm->masks[index];
8177 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8178 TYPE_VECTOR_SUBPARTS (vectype)))
8179 {
8180 /* A loop mask for data type X can be reused for data type Y
8181 if X has N times more elements than Y and if Y's elements
8182 are N times bigger than X's. In this case each sequence
8183 of N elements in the loop mask will be all-zero or all-one.
8184 We can then view-convert the mask so that each sequence of
8185 N elements is replaced by a single element. */
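 /* For example (illustrative): a 16-element mask controlling byte
 elements, whose bits come in all-equal pairs, can be view-converted
 to an 8-element mask controlling 16-bit elements. */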
8186 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8187 TYPE_VECTOR_SUBPARTS (vectype)));
8188 gimple_seq seq = NULL;
8189 mask_type = build_same_sized_truth_vector_type (vectype);
8190 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8191 if (seq)
8192 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8193 }
8194 return mask;
8195 }
8196
8197 /* Scale profiling counters by estimation for LOOP which is vectorized
8198 by factor VF. */
8199
8200 static void
8201 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8202 {
8203 edge preheader = loop_preheader_edge (loop);
8204 /* Reduce loop iterations by the vectorization factor. */
8205 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8206 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8207
8208 if (freq_h.nonzero_p ())
8209 {
8210 profile_probability p;
8211
8212 /* Avoid dropping loop body profile counter to 0 because of zero count
8213 in loop's preheader. */
8214 if (!(freq_e == profile_count::zero ()))
8215 freq_e = freq_e.force_nonzero ();
8216 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8217 scale_loop_frequencies (loop, p);
8218 }
8219
8220 edge exit_e = single_exit (loop);
8221 exit_e->probability = profile_probability::always ()
8222 .apply_scale (1, new_est_niter + 1);
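 /* E.g. (illustrative): if the vectorized loop is expected to iterate
 NEW_EST_NITER = 4 times, the exit edge is given probability
 1 / (4 + 1) = 20%. */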
8223
8224 edge exit_l = single_pred_edge (loop->latch);
8225 profile_probability prob = exit_l->probability;
8226 exit_l->probability = exit_e->probability.invert ();
8227 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8228 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8229 }
8230
8231 /* Vectorize STMT if relevant, inserting any new instructions before GSI.
8232 When vectorizing STMT as a store, set *SEEN_STORE to its stmt_vec_info.
8233 *SLP_SCHEDULED is a running record of whether we have called
8234 vect_schedule_slp. */
8235
8236 static void
8237 vect_transform_loop_stmt (loop_vec_info loop_vinfo, gimple *stmt,
8238 gimple_stmt_iterator *gsi,
8239 stmt_vec_info *seen_store, bool *slp_scheduled)
8240 {
8241 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8242 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8243 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
8244 if (!stmt_info)
8245 return;
8246
8247 if (dump_enabled_p ())
8248 {
8249 dump_printf_loc (MSG_NOTE, vect_location,
8250 "------>vectorizing statement: ");
8251 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8252 }
8253
8254 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8255 vect_loop_kill_debug_uses (loop, stmt);
8256
8257 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8258 && !STMT_VINFO_LIVE_P (stmt_info))
8259 return;
8260
8261 if (STMT_VINFO_VECTYPE (stmt_info))
8262 {
8263 poly_uint64 nunits
8264 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8265 if (!STMT_SLP_TYPE (stmt_info)
8266 && maybe_ne (nunits, vf)
8267 && dump_enabled_p ())
8268 /* For SLP, VF is set according to the unrolling factor, not
8269 to the vector size, hence this diagnostic is not valid for SLP. */
8270 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8271 }
8272
8273 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8274 reached. */
8275 if (slp_vect_type slptype = STMT_SLP_TYPE (stmt_info))
8276 {
8277
8278 if (!*slp_scheduled)
8279 {
8280 *slp_scheduled = true;
8281
8282 DUMP_VECT_SCOPE ("scheduling SLP instances");
8283
8284 vect_schedule_slp (loop_vinfo);
8285 }
8286
8287 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8288 if (slptype == pure_slp)
8289 return;
8290 }
8291
8292 if (dump_enabled_p ())
8293 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8294
8295 bool grouped_store = false;
8296 if (vect_transform_stmt (stmt, gsi, &grouped_store, NULL, NULL))
8297 *seen_store = stmt_info;
8298 }
8299
8300 /* Function vect_transform_loop.
8301
8302 The analysis phase has determined that the loop is vectorizable.
8303 Vectorize the loop - create vectorized stmts to replace the scalar
8304 stmts in the loop, and update the loop exit condition.
8305 Returns scalar epilogue loop if any. */
8306
8307 struct loop *
8308 vect_transform_loop (loop_vec_info loop_vinfo)
8309 {
8310 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8311 struct loop *epilogue = NULL;
8312 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8313 int nbbs = loop->num_nodes;
8314 int i;
8315 tree niters_vector = NULL_TREE;
8316 tree step_vector = NULL_TREE;
8317 tree niters_vector_mult_vf = NULL_TREE;
8318 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8319 unsigned int lowest_vf = constant_lower_bound (vf);
8320 bool slp_scheduled = false;
8321 gimple *stmt;
8322 bool check_profitability = false;
8323 unsigned int th;
8324
8325 DUMP_VECT_SCOPE ("vec_transform_loop");
8326
8327 loop_vinfo->shared->check_datarefs ();
8328
8329 /* Use the more conservative vectorization threshold. If the number
8330 of iterations is constant, assume the cost check has been performed
8331 by our caller. If the threshold makes all loops profitable that
8332 run at least the (estimated) vectorization factor number of times,
8333 checking is pointless, too. */
8334 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8335 if (th >= vect_vf_for_cost (loop_vinfo)
8336 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8337 {
8338 if (dump_enabled_p ())
8339 dump_printf_loc (MSG_NOTE, vect_location,
8340 "Profitability threshold is %d loop iterations.\n",
8341 th);
8342 check_profitability = true;
8343 }
8344
8345 /* Make sure there exists a single-predecessor exit bb. Do this before
8346 versioning. */
8347 edge e = single_exit (loop);
8348 if (! single_pred_p (e->dest))
8349 {
8350 split_loop_exit_edge (e);
8351 if (dump_enabled_p ())
8352 dump_printf (MSG_NOTE, "split exit edge\n");
8353 }
8354
8355 /* Version the loop first, if required, so the profitability check
8356 comes first. */
8357
8358 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8359 {
8360 poly_uint64 versioning_threshold
8361 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8362 if (check_profitability
8363 && ordered_p (poly_uint64 (th), versioning_threshold))
8364 {
8365 versioning_threshold = ordered_max (poly_uint64 (th),
8366 versioning_threshold);
8367 check_profitability = false;
8368 }
8369 vect_loop_versioning (loop_vinfo, th, check_profitability,
8370 versioning_threshold);
8371 check_profitability = false;
8372 }
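/* E.g. with TH == 12 and a versioning threshold of 8, the versioned
   runtime test uses MAX (12, 8) == 12 iterations, and no separate
   profitability check needs to be generated afterwards. */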
8373
8374 /* Make sure there exists a single-predecessor exit bb also on the
8375 scalar loop copy. Do this after versioning but before peeling,
8376 so the CFG structure is fine for both the scalar and the if-converted
8377 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8378 loop-closed PHI nodes on the exit. */
8379 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8380 {
8381 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8382 if (! single_pred_p (e->dest))
8383 {
8384 split_loop_exit_edge (e);
8385 if (dump_enabled_p ())
8386 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8387 }
8388 }
8389
8390 tree niters = vect_build_loop_niters (loop_vinfo);
8391 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8392 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8393 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8394 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8395 &step_vector, &niters_vector_mult_vf, th,
8396 check_profitability, niters_no_overflow);
8397
8398 if (niters_vector == NULL_TREE)
8399 {
8400 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8401 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8402 && known_eq (lowest_vf, vf))
8403 {
8404 niters_vector
8405 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8406 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8407 step_vector = build_one_cst (TREE_TYPE (niters));
8408 }
8409 else
8410 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8411 &step_vector, niters_no_overflow);
8412 }
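/* E.g. for a loop with a known iteration count of 100 and VF == 8 this
   yields NITERS_VECTOR == 100 / 8 == 12 and STEP_VECTOR == 1; the
   remaining 100 % 8 == 4 iterations are handled by the epilogue loop
   created by vect_do_peeling above. */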
8413
8414 /* 1) Make sure the loop header has exactly two entries
8415 2) Make sure we have a preheader basic block. */
8416
8417 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8418
8419 split_edge (loop_preheader_edge (loop));
8420
8421 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8422 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8423 /* This will deal with any possible peeling. */
8424 vect_prepare_for_masked_peels (loop_vinfo);
8425
8426 /* FORNOW: the vectorizer supports only loops whose body consists
8427 of one basic block (header + empty latch). When the vectorizer
8428 supports more involved loop forms, the order in which the BBs are
8429 traversed will need to be reconsidered. */
8430
8431 for (i = 0; i < nbbs; i++)
8432 {
8433 basic_block bb = bbs[i];
8434 stmt_vec_info stmt_info;
8435
8436 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8437 gsi_next (&si))
8438 {
8439 gphi *phi = si.phi ();
8440 if (dump_enabled_p ())
8441 {
8442 dump_printf_loc (MSG_NOTE, vect_location,
8443 "------>vectorizing phi: ");
8444 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8445 }
8446 stmt_info = loop_vinfo->lookup_stmt (phi);
8447 if (!stmt_info)
8448 continue;
8449
8450 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8451 vect_loop_kill_debug_uses (loop, phi);
8452
8453 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8454 && !STMT_VINFO_LIVE_P (stmt_info))
8455 continue;
8456
8457 if (STMT_VINFO_VECTYPE (stmt_info)
8458 && (maybe_ne
8459 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8460 && dump_enabled_p ())
8461 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8462
8463 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8464 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8465 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8466 && ! PURE_SLP_STMT (stmt_info))
8467 {
8468 if (dump_enabled_p ())
8469 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8470 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8471 }
8472 }
8473
8474 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8475 !gsi_end_p (si);)
8476 {
8477 stmt = gsi_stmt (si);
8478 /* During vectorization remove existing clobber stmts. */
8479 if (gimple_clobber_p (stmt))
8480 {
8481 unlink_stmt_vdef (stmt);
8482 gsi_remove (&si, true);
8483 release_defs (stmt);
8484 }
8485 else
8486 {
8487 stmt_info = loop_vinfo->lookup_stmt (stmt);
8488
8489 /* Vector stmts created in the outer loop during vectorization of
8490 stmts in an inner loop may not have a stmt_info and do not
8491 need to be vectorized. */
8492 stmt_vec_info seen_store = NULL;
8493 if (stmt_info)
8494 {
8495 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8496 {
8497 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8498 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8499 !gsi_end_p (subsi); gsi_next (&subsi))
8500 vect_transform_loop_stmt (loop_vinfo,
8501 gsi_stmt (subsi), &si,
8502 &seen_store,
8503 &slp_scheduled);
8504 gimple *pat_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8505 vect_transform_loop_stmt (loop_vinfo, pat_stmt, &si,
8506 &seen_store, &slp_scheduled);
8507 }
8508 vect_transform_loop_stmt (loop_vinfo, stmt, &si,
8509 &seen_store, &slp_scheduled);
8510 }
8511 if (seen_store)
8512 {
8513 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8514 {
8515 /* Interleaving. The vectorization of the
8516 interleaving chain is complete; free all
8517 the scalar stores in the chain. */
8518 gsi_next (&si);
8519 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8520 }
8521 else
8522 {
8523 /* Free the attached stmt_vec_info and remove the
8524 stmt. */
8525 free_stmt_vec_info (stmt);
8526 unlink_stmt_vdef (stmt);
8527 gsi_remove (&si, true);
8528 release_defs (stmt);
8529 }
8530 }
8531 else
8532 gsi_next (&si);
8533 }
8534 }
8535
8536 /* Stub out scalar statements that must not survive vectorization.
8537 Doing this here helps with grouped statements, or statements that
8538 are involved in patterns. */
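/* For instance, a scalar IFN_MASK_LOAD call left over from
   if-conversion cannot be expanded as a scalar instruction, so the
   loop below replaces its (scalar) lhs with zero; the value is dead
   after vectorization anyway. */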
8539 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8540 !gsi_end_p (gsi); gsi_next (&gsi))
8541 {
8542 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8543 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8544 {
8545 tree lhs = gimple_get_lhs (call);
8546 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8547 {
8548 tree zero = build_zero_cst (TREE_TYPE (lhs));
8549 gimple *new_stmt = gimple_build_assign (lhs, zero);
8550 gsi_replace (&gsi, new_stmt, true);
8551 }
8552 }
8553 }
8554 } /* BBs in loop */
8555
8556 /* The vectorization factor is always > 1, so if we use an IV increment
8557 of 1, a zero NITERS becomes a nonzero NITERS_VECTOR. */
8558 if (integer_onep (step_vector))
8559 niters_no_overflow = true;
8560 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8561 niters_vector_mult_vf, !niters_no_overflow);
8562
8563 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8564 scale_profile_for_vect_loop (loop, assumed_vf);
8565
8566 /* True if the final iteration might not handle a full vector's
8567 worth of scalar iterations. */
8568 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8569 /* The minimum number of iterations performed by the epilogue. This
8570 is 1 when peeling for gaps because we always need a final scalar
8571 iteration. */
8572 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8573 /* +1 to convert latch counts to loop iteration counts,
8574 -min_epilogue_iters to remove iterations that cannot be performed
8575 by the vector code. */
8576 int bias_for_lowest = 1 - min_epilogue_iters;
8577 int bias_for_assumed = bias_for_lowest;
8578 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8579 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8580 {
8581 /* When the amount of peeling is known at compile time, the first
8582 iteration will have exactly alignment_npeels active elements.
8583 In the worst case it will have at least one. */
8584 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8585 bias_for_lowest += lowest_vf - min_first_active;
8586 bias_for_assumed += assumed_vf - min_first_active;
8587 }
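/* E.g. for a fully-masked loop with lowest_vf == assumed_vf == 8, no
   peeling for gaps, and a compile-time peeling amount of 3, the first
   vector iteration handles only 3 scalar iterations, so both biases
   become 1 + (8 - 3) == 6. */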
8588 /* In these calculations the "- 1" converts loop iteration counts
8589 back to latch counts. */
8590 if (loop->any_upper_bound)
8591 loop->nb_iterations_upper_bound
8592 = (final_iter_may_be_partial
8593 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8594 lowest_vf) - 1
8595 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8596 lowest_vf) - 1);
8597 if (loop->any_likely_upper_bound)
8598 loop->nb_iterations_likely_upper_bound
8599 = (final_iter_may_be_partial
8600 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8601 + bias_for_lowest, lowest_vf) - 1
8602 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8603 + bias_for_lowest, lowest_vf) - 1);
8604 if (loop->any_estimate)
8605 loop->nb_iterations_estimate
8606 = (final_iter_may_be_partial
8607 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8608 assumed_vf) - 1
8609 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8610 assumed_vf) - 1);
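/* Worked example of the scaling above (not fully masked, no peeling
   for gaps, so bias_for_lowest == 1): with lowest_vf == 4 and a scalar
   latch bound of 11 (at most 12 iterations), the new bound is
   floor ((11 + 1) / 4) - 1 == 2, i.e. at most 3 vector iterations.
   In the fully-masked case the ceiling division is used instead,
   because the final, partial vector iteration still executes. */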
8611
8612 if (dump_enabled_p ())
8613 {
8614 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8615 {
8616 dump_printf_loc (MSG_NOTE, vect_location,
8617 "LOOP VECTORIZED\n");
8618 if (loop->inner)
8619 dump_printf_loc (MSG_NOTE, vect_location,
8620 "OUTER LOOP VECTORIZED\n");
8621 dump_printf (MSG_NOTE, "\n");
8622 }
8623 else
8624 {
8625 dump_printf_loc (MSG_NOTE, vect_location,
8626 "LOOP EPILOGUE VECTORIZED (VS=");
8627 dump_dec (MSG_NOTE, current_vector_size);
8628 dump_printf (MSG_NOTE, ")\n");
8629 }
8630 }
8631
8632 /* Free SLP instances here because otherwise stmt reference counting
8633 won't work. */
8634 slp_instance instance;
8635 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8636 vect_free_slp_instance (instance, true);
8637 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8638 /* Clear the safelen field, since its value is invalid after vectorization:
8639 the vectorized loop can have loop-carried dependencies. */
8640 loop->safelen = 0;
8641
8642 /* Don't vectorize the epilogue of an epilogue loop. */
8643 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8644 epilogue = NULL;
8645
8646 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8647 epilogue = NULL;
8648
8649 if (epilogue)
8650 {
8651 auto_vector_sizes vector_sizes;
8652 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8653 unsigned int next_size = 0;
8654
8655 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8656 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8657 && known_eq (vf, lowest_vf))
8658 {
8659 unsigned int eiters
8660 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8661 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8662 eiters = eiters % lowest_vf;
8663 epilogue->nb_iterations_upper_bound = eiters - 1;
8664
8665 unsigned int ratio;
8666 while (next_size < vector_sizes.length ()
8667 && !(constant_multiple_p (current_vector_size,
8668 vector_sizes[next_size], &ratio)
8669 && eiters >= lowest_vf / ratio))
8670 next_size += 1;
8671 }
8672 else
8673 while (next_size < vector_sizes.length ()
8674 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8675 next_size += 1;
8676
8677 if (next_size == vector_sizes.length ())
8678 epilogue = NULL;
8679 }
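/* Illustrative example (hypothetical target sizes): with
   current_vector_size == 32 bytes, vector_sizes == { 32, 16, 8 },
   lowest_vf == 8 and eiters == 5, size 32 is rejected (ratio 1 would
   need eiters >= 8), but size 16 is accepted (ratio 2 needs
   eiters >= 4), so the epilogue is considered for vectorization with
   16-byte vectors. If no size qualifies, EPILOGUE is reset to NULL
   and the epilogue loop stays scalar. */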
8680
8681 if (epilogue)
8682 {
8683 epilogue->force_vectorize = loop->force_vectorize;
8684 epilogue->safelen = loop->safelen;
8685 epilogue->dont_vectorize = false;
8686
8687 /* We may need to if-convert the epilogue to vectorize it. */
8688 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8689 tree_if_conversion (epilogue);
8690 }
8691
8692 return epilogue;
8693 }
8694
8695 /* The code below performs a simple optimization: revert if-conversion
8696 for masked stores, i.e. if the mask of a store is all zeros, skip the
8697 store and, if possible, the producers of the stored values as well.
8698 For example,
8699 for (i=0; i<n; i++)
8700 if (c[i])
8701 {
8702 p1[i] += 1;
8703 p2[i] = p3[i] + 2;
8704 }
8705 this transformation will produce the following semi-hammock:
8706
8707 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8708 {
8709 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8710 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8711 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8712 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8713 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8714 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8715 }
8716 */
8717
8718 void
8719 optimize_mask_stores (struct loop *loop)
8720 {
8721 basic_block *bbs = get_loop_body (loop);
8722 unsigned nbbs = loop->num_nodes;
8723 unsigned i;
8724 basic_block bb;
8725 struct loop *bb_loop;
8726 gimple_stmt_iterator gsi;
8727 gimple *stmt;
8728 auto_vec<gimple *> worklist;
8729
8730 vect_location = find_loop_location (loop);
8731 /* Pick up all masked stores in loop if any. */
8732 for (i = 0; i < nbbs; i++)
8733 {
8734 bb = bbs[i];
8735 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8736 gsi_next (&gsi))
8737 {
8738 stmt = gsi_stmt (gsi);
8739 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8740 worklist.safe_push (stmt);
8741 }
8742 }
8743
8744 free (bbs);
8745 if (worklist.is_empty ())
8746 return;
8747
8748 /* Loop has masked stores. */
8749 while (!worklist.is_empty ())
8750 {
8751 gimple *last, *last_store;
8752 edge e, efalse;
8753 tree mask;
8754 basic_block store_bb, join_bb;
8755 gimple_stmt_iterator gsi_to;
8756 tree vdef, new_vdef;
8757 gphi *phi;
8758 tree vectype;
8759 tree zero;
8760
8761 last = worklist.pop ();
8762 mask = gimple_call_arg (last, 2);
8763 bb = gimple_bb (last);
8764 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
8765 to the same loop as if_bb. This loop could be different from LOOP when
8766 a two-level loop nest is vectorized and the mask_store belongs to the
8767 inner loop. */
8768 e = split_block (bb, last);
8769 bb_loop = bb->loop_father;
8770 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8771 join_bb = e->dest;
8772 store_bb = create_empty_bb (bb);
8773 add_bb_to_loop (store_bb, bb_loop);
8774 e->flags = EDGE_TRUE_VALUE;
8775 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8776 /* The false edge into STORE_BB is marked as unlikely. */
8777 efalse->probability = profile_probability::unlikely ();
8778 store_bb->count = efalse->count ();
8779 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8780 if (dom_info_available_p (CDI_DOMINATORS))
8781 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8782 if (dump_enabled_p ())
8783 dump_printf_loc (MSG_NOTE, vect_location,
8784 "Create new block %d to sink mask stores.",
8785 store_bb->index);
8786 /* Create vector comparison with boolean result. */
8787 vectype = TREE_TYPE (mask);
8788 zero = build_zero_cst (vectype);
8789 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8790 gsi = gsi_last_bb (bb);
8791 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
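/* At this point the CFG around the masked stores looks like this
   (a sketch; edge probabilities as set above):

	      bb  (ends with: if (mask == { 0, ... }))
	      |  \
	true  |   \  false (unlikely)
	      |    store_bb
	      |   /
	      join_bb

   The masked stores and, where possible, their value producers are
   then sunk into STORE_BB by the loop below. */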
8792 /* Create new PHI node for vdef of the last masked store:
8793 .MEM_2 = VDEF <.MEM_1>
8794 will be converted to
8795 .MEM.3 = VDEF <.MEM_1>
8796 and new PHI node will be created in join bb
8797 .MEM_2 = PHI <.MEM_1, .MEM_3>
8798 */
8799 vdef = gimple_vdef (last);
8800 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8801 gimple_set_vdef (last, new_vdef);
8802 phi = create_phi_node (vdef, join_bb);
8803 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8804
8805 /* Put all masked stores with the same mask into STORE_BB if possible. */
8806 while (true)
8807 {
8808 gimple_stmt_iterator gsi_from;
8809 gimple *stmt1 = NULL;
8810
8811 /* Move masked store to STORE_BB. */
8812 last_store = last;
8813 gsi = gsi_for_stmt (last);
8814 gsi_from = gsi;
8815 /* Shift GSI to the previous stmt for further traversal. */
8816 gsi_prev (&gsi);
8817 gsi_to = gsi_start_bb (store_bb);
8818 gsi_move_before (&gsi_from, &gsi_to);
8819 /* Set GSI_TO to the start of the now non-empty STORE_BB. */
8820 gsi_to = gsi_start_bb (store_bb);
8821 if (dump_enabled_p ())
8822 {
8823 dump_printf_loc (MSG_NOTE, vect_location,
8824 "Move stmt to created bb\n");
8825 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8826 }
8827 /* Move all stored value producers if possible. */
8828 while (!gsi_end_p (gsi))
8829 {
8830 tree lhs;
8831 imm_use_iterator imm_iter;
8832 use_operand_p use_p;
8833 bool res;
8834
8835 /* Skip debug statements. */
8836 if (is_gimple_debug (gsi_stmt (gsi)))
8837 {
8838 gsi_prev (&gsi);
8839 continue;
8840 }
8841 stmt1 = gsi_stmt (gsi);
8842 /* Do not consider statements writing to memory or having a
8843 volatile operand. */
8844 if (gimple_vdef (stmt1)
8845 || gimple_has_volatile_ops (stmt1))
8846 break;
8847 gsi_from = gsi;
8848 gsi_prev (&gsi);
8849 lhs = gimple_get_lhs (stmt1);
8850 if (!lhs)
8851 break;
8852
8853 /* LHS of vectorized stmt must be SSA_NAME. */
8854 if (TREE_CODE (lhs) != SSA_NAME)
8855 break;
8856
8857 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8858 {
8859 /* Remove dead scalar statement. */
8860 if (has_zero_uses (lhs))
8861 {
8862 gsi_remove (&gsi_from, true);
8863 continue;
8864 }
8865 }
8866
8867 /* Check that LHS does not have uses outside of STORE_BB. */
8868 res = true;
8869 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8870 {
8871 gimple *use_stmt;
8872 use_stmt = USE_STMT (use_p);
8873 if (is_gimple_debug (use_stmt))
8874 continue;
8875 if (gimple_bb (use_stmt) != store_bb)
8876 {
8877 res = false;
8878 break;
8879 }
8880 }
8881 if (!res)
8882 break;
8883
8884 if (gimple_vuse (stmt1)
8885 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8886 break;
8887
8888 /* Can move STMT1 to STORE_BB. */
8889 if (dump_enabled_p ())
8890 {
8891 dump_printf_loc (MSG_NOTE, vect_location,
8892 "Move stmt to created bb\n");
8893 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8894 }
8895 gsi_move_before (&gsi_from, &gsi_to);
8896 /* Shift GSI_TO for further insertion. */
8897 gsi_prev (&gsi_to);
8898 }
8899 /* Put other masked stores with the same mask to STORE_BB. */
8900 if (worklist.is_empty ()
8901 || gimple_call_arg (worklist.last (), 2) != mask
8902 || worklist.last () != stmt1)
8903 break;
8904 last = worklist.pop ();
8905 }
8906 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8907 }
8908 }