gcc/tree-vect-slp.cc

   1 /* SLP - Basic Block Vectorization
   2    Copyright (C) 2007-2024 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #define INCLUDE_ALGORITHM
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "insn-config.h"
  35 #include "recog.h"              /* FIXME: for insn_data */
  36 #include "fold-const.h"
  37 #include "stor-layout.h"
  38 #include "gimple-iterator.h"
  39 #include "cfgloop.h"
  40 #include "tree-vectorizer.h"
  41 #include "langhooks.h"
  42 #include "gimple-walk.h"
  43 #include "dbgcnt.h"
  44 #include "tree-vector-builder.h"
  45 #include "vec-perm-indices.h"
  46 #include "gimple-fold.h"
  47 #include "internal-fn.h"
  48 #include "dump-context.h"
  49 #include "cfganal.h"
  50 #include "tree-eh.h"
  51 #include "tree-cfg.h"
  52 #include "alloc-pool.h"
  53 #include "sreal.h"
  54 #include "predict.h"
  55
  56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
  57                                             load_permutation_t &,
  58                                             const vec<tree> &,
  59                                             gimple_stmt_iterator *,
  60                                             poly_uint64, bool, bool,
  61                                             unsigned *,
  62                                             unsigned * = nullptr,
  63                                             bool = false);
  64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
  65                                            slp_tree, lane_permutation_t &,
  66                                            vec<slp_tree> &, bool);
  67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
  68                                           slp_tree, stmt_vector_for_cost *);
  69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
  70
  71 static object_allocator<_slp_tree> *slp_tree_pool;
  72 static slp_tree slp_first_node;
  73
  74 void
  75 vect_slp_init (void)
  76 {
  77   slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
  78 }
  79
  80 void
  81 vect_slp_fini (void)
  82 {
  83   while (slp_first_node)
  84     delete slp_first_node;
  85   delete slp_tree_pool;
  86   slp_tree_pool = NULL;
  87 }
  88
  89 void *
  90 _slp_tree::operator new (size_t n)
  91 {
  92   gcc_assert (n == sizeof (_slp_tree));
  93   return slp_tree_pool->allocate_raw ();
  94 }
  95
  96 void
  97 _slp_tree::operator delete (void *node, size_t n)
  98 {
  99   gcc_assert (n == sizeof (_slp_tree));
 100   slp_tree_pool->remove_raw (node);
 101 }
 102
 103
 104 /* Initialize a SLP node.  */
 105
 106 _slp_tree::_slp_tree ()
 107 {
 108   this->prev_node = NULL;
 109   if (slp_first_node)
 110     slp_first_node->prev_node = this;
 111   this->next_node = slp_first_node;
 112   slp_first_node = this;
 113   SLP_TREE_SCALAR_STMTS (this) = vNULL;
 114   SLP_TREE_SCALAR_OPS (this) = vNULL;
 115   SLP_TREE_VEC_DEFS (this) = vNULL;
 116   SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
 117   SLP_TREE_CHILDREN (this) = vNULL;
 118   SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
 119   SLP_TREE_LANE_PERMUTATION (this) = vNULL;
 120   SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
 121   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
 122   SLP_TREE_CODE (this) = ERROR_MARK;
 123   SLP_TREE_VECTYPE (this) = NULL_TREE;
 124   SLP_TREE_REPRESENTATIVE (this) = NULL;
 125   SLP_TREE_REF_COUNT (this) = 1;
 126   this->failed = NULL;
 127   this->max_nunits = 1;
 128   this->lanes = 0;
 129 }
 130
 131 /* Tear down a SLP node.  */
 132
 133 _slp_tree::~_slp_tree ()
 134 {
 135   if (this->prev_node)
 136     this->prev_node->next_node = this->next_node;
 137   else
 138     slp_first_node = this->next_node;
 139   if (this->next_node)
 140     this->next_node->prev_node = this->prev_node;
 141   SLP_TREE_CHILDREN (this).release ();
 142   SLP_TREE_SCALAR_STMTS (this).release ();
 143   SLP_TREE_SCALAR_OPS (this).release ();
 144   SLP_TREE_VEC_DEFS (this).release ();
 145   SLP_TREE_LOAD_PERMUTATION (this).release ();
 146   SLP_TREE_LANE_PERMUTATION (this).release ();
 147   SLP_TREE_SIMD_CLONE_INFO (this).release ();
 148   if (this->failed)
 149     free (failed);
 150 }
 151
 152 /* Push the single SSA definition in DEF to the vector of vector defs.  */
 153
 154 void
 155 _slp_tree::push_vec_def (gimple *def)
 156 {
 157   if (gphi *phi = dyn_cast <gphi *> (def))
 158     vec_defs.quick_push (gimple_phi_result (phi));
 159   else
 160     {
 161       def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
 162       vec_defs.quick_push (get_def_from_ptr (defop));
 163     }
 164 }
 165
 166 /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
 167
 168 void
 169 vect_free_slp_tree (slp_tree node)
 170 {
 171   int i;
 172   slp_tree child;
 173
 174   if (--SLP_TREE_REF_COUNT (node) != 0)
 175     return;
 176
 177   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
 178     if (child)
 179       vect_free_slp_tree (child);
 180
 181   /* If the node defines any SLP only patterns then those patterns are no
 182      longer valid and should be removed.  */
 183   stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
 184   if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
 185     {
 186       stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
 187       STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
 188       STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
 189     }
 190
 191   delete node;
 192 }
 193
 194 /* Return a location suitable for dumpings related to the SLP instance.  */
 195
 196 dump_user_location_t
 197 _slp_instance::location () const
 198 {
 199   if (!root_stmts.is_empty ())
 200     return root_stmts[0]->stmt;
 201   else
 202     return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
 203 }
 204
 205
 206 /* Free the memory allocated for the SLP instance.  */
 207
 208 void
 209 vect_free_slp_instance (slp_instance instance)
 210 {
 211   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
 212   SLP_INSTANCE_LOADS (instance).release ();
 213   SLP_INSTANCE_ROOT_STMTS (instance).release ();
 214   SLP_INSTANCE_REMAIN_DEFS (instance).release ();
 215   instance->subgraph_entries.release ();
 216   instance->cost_vec.release ();
 217   free (instance);
 218 }
 219
 220
 221 /* Create an SLP node for SCALAR_STMTS.  */
 222
 223 slp_tree
 224 vect_create_new_slp_node (unsigned nops, tree_code code)
 225 {
 226   slp_tree node = new _slp_tree;
 227   SLP_TREE_SCALAR_STMTS (node) = vNULL;
 228   SLP_TREE_CHILDREN (node).create (nops);
 229   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
 230   SLP_TREE_CODE (node) = code;
 231   return node;
 232 }
 233 /* Create an SLP node for SCALAR_STMTS.  */
 234
 235 static slp_tree
 236 vect_create_new_slp_node (slp_tree node,
 237                           vec<stmt_vec_info> scalar_stmts, unsigned nops)
 238 {
 239   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
 240   SLP_TREE_CHILDREN (node).create (nops);
 241   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
 242   SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
 243   SLP_TREE_LANES (node) = scalar_stmts.length ();
 244   return node;
 245 }
 246
 247 /* Create an SLP node for SCALAR_STMTS.  */
 248
 249 static slp_tree
 250 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
 251 {
 252   return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
 253 }
 254
 255 /* Create an SLP node for OPS.  */
 256
 257 static slp_tree
 258 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
 259 {
 260   SLP_TREE_SCALAR_OPS (node) = ops;
 261   SLP_TREE_DEF_TYPE (node) = vect_external_def;
 262   SLP_TREE_LANES (node) = ops.length ();
 263   return node;
 264 }
 265
 266 /* Create an SLP node for OPS.  */
 267
 268 static slp_tree
 269 vect_create_new_slp_node (vec<tree> ops)
 270 {
 271   return vect_create_new_slp_node (new _slp_tree, ops);
 272 }
 273
 274
 275 /* This structure is used in creation of an SLP tree.  Each instance
 276    corresponds to the same operand in a group of scalar stmts in an SLP
 277    node.  */
 278 typedef struct _slp_oprnd_info
 279 {
 280   /* Def-stmts for the operands.  */
 281   vec<stmt_vec_info> def_stmts;
 282   /* Operands.  */
 283   vec<tree> ops;
 284   /* Information about the first statement, its vector def-type, type, the
 285      operand itself in case it's constant, and an indication if it's a pattern
 286      stmt and gather/scatter info.  */
 287   tree first_op_type;
 288   enum vect_def_type first_dt;
 289   bool any_pattern;
 290   bool first_gs_p;
 291   gather_scatter_info first_gs_info;
 292 } *slp_oprnd_info;
 293
 294
 295 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
 296    operand.  */
 297 static vec<slp_oprnd_info>
 298 vect_create_oprnd_info (int nops, int group_size)
 299 {
 300   int i;
 301   slp_oprnd_info oprnd_info;
 302   vec<slp_oprnd_info> oprnds_info;
 303
 304   oprnds_info.create (nops);
 305   for (i = 0; i < nops; i++)
 306     {
 307       oprnd_info = XNEW (struct _slp_oprnd_info);
 308       oprnd_info->def_stmts.create (group_size);
 309       oprnd_info->ops.create (group_size);
 310       oprnd_info->first_dt = vect_uninitialized_def;
 311       oprnd_info->first_op_type = NULL_TREE;
 312       oprnd_info->any_pattern = false;
 313       oprnd_info->first_gs_p = false;
 314       oprnds_info.quick_push (oprnd_info);
 315     }
 316
 317   return oprnds_info;
 318 }
 319
 320
 321 /* Free operands info.  */
 322
 323 static void
 324 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
 325 {
 326   int i;
 327   slp_oprnd_info oprnd_info;
 328
 329   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
 330     {
 331       oprnd_info->def_stmts.release ();
 332       oprnd_info->ops.release ();
 333       XDELETE (oprnd_info);
 334     }
 335
 336   oprnds_info.release ();
 337 }
 338
 339 /* Return the execution frequency of NODE (so that a higher value indicates
 340    a "more important" node when optimizing for speed).  */
 341
 342 static sreal
 343 vect_slp_node_weight (slp_tree node)
 344 {
 345   stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
 346   basic_block bb = gimple_bb (stmt_info->stmt);
 347   return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
 348 }
 349
 350 /* Return true if STMTS contains a pattern statement.  */
 351
 352 static bool
 353 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
 354 {
 355   stmt_vec_info stmt_info;
 356   unsigned int i;
 357   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
 358     if (is_pattern_stmt_p (stmt_info))
 359       return true;
 360   return false;
 361 }
 362
 363 /* Return true when all lanes in the external or constant NODE have
 364    the same value.  */
 365
 366 static bool
 367 vect_slp_tree_uniform_p (slp_tree node)
 368 {
 369   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
 370               || SLP_TREE_DEF_TYPE (node) == vect_external_def);
 371
 372   /* Pre-exsting vectors.  */
 373   if (SLP_TREE_SCALAR_OPS (node).is_empty ())
 374     return false;
 375
 376   unsigned i;
 377   tree op, first = NULL_TREE;
 378   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
 379     if (!first)
 380       first = op;
 381     else if (!operand_equal_p (first, op, 0))
 382       return false;
 383
 384   return true;
 385 }
 386
 387 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
 388    that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
 389    of the chain.  */
 390
 391 int
 392 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
 393                                       stmt_vec_info first_stmt_info)
 394 {
 395   stmt_vec_info next_stmt_info = first_stmt_info;
 396   int result = 0;
 397
 398   if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
 399     return -1;
 400
 401   do
 402     {
 403       if (next_stmt_info == stmt_info)
 404         return result;
 405       next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
 406       if (next_stmt_info)
 407         result += DR_GROUP_GAP (next_stmt_info);
 408     }
 409   while (next_stmt_info);
 410
 411   return -1;
 412 }
 413
 414 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
 415    using the method implemented by duplicate_and_interleave.  Return true
 416    if so, returning the number of intermediate vectors in *NVECTORS_OUT
 417    (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
 418    (if nonnull).  */
 419
 420 bool
 421 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
 422                                 tree elt_type, unsigned int *nvectors_out,
 423                                 tree *vector_type_out,
 424                                 tree *permutes)
 425 {
 426   tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
 427   if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
 428     return false;
 429
 430   machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
 431   poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
 432   unsigned int nvectors = 1;
 433   for (;;)
 434     {
 435       scalar_int_mode int_mode;
 436       poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
 437       if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
 438         {
 439           /* Get the natural vector type for this SLP group size.  */
 440           tree int_type = build_nonstandard_integer_type
 441             (GET_MODE_BITSIZE (int_mode), 1);
 442           tree vector_type
 443             = get_vectype_for_scalar_type (vinfo, int_type, count);
 444           poly_int64 half_nelts;
 445           if (vector_type
 446               && VECTOR_MODE_P (TYPE_MODE (vector_type))
 447               && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
 448                            GET_MODE_SIZE (base_vector_mode))
 449               && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
 450                              2, &half_nelts))
 451             {
 452               /* Try fusing consecutive sequences of COUNT / NVECTORS elements
 453                  together into elements of type INT_TYPE and using the result
 454                  to build NVECTORS vectors.  */
 455               poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
 456               vec_perm_builder sel1 (nelts, 2, 3);
 457               vec_perm_builder sel2 (nelts, 2, 3);
 458
 459               for (unsigned int i = 0; i < 3; ++i)
 460                 {
 461                   sel1.quick_push (i);
 462                   sel1.quick_push (i + nelts);
 463                   sel2.quick_push (half_nelts + i);
 464                   sel2.quick_push (half_nelts + i + nelts);
 465                 }
 466               vec_perm_indices indices1 (sel1, 2, nelts);
 467               vec_perm_indices indices2 (sel2, 2, nelts);
 468               machine_mode vmode = TYPE_MODE (vector_type);
 469               if (can_vec_perm_const_p (vmode, vmode, indices1)
 470                   && can_vec_perm_const_p (vmode, vmode, indices2))
 471                 {
 472                   if (nvectors_out)
 473                     *nvectors_out = nvectors;
 474                   if (vector_type_out)
 475                     *vector_type_out = vector_type;
 476                   if (permutes)
 477                     {
 478                       permutes[0] = vect_gen_perm_mask_checked (vector_type,
 479                                                                 indices1);
 480                       permutes[1] = vect_gen_perm_mask_checked (vector_type,
 481                                                                 indices2);
 482                     }
 483                   return true;
 484                 }
 485             }
 486         }
 487       if (!multiple_p (elt_bytes, 2, &elt_bytes))
 488         return false;
 489       nvectors *= 2;
 490     }
 491 }
 492
 493 /* Return true if DTA and DTB match.  */
 494
 495 static bool
 496 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
 497 {
 498   return (dta == dtb
 499           || ((dta == vect_external_def || dta == vect_constant_def)
 500               && (dtb == vect_external_def || dtb == vect_constant_def)));
 501 }
 502
 503 static const int cond_expr_maps[3][5] = {
 504   { 4, -1, -2, 1, 2 },
 505   { 4, -2, -1, 1, 2 },
 506   { 4, -1, -2, 2, 1 }
 507 };
 508 static const int arg0_map[] = { 1, 0 };
 509 static const int arg1_map[] = { 1, 1 };
 510 static const int arg2_map[] = { 1, 2 };
 511 static const int arg1_arg4_map[] = { 2, 1, 4 };
 512 static const int arg3_arg2_map[] = { 2, 3, 2 };
 513 static const int op1_op0_map[] = { 2, 1, 0 };
 514 static const int off_map[] = { 1, -3 };
 515 static const int off_op0_map[] = { 2, -3, 0 };
 516 static const int off_arg2_map[] = { 2, -3, 2 };
 517 static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
 518 static const int mask_call_maps[6][7] = {
 519   { 1, 1, },
 520   { 2, 1, 2, },
 521   { 3, 1, 2, 3, },
 522   { 4, 1, 2, 3, 4, },
 523   { 5, 1, 2, 3, 4, 5, },
 524   { 6, 1, 2, 3, 4, 5, 6 },
 525 };
 526
 527 /* For most SLP statements, there is a one-to-one mapping between
 528    gimple arguments and child nodes.  If that is not true for STMT,
 529    return an array that contains:
 530
 531    - the number of child nodes, followed by
 532    - for each child node, the index of the argument associated with that node.
 533      The special index -1 is the first operand of an embedded comparison and
 534      the special index -2 is the second operand of an embedded comparison.
 535      The special indes -3 is the offset of a gather as analyzed by
 536      vect_check_gather_scatter.
 537
 538    SWAP is as for vect_get_and_check_slp_defs.  */
 539
 540 static const int *
 541 vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
 542                       unsigned char swap = 0)
 543 {
 544   if (auto assign = dyn_cast<const gassign *> (stmt))
 545     {
 546       if (gimple_assign_rhs_code (assign) == COND_EXPR
 547           && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
 548         return cond_expr_maps[swap];
 549       if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
 550           && swap)
 551         return op1_op0_map;
 552       if (gather_scatter_p)
 553         return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
 554                 ? off_op0_map : off_map);
 555     }
 556   gcc_assert (!swap);
 557   if (auto call = dyn_cast<const gcall *> (stmt))
 558     {
 559       if (gimple_call_internal_p (call))
 560         switch (gimple_call_internal_fn (call))
 561           {
 562           case IFN_MASK_LOAD:
 563             return gather_scatter_p ? off_arg2_map : arg2_map;
 564
 565           case IFN_GATHER_LOAD:
 566             return arg1_map;
 567
 568           case IFN_MASK_GATHER_LOAD:
 569           case IFN_MASK_LEN_GATHER_LOAD:
 570             return arg1_arg4_map;
 571
 572           case IFN_MASK_STORE:
 573             return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
 574
 575           case IFN_MASK_CALL:
 576             {
 577               unsigned nargs = gimple_call_num_args (call);
 578               if (nargs >= 2 && nargs <= 7)
 579                 return mask_call_maps[nargs-2];
 580               else
 581                 return nullptr;
 582             }
 583
 584           case IFN_CLZ:
 585           case IFN_CTZ:
 586             return arg0_map;
 587
 588           default:
 589             break;
 590           }
 591     }
 592   return nullptr;
 593 }
 594
 595 /* Return the SLP node child index for operand OP of STMT.  */
 596
 597 int
 598 vect_slp_child_index_for_operand (const gimple *stmt, int op,
 599                                   bool gather_scatter_p)
 600 {
 601   const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
 602   if (!opmap)
 603     return op;
 604   for (int i = 1; i < 1 + opmap[0]; ++i)
 605     if (opmap[i] == op)
 606       return i - 1;
 607   gcc_unreachable ();
 608 }
 609
 610 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
 611    they are of a valid type and that they match the defs of the first stmt of
 612    the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
 613    by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
 614    indicates swap is required for cond_expr stmts.  Specifically, SWAP
 615    is 1 if STMT is cond and operands of comparison need to be swapped;
 616    SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
 617
 618    If there was a fatal error return -1; if the error could be corrected by
 619    swapping operands of father node of this one, return 1; if everything is
 620    ok return 0.  */
 621 static int
 622 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
 623                              bool *skip_args,
 624                              vec<stmt_vec_info> stmts, unsigned stmt_num,
 625                              vec<slp_oprnd_info> *oprnds_info)
 626 {
 627   stmt_vec_info stmt_info = stmts[stmt_num];
 628   tree oprnd;
 629   unsigned int i, number_of_oprnds;
 630   enum vect_def_type dt = vect_uninitialized_def;
 631   slp_oprnd_info oprnd_info;
 632   gather_scatter_info gs_info;
 633   unsigned int gs_op = -1u;
 634   unsigned int commutative_op = -1U;
 635   bool first = stmt_num == 0;
 636
 637   if (!is_a<gcall *> (stmt_info->stmt)
 638       && !is_a<gassign *> (stmt_info->stmt)
 639       && !is_a<gphi *> (stmt_info->stmt))
 640     return -1;
 641
 642   number_of_oprnds = gimple_num_args (stmt_info->stmt);
 643   const int *map
 644     = vect_get_operand_map (stmt_info->stmt,
 645                             STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
 646   if (map)
 647     number_of_oprnds = *map++;
 648   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
 649     {
 650       if (gimple_call_internal_p (stmt))
 651         {
 652           internal_fn ifn = gimple_call_internal_fn (stmt);
 653           commutative_op = first_commutative_argument (ifn);
 654         }
 655     }
 656   else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
 657     {
 658       if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
 659         commutative_op = 0;
 660     }
 661
 662   bool swapped = (swap != 0);
 663   bool backedge = false;
 664   enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
 665   for (i = 0; i < number_of_oprnds; i++)
 666     {
 667       oprnd_info = (*oprnds_info)[i];
 668       int opno = map ? map[i] : int (i);
 669       if (opno == -3)
 670         {
 671           gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
 672           if (!is_a <loop_vec_info> (vinfo)
 673               || !vect_check_gather_scatter (stmt_info,
 674                                              as_a <loop_vec_info> (vinfo),
 675                                              first ? &oprnd_info->first_gs_info
 676                                              : &gs_info))
 677             return -1;
 678
 679           if (first)
 680             {
 681               oprnd_info->first_gs_p = true;
 682               oprnd = oprnd_info->first_gs_info.offset;
 683             }
 684           else
 685             {
 686               gs_op = i;
 687               oprnd = gs_info.offset;
 688             }
 689         }
 690       else if (opno < 0)
 691         oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
 692       else
 693         {
 694           oprnd = gimple_arg (stmt_info->stmt, opno);
 695           if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
 696             {
 697               edge e = gimple_phi_arg_edge (stmt, opno);
 698               backedge = (is_a <bb_vec_info> (vinfo)
 699                           ? e->flags & EDGE_DFS_BACK
 700                           : dominated_by_p (CDI_DOMINATORS, e->src,
 701                                             gimple_bb (stmt_info->stmt)));
 702             }
 703         }
 704       if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
 705         oprnd = TREE_OPERAND (oprnd, 0);
 706
 707       stmt_vec_info def_stmt_info;
 708       if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
 709         {
 710           if (dump_enabled_p ())
 711             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 712                              "Build SLP failed: can't analyze def for %T\n",
 713                              oprnd);
 714
 715           return -1;
 716         }
 717
 718       if (skip_args[i])
 719         {
 720           oprnd_info->def_stmts.quick_push (NULL);
 721           oprnd_info->ops.quick_push (NULL_TREE);
 722           oprnd_info->first_dt = vect_uninitialized_def;
 723           continue;
 724         }
 725
 726       oprnd_info->def_stmts.quick_push (def_stmt_info);
 727       oprnd_info->ops.quick_push (oprnd);
 728
 729       if (def_stmt_info
 730           && is_pattern_stmt_p (def_stmt_info))
 731         {
 732           if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
 733               != def_stmt_info)
 734             oprnd_info->any_pattern = true;
 735           else
 736             /* If we promote this to external use the original stmt def.  */
 737             oprnd_info->ops.last ()
 738               = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
 739         }
 740
 741       /* If there's a extern def on a backedge make sure we can
 742          code-generate at the region start.
 743          ???  This is another case that could be fixed by adjusting
 744          how we split the function but at the moment we'd have conflicting
 745          goals there.  */
 746       if (backedge
 747           && dts[i] == vect_external_def
 748           && is_a <bb_vec_info> (vinfo)
 749           && TREE_CODE (oprnd) == SSA_NAME
 750           && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
 751           && !dominated_by_p (CDI_DOMINATORS,
 752                               as_a <bb_vec_info> (vinfo)->bbs[0],
 753                               gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
 754         {
 755           if (dump_enabled_p ())
 756             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 757                              "Build SLP failed: extern def %T only defined "
 758                              "on backedge\n", oprnd);
 759           return -1;
 760         }
 761
 762       if (first)
 763         {
 764           tree type = TREE_TYPE (oprnd);
 765           dt = dts[i];
 766
 767           /* For the swapping logic below force vect_reduction_def
 768              for the reduction op in a SLP reduction group.  */
 769           if (!STMT_VINFO_DATA_REF (stmt_info)
 770               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
 771               && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
 772               && def_stmt_info)
 773             dts[i] = dt = vect_reduction_def;
 774
 775           /* Check the types of the definition.  */
 776           switch (dt)
 777             {
 778             case vect_external_def:
 779             case vect_constant_def:
 780             case vect_internal_def:
 781             case vect_reduction_def:
 782             case vect_induction_def:
 783             case vect_nested_cycle:
 784             case vect_first_order_recurrence:
 785               break;
 786
 787             default:
 788               /* FORNOW: Not supported.  */
 789               if (dump_enabled_p ())
 790                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 791                                  "Build SLP failed: illegal type of def %T\n",
 792                                  oprnd);
 793               return -1;
 794             }
 795
 796           oprnd_info->first_dt = dt;
 797           oprnd_info->first_op_type = type;
 798         }
 799     }
 800   if (first)
 801     return 0;
 802
 803   /* Now match the operand definition types to that of the first stmt.  */
 804   for (i = 0; i < number_of_oprnds;)
 805     {
 806       if (skip_args[i])
 807         {
 808           ++i;
 809           continue;
 810         }
 811
 812       oprnd_info = (*oprnds_info)[i];
 813       dt = dts[i];
 814       stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
 815       oprnd = oprnd_info->ops[stmt_num];
 816       tree type = TREE_TYPE (oprnd);
 817
 818       if (!types_compatible_p (oprnd_info->first_op_type, type))
 819         {
 820           if (dump_enabled_p ())
 821             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 822                              "Build SLP failed: different operand types\n");
 823           return 1;
 824         }
 825
 826       if ((gs_op == i) != oprnd_info->first_gs_p)
 827         {
 828           if (dump_enabled_p ())
 829             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 830                              "Build SLP failed: mixed gather and non-gather\n");
 831           return 1;
 832         }
 833       else if (gs_op == i)
 834         {
 835           if (!operand_equal_p (oprnd_info->first_gs_info.base,
 836                                 gs_info.base))
 837             {
 838               if (dump_enabled_p ())
 839                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 840                                  "Build SLP failed: different gather base\n");
 841               return 1;
 842             }
 843           if (oprnd_info->first_gs_info.scale != gs_info.scale)
 844             {
 845               if (dump_enabled_p ())
 846                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 847                                  "Build SLP failed: different gather scale\n");
 848               return 1;
 849             }
 850         }
 851
 852       /* Not first stmt of the group, check that the def-stmt/s match
 853          the def-stmt/s of the first stmt.  Allow different definition
 854          types for reduction chains: the first stmt must be a
 855          vect_reduction_def (a phi node), and the rest
 856          end in the reduction chain.  */
 857       if ((!vect_def_types_match (oprnd_info->first_dt, dt)
 858            && !(oprnd_info->first_dt == vect_reduction_def
 859                 && !STMT_VINFO_DATA_REF (stmt_info)
 860                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
 861                 && def_stmt_info
 862                 && !STMT_VINFO_DATA_REF (def_stmt_info)
 863                 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
 864                     == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
 865           || (!STMT_VINFO_DATA_REF (stmt_info)
 866               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
 867               && ((!def_stmt_info
 868                    || STMT_VINFO_DATA_REF (def_stmt_info)
 869                    || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
 870                        != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
 871                   != (oprnd_info->first_dt != vect_reduction_def))))
 872         {
 873           /* Try swapping operands if we got a mismatch.  For BB
 874              vectorization only in case it will clearly improve things.  */
 875           if (i == commutative_op && !swapped
 876               && (!is_a <bb_vec_info> (vinfo)
 877                   || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
 878                                              dts[i+1])
 879                       && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
 880                           || vect_def_types_match
 881                                ((*oprnds_info)[i+1]->first_dt, dts[i])))))
 882             {
 883               if (dump_enabled_p ())
 884                 dump_printf_loc (MSG_NOTE, vect_location,
 885                                  "trying swapped operands\n");
 886               std::swap (dts[i], dts[i+1]);
 887               std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
 888                          (*oprnds_info)[i+1]->def_stmts[stmt_num]);
 889               std::swap ((*oprnds_info)[i]->ops[stmt_num],
 890                          (*oprnds_info)[i+1]->ops[stmt_num]);
 891               swapped = true;
 892               continue;
 893             }
 894
 895           if (is_a <bb_vec_info> (vinfo)
 896               && !oprnd_info->any_pattern)
 897             {
 898               /* Now for commutative ops we should see whether we can
 899                  make the other operand matching.  */
 900               if (dump_enabled_p ())
 901                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 902                                  "treating operand as external\n");
 903               oprnd_info->first_dt = dt = vect_external_def;
 904             }
 905           else
 906             {
 907               if (dump_enabled_p ())
 908                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 909                                  "Build SLP failed: different types\n");
 910               return 1;
 911             }
 912         }
 913
 914       /* Make sure to demote the overall operand to external.  */
 915       if (dt == vect_external_def)
 916         oprnd_info->first_dt = vect_external_def;
 917       /* For a SLP reduction chain we want to duplicate the reduction to
 918          each of the chain members.  That gets us a sane SLP graph (still
 919          the stmts are not 100% correct wrt the initial values).  */
 920       else if ((dt == vect_internal_def
 921                 || dt == vect_reduction_def)
 922                && oprnd_info->first_dt == vect_reduction_def
 923                && !STMT_VINFO_DATA_REF (stmt_info)
 924                && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
 925                && !STMT_VINFO_DATA_REF (def_stmt_info)
 926                && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
 927                    == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
 928         {
 929           oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
 930           oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
 931         }
 932
 933       ++i;
 934     }
 935
 936   /* Swap operands.  */
 937   if (swapped)
 938     {
 939       if (dump_enabled_p ())
 940         dump_printf_loc (MSG_NOTE, vect_location,
 941                          "swapped operands to match def types in %G",
 942                          stmt_info->stmt);
 943     }
 944
 945   return 0;
 946 }
 947
 948 /* Return true if call statements CALL1 and CALL2 are similar enough
 949    to be combined into the same SLP group.  */
 950
 951 bool
 952 compatible_calls_p (gcall *call1, gcall *call2)
 953 {
 954   unsigned int nargs = gimple_call_num_args (call1);
 955   if (nargs != gimple_call_num_args (call2))
 956     return false;
 957
 958   if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
 959     return false;
 960
 961   if (gimple_call_internal_p (call1))
 962     {
 963       if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
 964                                TREE_TYPE (gimple_call_lhs (call2))))
 965         return false;
 966       for (unsigned int i = 0; i < nargs; ++i)
 967         if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
 968                                  TREE_TYPE (gimple_call_arg (call2, i))))
 969           return false;
 970     }
 971   else
 972     {
 973       if (!operand_equal_p (gimple_call_fn (call1),
 974                             gimple_call_fn (call2), 0))
 975         return false;
 976
 977       if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
 978         return false;
 979     }
 980
 981   /* Check that any unvectorized arguments are equal.  */
 982   if (const int *map = vect_get_operand_map (call1))
 983     {
 984       unsigned int nkept = *map++;
 985       unsigned int mapi = 0;
 986       for (unsigned int i = 0; i < nargs; ++i)
 987         if (mapi < nkept && map[mapi] == int (i))
 988           mapi += 1;
 989         else if (!operand_equal_p (gimple_call_arg (call1, i),
 990                                    gimple_call_arg (call2, i)))
 991           return false;
 992     }
 993
 994   return true;
 995 }
 996
 997 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
 998    caller's attempt to find the vector type in STMT_INFO with the narrowest
 999    element type.  Return true if VECTYPE is nonnull and if it is valid
1000    for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
1001    number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
1002    vect_build_slp_tree.  */
1003
1004 static bool
1005 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1006                         unsigned int group_size,
1007                         tree vectype, poly_uint64 *max_nunits)
1008 {
1009   if (!vectype)
1010     {
1011       if (dump_enabled_p ())
1012         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1013                          "Build SLP failed: unsupported data-type in %G\n",
1014                          stmt_info->stmt);
1015       /* Fatal mismatch.  */
1016       return false;
1017     }
1018
1019   /* If populating the vector type requires unrolling then fail
1020      before adjusting *max_nunits for basic-block vectorization.  */
1021   if (is_a <bb_vec_info> (vinfo)
1022       && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1023     {
1024       if (dump_enabled_p ())
1025         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1026                          "Build SLP failed: unrolling required "
1027                          "in basic block SLP\n");
1028       /* Fatal mismatch.  */
1029       return false;
1030     }
1031
1032   /* In case of multiple types we need to detect the smallest type.  */
1033   vect_update_max_nunits (max_nunits, vectype);
1034   return true;
1035 }
1036
1037 /* Verify if the scalar stmts STMTS are isomorphic, require data
1038    permutation or are of unsupported types of operation.  Return
1039    true if they are, otherwise return false and indicate in *MATCHES
1040    which stmts are not isomorphic to the first one.  If MATCHES[0]
1041    is false then this indicates the comparison could not be
1042    carried out or the stmts will never be vectorized by SLP.
1043
1044    Note COND_EXPR is possibly isomorphic to another one after swapping its
1045    operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1046    the first stmt by swapping the two operands of comparison; set SWAP[i]
1047    to 2 if stmt I is isormorphic to the first stmt by inverting the code
1048    of comparison.  Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
1049    to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */
1050
1051 static bool
1052 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1053                        vec<stmt_vec_info> stmts, unsigned int group_size,
1054                        poly_uint64 *max_nunits, bool *matches,
1055                        bool *two_operators, tree *node_vectype)
1056 {
1057   unsigned int i;
1058   stmt_vec_info first_stmt_info = stmts[0];
1059   code_helper first_stmt_code = ERROR_MARK;
1060   code_helper alt_stmt_code = ERROR_MARK;
1061   code_helper rhs_code = ERROR_MARK;
1062   code_helper first_cond_code = ERROR_MARK;
1063   tree lhs;
1064   bool need_same_oprnds = false;
1065   tree vectype = NULL_TREE, first_op1 = NULL_TREE;
1066   stmt_vec_info first_load = NULL, prev_first_load = NULL;
1067   bool first_stmt_ldst_p = false, ldst_p = false;
1068   bool first_stmt_phi_p = false, phi_p = false;
1069   bool maybe_soft_fail = false;
1070   tree soft_fail_nunits_vectype = NULL_TREE;
1071
1072   /* For every stmt in NODE find its def stmt/s.  */
1073   stmt_vec_info stmt_info;
1074   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1075     {
1076       gimple *stmt = stmt_info->stmt;
1077       swap[i] = 0;
1078       matches[i] = false;
1079
1080       if (dump_enabled_p ())
1081         dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1082
1083       /* Fail to vectorize statements marked as unvectorizable, throw
1084          or are volatile.  */
1085       if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1086           || stmt_can_throw_internal (cfun, stmt)
1087           || gimple_has_volatile_ops (stmt))
1088         {
1089           if (dump_enabled_p ())
1090             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1091                              "Build SLP failed: unvectorizable statement %G",
1092                              stmt);
1093           /* ???  For BB vectorization we want to commutate operands in a way
1094              to shuffle all unvectorizable defs into one operand and have
1095              the other still vectorized.  The following doesn't reliably
1096              work for this though but it's the easiest we can do here.  */
1097           if (is_a <bb_vec_info> (vinfo) && i != 0)
1098             continue;
1099           /* Fatal mismatch.  */
1100           matches[0] = false;
1101           return false;
1102         }
1103
1104       gcall *call_stmt = dyn_cast <gcall *> (stmt);
1105       lhs = gimple_get_lhs (stmt);
1106       if (lhs == NULL_TREE
1107           && (!call_stmt
1108               || !gimple_call_internal_p (stmt)
1109               || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1110         {
1111           if (dump_enabled_p ())
1112             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113                              "Build SLP failed: not GIMPLE_ASSIGN nor "
1114                              "GIMPLE_CALL %G", stmt);
1115           if (is_a <bb_vec_info> (vinfo) && i != 0)
1116             continue;
1117           /* Fatal mismatch.  */
1118           matches[0] = false;
1119           return false;
1120         }
1121
1122       tree nunits_vectype;
1123       if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1124                                            &nunits_vectype, group_size))
1125         {
1126           if (is_a <bb_vec_info> (vinfo) && i != 0)
1127             continue;
1128           /* Fatal mismatch.  */
1129           matches[0] = false;
1130           return false;
1131         }
1132       /* Record nunits required but continue analysis, producing matches[]
1133          as if nunits was not an issue.  This allows splitting of groups
1134          to happen.  */
1135       if (nunits_vectype
1136           && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1137                                       nunits_vectype, max_nunits))
1138         {
1139           gcc_assert (is_a <bb_vec_info> (vinfo));
1140           maybe_soft_fail = true;
1141           soft_fail_nunits_vectype = nunits_vectype;
1142         }
1143
1144       gcc_assert (vectype);
1145
1146       if (call_stmt)
1147         {
1148           combined_fn cfn = gimple_call_combined_fn (call_stmt);
1149           if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1150             rhs_code = cfn;
1151           else
1152             rhs_code = CALL_EXPR;
1153
1154           if (cfn == CFN_MASK_LOAD
1155               || cfn == CFN_GATHER_LOAD
1156               || cfn == CFN_MASK_GATHER_LOAD
1157               || cfn == CFN_MASK_LEN_GATHER_LOAD)
1158             ldst_p = true;
1159           else if (cfn == CFN_MASK_STORE)
1160             {
1161               ldst_p = true;
1162               rhs_code = CFN_MASK_STORE;
1163             }
1164           else if ((cfn != CFN_LAST
1165                     && cfn != CFN_MASK_CALL
1166                     && internal_fn_p (cfn)
1167                     && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1168                    || gimple_call_tail_p (call_stmt)
1169                    || gimple_call_noreturn_p (call_stmt)
1170                    || gimple_call_chain (call_stmt))
1171             {
1172               if (dump_enabled_p ())
1173                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1174                                  "Build SLP failed: unsupported call type %G",
1175                                  (gimple *) call_stmt);
1176               if (is_a <bb_vec_info> (vinfo) && i != 0)
1177                 continue;
1178               /* Fatal mismatch.  */
1179               matches[0] = false;
1180               return false;
1181             }
1182         }
1183       else if (gimple_code (stmt) == GIMPLE_PHI)
1184         {
1185           rhs_code = ERROR_MARK;
1186           phi_p = true;
1187         }
1188       else
1189         {
1190           rhs_code = gimple_assign_rhs_code (stmt);
1191           ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1192         }
1193
1194       /* Check the operation.  */
1195       if (i == 0)
1196         {
1197           *node_vectype = vectype;
1198           first_stmt_code = rhs_code;
1199           first_stmt_ldst_p = ldst_p;
1200           first_stmt_phi_p = phi_p;
1201
1202           /* Shift arguments should be equal in all the packed stmts for a
1203              vector shift with scalar shift operand.  */
1204           if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1205               || rhs_code == LROTATE_EXPR
1206               || rhs_code == RROTATE_EXPR)
1207             {
1208               /* First see if we have a vector/vector shift.  */
1209               if (!directly_supported_p (rhs_code, vectype, optab_vector))
1210                 {
1211                   /* No vector/vector shift, try for a vector/scalar shift.  */
1212                   if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1213                     {
1214                       if (dump_enabled_p ())
1215                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1216                                          "Build SLP failed: "
1217                                          "op not supported by target.\n");
1218                       if (is_a <bb_vec_info> (vinfo) && i != 0)
1219                         continue;
1220                       /* Fatal mismatch.  */
1221                       matches[0] = false;
1222                       return false;
1223                     }
1224                   need_same_oprnds = true;
1225                   first_op1 = gimple_assign_rhs2 (stmt);
1226                 }
1227             }
1228           else if (rhs_code == WIDEN_LSHIFT_EXPR)
1229             {
1230               need_same_oprnds = true;
1231               first_op1 = gimple_assign_rhs2 (stmt);
1232             }
1233           else if (!ldst_p
1234                    && rhs_code == BIT_FIELD_REF)
1235             {
1236               tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1237               if (!is_a <bb_vec_info> (vinfo)
1238                   || TREE_CODE (vec) != SSA_NAME
1239                   /* When the element types are not compatible we pun the
1240                      source to the target vectype which requires equal size.  */
1241                   || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1242                        || !types_compatible_p (TREE_TYPE (vectype),
1243                                                TREE_TYPE (TREE_TYPE (vec))))
1244                       && !operand_equal_p (TYPE_SIZE (vectype),
1245                                            TYPE_SIZE (TREE_TYPE (vec)))))
1246                 {
1247                   if (dump_enabled_p ())
1248                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249                                      "Build SLP failed: "
1250                                      "BIT_FIELD_REF not supported\n");
1251                   /* Fatal mismatch.  */
1252                   matches[0] = false;
1253                   return false;
1254                 }
1255             }
1256           else if (rhs_code == CFN_DIV_POW2)
1257             {
1258               need_same_oprnds = true;
1259               first_op1 = gimple_call_arg (call_stmt, 1);
1260             }
1261         }
1262       else
1263         {
1264           if (first_stmt_code != rhs_code
1265               && alt_stmt_code == ERROR_MARK)
1266             alt_stmt_code = rhs_code;
1267           if ((first_stmt_code != rhs_code
1268                && (first_stmt_code != IMAGPART_EXPR
1269                    || rhs_code != REALPART_EXPR)
1270                && (first_stmt_code != REALPART_EXPR
1271                    || rhs_code != IMAGPART_EXPR)
1272                /* Handle mismatches in plus/minus by computing both
1273                   and merging the results.  */
1274                && !((first_stmt_code == PLUS_EXPR
1275                      || first_stmt_code == MINUS_EXPR)
1276                     && (alt_stmt_code == PLUS_EXPR
1277                         || alt_stmt_code == MINUS_EXPR)
1278                     && rhs_code == alt_stmt_code)
1279                && !(first_stmt_code.is_tree_code ()
1280                     && rhs_code.is_tree_code ()
1281                     && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1282                         == tcc_comparison)
1283                     && (swap_tree_comparison (tree_code (first_stmt_code))
1284                         == tree_code (rhs_code)))
1285                && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1286                     && (first_stmt_code == ARRAY_REF
1287                         || first_stmt_code == BIT_FIELD_REF
1288                         || first_stmt_code == INDIRECT_REF
1289                         || first_stmt_code == COMPONENT_REF
1290                         || first_stmt_code == MEM_REF)
1291                     && (rhs_code == ARRAY_REF
1292                         || rhs_code == BIT_FIELD_REF
1293                         || rhs_code == INDIRECT_REF
1294                         || rhs_code == COMPONENT_REF
1295                         || rhs_code == MEM_REF)))
1296               || (ldst_p
1297                   && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1298                       != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1299               || (ldst_p
1300                   && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1301                       != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1302               || first_stmt_ldst_p != ldst_p
1303               || first_stmt_phi_p != phi_p)
1304             {
1305               if (dump_enabled_p ())
1306                 {
1307                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1308                                    "Build SLP failed: different operation "
1309                                    "in stmt %G", stmt);
1310                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1311                                    "original stmt %G", first_stmt_info->stmt);
1312                 }
1313               /* Mismatch.  */
1314               continue;
1315             }
1316
1317           if (!ldst_p
1318               && first_stmt_code == BIT_FIELD_REF
1319               && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1320                   != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1321             {
1322               if (dump_enabled_p ())
1323                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1324                                  "Build SLP failed: different BIT_FIELD_REF "
1325                                  "arguments in %G", stmt);
1326               /* Mismatch.  */
1327               continue;
1328             }
1329
1330           if (call_stmt
1331               && first_stmt_code != CFN_MASK_LOAD
1332               && first_stmt_code != CFN_MASK_STORE)
1333             {
1334               if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1335                                        call_stmt))
1336                 {
1337                   if (dump_enabled_p ())
1338                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1339                                      "Build SLP failed: different calls in %G",
1340                                      stmt);
1341                   /* Mismatch.  */
1342                   continue;
1343                 }
1344             }
1345
1346           if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1347               && (gimple_bb (first_stmt_info->stmt)
1348                   != gimple_bb (stmt_info->stmt)))
1349             {
1350               if (dump_enabled_p ())
1351                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1352                                  "Build SLP failed: different BB for PHI "
1353                                  "or possibly trapping operation in %G", stmt);
1354               /* Mismatch.  */
1355               continue;
1356             }
1357
1358           if (need_same_oprnds)
1359             {
1360               tree other_op1 = gimple_arg (stmt, 1);
1361               if (!operand_equal_p (first_op1, other_op1, 0))
1362                 {
1363                   if (dump_enabled_p ())
1364                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1365                                      "Build SLP failed: different shift "
1366                                      "arguments in %G", stmt);
1367                   /* Mismatch.  */
1368                   continue;
1369                 }
1370             }
1371
1372           if (!types_compatible_p (vectype, *node_vectype))
1373             {
1374               if (dump_enabled_p ())
1375                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1376                                  "Build SLP failed: different vector type "
1377                                  "in %G", stmt);
1378               /* Mismatch.  */
1379               continue;
1380             }
1381         }
1382
1383       /* Grouped store or load.  */
1384       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1385         {
1386           gcc_assert (ldst_p);
1387           if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1388             {
1389               /* Store.  */
1390               gcc_assert (rhs_code == CFN_MASK_STORE
1391                           || REFERENCE_CLASS_P (lhs)
1392                           || DECL_P (lhs));
1393             }
1394           else
1395             {
1396               /* Load.  */
1397               first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1398               if (prev_first_load)
1399                 {
1400                   /* Check that there are no loads from different interleaving
1401                      chains in the same node.  */
1402                   if (prev_first_load != first_load)
1403                     {
1404                       if (dump_enabled_p ())
1405                         dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1406                                          vect_location,
1407                                          "Build SLP failed: different "
1408                                          "interleaving chains in one node %G",
1409                                          stmt);
1410                       /* Mismatch.  */
1411                       continue;
1412                     }
1413                 }
1414               else
1415                 prev_first_load = first_load;
1416            }
1417         }
1418       /* Non-grouped store or load.  */
1419       else if (ldst_p)
1420         {
1421           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1422               && rhs_code != CFN_GATHER_LOAD
1423               && rhs_code != CFN_MASK_GATHER_LOAD
1424               && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1425               && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1426               /* Not grouped loads are handled as externals for BB
1427                  vectorization.  For loop vectorization we can handle
1428                  splats the same we handle single element interleaving.  */
1429               && (is_a <bb_vec_info> (vinfo)
1430                   || stmt_info != first_stmt_info))
1431             {
1432               /* Not grouped load.  */
1433               if (dump_enabled_p ())
1434                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1435                                  "Build SLP failed: not grouped load %G", stmt);
1436
1437               if (i != 0)
1438                 continue;
1439               /* Fatal mismatch.  */
1440               matches[0] = false;
1441               return false;
1442             }
1443         }
1444       /* Not memory operation.  */
1445       else
1446         {
1447           if (!phi_p
1448               && rhs_code.is_tree_code ()
1449               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1450               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1451               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1452               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1453               && rhs_code != VIEW_CONVERT_EXPR
1454               && rhs_code != CALL_EXPR
1455               && rhs_code != BIT_FIELD_REF)
1456             {
1457               if (dump_enabled_p ())
1458                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1459                                  "Build SLP failed: operation unsupported %G",
1460                                  stmt);
1461               if (is_a <bb_vec_info> (vinfo) && i != 0)
1462                 continue;
1463               /* Fatal mismatch.  */
1464               matches[0] = false;
1465               return false;
1466             }
1467
1468           if (rhs_code == COND_EXPR)
1469             {
1470               tree cond_expr = gimple_assign_rhs1 (stmt);
1471               enum tree_code cond_code = TREE_CODE (cond_expr);
1472               enum tree_code swap_code = ERROR_MARK;
1473               enum tree_code invert_code = ERROR_MARK;
1474
1475               if (i == 0)
1476                 first_cond_code = TREE_CODE (cond_expr);
1477               else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1478                 {
1479                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1480                   swap_code = swap_tree_comparison (cond_code);
1481                   invert_code = invert_tree_comparison (cond_code, honor_nans);
1482                 }
1483
1484               if (first_cond_code == cond_code)
1485                 ;
1486               /* Isomorphic can be achieved by swapping.  */
1487               else if (first_cond_code == swap_code)
1488                 swap[i] = 1;
1489               /* Isomorphic can be achieved by inverting.  */
1490               else if (first_cond_code == invert_code)
1491                 swap[i] = 2;
1492               else
1493                 {
1494                   if (dump_enabled_p ())
1495                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1496                                      "Build SLP failed: different"
1497                                      " operation %G", stmt);
1498                   /* Mismatch.  */
1499                   continue;
1500                 }
1501             }
1502
1503           if (rhs_code.is_tree_code ()
1504               && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1505               && (swap_tree_comparison ((tree_code)first_stmt_code)
1506                   == (tree_code)rhs_code))
1507             swap[i] = 1;
1508         }
1509
1510       matches[i] = true;
1511     }
1512
1513   for (i = 0; i < group_size; ++i)
1514     if (!matches[i])
1515       return false;
1516
1517   /* If we allowed a two-operation SLP node verify the target can cope
1518      with the permute we are going to use.  */
1519   if (alt_stmt_code != ERROR_MARK
1520       && (!alt_stmt_code.is_tree_code ()
1521           || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1522               && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1523     {
1524       *two_operators = true;
1525     }
1526
1527   if (maybe_soft_fail)
1528     {
1529       unsigned HOST_WIDE_INT const_nunits;
1530       if (!TYPE_VECTOR_SUBPARTS
1531             (soft_fail_nunits_vectype).is_constant (&const_nunits)
1532           || const_nunits > group_size)
1533         matches[0] = false;
1534       else
1535         {
1536           /* With constant vector elements simulate a mismatch at the
1537              point we need to split.  */
1538           unsigned tail = group_size & (const_nunits - 1);
1539           memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1540         }
1541       return false;
1542     }
1543
1544   return true;
1545 }
1546
1547 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1548    Note we never remove apart from at destruction time so we do not
1549    need a special value for deleted that differs from empty.  */
1550 struct bst_traits
1551 {
1552   typedef vec <stmt_vec_info> value_type;
1553   typedef vec <stmt_vec_info> compare_type;
1554   static inline hashval_t hash (value_type);
1555   static inline bool equal (value_type existing, value_type candidate);
1556   static inline bool is_empty (value_type x) { return !x.exists (); }
1557   static inline bool is_deleted (value_type x) { return !x.exists (); }
1558   static const bool empty_zero_p = true;
1559   static inline void mark_empty (value_type &x) { x.release (); }
1560   static inline void mark_deleted (value_type &x) { x.release (); }
1561   static inline void remove (value_type &x) { x.release (); }
1562 };
1563 inline hashval_t
1564 bst_traits::hash (value_type x)
1565 {
1566   inchash::hash h;
1567   for (unsigned i = 0; i < x.length (); ++i)
1568     h.add_int (gimple_uid (x[i]->stmt));
1569   return h.end ();
1570 }
1571 inline bool
1572 bst_traits::equal (value_type existing, value_type candidate)
1573 {
1574   if (existing.length () != candidate.length ())
1575     return false;
1576   for (unsigned i = 0; i < existing.length (); ++i)
1577     if (existing[i] != candidate[i])
1578       return false;
1579   return true;
1580 }
1581
1582 /* ???  This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1583    but then vec::insert does memmove and that's not compatible with
1584    std::pair.  */
1585 struct chain_op_t
1586 {
1587   chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1588       : code (code_), dt (dt_), op (op_) {}
1589   tree_code code;
1590   vect_def_type dt;
1591   tree op;
1592 };
1593
1594 /* Comparator for sorting associatable chains.  */
1595
1596 static int
1597 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1598 {
1599   auto *op1 = (const chain_op_t *) op1_;
1600   auto *op2 = (const chain_op_t *) op2_;
1601   if (op1->dt != op2->dt)
1602     return (int)op1->dt - (int)op2->dt;
1603   return (int)op1->code - (int)op2->code;
1604 }
1605
1606 /* Linearize the associatable expression chain at START with the
1607    associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1608    filling CHAIN with the result and using WORKLIST as intermediate storage.
1609    CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1610    or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
1611    stmts, starting with START.  */
1612
1613 static void
1614 vect_slp_linearize_chain (vec_info *vinfo,
1615                           vec<std::pair<tree_code, gimple *> > &worklist,
1616                           vec<chain_op_t> &chain,
1617                           enum tree_code code, gimple *start,
1618                           gimple *&code_stmt, gimple *&alt_code_stmt,
1619                           vec<gimple *> *chain_stmts)
1620 {
1621   /* For each lane linearize the addition/subtraction (or other
1622      uniform associatable operation) expression tree.  */
1623   worklist.safe_push (std::make_pair (code, start));
1624   while (!worklist.is_empty ())
1625     {
1626       auto entry = worklist.pop ();
1627       gassign *stmt = as_a <gassign *> (entry.second);
1628       enum tree_code in_code = entry.first;
1629       enum tree_code this_code = gimple_assign_rhs_code (stmt);
1630       /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
1631       if (!code_stmt
1632           && gimple_assign_rhs_code (stmt) == code)
1633         code_stmt = stmt;
1634       else if (!alt_code_stmt
1635                && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1636         alt_code_stmt = stmt;
1637       if (chain_stmts)
1638         chain_stmts->safe_push (stmt);
1639       for (unsigned opnum = 1; opnum <= 2; ++opnum)
1640         {
1641           tree op = gimple_op (stmt, opnum);
1642           vect_def_type dt;
1643           stmt_vec_info def_stmt_info;
1644           bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1645           gcc_assert (res);
1646           if (dt == vect_internal_def
1647               && is_pattern_stmt_p (def_stmt_info))
1648             op = gimple_get_lhs (def_stmt_info->stmt);
1649           gimple *use_stmt;
1650           use_operand_p use_p;
1651           if (dt == vect_internal_def
1652               && single_imm_use (op, &use_p, &use_stmt)
1653               && is_gimple_assign (def_stmt_info->stmt)
1654               && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1655                   || (code == PLUS_EXPR
1656                       && (gimple_assign_rhs_code (def_stmt_info->stmt)
1657                           == MINUS_EXPR))))
1658             {
1659               tree_code op_def_code = this_code;
1660               if (op_def_code == MINUS_EXPR && opnum == 1)
1661                 op_def_code = PLUS_EXPR;
1662               if (in_code == MINUS_EXPR)
1663                 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1664               worklist.safe_push (std::make_pair (op_def_code,
1665                                                   def_stmt_info->stmt));
1666             }
1667           else
1668             {
1669               tree_code op_def_code = this_code;
1670               if (op_def_code == MINUS_EXPR && opnum == 1)
1671                 op_def_code = PLUS_EXPR;
1672               if (in_code == MINUS_EXPR)
1673                 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1674               chain.safe_push (chain_op_t (op_def_code, dt, op));
1675             }
1676         }
1677     }
1678 }
1679
1680 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1681                   simple_hashmap_traits <bst_traits, slp_tree> >
1682   scalar_stmts_to_slp_tree_map_t;
1683
1684 static slp_tree
1685 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1686                        vec<stmt_vec_info> stmts, unsigned int group_size,
1687                        poly_uint64 *max_nunits,
1688                        bool *matches, unsigned *limit, unsigned *tree_size,
1689                        scalar_stmts_to_slp_tree_map_t *bst_map);
1690
1691 static slp_tree
1692 vect_build_slp_tree (vec_info *vinfo,
1693                      vec<stmt_vec_info> stmts, unsigned int group_size,
1694                      poly_uint64 *max_nunits,
1695                      bool *matches, unsigned *limit, unsigned *tree_size,
1696                      scalar_stmts_to_slp_tree_map_t *bst_map)
1697 {
1698   if (slp_tree *leader = bst_map->get (stmts))
1699     {
1700       if (dump_enabled_p ())
1701         dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1702                          !(*leader)->failed ? "" : "failed ",
1703                          (void *) *leader);
1704       if (!(*leader)->failed)
1705         {
1706           SLP_TREE_REF_COUNT (*leader)++;
1707           vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1708           stmts.release ();
1709           return *leader;
1710         }
1711       memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1712       return NULL;
1713     }
1714
1715   /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1716      so we can pick up backedge destinations during discovery.  */
1717   slp_tree res = new _slp_tree;
1718   SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1719   SLP_TREE_SCALAR_STMTS (res) = stmts;
1720   bst_map->put (stmts.copy (), res);
1721
1722   if (*limit == 0)
1723     {
1724       if (dump_enabled_p ())
1725         dump_printf_loc (MSG_NOTE, vect_location,
1726                          "SLP discovery limit exceeded\n");
1727       /* Mark the node invalid so we can detect those when still in use
1728          as backedge destinations.  */
1729       SLP_TREE_SCALAR_STMTS (res) = vNULL;
1730       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1731       res->failed = XNEWVEC (bool, group_size);
1732       memset (res->failed, 0, sizeof (bool) * group_size);
1733       memset (matches, 0, sizeof (bool) * group_size);
1734       return NULL;
1735     }
1736   --*limit;
1737
1738   if (dump_enabled_p ())
1739     dump_printf_loc (MSG_NOTE, vect_location,
1740                      "starting SLP discovery for node %p\n", (void *) res);
1741
1742   poly_uint64 this_max_nunits = 1;
1743   slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1744                                         &this_max_nunits,
1745                                         matches, limit, tree_size, bst_map);
1746   if (!res_)
1747     {
1748       if (dump_enabled_p ())
1749         dump_printf_loc (MSG_NOTE, vect_location,
1750                          "SLP discovery for node %p failed\n", (void *) res);
1751       /* Mark the node invalid so we can detect those when still in use
1752          as backedge destinations.  */
1753       SLP_TREE_SCALAR_STMTS (res) = vNULL;
1754       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1755       res->failed = XNEWVEC (bool, group_size);
1756       if (flag_checking)
1757         {
1758           unsigned i;
1759           for (i = 0; i < group_size; ++i)
1760             if (!matches[i])
1761               break;
1762           gcc_assert (i < group_size);
1763         }
1764       memcpy (res->failed, matches, sizeof (bool) * group_size);
1765     }
1766   else
1767     {
1768       if (dump_enabled_p ())
1769         dump_printf_loc (MSG_NOTE, vect_location,
1770                          "SLP discovery for node %p succeeded\n",
1771                          (void *) res);
1772       gcc_assert (res_ == res);
1773       res->max_nunits = this_max_nunits;
1774       vect_update_max_nunits (max_nunits, this_max_nunits);
1775       /* Keep a reference for the bst_map use.  */
1776       SLP_TREE_REF_COUNT (res)++;
1777     }
1778   return res_;
1779 }
1780
1781 /* Helper for building an associated SLP node chain.  */
1782
1783 static void
1784 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1785                                    slp_tree op0, slp_tree op1,
1786                                    stmt_vec_info oper1, stmt_vec_info oper2,
1787                                    vec<std::pair<unsigned, unsigned> > lperm)
1788 {
1789   unsigned group_size = SLP_TREE_LANES (op1);
1790
1791   slp_tree child1 = new _slp_tree;
1792   SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1793   SLP_TREE_VECTYPE (child1) = vectype;
1794   SLP_TREE_LANES (child1) = group_size;
1795   SLP_TREE_CHILDREN (child1).create (2);
1796   SLP_TREE_CHILDREN (child1).quick_push (op0);
1797   SLP_TREE_CHILDREN (child1).quick_push (op1);
1798   SLP_TREE_REPRESENTATIVE (child1) = oper1;
1799
1800   slp_tree child2 = new _slp_tree;
1801   SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1802   SLP_TREE_VECTYPE (child2) = vectype;
1803   SLP_TREE_LANES (child2) = group_size;
1804   SLP_TREE_CHILDREN (child2).create (2);
1805   SLP_TREE_CHILDREN (child2).quick_push (op0);
1806   SLP_TREE_REF_COUNT (op0)++;
1807   SLP_TREE_CHILDREN (child2).quick_push (op1);
1808   SLP_TREE_REF_COUNT (op1)++;
1809   SLP_TREE_REPRESENTATIVE (child2) = oper2;
1810
1811   SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1812   SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1813   SLP_TREE_VECTYPE (perm) = vectype;
1814   SLP_TREE_LANES (perm) = group_size;
1815   /* ???  We should set this NULL but that's not expected.  */
1816   SLP_TREE_REPRESENTATIVE (perm) = oper1;
1817   SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1818   SLP_TREE_CHILDREN (perm).quick_push (child1);
1819   SLP_TREE_CHILDREN (perm).quick_push (child2);
1820 }
1821
1822 /* Recursively build an SLP tree starting from NODE.
1823    Fail (and return a value not equal to zero) if def-stmts are not
1824    isomorphic, require data permutation or are of unsupported types of
1825    operation.  Otherwise, return 0.
1826    The value returned is the depth in the SLP tree where a mismatch
1827    was found.  */
1828
1829 static slp_tree
1830 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1831                        vec<stmt_vec_info> stmts, unsigned int group_size,
1832                        poly_uint64 *max_nunits,
1833                        bool *matches, unsigned *limit, unsigned *tree_size,
1834                        scalar_stmts_to_slp_tree_map_t *bst_map)
1835 {
1836   unsigned nops, i, this_tree_size = 0;
1837   poly_uint64 this_max_nunits = *max_nunits;
1838
1839   matches[0] = false;
1840
1841   stmt_vec_info stmt_info = stmts[0];
1842   if (!is_a<gcall *> (stmt_info->stmt)
1843       && !is_a<gassign *> (stmt_info->stmt)
1844       && !is_a<gphi *> (stmt_info->stmt))
1845     return NULL;
1846
1847   nops = gimple_num_args (stmt_info->stmt);
1848   if (const int *map = vect_get_operand_map (stmt_info->stmt,
1849                                              STMT_VINFO_GATHER_SCATTER_P
1850                                                (stmt_info)))
1851     nops = map[0];
1852
1853   /* If the SLP node is a PHI (induction or reduction), terminate
1854      the recursion.  */
1855   bool *skip_args = XALLOCAVEC (bool, nops);
1856   memset (skip_args, 0, sizeof (bool) * nops);
1857   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1858     if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1859       {
1860         tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1861         tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1862                                                     group_size);
1863         if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1864                                      max_nunits))
1865           return NULL;
1866
1867         vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1868         if (def_type == vect_induction_def)
1869           {
1870             /* Induction PHIs are not cycles but walk the initial
1871                value.  Only for inner loops through, for outer loops
1872                we need to pick up the value from the actual PHIs
1873                to more easily support peeling and epilogue vectorization.  */
1874             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1875             if (!nested_in_vect_loop_p (loop, stmt_info))
1876               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1877             else
1878               loop = loop->inner;
1879             skip_args[loop_latch_edge (loop)->dest_idx] = true;
1880           }
1881         else if (def_type == vect_reduction_def
1882                  || def_type == vect_double_reduction_def
1883                  || def_type == vect_nested_cycle
1884                  || def_type == vect_first_order_recurrence)
1885           {
1886             /* Else def types have to match.  */
1887             stmt_vec_info other_info;
1888             bool all_same = true;
1889             FOR_EACH_VEC_ELT (stmts, i, other_info)
1890               {
1891                 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1892                   return NULL;
1893                 if (other_info != stmt_info)
1894                   all_same = false;
1895               }
1896             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1897             /* Reduction initial values are not explicitely represented.  */
1898             if (def_type != vect_first_order_recurrence
1899                 && !nested_in_vect_loop_p (loop, stmt_info))
1900               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1901             /* Reduction chain backedge defs are filled manually.
1902                ???  Need a better way to identify a SLP reduction chain PHI.
1903                Or a better overall way to SLP match those.  */
1904             if (all_same && def_type == vect_reduction_def)
1905               skip_args[loop_latch_edge (loop)->dest_idx] = true;
1906           }
1907         else if (def_type != vect_internal_def)
1908           return NULL;
1909       }
1910
1911
1912   bool two_operators = false;
1913   unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1914   tree vectype = NULL_TREE;
1915   if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1916                               &this_max_nunits, matches, &two_operators,
1917                               &vectype))
1918     return NULL;
1919
1920   /* If the SLP node is a load, terminate the recursion unless masked.  */
1921   if (STMT_VINFO_DATA_REF (stmt_info)
1922       && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1923     {
1924       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1925         gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1926       else
1927         {
1928           *max_nunits = this_max_nunits;
1929           (*tree_size)++;
1930           node = vect_create_new_slp_node (node, stmts, 0);
1931           SLP_TREE_VECTYPE (node) = vectype;
1932           /* And compute the load permutation.  Whether it is actually
1933              a permutation depends on the unrolling factor which is
1934              decided later.  */
1935           vec<unsigned> load_permutation;
1936           int j;
1937           stmt_vec_info load_info;
1938           load_permutation.create (group_size);
1939           stmt_vec_info first_stmt_info
1940             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1941           bool any_permute = false;
1942           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1943             {
1944               int load_place;
1945               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1946                 load_place = vect_get_place_in_interleaving_chain
1947                     (load_info, first_stmt_info);
1948               else
1949                 load_place = 0;
1950               gcc_assert (load_place != -1);
1951               any_permute |= load_place != j;
1952               load_permutation.quick_push (load_place);
1953             }
1954
1955           if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1956             {
1957               gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1958                           || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1959                           || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
1960                           || gimple_call_internal_p (stmt,
1961                                                      IFN_MASK_LEN_GATHER_LOAD));
1962               load_permutation.release ();
1963               /* We cannot handle permuted masked loads, see PR114375.  */
1964               if (any_permute
1965                   || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1966                       && DR_GROUP_SIZE (first_stmt_info) != group_size)
1967                   || STMT_VINFO_STRIDED_P (stmt_info))
1968                 {
1969                   matches[0] = false;
1970                   return NULL;
1971                 }
1972             }
1973           else
1974             {
1975               SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1976               return node;
1977             }
1978         }
1979     }
1980   else if (gimple_assign_single_p (stmt_info->stmt)
1981            && !gimple_vuse (stmt_info->stmt)
1982            && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1983     {
1984       /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1985          the same SSA name vector of a compatible type to vectype.  */
1986       vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1987       tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1988       stmt_vec_info estmt_info;
1989       FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1990         {
1991           gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1992           tree bfref = gimple_assign_rhs1 (estmt);
1993           HOST_WIDE_INT lane;
1994           if (!known_eq (bit_field_size (bfref),
1995                          tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1996               || !constant_multiple_p (bit_field_offset (bfref),
1997                                        bit_field_size (bfref), &lane))
1998             {
1999               lperm.release ();
2000               matches[0] = false;
2001               return NULL;
2002             }
2003           lperm.safe_push (std::make_pair (0, (unsigned)lane));
2004         }
2005       slp_tree vnode = vect_create_new_slp_node (vNULL);
2006       if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2007         /* ???  We record vectype here but we hide eventually necessary
2008            punning and instead rely on code generation to materialize
2009            VIEW_CONVERT_EXPRs as necessary.  We instead should make
2010            this explicit somehow.  */
2011         SLP_TREE_VECTYPE (vnode) = vectype;
2012       else
2013         {
2014           /* For different size but compatible elements we can still
2015              use VEC_PERM_EXPR without punning.  */
2016           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2017                       && types_compatible_p (TREE_TYPE (vectype),
2018                                              TREE_TYPE (TREE_TYPE (vec))));
2019           SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2020         }
2021       auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2022       unsigned HOST_WIDE_INT const_nunits;
2023       if (nunits.is_constant (&const_nunits))
2024         SLP_TREE_LANES (vnode) = const_nunits;
2025       SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2026       /* We are always building a permutation node even if it is an identity
2027          permute to shield the rest of the vectorizer from the odd node
2028          representing an actual vector without any scalar ops.
2029          ???  We could hide it completely with making the permute node
2030          external?  */
2031       node = vect_create_new_slp_node (node, stmts, 1);
2032       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2033       SLP_TREE_LANE_PERMUTATION (node) = lperm;
2034       SLP_TREE_VECTYPE (node) = vectype;
2035       SLP_TREE_CHILDREN (node).quick_push (vnode);
2036       return node;
2037     }
2038   /* When discovery reaches an associatable operation see whether we can
2039      improve that to match up lanes in a way superior to the operand
2040      swapping code which at most looks at two defs.
2041      ???  For BB vectorization we cannot do the brute-force search
2042      for matching as we can succeed by means of builds from scalars
2043      and have no good way to "cost" one build against another.  */
2044   else if (is_a <loop_vec_info> (vinfo)
2045            /* ???  We don't handle !vect_internal_def defs below.  */
2046            && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2047            && is_gimple_assign (stmt_info->stmt)
2048            && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2049                || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2050            && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2051                || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2052                    && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2053     {
2054       /* See if we have a chain of (mixed) adds or subtracts or other
2055          associatable ops.  */
2056       enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2057       if (code == MINUS_EXPR)
2058         code = PLUS_EXPR;
2059       stmt_vec_info other_op_stmt_info = NULL;
2060       stmt_vec_info op_stmt_info = NULL;
2061       unsigned chain_len = 0;
2062       auto_vec<chain_op_t> chain;
2063       auto_vec<std::pair<tree_code, gimple *> > worklist;
2064       auto_vec<vec<chain_op_t> > chains (group_size);
2065       auto_vec<slp_tree, 4> children;
2066       bool hard_fail = true;
2067       for (unsigned lane = 0; lane < group_size; ++lane)
2068         {
2069           /* For each lane linearize the addition/subtraction (or other
2070              uniform associatable operation) expression tree.  */
2071           gimple *op_stmt = NULL, *other_op_stmt = NULL;
2072           vect_slp_linearize_chain (vinfo, worklist, chain, code,
2073                                     stmts[lane]->stmt, op_stmt, other_op_stmt,
2074                                     NULL);
2075           if (!op_stmt_info && op_stmt)
2076             op_stmt_info = vinfo->lookup_stmt (op_stmt);
2077           if (!other_op_stmt_info && other_op_stmt)
2078             other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2079           if (chain.length () == 2)
2080             {
2081               /* In a chain of just two elements resort to the regular
2082                  operand swapping scheme.  If we run into a length
2083                  mismatch still hard-FAIL.  */
2084               if (chain_len == 0)
2085                 hard_fail = false;
2086               else
2087                 {
2088                   matches[lane] = false;
2089                   /* ???  We might want to process the other lanes, but
2090                      make sure to not give false matching hints to the
2091                      caller for lanes we did not process.  */
2092                   if (lane != group_size - 1)
2093                     matches[0] = false;
2094                 }
2095               break;
2096             }
2097           else if (chain_len == 0)
2098             chain_len = chain.length ();
2099           else if (chain.length () != chain_len)
2100             {
2101               /* ???  Here we could slip in magic to compensate with
2102                  neutral operands.  */
2103               matches[lane] = false;
2104               if (lane != group_size - 1)
2105                 matches[0] = false;
2106               break;
2107             }
2108           chains.quick_push (chain.copy ());
2109           chain.truncate (0);
2110         }
2111       if (chains.length () == group_size)
2112         {
2113           /* We cannot yet use SLP_TREE_CODE to communicate the operation.  */
2114           if (!op_stmt_info)
2115             {
2116               hard_fail = false;
2117               goto out;
2118             }
2119           /* Now we have a set of chains with the same length.  */
2120           /* 1. pre-sort according to def_type and operation.  */
2121           for (unsigned lane = 0; lane < group_size; ++lane)
2122             chains[lane].stablesort (dt_sort_cmp, vinfo);
2123           if (dump_enabled_p ())
2124             {
2125               dump_printf_loc (MSG_NOTE, vect_location,
2126                                "pre-sorted chains of %s\n",
2127                                get_tree_code_name (code));
2128               for (unsigned lane = 0; lane < group_size; ++lane)
2129                 {
2130                   for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2131                     dump_printf (MSG_NOTE, "%s %T ",
2132                                  get_tree_code_name (chains[lane][opnum].code),
2133                                  chains[lane][opnum].op);
2134                   dump_printf (MSG_NOTE, "\n");
2135                 }
2136             }
2137           /* 2. try to build children nodes, associating as necessary.  */
2138           for (unsigned n = 0; n < chain_len; ++n)
2139             {
2140               vect_def_type dt = chains[0][n].dt;
2141               unsigned lane;
2142               for (lane = 0; lane < group_size; ++lane)
2143                 if (chains[lane][n].dt != dt)
2144                   {
2145                     if (dt == vect_constant_def
2146                         && chains[lane][n].dt == vect_external_def)
2147                       dt = vect_external_def;
2148                     else if (dt == vect_external_def
2149                              && chains[lane][n].dt == vect_constant_def)
2150                       ;
2151                     else
2152                       break;
2153                   }
2154               if (lane != group_size)
2155                 {
2156                   if (dump_enabled_p ())
2157                     dump_printf_loc (MSG_NOTE, vect_location,
2158                                      "giving up on chain due to mismatched "
2159                                      "def types\n");
2160                   matches[lane] = false;
2161                   if (lane != group_size - 1)
2162                     matches[0] = false;
2163                   goto out;
2164                 }
2165               if (dt == vect_constant_def
2166                   || dt == vect_external_def)
2167                 {
2168                   /* Check whether we can build the invariant.  If we can't
2169                      we never will be able to.  */
2170                   tree type = TREE_TYPE (chains[0][n].op);
2171                   if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2172                       && (TREE_CODE (type) == BOOLEAN_TYPE
2173                           || !can_duplicate_and_interleave_p (vinfo, group_size,
2174                                                               type)))
2175                     {
2176                       matches[0] = false;
2177                       goto out;
2178                     }
2179                   vec<tree> ops;
2180                   ops.create (group_size);
2181                   for (lane = 0; lane < group_size; ++lane)
2182                     ops.quick_push (chains[lane][n].op);
2183                   slp_tree child = vect_create_new_slp_node (ops);
2184                   SLP_TREE_DEF_TYPE (child) = dt;
2185                   children.safe_push (child);
2186                 }
2187               else if (dt != vect_internal_def)
2188                 {
2189                   /* Not sure, we might need sth special.
2190                      gcc.dg/vect/pr96854.c,
2191                      gfortran.dg/vect/fast-math-pr37021.f90
2192                      and gfortran.dg/vect/pr61171.f trigger.  */
2193                   /* Soft-fail for now.  */
2194                   hard_fail = false;
2195                   goto out;
2196                 }
2197               else
2198                 {
2199                   vec<stmt_vec_info> op_stmts;
2200                   op_stmts.create (group_size);
2201                   slp_tree child = NULL;
2202                   /* Brute-force our way.  We have to consider a lane
2203                      failing after fixing an earlier fail up in the
2204                      SLP discovery recursion.  So track the current
2205                      permute per lane.  */
2206                   unsigned *perms = XALLOCAVEC (unsigned, group_size);
2207                   memset (perms, 0, sizeof (unsigned) * group_size);
2208                   do
2209                     {
2210                       op_stmts.truncate (0);
2211                       for (lane = 0; lane < group_size; ++lane)
2212                         op_stmts.quick_push
2213                           (vinfo->lookup_def (chains[lane][n].op));
2214                       child = vect_build_slp_tree (vinfo, op_stmts,
2215                                                    group_size, &this_max_nunits,
2216                                                    matches, limit,
2217                                                    &this_tree_size, bst_map);
2218                       /* ???  We're likely getting too many fatal mismatches
2219                          here so maybe we want to ignore them (but then we
2220                          have no idea which lanes fatally mismatched).  */
2221                       if (child || !matches[0])
2222                         break;
2223                       /* Swap another lane we have not yet matched up into
2224                          lanes that did not match.  If we run out of
2225                          permute possibilities for a lane terminate the
2226                          search.  */
2227                       bool term = false;
2228                       for (lane = 1; lane < group_size; ++lane)
2229                         if (!matches[lane])
2230                           {
2231                             if (n + perms[lane] + 1 == chain_len)
2232                               {
2233                                 term = true;
2234                                 break;
2235                               }
2236                             std::swap (chains[lane][n],
2237                                        chains[lane][n + perms[lane] + 1]);
2238                             perms[lane]++;
2239                           }
2240                       if (term)
2241                         break;
2242                     }
2243                   while (1);
2244                   if (!child)
2245                     {
2246                       if (dump_enabled_p ())
2247                         dump_printf_loc (MSG_NOTE, vect_location,
2248                                          "failed to match up op %d\n", n);
2249                       op_stmts.release ();
2250                       if (lane != group_size - 1)
2251                         matches[0] = false;
2252                       else
2253                         matches[lane] = false;
2254                       goto out;
2255                     }
2256                   if (dump_enabled_p ())
2257                     {
2258                       dump_printf_loc (MSG_NOTE, vect_location,
2259                                        "matched up op %d to\n", n);
2260                       vect_print_slp_tree (MSG_NOTE, vect_location, child);
2261                     }
2262                   children.safe_push (child);
2263                 }
2264             }
2265           /* 3. build SLP nodes to combine the chain.  */
2266           for (unsigned lane = 0; lane < group_size; ++lane)
2267             if (chains[lane][0].code != code)
2268               {
2269                 /* See if there's any alternate all-PLUS entry.  */
2270                 unsigned n;
2271                 for (n = 1; n < chain_len; ++n)
2272                   {
2273                     for (lane = 0; lane < group_size; ++lane)
2274                       if (chains[lane][n].code != code)
2275                         break;
2276                     if (lane == group_size)
2277                       break;
2278                   }
2279                 if (n != chain_len)
2280                   {
2281                     /* Swap that in at first position.  */
2282                     std::swap (children[0], children[n]);
2283                     for (lane = 0; lane < group_size; ++lane)
2284                       std::swap (chains[lane][0], chains[lane][n]);
2285                   }
2286                 else
2287                   {
2288                     /* ???  When this triggers and we end up with two
2289                        vect_constant/external_def up-front things break (ICE)
2290                        spectacularly finding an insertion place for the
2291                        all-constant op.  We should have a fully
2292                        vect_internal_def operand though(?) so we can swap
2293                        that into first place and then prepend the all-zero
2294                        constant.  */
2295                     if (dump_enabled_p ())
2296                       dump_printf_loc (MSG_NOTE, vect_location,
2297                                        "inserting constant zero to compensate "
2298                                        "for (partially) negated first "
2299                                        "operand\n");
2300                     chain_len++;
2301                     for (lane = 0; lane < group_size; ++lane)
2302                       chains[lane].safe_insert
2303                         (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2304                     vec<tree> zero_ops;
2305                     zero_ops.create (group_size);
2306                     zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2307                     for (lane = 1; lane < group_size; ++lane)
2308                       zero_ops.quick_push (zero_ops[0]);
2309                     slp_tree zero = vect_create_new_slp_node (zero_ops);
2310                     SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2311                     children.safe_insert (0, zero);
2312                   }
2313                 break;
2314               }
2315           for (unsigned i = 1; i < children.length (); ++i)
2316             {
2317               slp_tree op0 = children[i - 1];
2318               slp_tree op1 = children[i];
2319               bool this_two_op = false;
2320               for (unsigned lane = 0; lane < group_size; ++lane)
2321                 if (chains[lane][i].code != chains[0][i].code)
2322                   {
2323                     this_two_op = true;
2324                     break;
2325                   }
2326               slp_tree child;
2327               if (i == children.length () - 1)
2328                 child = vect_create_new_slp_node (node, stmts, 2);
2329               else
2330                 child = vect_create_new_slp_node (2, ERROR_MARK);
2331               if (this_two_op)
2332                 {
2333                   vec<std::pair<unsigned, unsigned> > lperm;
2334                   lperm.create (group_size);
2335                   for (unsigned lane = 0; lane < group_size; ++lane)
2336                     lperm.quick_push (std::make_pair
2337                       (chains[lane][i].code != chains[0][i].code, lane));
2338                   vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2339                                                      (chains[0][i].code == code
2340                                                       ? op_stmt_info
2341                                                       : other_op_stmt_info),
2342                                                      (chains[0][i].code == code
2343                                                       ? other_op_stmt_info
2344                                                       : op_stmt_info),
2345                                                      lperm);
2346                 }
2347               else
2348                 {
2349                   SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2350                   SLP_TREE_VECTYPE (child) = vectype;
2351                   SLP_TREE_LANES (child) = group_size;
2352                   SLP_TREE_CHILDREN (child).quick_push (op0);
2353                   SLP_TREE_CHILDREN (child).quick_push (op1);
2354                   SLP_TREE_REPRESENTATIVE (child)
2355                     = (chains[0][i].code == code
2356                        ? op_stmt_info : other_op_stmt_info);
2357                 }
2358               children[i] = child;
2359             }
2360           *tree_size += this_tree_size + 1;
2361           *max_nunits = this_max_nunits;
2362           while (!chains.is_empty ())
2363             chains.pop ().release ();
2364           return node;
2365         }
2366 out:
2367       while (!children.is_empty ())
2368         vect_free_slp_tree (children.pop ());
2369       while (!chains.is_empty ())
2370         chains.pop ().release ();
2371       /* Hard-fail, otherwise we might run into quadratic processing of the
2372          chains starting one stmt into the chain again.  */
2373       if (hard_fail)
2374         return NULL;
2375       /* Fall thru to normal processing.  */
2376     }
2377
2378   /* Get at the operands, verifying they are compatible.  */
2379   vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2380   slp_oprnd_info oprnd_info;
2381   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2382     {
2383       int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2384                                              stmts, i, &oprnds_info);
2385       if (res != 0)
2386         matches[(res == -1) ? 0 : i] = false;
2387       if (!matches[0])
2388         break;
2389     }
2390   for (i = 0; i < group_size; ++i)
2391     if (!matches[i])
2392       {
2393         vect_free_oprnd_info (oprnds_info);
2394         return NULL;
2395       }
2396   swap = NULL;
2397
2398   auto_vec<slp_tree, 4> children;
2399
2400   stmt_info = stmts[0];
2401
2402   /* Create SLP_TREE nodes for the definition node/s.  */
2403   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2404     {
2405       slp_tree child = nullptr;
2406       unsigned int j;
2407
2408       /* We're skipping certain operands from processing, for example
2409          outer loop reduction initial defs.  */
2410       if (skip_args[i])
2411         {
2412           children.safe_push (NULL);
2413           continue;
2414         }
2415
2416       if (oprnd_info->first_dt == vect_uninitialized_def)
2417         {
2418           /* COND_EXPR have one too many eventually if the condition
2419              is a SSA name.  */
2420           gcc_assert (i == 3 && nops == 4);
2421           continue;
2422         }
2423
2424       if (is_a <bb_vec_info> (vinfo)
2425           && oprnd_info->first_dt == vect_internal_def
2426           && !oprnd_info->any_pattern)
2427         {
2428           /* For BB vectorization, if all defs are the same do not
2429              bother to continue the build along the single-lane
2430              graph but use a splat of the scalar value.  */
2431           stmt_vec_info first_def = oprnd_info->def_stmts[0];
2432           for (j = 1; j < group_size; ++j)
2433             if (oprnd_info->def_stmts[j] != first_def)
2434               break;
2435           if (j == group_size
2436               /* But avoid doing this for loads where we may be
2437                  able to CSE things, unless the stmt is not
2438                  vectorizable.  */
2439               && (!STMT_VINFO_VECTORIZABLE (first_def)
2440                   || !gimple_vuse (first_def->stmt)))
2441             {
2442               if (dump_enabled_p ())
2443                 dump_printf_loc (MSG_NOTE, vect_location,
2444                                  "Using a splat of the uniform operand %G",
2445                                  first_def->stmt);
2446               oprnd_info->first_dt = vect_external_def;
2447             }
2448         }
2449
2450       if (oprnd_info->first_dt == vect_external_def
2451           || oprnd_info->first_dt == vect_constant_def)
2452         {
2453           if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2454             {
2455               tree op0;
2456               tree uniform_val = op0 = oprnd_info->ops[0];
2457               for (j = 1; j < oprnd_info->ops.length (); ++j)
2458                 if (!operand_equal_p (uniform_val, oprnd_info->ops[j]))
2459                   {
2460                     uniform_val = NULL_TREE;
2461                     break;
2462                   }
2463               if (!uniform_val
2464                   && !can_duplicate_and_interleave_p (vinfo,
2465                                                       oprnd_info->ops.length (),
2466                                                       TREE_TYPE (op0)))
2467                 {
2468                   matches[j] = false;
2469                   if (dump_enabled_p ())
2470                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2471                                      "Build SLP failed: invalid type of def "
2472                                      "for variable-length SLP %T\n", op0);
2473                   goto fail;
2474                 }
2475             }
2476           slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2477           SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2478           oprnd_info->ops = vNULL;
2479           children.safe_push (invnode);
2480           continue;
2481         }
2482
2483       if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2484                                         group_size, &this_max_nunits,
2485                                         matches, limit,
2486                                         &this_tree_size, bst_map)) != NULL)
2487         {
2488           oprnd_info->def_stmts = vNULL;
2489           children.safe_push (child);
2490           continue;
2491         }
2492
2493       /* If the SLP build for operand zero failed and operand zero
2494          and one can be commutated try that for the scalar stmts
2495          that failed the match.  */
2496       if (i == 0
2497           /* A first scalar stmt mismatch signals a fatal mismatch.  */
2498           && matches[0]
2499           /* ???  For COND_EXPRs we can swap the comparison operands
2500              as well as the arms under some constraints.  */
2501           && nops == 2
2502           && oprnds_info[1]->first_dt == vect_internal_def
2503           && is_gimple_assign (stmt_info->stmt)
2504           /* Swapping operands for reductions breaks assumptions later on.  */
2505           && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2506           && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2507         {
2508           /* See whether we can swap the matching or the non-matching
2509              stmt operands.  */
2510           bool swap_not_matching = true;
2511           do
2512             {
2513               for (j = 0; j < group_size; ++j)
2514                 {
2515                   if (matches[j] != !swap_not_matching)
2516                     continue;
2517                   stmt_vec_info stmt_info = stmts[j];
2518                   /* Verify if we can swap operands of this stmt.  */
2519                   gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2520                   if (!stmt
2521                       || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2522                     {
2523                       if (!swap_not_matching)
2524                         goto fail;
2525                       swap_not_matching = false;
2526                       break;
2527                     }
2528                 }
2529             }
2530           while (j != group_size);
2531
2532           /* Swap mismatched definition stmts.  */
2533           if (dump_enabled_p ())
2534             dump_printf_loc (MSG_NOTE, vect_location,
2535                              "Re-trying with swapped operands of stmts ");
2536           for (j = 0; j < group_size; ++j)
2537             if (matches[j] == !swap_not_matching)
2538               {
2539                 std::swap (oprnds_info[0]->def_stmts[j],
2540                            oprnds_info[1]->def_stmts[j]);
2541                 std::swap (oprnds_info[0]->ops[j],
2542                            oprnds_info[1]->ops[j]);
2543                 if (dump_enabled_p ())
2544                   dump_printf (MSG_NOTE, "%d ", j);
2545               }
2546           if (dump_enabled_p ())
2547             dump_printf (MSG_NOTE, "\n");
2548           /* After swapping some operands we lost track whether an
2549              operand has any pattern defs so be conservative here.  */
2550           if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2551             oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2552           /* And try again with scratch 'matches' ... */
2553           bool *tem = XALLOCAVEC (bool, group_size);
2554           if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2555                                             group_size, &this_max_nunits,
2556                                             tem, limit,
2557                                             &this_tree_size, bst_map)) != NULL)
2558             {
2559               oprnd_info->def_stmts = vNULL;
2560               children.safe_push (child);
2561               continue;
2562             }
2563         }
2564 fail:
2565
2566       /* If the SLP build failed and we analyze a basic-block
2567          simply treat nodes we fail to build as externally defined
2568          (and thus build vectors from the scalar defs).
2569          The cost model will reject outright expensive cases.
2570          ???  This doesn't treat cases where permutation ultimatively
2571          fails (or we don't try permutation below).  Ideally we'd
2572          even compute a permutation that will end up with the maximum
2573          SLP tree size...  */
2574       if (is_a <bb_vec_info> (vinfo)
2575           /* ???  Rejecting patterns this way doesn't work.  We'd have to
2576              do extra work to cancel the pattern so the uses see the
2577              scalar version.  */
2578           && !is_pattern_stmt_p (stmt_info)
2579           && !oprnd_info->any_pattern)
2580         {
2581           /* But if there's a leading vector sized set of matching stmts
2582              fail here so we can split the group.  This matches the condition
2583              vect_analyze_slp_instance uses.  */
2584           /* ???  We might want to split here and combine the results to support
2585              multiple vector sizes better.  */
2586           for (j = 0; j < group_size; ++j)
2587             if (!matches[j])
2588               break;
2589           if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2590             {
2591               if (dump_enabled_p ())
2592                 dump_printf_loc (MSG_NOTE, vect_location,
2593                                  "Building vector operands from scalars\n");
2594               this_tree_size++;
2595               child = vect_create_new_slp_node (oprnd_info->ops);
2596               children.safe_push (child);
2597               oprnd_info->ops = vNULL;
2598               continue;
2599             }
2600         }
2601
2602       gcc_assert (child == NULL);
2603       FOR_EACH_VEC_ELT (children, j, child)
2604         if (child)
2605           vect_free_slp_tree (child);
2606       vect_free_oprnd_info (oprnds_info);
2607       return NULL;
2608     }
2609
2610   vect_free_oprnd_info (oprnds_info);
2611
2612   /* If we have all children of a child built up from uniform scalars
2613      or does more than one possibly expensive vector construction then
2614      just throw that away, causing it built up from scalars.
2615      The exception is the SLP node for the vector store.  */
2616   if (is_a <bb_vec_info> (vinfo)
2617       && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2618       /* ???  Rejecting patterns this way doesn't work.  We'd have to
2619          do extra work to cancel the pattern so the uses see the
2620          scalar version.  */
2621       && !is_pattern_stmt_p (stmt_info))
2622     {
2623       slp_tree child;
2624       unsigned j;
2625       bool all_uniform_p = true;
2626       unsigned n_vector_builds = 0;
2627       FOR_EACH_VEC_ELT (children, j, child)
2628         {
2629           if (!child)
2630             ;
2631           else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2632             all_uniform_p = false;
2633           else if (!vect_slp_tree_uniform_p (child))
2634             {
2635               all_uniform_p = false;
2636               if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2637                 n_vector_builds++;
2638             }
2639         }
2640       if (all_uniform_p
2641           || n_vector_builds > 1
2642           || (n_vector_builds == children.length ()
2643               && is_a <gphi *> (stmt_info->stmt)))
2644         {
2645           /* Roll back.  */
2646           matches[0] = false;
2647           FOR_EACH_VEC_ELT (children, j, child)
2648             if (child)
2649               vect_free_slp_tree (child);
2650
2651           if (dump_enabled_p ())
2652             dump_printf_loc (MSG_NOTE, vect_location,
2653                              "Building parent vector operands from "
2654                              "scalars instead\n");
2655           return NULL;
2656         }
2657     }
2658
2659   *tree_size += this_tree_size + 1;
2660   *max_nunits = this_max_nunits;
2661
2662   if (two_operators)
2663     {
2664       /* ???  We'd likely want to either cache in bst_map sth like
2665          { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2666          the true { a+b, a+b, a+b, a+b } ... but there we don't have
2667          explicit stmts to put in so the keying on 'stmts' doesn't
2668          work (but we have the same issue with nodes that use 'ops').  */
2669       slp_tree one = new _slp_tree;
2670       slp_tree two = new _slp_tree;
2671       SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2672       SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2673       SLP_TREE_VECTYPE (one) = vectype;
2674       SLP_TREE_VECTYPE (two) = vectype;
2675       SLP_TREE_CHILDREN (one).safe_splice (children);
2676       SLP_TREE_CHILDREN (two).safe_splice (children);
2677       slp_tree child;
2678       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2679         SLP_TREE_REF_COUNT (child)++;
2680
2681       /* Here we record the original defs since this
2682          node represents the final lane configuration.  */
2683       node = vect_create_new_slp_node (node, stmts, 2);
2684       SLP_TREE_VECTYPE (node) = vectype;
2685       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2686       SLP_TREE_CHILDREN (node).quick_push (one);
2687       SLP_TREE_CHILDREN (node).quick_push (two);
2688       gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2689       enum tree_code code0 = gimple_assign_rhs_code (stmt);
2690       enum tree_code ocode = ERROR_MARK;
2691       stmt_vec_info ostmt_info;
2692       unsigned j = 0;
2693       FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2694         {
2695           gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2696           if (gimple_assign_rhs_code (ostmt) != code0)
2697             {
2698               SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2699               ocode = gimple_assign_rhs_code (ostmt);
2700               j = i;
2701             }
2702           else
2703             SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2704         }
2705       SLP_TREE_CODE (one) = code0;
2706       SLP_TREE_CODE (two) = ocode;
2707       SLP_TREE_LANES (one) = stmts.length ();
2708       SLP_TREE_LANES (two) = stmts.length ();
2709       SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2710       SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2711       return node;
2712     }
2713
2714   node = vect_create_new_slp_node (node, stmts, nops);
2715   SLP_TREE_VECTYPE (node) = vectype;
2716   SLP_TREE_CHILDREN (node).splice (children);
2717   return node;
2718 }
2719
2720 /* Dump a single SLP tree NODE.  */
2721
2722 static void
2723 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2724                      slp_tree node)
2725 {
2726   unsigned i, j;
2727   slp_tree child;
2728   stmt_vec_info stmt_info;
2729   tree op;
2730
2731   dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2732   dump_user_location_t user_loc = loc.get_user_location ();
2733   dump_printf_loc (metadata, user_loc,
2734                    "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2735                    ", refcnt=%u)",
2736                    SLP_TREE_DEF_TYPE (node) == vect_external_def
2737                    ? " (external)"
2738                    : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2739                       ? " (constant)"
2740                       : ""), (void *) node,
2741                    estimated_poly_value (node->max_nunits),
2742                                          SLP_TREE_REF_COUNT (node));
2743   if (SLP_TREE_VECTYPE (node))
2744     dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2745   dump_printf (metadata, "\n");
2746   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2747     {
2748       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2749         dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2750       else
2751         dump_printf_loc (metadata, user_loc, "op template: %G",
2752                          SLP_TREE_REPRESENTATIVE (node)->stmt);
2753     }
2754   if (SLP_TREE_SCALAR_STMTS (node).exists ())
2755     FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2756       dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2757   else
2758     {
2759       dump_printf_loc (metadata, user_loc, "\t{ ");
2760       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2761         dump_printf (metadata, "%T%s ", op,
2762                      i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2763       dump_printf (metadata, "}\n");
2764     }
2765   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2766     {
2767       dump_printf_loc (metadata, user_loc, "\tload permutation {");
2768       FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2769         dump_printf (dump_kind, " %u", j);
2770       dump_printf (dump_kind, " }\n");
2771     }
2772   if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2773     {
2774       dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2775       for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2776         dump_printf (dump_kind, " %u[%u]",
2777                      SLP_TREE_LANE_PERMUTATION (node)[i].first,
2778                      SLP_TREE_LANE_PERMUTATION (node)[i].second);
2779       dump_printf (dump_kind, " }\n");
2780     }
2781   if (SLP_TREE_CHILDREN (node).is_empty ())
2782     return;
2783   dump_printf_loc (metadata, user_loc, "\tchildren");
2784   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2785     dump_printf (dump_kind, " %p", (void *)child);
2786   dump_printf (dump_kind, "\n");
2787 }
2788
2789 DEBUG_FUNCTION void
2790 debug (slp_tree node)
2791 {
2792   debug_dump_context ctx;
2793   vect_print_slp_tree (MSG_NOTE,
2794                        dump_location_t::from_location_t (UNKNOWN_LOCATION),
2795                        node);
2796 }
2797
2798 /* Recursive helper for the dot producer below.  */
2799
2800 static void
2801 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2802 {
2803   if (visited.add (node))
2804     return;
2805
2806   fprintf (f, "\"%p\" [label=\"", (void *)node);
2807   vect_print_slp_tree (MSG_NOTE,
2808                        dump_location_t::from_location_t (UNKNOWN_LOCATION),
2809                        node);
2810   fprintf (f, "\"];\n");
2811
2812
2813   for (slp_tree child : SLP_TREE_CHILDREN (node))
2814     fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2815
2816   for (slp_tree child : SLP_TREE_CHILDREN (node))
2817     if (child)
2818       dot_slp_tree (f, child, visited);
2819 }
2820
2821 DEBUG_FUNCTION void
2822 dot_slp_tree (const char *fname, slp_tree node)
2823 {
2824   FILE *f = fopen (fname, "w");
2825   fprintf (f, "digraph {\n");
2826   fflush (f);
2827     {
2828       debug_dump_context ctx (f);
2829       hash_set<slp_tree> visited;
2830       dot_slp_tree (f, node, visited);
2831     }
2832   fflush (f);
2833   fprintf (f, "}\n");
2834   fclose (f);
2835 }
2836
2837 /* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
2838
2839 static void
2840 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2841                       slp_tree node, hash_set<slp_tree> &visited)
2842 {
2843   unsigned i;
2844   slp_tree child;
2845
2846   if (visited.add (node))
2847     return;
2848
2849   vect_print_slp_tree (dump_kind, loc, node);
2850
2851   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2852     if (child)
2853       vect_print_slp_graph (dump_kind, loc, child, visited);
2854 }
2855
2856 static void
2857 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2858                       slp_tree entry)
2859 {
2860   hash_set<slp_tree> visited;
2861   vect_print_slp_graph (dump_kind, loc, entry, visited);
2862 }
2863
2864 /* Mark the tree rooted at NODE with PURE_SLP.  */
2865
2866 static void
2867 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2868 {
2869   int i;
2870   stmt_vec_info stmt_info;
2871   slp_tree child;
2872
2873   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2874     return;
2875
2876   if (visited.add (node))
2877     return;
2878
2879   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2880     STMT_SLP_TYPE (stmt_info) = pure_slp;
2881
2882   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2883     if (child)
2884       vect_mark_slp_stmts (child, visited);
2885 }
2886
2887 static void
2888 vect_mark_slp_stmts (slp_tree node)
2889 {
2890   hash_set<slp_tree> visited;
2891   vect_mark_slp_stmts (node, visited);
2892 }
2893
2894 /* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
2895
2896 static void
2897 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2898 {
2899   int i;
2900   stmt_vec_info stmt_info;
2901   slp_tree child;
2902
2903   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2904     return;
2905
2906   if (visited.add (node))
2907     return;
2908
2909   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2910     {
2911       gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2912                   || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2913       STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2914     }
2915
2916   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2917     if (child)
2918       vect_mark_slp_stmts_relevant (child, visited);
2919 }
2920
2921 static void
2922 vect_mark_slp_stmts_relevant (slp_tree node)
2923 {
2924   hash_set<slp_tree> visited;
2925   vect_mark_slp_stmts_relevant (node, visited);
2926 }
2927
2928
2929 /* Gather loads in the SLP graph NODE and populate the INST loads array.  */
2930
2931 static void
2932 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2933                        hash_set<slp_tree> &visited)
2934 {
2935   if (!node || visited.add (node))
2936     return;
2937
2938   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2939     return;
2940
2941   if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
2942     {
2943       stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2944       if (STMT_VINFO_DATA_REF (stmt_info)
2945           && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2946         loads.safe_push (node);
2947     }
2948
2949   unsigned i;
2950   slp_tree child;
2951   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2952     vect_gather_slp_loads (loads, child, visited);
2953 }
2954
2955
2956 /* Find the last store in SLP INSTANCE.  */
2957
2958 stmt_vec_info
2959 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2960 {
2961   stmt_vec_info last = NULL;
2962   stmt_vec_info stmt_vinfo;
2963
2964   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2965     {
2966       stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2967       last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2968     }
2969
2970   return last;
2971 }
2972
2973 /* Find the first stmt in NODE.  */
2974
2975 stmt_vec_info
2976 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2977 {
2978   stmt_vec_info first = NULL;
2979   stmt_vec_info stmt_vinfo;
2980
2981   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2982     {
2983       stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2984       if (!first
2985           || get_later_stmt (stmt_vinfo, first) == first)
2986         first = stmt_vinfo;
2987     }
2988
2989   return first;
2990 }
2991
2992 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2993    two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2994    (also containing the first GROUP1_SIZE stmts, since stores are
2995    consecutive), the second containing the remainder.
2996    Return the first stmt in the second group.  */
2997
2998 static stmt_vec_info
2999 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
3000 {
3001   gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
3002   gcc_assert (group1_size > 0);
3003   int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
3004   gcc_assert (group2_size > 0);
3005   DR_GROUP_SIZE (first_vinfo) = group1_size;
3006
3007   stmt_vec_info stmt_info = first_vinfo;
3008   for (unsigned i = group1_size; i > 1; i--)
3009     {
3010       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
3011       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3012     }
3013   /* STMT is now the last element of the first group.  */
3014   stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
3015   DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
3016
3017   DR_GROUP_SIZE (group2) = group2_size;
3018   for (stmt_info = group2; stmt_info;
3019        stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3020     {
3021       DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3022       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3023     }
3024
3025   /* For the second group, the DR_GROUP_GAP is that before the original group,
3026      plus skipping over the first vector.  */
3027   DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3028
3029   /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
3030   DR_GROUP_GAP (first_vinfo) += group2_size;
3031
3032   if (dump_enabled_p ())
3033     dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3034                      group1_size, group2_size);
3035
3036   return group2;
3037 }
3038
3039 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3040    statements and a vector of NUNITS elements.  */
3041
3042 static poly_uint64
3043 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3044 {
3045   return exact_div (common_multiple (nunits, group_size), group_size);
3046 }
3047
3048 /* Helper that checks to see if a node is a load node.  */
3049
3050 static inline bool
3051 vect_is_slp_load_node  (slp_tree root)
3052 {
3053   return SLP_TREE_DEF_TYPE (root) == vect_internal_def
3054          && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3055          && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
3056 }
3057
3058
3059 /* Helper function of optimize_load_redistribution that performs the operation
3060    recursively.  */
3061
3062 static slp_tree
3063 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3064                                 vec_info *vinfo, unsigned int group_size,
3065                                 hash_map<slp_tree, slp_tree> *load_map,
3066                                 slp_tree root)
3067 {
3068   if (slp_tree *leader = load_map->get (root))
3069     return *leader;
3070
3071   slp_tree node;
3072   unsigned i;
3073
3074   /* For now, we don't know anything about externals so do not do anything.  */
3075   if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3076     return NULL;
3077   else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3078     {
3079       /* First convert this node into a load node and add it to the leaves
3080          list and flatten the permute from a lane to a load one.  If it's
3081          unneeded it will be elided later.  */
3082       vec<stmt_vec_info> stmts;
3083       stmts.create (SLP_TREE_LANES (root));
3084       lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3085       for (unsigned j = 0; j < lane_perm.length (); j++)
3086         {
3087           std::pair<unsigned, unsigned> perm = lane_perm[j];
3088           node = SLP_TREE_CHILDREN (root)[perm.first];
3089
3090           if (!vect_is_slp_load_node (node)
3091               || SLP_TREE_CHILDREN (node).exists ())
3092             {
3093               stmts.release ();
3094               goto next;
3095             }
3096
3097           stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3098         }
3099
3100       if (dump_enabled_p ())
3101         dump_printf_loc (MSG_NOTE, vect_location,
3102                          "converting stmts on permute node %p\n",
3103                          (void *) root);
3104
3105       bool *matches = XALLOCAVEC (bool, group_size);
3106       poly_uint64 max_nunits = 1;
3107       unsigned tree_size = 0, limit = 1;
3108       node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3109                                   matches, &limit, &tree_size, bst_map);
3110       if (!node)
3111         stmts.release ();
3112
3113       load_map->put (root, node);
3114       return node;
3115     }
3116
3117 next:
3118   load_map->put (root, NULL);
3119
3120   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3121     {
3122       slp_tree value
3123         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3124                                           node);
3125       if (value)
3126         {
3127           SLP_TREE_REF_COUNT (value)++;
3128           SLP_TREE_CHILDREN (root)[i] = value;
3129           /* ???  We know the original leafs of the replaced nodes will
3130              be referenced by bst_map, only the permutes created by
3131              pattern matching are not.  */
3132           if (SLP_TREE_REF_COUNT (node) == 1)
3133             load_map->remove (node);
3134           vect_free_slp_tree (node);
3135         }
3136     }
3137
3138   return NULL;
3139 }
3140
3141 /* Temporary workaround for loads not being CSEd during SLP build.  This
3142    function will traverse the SLP tree rooted in ROOT for INSTANCE and find
3143    VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3144    same DR such that the final operation is equal to a permuted load.  Such
3145    NODES are then directly converted into LOADS themselves.  The nodes are
3146    CSEd using BST_MAP.  */
3147
3148 static void
3149 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3150                               vec_info *vinfo, unsigned int group_size,
3151                               hash_map<slp_tree, slp_tree> *load_map,
3152                               slp_tree root)
3153 {
3154   slp_tree node;
3155   unsigned i;
3156
3157   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3158     {
3159       slp_tree value
3160         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3161                                           node);
3162       if (value)
3163         {
3164           SLP_TREE_REF_COUNT (value)++;
3165           SLP_TREE_CHILDREN (root)[i] = value;
3166           /* ???  We know the original leafs of the replaced nodes will
3167              be referenced by bst_map, only the permutes created by
3168              pattern matching are not.  */
3169           if (SLP_TREE_REF_COUNT (node) == 1)
3170             load_map->remove (node);
3171           vect_free_slp_tree (node);
3172         }
3173     }
3174 }
3175
3176 /* Helper function of vect_match_slp_patterns.
3177
3178    Attempts to match patterns against the slp tree rooted in REF_NODE using
3179    VINFO.  Patterns are matched in post-order traversal.
3180
3181    If matching is successful the value in REF_NODE is updated and returned, if
3182    not then it is returned unchanged.  */
3183
3184 static bool
3185 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3186                            slp_tree_to_load_perm_map_t *perm_cache,
3187                            slp_compat_nodes_map_t *compat_cache,
3188                            hash_set<slp_tree> *visited)
3189 {
3190   unsigned i;
3191   slp_tree node = *ref_node;
3192   bool found_p = false;
3193   if (!node || visited->add (node))
3194     return false;
3195
3196   slp_tree child;
3197   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3198     found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3199                                           vinfo, perm_cache, compat_cache,
3200                                           visited);
3201
3202   for (unsigned x = 0; x < num__slp_patterns; x++)
3203     {
3204       vect_pattern *pattern
3205         = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3206       if (pattern)
3207         {
3208           pattern->build (vinfo);
3209           delete pattern;
3210           found_p = true;
3211         }
3212     }
3213
3214   return found_p;
3215 }
3216
3217 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3218    vec_info VINFO.
3219
3220    The modified tree is returned.  Patterns are tried in order and multiple
3221    patterns may match.  */
3222
3223 static bool
3224 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3225                          hash_set<slp_tree> *visited,
3226                          slp_tree_to_load_perm_map_t *perm_cache,
3227                          slp_compat_nodes_map_t *compat_cache)
3228 {
3229   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3230   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3231
3232   if (dump_enabled_p ())
3233     dump_printf_loc (MSG_NOTE, vect_location,
3234                      "Analyzing SLP tree %p for patterns\n",
3235                      (void *) SLP_INSTANCE_TREE (instance));
3236
3237   return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3238                                     visited);
3239 }
3240
3241 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3242    splitting into two, with the first split group having size NEW_GROUP_SIZE.
3243    Return true if we could use IFN_STORE_LANES instead and if that appears
3244    to be the better approach.  */
3245
3246 static bool
3247 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3248                                unsigned int group_size,
3249                                unsigned int new_group_size)
3250 {
3251   tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3252   tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3253   if (!vectype)
3254     return false;
3255   /* Allow the split if one of the two new groups would operate on full
3256      vectors *within* rather than across one scalar loop iteration.
3257      This is purely a heuristic, but it should work well for group
3258      sizes of 3 and 4, where the possible splits are:
3259
3260        3->2+1:  OK if the vector has exactly two elements
3261        4->2+2:  Likewise
3262        4->3+1:  Less clear-cut.  */
3263   if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3264       || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3265     return false;
3266   return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
3267 }
3268
3269 /* Analyze an SLP instance starting from a group of grouped stores.  Call
3270    vect_build_slp_tree to build a tree of packed stmts if possible.
3271    Return FALSE if it's impossible to SLP any stmt in the loop.  */
3272
3273 static bool
3274 vect_analyze_slp_instance (vec_info *vinfo,
3275                            scalar_stmts_to_slp_tree_map_t *bst_map,
3276                            stmt_vec_info stmt_info, slp_instance_kind kind,
3277                            unsigned max_tree_size, unsigned *limit);
3278
3279 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3280    of KIND.  Return true if successful.  */
3281
3282 static bool
3283 vect_build_slp_instance (vec_info *vinfo,
3284                          slp_instance_kind kind,
3285                          vec<stmt_vec_info> &scalar_stmts,
3286                          vec<stmt_vec_info> &root_stmt_infos,
3287                          vec<tree> &remain,
3288                          unsigned max_tree_size, unsigned *limit,
3289                          scalar_stmts_to_slp_tree_map_t *bst_map,
3290                          /* ???  We need stmt_info for group splitting.  */
3291                          stmt_vec_info stmt_info_)
3292 {
3293   if (kind == slp_inst_kind_ctor)
3294     {
3295       if (dump_enabled_p ())
3296         dump_printf_loc (MSG_NOTE, vect_location,
3297                          "Analyzing vectorizable constructor: %G\n",
3298                          root_stmt_infos[0]->stmt);
3299     }
3300
3301   if (dump_enabled_p ())
3302     {
3303       dump_printf_loc (MSG_NOTE, vect_location,
3304                        "Starting SLP discovery for\n");
3305       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3306         dump_printf_loc (MSG_NOTE, vect_location,
3307                          "  %G", scalar_stmts[i]->stmt);
3308     }
3309
3310   /* Build the tree for the SLP instance.  */
3311   unsigned int group_size = scalar_stmts.length ();
3312   bool *matches = XALLOCAVEC (bool, group_size);
3313   poly_uint64 max_nunits = 1;
3314   unsigned tree_size = 0;
3315   unsigned i;
3316   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3317                                        &max_nunits, matches, limit,
3318                                        &tree_size, bst_map);
3319   if (node != NULL)
3320     {
3321       /* Calculate the unrolling factor based on the smallest type.  */
3322       poly_uint64 unrolling_factor
3323         = calculate_unrolling_factor (max_nunits, group_size);
3324
3325       if (maybe_ne (unrolling_factor, 1U)
3326           && is_a <bb_vec_info> (vinfo))
3327         {
3328           unsigned HOST_WIDE_INT const_max_nunits;
3329           if (!max_nunits.is_constant (&const_max_nunits)
3330               || const_max_nunits > group_size)
3331             {
3332               if (dump_enabled_p ())
3333                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3334                                  "Build SLP failed: store group "
3335                                  "size not a multiple of the vector size "
3336                                  "in basic block SLP\n");
3337               vect_free_slp_tree (node);
3338               return false;
3339             }
3340           /* Fatal mismatch.  */
3341           if (dump_enabled_p ())
3342             dump_printf_loc (MSG_NOTE, vect_location,
3343                              "SLP discovery succeeded but node needs "
3344                              "splitting\n");
3345           memset (matches, true, group_size);
3346           matches[group_size / const_max_nunits * const_max_nunits] = false;
3347           vect_free_slp_tree (node);
3348         }
3349       else
3350         {
3351           /* Create a new SLP instance.  */
3352           slp_instance new_instance = XNEW (class _slp_instance);
3353           SLP_INSTANCE_TREE (new_instance) = node;
3354           SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3355           SLP_INSTANCE_LOADS (new_instance) = vNULL;
3356           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3357           SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3358           SLP_INSTANCE_KIND (new_instance) = kind;
3359           new_instance->reduc_phis = NULL;
3360           new_instance->cost_vec = vNULL;
3361           new_instance->subgraph_entries = vNULL;
3362
3363           if (dump_enabled_p ())
3364             dump_printf_loc (MSG_NOTE, vect_location,
3365                              "SLP size %u vs. limit %u.\n",
3366                              tree_size, max_tree_size);
3367
3368           /* Fixup SLP reduction chains.  */
3369           if (kind == slp_inst_kind_reduc_chain)
3370             {
3371               /* If this is a reduction chain with a conversion in front
3372                  amend the SLP tree with a node for that.  */
3373               gimple *scalar_def
3374                 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3375               if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3376                 {
3377                   /* Get at the conversion stmt - we know it's the single use
3378                      of the last stmt of the reduction chain.  */
3379                   use_operand_p use_p;
3380                   bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3381                                            &use_p, &scalar_def);
3382                   gcc_assert (r);
3383                   stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3384                   next_info = vect_stmt_to_vectorize (next_info);
3385                   scalar_stmts = vNULL;
3386                   scalar_stmts.create (group_size);
3387                   for (unsigned i = 0; i < group_size; ++i)
3388                     scalar_stmts.quick_push (next_info);
3389                   slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3390                   SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3391                   SLP_TREE_CHILDREN (conv).quick_push (node);
3392                   SLP_INSTANCE_TREE (new_instance) = conv;
3393                   /* We also have to fake this conversion stmt as SLP reduction
3394                      group so we don't have to mess with too much code
3395                      elsewhere.  */
3396                   REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3397                   REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3398                 }
3399               /* Fill the backedge child of the PHI SLP node.  The
3400                  general matching code cannot find it because the
3401                  scalar code does not reflect how we vectorize the
3402                  reduction.  */
3403               use_operand_p use_p;
3404               imm_use_iterator imm_iter;
3405               class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3406               FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3407                                      gimple_get_lhs (scalar_def))
3408                 /* There are exactly two non-debug uses, the reduction
3409                    PHI and the loop-closed PHI node.  */
3410                 if (!is_gimple_debug (USE_STMT (use_p))
3411                     && gimple_bb (USE_STMT (use_p)) == loop->header)
3412                   {
3413                     auto_vec<stmt_vec_info, 64> phis (group_size);
3414                     stmt_vec_info phi_info
3415                       = vinfo->lookup_stmt (USE_STMT (use_p));
3416                     for (unsigned i = 0; i < group_size; ++i)
3417                       phis.quick_push (phi_info);
3418                     slp_tree *phi_node = bst_map->get (phis);
3419                     unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3420                     SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3421                       = SLP_INSTANCE_TREE (new_instance);
3422                     SLP_INSTANCE_TREE (new_instance)->refcnt++;
3423                   }
3424             }
3425
3426           vinfo->slp_instances.safe_push (new_instance);
3427
3428           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3429              the number of scalar stmts in the root in a few places.
3430              Verify that assumption holds.  */
3431           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3432                         .length () == group_size);
3433
3434           if (dump_enabled_p ())
3435             {
3436               dump_printf_loc (MSG_NOTE, vect_location,
3437                                "Final SLP tree for instance %p:\n",
3438                                (void *) new_instance);
3439               vect_print_slp_graph (MSG_NOTE, vect_location,
3440                                     SLP_INSTANCE_TREE (new_instance));
3441             }
3442
3443           return true;
3444         }
3445     }
3446   else
3447     {
3448       /* Failed to SLP.  */
3449       /* Free the allocated memory.  */
3450       scalar_stmts.release ();
3451     }
3452
3453   stmt_vec_info stmt_info = stmt_info_;
3454   /* Try to break the group up into pieces.  */
3455   if (kind == slp_inst_kind_store)
3456     {
3457       /* ???  We could delay all the actual splitting of store-groups
3458          until after SLP discovery of the original group completed.
3459          Then we can recurse to vect_build_slp_instance directly.  */
3460       for (i = 0; i < group_size; i++)
3461         if (!matches[i])
3462           break;
3463
3464       /* For basic block SLP, try to break the group up into multiples of
3465          a vector size.  */
3466       if (is_a <bb_vec_info> (vinfo)
3467           && (i > 1 && i < group_size))
3468         {
3469           tree scalar_type
3470             = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3471           tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3472                                                       1 << floor_log2 (i));
3473           unsigned HOST_WIDE_INT const_nunits;
3474           if (vectype
3475               && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3476             {
3477               /* Split into two groups at the first vector boundary.  */
3478               gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3479               unsigned group1_size = i & ~(const_nunits - 1);
3480
3481               if (dump_enabled_p ())
3482                 dump_printf_loc (MSG_NOTE, vect_location,
3483                                  "Splitting SLP group at stmt %u\n", i);
3484               stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3485                                                                group1_size);
3486               bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3487                                                     kind, max_tree_size,
3488                                                     limit);
3489               /* Split the rest at the failure point and possibly
3490                  re-analyze the remaining matching part if it has
3491                  at least two lanes.  */
3492               if (group1_size < i
3493                   && (i + 1 < group_size
3494                       || i - group1_size > 1))
3495                 {
3496                   stmt_vec_info rest2 = rest;
3497                   rest = vect_split_slp_store_group (rest, i - group1_size);
3498                   if (i - group1_size > 1)
3499                     res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3500                                                       kind, max_tree_size,
3501                                                       limit);
3502                 }
3503               /* Re-analyze the non-matching tail if it has at least
3504                  two lanes.  */
3505               if (i + 1 < group_size)
3506                 res |= vect_analyze_slp_instance (vinfo, bst_map,
3507                                                   rest, kind, max_tree_size,
3508                                                   limit);
3509               return res;
3510             }
3511         }
3512
3513       /* For loop vectorization split into arbitrary pieces of size > 1.  */
3514       if (is_a <loop_vec_info> (vinfo)
3515           && (i > 1 && i < group_size)
3516           && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3517         {
3518           unsigned group1_size = i;
3519
3520           if (dump_enabled_p ())
3521             dump_printf_loc (MSG_NOTE, vect_location,
3522                              "Splitting SLP group at stmt %u\n", i);
3523
3524           stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3525                                                            group1_size);
3526           /* Loop vectorization cannot handle gaps in stores, make sure
3527              the split group appears as strided.  */
3528           STMT_VINFO_STRIDED_P (rest) = 1;
3529           DR_GROUP_GAP (rest) = 0;
3530           STMT_VINFO_STRIDED_P (stmt_info) = 1;
3531           DR_GROUP_GAP (stmt_info) = 0;
3532
3533           bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3534                                                 kind, max_tree_size, limit);
3535           if (i + 1 < group_size)
3536             res |= vect_analyze_slp_instance (vinfo, bst_map,
3537                                               rest, kind, max_tree_size, limit);
3538
3539           return res;
3540         }
3541
3542       /* Even though the first vector did not all match, we might be able to SLP
3543          (some) of the remainder.  FORNOW ignore this possibility.  */
3544     }
3545
3546   /* Failed to SLP.  */
3547   if (dump_enabled_p ())
3548     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3549   return false;
3550 }
3551
3552
3553 /* Analyze an SLP instance starting from a group of grouped stores.  Call
3554    vect_build_slp_tree to build a tree of packed stmts if possible.
3555    Return FALSE if it's impossible to SLP any stmt in the loop.  */
3556
3557 static bool
3558 vect_analyze_slp_instance (vec_info *vinfo,
3559                            scalar_stmts_to_slp_tree_map_t *bst_map,
3560                            stmt_vec_info stmt_info,
3561                            slp_instance_kind kind,
3562                            unsigned max_tree_size, unsigned *limit)
3563 {
3564   unsigned int i;
3565   vec<stmt_vec_info> scalar_stmts;
3566
3567   if (is_a <bb_vec_info> (vinfo))
3568     vect_location = stmt_info->stmt;
3569
3570   stmt_vec_info next_info = stmt_info;
3571   if (kind == slp_inst_kind_store)
3572     {
3573       /* Collect the stores and store them in scalar_stmts.  */
3574       scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3575       while (next_info)
3576         {
3577           scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3578           next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3579         }
3580     }
3581   else if (kind == slp_inst_kind_reduc_chain)
3582     {
3583       /* Collect the reduction stmts and store them in scalar_stmts.  */
3584       scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3585       while (next_info)
3586         {
3587           scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3588           next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3589         }
3590       /* Mark the first element of the reduction chain as reduction to properly
3591          transform the node.  In the reduction analysis phase only the last
3592          element of the chain is marked as reduction.  */
3593       STMT_VINFO_DEF_TYPE (stmt_info)
3594         = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3595       STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3596         = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3597     }
3598   else if (kind == slp_inst_kind_reduc_group)
3599     {
3600       /* Collect reduction statements.  */
3601       const vec<stmt_vec_info> &reductions
3602         = as_a <loop_vec_info> (vinfo)->reductions;
3603       scalar_stmts.create (reductions.length ());
3604       for (i = 0; reductions.iterate (i, &next_info); i++)
3605         if ((STMT_VINFO_RELEVANT_P (next_info)
3606              || STMT_VINFO_LIVE_P (next_info))
3607             /* ???  Make sure we didn't skip a conversion around a reduction
3608                path.  In that case we'd have to reverse engineer that conversion
3609                stmt following the chain using reduc_idx and from the PHI
3610                using reduc_def.  */
3611             && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3612           scalar_stmts.quick_push (next_info);
3613       /* If less than two were relevant/live there's nothing to SLP.  */
3614       if (scalar_stmts.length () < 2)
3615         return false;
3616     }
3617   else
3618     gcc_unreachable ();
3619
3620   vec<stmt_vec_info> roots = vNULL;
3621   vec<tree> remain = vNULL;
3622   /* Build the tree for the SLP instance.  */
3623   bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3624                                       roots, remain,
3625                                       max_tree_size, limit, bst_map,
3626                                       kind == slp_inst_kind_store
3627                                       ? stmt_info : NULL);
3628
3629   /* ???  If this is slp_inst_kind_store and the above succeeded here's
3630      where we should do store group splitting.  */
3631
3632   return res;
3633 }
3634
3635 /* Check if there are stmts in the loop can be vectorized using SLP.  Build SLP
3636    trees of packed scalar stmts if SLP is possible.  */
3637
3638 opt_result
3639 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3640 {
3641   unsigned int i;
3642   stmt_vec_info first_element;
3643   slp_instance instance;
3644
3645   DUMP_VECT_SCOPE ("vect_analyze_slp");
3646
3647   unsigned limit = max_tree_size;
3648
3649   scalar_stmts_to_slp_tree_map_t *bst_map
3650     = new scalar_stmts_to_slp_tree_map_t ();
3651
3652   /* Find SLP sequences starting from groups of grouped stores.  */
3653   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3654     vect_analyze_slp_instance (vinfo, bst_map, first_element,
3655                                slp_inst_kind_store, max_tree_size, &limit);
3656
3657   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3658     {
3659       for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3660         {
3661           vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3662           /* Apply patterns.  */
3663           for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
3664             bb_vinfo->roots[i].stmts[j]
3665               = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
3666           if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3667                                        bb_vinfo->roots[i].stmts,
3668                                        bb_vinfo->roots[i].roots,
3669                                        bb_vinfo->roots[i].remain,
3670                                        max_tree_size, &limit, bst_map, NULL))
3671             {
3672               bb_vinfo->roots[i].stmts = vNULL;
3673               bb_vinfo->roots[i].roots = vNULL;
3674               bb_vinfo->roots[i].remain = vNULL;
3675             }
3676         }
3677     }
3678
3679   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3680     {
3681       /* Find SLP sequences starting from reduction chains.  */
3682       FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3683         if (! STMT_VINFO_RELEVANT_P (first_element)
3684             && ! STMT_VINFO_LIVE_P (first_element))
3685           ;
3686         else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3687                                               slp_inst_kind_reduc_chain,
3688                                               max_tree_size, &limit))
3689           {
3690             /* Dissolve reduction chain group.  */
3691             stmt_vec_info vinfo = first_element;
3692             stmt_vec_info last = NULL;
3693             while (vinfo)
3694               {
3695                 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3696                 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3697                 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3698                 last = vinfo;
3699                 vinfo = next;
3700               }
3701             STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3702             /* It can be still vectorized as part of an SLP reduction.  */
3703             loop_vinfo->reductions.safe_push (last);
3704           }
3705
3706       /* Find SLP sequences starting from groups of reductions.  */
3707       if (loop_vinfo->reductions.length () > 1)
3708         vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3709                                    slp_inst_kind_reduc_group, max_tree_size,
3710                                    &limit);
3711     }
3712
3713   hash_set<slp_tree> visited_patterns;
3714   slp_tree_to_load_perm_map_t perm_cache;
3715   slp_compat_nodes_map_t compat_cache;
3716
3717   /* See if any patterns can be found in the SLP tree.  */
3718   bool pattern_found = false;
3719   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3720     pattern_found |= vect_match_slp_patterns (instance, vinfo,
3721                                               &visited_patterns, &perm_cache,
3722                                               &compat_cache);
3723
3724   /* If any were found optimize permutations of loads.  */
3725   if (pattern_found)
3726     {
3727       hash_map<slp_tree, slp_tree> load_map;
3728       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3729         {
3730           slp_tree root = SLP_INSTANCE_TREE (instance);
3731           optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3732                                         &load_map, root);
3733         }
3734     }
3735
3736
3737
3738   /* The map keeps a reference on SLP nodes built, release that.  */
3739   for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3740        it != bst_map->end (); ++it)
3741     if ((*it).second)
3742       vect_free_slp_tree ((*it).second);
3743   delete bst_map;
3744
3745   if (pattern_found && dump_enabled_p ())
3746     {
3747       dump_printf_loc (MSG_NOTE, vect_location,
3748                        "Pattern matched SLP tree\n");
3749       hash_set<slp_tree> visited;
3750       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3751         vect_print_slp_graph (MSG_NOTE, vect_location,
3752                               SLP_INSTANCE_TREE (instance), visited);
3753     }
3754
3755   return opt_result::success ();
3756 }
3757
3758 /* Estimates the cost of inserting layout changes into the SLP graph.
3759    It can also say that the insertion is impossible.  */
3760
3761 struct slpg_layout_cost
3762 {
3763   slpg_layout_cost () = default;
3764   slpg_layout_cost (sreal, bool);
3765
3766   static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3767   bool is_possible () const { return depth != sreal::max (); }
3768
3769   bool operator== (const slpg_layout_cost &) const;
3770   bool operator!= (const slpg_layout_cost &) const;
3771
3772   bool is_better_than (const slpg_layout_cost &, bool) const;
3773
3774   void add_parallel_cost (const slpg_layout_cost &);
3775   void add_serial_cost (const slpg_layout_cost &);
3776   void split (unsigned int);
3777
3778   /* The longest sequence of layout changes needed during any traversal
3779      of the partition dag, weighted by execution frequency.
3780
3781      This is the most important metric when optimizing for speed, since
3782      it helps to ensure that we keep the number of operations on
3783      critical paths to a minimum.  */
3784   sreal depth = 0;
3785
3786   /* An estimate of the total number of operations needed.  It is weighted by
3787      execution frequency when optimizing for speed but not when optimizing for
3788      size.  In order to avoid double-counting, a node with a fanout of N will
3789      distribute 1/N of its total cost to each successor.
3790
3791      This is the most important metric when optimizing for size, since
3792      it helps to keep the total number of operations to a minimum,  */
3793   sreal total = 0;
3794 };
3795
3796 /* Construct costs for a node with weight WEIGHT.  A higher weight
3797    indicates more frequent execution.  IS_FOR_SIZE is true if we are
3798    optimizing for size rather than speed.  */
3799
3800 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3801   : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3802 {
3803 }
3804
3805 bool
3806 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3807 {
3808   return depth == other.depth && total == other.total;
3809 }
3810
3811 bool
3812 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3813 {
3814   return !operator== (other);
3815 }
3816
3817 /* Return true if these costs are better than OTHER.  IS_FOR_SIZE is
3818    true if we are optimizing for size rather than speed.  */
3819
3820 bool
3821 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3822                                   bool is_for_size) const
3823 {
3824   if (is_for_size)
3825     {
3826       if (total != other.total)
3827         return total < other.total;
3828       return depth < other.depth;
3829     }
3830   else
3831     {
3832       if (depth != other.depth)
3833         return depth < other.depth;
3834       return total < other.total;
3835     }
3836 }
3837
3838 /* Increase the costs to account for something with cost INPUT_COST
3839    happening in parallel with the current costs.  */
3840
3841 void
3842 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3843 {
3844   depth = std::max (depth, input_cost.depth);
3845   total += input_cost.total;
3846 }
3847
3848 /* Increase the costs to account for something with cost INPUT_COST
3849    happening in series with the current costs.  */
3850
3851 void
3852 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3853 {
3854   depth += other.depth;
3855   total += other.total;
3856 }
3857
3858 /* Split the total cost among TIMES successors or predecessors.  */
3859
3860 void
3861 slpg_layout_cost::split (unsigned int times)
3862 {
3863   if (times > 1)
3864     total /= times;
3865 }
3866
3867 /* Information about one node in the SLP graph, for use during
3868    vect_optimize_slp_pass.  */
3869
3870 struct slpg_vertex
3871 {
3872   slpg_vertex (slp_tree node_) : node (node_) {}
3873
3874   /* The node itself.  */
3875   slp_tree node;
3876
3877   /* Which partition the node belongs to, or -1 if none.  Nodes outside of
3878      partitions are flexible; they can have whichever layout consumers
3879      want them to have.  */
3880   int partition = -1;
3881
3882   /* The number of nodes that directly use the result of this one
3883      (i.e. the number of nodes that count this one as a child).  */
3884   unsigned int out_degree = 0;
3885
3886   /* The execution frequency of the node.  */
3887   sreal weight = 0;
3888
3889   /* The total execution frequency of all nodes that directly use the
3890      result of this one.  */
3891   sreal out_weight = 0;
3892 };
3893
3894 /* Information about one partition of the SLP graph, for use during
3895    vect_optimize_slp_pass.  */
3896
3897 struct slpg_partition_info
3898 {
3899   /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3900      of m_partitioned_nodes.  */
3901   unsigned int node_begin = 0;
3902   unsigned int node_end = 0;
3903
3904   /* Which layout we've chosen to use for this partition, or -1 if
3905      we haven't picked one yet.  */
3906   int layout = -1;
3907
3908   /* The number of predecessors and successors in the partition dag.
3909      The predecessors always have lower partition numbers and the
3910      successors always have higher partition numbers.
3911
3912      Note that the directions of these edges are not necessarily the
3913      same as in the data flow graph.  For example, if an SCC has separate
3914      partitions for an inner loop and an outer loop, the inner loop's
3915      partition will have at least two incoming edges from the outer loop's
3916      partition: one for a live-in value and one for a live-out value.
3917      In data flow terms, one of these edges would also be from the outer loop
3918      to the inner loop, but the other would be in the opposite direction.  */
3919   unsigned int in_degree = 0;
3920   unsigned int out_degree = 0;
3921 };
3922
3923 /* Information about the costs of using a particular layout for a
3924    particular partition.  It can also say that the combination is
3925    impossible.  */
3926
3927 struct slpg_partition_layout_costs
3928 {
3929   bool is_possible () const { return internal_cost.is_possible (); }
3930   void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3931
3932   /* The costs inherited from predecessor partitions.  */
3933   slpg_layout_cost in_cost;
3934
3935   /* The inherent cost of the layout within the node itself.  For example,
3936      this is nonzero for a load if choosing a particular layout would require
3937      the load to permute the loaded elements.  It is nonzero for a
3938      VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3939      to full-vector moves.  */
3940   slpg_layout_cost internal_cost;
3941
3942   /* The costs inherited from successor partitions.  */
3943   slpg_layout_cost out_cost;
3944 };
3945
3946 /* This class tries to optimize the layout of vectors in order to avoid
3947    unnecessary shuffling.  At the moment, the set of possible layouts are
3948    restricted to bijective permutations.
3949
3950    The goal of the pass depends on whether we're optimizing for size or
3951    for speed.  When optimizing for size, the goal is to reduce the overall
3952    number of layout changes (including layout changes implied by things
3953    like load permutations).  When optimizing for speed, the goal is to
3954    reduce the maximum latency attributable to layout changes on any
3955    non-cyclical path through the data flow graph.
3956
3957    For example, when optimizing a loop nest for speed, we will prefer
3958    to make layout changes outside of a loop rather than inside of a loop,
3959    and will prefer to make layout changes in parallel rather than serially,
3960    even if that increases the overall number of layout changes.
3961
3962    The high-level procedure is:
3963
3964    (1) Build a graph in which edges go from uses (parents) to definitions
3965        (children).
3966
3967    (2) Divide the graph into a dag of strongly-connected components (SCCs).
3968
3969    (3) When optimizing for speed, partition the nodes in each SCC based
3970        on their containing cfg loop.  When optimizing for size, treat
3971        each SCC as a single partition.
3972
3973        This gives us a dag of partitions.  The goal is now to assign a
3974        layout to each partition.
3975
3976    (4) Construct a set of vector layouts that are worth considering.
3977        Record which nodes must keep their current layout.
3978
3979    (5) Perform a forward walk over the partition dag (from loads to stores)
3980        accumulating the "forward" cost of using each layout.  When visiting
3981        each partition, assign a tentative choice of layout to the partition
3982        and use that choice when calculating the cost of using a different
3983        layout in successor partitions.
3984
3985    (6) Perform a backward walk over the partition dag (from stores to loads),
3986        accumulating the "backward" cost of using each layout.  When visiting
3987        each partition, make a final choice of layout for that partition based
3988        on the accumulated forward costs (from (5)) and backward costs
3989        (from (6)).
3990
3991    (7) Apply the chosen layouts to the SLP graph.
3992
3993    For example, consider the SLP statements:
3994
3995    S1:      a_1 = load
3996        loop:
3997    S2:      a_2 = PHI<a_1, a_3>
3998    S3:      b_1 = load
3999    S4:      a_3 = a_2 + b_1
4000        exit:
4001    S5:      a_4 = PHI<a_3>
4002    S6:      store a_4
4003
4004    S2 and S4 form an SCC and are part of the same loop.  Every other
4005    statement is in a singleton SCC.  In this example there is a one-to-one
4006    mapping between SCCs and partitions and the partition dag looks like this;
4007
4008         S1     S3
4009          \     /
4010           S2+S4
4011             |
4012            S5
4013             |
4014            S6
4015
4016    S2, S3 and S4 will have a higher execution frequency than the other
4017    statements, so when optimizing for speed, the goal is to avoid any
4018    layout changes:
4019
4020    - within S3
4021    - within S2+S4
4022    - on the S3->S2+S4 edge
4023
4024    For example, if S3 was originally a reversing load, the goal of the
4025    pass is to make it an unreversed load and change the layout on the
4026    S1->S2+S4 and S2+S4->S5 edges to compensate.  (Changing the layout
4027    on S1->S2+S4 and S5->S6 would also be acceptable.)
4028
4029    The difference between SCCs and partitions becomes important if we
4030    add an outer loop:
4031
4032    S1:      a_1 = ...
4033        loop1:
4034    S2:      a_2 = PHI<a_1, a_6>
4035    S3:      b_1 = load
4036    S4:      a_3 = a_2 + b_1
4037        loop2:
4038    S5:      a_4 = PHI<a_3, a_5>
4039    S6:      c_1 = load
4040    S7:      a_5 = a_4 + c_1
4041        exit2:
4042    S8:      a_6 = PHI<a_5>
4043    S9:      store a_6
4044        exit1:
4045
4046    Here, S2, S4, S5, S7 and S8 form a single SCC.  However, when optimizing
4047    for speed, we usually do not want restrictions in the outer loop to "infect"
4048    the decision for the inner loop.  For example, if an outer-loop node
4049    in the SCC contains a statement with a fixed layout, that should not
4050    prevent the inner loop from using a different layout.  Conversely,
4051    the inner loop should not dictate a layout to the outer loop: if the
4052    outer loop does a lot of computation, then it may not be efficient to
4053    do all of that computation in the inner loop's preferred layout.
4054
4055    So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
4056    and S5+S7 (inner).  We also try to arrange partitions so that:
4057
4058    - the partition for an outer loop comes before the partition for
4059      an inner loop
4060
4061    - if a sibling loop A dominates a sibling loop B, A's partition
4062      comes before B's
4063
4064    This gives the following partition dag for the example above:
4065
4066         S1        S3
4067          \        /
4068           S2+S4+S8   S6
4069            |   \\    /
4070            |    S5+S7
4071            |
4072           S9
4073
4074    There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
4075    one for a reversal of the edge S7->S8.
4076
4077    The backward walk picks a layout for S5+S7 before S2+S4+S8.  The choice
4078    for S2+S4+S8 therefore has to balance the cost of using the outer loop's
4079    preferred layout against the cost of changing the layout on entry to the
4080    inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
4081
4082    Although this works well when optimizing for speed, it has the downside
4083    when optimizing for size that the choice of layout for S5+S7 is completely
4084    independent of S9, which lessens the chance of reducing the overall number
4085    of permutations.  We therefore do not partition SCCs when optimizing
4086    for size.
4087
4088    To give a concrete example of the difference between optimizing
4089    for size and speed, consider:
4090
4091    a[0] = (b[1] << c[3]) - d[1];
4092    a[1] = (b[0] << c[2]) - d[0];
4093    a[2] = (b[3] << c[1]) - d[3];
4094    a[3] = (b[2] << c[0]) - d[2];
4095
4096    There are three different layouts here: one for a, one for b and d,
4097    and one for c.  When optimizing for speed it is better to permute each
4098    of b, c and d into the order required by a, since those permutations
4099    happen in parallel.  But when optimizing for size, it is better to:
4100
4101    - permute c into the same order as b
4102    - do the arithmetic
4103    - permute the result into the order required by a
4104
4105    This gives 2 permutations rather than 3.  */
4106
4107 class vect_optimize_slp_pass
4108 {
4109 public:
4110   vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
4111   void run ();
4112
4113 private:
4114   /* Graph building.  */
4115   struct loop *containing_loop (slp_tree);
4116   bool is_cfg_latch_edge (graph_edge *);
4117   void build_vertices (hash_set<slp_tree> &, slp_tree);
4118   void build_vertices ();
4119   void build_graph ();
4120
4121   /* Partitioning.  */
4122   void create_partitions ();
4123   template<typename T> void for_each_partition_edge (unsigned int, T);
4124
4125   /* Layout selection.  */
4126   bool is_compatible_layout (slp_tree, unsigned int);
4127   int change_layout_cost (slp_tree, unsigned int, unsigned int);
4128   slpg_partition_layout_costs &partition_layout_costs (unsigned int,
4129                                                        unsigned int);
4130   void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4131                                int, unsigned int);
4132   int internal_node_cost (slp_tree, int, unsigned int);
4133   void start_choosing_layouts ();
4134
4135   /* Cost propagation.  */
4136   slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4137                                      unsigned int, unsigned int);
4138   slpg_layout_cost total_in_cost (unsigned int);
4139   slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4140   slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4141   void forward_pass ();
4142   void backward_pass ();
4143
4144   /* Rematerialization.  */
4145   slp_tree get_result_with_layout (slp_tree, unsigned int);
4146   void materialize ();
4147
4148   /* Clean-up.  */
4149   void remove_redundant_permutations ();
4150
4151   void dump ();
4152
4153   vec_info *m_vinfo;
4154
4155   /* True if we should optimize the graph for size, false if we should
4156      optimize it for speed.  (It wouldn't be easy to make this decision
4157      more locally.)  */
4158   bool m_optimize_size;
4159
4160   /* A graph of all SLP nodes, with edges leading from uses to definitions.
4161      In other words, a node's predecessors are its slp_tree parents and
4162      a node's successors are its slp_tree children.  */
4163   graph *m_slpg = nullptr;
4164
4165   /* The vertices of M_SLPG, indexed by slp_tree::vertex.  */
4166   auto_vec<slpg_vertex> m_vertices;
4167
4168   /* The list of all leaves of M_SLPG. such as external definitions, constants,
4169      and loads.  */
4170   auto_vec<int> m_leafs;
4171
4172   /* This array has one entry for every vector layout that we're considering.
4173      Element 0 is null and indicates "no change".  Other entries describe
4174      permutations that are inherent in the current graph and that we would
4175      like to reverse if possible.
4176
4177      For example, a permutation { 1, 2, 3, 0 } means that something has
4178      effectively been permuted in that way, such as a load group
4179      { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4180      We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4181      in order to put things "back" in order.  */
4182   auto_vec<vec<unsigned> > m_perms;
4183
4184   /* A partitioning of the nodes for which a layout must be chosen.
4185      Each partition represents an <SCC, cfg loop> pair; that is,
4186      nodes in different SCCs belong to different partitions, and nodes
4187      within an SCC can be further partitioned according to a containing
4188      cfg loop.  Partition <SCC1, L1> comes before <SCC2, L2> if:
4189
4190      - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4191        from leaves (such as loads) to roots (such as stores).
4192
4193      - SCC1 == SCC2 and L1's header strictly dominates L2's header.  */
4194   auto_vec<slpg_partition_info> m_partitions;
4195
4196   /* The list of all nodes for which a layout must be chosen.  Nodes for
4197      partition P come before the nodes for partition P+1.  Nodes within a
4198      partition are in reverse postorder.  */
4199   auto_vec<unsigned int> m_partitioned_nodes;
4200
4201   /* Index P * num-layouts + L contains the cost of using layout L
4202      for partition P.  */
4203   auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4204
4205   /* Index N * num-layouts + L, if nonnull, is a node that provides the
4206      original output of node N adjusted to have layout L.  */
4207   auto_vec<slp_tree> m_node_layouts;
4208 };
4209
4210 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4211    Also record whether we should optimize anything for speed rather
4212    than size.  */
4213
4214 void
4215 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4216                                         slp_tree node)
4217 {
4218   unsigned i;
4219   slp_tree child;
4220
4221   if (visited.add (node))
4222     return;
4223
4224   if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4225     {
4226       basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4227       if (optimize_bb_for_speed_p (bb))
4228         m_optimize_size = false;
4229     }
4230
4231   node->vertex = m_vertices.length ();
4232   m_vertices.safe_push (slpg_vertex (node));
4233
4234   bool leaf = true;
4235   bool force_leaf = false;
4236   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4237     if (child)
4238       {
4239         leaf = false;
4240         build_vertices (visited, child);
4241       }
4242     else
4243       force_leaf = true;
4244   /* Since SLP discovery works along use-def edges all cycles have an
4245      entry - but there's the exception of cycles where we do not handle
4246      the entry explicitely (but with a NULL SLP node), like some reductions
4247      and inductions.  Force those SLP PHIs to act as leafs to make them
4248      backwards reachable.  */
4249   if (leaf || force_leaf)
4250     m_leafs.safe_push (node->vertex);
4251 }
4252
4253 /* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
4254
4255 void
4256 vect_optimize_slp_pass::build_vertices ()
4257 {
4258   hash_set<slp_tree> visited;
4259   unsigned i;
4260   slp_instance instance;
4261   FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4262     build_vertices (visited, SLP_INSTANCE_TREE (instance));
4263 }
4264
4265 /* Apply (reverse) bijectite PERM to VEC.  */
4266
4267 template <class T>
4268 static void
4269 vect_slp_permute (vec<unsigned> perm,
4270                   vec<T> &vec, bool reverse)
4271 {
4272   auto_vec<T, 64> saved;
4273   saved.create (vec.length ());
4274   for (unsigned i = 0; i < vec.length (); ++i)
4275     saved.quick_push (vec[i]);
4276
4277   if (reverse)
4278     {
4279       for (unsigned i = 0; i < vec.length (); ++i)
4280         vec[perm[i]] = saved[i];
4281       for (unsigned i = 0; i < vec.length (); ++i)
4282         gcc_assert (vec[perm[i]] == saved[i]);
4283     }
4284   else
4285     {
4286       for (unsigned i = 0; i < vec.length (); ++i)
4287         vec[i] = saved[perm[i]];
4288       for (unsigned i = 0; i < vec.length (); ++i)
4289         gcc_assert (vec[i] == saved[perm[i]]);
4290     }
4291 }
4292
4293 /* Return the cfg loop that contains NODE.  */
4294
4295 struct loop *
4296 vect_optimize_slp_pass::containing_loop (slp_tree node)
4297 {
4298   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4299   if (!rep)
4300     return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4301   return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4302 }
4303
4304 /* Return true if UD (an edge from a use to a definition) is associated
4305    with a loop latch edge in the cfg.  */
4306
4307 bool
4308 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4309 {
4310   slp_tree use = m_vertices[ud->src].node;
4311   slp_tree def = m_vertices[ud->dest].node;
4312   if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
4313        || SLP_TREE_CODE (use) == VEC_PERM_EXPR)
4314       || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4315     return false;
4316
4317   stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4318   return (is_a<gphi *> (use_rep->stmt)
4319           && bb_loop_header_p (gimple_bb (use_rep->stmt))
4320           && containing_loop (def) == containing_loop (use));
4321 }
4322
4323 /* Build the graph.  Mark edges that correspond to cfg loop latch edges with
4324    a nonnull data field.  */
4325
4326 void
4327 vect_optimize_slp_pass::build_graph ()
4328 {
4329   m_optimize_size = true;
4330   build_vertices ();
4331
4332   m_slpg = new_graph (m_vertices.length ());
4333   for (slpg_vertex &v : m_vertices)
4334     for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4335       if (child)
4336         {
4337           graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4338           if (is_cfg_latch_edge (ud))
4339             ud->data = this;
4340         }
4341 }
4342
4343 /* Return true if E corresponds to a loop latch edge in the cfg.  */
4344
4345 static bool
4346 skip_cfg_latch_edges (graph_edge *e)
4347 {
4348   return e->data;
4349 }
4350
4351 /* Create the node partitions.  */
4352
4353 void
4354 vect_optimize_slp_pass::create_partitions ()
4355 {
4356   /* Calculate a postorder of the graph, ignoring edges that correspond
4357      to natural latch edges in the cfg.  Reading the vector from the end
4358      to the beginning gives the reverse postorder.  */
4359   auto_vec<int> initial_rpo;
4360   graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4361                false, NULL, skip_cfg_latch_edges);
4362   gcc_assert (initial_rpo.length () == m_vertices.length ());
4363
4364   /* Calculate the strongly connected components of the graph.  */
4365   auto_vec<int> scc_grouping;
4366   unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4367
4368   /* Create a new index order in which all nodes from the same SCC are
4369      consecutive.  Use scc_pos to record the index of the first node in
4370      each SCC.  */
4371   auto_vec<unsigned int> scc_pos (num_sccs);
4372   int last_component = -1;
4373   unsigned int node_count = 0;
4374   for (unsigned int node_i : scc_grouping)
4375     {
4376       if (last_component != m_slpg->vertices[node_i].component)
4377         {
4378           last_component = m_slpg->vertices[node_i].component;
4379           gcc_assert (last_component == int (scc_pos.length ()));
4380           scc_pos.quick_push (node_count);
4381         }
4382       node_count += 1;
4383     }
4384   gcc_assert (node_count == initial_rpo.length ()
4385               && last_component + 1 == int (num_sccs));
4386
4387   /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4388      inside each SCC following the RPO we calculated above.  The fact that
4389      we ignored natural latch edges when calculating the RPO should ensure
4390      that, for natural loop nests:
4391
4392      - the first node that we encounter in a cfg loop is the loop header phi
4393      - the loop header phis are in dominance order
4394
4395      Arranging for this is an optimization (see below) rather than a
4396      correctness issue.  Unnatural loops with a tangled mess of backedges
4397      will still work correctly, but might give poorer results.
4398
4399      Also update scc_pos so that it gives 1 + the index of the last node
4400      in the SCC.  */
4401   m_partitioned_nodes.safe_grow (node_count);
4402   for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4403     {
4404       unsigned int node_i = initial_rpo[old_i];
4405       unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4406       m_partitioned_nodes[new_i] = node_i;
4407     }
4408
4409   /* When optimizing for speed, partition each SCC based on the containing
4410      cfg loop. The order we constructed above should ensure that, for natural
4411      cfg loops, we'll create sub-SCC partitions for outer loops before
4412      the corresponding sub-SCC partitions for inner loops.  Similarly,
4413      when one sibling loop A dominates another sibling loop B, we should
4414      create a sub-SCC partition for A before a sub-SCC partition for B.
4415
4416      As above, nothing depends for correctness on whether this achieves
4417      a natural nesting, but we should get better results when it does.  */
4418   m_partitions.reserve (m_vertices.length ());
4419   unsigned int next_partition_i = 0;
4420   hash_map<struct loop *, int> loop_partitions;
4421   unsigned int rpo_begin = 0;
4422   unsigned int num_partitioned_nodes = 0;
4423   for (unsigned int rpo_end : scc_pos)
4424     {
4425       loop_partitions.empty ();
4426       unsigned int partition_i = next_partition_i;
4427       for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4428         {
4429           /* Handle externals and constants optimistically throughout.
4430              But treat existing vectors as fixed since we do not handle
4431              permuting them.  */
4432           unsigned int node_i = m_partitioned_nodes[rpo_i];
4433           auto &vertex = m_vertices[node_i];
4434           if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4435                && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4436               || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4437             vertex.partition = -1;
4438           else
4439             {
4440               bool existed;
4441               if (m_optimize_size)
4442                 existed = next_partition_i > partition_i;
4443               else
4444                 {
4445                   struct loop *loop = containing_loop (vertex.node);
4446                   auto &entry = loop_partitions.get_or_insert (loop, &existed);
4447                   if (!existed)
4448                     entry = next_partition_i;
4449                   partition_i = entry;
4450                 }
4451               if (!existed)
4452                 {
4453                   m_partitions.quick_push (slpg_partition_info ());
4454                   next_partition_i += 1;
4455                 }
4456               vertex.partition = partition_i;
4457               num_partitioned_nodes += 1;
4458               m_partitions[partition_i].node_end += 1;
4459             }
4460         }
4461       rpo_begin = rpo_end;
4462     }
4463
4464   /* Assign ranges of consecutive node indices to each partition,
4465      in partition order.  Start with node_end being the same as
4466      node_begin so that the next loop can use it as a counter.  */
4467   unsigned int node_begin = 0;
4468   for (auto &partition : m_partitions)
4469     {
4470       partition.node_begin = node_begin;
4471       node_begin += partition.node_end;
4472       partition.node_end = partition.node_begin;
4473     }
4474   gcc_assert (node_begin == num_partitioned_nodes);
4475
4476   /* Finally build the list of nodes in partition order.  */
4477   m_partitioned_nodes.truncate (num_partitioned_nodes);
4478   for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4479     {
4480       int partition_i = m_vertices[node_i].partition;
4481       if (partition_i >= 0)
4482         {
4483           unsigned int order_i = m_partitions[partition_i].node_end++;
4484           m_partitioned_nodes[order_i] = node_i;
4485         }
4486     }
4487 }
4488
4489 /* Look for edges from earlier partitions into node NODE_I and edges from
4490    node NODE_I into later partitions.  Call:
4491
4492       FN (ud, other_node_i)
4493
4494    for each such use-to-def edge ud, where other_node_i is the node at the
4495    other end of the edge.  */
4496
4497 template<typename T>
4498 void
4499 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4500 {
4501   int partition_i = m_vertices[node_i].partition;
4502   for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4503        pred; pred = pred->pred_next)
4504     {
4505       int src_partition_i = m_vertices[pred->src].partition;
4506       if (src_partition_i >= 0 && src_partition_i != partition_i)
4507         fn (pred, pred->src);
4508     }
4509   for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4510        succ; succ = succ->succ_next)
4511     {
4512       int dest_partition_i = m_vertices[succ->dest].partition;
4513       if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4514         fn (succ, succ->dest);
4515     }
4516 }
4517
4518 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4519    that NODE would operate on.  This test is independent of NODE's actual
4520    operation.  */
4521
4522 bool
4523 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4524                                               unsigned int layout_i)
4525 {
4526   if (layout_i == 0)
4527     return true;
4528
4529   if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4530     return false;
4531
4532   return true;
4533 }
4534
4535 /* Return the cost (in arbtirary units) of going from layout FROM_LAYOUT_I
4536    to layout TO_LAYOUT_I for a node like NODE.  Return -1 if either of the
4537    layouts is incompatible with NODE or if the change is not possible for
4538    some other reason.
4539
4540    The properties taken from NODE include the number of lanes and the
4541    vector type.  The actual operation doesn't matter.  */
4542
4543 int
4544 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4545                                             unsigned int from_layout_i,
4546                                             unsigned int to_layout_i)
4547 {
4548   if (!is_compatible_layout (node, from_layout_i)
4549       || !is_compatible_layout (node, to_layout_i))
4550     return -1;
4551
4552   if (from_layout_i == to_layout_i)
4553     return 0;
4554
4555   auto_vec<slp_tree, 1> children (1);
4556   children.quick_push (node);
4557   auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4558   if (from_layout_i > 0)
4559     for (unsigned int i : m_perms[from_layout_i])
4560       perm.quick_push ({ 0, i });
4561   else
4562     for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4563       perm.quick_push ({ 0, i });
4564   if (to_layout_i > 0)
4565     vect_slp_permute (m_perms[to_layout_i], perm, true);
4566   auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4567                                                children, false);
4568   if (count >= 0)
4569     return MAX (count, 1);
4570
4571   /* ??? In principle we could try changing via layout 0, giving two
4572      layout changes rather than 1.  Doing that would require
4573      corresponding support in get_result_with_layout.  */
4574   return -1;
4575 }
4576
4577 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I.  */
4578
4579 inline slpg_partition_layout_costs &
4580 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4581                                                 unsigned int layout_i)
4582 {
4583   return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4584 }
4585
4586 /* Change PERM in one of two ways:
4587
4588    - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4589      chosen for child I of NODE.
4590
4591    - if IN_LAYOUT >= 0, accept all inputs operands with that layout.
4592
4593    In both cases, arrange for the output to have layout OUT_LAYOUT_I  */
4594
4595 void
4596 vect_optimize_slp_pass::
4597 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4598                         int in_layout_i, unsigned int out_layout_i)
4599 {
4600   for (auto &entry : perm)
4601     {
4602       int this_in_layout_i = in_layout_i;
4603       if (this_in_layout_i < 0)
4604         {
4605           slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4606           unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4607           this_in_layout_i = m_partitions[in_partition_i].layout;
4608         }
4609       if (this_in_layout_i > 0)
4610         entry.second = m_perms[this_in_layout_i][entry.second];
4611     }
4612   if (out_layout_i > 0)
4613     vect_slp_permute (m_perms[out_layout_i], perm, true);
4614 }
4615
4616 /* Check whether the target allows NODE to be rearranged so that the node's
4617    output has layout OUT_LAYOUT_I.  Return the cost of the change if so,
4618    in the same arbitrary units as for change_layout_cost.  Return -1 otherwise.
4619
4620    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4621    NODE can adapt to the layout changes that have (perhaps provisionally)
4622    been chosen for NODE's children, so that no extra permutations are
4623    needed on either the input or the output of NODE.
4624
4625    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4626    that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4627
4628    IN_LAYOUT_I has no meaning for other types of node.
4629
4630    Keeping the node as-is is always valid.  If the target doesn't appear
4631    to support the node as-is, but might realistically support other layouts,
4632    then layout 0 instead has the cost of a worst-case permutation.  On the
4633    one hand, this ensures that every node has at least one valid layout,
4634    avoiding what would otherwise be an awkward special case.  On the other,
4635    it still encourages the pass to change an invalid pre-existing layout
4636    choice into a valid one.  */
4637
4638 int
4639 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4640                                             unsigned int out_layout_i)
4641 {
4642   const int fallback_cost = 1;
4643
4644   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4645     {
4646       auto_lane_permutation_t tmp_perm;
4647       tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4648
4649       /* Check that the child nodes support the chosen layout.  Checking
4650          the first child is enough, since any second child would have the
4651          same shape.  */
4652       auto first_child = SLP_TREE_CHILDREN (node)[0];
4653       if (in_layout_i > 0
4654           && !is_compatible_layout (first_child, in_layout_i))
4655         return -1;
4656
4657       change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4658       int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4659                                                   node, tmp_perm,
4660                                                   SLP_TREE_CHILDREN (node),
4661                                                   false);
4662       if (count < 0)
4663         {
4664           if (in_layout_i == 0 && out_layout_i == 0)
4665             {
4666               /* Use the fallback cost if the node could in principle support
4667                  some nonzero layout for both the inputs and the outputs.
4668                  Otherwise assume that the node will be rejected later
4669                  and rebuilt from scalars.  */
4670               if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4671                 return fallback_cost;
4672               return 0;
4673             }
4674           return -1;
4675         }
4676
4677       /* We currently have no way of telling whether the new layout is cheaper
4678          or more expensive than the old one.  But at least in principle,
4679          it should be worth making zero permutations (whole-vector shuffles)
4680          cheaper than real permutations, in case the pass is able to remove
4681          the latter.  */
4682       return count == 0 ? 0 : 1;
4683     }
4684
4685   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4686   if (rep
4687       && STMT_VINFO_DATA_REF (rep)
4688       && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4689       && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4690     {
4691       auto_load_permutation_t tmp_perm;
4692       tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4693       if (out_layout_i > 0)
4694         vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4695
4696       poly_uint64 vf = 1;
4697       if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4698         vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4699       unsigned int n_perms;
4700       if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4701                                            nullptr, vf, true, false, &n_perms))
4702         {
4703           auto rep = SLP_TREE_REPRESENTATIVE (node);
4704           if (out_layout_i == 0)
4705             {
4706               /* Use the fallback cost if the load is an N-to-N permutation.
4707                  Otherwise assume that the node will be rejected later
4708                  and rebuilt from scalars.  */
4709               if (STMT_VINFO_GROUPED_ACCESS (rep)
4710                   && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4711                       == SLP_TREE_LANES (node)))
4712                 return fallback_cost;
4713               return 0;
4714             }
4715           return -1;
4716         }
4717
4718       /* See the comment above the corresponding VEC_PERM_EXPR handling.  */
4719       return n_perms == 0 ? 0 : 1;
4720     }
4721
4722   return 0;
4723 }
4724
4725 /* Decide which element layouts we should consider using.  Calculate the
4726    weights associated with inserting layout changes on partition edges.
4727    Also mark partitions that cannot change layout, by setting their
4728    layout to zero.  */
4729
4730 void
4731 vect_optimize_slp_pass::start_choosing_layouts ()
4732 {
4733   /* Used to assign unique permutation indices.  */
4734   using perm_hash = unbounded_hashmap_traits<
4735     vec_free_hash_base<int_hash_base<unsigned>>,
4736     int_hash<int, -1, -2>
4737   >;
4738   hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4739
4740   /* Layout 0 is "no change".  */
4741   m_perms.safe_push (vNULL);
4742
4743   /* Create layouts from existing permutations.  */
4744   auto_load_permutation_t tmp_perm;
4745   for (unsigned int node_i : m_partitioned_nodes)
4746     {
4747       /* Leafs also double as entries to the reverse graph.  Allow the
4748          layout of those to be changed.  */
4749       auto &vertex = m_vertices[node_i];
4750       auto &partition = m_partitions[vertex.partition];
4751       if (!m_slpg->vertices[node_i].succ)
4752         partition.layout = 0;
4753
4754       /* Loads and VEC_PERM_EXPRs are the only things generating permutes.  */
4755       slp_tree node = vertex.node;
4756       stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4757       slp_tree child;
4758       unsigned HOST_WIDE_INT imin, imax = 0;
4759       bool any_permute = false;
4760       tmp_perm.truncate (0);
4761       if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4762         {
4763           /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4764              unpermuted, record a layout that reverses this permutation.
4765
4766              We would need more work to cope with loads that are internally
4767              permuted and also have inputs (such as masks for
4768              IFN_MASK_LOADs).  */
4769           gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4770           if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4771             {
4772               partition.layout = -1;
4773               continue;
4774             }
4775           dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4776           imin = DR_GROUP_SIZE (dr_stmt) + 1;
4777           tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4778         }
4779       else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4780                && SLP_TREE_CHILDREN (node).length () == 1
4781                && (child = SLP_TREE_CHILDREN (node)[0])
4782                && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4783                    .is_constant (&imin)))
4784         {
4785           /* If the child has the same vector size as this node,
4786              reversing the permutation can make the permutation a no-op.
4787              In other cases it can change a true permutation into a
4788              full-vector extract.  */
4789           tmp_perm.reserve (SLP_TREE_LANES (node));
4790           for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4791             tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4792         }
4793       else
4794         continue;
4795
4796       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4797         {
4798           unsigned idx = tmp_perm[j];
4799           imin = MIN (imin, idx);
4800           imax = MAX (imax, idx);
4801           if (idx - tmp_perm[0] != j)
4802             any_permute = true;
4803         }
4804       /* If the span doesn't match we'd disrupt VF computation, avoid
4805          that for now.  */
4806       if (imax - imin + 1 != SLP_TREE_LANES (node))
4807         continue;
4808       /* If there's no permute no need to split one out.  In this case
4809          we can consider turning a load into a permuted load, if that
4810          turns out to be cheaper than alternatives.  */
4811       if (!any_permute)
4812         {
4813           partition.layout = -1;
4814           continue;
4815         }
4816
4817       /* For now only handle true permutes, like
4818          vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
4819          when permuting constants and invariants keeping the permute
4820          bijective.  */
4821       auto_sbitmap load_index (SLP_TREE_LANES (node));
4822       bitmap_clear (load_index);
4823       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4824         bitmap_set_bit (load_index, tmp_perm[j] - imin);
4825       unsigned j;
4826       for (j = 0; j < SLP_TREE_LANES (node); ++j)
4827         if (!bitmap_bit_p (load_index, j))
4828           break;
4829       if (j != SLP_TREE_LANES (node))
4830         continue;
4831
4832       vec<unsigned> perm = vNULL;
4833       perm.safe_grow (SLP_TREE_LANES (node), true);
4834       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4835         perm[j] = tmp_perm[j] - imin;
4836
4837       if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4838         {
4839           /* Continue to use existing layouts, but don't add any more.  */
4840           int *entry = layout_ids.get (perm);
4841           partition.layout = entry ? *entry : 0;
4842           perm.release ();
4843         }
4844       else
4845         {
4846           bool existed;
4847           int &layout_i = layout_ids.get_or_insert (perm, &existed);
4848           if (existed)
4849             perm.release ();
4850           else
4851             {
4852               layout_i = m_perms.length ();
4853               m_perms.safe_push (perm);
4854             }
4855           partition.layout = layout_i;
4856         }
4857     }
4858
4859   /* Initially assume that every layout is possible and has zero cost
4860      in every partition.  */
4861   m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4862                                               * m_perms.length ());
4863
4864   /* We have to mark outgoing permutations facing non-associating-reduction
4865      graph entries that are not represented as to be materialized.
4866      slp_inst_kind_bb_reduc currently only covers associatable reductions.  */
4867   for (slp_instance instance : m_vinfo->slp_instances)
4868     if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4869       {
4870         unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4871         m_partitions[m_vertices[node_i].partition].layout = 0;
4872       }
4873     else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4874       {
4875         stmt_vec_info stmt_info
4876           = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4877         stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4878         if (needs_fold_left_reduction_p (TREE_TYPE
4879                                            (gimple_get_lhs (stmt_info->stmt)),
4880                                          STMT_VINFO_REDUC_CODE (reduc_info)))
4881           {
4882             unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4883             m_partitions[m_vertices[node_i].partition].layout = 0;
4884           }
4885       }
4886
4887   /* Check which layouts each node and partition can handle.  Calculate the
4888      weights associated with inserting layout changes on edges.  */
4889   for (unsigned int node_i : m_partitioned_nodes)
4890     {
4891       auto &vertex = m_vertices[node_i];
4892       auto &partition = m_partitions[vertex.partition];
4893       slp_tree node = vertex.node;
4894
4895       if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4896         {
4897           vertex.weight = vect_slp_node_weight (node);
4898
4899           /* We do not handle stores with a permutation, so all
4900              incoming permutations must have been materialized.
4901
4902              We also don't handle masked grouped loads, which lack a
4903              permutation vector.  In this case the memory locations
4904              form an implicit second input to the loads, on top of the
4905              explicit mask input, and the memory input's layout cannot
4906              be changed.
4907
4908              On the other hand, we do support permuting gather loads and
4909              masked gather loads, where each scalar load is independent
4910              of the others.  This can be useful if the address/index input
4911              benefits from permutation.  */
4912           if (STMT_VINFO_DATA_REF (rep)
4913               && STMT_VINFO_GROUPED_ACCESS (rep)
4914               && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4915             partition.layout = 0;
4916
4917           /* We cannot change the layout of an operation that is
4918              not independent on lanes.  Note this is an explicit
4919              negative list since that's much shorter than the respective
4920              positive one but it's critical to keep maintaining it.  */
4921           if (is_gimple_call (STMT_VINFO_STMT (rep)))
4922             switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4923               {
4924               case CFN_COMPLEX_ADD_ROT90:
4925               case CFN_COMPLEX_ADD_ROT270:
4926               case CFN_COMPLEX_MUL:
4927               case CFN_COMPLEX_MUL_CONJ:
4928               case CFN_VEC_ADDSUB:
4929               case CFN_VEC_FMADDSUB:
4930               case CFN_VEC_FMSUBADD:
4931                 partition.layout = 0;
4932               default:;
4933               }
4934         }
4935
4936       auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4937         {
4938           auto &other_vertex = m_vertices[other_node_i];
4939
4940           /* Count the number of edges from earlier partitions and the number
4941              of edges to later partitions.  */
4942           if (other_vertex.partition < vertex.partition)
4943             partition.in_degree += 1;
4944           else
4945             partition.out_degree += 1;
4946
4947           /* If the current node uses the result of OTHER_NODE_I, accumulate
4948              the effects of that.  */
4949           if (ud->src == int (node_i))
4950             {
4951               other_vertex.out_weight += vertex.weight;
4952               other_vertex.out_degree += 1;
4953             }
4954         };
4955       for_each_partition_edge (node_i, process_edge);
4956     }
4957 }
4958
4959 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4960    its current (provisional) choice of layout.  The inputs do not necessarily
4961    have the same layout as each other.  */
4962
4963 slpg_layout_cost
4964 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4965 {
4966   auto &vertex = m_vertices[node_i];
4967   slpg_layout_cost cost;
4968   auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4969     {
4970       auto &other_vertex = m_vertices[other_node_i];
4971       if (other_vertex.partition < vertex.partition)
4972         {
4973           auto &other_partition = m_partitions[other_vertex.partition];
4974           auto &other_costs = partition_layout_costs (other_vertex.partition,
4975                                                       other_partition.layout);
4976           slpg_layout_cost this_cost = other_costs.in_cost;
4977           this_cost.add_serial_cost (other_costs.internal_cost);
4978           this_cost.split (other_partition.out_degree);
4979           cost.add_parallel_cost (this_cost);
4980         }
4981     };
4982   for_each_partition_edge (node_i, add_cost);
4983   return cost;
4984 }
4985
4986 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4987    and layout LAYOUT2_I on cross-partition use-to-def edge UD.  Return
4988    slpg_layout_cost::impossible () if the change isn't possible.  */
4989
4990 slpg_layout_cost
4991 vect_optimize_slp_pass::
4992 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4993                   unsigned int layout2_i)
4994 {
4995   auto &def_vertex = m_vertices[ud->dest];
4996   auto &use_vertex = m_vertices[ud->src];
4997   auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4998   auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4999   auto factor = change_layout_cost (def_vertex.node, def_layout_i,
5000                                     use_layout_i);
5001   if (factor < 0)
5002     return slpg_layout_cost::impossible ();
5003
5004   /* We have a choice of putting the layout change at the site of the
5005      definition or at the site of the use.  Prefer the former when
5006      optimizing for size or when the execution frequency of the
5007      definition is no greater than the combined execution frequencies of
5008      the uses.  When putting the layout change at the site of the definition,
5009      divvy up the cost among all consumers.  */
5010   if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
5011     {
5012       slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
5013       cost.split (def_vertex.out_degree);
5014       return cost;
5015     }
5016   return { use_vertex.weight * factor, m_optimize_size };
5017 }
5018
5019 /* UD represents a use-def link between FROM_NODE_I and a node in a later
5020    partition; FROM_NODE_I could be the definition node or the use node.
5021    The node at the other end of the link wants to use layout TO_LAYOUT_I.
5022    Return the cost of any necessary fix-ups on edge UD, or return
5023    slpg_layout_cost::impossible () if the change isn't possible.
5024
5025    At this point, FROM_NODE_I's partition has chosen the cheapest
5026    layout based on the information available so far, but this choice
5027    is only provisional.  */
5028
5029 slpg_layout_cost
5030 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
5031                                       unsigned int to_layout_i)
5032 {
5033   auto &from_vertex = m_vertices[from_node_i];
5034   unsigned int from_partition_i = from_vertex.partition;
5035   slpg_partition_info &from_partition = m_partitions[from_partition_i];
5036   gcc_assert (from_partition.layout >= 0);
5037
5038   /* First calculate the cost on the assumption that FROM_PARTITION sticks
5039      with its current layout preference.  */
5040   slpg_layout_cost cost = slpg_layout_cost::impossible ();
5041   auto edge_cost = edge_layout_cost (ud, from_node_i,
5042                                      from_partition.layout, to_layout_i);
5043   if (edge_cost.is_possible ())
5044     {
5045       auto &from_costs = partition_layout_costs (from_partition_i,
5046                                                  from_partition.layout);
5047       cost = from_costs.in_cost;
5048       cost.add_serial_cost (from_costs.internal_cost);
5049       cost.split (from_partition.out_degree);
5050       cost.add_serial_cost (edge_cost);
5051     }
5052   else if (from_partition.layout == 0)
5053     /* We must allow the source partition to have layout 0 as a fallback,
5054        in case all other options turn out to be impossible.  */
5055     return cost;
5056
5057   /* Take the minimum of that cost and the cost that applies if
5058      FROM_PARTITION instead switches to TO_LAYOUT_I.  */
5059   auto &direct_layout_costs = partition_layout_costs (from_partition_i,
5060                                                       to_layout_i);
5061   if (direct_layout_costs.is_possible ())
5062     {
5063       slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
5064       direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
5065       direct_cost.split (from_partition.out_degree);
5066       if (!cost.is_possible ()
5067           || direct_cost.is_better_than (cost, m_optimize_size))
5068         cost = direct_cost;
5069     }
5070
5071   return cost;
5072 }
5073
5074 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
5075    partition; TO_NODE_I could be the definition node or the use node.
5076    The node at the other end of the link wants to use layout FROM_LAYOUT_I;
5077    return the cost of any necessary fix-ups on edge UD, or
5078    slpg_layout_cost::impossible () if the choice cannot be made.
5079
5080    At this point, TO_NODE_I's partition has a fixed choice of layout.  */
5081
5082 slpg_layout_cost
5083 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
5084                                        unsigned int from_layout_i)
5085 {
5086   auto &to_vertex = m_vertices[to_node_i];
5087   unsigned int to_partition_i = to_vertex.partition;
5088   slpg_partition_info &to_partition = m_partitions[to_partition_i];
5089   gcc_assert (to_partition.layout >= 0);
5090
5091   /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
5092      adjusted for this input having layout FROM_LAYOUT_I.  Assume that
5093      any other inputs keep their current choice of layout.  */
5094   auto &to_costs = partition_layout_costs (to_partition_i,
5095                                            to_partition.layout);
5096   if (ud->src == int (to_node_i)
5097       && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
5098     {
5099       auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
5100       auto old_layout = from_partition.layout;
5101       from_partition.layout = from_layout_i;
5102       int factor = internal_node_cost (to_vertex.node, -1,
5103                                        to_partition.layout);
5104       from_partition.layout = old_layout;
5105       if (factor >= 0)
5106         {
5107           slpg_layout_cost cost = to_costs.out_cost;
5108           cost.add_serial_cost ({ to_vertex.weight * factor,
5109                                   m_optimize_size });
5110           cost.split (to_partition.in_degree);
5111           return cost;
5112         }
5113     }
5114
5115   /* Compute the cost if we insert any necessary layout change on edge UD.  */
5116   auto edge_cost = edge_layout_cost (ud, to_node_i,
5117                                      to_partition.layout, from_layout_i);
5118   if (edge_cost.is_possible ())
5119     {
5120       slpg_layout_cost cost = to_costs.out_cost;
5121       cost.add_serial_cost (to_costs.internal_cost);
5122       cost.split (to_partition.in_degree);
5123       cost.add_serial_cost (edge_cost);
5124       return cost;
5125     }
5126
5127   return slpg_layout_cost::impossible ();
5128 }
5129
5130 /* Make a forward pass through the partitions, accumulating input costs.
5131    Make a tentative (provisional) choice of layout for each partition,
5132    ensuring that this choice still allows later partitions to keep
5133    their original layout.  */
5134
5135 void
5136 vect_optimize_slp_pass::forward_pass ()
5137 {
5138   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5139        ++partition_i)
5140     {
5141       auto &partition = m_partitions[partition_i];
5142
5143       /* If the partition consists of a single VEC_PERM_EXPR, precompute
5144          the incoming cost that would apply if every predecessor partition
5145          keeps its current layout.  This is used within the loop below.  */
5146       slpg_layout_cost in_cost;
5147       slp_tree single_node = nullptr;
5148       if (partition.node_end == partition.node_begin + 1)
5149         {
5150           unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5151           single_node = m_vertices[node_i].node;
5152           if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5153             in_cost = total_in_cost (node_i);
5154         }
5155
5156       /* Go through the possible layouts.  Decide which ones are valid
5157          for this partition and record which of the valid layouts has
5158          the lowest cost.  */
5159       unsigned int min_layout_i = 0;
5160       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5161       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5162         {
5163           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5164           if (!layout_costs.is_possible ())
5165             continue;
5166
5167           /* If the recorded layout is already 0 then the layout cannot
5168              change.  */
5169           if (partition.layout == 0 && layout_i != 0)
5170             {
5171               layout_costs.mark_impossible ();
5172               continue;
5173             }
5174
5175           bool is_possible = true;
5176           for (unsigned int order_i = partition.node_begin;
5177                order_i < partition.node_end; ++order_i)
5178             {
5179               unsigned int node_i = m_partitioned_nodes[order_i];
5180               auto &vertex = m_vertices[node_i];
5181
5182               /* Reject the layout if it is individually incompatible
5183                  with any node in the partition.  */
5184               if (!is_compatible_layout (vertex.node, layout_i))
5185                 {
5186                   is_possible = false;
5187                   break;
5188                 }
5189
5190               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5191                 {
5192                   auto &other_vertex = m_vertices[other_node_i];
5193                   if (other_vertex.partition < vertex.partition)
5194                     {
5195                       /* Accumulate the incoming costs from earlier
5196                          partitions, plus the cost of any layout changes
5197                          on UD itself.  */
5198                       auto cost = forward_cost (ud, other_node_i, layout_i);
5199                       if (!cost.is_possible ())
5200                         is_possible = false;
5201                       else
5202                         layout_costs.in_cost.add_parallel_cost (cost);
5203                     }
5204                   else
5205                     /* Reject the layout if it would make layout 0 impossible
5206                        for later partitions.  This amounts to testing that the
5207                        target supports reversing the layout change on edges
5208                        to later partitions.
5209
5210                        In principle, it might be possible to push a layout
5211                        change all the way down a graph, so that it never
5212                        needs to be reversed and so that the target doesn't
5213                        need to support the reverse operation.  But it would
5214                        be awkward to bail out if we hit a partition that
5215                        does not support the new layout, especially since
5216                        we are not dealing with a lattice.  */
5217                     is_possible &= edge_layout_cost (ud, other_node_i, 0,
5218                                                      layout_i).is_possible ();
5219                 };
5220               for_each_partition_edge (node_i, add_cost);
5221
5222               /* Accumulate the cost of using LAYOUT_I within NODE,
5223                  both for the inputs and the outputs.  */
5224               int factor = internal_node_cost (vertex.node, layout_i,
5225                                                layout_i);
5226               if (factor < 0)
5227                 {
5228                   is_possible = false;
5229                   break;
5230                 }
5231               else if (factor)
5232                 layout_costs.internal_cost.add_serial_cost
5233                   ({ vertex.weight * factor, m_optimize_size });
5234             }
5235           if (!is_possible)
5236             {
5237               layout_costs.mark_impossible ();
5238               continue;
5239             }
5240
5241           /* Combine the incoming and partition-internal costs.  */
5242           slpg_layout_cost combined_cost = layout_costs.in_cost;
5243           combined_cost.add_serial_cost (layout_costs.internal_cost);
5244
5245           /* If this partition consists of a single VEC_PERM_EXPR, see
5246              if the VEC_PERM_EXPR can be changed to support output layout
5247              LAYOUT_I while keeping all the provisional choices of input
5248              layout.  */
5249           if (single_node
5250               && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5251             {
5252               int factor = internal_node_cost (single_node, -1, layout_i);
5253               if (factor >= 0)
5254                 {
5255                   auto weight = m_vertices[single_node->vertex].weight;
5256                   slpg_layout_cost internal_cost
5257                     = { weight * factor, m_optimize_size };
5258
5259                   slpg_layout_cost alt_cost = in_cost;
5260                   alt_cost.add_serial_cost (internal_cost);
5261                   if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5262                     {
5263                       combined_cost = alt_cost;
5264                       layout_costs.in_cost = in_cost;
5265                       layout_costs.internal_cost = internal_cost;
5266                     }
5267                 }
5268             }
5269
5270           /* Record the layout with the lowest cost.  Prefer layout 0 in
5271              the event of a tie between it and another layout.  */
5272           if (!min_layout_cost.is_possible ()
5273               || combined_cost.is_better_than (min_layout_cost,
5274                                                m_optimize_size))
5275             {
5276               min_layout_i = layout_i;
5277               min_layout_cost = combined_cost;
5278             }
5279         }
5280
5281       /* This loop's handling of earlier partitions should ensure that
5282          choosing the original layout for the current partition is no
5283          less valid than it was in the original graph, even with the
5284          provisional layout choices for those earlier partitions.  */
5285       gcc_assert (min_layout_cost.is_possible ());
5286       partition.layout = min_layout_i;
5287     }
5288 }
5289
5290 /* Make a backward pass through the partitions, accumulating output costs.
5291    Make a final choice of layout for each partition.  */
5292
5293 void
5294 vect_optimize_slp_pass::backward_pass ()
5295 {
5296   for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5297     {
5298       auto &partition = m_partitions[partition_i];
5299
5300       unsigned int min_layout_i = 0;
5301       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5302       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5303         {
5304           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5305           if (!layout_costs.is_possible ())
5306             continue;
5307
5308           /* Accumulate the costs from successor partitions.  */
5309           bool is_possible = true;
5310           for (unsigned int order_i = partition.node_begin;
5311                order_i < partition.node_end; ++order_i)
5312             {
5313               unsigned int node_i = m_partitioned_nodes[order_i];
5314               auto &vertex = m_vertices[node_i];
5315               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5316                 {
5317                   auto &other_vertex = m_vertices[other_node_i];
5318                   auto &other_partition = m_partitions[other_vertex.partition];
5319                   if (other_vertex.partition > vertex.partition)
5320                     {
5321                       /* Accumulate the incoming costs from later
5322                          partitions, plus the cost of any layout changes
5323                          on UD itself.  */
5324                       auto cost = backward_cost (ud, other_node_i, layout_i);
5325                       if (!cost.is_possible ())
5326                         is_possible = false;
5327                       else
5328                         layout_costs.out_cost.add_parallel_cost (cost);
5329                     }
5330                   else
5331                     /* Make sure that earlier partitions can (if necessary
5332                        or beneficial) keep the layout that they chose in
5333                        the forward pass.  This ensures that there is at
5334                        least one valid choice of layout.  */
5335                     is_possible &= edge_layout_cost (ud, other_node_i,
5336                                                      other_partition.layout,
5337                                                      layout_i).is_possible ();
5338                 };
5339               for_each_partition_edge (node_i, add_cost);
5340             }
5341           if (!is_possible)
5342             {
5343               layout_costs.mark_impossible ();
5344               continue;
5345             }
5346
5347           /* Locally combine the costs from the forward and backward passes.
5348              (This combined cost is not passed on, since that would lead
5349              to double counting.)  */
5350           slpg_layout_cost combined_cost = layout_costs.in_cost;
5351           combined_cost.add_serial_cost (layout_costs.internal_cost);
5352           combined_cost.add_serial_cost (layout_costs.out_cost);
5353
5354           /* Record the layout with the lowest cost.  Prefer layout 0 in
5355              the event of a tie between it and another layout.  */
5356           if (!min_layout_cost.is_possible ()
5357               || combined_cost.is_better_than (min_layout_cost,
5358                                                m_optimize_size))
5359             {
5360               min_layout_i = layout_i;
5361               min_layout_cost = combined_cost;
5362             }
5363         }
5364
5365       gcc_assert (min_layout_cost.is_possible ());
5366       partition.layout = min_layout_i;
5367     }
5368 }
5369
5370 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5371    NODE already has the layout that was selected for its partition.  */
5372
5373 slp_tree
5374 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5375                                                 unsigned int to_layout_i)
5376 {
5377   unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5378   slp_tree result = m_node_layouts[result_i];
5379   if (result)
5380     return result;
5381
5382   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5383       || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5384           /* We can't permute vector defs in place.  */
5385           && SLP_TREE_VEC_DEFS (node).is_empty ()))
5386     {
5387       /* If the vector is uniform or unchanged, there's nothing to do.  */
5388       if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5389         result = node;
5390       else
5391         {
5392           auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5393           result = vect_create_new_slp_node (scalar_ops);
5394           vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5395         }
5396     }
5397   else
5398     {
5399       unsigned int partition_i = m_vertices[node->vertex].partition;
5400       unsigned int from_layout_i = m_partitions[partition_i].layout;
5401       if (from_layout_i == to_layout_i)
5402         return node;
5403
5404       /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5405          permutation instead of a serial one.  Leave the new permutation
5406          in TMP_PERM on success.  */
5407       auto_lane_permutation_t tmp_perm;
5408       unsigned int num_inputs = 1;
5409       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5410         {
5411           tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5412           if (from_layout_i != 0)
5413             vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5414           if (to_layout_i != 0)
5415             vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5416           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5417                                               tmp_perm,
5418                                               SLP_TREE_CHILDREN (node),
5419                                               false) >= 0)
5420             num_inputs = SLP_TREE_CHILDREN (node).length ();
5421           else
5422             tmp_perm.truncate (0);
5423         }
5424
5425       if (dump_enabled_p ())
5426         {
5427           if (tmp_perm.length () > 0)
5428             dump_printf_loc (MSG_NOTE, vect_location,
5429                              "duplicating permutation node %p with"
5430                              " layout %d\n",
5431                              (void *) node, to_layout_i);
5432           else
5433             dump_printf_loc (MSG_NOTE, vect_location,
5434                              "inserting permutation node in place of %p\n",
5435                              (void *) node);
5436         }
5437
5438       unsigned int num_lanes = SLP_TREE_LANES (node);
5439       result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5440       if (SLP_TREE_SCALAR_STMTS (node).length ())
5441         {
5442           auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5443           stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5444           if (from_layout_i != 0)
5445             vect_slp_permute (m_perms[from_layout_i], stmts, false);
5446           if (to_layout_i != 0)
5447             vect_slp_permute (m_perms[to_layout_i], stmts, true);
5448         }
5449       SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5450       SLP_TREE_LANES (result) = num_lanes;
5451       SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5452       result->vertex = -1;
5453
5454       auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5455       if (tmp_perm.length ())
5456         {
5457           lane_perm.safe_splice (tmp_perm);
5458           SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5459         }
5460       else
5461         {
5462           lane_perm.create (num_lanes);
5463           for (unsigned j = 0; j < num_lanes; ++j)
5464             lane_perm.quick_push ({ 0, j });
5465           if (from_layout_i != 0)
5466             vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5467           if (to_layout_i != 0)
5468             vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5469           SLP_TREE_CHILDREN (result).safe_push (node);
5470         }
5471       for (slp_tree child : SLP_TREE_CHILDREN (result))
5472         child->refcnt++;
5473     }
5474   m_node_layouts[result_i] = result;
5475   return result;
5476 }
5477
5478 /* Apply the chosen vector layouts to the SLP graph.  */
5479
5480 void
5481 vect_optimize_slp_pass::materialize ()
5482 {
5483   /* We no longer need the costs, so avoid having two O(N * P) arrays
5484      live at the same time.  */
5485   m_partition_layout_costs.release ();
5486   m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5487
5488   auto_sbitmap fully_folded (m_vertices.length ());
5489   bitmap_clear (fully_folded);
5490   for (unsigned int node_i : m_partitioned_nodes)
5491     {
5492       auto &vertex = m_vertices[node_i];
5493       slp_tree node = vertex.node;
5494       int layout_i = m_partitions[vertex.partition].layout;
5495       gcc_assert (layout_i >= 0);
5496
5497       /* Rearrange the scalar statements to match the chosen layout.  */
5498       if (layout_i > 0)
5499         vect_slp_permute (m_perms[layout_i],
5500                           SLP_TREE_SCALAR_STMTS (node), true);
5501
5502       /* Update load and lane permutations.  */
5503       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5504         {
5505           /* First try to absorb the input vector layouts.  If that fails,
5506              force the inputs to have layout LAYOUT_I too.  We checked that
5507              that was possible before deciding to use nonzero output layouts.
5508              (Note that at this stage we don't really have any guarantee that
5509              the target supports the original VEC_PERM_EXPR.)  */
5510           auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5511           auto_lane_permutation_t tmp_perm;
5512           tmp_perm.safe_splice (perm);
5513           change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5514           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5515                                               tmp_perm,
5516                                               SLP_TREE_CHILDREN (node),
5517                                               false) >= 0)
5518             {
5519               if (dump_enabled_p ()
5520                   && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5521                                   perm.begin ()))
5522                 dump_printf_loc (MSG_NOTE, vect_location,
5523                                  "absorbing input layouts into %p\n",
5524                                  (void *) node);
5525               std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5526               bitmap_set_bit (fully_folded, node_i);
5527             }
5528           else
5529             {
5530               /* Not MSG_MISSED because it would make no sense to users.  */
5531               if (dump_enabled_p ())
5532                 dump_printf_loc (MSG_NOTE, vect_location,
5533                                  "failed to absorb input layouts into %p\n",
5534                                  (void *) node);
5535               change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5536             }
5537         }
5538       else
5539         {
5540           gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5541           auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5542           if (layout_i > 0)
5543             /* ???  When we handle non-bijective permutes the idea
5544                is that we can force the load-permutation to be
5545                { min, min + 1, min + 2, ... max }.  But then the
5546                scalar defs might no longer match the lane content
5547                which means wrong-code with live lane vectorization.
5548                So we possibly have to have NULL entries for those.  */
5549             vect_slp_permute (m_perms[layout_i], load_perm, true);
5550         }
5551     }
5552
5553   /* Do this before any nodes disappear, since it involves a walk
5554      over the leaves.  */
5555   remove_redundant_permutations ();
5556
5557   /* Replace each child with a correctly laid-out version.  */
5558   for (unsigned int node_i : m_partitioned_nodes)
5559     {
5560       /* Skip nodes that have already been handled above.  */
5561       if (bitmap_bit_p (fully_folded, node_i))
5562         continue;
5563
5564       auto &vertex = m_vertices[node_i];
5565       int in_layout_i = m_partitions[vertex.partition].layout;
5566       gcc_assert (in_layout_i >= 0);
5567
5568       unsigned j;
5569       slp_tree child;
5570       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5571         {
5572           if (!child)
5573             continue;
5574
5575           slp_tree new_child = get_result_with_layout (child, in_layout_i);
5576           if (new_child != child)
5577             {
5578               vect_free_slp_tree (child);
5579               SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5580               new_child->refcnt += 1;
5581             }
5582         }
5583     }
5584 }
5585
5586 /* Elide load permutations that are not necessary.  Such permutations might
5587    be pre-existing, rather than created by the layout optimizations.  */
5588
5589 void
5590 vect_optimize_slp_pass::remove_redundant_permutations ()
5591 {
5592   for (unsigned int node_i : m_leafs)
5593     {
5594       slp_tree node = m_vertices[node_i].node;
5595       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5596         continue;
5597
5598       /* In basic block vectorization we allow any subchain of an interleaving
5599          chain.
5600          FORNOW: not in loop SLP because of realignment complications.  */
5601       if (is_a <bb_vec_info> (m_vinfo))
5602         {
5603           bool subchain_p = true;
5604           stmt_vec_info next_load_info = NULL;
5605           stmt_vec_info load_info;
5606           unsigned j;
5607           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5608             {
5609               if (j != 0
5610                   && (next_load_info != load_info
5611                       || DR_GROUP_GAP (load_info) != 1))
5612                 {
5613                   subchain_p = false;
5614                   break;
5615                 }
5616               next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5617             }
5618           if (subchain_p)
5619             {
5620               SLP_TREE_LOAD_PERMUTATION (node).release ();
5621               continue;
5622             }
5623         }
5624       else
5625         {
5626           loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5627           stmt_vec_info load_info;
5628           bool this_load_permuted = false;
5629           unsigned j;
5630           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5631             if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5632               {
5633                 this_load_permuted = true;
5634                 break;
5635               }
5636           /* When this isn't a grouped access we know it's single element
5637              and contiguous.  */
5638           if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5639             {
5640               if (!this_load_permuted
5641                   && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5642                       || SLP_TREE_LANES (node) == 1))
5643                 SLP_TREE_LOAD_PERMUTATION (node).release ();
5644               continue;
5645             }
5646           stmt_vec_info first_stmt_info
5647             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5648           if (!this_load_permuted
5649               /* The load requires permutation when unrolling exposes
5650                  a gap either because the group is larger than the SLP
5651                  group-size or because there is a gap between the groups.  */
5652               && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5653                   || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5654                       && DR_GROUP_GAP (first_stmt_info) == 0)))
5655             {
5656               SLP_TREE_LOAD_PERMUTATION (node).release ();
5657               continue;
5658             }
5659         }
5660     }
5661 }
5662
5663 /* Print the partition graph and layout information to the dump file.  */
5664
5665 void
5666 vect_optimize_slp_pass::dump ()
5667 {
5668   dump_printf_loc (MSG_NOTE, vect_location,
5669                    "SLP optimize permutations:\n");
5670   for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5671     {
5672       dump_printf_loc (MSG_NOTE, vect_location, "  %d: { ", layout_i);
5673       const char *sep = "";
5674       for (unsigned int idx : m_perms[layout_i])
5675         {
5676           dump_printf (MSG_NOTE, "%s%d", sep, idx);
5677           sep = ", ";
5678         }
5679       dump_printf (MSG_NOTE, " }\n");
5680     }
5681   dump_printf_loc (MSG_NOTE, vect_location,
5682                    "SLP optimize partitions:\n");
5683   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5684        ++partition_i)
5685     {
5686       auto &partition = m_partitions[partition_i];
5687       dump_printf_loc (MSG_NOTE, vect_location,  "  -------------\n");
5688       dump_printf_loc (MSG_NOTE, vect_location,
5689                        "  partition %d (layout %d):\n",
5690                        partition_i, partition.layout);
5691       dump_printf_loc (MSG_NOTE, vect_location, "    nodes:\n");
5692       for (unsigned int order_i = partition.node_begin;
5693            order_i < partition.node_end; ++order_i)
5694         {
5695           auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5696           dump_printf_loc (MSG_NOTE, vect_location, "      - %p:\n",
5697                            (void *) vertex.node);
5698           dump_printf_loc (MSG_NOTE, vect_location,
5699                            "          weight: %f\n",
5700                            vertex.weight.to_double ());
5701           if (vertex.out_degree)
5702             dump_printf_loc (MSG_NOTE, vect_location,
5703                              "          out weight: %f (degree %d)\n",
5704                              vertex.out_weight.to_double (),
5705                              vertex.out_degree);
5706           if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5707             dump_printf_loc (MSG_NOTE, vect_location,
5708                              "          op: VEC_PERM_EXPR\n");
5709           else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5710             dump_printf_loc (MSG_NOTE, vect_location,
5711                              "          op template: %G", rep->stmt);
5712         }
5713       dump_printf_loc (MSG_NOTE, vect_location, "    edges:\n");
5714       for (unsigned int order_i = partition.node_begin;
5715            order_i < partition.node_end; ++order_i)
5716         {
5717           unsigned int node_i = m_partitioned_nodes[order_i];
5718           auto &vertex = m_vertices[node_i];
5719           auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5720             {
5721               auto &other_vertex = m_vertices[other_node_i];
5722               if (other_vertex.partition < vertex.partition)
5723                 dump_printf_loc (MSG_NOTE, vect_location,
5724                                  "      - %p [%d] --> %p\n",
5725                                  (void *) other_vertex.node,
5726                                  other_vertex.partition,
5727                                  (void *) vertex.node);
5728               else
5729                 dump_printf_loc (MSG_NOTE, vect_location,
5730                                  "      - %p --> [%d] %p\n",
5731                                  (void *) vertex.node,
5732                                  other_vertex.partition,
5733                                  (void *) other_vertex.node);
5734             };
5735           for_each_partition_edge (node_i, print_edge);
5736         }
5737
5738       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5739         {
5740           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5741           if (layout_costs.is_possible ())
5742             {
5743               dump_printf_loc (MSG_NOTE, vect_location,
5744                                "    layout %d:%s\n", layout_i,
5745                                partition.layout == int (layout_i)
5746                                ? " (*)" : "");
5747               slpg_layout_cost combined_cost = layout_costs.in_cost;
5748               combined_cost.add_serial_cost (layout_costs.internal_cost);
5749               combined_cost.add_serial_cost (layout_costs.out_cost);
5750 #define TEMPLATE "{depth: %f, total: %f}"
5751               dump_printf_loc (MSG_NOTE, vect_location,
5752                                "        " TEMPLATE "\n",
5753                                layout_costs.in_cost.depth.to_double (),
5754                                layout_costs.in_cost.total.to_double ());
5755               dump_printf_loc (MSG_NOTE, vect_location,
5756                                "      + " TEMPLATE "\n",
5757                                layout_costs.internal_cost.depth.to_double (),
5758                                layout_costs.internal_cost.total.to_double ());
5759               dump_printf_loc (MSG_NOTE, vect_location,
5760                                "      + " TEMPLATE "\n",
5761                                layout_costs.out_cost.depth.to_double (),
5762                                layout_costs.out_cost.total.to_double ());
5763               dump_printf_loc (MSG_NOTE, vect_location,
5764                                "      = " TEMPLATE "\n",
5765                                combined_cost.depth.to_double (),
5766                                combined_cost.total.to_double ());
5767 #undef TEMPLATE
5768             }
5769           else
5770             dump_printf_loc (MSG_NOTE, vect_location,
5771                              "    layout %d: rejected\n", layout_i);
5772         }
5773     }
5774 }
5775
5776 /* Main entry point for the SLP graph optimization pass.  */
5777
5778 void
5779 vect_optimize_slp_pass::run ()
5780 {
5781   build_graph ();
5782   create_partitions ();
5783   start_choosing_layouts ();
5784   if (m_perms.length () > 1)
5785     {
5786       forward_pass ();
5787       backward_pass ();
5788       if (dump_enabled_p ())
5789         dump ();
5790       materialize ();
5791       while (!m_perms.is_empty ())
5792         m_perms.pop ().release ();
5793     }
5794   else
5795     remove_redundant_permutations ();
5796   free_graph (m_slpg);
5797 }
5798
5799 /* Optimize the SLP graph of VINFO.  */
5800
5801 void
5802 vect_optimize_slp (vec_info *vinfo)
5803 {
5804   if (vinfo->slp_instances.is_empty ())
5805     return;
5806   vect_optimize_slp_pass (vinfo).run ();
5807 }
5808
5809 /* Gather loads reachable from the individual SLP graph entries.  */
5810
5811 void
5812 vect_gather_slp_loads (vec_info *vinfo)
5813 {
5814   unsigned i;
5815   slp_instance instance;
5816   FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5817     {
5818       hash_set<slp_tree> visited;
5819       vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5820                              SLP_INSTANCE_TREE (instance), visited);
5821     }
5822 }
5823
5824
5825 /* For each possible SLP instance decide whether to SLP it and calculate overall
5826    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
5827    least one instance.  */
5828
5829 bool
5830 vect_make_slp_decision (loop_vec_info loop_vinfo)
5831 {
5832   unsigned int i;
5833   poly_uint64 unrolling_factor = 1;
5834   const vec<slp_instance> &slp_instances
5835     = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5836   slp_instance instance;
5837   int decided_to_slp = 0;
5838
5839   DUMP_VECT_SCOPE ("vect_make_slp_decision");
5840
5841   FOR_EACH_VEC_ELT (slp_instances, i, instance)
5842     {
5843       /* FORNOW: SLP if you can.  */
5844       /* All unroll factors have the form:
5845
5846            GET_MODE_SIZE (vinfo->vector_mode) * X
5847
5848          for some rational X, so they must have a common multiple.  */
5849       unrolling_factor
5850         = force_common_multiple (unrolling_factor,
5851                                  SLP_INSTANCE_UNROLLING_FACTOR (instance));
5852
5853       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
5854          call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5855          loop-based vectorization.  Such stmts will be marked as HYBRID.  */
5856       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5857       decided_to_slp++;
5858     }
5859
5860   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5861
5862   if (decided_to_slp && dump_enabled_p ())
5863     {
5864       dump_printf_loc (MSG_NOTE, vect_location,
5865                        "Decided to SLP %d instances. Unrolling factor ",
5866                        decided_to_slp);
5867       dump_dec (MSG_NOTE, unrolling_factor);
5868       dump_printf (MSG_NOTE, "\n");
5869     }
5870
5871   return (decided_to_slp > 0);
5872 }
5873
5874 /* Private data for vect_detect_hybrid_slp.  */
5875 struct vdhs_data
5876 {
5877   loop_vec_info loop_vinfo;
5878   vec<stmt_vec_info> *worklist;
5879 };
5880
5881 /* Walker for walk_gimple_op.  */
5882
5883 static tree
5884 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5885 {
5886   walk_stmt_info *wi = (walk_stmt_info *)data;
5887   vdhs_data *dat = (vdhs_data *)wi->info;
5888
5889   if (wi->is_lhs)
5890     return NULL_TREE;
5891
5892   stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5893   if (!def_stmt_info)
5894     return NULL_TREE;
5895   def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5896   if (PURE_SLP_STMT (def_stmt_info))
5897     {
5898       if (dump_enabled_p ())
5899         dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5900                          def_stmt_info->stmt);
5901       STMT_SLP_TYPE (def_stmt_info) = hybrid;
5902       dat->worklist->safe_push (def_stmt_info);
5903     }
5904
5905   return NULL_TREE;
5906 }
5907
5908 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
5909    if so, otherwise pushing it to WORKLIST.  */
5910
5911 static void
5912 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5913                                vec<stmt_vec_info> &worklist,
5914                                stmt_vec_info stmt_info)
5915 {
5916   if (dump_enabled_p ())
5917     dump_printf_loc (MSG_NOTE, vect_location,
5918                      "Processing hybrid candidate : %G", stmt_info->stmt);
5919   stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5920   imm_use_iterator iter2;
5921   ssa_op_iter iter1;
5922   use_operand_p use_p;
5923   def_operand_p def_p;
5924   bool any_def = false;
5925   FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5926     {
5927       any_def = true;
5928       FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5929         {
5930           if (is_gimple_debug (USE_STMT (use_p)))
5931             continue;
5932           stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5933           /* An out-of loop use means this is a loop_vect sink.  */
5934           if (!use_info)
5935             {
5936               if (dump_enabled_p ())
5937                 dump_printf_loc (MSG_NOTE, vect_location,
5938                                  "Found loop_vect sink: %G", stmt_info->stmt);
5939               worklist.safe_push (stmt_info);
5940               return;
5941             }
5942           else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5943             {
5944               if (dump_enabled_p ())
5945                 dump_printf_loc (MSG_NOTE, vect_location,
5946                                  "Found loop_vect use: %G", use_info->stmt);
5947               worklist.safe_push (stmt_info);
5948               return;
5949             }
5950         }
5951     }
5952   /* No def means this is a loo_vect sink.  */
5953   if (!any_def)
5954     {
5955       if (dump_enabled_p ())
5956         dump_printf_loc (MSG_NOTE, vect_location,
5957                          "Found loop_vect sink: %G", stmt_info->stmt);
5958       worklist.safe_push (stmt_info);
5959       return;
5960     }
5961   if (dump_enabled_p ())
5962     dump_printf_loc (MSG_NOTE, vect_location,
5963                      "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5964   STMT_SLP_TYPE (stmt_info) = pure_slp;
5965 }
5966
5967 /* Find stmts that must be both vectorized and SLPed.  */
5968
5969 void
5970 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5971 {
5972   DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5973
5974   /* All stmts participating in SLP are marked pure_slp, all other
5975      stmts are loop_vect.
5976      First collect all loop_vect stmts into a worklist.
5977      SLP patterns cause not all original scalar stmts to appear in
5978      SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5979      Rectify this here and do a backward walk over the IL only considering
5980      stmts as loop_vect when they are used by a loop_vect stmt and otherwise
5981      mark them as pure_slp.  */
5982   auto_vec<stmt_vec_info> worklist;
5983   for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5984     {
5985       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5986       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5987            gsi_next (&gsi))
5988         {
5989           gphi *phi = gsi.phi ();
5990           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5991           if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5992             maybe_push_to_hybrid_worklist (loop_vinfo,
5993                                            worklist, stmt_info);
5994         }
5995       for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5996            gsi_prev (&gsi))
5997         {
5998           gimple *stmt = gsi_stmt (gsi);
5999           if (is_gimple_debug (stmt))
6000             continue;
6001           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
6002           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
6003             {
6004               for (gimple_stmt_iterator gsi2
6005                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
6006                    !gsi_end_p (gsi2); gsi_next (&gsi2))
6007                 {
6008                   stmt_vec_info patt_info
6009                     = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
6010                   if (!STMT_SLP_TYPE (patt_info)
6011                       && STMT_VINFO_RELEVANT (patt_info))
6012                     maybe_push_to_hybrid_worklist (loop_vinfo,
6013                                                    worklist, patt_info);
6014                 }
6015               stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6016             }
6017           if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
6018             maybe_push_to_hybrid_worklist (loop_vinfo,
6019                                            worklist, stmt_info);
6020         }
6021     }
6022
6023   /* Now we have a worklist of non-SLP stmts, follow use->def chains and
6024      mark any SLP vectorized stmt as hybrid.
6025      ???  We're visiting def stmts N times (once for each non-SLP and
6026      once for each hybrid-SLP use).  */
6027   walk_stmt_info wi;
6028   vdhs_data dat;
6029   dat.worklist = &worklist;
6030   dat.loop_vinfo = loop_vinfo;
6031   memset (&wi, 0, sizeof (wi));
6032   wi.info = (void *)&dat;
6033   while (!worklist.is_empty ())
6034     {
6035       stmt_vec_info stmt_info = worklist.pop ();
6036       /* Since SSA operands are not set up for pattern stmts we need
6037          to use walk_gimple_op.  */
6038       wi.is_lhs = 0;
6039       walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
6040       /* For gather/scatter make sure to walk the offset operand, that
6041          can be a scaling and conversion away.  */
6042       gather_scatter_info gs_info;
6043       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
6044           && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
6045         {
6046           int dummy;
6047           vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
6048         }
6049     }
6050 }
6051
6052
6053 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks.  */
6054
6055 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
6056   : vec_info (vec_info::bb, shared),
6057     bbs (_bbs),
6058     roots (vNULL)
6059 {
6060   for (unsigned i = 0; i < bbs.length (); ++i)
6061     {
6062       if (i != 0)
6063         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6064              gsi_next (&si))
6065           {
6066             gphi *phi = si.phi ();
6067             gimple_set_uid (phi, 0);
6068             add_stmt (phi);
6069           }
6070       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6071            !gsi_end_p (gsi); gsi_next (&gsi))
6072         {
6073           gimple *stmt = gsi_stmt (gsi);
6074           gimple_set_uid (stmt, 0);
6075           if (is_gimple_debug (stmt))
6076             continue;
6077           add_stmt (stmt);
6078         }
6079     }
6080 }
6081
6082
6083 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
6084    stmts in the basic block.  */
6085
6086 _bb_vec_info::~_bb_vec_info ()
6087 {
6088   /* Reset region marker.  */
6089   for (unsigned i = 0; i < bbs.length (); ++i)
6090     {
6091       if (i != 0)
6092         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6093              gsi_next (&si))
6094           {
6095             gphi *phi = si.phi ();
6096             gimple_set_uid (phi, -1);
6097           }
6098       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6099            !gsi_end_p (gsi); gsi_next (&gsi))
6100         {
6101           gimple *stmt = gsi_stmt (gsi);
6102           gimple_set_uid (stmt, -1);
6103         }
6104     }
6105
6106   for (unsigned i = 0; i < roots.length (); ++i)
6107     {
6108       roots[i].stmts.release ();
6109       roots[i].roots.release ();
6110       roots[i].remain.release ();
6111     }
6112   roots.release ();
6113 }
6114
6115 /* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
6116    given then that child nodes have already been processed, and that
6117    their def types currently match their SLP node's def type.  */
6118
6119 static bool
6120 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
6121                                     slp_instance node_instance,
6122                                     stmt_vector_for_cost *cost_vec)
6123 {
6124   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6125
6126   /* Calculate the number of vector statements to be created for the
6127      scalar stmts in this node.  For SLP reductions it is equal to the
6128      number of vector statements in the children (which has already been
6129      calculated by the recursive call).  Otherwise it is the number of
6130      scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
6131      VF divided by the number of elements in a vector.  */
6132   if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6133       && !STMT_VINFO_DATA_REF (stmt_info)
6134       && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6135     {
6136       for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6137         if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6138           {
6139             SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6140               = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6141             break;
6142           }
6143     }
6144   else
6145     {
6146       poly_uint64 vf;
6147       if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6148         vf = loop_vinfo->vectorization_factor;
6149       else
6150         vf = 1;
6151       unsigned int group_size = SLP_TREE_LANES (node);
6152       tree vectype = SLP_TREE_VECTYPE (node);
6153       SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6154         = vect_get_num_vectors (vf * group_size, vectype);
6155     }
6156
6157   /* Handle purely internal nodes.  */
6158   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6159     {
6160       if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6161         return false;
6162
6163       stmt_vec_info slp_stmt_info;
6164       unsigned int i;
6165       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6166         {
6167           if (STMT_VINFO_LIVE_P (slp_stmt_info)
6168               && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6169                                                node_instance, i,
6170                                                false, cost_vec))
6171             return false;
6172         }
6173       return true;
6174     }
6175
6176   bool dummy;
6177   return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6178                             node, node_instance, cost_vec);
6179 }
6180
6181 /* Try to build NODE from scalars, returning true on success.
6182    NODE_INSTANCE is the SLP instance that contains NODE.  */
6183
6184 static bool
6185 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6186                               slp_instance node_instance)
6187 {
6188   stmt_vec_info stmt_info;
6189   unsigned int i;
6190
6191   if (!is_a <bb_vec_info> (vinfo)
6192       || node == SLP_INSTANCE_TREE (node_instance)
6193       || !SLP_TREE_SCALAR_STMTS (node).exists ()
6194       || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6195       /* Force the mask use to be built from scalars instead.  */
6196       || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6197     return false;
6198
6199   if (dump_enabled_p ())
6200     dump_printf_loc (MSG_NOTE, vect_location,
6201                      "Building vector operands of %p from scalars instead\n",
6202                      (void *) node);
6203
6204   /* Don't remove and free the child nodes here, since they could be
6205      referenced by other structures.  The analysis and scheduling phases
6206      (need to) ignore child nodes of anything that isn't vect_internal_def.  */
6207   unsigned int group_size = SLP_TREE_LANES (node);
6208   SLP_TREE_DEF_TYPE (node) = vect_external_def;
6209   /* Invariants get their vector type from the uses.  */
6210   SLP_TREE_VECTYPE (node) = NULL_TREE;
6211   SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6212   SLP_TREE_LOAD_PERMUTATION (node).release ();
6213   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6214     {
6215       tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6216       SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6217     }
6218   return true;
6219 }
6220
6221 /* Return true if all elements of the slice are the same.  */
6222 bool
6223 vect_scalar_ops_slice::all_same_p () const
6224 {
6225   for (unsigned int i = 1; i < length; ++i)
6226     if (!operand_equal_p (op (0), op (i)))
6227       return false;
6228   return true;
6229 }
6230
6231 hashval_t
6232 vect_scalar_ops_slice_hash::hash (const value_type &s)
6233 {
6234   hashval_t hash = 0;
6235   for (unsigned i = 0; i < s.length; ++i)
6236     hash = iterative_hash_expr (s.op (i), hash);
6237   return hash;
6238 }
6239
6240 bool
6241 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6242                                    const compare_type &s2)
6243 {
6244   if (s1.length != s2.length)
6245     return false;
6246   for (unsigned i = 0; i < s1.length; ++i)
6247     if (!operand_equal_p (s1.op (i), s2.op (i)))
6248       return false;
6249   return true;
6250 }
6251
6252 /* Compute the prologue cost for invariant or constant operands represented
6253    by NODE.  */
6254
6255 static void
6256 vect_prologue_cost_for_slp (slp_tree node,
6257                             stmt_vector_for_cost *cost_vec)
6258 {
6259   /* There's a special case of an existing vector, that costs nothing.  */
6260   if (SLP_TREE_SCALAR_OPS (node).length () == 0
6261       && !SLP_TREE_VEC_DEFS (node).is_empty ())
6262     return;
6263   /* Without looking at the actual initializer a vector of
6264      constants can be implemented as load from the constant pool.
6265      When all elements are the same we can use a splat.  */
6266   tree vectype = SLP_TREE_VECTYPE (node);
6267   unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6268   unsigned HOST_WIDE_INT const_nunits;
6269   unsigned nelt_limit;
6270   auto ops = &SLP_TREE_SCALAR_OPS (node);
6271   auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6272   if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6273       && ! multiple_p (const_nunits, group_size))
6274     {
6275       nelt_limit = const_nunits;
6276       hash_set<vect_scalar_ops_slice_hash> vector_ops;
6277       for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6278         if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6279           starts.quick_push (i * const_nunits);
6280     }
6281   else
6282     {
6283       /* If either the vector has variable length or the vectors
6284          are composed of repeated whole groups we only need to
6285          cost construction once.  All vectors will be the same.  */
6286       nelt_limit = group_size;
6287       starts.quick_push (0);
6288     }
6289   /* ???  We're just tracking whether vectors in a single node are the same.
6290      Ideally we'd do something more global.  */
6291   bool passed = false;
6292   for (unsigned int start : starts)
6293     {
6294       vect_cost_for_stmt kind;
6295       if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6296         kind = vector_load;
6297       else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6298         kind = scalar_to_vec;
6299       else
6300         kind = vec_construct;
6301       /* The target cost hook has no idea which part of the SLP node
6302          we are costing so avoid passing it down more than once.  Pass
6303          it to the first vec_construct or scalar_to_vec part since for those
6304          the x86 backend tries to account for GPR to XMM register moves.  */
6305       record_stmt_cost (cost_vec, 1, kind,
6306                         (kind != vector_load && !passed) ? node : nullptr,
6307                         vectype, 0, vect_prologue);
6308       if (kind != vector_load)
6309         passed = true;
6310     }
6311 }
6312
6313 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6314    the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6315
6316    Return true if the operations are supported.  */
6317
6318 static bool
6319 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6320                                   slp_instance node_instance,
6321                                   hash_set<slp_tree> &visited_set,
6322                                   vec<slp_tree> &visited_vec,
6323                                   stmt_vector_for_cost *cost_vec)
6324 {
6325   int i, j;
6326   slp_tree child;
6327
6328   /* Assume we can code-generate all invariants.  */
6329   if (!node
6330       || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6331       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6332     return true;
6333
6334   if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6335     {
6336       if (dump_enabled_p ())
6337         dump_printf_loc (MSG_NOTE, vect_location,
6338                          "Failed cyclic SLP reference in %p\n", (void *) node);
6339       return false;
6340     }
6341   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6342
6343   /* If we already analyzed the exact same set of scalar stmts we're done.
6344      We share the generated vector stmts for those.  */
6345   if (visited_set.add (node))
6346     return true;
6347   visited_vec.safe_push (node);
6348
6349   bool res = true;
6350   unsigned visited_rec_start = visited_vec.length ();
6351   unsigned cost_vec_rec_start = cost_vec->length ();
6352   bool seen_non_constant_child = false;
6353   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6354     {
6355       res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6356                                               visited_set, visited_vec,
6357                                               cost_vec);
6358       if (!res)
6359         break;
6360       if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6361         seen_non_constant_child = true;
6362     }
6363   /* We're having difficulties scheduling nodes with just constant
6364      operands and no scalar stmts since we then cannot compute a stmt
6365      insertion place.  */
6366   if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6367     {
6368       if (dump_enabled_p ())
6369         dump_printf_loc (MSG_NOTE, vect_location,
6370                          "Cannot vectorize all-constant op node %p\n",
6371                          (void *) node);
6372       res = false;
6373     }
6374
6375   if (res)
6376     res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6377                                               cost_vec);
6378   /* If analysis failed we have to pop all recursive visited nodes
6379      plus ourselves.  */
6380   if (!res)
6381     {
6382       while (visited_vec.length () >= visited_rec_start)
6383         visited_set.remove (visited_vec.pop ());
6384       cost_vec->truncate (cost_vec_rec_start);
6385     }
6386
6387   /* When the node can be vectorized cost invariant nodes it references.
6388      This is not done in DFS order to allow the refering node
6389      vectorizable_* calls to nail down the invariant nodes vector type
6390      and possibly unshare it if it needs a different vector type than
6391      other referrers.  */
6392   if (res)
6393     FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6394       if (child
6395           && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6396               || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6397           /* Perform usual caching, note code-generation still
6398              code-gens these nodes multiple times but we expect
6399              to CSE them later.  */
6400           && !visited_set.add (child))
6401         {
6402           visited_vec.safe_push (child);
6403           /* ???  After auditing more code paths make a "default"
6404              and push the vector type from NODE to all children
6405              if it is not already set.  */
6406           /* Compute the number of vectors to be generated.  */
6407           tree vector_type = SLP_TREE_VECTYPE (child);
6408           if (!vector_type)
6409             {
6410               /* For shifts with a scalar argument we don't need
6411                  to cost or code-generate anything.
6412                  ???  Represent this more explicitely.  */
6413               gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6414                            == shift_vec_info_type)
6415                           && j == 1);
6416               continue;
6417             }
6418           unsigned group_size = SLP_TREE_LANES (child);
6419           poly_uint64 vf = 1;
6420           if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6421             vf = loop_vinfo->vectorization_factor;
6422           SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6423             = vect_get_num_vectors (vf * group_size, vector_type);
6424           /* And cost them.  */
6425           vect_prologue_cost_for_slp (child, cost_vec);
6426         }
6427
6428   /* If this node or any of its children can't be vectorized, try pruning
6429      the tree here rather than felling the whole thing.  */
6430   if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6431     {
6432       /* We'll need to revisit this for invariant costing and number
6433          of vectorized stmt setting.   */
6434       res = true;
6435     }
6436
6437   return res;
6438 }
6439
6440 /* Given a definition DEF, analyze if it will have any live scalar use after
6441    performing SLP vectorization whose information is represented by BB_VINFO,
6442    and record result into hash map SCALAR_USE_MAP as cache for later fast
6443    check.  If recursion DEPTH exceeds a limit, stop analysis and make a
6444    conservative assumption.  Return 0 if no scalar use, 1 if there is, -1
6445    means recursion is limited.  */
6446
6447 static int
6448 vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
6449                         hash_map<tree, int> &scalar_use_map,
6450                         int depth = 0)
6451 {
6452   const int depth_limit = 2;
6453   imm_use_iterator use_iter;
6454   gimple *use_stmt;
6455
6456   if (int *res = scalar_use_map.get (def))
6457     return *res;
6458
6459   int scalar_use = 1;
6460
6461   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
6462     {
6463       if (is_gimple_debug (use_stmt))
6464         continue;
6465
6466       stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6467
6468       if (!use_stmt_info)
6469         break;
6470
6471       if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6472         continue;
6473
6474       /* Do not step forward when encounter PHI statement, since it may
6475          involve cyclic reference and cause infinite recursive invocation.  */
6476       if (gimple_code (use_stmt) == GIMPLE_PHI)
6477         break;
6478
6479       /* When pattern recognition is involved, a statement whose definition is
6480          consumed in some pattern, may not be included in the final replacement
6481          pattern statements, so would be skipped when building SLP graph.
6482
6483          * Original
6484           char a_c = *(char *) a;
6485           char b_c = *(char *) b;
6486           unsigned short a_s = (unsigned short) a_c;
6487           int a_i = (int) a_s;
6488           int b_i = (int) b_c;
6489           int r_i = a_i - b_i;
6490
6491          * After pattern replacement
6492           a_s = (unsigned short) a_c;
6493           a_i = (int) a_s;
6494
6495           patt_b_s = (unsigned short) b_c;    // b_i = (int) b_c
6496           patt_b_i = (int) patt_b_s;          // b_i = (int) b_c
6497
6498           patt_r_s = widen_minus(a_c, b_c);   // r_i = a_i - b_i
6499           patt_r_i = (int) patt_r_s;          // r_i = a_i - b_i
6500
6501          The definitions of a_i(original statement) and b_i(pattern statement)
6502          are related to, but actually not part of widen_minus pattern.
6503          Vectorizing the pattern does not cause these definition statements to
6504          be marked as PURE_SLP.  For this case, we need to recursively check
6505          whether their uses are all absorbed into vectorized code.  But there
6506          is an exception that some use may participate in an vectorized
6507          operation via an external SLP node containing that use as an element.
6508          The parameter "scalar_use_map" tags such kind of SSA as having scalar
6509          use in advance.  */
6510       tree lhs = gimple_get_lhs (use_stmt);
6511
6512       if (!lhs || TREE_CODE (lhs) != SSA_NAME)
6513         break;
6514
6515       if (depth_limit && depth >= depth_limit)
6516         return -1;
6517
6518       if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
6519                                                 depth + 1)))
6520         break;
6521     }
6522
6523   if (end_imm_use_stmt_p (&use_iter))
6524     scalar_use = 0;
6525
6526   /* If recursion is limited, do not cache result for non-root defs.  */
6527   if (!depth || scalar_use >= 0)
6528     {
6529       bool added = scalar_use_map.put (def, scalar_use);
6530       gcc_assert (!added);
6531     }
6532
6533   return scalar_use;
6534 }
6535
6536 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6537    region and that can be vectorized using vectorizable_live_operation
6538    with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
6539    scalar code computing it to be retained.  */
6540
6541 static void
6542 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6543                              slp_instance instance,
6544                              stmt_vector_for_cost *cost_vec,
6545                              hash_map<tree, int> &scalar_use_map,
6546                              hash_set<stmt_vec_info> &svisited,
6547                              hash_set<slp_tree> &visited)
6548 {
6549   if (visited.add (node))
6550     return;
6551
6552   unsigned i;
6553   stmt_vec_info stmt_info;
6554   stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6555   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6556     {
6557       if (svisited.contains (stmt_info))
6558         continue;
6559       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6560       if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6561           && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6562         /* Only the pattern root stmt computes the original scalar value.  */
6563         continue;
6564       bool mark_visited = true;
6565       gimple *orig_stmt = orig_stmt_info->stmt;
6566       ssa_op_iter op_iter;
6567       def_operand_p def_p;
6568       FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6569         {
6570           if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
6571                                       scalar_use_map))
6572             {
6573               STMT_VINFO_LIVE_P (stmt_info) = true;
6574               if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
6575                                                instance, i, false, cost_vec))
6576                 /* ???  So we know we can vectorize the live stmt from one SLP
6577                    node.  If we cannot do so from all or none consistently
6578                    we'd have to record which SLP node (and lane) we want to
6579                    use for the live operation.  So make sure we can
6580                    code-generate from all nodes.  */
6581                 mark_visited = false;
6582               else
6583                 STMT_VINFO_LIVE_P (stmt_info) = false;
6584             }
6585
6586           /* We have to verify whether we can insert the lane extract
6587              before all uses.  The following is a conservative approximation.
6588              We cannot put this into vectorizable_live_operation because
6589              iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6590              doesn't work.
6591              Note that while the fact that we emit code for loads at the
6592              first load should make this a non-problem leafs we construct
6593              from scalars are vectorized after the last scalar def.
6594              ???  If we'd actually compute the insert location during
6595              analysis we could use sth less conservative than the last
6596              scalar stmt in the node for the dominance check.  */
6597           /* ???  What remains is "live" uses in vector CTORs in the same
6598              SLP graph which is where those uses can end up code-generated
6599              right after their definition instead of close to their original
6600              use.  But that would restrict us to code-generate lane-extracts
6601              from the latest stmt in a node.  So we compensate for this
6602              during code-generation, simply not replacing uses for those
6603              hopefully rare cases.  */
6604           imm_use_iterator use_iter;
6605           gimple *use_stmt;
6606           stmt_vec_info use_stmt_info;
6607
6608           if (STMT_VINFO_LIVE_P (stmt_info))
6609             FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6610               if (!is_gimple_debug (use_stmt)
6611                   && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6612                       || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6613                   && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6614                 {
6615                   if (dump_enabled_p ())
6616                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6617                                      "Cannot determine insertion place for "
6618                                      "lane extract\n");
6619                   STMT_VINFO_LIVE_P (stmt_info) = false;
6620                   mark_visited = true;
6621                 }
6622         }
6623       if (mark_visited)
6624         svisited.add (stmt_info);
6625     }
6626
6627   slp_tree child;
6628   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6629     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6630       vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
6631                                    scalar_use_map, svisited, visited);
6632 }
6633
6634 /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
6635    are live outside of the basic-block vectorized region and that can be
6636    vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P.  */
6637
6638 static void
6639 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
6640 {
6641   if (bb_vinfo->slp_instances.is_empty ())
6642     return;
6643
6644   hash_set<stmt_vec_info> svisited;
6645   hash_set<slp_tree> visited;
6646   hash_map<tree, int> scalar_use_map;
6647   auto_vec<slp_tree> worklist;
6648
6649   for (slp_instance instance : bb_vinfo->slp_instances)
6650     {
6651       if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
6652         for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
6653           if (TREE_CODE (op) == SSA_NAME)
6654             scalar_use_map.put (op, 1);
6655       if (!visited.add (SLP_INSTANCE_TREE (instance)))
6656         worklist.safe_push (SLP_INSTANCE_TREE (instance));
6657     }
6658
6659   do
6660     {
6661       slp_tree node = worklist.pop ();
6662
6663       if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
6664         {
6665           for (tree op : SLP_TREE_SCALAR_OPS (node))
6666             if (TREE_CODE (op) == SSA_NAME)
6667               scalar_use_map.put (op, 1);
6668         }
6669       else
6670         {
6671           for (slp_tree child : SLP_TREE_CHILDREN (node))
6672             if (child && !visited.add (child))
6673               worklist.safe_push (child);
6674         }
6675     }
6676   while (!worklist.is_empty ());
6677
6678   visited.empty ();
6679
6680   for (slp_instance instance : bb_vinfo->slp_instances)
6681     {
6682       vect_location = instance->location ();
6683       vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6684                                    instance, &instance->cost_vec,
6685                                    scalar_use_map, svisited, visited);
6686     }
6687 }
6688
6689 /* Determine whether we can vectorize the reduction epilogue for INSTANCE.  */
6690
6691 static bool
6692 vectorizable_bb_reduc_epilogue (slp_instance instance,
6693                                 stmt_vector_for_cost *cost_vec)
6694 {
6695   gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6696   enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6697   if (reduc_code == MINUS_EXPR)
6698     reduc_code = PLUS_EXPR;
6699   internal_fn reduc_fn;
6700   tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6701   if (!vectype
6702       || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6703       || reduc_fn == IFN_LAST
6704       || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6705       || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6706                                      TREE_TYPE (vectype)))
6707     {
6708       if (dump_enabled_p ())
6709         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6710                          "not vectorized: basic block reduction epilogue "
6711                          "operation unsupported.\n");
6712       return false;
6713     }
6714
6715   /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6716      cost log2 vector operations plus shuffles and one extraction.  */
6717   unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6718   record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6719                     vectype, 0, vect_body);
6720   record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6721                     vectype, 0, vect_body);
6722   record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6723                     vectype, 0, vect_body);
6724
6725   /* Since we replace all stmts of a possibly longer scalar reduction
6726      chain account for the extra scalar stmts for that.  */
6727   record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
6728                     instance->root_stmts[0], 0, vect_body);
6729   return true;
6730 }
6731
6732 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6733    and recurse to children.  */
6734
6735 static void
6736 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6737                               hash_set<slp_tree> &visited)
6738 {
6739   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6740       || visited.add (node))
6741     return;
6742
6743   stmt_vec_info stmt;
6744   unsigned i;
6745   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6746     roots.remove (vect_orig_stmt (stmt));
6747
6748   slp_tree child;
6749   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6750     if (child)
6751       vect_slp_prune_covered_roots (child, roots, visited);
6752 }
6753
6754 /* Analyze statements in SLP instances of VINFO.  Return true if the
6755    operations are supported. */
6756
6757 bool
6758 vect_slp_analyze_operations (vec_info *vinfo)
6759 {
6760   slp_instance instance;
6761   int i;
6762
6763   DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6764
6765   hash_set<slp_tree> visited;
6766   for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6767     {
6768       auto_vec<slp_tree> visited_vec;
6769       stmt_vector_for_cost cost_vec;
6770       cost_vec.create (2);
6771       if (is_a <bb_vec_info> (vinfo))
6772         vect_location = instance->location ();
6773       if (!vect_slp_analyze_node_operations (vinfo,
6774                                              SLP_INSTANCE_TREE (instance),
6775                                              instance, visited, visited_vec,
6776                                              &cost_vec)
6777           /* CTOR instances require vectorized defs for the SLP tree root.  */
6778           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6779               && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6780                   != vect_internal_def
6781                   /* Make sure we vectorized with the expected type.  */
6782                   || !useless_type_conversion_p
6783                         (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6784                                               (instance->root_stmts[0]->stmt))),
6785                          TREE_TYPE (SLP_TREE_VECTYPE
6786                                             (SLP_INSTANCE_TREE (instance))))))
6787           /* Check we can vectorize the reduction.  */
6788           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6789               && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6790         {
6791           slp_tree node = SLP_INSTANCE_TREE (instance);
6792           stmt_vec_info stmt_info;
6793           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6794             stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6795           else
6796             stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6797           if (dump_enabled_p ())
6798             dump_printf_loc (MSG_NOTE, vect_location,
6799                              "removing SLP instance operations starting from: %G",
6800                              stmt_info->stmt);
6801           vect_free_slp_instance (instance);
6802           vinfo->slp_instances.ordered_remove (i);
6803           cost_vec.release ();
6804           while (!visited_vec.is_empty ())
6805             visited.remove (visited_vec.pop ());
6806         }
6807       else
6808         {
6809           i++;
6810           if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6811             {
6812               add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6813               cost_vec.release ();
6814             }
6815           else
6816             /* For BB vectorization remember the SLP graph entry
6817                cost for later.  */
6818             instance->cost_vec = cost_vec;
6819         }
6820     }
6821
6822   /* Now look for SLP instances with a root that are covered by other
6823      instances and remove them.  */
6824   hash_set<stmt_vec_info> roots;
6825   for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6826     if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6827       roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6828   if (!roots.is_empty ())
6829     {
6830       visited.empty ();
6831       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6832         vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6833                                       visited);
6834       for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6835         if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6836             && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6837           {
6838             stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6839             if (dump_enabled_p ())
6840               dump_printf_loc (MSG_NOTE, vect_location,
6841                                "removing SLP instance operations starting "
6842                                "from: %G", root->stmt);
6843             vect_free_slp_instance (instance);
6844             vinfo->slp_instances.ordered_remove (i);
6845           }
6846         else
6847           ++i;
6848     }
6849
6850   /* Compute vectorizable live stmts.  */
6851   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6852     vect_bb_slp_mark_live_stmts (bb_vinfo);
6853
6854   return !vinfo->slp_instances.is_empty ();
6855 }
6856
6857 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
6858    closing the eventual chain.  */
6859
6860 static slp_instance
6861 get_ultimate_leader (slp_instance instance,
6862                      hash_map<slp_instance, slp_instance> &instance_leader)
6863 {
6864   auto_vec<slp_instance *, 8> chain;
6865   slp_instance *tem;
6866   while (*(tem = instance_leader.get (instance)) != instance)
6867     {
6868       chain.safe_push (tem);
6869       instance = *tem;
6870     }
6871   while (!chain.is_empty ())
6872     *chain.pop () = instance;
6873   return instance;
6874 }
6875
6876 namespace {
6877 /* Subroutine of vect_bb_partition_graph_r.  Map KEY to INSTANCE in
6878    KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6879    for KEY.  Return true if KEY was already in KEY_TO_INSTANCE.
6880
6881    INSTANCE_LEADER is as for get_ultimate_leader.  */
6882
6883 template<typename T>
6884 bool
6885 vect_map_to_instance (slp_instance instance, T key,
6886                       hash_map<T, slp_instance> &key_to_instance,
6887                       hash_map<slp_instance, slp_instance> &instance_leader)
6888 {
6889   bool existed_p;
6890   slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6891   if (!existed_p)
6892     ;
6893   else if (key_instance != instance)
6894     {
6895       /* If we're running into a previously marked key make us the
6896          leader of the current ultimate leader.  This keeps the
6897          leader chain acyclic and works even when the current instance
6898          connects two previously independent graph parts.  */
6899       slp_instance key_leader
6900         = get_ultimate_leader (key_instance, instance_leader);
6901       if (key_leader != instance)
6902         instance_leader.put (key_leader, instance);
6903     }
6904   key_instance = instance;
6905   return existed_p;
6906 }
6907 }
6908
6909 /* Worker of vect_bb_partition_graph, recurse on NODE.  */
6910
6911 static void
6912 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6913                            slp_instance instance, slp_tree node,
6914                            hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6915                            hash_map<slp_tree, slp_instance> &node_to_instance,
6916                            hash_map<slp_instance, slp_instance> &instance_leader)
6917 {
6918   stmt_vec_info stmt_info;
6919   unsigned i;
6920
6921   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6922     vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6923                           instance_leader);
6924
6925   if (vect_map_to_instance (instance, node, node_to_instance,
6926                             instance_leader))
6927     return;
6928
6929   slp_tree child;
6930   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6931     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6932       vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6933                                  node_to_instance, instance_leader);
6934 }
6935
6936 /* Partition the SLP graph into pieces that can be costed independently.  */
6937
6938 static void
6939 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6940 {
6941   DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6942
6943   /* First walk the SLP graph assigning each involved scalar stmt a
6944      corresponding SLP graph entry and upon visiting a previously
6945      marked stmt, make the stmts leader the current SLP graph entry.  */
6946   hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6947   hash_map<slp_tree, slp_instance> node_to_instance;
6948   hash_map<slp_instance, slp_instance> instance_leader;
6949   slp_instance instance;
6950   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6951     {
6952       instance_leader.put (instance, instance);
6953       vect_bb_partition_graph_r (bb_vinfo,
6954                                  instance, SLP_INSTANCE_TREE (instance),
6955                                  stmt_to_instance, node_to_instance,
6956                                  instance_leader);
6957     }
6958
6959   /* Then collect entries to each independent subgraph.  */
6960   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6961     {
6962       slp_instance leader = get_ultimate_leader (instance, instance_leader);
6963       leader->subgraph_entries.safe_push (instance);
6964       if (dump_enabled_p ()
6965           && leader != instance)
6966         dump_printf_loc (MSG_NOTE, vect_location,
6967                          "instance %p is leader of %p\n",
6968                          (void *) leader, (void *) instance);
6969     }
6970 }
6971
6972 /* Compute the set of scalar stmts participating in internal and external
6973    nodes.  */
6974
6975 static void
6976 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6977                                          hash_set<slp_tree> &visited,
6978                                          hash_set<stmt_vec_info> &vstmts,
6979                                          hash_set<stmt_vec_info> &estmts)
6980 {
6981   int i;
6982   stmt_vec_info stmt_info;
6983   slp_tree child;
6984
6985   if (visited.add (node))
6986     return;
6987
6988   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6989     {
6990       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6991         vstmts.add (stmt_info);
6992
6993       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6994         if (child)
6995           vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6996                                                    vstmts, estmts);
6997     }
6998   else
6999     for (tree def : SLP_TREE_SCALAR_OPS (node))
7000       {
7001         stmt_vec_info def_stmt = vinfo->lookup_def (def);
7002         if (def_stmt)
7003           estmts.add (def_stmt);
7004       }
7005 }
7006
7007
7008 /* Compute the scalar cost of the SLP node NODE and its children
7009    and return it.  Do not account defs that are marked in LIFE and
7010    update LIFE according to uses of NODE.  */
7011
7012 static void
7013 vect_bb_slp_scalar_cost (vec_info *vinfo,
7014                          slp_tree node, vec<bool, va_heap> *life,
7015                          stmt_vector_for_cost *cost_vec,
7016                          hash_set<stmt_vec_info> &vectorized_scalar_stmts,
7017                          hash_set<slp_tree> &visited)
7018 {
7019   unsigned i;
7020   stmt_vec_info stmt_info;
7021   slp_tree child;
7022
7023   if (visited.add (node))
7024     return;
7025
7026   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7027     {
7028       ssa_op_iter op_iter;
7029       def_operand_p def_p;
7030
7031       if ((*life)[i])
7032         continue;
7033
7034       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
7035       gimple *orig_stmt = orig_stmt_info->stmt;
7036
7037       /* If there is a non-vectorized use of the defs then the scalar
7038          stmt is kept live in which case we do not account it or any
7039          required defs in the SLP children in the scalar cost.  This
7040          way we make the vectorization more costly when compared to
7041          the scalar cost.  */
7042       if (!STMT_VINFO_LIVE_P (stmt_info))
7043         {
7044           auto_vec<gimple *, 8> worklist;
7045           hash_set<gimple *> *worklist_visited = NULL;
7046           worklist.quick_push (orig_stmt);
7047           do
7048             {
7049               gimple *work_stmt = worklist.pop ();
7050               FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
7051                 {
7052                   imm_use_iterator use_iter;
7053                   gimple *use_stmt;
7054                   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
7055                                          DEF_FROM_PTR (def_p))
7056                     if (!is_gimple_debug (use_stmt))
7057                       {
7058                         stmt_vec_info use_stmt_info
7059                           = vinfo->lookup_stmt (use_stmt);
7060                         if (!use_stmt_info
7061                             || !vectorized_scalar_stmts.contains (use_stmt_info))
7062                           {
7063                             if (use_stmt_info
7064                                 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
7065                               {
7066                                 /* For stmts participating in patterns we have
7067                                    to check its uses recursively.  */
7068                                 if (!worklist_visited)
7069                                   worklist_visited = new hash_set<gimple *> ();
7070                                 if (!worklist_visited->add (use_stmt))
7071                                   worklist.safe_push (use_stmt);
7072                                 continue;
7073                               }
7074                             (*life)[i] = true;
7075                             goto next_lane;
7076                           }
7077                       }
7078                 }
7079             }
7080           while (!worklist.is_empty ());
7081 next_lane:
7082           if (worklist_visited)
7083             delete worklist_visited;
7084           if ((*life)[i])
7085             continue;
7086         }
7087
7088       /* Count scalar stmts only once.  */
7089       if (gimple_visited_p (orig_stmt))
7090         continue;
7091       gimple_set_visited (orig_stmt, true);
7092
7093       vect_cost_for_stmt kind;
7094       if (STMT_VINFO_DATA_REF (orig_stmt_info))
7095         {
7096           if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
7097             kind = scalar_load;
7098           else
7099             kind = scalar_store;
7100         }
7101       else if (vect_nop_conversion_p (orig_stmt_info))
7102         continue;
7103       /* For single-argument PHIs assume coalescing which means zero cost
7104          for the scalar and the vector PHIs.  This avoids artificially
7105          favoring the vector path (but may pessimize it in some cases).  */
7106       else if (is_a <gphi *> (orig_stmt_info->stmt)
7107                && gimple_phi_num_args
7108                     (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
7109         continue;
7110       else
7111         kind = scalar_stmt;
7112       record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
7113                         SLP_TREE_VECTYPE (node), 0, vect_body);
7114     }
7115
7116   auto_vec<bool, 20> subtree_life;
7117   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7118     {
7119       if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7120         {
7121           /* Do not directly pass LIFE to the recursive call, copy it to
7122              confine changes in the callee to the current child/subtree.  */
7123           if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7124             {
7125               subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
7126               for (unsigned j = 0;
7127                    j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
7128                 {
7129                   auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
7130                   if (perm.first == i)
7131                     subtree_life[perm.second] = (*life)[j];
7132                 }
7133             }
7134           else
7135             {
7136               gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
7137               subtree_life.safe_splice (*life);
7138             }
7139           vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
7140                                    vectorized_scalar_stmts, visited);
7141           subtree_life.truncate (0);
7142         }
7143     }
7144 }
7145
7146 /* Comparator for the loop-index sorted cost vectors.  */
7147
7148 static int
7149 li_cost_vec_cmp (const void *a_, const void *b_)
7150 {
7151   auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
7152   auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
7153   if (a->first < b->first)
7154     return -1;
7155   else if (a->first == b->first)
7156     return 0;
7157   return 1;
7158 }
7159
7160 /* Check if vectorization of the basic block is profitable for the
7161    subgraph denoted by SLP_INSTANCES.  */
7162
7163 static bool
7164 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
7165                                     vec<slp_instance> slp_instances,
7166                                     loop_p orig_loop)
7167 {
7168   slp_instance instance;
7169   int i;
7170   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
7171   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
7172
7173   if (dump_enabled_p ())
7174     {
7175       dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
7176       hash_set<slp_tree> visited;
7177       FOR_EACH_VEC_ELT (slp_instances, i, instance)
7178         vect_print_slp_graph (MSG_NOTE, vect_location,
7179                               SLP_INSTANCE_TREE (instance), visited);
7180     }
7181
7182   /* Compute the set of scalar stmts we know will go away 'locally' when
7183      vectorizing.  This used to be tracked with just PURE_SLP_STMT but that's
7184      not accurate for nodes promoted extern late or for scalar stmts that
7185      are used both in extern defs and in vectorized defs.  */
7186   hash_set<stmt_vec_info> vectorized_scalar_stmts;
7187   hash_set<stmt_vec_info> scalar_stmts_in_externs;
7188   hash_set<slp_tree> visited;
7189   FOR_EACH_VEC_ELT (slp_instances, i, instance)
7190     {
7191       vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
7192                                                SLP_INSTANCE_TREE (instance),
7193                                                visited,
7194                                                vectorized_scalar_stmts,
7195                                                scalar_stmts_in_externs);
7196       for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
7197         vectorized_scalar_stmts.add (rstmt);
7198     }
7199   /* Scalar stmts used as defs in external nodes need to be preseved, so
7200      remove them from vectorized_scalar_stmts.  */
7201   for (stmt_vec_info stmt : scalar_stmts_in_externs)
7202     vectorized_scalar_stmts.remove (stmt);
7203
7204   /* Calculate scalar cost and sum the cost for the vector stmts
7205      previously collected.  */
7206   stmt_vector_for_cost scalar_costs = vNULL;
7207   stmt_vector_for_cost vector_costs = vNULL;
7208   visited.empty ();
7209   FOR_EACH_VEC_ELT (slp_instances, i, instance)
7210     {
7211       auto_vec<bool, 20> life;
7212       life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
7213                               true);
7214       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7215         record_stmt_cost (&scalar_costs,
7216                           SLP_INSTANCE_ROOT_STMTS (instance).length (),
7217                           scalar_stmt,
7218                           SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
7219       vect_bb_slp_scalar_cost (bb_vinfo,
7220                                SLP_INSTANCE_TREE (instance),
7221                                &life, &scalar_costs, vectorized_scalar_stmts,
7222                                visited);
7223       vector_costs.safe_splice (instance->cost_vec);
7224       instance->cost_vec.release ();
7225     }
7226
7227   if (dump_enabled_p ())
7228     dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
7229
7230   /* When costing non-loop vectorization we need to consider each covered
7231      loop independently and make sure vectorization is profitable.  For
7232      now we assume a loop may be not entered or executed an arbitrary
7233      number of iterations (???  static information can provide more
7234      precise info here) which means we can simply cost each containing
7235      loops stmts separately.  */
7236
7237   /* First produce cost vectors sorted by loop index.  */
7238   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7239     li_scalar_costs (scalar_costs.length ());
7240   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7241     li_vector_costs (vector_costs.length ());
7242   stmt_info_for_cost *cost;
7243   FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7244     {
7245       unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7246       li_scalar_costs.quick_push (std::make_pair (l, cost));
7247     }
7248   /* Use a random used loop as fallback in case the first vector_costs
7249      entry does not have a stmt_info associated with it.  */
7250   unsigned l = li_scalar_costs[0].first;
7251   FOR_EACH_VEC_ELT (vector_costs, i, cost)
7252     {
7253       /* We inherit from the previous COST, invariants, externals and
7254          extracts immediately follow the cost for the related stmt.  */
7255       if (cost->stmt_info)
7256         l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7257       li_vector_costs.quick_push (std::make_pair (l, cost));
7258     }
7259   li_scalar_costs.qsort (li_cost_vec_cmp);
7260   li_vector_costs.qsort (li_cost_vec_cmp);
7261
7262   /* Now cost the portions individually.  */
7263   unsigned vi = 0;
7264   unsigned si = 0;
7265   bool profitable = true;
7266   while (si < li_scalar_costs.length ()
7267          && vi < li_vector_costs.length ())
7268     {
7269       unsigned sl = li_scalar_costs[si].first;
7270       unsigned vl = li_vector_costs[vi].first;
7271       if (sl != vl)
7272         {
7273           if (dump_enabled_p ())
7274             dump_printf_loc (MSG_NOTE, vect_location,
7275                              "Scalar %d and vector %d loop part do not "
7276                              "match up, skipping scalar part\n", sl, vl);
7277           /* Skip the scalar part, assuming zero cost on the vector side.  */
7278           do
7279             {
7280               si++;
7281             }
7282           while (si < li_scalar_costs.length ()
7283                  && li_scalar_costs[si].first == sl);
7284           continue;
7285         }
7286
7287       class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7288       do
7289         {
7290           add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7291           si++;
7292         }
7293       while (si < li_scalar_costs.length ()
7294              && li_scalar_costs[si].first == sl);
7295       unsigned dummy;
7296       finish_cost (scalar_target_cost_data, nullptr,
7297                    &dummy, &scalar_cost, &dummy);
7298
7299       /* Complete the target-specific vector cost calculation.  */
7300       class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7301       do
7302         {
7303           add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7304           vi++;
7305         }
7306       while (vi < li_vector_costs.length ()
7307              && li_vector_costs[vi].first == vl);
7308       finish_cost (vect_target_cost_data, scalar_target_cost_data,
7309                    &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7310       delete scalar_target_cost_data;
7311       delete vect_target_cost_data;
7312
7313       vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7314
7315       if (dump_enabled_p ())
7316         {
7317           dump_printf_loc (MSG_NOTE, vect_location,
7318                            "Cost model analysis for part in loop %d:\n", sl);
7319           dump_printf (MSG_NOTE, "  Vector cost: %d\n",
7320                        vec_inside_cost + vec_outside_cost);
7321           dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
7322         }
7323
7324       /* Vectorization is profitable if its cost is more than the cost of scalar
7325          version.  Note that we err on the vector side for equal cost because
7326          the cost estimate is otherwise quite pessimistic (constant uses are
7327          free on the scalar side but cost a load on the vector side for
7328          example).  */
7329       if (vec_outside_cost + vec_inside_cost > scalar_cost)
7330         {
7331           profitable = false;
7332           break;
7333         }
7334     }
7335   if (profitable && vi < li_vector_costs.length ())
7336     {
7337       if (dump_enabled_p ())
7338         dump_printf_loc (MSG_NOTE, vect_location,
7339                          "Excess vector cost for part in loop %d:\n",
7340                          li_vector_costs[vi].first);
7341       profitable = false;
7342     }
7343
7344   /* Unset visited flag.  This is delayed when the subgraph is profitable
7345      and we process the loop for remaining unvectorized if-converted code.  */
7346   if (!orig_loop || !profitable)
7347     FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7348       gimple_set_visited  (cost->stmt_info->stmt, false);
7349
7350   scalar_costs.release ();
7351   vector_costs.release ();
7352
7353   return profitable;
7354 }
7355
7356 /* qsort comparator for lane defs.  */
7357
7358 static int
7359 vld_cmp (const void *a_, const void *b_)
7360 {
7361   auto *a = (const std::pair<unsigned, tree> *)a_;
7362   auto *b = (const std::pair<unsigned, tree> *)b_;
7363   return a->first - b->first;
7364 }
7365
7366 /* Return true if USE_STMT is a vector lane insert into VEC and set
7367    *THIS_LANE to the lane number that is set.  */
7368
7369 static bool
7370 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7371 {
7372   gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7373   if (!use_ass
7374       || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7375       || (vec
7376           ? gimple_assign_rhs1 (use_ass) != vec
7377           : ((vec = gimple_assign_rhs1 (use_ass)), false))
7378       || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7379                                      TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7380       || !constant_multiple_p
7381             (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7382              tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7383              this_lane))
7384     return false;
7385   return true;
7386 }
7387
7388 /* Find any vectorizable constructors and add them to the grouped_store
7389    array.  */
7390
7391 static void
7392 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7393 {
7394   for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7395     for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7396          !gsi_end_p (gsi); gsi_next (&gsi))
7397     {
7398       gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7399       if (!assign)
7400         continue;
7401
7402       tree rhs = gimple_assign_rhs1 (assign);
7403       enum tree_code code = gimple_assign_rhs_code (assign);
7404       use_operand_p use_p;
7405       gimple *use_stmt;
7406       if (code == CONSTRUCTOR)
7407         {
7408           if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7409               || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7410                            CONSTRUCTOR_NELTS (rhs))
7411               || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7412               || uniform_vector_p (rhs))
7413             continue;
7414
7415           unsigned j;
7416           tree val;
7417           FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7418             if (TREE_CODE (val) != SSA_NAME
7419                 || !bb_vinfo->lookup_def (val))
7420               break;
7421           if (j != CONSTRUCTOR_NELTS (rhs))
7422             continue;
7423
7424           vec<stmt_vec_info> roots = vNULL;
7425           roots.safe_push (bb_vinfo->lookup_stmt (assign));
7426           vec<stmt_vec_info> stmts;
7427           stmts.create (CONSTRUCTOR_NELTS (rhs));
7428           FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7429             stmts.quick_push
7430               (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7431           bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7432                                                stmts, roots));
7433         }
7434       else if (code == BIT_INSERT_EXPR
7435                && VECTOR_TYPE_P (TREE_TYPE (rhs))
7436                && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7437                && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7438                && integer_zerop (gimple_assign_rhs3 (assign))
7439                && useless_type_conversion_p
7440                     (TREE_TYPE (TREE_TYPE (rhs)),
7441                      TREE_TYPE (gimple_assign_rhs2 (assign)))
7442                && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7443         {
7444           /* We start to match on insert to lane zero but since the
7445              inserts need not be ordered we'd have to search both
7446              the def and the use chains.  */
7447           tree vectype = TREE_TYPE (rhs);
7448           unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7449           auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7450           auto_sbitmap lanes (nlanes);
7451           bitmap_clear (lanes);
7452           bitmap_set_bit (lanes, 0);
7453           tree def = gimple_assign_lhs (assign);
7454           lane_defs.quick_push
7455                       (std::make_pair (0, gimple_assign_rhs2 (assign)));
7456           unsigned lanes_found = 1;
7457           /* Start with the use chains, the last stmt will be the root.  */
7458           stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7459           vec<stmt_vec_info> roots = vNULL;
7460           roots.safe_push (last);
7461           do
7462             {
7463               use_operand_p use_p;
7464               gimple *use_stmt;
7465               if (!single_imm_use (def, &use_p, &use_stmt))
7466                 break;
7467               unsigned this_lane;
7468               if (!bb_vinfo->lookup_stmt (use_stmt)
7469                   || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7470                   || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7471                 break;
7472               if (bitmap_bit_p (lanes, this_lane))
7473                 break;
7474               lanes_found++;
7475               bitmap_set_bit (lanes, this_lane);
7476               gassign *use_ass = as_a <gassign *> (use_stmt);
7477               lane_defs.quick_push (std::make_pair
7478                                      (this_lane, gimple_assign_rhs2 (use_ass)));
7479               last = bb_vinfo->lookup_stmt (use_ass);
7480               roots.safe_push (last);
7481               def = gimple_assign_lhs (use_ass);
7482             }
7483           while (lanes_found < nlanes);
7484           if (roots.length () > 1)
7485             std::swap(roots[0], roots[roots.length () - 1]);
7486           if (lanes_found < nlanes)
7487             {
7488               /* Now search the def chain.  */
7489               def = gimple_assign_rhs1 (assign);
7490               do
7491                 {
7492                   if (TREE_CODE (def) != SSA_NAME
7493                       || !has_single_use (def))
7494                     break;
7495                   gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7496                   unsigned this_lane;
7497                   if (!bb_vinfo->lookup_stmt (def_stmt)
7498                       || !vect_slp_is_lane_insert (def_stmt,
7499                                                    NULL_TREE, &this_lane)
7500                       || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7501                     break;
7502                   if (bitmap_bit_p (lanes, this_lane))
7503                     break;
7504                   lanes_found++;
7505                   bitmap_set_bit (lanes, this_lane);
7506                   lane_defs.quick_push (std::make_pair
7507                                           (this_lane,
7508                                            gimple_assign_rhs2 (def_stmt)));
7509                   roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7510                   def = gimple_assign_rhs1 (def_stmt);
7511                 }
7512               while (lanes_found < nlanes);
7513             }
7514           if (lanes_found == nlanes)
7515             {
7516               /* Sort lane_defs after the lane index and register the root.  */
7517               lane_defs.qsort (vld_cmp);
7518               vec<stmt_vec_info> stmts;
7519               stmts.create (nlanes);
7520               for (unsigned i = 0; i < nlanes; ++i)
7521                 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7522               bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7523                                                    stmts, roots));
7524             }
7525           else
7526             roots.release ();
7527         }
7528       else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7529                && (associative_tree_code (code) || code == MINUS_EXPR)
7530                /* ???  This pessimizes a two-element reduction.  PR54400.
7531                   ???  In-order reduction could be handled if we only
7532                   traverse one operand chain in vect_slp_linearize_chain.  */
7533                && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7534                /* Ops with constants at the tail can be stripped here.  */
7535                && TREE_CODE (rhs) == SSA_NAME
7536                && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7537                /* Should be the chain end.  */
7538                && (!single_imm_use (gimple_assign_lhs (assign),
7539                                     &use_p, &use_stmt)
7540                    || !is_gimple_assign (use_stmt)
7541                    || (gimple_assign_rhs_code (use_stmt) != code
7542                        && ((code != PLUS_EXPR && code != MINUS_EXPR)
7543                            || (gimple_assign_rhs_code (use_stmt)
7544                                != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7545         {
7546           /* We start the match at the end of a possible association
7547              chain.  */
7548           auto_vec<chain_op_t> chain;
7549           auto_vec<std::pair<tree_code, gimple *> > worklist;
7550           auto_vec<gimple *> chain_stmts;
7551           gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7552           if (code == MINUS_EXPR)
7553             code = PLUS_EXPR;
7554           internal_fn reduc_fn;
7555           if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7556               || reduc_fn == IFN_LAST)
7557             continue;
7558           vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7559                                     /* ??? */
7560                                     code_stmt, alt_code_stmt, &chain_stmts);
7561           if (chain.length () > 1)
7562             {
7563               /* Sort the chain according to def_type and operation.  */
7564               chain.sort (dt_sort_cmp, bb_vinfo);
7565               /* ???  Now we'd want to strip externals and constants
7566                  but record those to be handled in the epilogue.  */
7567               /* ???  For now do not allow mixing ops or externs/constants.  */
7568               bool invalid = false;
7569               unsigned remain_cnt = 0;
7570               unsigned last_idx = 0;
7571               for (unsigned i = 0; i < chain.length (); ++i)
7572                 {
7573                   if (chain[i].code != code)
7574                     {
7575                       invalid = true;
7576                       break;
7577                     }
7578                   if (chain[i].dt != vect_internal_def
7579                       /* Avoid stmts where the def is not the LHS, like
7580                          ASMs.  */
7581                       || (gimple_get_lhs (bb_vinfo->lookup_def
7582                                                       (chain[i].op)->stmt)
7583                           != chain[i].op))
7584                     remain_cnt++;
7585                   else
7586                     last_idx = i;
7587                 }
7588               /* Make sure to have an even number of lanes as we later do
7589                  all-or-nothing discovery, not trying to split further.  */
7590               if ((chain.length () - remain_cnt) & 1)
7591                 remain_cnt++;
7592               if (!invalid && chain.length () - remain_cnt > 1)
7593                 {
7594                   vec<stmt_vec_info> stmts;
7595                   vec<tree> remain = vNULL;
7596                   stmts.create (chain.length ());
7597                   if (remain_cnt > 0)
7598                     remain.create (remain_cnt);
7599                   for (unsigned i = 0; i < chain.length (); ++i)
7600                     {
7601                       stmt_vec_info stmt_info;
7602                       if (chain[i].dt == vect_internal_def
7603                           && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
7604                               gimple_get_lhs (stmt_info->stmt) == chain[i].op)
7605                           && (i != last_idx
7606                               || (stmts.length () & 1)))
7607                         stmts.quick_push (stmt_info);
7608                       else
7609                         remain.quick_push (chain[i].op);
7610                     }
7611                   vec<stmt_vec_info> roots;
7612                   roots.create (chain_stmts.length ());
7613                   for (unsigned i = 0; i < chain_stmts.length (); ++i)
7614                     roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7615                   bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7616                                                        stmts, roots, remain));
7617                 }
7618             }
7619         }
7620     }
7621 }
7622
7623 /* Walk the grouped store chains and replace entries with their
7624    pattern variant if any.  */
7625
7626 static void
7627 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7628 {
7629   stmt_vec_info first_element;
7630   unsigned i;
7631
7632   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7633     {
7634       /* We also have CTORs in this array.  */
7635       if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7636         continue;
7637       if (STMT_VINFO_IN_PATTERN_P (first_element))
7638         {
7639           stmt_vec_info orig = first_element;
7640           first_element = STMT_VINFO_RELATED_STMT (first_element);
7641           DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7642           DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7643           DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7644           DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7645           vinfo->grouped_stores[i] = first_element;
7646         }
7647       stmt_vec_info prev = first_element;
7648       while (DR_GROUP_NEXT_ELEMENT (prev))
7649         {
7650           stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7651           if (STMT_VINFO_IN_PATTERN_P (elt))
7652             {
7653               stmt_vec_info orig = elt;
7654               elt = STMT_VINFO_RELATED_STMT (elt);
7655               DR_GROUP_NEXT_ELEMENT (prev) = elt;
7656               DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7657               DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7658             }
7659           DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7660           prev = elt;
7661         }
7662     }
7663 }
7664
7665 /* Check if the region described by BB_VINFO can be vectorized, returning
7666    true if so.  When returning false, set FATAL to true if the same failure
7667    would prevent vectorization at other vector sizes, false if it is still
7668    worth trying other sizes.  N_STMTS is the number of statements in the
7669    region.  */
7670
7671 static bool
7672 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7673                        vec<int> *dataref_groups)
7674 {
7675   DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7676
7677   slp_instance instance;
7678   int i;
7679   poly_uint64 min_vf = 2;
7680
7681   /* The first group of checks is independent of the vector size.  */
7682   fatal = true;
7683
7684   /* Analyze the data references.  */
7685
7686   if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7687     {
7688       if (dump_enabled_p ())
7689         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7690                          "not vectorized: unhandled data-ref in basic "
7691                          "block.\n");
7692       return false;
7693     }
7694
7695   if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7696     {
7697      if (dump_enabled_p ())
7698        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7699                         "not vectorized: unhandled data access in "
7700                         "basic block.\n");
7701       return false;
7702     }
7703
7704   vect_slp_check_for_roots (bb_vinfo);
7705
7706   /* If there are no grouped stores and no constructors in the region
7707      there is no need to continue with pattern recog as vect_analyze_slp
7708      will fail anyway.  */
7709   if (bb_vinfo->grouped_stores.is_empty ()
7710       && bb_vinfo->roots.is_empty ())
7711     {
7712       if (dump_enabled_p ())
7713         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7714                          "not vectorized: no grouped stores in "
7715                          "basic block.\n");
7716       return false;
7717     }
7718
7719   /* While the rest of the analysis below depends on it in some way.  */
7720   fatal = false;
7721
7722   vect_pattern_recog (bb_vinfo);
7723
7724   /* Update store groups from pattern processing.  */
7725   vect_fixup_store_groups_with_patterns (bb_vinfo);
7726
7727   /* Check the SLP opportunities in the basic block, analyze and build SLP
7728      trees.  */
7729   if (!vect_analyze_slp (bb_vinfo, n_stmts))
7730     {
7731       if (dump_enabled_p ())
7732         {
7733           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7734                            "Failed to SLP the basic block.\n");
7735           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7736                            "not vectorized: failed to find SLP opportunities "
7737                            "in basic block.\n");
7738         }
7739       return false;
7740     }
7741
7742   /* Optimize permutations.  */
7743   vect_optimize_slp (bb_vinfo);
7744
7745   /* Gather the loads reachable from the SLP graph entries.  */
7746   vect_gather_slp_loads (bb_vinfo);
7747
7748   vect_record_base_alignments (bb_vinfo);
7749
7750   /* Analyze and verify the alignment of data references and the
7751      dependence in the SLP instances.  */
7752   for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7753     {
7754       vect_location = instance->location ();
7755       if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7756           || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7757         {
7758           slp_tree node = SLP_INSTANCE_TREE (instance);
7759           stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7760           if (dump_enabled_p ())
7761             dump_printf_loc (MSG_NOTE, vect_location,
7762                              "removing SLP instance operations starting from: %G",
7763                              stmt_info->stmt);
7764           vect_free_slp_instance (instance);
7765           BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7766           continue;
7767         }
7768
7769       /* Mark all the statements that we want to vectorize as pure SLP and
7770          relevant.  */
7771       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7772       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7773       unsigned j;
7774       stmt_vec_info root;
7775       /* Likewise consider instance root stmts as vectorized.  */
7776       FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7777         STMT_SLP_TYPE (root) = pure_slp;
7778
7779       i++;
7780     }
7781   if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7782     return false;
7783
7784   if (!vect_slp_analyze_operations (bb_vinfo))
7785     {
7786       if (dump_enabled_p ())
7787         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7788                          "not vectorized: bad operation in basic block.\n");
7789       return false;
7790     }
7791
7792   vect_bb_partition_graph (bb_vinfo);
7793
7794   return true;
7795 }
7796
7797 /* Subroutine of vect_slp_bb.  Try to vectorize the statements for all
7798    basic blocks in BBS, returning true on success.
7799    The region has N_STMTS statements and has the datarefs given by DATAREFS.  */
7800
7801 static bool
7802 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7803                  vec<int> *dataref_groups, unsigned int n_stmts,
7804                  loop_p orig_loop)
7805 {
7806   bb_vec_info bb_vinfo;
7807   auto_vector_modes vector_modes;
7808
7809   /* Autodetect first vector size we try.  */
7810   machine_mode next_vector_mode = VOIDmode;
7811   targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7812   unsigned int mode_i = 0;
7813
7814   vec_info_shared shared;
7815
7816   machine_mode autodetected_vector_mode = VOIDmode;
7817   while (1)
7818     {
7819       bool vectorized = false;
7820       bool fatal = false;
7821       bb_vinfo = new _bb_vec_info (bbs, &shared);
7822
7823       bool first_time_p = shared.datarefs.is_empty ();
7824       BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7825       if (first_time_p)
7826         bb_vinfo->shared->save_datarefs ();
7827       else
7828         bb_vinfo->shared->check_datarefs ();
7829       bb_vinfo->vector_mode = next_vector_mode;
7830
7831       if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7832         {
7833           if (dump_enabled_p ())
7834             {
7835               dump_printf_loc (MSG_NOTE, vect_location,
7836                                "***** Analysis succeeded with vector mode"
7837                                " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7838               dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7839             }
7840
7841           bb_vinfo->shared->check_datarefs ();
7842
7843           bool force_clear = false;
7844           auto_vec<slp_instance> profitable_subgraphs;
7845           for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7846             {
7847               if (instance->subgraph_entries.is_empty ())
7848                 continue;
7849
7850               dump_user_location_t saved_vect_location = vect_location;
7851               vect_location = instance->location ();
7852               if (!unlimited_cost_model (NULL)
7853                   && !vect_bb_vectorization_profitable_p
7854                         (bb_vinfo, instance->subgraph_entries, orig_loop))
7855                 {
7856                   if (dump_enabled_p ())
7857                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7858                                      "not vectorized: vectorization is not "
7859                                      "profitable.\n");
7860                   vect_location = saved_vect_location;
7861                   continue;
7862                 }
7863
7864               vect_location = saved_vect_location;
7865               if (!dbg_cnt (vect_slp))
7866                 {
7867                   force_clear = true;
7868                   continue;
7869                 }
7870
7871               profitable_subgraphs.safe_push (instance);
7872             }
7873
7874           /* When we're vectorizing an if-converted loop body make sure
7875              we vectorized all if-converted code.  */
7876           if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
7877             {
7878               gcc_assert (bb_vinfo->bbs.length () == 1);
7879               for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7880                    !gsi_end_p (gsi); gsi_next (&gsi))
7881                 {
7882                   /* The costing above left us with DCEable vectorized scalar
7883                      stmts having the visited flag set on profitable
7884                      subgraphs.  Do the delayed clearing of the flag here.  */
7885                   if (gimple_visited_p (gsi_stmt (gsi)))
7886                     {
7887                       gimple_set_visited (gsi_stmt (gsi), false);
7888                       continue;
7889                     }
7890                   if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7891                     continue;
7892
7893                   if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7894                     if (gimple_assign_rhs_code (ass) == COND_EXPR)
7895                       {
7896                         if (!profitable_subgraphs.is_empty ()
7897                             && dump_enabled_p ())
7898                           dump_printf_loc (MSG_NOTE, vect_location,
7899                                            "not profitable because of "
7900                                            "unprofitable if-converted scalar "
7901                                            "code\n");
7902                         profitable_subgraphs.truncate (0);
7903                       }
7904                 }
7905             }
7906
7907           /* Finally schedule the profitable subgraphs.  */
7908           for (slp_instance instance : profitable_subgraphs)
7909             {
7910               if (!vectorized && dump_enabled_p ())
7911                 dump_printf_loc (MSG_NOTE, vect_location,
7912                                  "Basic block will be vectorized "
7913                                  "using SLP\n");
7914               vectorized = true;
7915
7916               /* Dump before scheduling as store vectorization will remove
7917                  the original stores and mess with the instance tree
7918                  so querying its location will eventually ICE.  */
7919               if (flag_checking)
7920                 for (slp_instance sub : instance->subgraph_entries)
7921                   gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7922               unsigned HOST_WIDE_INT bytes;
7923               if (dump_enabled_p ())
7924                 for (slp_instance sub : instance->subgraph_entries)
7925                   {
7926                     tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7927                     if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7928                       dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7929                                        sub->location (),
7930                                        "basic block part vectorized using %wu "
7931                                        "byte vectors\n", bytes);
7932                     else
7933                       dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7934                                        sub->location (),
7935                                        "basic block part vectorized using "
7936                                        "variable length vectors\n");
7937                   }
7938
7939               dump_user_location_t saved_vect_location = vect_location;
7940               vect_location = instance->location ();
7941
7942               vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7943
7944               vect_location = saved_vect_location;
7945             }
7946         }
7947       else
7948         {
7949           if (dump_enabled_p ())
7950             dump_printf_loc (MSG_NOTE, vect_location,
7951                              "***** Analysis failed with vector mode %s\n",
7952                              GET_MODE_NAME (bb_vinfo->vector_mode));
7953         }
7954
7955       if (mode_i == 0)
7956         autodetected_vector_mode = bb_vinfo->vector_mode;
7957
7958       if (!fatal)
7959         while (mode_i < vector_modes.length ()
7960                && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7961           {
7962             if (dump_enabled_p ())
7963               dump_printf_loc (MSG_NOTE, vect_location,
7964                                "***** The result for vector mode %s would"
7965                                " be the same\n",
7966                                GET_MODE_NAME (vector_modes[mode_i]));
7967             mode_i += 1;
7968           }
7969
7970       delete bb_vinfo;
7971
7972       if (mode_i < vector_modes.length ()
7973           && VECTOR_MODE_P (autodetected_vector_mode)
7974           && (related_vector_mode (vector_modes[mode_i],
7975                                    GET_MODE_INNER (autodetected_vector_mode))
7976               == autodetected_vector_mode)
7977           && (related_vector_mode (autodetected_vector_mode,
7978                                    GET_MODE_INNER (vector_modes[mode_i]))
7979               == vector_modes[mode_i]))
7980         {
7981           if (dump_enabled_p ())
7982             dump_printf_loc (MSG_NOTE, vect_location,
7983                              "***** Skipping vector mode %s, which would"
7984                              " repeat the analysis for %s\n",
7985                              GET_MODE_NAME (vector_modes[mode_i]),
7986                              GET_MODE_NAME (autodetected_vector_mode));
7987           mode_i += 1;
7988         }
7989
7990       if (vectorized
7991           || mode_i == vector_modes.length ()
7992           || autodetected_vector_mode == VOIDmode
7993           /* If vect_slp_analyze_bb_1 signaled that analysis for all
7994              vector sizes will fail do not bother iterating.  */
7995           || fatal)
7996         return vectorized;
7997
7998       /* Try the next biggest vector size.  */
7999       next_vector_mode = vector_modes[mode_i++];
8000       if (dump_enabled_p ())
8001         dump_printf_loc (MSG_NOTE, vect_location,
8002                          "***** Re-trying analysis with vector mode %s\n",
8003                          GET_MODE_NAME (next_vector_mode));
8004     }
8005 }
8006
8007
8008 /* Main entry for the BB vectorizer.  Analyze and transform BBS, returns
8009    true if anything in the basic-block was vectorized.  */
8010
8011 static bool
8012 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
8013 {
8014   vec<data_reference_p> datarefs = vNULL;
8015   auto_vec<int> dataref_groups;
8016   int insns = 0;
8017   int current_group = 0;
8018
8019   for (unsigned i = 0; i < bbs.length (); i++)
8020     {
8021       basic_block bb = bbs[i];
8022       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
8023            gsi_next (&gsi))
8024         {
8025           gimple *stmt = gsi_stmt (gsi);
8026           if (is_gimple_debug (stmt))
8027             continue;
8028
8029           insns++;
8030
8031           if (gimple_location (stmt) != UNKNOWN_LOCATION)
8032             vect_location = stmt;
8033
8034           if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
8035                                               &dataref_groups, current_group))
8036             ++current_group;
8037         }
8038       /* New BBs always start a new DR group.  */
8039       ++current_group;
8040     }
8041
8042   return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
8043 }
8044
8045 /* Special entry for the BB vectorizer.  Analyze and transform a single
8046    if-converted BB with ORIG_LOOPs body being the not if-converted
8047    representation.  Returns true if anything in the basic-block was
8048    vectorized.  */
8049
8050 bool
8051 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
8052 {
8053   auto_vec<basic_block> bbs;
8054   bbs.safe_push (bb);
8055   return vect_slp_bbs (bbs, orig_loop);
8056 }
8057
8058 /* Main entry for the BB vectorizer.  Analyze and transform BB, returns
8059    true if anything in the basic-block was vectorized.  */
8060
8061 bool
8062 vect_slp_function (function *fun)
8063 {
8064   bool r = false;
8065   int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
8066   auto_bitmap exit_bbs;
8067   bitmap_set_bit (exit_bbs, EXIT_BLOCK);
8068   edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
8069   unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
8070                                                       true, rpo, NULL);
8071
8072   /* For the moment split the function into pieces to avoid making
8073      the iteration on the vector mode moot.  Split at points we know
8074      to not handle well which is CFG merges (SLP discovery doesn't
8075      handle non-loop-header PHIs) and loop exits.  Since pattern
8076      recog requires reverse iteration to visit uses before defs
8077      simply chop RPO into pieces.  */
8078   auto_vec<basic_block> bbs;
8079   for (unsigned i = 0; i < n; i++)
8080     {
8081       basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
8082       bool split = false;
8083
8084       /* Split when a BB is not dominated by the first block.  */
8085       if (!bbs.is_empty ()
8086           && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
8087         {
8088           if (dump_enabled_p ())
8089             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8090                              "splitting region at dominance boundary bb%d\n",
8091                              bb->index);
8092           split = true;
8093         }
8094       /* Split when the loop determined by the first block
8095          is exited.  This is because we eventually insert
8096          invariants at region begin.  */
8097       else if (!bbs.is_empty ()
8098                && bbs[0]->loop_father != bb->loop_father
8099                && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
8100         {
8101           if (dump_enabled_p ())
8102             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8103                              "splitting region at loop %d exit at bb%d\n",
8104                              bbs[0]->loop_father->num, bb->index);
8105           split = true;
8106         }
8107       else if (!bbs.is_empty ()
8108                && bb->loop_father->header == bb
8109                && bb->loop_father->dont_vectorize)
8110         {
8111           if (dump_enabled_p ())
8112             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8113                              "splitting region at dont-vectorize loop %d "
8114                              "entry at bb%d\n",
8115                              bb->loop_father->num, bb->index);
8116           split = true;
8117         }
8118
8119       if (split && !bbs.is_empty ())
8120         {
8121           r |= vect_slp_bbs (bbs, NULL);
8122           bbs.truncate (0);
8123         }
8124
8125       if (bbs.is_empty ())
8126         {
8127           /* We need to be able to insert at the head of the region which
8128              we cannot for region starting with a returns-twice call.  */
8129           if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
8130             if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
8131               {
8132                 if (dump_enabled_p ())
8133                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8134                                    "skipping bb%d as start of region as it "
8135                                    "starts with returns-twice call\n",
8136                                    bb->index);
8137                 continue;
8138               }
8139           /* If the loop this BB belongs to is marked as not to be vectorized
8140              honor that also for BB vectorization.  */
8141           if (bb->loop_father->dont_vectorize)
8142             continue;
8143         }
8144
8145       bbs.safe_push (bb);
8146
8147       /* When we have a stmt ending this block and defining a
8148          value we have to insert on edges when inserting after it for
8149          a vector containing its definition.  Avoid this for now.  */
8150       if (gimple *last = *gsi_last_bb (bb))
8151         if (gimple_get_lhs (last)
8152             && is_ctrl_altering_stmt (last))
8153           {
8154             if (dump_enabled_p ())
8155               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8156                                "splitting region at control altering "
8157                                "definition %G", last);
8158             r |= vect_slp_bbs (bbs, NULL);
8159             bbs.truncate (0);
8160           }
8161     }
8162
8163   if (!bbs.is_empty ())
8164     r |= vect_slp_bbs (bbs, NULL);
8165
8166   free (rpo);
8167
8168   return r;
8169 }
8170
8171 /* Build a variable-length vector in which the elements in ELTS are repeated
8172    to a fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
8173    RESULTS and add any new instructions to SEQ.
8174
8175    The approach we use is:
8176
8177    (1) Find a vector mode VM with integer elements of mode IM.
8178
8179    (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8180        ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
8181        from small vectors to IM.
8182
8183    (3) Duplicate each ELTS'[I] into a vector of mode VM.
8184
8185    (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
8186        correct byte contents.
8187
8188    (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
8189
8190    We try to find the largest IM for which this sequence works, in order
8191    to cut down on the number of interleaves.  */
8192
8193 void
8194 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
8195                           const vec<tree> &elts, unsigned int nresults,
8196                           vec<tree> &results)
8197 {
8198   unsigned int nelts = elts.length ();
8199   tree element_type = TREE_TYPE (vector_type);
8200
8201   /* (1) Find a vector mode VM with integer elements of mode IM.  */
8202   unsigned int nvectors = 1;
8203   tree new_vector_type;
8204   tree permutes[2];
8205   if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
8206                                        &nvectors, &new_vector_type,
8207                                        permutes))
8208     gcc_unreachable ();
8209
8210   /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
8211   unsigned int partial_nelts = nelts / nvectors;
8212   tree partial_vector_type = build_vector_type (element_type, partial_nelts);
8213
8214   tree_vector_builder partial_elts;
8215   auto_vec<tree, 32> pieces (nvectors * 2);
8216   pieces.quick_grow_cleared (nvectors * 2);
8217   for (unsigned int i = 0; i < nvectors; ++i)
8218     {
8219       /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8220              ELTS' has mode IM.  */
8221       partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
8222       for (unsigned int j = 0; j < partial_nelts; ++j)
8223         partial_elts.quick_push (elts[i * partial_nelts + j]);
8224       tree t = gimple_build_vector (seq, &partial_elts);
8225       t = gimple_build (seq, VIEW_CONVERT_EXPR,
8226                         TREE_TYPE (new_vector_type), t);
8227
8228       /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
8229       pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
8230     }
8231
8232   /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
8233          correct byte contents.
8234
8235      Conceptually, we need to repeat the following operation log2(nvectors)
8236      times, where hi_start = nvectors / 2:
8237
8238         out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
8239         out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
8240
8241      However, if each input repeats every N elements and the VF is
8242      a multiple of N * 2, the HI result is the same as the LO result.
8243      This will be true for the first N1 iterations of the outer loop,
8244      followed by N2 iterations for which both the LO and HI results
8245      are needed.  I.e.:
8246
8247         N1 + N2 = log2(nvectors)
8248
8249      Each "N1 iteration" doubles the number of redundant vectors and the
8250      effect of the process as a whole is to have a sequence of nvectors/2**N1
8251      vectors that repeats 2**N1 times.  Rather than generate these redundant
8252      vectors, we halve the number of vectors for each N1 iteration.  */
8253   unsigned int in_start = 0;
8254   unsigned int out_start = nvectors;
8255   unsigned int new_nvectors = nvectors;
8256   for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
8257     {
8258       unsigned int hi_start = new_nvectors / 2;
8259       unsigned int out_i = 0;
8260       for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
8261         {
8262           if ((in_i & 1) != 0
8263               && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
8264                              2 * in_repeat))
8265             continue;
8266
8267           tree output = make_ssa_name (new_vector_type);
8268           tree input1 = pieces[in_start + (in_i / 2)];
8269           tree input2 = pieces[in_start + (in_i / 2) + hi_start];
8270           gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
8271                                                input1, input2,
8272                                                permutes[in_i & 1]);
8273           gimple_seq_add_stmt (seq, stmt);
8274           pieces[out_start + out_i] = output;
8275           out_i += 1;
8276         }
8277       std::swap (in_start, out_start);
8278       new_nvectors = out_i;
8279     }
8280
8281   /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
8282   results.reserve (nresults);
8283   for (unsigned int i = 0; i < nresults; ++i)
8284     if (i < new_nvectors)
8285       results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
8286                                         pieces[in_start + i]));
8287     else
8288       results.quick_push (results[i - new_nvectors]);
8289 }
8290
8291
8292 /* For constant and loop invariant defs in OP_NODE this function creates
8293    vector defs that will be used in the vectorized stmts and stores them
8294    to SLP_TREE_VEC_DEFS of OP_NODE.  */
8295
8296 static void
8297 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8298 {
8299   unsigned HOST_WIDE_INT nunits;
8300   tree vec_cst;
8301   unsigned j, number_of_places_left_in_vector;
8302   tree vector_type;
8303   tree vop;
8304   int group_size = op_node->ops.length ();
8305   unsigned int vec_num, i;
8306   unsigned number_of_copies = 1;
8307   bool constant_p;
8308   gimple_seq ctor_seq = NULL;
8309   auto_vec<tree, 16> permute_results;
8310
8311   /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
8312   vector_type = SLP_TREE_VECTYPE (op_node);
8313
8314   unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8315   SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
8316   auto_vec<tree> voprnds (number_of_vectors);
8317
8318   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8319      created vectors. It is greater than 1 if unrolling is performed.
8320
8321      For example, we have two scalar operands, s1 and s2 (e.g., group of
8322      strided accesses of size two), while NUNITS is four (i.e., four scalars
8323      of this type can be packed in a vector).  The output vector will contain
8324      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
8325      will be 2).
8326
8327      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8328      containing the operands.
8329
8330      For example, NUNITS is four as before, and the group size is 8
8331      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
8332      {s5, s6, s7, s8}.  */
8333
8334   /* When using duplicate_and_interleave, we just need one element for
8335      each scalar statement.  */
8336   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
8337     nunits = group_size;
8338
8339   number_of_copies = nunits * number_of_vectors / group_size;
8340
8341   number_of_places_left_in_vector = nunits;
8342   constant_p = true;
8343   tree uniform_elt = NULL_TREE;
8344   tree_vector_builder elts (vector_type, nunits, 1);
8345   elts.quick_grow (nunits);
8346   stmt_vec_info insert_after = NULL;
8347   for (j = 0; j < number_of_copies; j++)
8348     {
8349       tree op;
8350       for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
8351         {
8352           /* Create 'vect_ = {op0,op1,...,opn}'.  */
8353           tree orig_op = op;
8354           if (number_of_places_left_in_vector == nunits)
8355             uniform_elt = op;
8356           else if (uniform_elt && operand_equal_p (uniform_elt, op))
8357             op = elts[number_of_places_left_in_vector];
8358           else
8359             uniform_elt = NULL_TREE;
8360           number_of_places_left_in_vector--;
8361           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8362             {
8363               if (CONSTANT_CLASS_P (op))
8364                 {
8365                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8366                     {
8367                       /* Can't use VIEW_CONVERT_EXPR for booleans because
8368                          of possibly different sizes of scalar value and
8369                          vector element.  */
8370                       if (integer_zerop (op))
8371                         op = build_int_cst (TREE_TYPE (vector_type), 0);
8372                       else if (integer_onep (op))
8373                         op = build_all_ones_cst (TREE_TYPE (vector_type));
8374                       else
8375                         gcc_unreachable ();
8376                     }
8377                   else
8378                     op = fold_unary (VIEW_CONVERT_EXPR,
8379                                      TREE_TYPE (vector_type), op);
8380                   gcc_assert (op && CONSTANT_CLASS_P (op));
8381                 }
8382               else
8383                 {
8384                   tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8385                   gimple *init_stmt;
8386                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8387                     {
8388                       tree true_val
8389                         = build_all_ones_cst (TREE_TYPE (vector_type));
8390                       tree false_val
8391                         = build_zero_cst (TREE_TYPE (vector_type));
8392                       gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8393                       init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8394                                                        op, true_val,
8395                                                        false_val);
8396                     }
8397                   else
8398                     {
8399                       op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8400                                    op);
8401                       init_stmt
8402                         = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8403                                                op);
8404                     }
8405                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
8406                   op = new_temp;
8407                 }
8408             }
8409           elts[number_of_places_left_in_vector] = op;
8410           if (!CONSTANT_CLASS_P (op))
8411             constant_p = false;
8412           /* For BB vectorization we have to compute an insert location
8413              when a def is inside the analyzed region since we cannot
8414              simply insert at the BB start in this case.  */
8415           stmt_vec_info opdef;
8416           if (TREE_CODE (orig_op) == SSA_NAME
8417               && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8418               && is_a <bb_vec_info> (vinfo)
8419               && (opdef = vinfo->lookup_def (orig_op)))
8420             {
8421               if (!insert_after)
8422                 insert_after = opdef;
8423               else
8424                 insert_after = get_later_stmt (insert_after, opdef);
8425             }
8426
8427           if (number_of_places_left_in_vector == 0)
8428             {
8429               auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
8430               if (uniform_elt)
8431                 vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
8432                                                         elts[0]);
8433               else if (constant_p
8434                        ? multiple_p (type_nunits, nunits)
8435                        : known_eq (type_nunits, nunits))
8436                 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8437               else
8438                 {
8439                   if (permute_results.is_empty ())
8440                     duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8441                                               elts, number_of_vectors,
8442                                               permute_results);
8443                   vec_cst = permute_results[number_of_vectors - j - 1];
8444                 }
8445               if (!gimple_seq_empty_p (ctor_seq))
8446                 {
8447                   if (insert_after)
8448                     {
8449                       gimple_stmt_iterator gsi;
8450                       if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8451                         {
8452                           gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8453                           gsi_insert_seq_before (&gsi, ctor_seq,
8454                                                  GSI_CONTINUE_LINKING);
8455                         }
8456                       else if (!stmt_ends_bb_p (insert_after->stmt))
8457                         {
8458                           gsi = gsi_for_stmt (insert_after->stmt);
8459                           gsi_insert_seq_after (&gsi, ctor_seq,
8460                                                 GSI_CONTINUE_LINKING);
8461                         }
8462                       else
8463                         {
8464                           /* When we want to insert after a def where the
8465                              defining stmt throws then insert on the fallthru
8466                              edge.  */
8467                           edge e = find_fallthru_edge
8468                                      (gimple_bb (insert_after->stmt)->succs);
8469                           basic_block new_bb
8470                             = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8471                           gcc_assert (!new_bb);
8472                         }
8473                     }
8474                   else
8475                     vinfo->insert_seq_on_entry (NULL, ctor_seq);
8476                   ctor_seq = NULL;
8477                 }
8478               voprnds.quick_push (vec_cst);
8479               insert_after = NULL;
8480               number_of_places_left_in_vector = nunits;
8481               constant_p = true;
8482               elts.new_vector (vector_type, nunits, 1);
8483               elts.quick_grow (nunits);
8484             }
8485         }
8486     }
8487
8488   /* Since the vectors are created in the reverse order, we should invert
8489      them.  */
8490   vec_num = voprnds.length ();
8491   for (j = vec_num; j != 0; j--)
8492     {
8493       vop = voprnds[j - 1];
8494       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8495     }
8496
8497   /* In case that VF is greater than the unrolling factor needed for the SLP
8498      group of stmts, NUMBER_OF_VECTORS to be created is greater than
8499      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8500      to replicate the vectors.  */
8501   while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8502     for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8503          i++)
8504       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8505 }
8506
8507 /* Get the Ith vectorized definition from SLP_NODE.  */
8508
8509 tree
8510 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8511 {
8512   return SLP_TREE_VEC_DEFS (slp_node)[i];
8513 }
8514
8515 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS.  */
8516
8517 void
8518 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8519 {
8520   vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8521   vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8522 }
8523
8524 /* Get N vectorized definitions for SLP_NODE.  */
8525
8526 void
8527 vect_get_slp_defs (vec_info *,
8528                    slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8529 {
8530   if (n == -1U)
8531     n = SLP_TREE_CHILDREN (slp_node).length ();
8532
8533   for (unsigned i = 0; i < n; ++i)
8534     {
8535       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8536       vec<tree> vec_defs = vNULL;
8537       vect_get_slp_defs (child, &vec_defs);
8538       vec_oprnds->quick_push (vec_defs);
8539     }
8540 }
8541
8542 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8543    - PERM gives the permutation that the caller wants to use for NODE,
8544      which might be different from SLP_LOAD_PERMUTATION.
8545    - DUMP_P controls whether the function dumps information.  */
8546
8547 static bool
8548 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8549                                 load_permutation_t &perm,
8550                                 const vec<tree> &dr_chain,
8551                                 gimple_stmt_iterator *gsi, poly_uint64 vf,
8552                                 bool analyze_only, bool dump_p,
8553                                 unsigned *n_perms, unsigned int *n_loads,
8554                                 bool dce_chain)
8555 {
8556   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8557   int vec_index = 0;
8558   tree vectype = SLP_TREE_VECTYPE (node);
8559   unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8560   unsigned int mask_element;
8561   unsigned dr_group_size;
8562   machine_mode mode;
8563
8564   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8565     dr_group_size = 1;
8566   else
8567     {
8568       stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8569       dr_group_size = DR_GROUP_SIZE (stmt_info);
8570     }
8571
8572   mode = TYPE_MODE (vectype);
8573   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8574   unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8575
8576   /* Initialize the vect stmts of NODE to properly insert the generated
8577      stmts later.  */
8578   if (! analyze_only)
8579     for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8580       SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8581
8582   /* Generate permutation masks for every NODE. Number of masks for each NODE
8583      is equal to GROUP_SIZE.
8584      E.g., we have a group of three nodes with three loads from the same
8585      location in each node, and the vector size is 4. I.e., we have a
8586      a0b0c0a1b1c1... sequence and we need to create the following vectors:
8587      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8588      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8589      ...
8590
8591      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8592      The last mask is illegal since we assume two operands for permute
8593      operation, and the mask element values can't be outside that range.
8594      Hence, the last mask must be converted into {2,5,5,5}.
8595      For the first two permutations we need the first and the second input
8596      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8597      we need the second and the third vectors: {b1,c1,a2,b2} and
8598      {c2,a3,b3,c3}.  */
8599
8600   int vect_stmts_counter = 0;
8601   unsigned int index = 0;
8602   int first_vec_index = -1;
8603   int second_vec_index = -1;
8604   bool noop_p = true;
8605   *n_perms = 0;
8606
8607   vec_perm_builder mask;
8608   unsigned int nelts_to_build;
8609   unsigned int nvectors_per_build;
8610   unsigned int in_nlanes;
8611   bool repeating_p = (group_size == dr_group_size
8612                       && multiple_p (nunits, group_size));
8613   if (repeating_p)
8614     {
8615       /* A single vector contains a whole number of copies of the node, so:
8616          (a) all permutes can use the same mask; and
8617          (b) the permutes only need a single vector input.  */
8618       mask.new_vector (nunits, group_size, 3);
8619       nelts_to_build = mask.encoded_nelts ();
8620       /* It's possible to obtain zero nstmts during analyze_only, so make
8621          it at least one to ensure the later computation for n_perms
8622          proceed.  */
8623       nvectors_per_build = nstmts > 0 ? nstmts : 1;
8624       in_nlanes = dr_group_size * 3;
8625     }
8626   else
8627     {
8628       /* We need to construct a separate mask for each vector statement.  */
8629       unsigned HOST_WIDE_INT const_nunits, const_vf;
8630       if (!nunits.is_constant (&const_nunits)
8631           || !vf.is_constant (&const_vf))
8632         return false;
8633       mask.new_vector (const_nunits, const_nunits, 1);
8634       nelts_to_build = const_vf * group_size;
8635       nvectors_per_build = 1;
8636       in_nlanes = const_vf * dr_group_size;
8637     }
8638   auto_sbitmap used_in_lanes (in_nlanes);
8639   bitmap_clear (used_in_lanes);
8640   auto_bitmap used_defs;
8641
8642   unsigned int count = mask.encoded_nelts ();
8643   mask.quick_grow (count);
8644   vec_perm_indices indices;
8645
8646   for (unsigned int j = 0; j < nelts_to_build; j++)
8647     {
8648       unsigned int iter_num = j / group_size;
8649       unsigned int stmt_num = j % group_size;
8650       unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8651       bitmap_set_bit (used_in_lanes, i);
8652       if (repeating_p)
8653         {
8654           first_vec_index = 0;
8655           mask_element = i;
8656         }
8657       else
8658         {
8659           /* Enforced before the loop when !repeating_p.  */
8660           unsigned int const_nunits = nunits.to_constant ();
8661           vec_index = i / const_nunits;
8662           mask_element = i % const_nunits;
8663           if (vec_index == first_vec_index
8664               || first_vec_index == -1)
8665             {
8666               first_vec_index = vec_index;
8667             }
8668           else if (vec_index == second_vec_index
8669                    || second_vec_index == -1)
8670             {
8671               second_vec_index = vec_index;
8672               mask_element += const_nunits;
8673             }
8674           else
8675             {
8676               if (dump_p)
8677                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8678                                  "permutation requires at "
8679                                  "least three vectors %G",
8680                                  stmt_info->stmt);
8681               gcc_assert (analyze_only);
8682               return false;
8683             }
8684
8685           gcc_assert (mask_element < 2 * const_nunits);
8686         }
8687
8688       if (mask_element != index)
8689         noop_p = false;
8690       mask[index++] = mask_element;
8691
8692       if (index == count)
8693         {
8694           if (!noop_p)
8695             {
8696               indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8697               if (!can_vec_perm_const_p (mode, mode, indices))
8698                 {
8699                   if (dump_p)
8700                     {
8701                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8702                                        "unsupported vect permute { ");
8703                       for (i = 0; i < count; ++i)
8704                         {
8705                           dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8706                           dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8707                         }
8708                       dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8709                     }
8710                   gcc_assert (analyze_only);
8711                   return false;
8712                 }
8713
8714               tree mask_vec = NULL_TREE;
8715               if (!analyze_only)
8716                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8717
8718               if (second_vec_index == -1)
8719                 second_vec_index = first_vec_index;
8720
8721               for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8722                 {
8723                   ++*n_perms;
8724                   if (analyze_only)
8725                     continue;
8726                   /* Generate the permute statement if necessary.  */
8727                   tree first_vec = dr_chain[first_vec_index + ri];
8728                   tree second_vec = dr_chain[second_vec_index + ri];
8729                   gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8730                   tree perm_dest
8731                     = vect_create_destination_var (gimple_assign_lhs (stmt),
8732                                                    vectype);
8733                   perm_dest = make_ssa_name (perm_dest);
8734                   gimple *perm_stmt
8735                     = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8736                                            second_vec, mask_vec);
8737                   vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8738                                                gsi);
8739                   if (dce_chain)
8740                     {
8741                       bitmap_set_bit (used_defs, first_vec_index + ri);
8742                       bitmap_set_bit (used_defs, second_vec_index + ri);
8743                     }
8744
8745                   /* Store the vector statement in NODE.  */
8746                   SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8747                 }
8748             }
8749           else if (!analyze_only)
8750             {
8751               for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8752                 {
8753                   tree first_vec = dr_chain[first_vec_index + ri];
8754                   /* If mask was NULL_TREE generate the requested
8755                      identity transform.  */
8756                   if (dce_chain)
8757                     bitmap_set_bit (used_defs, first_vec_index + ri);
8758
8759                   /* Store the vector statement in NODE.  */
8760                   SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8761                 }
8762             }
8763
8764           index = 0;
8765           first_vec_index = -1;
8766           second_vec_index = -1;
8767           noop_p = true;
8768         }
8769     }
8770
8771   if (n_loads)
8772     {
8773       if (repeating_p)
8774         *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8775       else
8776         {
8777           /* Enforced above when !repeating_p.  */
8778           unsigned int const_nunits = nunits.to_constant ();
8779           *n_loads = 0;
8780           bool load_seen = false;
8781           for (unsigned i = 0; i < in_nlanes; ++i)
8782             {
8783               if (i % const_nunits == 0)
8784                 {
8785                   if (load_seen)
8786                     *n_loads += 1;
8787                   load_seen = false;
8788                 }
8789               if (bitmap_bit_p (used_in_lanes, i))
8790                 load_seen = true;
8791             }
8792           if (load_seen)
8793             *n_loads += 1;
8794         }
8795     }
8796
8797   if (dce_chain)
8798     for (unsigned i = 0; i < dr_chain.length (); ++i)
8799       if (!bitmap_bit_p (used_defs, i))
8800         {
8801           tree def = dr_chain[i];
8802           do
8803             {
8804               gimple *stmt = SSA_NAME_DEF_STMT (def);
8805               if (is_gimple_assign (stmt)
8806                   && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
8807                       || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
8808                 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
8809               else
8810                 def = NULL;
8811               gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8812               gsi_remove (&rgsi, true);
8813               release_defs (stmt);
8814             }
8815           while (def);
8816         }
8817
8818   return true;
8819 }
8820
8821 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8822    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8823    permute statements for the SLP node NODE.  Store the number of vector
8824    permute instructions in *N_PERMS and the number of vector load
8825    instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
8826    that were not needed.  */
8827
8828 bool
8829 vect_transform_slp_perm_load (vec_info *vinfo,
8830                               slp_tree node, const vec<tree> &dr_chain,
8831                               gimple_stmt_iterator *gsi, poly_uint64 vf,
8832                               bool analyze_only, unsigned *n_perms,
8833                               unsigned int *n_loads, bool dce_chain)
8834 {
8835   return vect_transform_slp_perm_load_1 (vinfo, node,
8836                                          SLP_TREE_LOAD_PERMUTATION (node),
8837                                          dr_chain, gsi, vf, analyze_only,
8838                                          dump_enabled_p (), n_perms, n_loads,
8839                                          dce_chain);
8840 }
8841
8842 /* Produce the next vector result for SLP permutation NODE by adding a vector
8843    statement at GSI.  If MASK_VEC is nonnull, add:
8844
8845       <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8846
8847    otherwise add:
8848
8849       <new SSA name> = FIRST_DEF.  */
8850
8851 static void
8852 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8853                           slp_tree node, tree first_def, tree second_def,
8854                           tree mask_vec, poly_uint64 identity_offset)
8855 {
8856   tree vectype = SLP_TREE_VECTYPE (node);
8857
8858   /* ???  We SLP match existing vector element extracts but
8859      allow punning which we need to re-instantiate at uses
8860      but have no good way of explicitly representing.  */
8861   if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8862       && !types_compatible_p (TREE_TYPE (first_def), vectype))
8863     {
8864       gassign *conv_stmt
8865         = gimple_build_assign (make_ssa_name (vectype),
8866                                build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8867       vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8868       first_def = gimple_assign_lhs (conv_stmt);
8869     }
8870   gassign *perm_stmt;
8871   tree perm_dest = make_ssa_name (vectype);
8872   if (mask_vec)
8873     {
8874       if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8875                            TYPE_SIZE (vectype))
8876           && !types_compatible_p (TREE_TYPE (second_def), vectype))
8877         {
8878           gassign *conv_stmt
8879             = gimple_build_assign (make_ssa_name (vectype),
8880                                    build1 (VIEW_CONVERT_EXPR,
8881                                            vectype, second_def));
8882           vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8883           second_def = gimple_assign_lhs (conv_stmt);
8884         }
8885       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8886                                        first_def, second_def,
8887                                        mask_vec);
8888     }
8889   else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8890     {
8891       /* For identity permutes we still need to handle the case
8892          of offsetted extracts or concats.  */
8893       unsigned HOST_WIDE_INT c;
8894       auto first_def_nunits
8895         = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8896       if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8897         {
8898           unsigned HOST_WIDE_INT elsz
8899             = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8900           tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8901                                  TYPE_SIZE (vectype),
8902                                  bitsize_int (identity_offset * elsz));
8903           perm_stmt = gimple_build_assign (perm_dest, lowpart);
8904         }
8905       else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8906                                     first_def_nunits, &c) && c == 2)
8907         {
8908           tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8909                                             NULL_TREE, second_def);
8910           perm_stmt = gimple_build_assign (perm_dest, ctor);
8911         }
8912       else
8913         gcc_unreachable ();
8914     }
8915   else
8916     {
8917       /* We need a copy here in case the def was external.  */
8918       perm_stmt = gimple_build_assign (perm_dest, first_def);
8919     }
8920   vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8921   /* Store the vector statement in NODE.  */
8922   node->push_vec_def (perm_stmt);
8923 }
8924
8925 /* Subroutine of vectorizable_slp_permutation.  Check whether the target
8926    can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8927    If GSI is nonnull, emit the permutation there.
8928
8929    When GSI is null, the only purpose of NODE is to give properties
8930    of the result, such as the vector type and number of SLP lanes.
8931    The node does not need to be a VEC_PERM_EXPR.
8932
8933    If the target supports the operation, return the number of individual
8934    VEC_PERM_EXPRs needed, otherwise return -1.  Print information to the
8935    dump file if DUMP_P is true.  */
8936
8937 static int
8938 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8939                                 slp_tree node, lane_permutation_t &perm,
8940                                 vec<slp_tree> &children, bool dump_p)
8941 {
8942   tree vectype = SLP_TREE_VECTYPE (node);
8943
8944   /* ???  We currently only support all same vector input types
8945      while the SLP IL should really do a concat + select and thus accept
8946      arbitrary mismatches.  */
8947   slp_tree child;
8948   unsigned i;
8949   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8950   bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8951   tree op_vectype = NULL_TREE;
8952   FOR_EACH_VEC_ELT (children, i, child)
8953     if (SLP_TREE_VECTYPE (child))
8954       {
8955         op_vectype = SLP_TREE_VECTYPE (child);
8956         break;
8957       }
8958   if (!op_vectype)
8959     op_vectype = vectype;
8960   FOR_EACH_VEC_ELT (children, i, child)
8961     {
8962       if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8963            && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8964           || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8965           || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8966         {
8967           if (dump_p)
8968             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8969                              "Unsupported vector types in lane permutation\n");
8970           return -1;
8971         }
8972       if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8973         repeating_p = false;
8974     }
8975
8976   gcc_assert (perm.length () == SLP_TREE_LANES (node));
8977   if (dump_p)
8978     {
8979       dump_printf_loc (MSG_NOTE, vect_location,
8980                        "vectorizing permutation");
8981       for (unsigned i = 0; i < perm.length (); ++i)
8982         dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8983       if (repeating_p)
8984         dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8985       dump_printf (MSG_NOTE, "\n");
8986     }
8987
8988   /* REPEATING_P is true if every output vector is guaranteed to use the
8989      same permute vector.  We can handle that case for both variable-length
8990      and constant-length vectors, but we only handle other cases for
8991      constant-length vectors.
8992
8993      Set:
8994
8995      - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8996        mask vector that we want to build.
8997
8998      - NCOPIES to the number of copies of PERM that we need in order
8999        to build the necessary permute mask vectors.
9000
9001      - NOUTPUTS_PER_MASK to the number of output vectors we want to create
9002        for each permute mask vector.  This is only relevant when GSI is
9003        nonnull.  */
9004   uint64_t npatterns;
9005   unsigned nelts_per_pattern;
9006   uint64_t ncopies;
9007   unsigned noutputs_per_mask;
9008   if (repeating_p)
9009     {
9010       /* We need a single permute mask vector that has the form:
9011
9012            { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
9013
9014          In other words, the original n-element permute in PERM is
9015          "unrolled" to fill a full vector.  The stepped vector encoding
9016          that we use for permutes requires 3n elements.  */
9017       npatterns = SLP_TREE_LANES (node);
9018       nelts_per_pattern = ncopies = 3;
9019       noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9020     }
9021   else
9022     {
9023       /* Calculate every element of every permute mask vector explicitly,
9024          instead of relying on the pattern described above.  */
9025       if (!nunits.is_constant (&npatterns)
9026           || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
9027         return -1;
9028       nelts_per_pattern = ncopies = 1;
9029       if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
9030         if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
9031           return -1;
9032       noutputs_per_mask = 1;
9033     }
9034   unsigned olanes = ncopies * SLP_TREE_LANES (node);
9035   gcc_assert (repeating_p || multiple_p (olanes, nunits));
9036
9037   /* Compute the { { SLP operand, vector index}, lane } permutation sequence
9038      from the { SLP operand, scalar lane } permutation as recorded in the
9039      SLP node as intermediate step.  This part should already work
9040      with SLP children with arbitrary number of lanes.  */
9041   auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
9042   auto_vec<unsigned> active_lane;
9043   vperm.create (olanes);
9044   active_lane.safe_grow_cleared (children.length (), true);
9045   for (unsigned i = 0; i < ncopies; ++i)
9046     {
9047       for (unsigned pi = 0; pi < perm.length (); ++pi)
9048         {
9049           std::pair<unsigned, unsigned> p = perm[pi];
9050           tree vtype = SLP_TREE_VECTYPE (children[p.first]);
9051           if (repeating_p)
9052             vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
9053           else
9054             {
9055               /* We checked above that the vectors are constant-length.  */
9056               unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
9057               unsigned vi = (active_lane[p.first] + p.second) / vnunits;
9058               unsigned vl = (active_lane[p.first] + p.second) % vnunits;
9059               vperm.quick_push ({{p.first, vi}, vl});
9060             }
9061         }
9062       /* Advance to the next group.  */
9063       for (unsigned j = 0; j < children.length (); ++j)
9064         active_lane[j] += SLP_TREE_LANES (children[j]);
9065     }
9066
9067   if (dump_p)
9068     {
9069       dump_printf_loc (MSG_NOTE, vect_location,
9070                        "vectorizing permutation");
9071       for (unsigned i = 0; i < perm.length (); ++i)
9072         dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
9073       if (repeating_p)
9074         dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
9075       dump_printf (MSG_NOTE, "\n");
9076       dump_printf_loc (MSG_NOTE, vect_location, "as");
9077       for (unsigned i = 0; i < vperm.length (); ++i)
9078         {
9079           if (i != 0
9080               && (repeating_p
9081                   ? multiple_p (i, npatterns)
9082                   : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
9083             dump_printf (MSG_NOTE, ",");
9084           dump_printf (MSG_NOTE, " vops%u[%u][%u]",
9085                        vperm[i].first.first, vperm[i].first.second,
9086                        vperm[i].second);
9087         }
9088       dump_printf (MSG_NOTE, "\n");
9089     }
9090
9091   /* We can only handle two-vector permutes, everything else should
9092      be lowered on the SLP level.  The following is closely inspired
9093      by vect_transform_slp_perm_load and is supposed to eventually
9094      replace it.
9095      ???   As intermediate step do code-gen in the SLP tree representation
9096      somehow?  */
9097   std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
9098   std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
9099   unsigned int index = 0;
9100   poly_uint64 mask_element;
9101   vec_perm_builder mask;
9102   mask.new_vector (nunits, npatterns, nelts_per_pattern);
9103   unsigned int count = mask.encoded_nelts ();
9104   mask.quick_grow (count);
9105   vec_perm_indices indices;
9106   unsigned nperms = 0;
9107   for (unsigned i = 0; i < vperm.length (); ++i)
9108     {
9109       mask_element = vperm[i].second;
9110       if (first_vec.first == -1U
9111           || first_vec == vperm[i].first)
9112         first_vec = vperm[i].first;
9113       else if (second_vec.first == -1U
9114                || second_vec == vperm[i].first)
9115         {
9116           second_vec = vperm[i].first;
9117           mask_element += nunits;
9118         }
9119       else
9120         {
9121           if (dump_p)
9122             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9123                              "permutation requires at "
9124                              "least three vectors\n");
9125           gcc_assert (!gsi);
9126           return -1;
9127         }
9128
9129       mask[index++] = mask_element;
9130
9131       if (index == count)
9132         {
9133           indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
9134                               TYPE_VECTOR_SUBPARTS (op_vectype));
9135           bool identity_p = (indices.series_p (0, 1, mask[0], 1)
9136                              && constant_multiple_p (mask[0], nunits));
9137           machine_mode vmode = TYPE_MODE (vectype);
9138           machine_mode op_vmode = TYPE_MODE (op_vectype);
9139           unsigned HOST_WIDE_INT c;
9140           if ((!identity_p
9141                && !can_vec_perm_const_p (vmode, op_vmode, indices))
9142               || (identity_p
9143                   && !known_le (nunits,
9144                                 TYPE_VECTOR_SUBPARTS (op_vectype))
9145                   && (!constant_multiple_p (nunits,
9146                                             TYPE_VECTOR_SUBPARTS (op_vectype),
9147                                             &c) || c != 2)))
9148             {
9149               if (dump_p)
9150                 {
9151                   dump_printf_loc (MSG_MISSED_OPTIMIZATION,
9152                                    vect_location,
9153                                    "unsupported vect permute { ");
9154                   for (i = 0; i < count; ++i)
9155                     {
9156                       dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
9157                       dump_printf (MSG_MISSED_OPTIMIZATION, " ");
9158                     }
9159                   dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
9160                 }
9161               gcc_assert (!gsi);
9162               return -1;
9163             }
9164
9165           if (!identity_p)
9166             nperms++;
9167           if (gsi)
9168             {
9169               if (second_vec.first == -1U)
9170                 second_vec = first_vec;
9171
9172               slp_tree
9173                 first_node = children[first_vec.first],
9174                 second_node = children[second_vec.first];
9175
9176               tree mask_vec = NULL_TREE;
9177               if (!identity_p)
9178                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
9179
9180               for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
9181                 {
9182                   tree first_def
9183                     = vect_get_slp_vect_def (first_node,
9184                                              first_vec.second + vi);
9185                   tree second_def
9186                     = vect_get_slp_vect_def (second_node,
9187                                              second_vec.second + vi);
9188                   vect_add_slp_permutation (vinfo, gsi, node, first_def,
9189                                             second_def, mask_vec, mask[0]);
9190                 }
9191             }
9192
9193           index = 0;
9194           first_vec = std::make_pair (-1U, -1U);
9195           second_vec = std::make_pair (-1U, -1U);
9196         }
9197     }
9198
9199   return nperms;
9200 }
9201
9202 /* Vectorize the SLP permutations in NODE as specified
9203    in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
9204    child number and lane number.
9205    Interleaving of two two-lane two-child SLP subtrees (not supported):
9206      [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
9207    A blend of two four-lane two-child SLP subtrees:
9208      [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
9209    Highpart of a four-lane one-child SLP subtree (not supported):
9210      [ { 0, 2 }, { 0, 3 } ]
9211    Where currently only a subset is supported by code generating below.  */
9212
9213 static bool
9214 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
9215                               slp_tree node, stmt_vector_for_cost *cost_vec)
9216 {
9217   tree vectype = SLP_TREE_VECTYPE (node);
9218   lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
9219   int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
9220                                                SLP_TREE_CHILDREN (node),
9221                                                dump_enabled_p ());
9222   if (nperms < 0)
9223     return false;
9224
9225   if (!gsi)
9226     record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
9227
9228   return true;
9229 }
9230
9231 /* Vectorize SLP NODE.  */
9232
9233 static void
9234 vect_schedule_slp_node (vec_info *vinfo,
9235                         slp_tree node, slp_instance instance)
9236 {
9237   gimple_stmt_iterator si;
9238   int i;
9239   slp_tree child;
9240
9241   /* Vectorize externals and constants.  */
9242   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
9243       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
9244     {
9245       /* ???  vectorizable_shift can end up using a scalar operand which is
9246          currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
9247          node in this case.  */
9248       if (!SLP_TREE_VECTYPE (node))
9249         return;
9250
9251       /* There are two reasons vector defs might already exist.  The first
9252          is that we are vectorizing an existing vector def.  The second is
9253          when performing BB vectorization shared constant/external nodes
9254          are not split apart during partitioning so during the code-gen
9255          DFS walk we can end up visiting them twice.  */
9256       if (! SLP_TREE_VEC_DEFS (node).exists ())
9257         vect_create_constant_vectors (vinfo, node);
9258       return;
9259     }
9260
9261   gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
9262
9263   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
9264
9265   gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
9266   SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
9267
9268   if (dump_enabled_p ())
9269     dump_printf_loc (MSG_NOTE, vect_location,
9270                      "------>vectorizing SLP node starting from: %G",
9271                      stmt_info->stmt);
9272
9273   if (STMT_VINFO_DATA_REF (stmt_info)
9274       && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9275     {
9276       /* Vectorized loads go before the first scalar load to make it
9277          ready early, vectorized stores go before the last scalar
9278          stmt which is where all uses are ready.  */
9279       stmt_vec_info last_stmt_info = NULL;
9280       if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
9281         last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
9282       else /* DR_IS_WRITE */
9283         last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
9284       si = gsi_for_stmt (last_stmt_info->stmt);
9285     }
9286   else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
9287             || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
9288             || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
9289            && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9290     {
9291       /* For PHI node vectorization we do not use the insertion iterator.  */
9292       si = gsi_none ();
9293     }
9294   else
9295     {
9296       /* Emit other stmts after the children vectorized defs which is
9297          earliest possible.  */
9298       gimple *last_stmt = NULL;
9299       if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
9300         if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
9301             || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
9302           {
9303             /* But avoid scheduling internal defs outside of the loop when
9304                we might have only implicitly tracked loop mask/len defs.  */
9305             gimple_stmt_iterator si
9306               = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
9307             last_stmt = *si;
9308           }
9309       bool seen_vector_def = false;
9310       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9311         if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9312           {
9313             /* For fold-left reductions we are retaining the scalar
9314                reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
9315                set so the representation isn't perfect.  Resort to the
9316                last scalar def here.  */
9317             if (SLP_TREE_VEC_DEFS (child).is_empty ())
9318               {
9319                 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
9320                             == cycle_phi_info_type);
9321                 gphi *phi = as_a <gphi *>
9322                               (vect_find_last_scalar_stmt_in_slp (child)->stmt);
9323                 if (!last_stmt
9324                     || vect_stmt_dominates_stmt_p (last_stmt, phi))
9325                   last_stmt = phi;
9326               }
9327             /* We are emitting all vectorized stmts in the same place and
9328                the last one is the last.
9329                ???  Unless we have a load permutation applied and that
9330                figures to re-use an earlier generated load.  */
9331             unsigned j;
9332             tree vdef;
9333             FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9334               {
9335                 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9336                 if (!last_stmt
9337                     || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9338                   last_stmt = vstmt;
9339               }
9340           }
9341         else if (!SLP_TREE_VECTYPE (child))
9342           {
9343             /* For externals we use unvectorized at all scalar defs.  */
9344             unsigned j;
9345             tree def;
9346             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9347               if (TREE_CODE (def) == SSA_NAME
9348                   && !SSA_NAME_IS_DEFAULT_DEF (def))
9349                 {
9350                   gimple *stmt = SSA_NAME_DEF_STMT (def);
9351                   if (!last_stmt
9352                       || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9353                     last_stmt = stmt;
9354                 }
9355           }
9356         else
9357           {
9358             /* For externals we have to look at all defs since their
9359                insertion place is decided per vector.  But beware
9360                of pre-existing vectors where we need to make sure
9361                we do not insert before the region boundary.  */
9362             if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9363                 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9364               seen_vector_def = true;
9365             else
9366               {
9367                 unsigned j;
9368                 tree vdef;
9369                 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9370                   if (TREE_CODE (vdef) == SSA_NAME
9371                       && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9372                     {
9373                       gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9374                       if (!last_stmt
9375                           || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9376                         last_stmt = vstmt;
9377                     }
9378               }
9379           }
9380       /* This can happen when all children are pre-existing vectors or
9381          constants.  */
9382       if (!last_stmt)
9383         last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9384       if (!last_stmt)
9385         {
9386           gcc_assert (seen_vector_def);
9387           si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9388         }
9389       else if (is_ctrl_altering_stmt (last_stmt))
9390         {
9391           /* We split regions to vectorize at control altering stmts
9392              with a definition so this must be an external which
9393              we can insert at the start of the region.  */
9394           si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9395         }
9396       else if (is_a <bb_vec_info> (vinfo)
9397                && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9398                && gimple_could_trap_p (stmt_info->stmt))
9399         {
9400           /* We've constrained possibly trapping operations to all come
9401              from the same basic-block, if vectorized defs would allow earlier
9402              scheduling still force vectorized stmts to the original block.
9403              This is only necessary for BB vectorization since for loop vect
9404              all operations are in a single BB and scalar stmt based
9405              placement doesn't play well with epilogue vectorization.  */
9406           gcc_assert (dominated_by_p (CDI_DOMINATORS,
9407                                       gimple_bb (stmt_info->stmt),
9408                                       gimple_bb (last_stmt)));
9409           si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9410         }
9411       else if (is_a <gphi *> (last_stmt))
9412         si = gsi_after_labels (gimple_bb (last_stmt));
9413       else
9414         {
9415           si = gsi_for_stmt (last_stmt);
9416           gsi_next (&si);
9417         }
9418     }
9419
9420   /* Handle purely internal nodes.  */
9421   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9422     {
9423       /* ???  the transform kind is stored to STMT_VINFO_TYPE which might
9424          be shared with different SLP nodes (but usually it's the same
9425          operation apart from the case the stmt is only there for denoting
9426          the actual scalar lane defs ...).  So do not call vect_transform_stmt
9427          but open-code it here (partly).  */
9428       bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9429       gcc_assert (done);
9430       stmt_vec_info slp_stmt_info;
9431       unsigned int i;
9432       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9433         if (STMT_VINFO_LIVE_P (slp_stmt_info))
9434           {
9435             done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9436                                                 instance, i, true, NULL);
9437             gcc_assert (done);
9438           }
9439     }
9440   else
9441     vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9442 }
9443
9444 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
9445    For loop vectorization this is done in vectorizable_call, but for SLP
9446    it needs to be deferred until end of vect_schedule_slp, because multiple
9447    SLP instances may refer to the same scalar stmt.  */
9448
9449 static void
9450 vect_remove_slp_scalar_calls (vec_info *vinfo,
9451                               slp_tree node, hash_set<slp_tree> &visited)
9452 {
9453   gimple *new_stmt;
9454   gimple_stmt_iterator gsi;
9455   int i;
9456   slp_tree child;
9457   tree lhs;
9458   stmt_vec_info stmt_info;
9459
9460   if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9461     return;
9462
9463   if (visited.add (node))
9464     return;
9465
9466   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9467     vect_remove_slp_scalar_calls (vinfo, child, visited);
9468
9469   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9470     {
9471       gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9472       if (!stmt || gimple_bb (stmt) == NULL)
9473         continue;
9474       if (is_pattern_stmt_p (stmt_info)
9475           || !PURE_SLP_STMT (stmt_info))
9476         continue;
9477       lhs = gimple_call_lhs (stmt);
9478       if (lhs)
9479         new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9480       else
9481         {
9482           new_stmt = gimple_build_nop ();
9483           unlink_stmt_vdef (stmt_info->stmt);
9484         }
9485       gsi = gsi_for_stmt (stmt);
9486       vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9487       if (lhs)
9488         SSA_NAME_DEF_STMT (lhs) = new_stmt;
9489     }
9490 }
9491
9492 static void
9493 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9494 {
9495   hash_set<slp_tree> visited;
9496   vect_remove_slp_scalar_calls (vinfo, node, visited);
9497 }
9498
9499 /* Vectorize the instance root.  */
9500
9501 void
9502 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9503 {
9504   gassign *rstmt = NULL;
9505
9506   if (instance->kind == slp_inst_kind_ctor)
9507     {
9508       if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9509         {
9510           tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9511           tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9512           if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9513                                           TREE_TYPE (vect_lhs)))
9514             vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9515                                vect_lhs);
9516           rstmt = gimple_build_assign (root_lhs, vect_lhs);
9517         }
9518       else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9519         {
9520           int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9521           tree child_def;
9522           int j;
9523           vec<constructor_elt, va_gc> *v;
9524           vec_alloc (v, nelts);
9525
9526           /* A CTOR can handle V16HI composition from VNx8HI so we
9527              do not need to convert vector elements if the types
9528              do not match.  */
9529           FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9530             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9531           tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9532           tree rtype
9533             = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9534           tree r_constructor = build_constructor (rtype, v);
9535           rstmt = gimple_build_assign (lhs, r_constructor);
9536         }
9537     }
9538   else if (instance->kind == slp_inst_kind_bb_reduc)
9539     {
9540       /* Largely inspired by reduction chain epilogue handling in
9541          vect_create_epilog_for_reduction.  */
9542       vec<tree> vec_defs = vNULL;
9543       vect_get_slp_defs (node, &vec_defs);
9544       enum tree_code reduc_code
9545         = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9546       /* ???  We actually have to reflect signs somewhere.  */
9547       if (reduc_code == MINUS_EXPR)
9548         reduc_code = PLUS_EXPR;
9549       gimple_seq epilogue = NULL;
9550       /* We may end up with more than one vector result, reduce them
9551          to one vector.  */
9552       tree vec_def = vec_defs[0];
9553       tree vectype = TREE_TYPE (vec_def);
9554       tree compute_vectype = vectype;
9555       bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9556                                  && TYPE_OVERFLOW_UNDEFINED (vectype)
9557                                  && operation_can_overflow (reduc_code));
9558       if (pun_for_overflow_p)
9559         {
9560           compute_vectype = unsigned_type_for (vectype);
9561           vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9562                                   compute_vectype, vec_def);
9563         }
9564       for (unsigned i = 1; i < vec_defs.length (); ++i)
9565         {
9566           tree def = vec_defs[i];
9567           if (pun_for_overflow_p)
9568             def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9569                                 compute_vectype, def);
9570           vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9571                                   vec_def, def);
9572         }
9573       vec_defs.release ();
9574       /* ???  Support other schemes than direct internal fn.  */
9575       internal_fn reduc_fn;
9576       if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9577           || reduc_fn == IFN_LAST)
9578         gcc_unreachable ();
9579       tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9580                                       TREE_TYPE (compute_vectype), vec_def);
9581       if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9582         {
9583           tree rem_def = NULL_TREE;
9584           for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9585             {
9586               def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9587               if (!rem_def)
9588                 rem_def = def;
9589               else
9590                 rem_def = gimple_build (&epilogue, reduc_code,
9591                                         TREE_TYPE (scalar_def),
9592                                         rem_def, def);
9593             }
9594           scalar_def = gimple_build (&epilogue, reduc_code,
9595                                      TREE_TYPE (scalar_def),
9596                                      scalar_def, rem_def);
9597         }
9598       scalar_def = gimple_convert (&epilogue,
9599                                    TREE_TYPE (vectype), scalar_def);
9600       gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9601       gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9602       gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9603       update_stmt (gsi_stmt (rgsi));
9604       return;
9605     }
9606   else
9607     gcc_unreachable ();
9608
9609   gcc_assert (rstmt);
9610
9611   gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9612   gsi_replace (&rgsi, rstmt, true);
9613 }
9614
9615 struct slp_scc_info
9616 {
9617   bool on_stack;
9618   int dfs;
9619   int lowlink;
9620 };
9621
9622 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.  */
9623
9624 static void
9625 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9626                    hash_map<slp_tree, slp_scc_info> &scc_info,
9627                    int &maxdfs, vec<slp_tree> &stack)
9628 {
9629   bool existed_p;
9630   slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9631   gcc_assert (!existed_p);
9632   info->dfs = maxdfs;
9633   info->lowlink = maxdfs;
9634   maxdfs++;
9635
9636   /* Leaf.  */
9637   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9638     {
9639       info->on_stack = false;
9640       vect_schedule_slp_node (vinfo, node, instance);
9641       return;
9642     }
9643
9644   info->on_stack = true;
9645   stack.safe_push (node);
9646
9647   unsigned i;
9648   slp_tree child;
9649   /* DFS recurse.  */
9650   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9651     {
9652       if (!child)
9653         continue;
9654       slp_scc_info *child_info = scc_info.get (child);
9655       if (!child_info)
9656         {
9657           vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9658           /* Recursion might have re-allocated the node.  */
9659           info = scc_info.get (node);
9660           child_info = scc_info.get (child);
9661           info->lowlink = MIN (info->lowlink, child_info->lowlink);
9662         }
9663       else if (child_info->on_stack)
9664         info->lowlink = MIN (info->lowlink, child_info->dfs);
9665     }
9666   if (info->lowlink != info->dfs)
9667     return;
9668
9669   auto_vec<slp_tree, 4> phis_to_fixup;
9670
9671   /* Singleton.  */
9672   if (stack.last () == node)
9673     {
9674       stack.pop ();
9675       info->on_stack = false;
9676       vect_schedule_slp_node (vinfo, node, instance);
9677       if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9678           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9679         phis_to_fixup.quick_push (node);
9680     }
9681   else
9682     {
9683       /* SCC.  */
9684       int last_idx = stack.length () - 1;
9685       while (stack[last_idx] != node)
9686         last_idx--;
9687       /* We can break the cycle at PHIs who have at least one child
9688          code generated.  Then we could re-start the DFS walk until
9689          all nodes in the SCC are covered (we might have new entries
9690          for only back-reachable nodes).  But it's simpler to just
9691          iterate and schedule those that are ready.  */
9692       unsigned todo = stack.length () - last_idx;
9693       do
9694         {
9695           for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9696             {
9697               slp_tree entry = stack[idx];
9698               if (!entry)
9699                 continue;
9700               bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9701                           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9702               bool ready = !phi;
9703               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9704                   if (!child)
9705                     {
9706                       gcc_assert (phi);
9707                       ready = true;
9708                       break;
9709                     }
9710                   else if (scc_info.get (child)->on_stack)
9711                     {
9712                       if (!phi)
9713                         {
9714                           ready = false;
9715                           break;
9716                         }
9717                     }
9718                   else
9719                     {
9720                       if (phi)
9721                         {
9722                           ready = true;
9723                           break;
9724                         }
9725                     }
9726               if (ready)
9727                 {
9728                   vect_schedule_slp_node (vinfo, entry, instance);
9729                   scc_info.get (entry)->on_stack = false;
9730                   stack[idx] = NULL;
9731                   todo--;
9732                   if (phi)
9733                     phis_to_fixup.safe_push (entry);
9734                 }
9735             }
9736         }
9737       while (todo != 0);
9738
9739       /* Pop the SCC.  */
9740       stack.truncate (last_idx);
9741     }
9742
9743   /* Now fixup the backedge def of the vectorized PHIs in this SCC.  */
9744   slp_tree phi_node;
9745   FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9746     {
9747       gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9748       edge_iterator ei;
9749       edge e;
9750       FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9751         {
9752           unsigned dest_idx = e->dest_idx;
9753           child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9754           if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9755             continue;
9756           unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9757           /* Simply fill all args.  */
9758           if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9759               != vect_first_order_recurrence)
9760             for (unsigned i = 0; i < n; ++i)
9761               {
9762                 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9763                 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9764                 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9765                              e, gimple_phi_arg_location (phi, dest_idx));
9766               }
9767           else
9768             {
9769               /* Unless it is a first order recurrence which needs
9770                  args filled in for both the PHI node and the permutes.  */
9771               gimple *perm
9772                 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9773               gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9774               add_phi_arg (as_a <gphi *> (rphi),
9775                            vect_get_slp_vect_def (child, n - 1),
9776                            e, gimple_phi_arg_location (phi, dest_idx));
9777               for (unsigned i = 0; i < n; ++i)
9778                 {
9779                   gimple *perm
9780                     = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9781                   if (i > 0)
9782                     gimple_assign_set_rhs1 (perm,
9783                                             vect_get_slp_vect_def (child, i - 1));
9784                   gimple_assign_set_rhs2 (perm,
9785                                           vect_get_slp_vect_def (child, i));
9786                   update_stmt (perm);
9787                 }
9788             }
9789         }
9790     }
9791 }
9792
9793 /* Generate vector code for SLP_INSTANCES in the loop/basic block.  */
9794
9795 void
9796 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9797 {
9798   slp_instance instance;
9799   unsigned int i;
9800
9801   hash_map<slp_tree, slp_scc_info> scc_info;
9802   int maxdfs = 0;
9803   FOR_EACH_VEC_ELT (slp_instances, i, instance)
9804     {
9805       slp_tree node = SLP_INSTANCE_TREE (instance);
9806       if (dump_enabled_p ())
9807         {
9808           dump_printf_loc (MSG_NOTE, vect_location,
9809                            "Vectorizing SLP tree:\n");
9810           /* ???  Dump all?  */
9811           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9812             dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9813                          SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9814           vect_print_slp_graph (MSG_NOTE, vect_location,
9815                                 SLP_INSTANCE_TREE (instance));
9816         }
9817       /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
9818          have a PHI be the node breaking the cycle.  */
9819       auto_vec<slp_tree> stack;
9820       if (!scc_info.get (node))
9821         vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9822
9823       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9824         vectorize_slp_instance_root_stmt (node, instance);
9825
9826       if (dump_enabled_p ())
9827         dump_printf_loc (MSG_NOTE, vect_location,
9828                          "vectorizing stmts using SLP.\n");
9829     }
9830
9831   FOR_EACH_VEC_ELT (slp_instances, i, instance)
9832     {
9833       slp_tree root = SLP_INSTANCE_TREE (instance);
9834       stmt_vec_info store_info;
9835       unsigned int j;
9836
9837       /* Remove scalar call stmts.  Do not do this for basic-block
9838          vectorization as not all uses may be vectorized.
9839          ???  Why should this be necessary?  DCE should be able to
9840          remove the stmts itself.
9841          ???  For BB vectorization we can as well remove scalar
9842          stmts starting from the SLP tree root if they have no
9843          uses.  */
9844       if (is_a <loop_vec_info> (vinfo))
9845         vect_remove_slp_scalar_calls (vinfo, root);
9846
9847       /* Remove vectorized stores original scalar stmts.  */
9848       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9849         {
9850           if (!STMT_VINFO_DATA_REF (store_info)
9851               || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9852             break;
9853
9854           store_info = vect_orig_stmt (store_info);
9855           /* Free the attached stmt_vec_info and remove the stmt.  */
9856           vinfo->remove_stmt (store_info);
9857
9858           /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
9859              to not crash in vect_free_slp_tree later.  */
9860           if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9861             SLP_TREE_REPRESENTATIVE (root) = NULL;
9862         }
9863     }
9864 }