gcc/tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
55
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
70
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
73
74 void
75 vect_slp_init (void)
76 {
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
78 }
79
80 void
81 vect_slp_fini (void)
82 {
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
87 }
88
89 void *
90 _slp_tree::operator new (size_t n)
91 {
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
94 }
95
96 void
97 _slp_tree::operator delete (void *node, size_t n)
98 {
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
101 }
102
103
104 /* Initialize a SLP node. */
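/* Note: the constructor below also links each new node into the global list
   headed by slp_first_node (and the destructor unlinks it again), so that
   vect_slp_fini can release any nodes still live when vectorization ends. */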
105
106 _slp_tree::_slp_tree ()
107 {
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_DEFS (this) = vNULL;
116 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
117 SLP_TREE_CHILDREN (this) = vNULL;
118 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
119 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
120 SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
121 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
122 SLP_TREE_CODE (this) = ERROR_MARK;
123 SLP_TREE_VECTYPE (this) = NULL_TREE;
124 SLP_TREE_REPRESENTATIVE (this) = NULL;
125 SLP_TREE_REF_COUNT (this) = 1;
126 this->failed = NULL;
127 this->max_nunits = 1;
128 this->lanes = 0;
129 }
130
131 /* Tear down a SLP node. */
132
133 _slp_tree::~_slp_tree ()
134 {
135 if (this->prev_node)
136 this->prev_node->next_node = this->next_node;
137 else
138 slp_first_node = this->next_node;
139 if (this->next_node)
140 this->next_node->prev_node = this->prev_node;
141 SLP_TREE_CHILDREN (this).release ();
142 SLP_TREE_SCALAR_STMTS (this).release ();
143 SLP_TREE_SCALAR_OPS (this).release ();
144 SLP_TREE_VEC_DEFS (this).release ();
145 SLP_TREE_LOAD_PERMUTATION (this).release ();
146 SLP_TREE_LANE_PERMUTATION (this).release ();
147 SLP_TREE_SIMD_CLONE_INFO (this).release ();
148 if (this->failed)
149 free (failed);
150 }
151
152 /* Push the single SSA definition in DEF to the vector of vector defs. */
153
154 void
155 _slp_tree::push_vec_def (gimple *def)
156 {
157 if (gphi *phi = dyn_cast <gphi *> (def))
158 vec_defs.quick_push (gimple_phi_result (phi));
159 else
160 {
161 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
162 vec_defs.quick_push (get_def_from_ptr (defop));
163 }
164 }
165
166 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
167
168 void
169 vect_free_slp_tree (slp_tree node)
170 {
171 int i;
172 slp_tree child;
173
174 if (--SLP_TREE_REF_COUNT (node) != 0)
175 return;
176
177 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
178 if (child)
179 vect_free_slp_tree (child);
180
181 /* If the node defines any SLP-only patterns then those patterns are no
182 longer valid and should be removed. */
183 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
184 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
185 {
186 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
187 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
188 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
189 }
190
191 delete node;
192 }
193
194 /* Return a location suitable for dumps related to the SLP instance. */
195
196 dump_user_location_t
197 _slp_instance::location () const
198 {
199 if (!root_stmts.is_empty ())
200 return root_stmts[0]->stmt;
201 else
202 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
203 }
204
205
206 /* Free the memory allocated for the SLP instance. */
207
208 void
209 vect_free_slp_instance (slp_instance instance)
210 {
211 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
212 SLP_INSTANCE_LOADS (instance).release ();
213 SLP_INSTANCE_ROOT_STMTS (instance).release ();
214 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
215 instance->subgraph_entries.release ();
216 instance->cost_vec.release ();
217 free (instance);
218 }
219
220
221 /* Create an SLP node for SCALAR_STMTS. */
222
223 slp_tree
224 vect_create_new_slp_node (unsigned nops, tree_code code)
225 {
226 slp_tree node = new _slp_tree;
227 SLP_TREE_SCALAR_STMTS (node) = vNULL;
228 SLP_TREE_CHILDREN (node).create (nops);
229 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
230 SLP_TREE_CODE (node) = code;
231 return node;
232 }
233 /* Create an SLP node for SCALAR_STMTS. */
234
235 static slp_tree
236 vect_create_new_slp_node (slp_tree node,
237 vec<stmt_vec_info> scalar_stmts, unsigned nops)
238 {
239 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
240 SLP_TREE_CHILDREN (node).create (nops);
241 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
242 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
243 SLP_TREE_LANES (node) = scalar_stmts.length ();
244 return node;
245 }
246
247 /* Create an SLP node for SCALAR_STMTS. */
248
249 static slp_tree
250 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
251 {
252 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
253 }
254
255 /* Create an SLP node for OPS. */
256
257 static slp_tree
258 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
259 {
260 SLP_TREE_SCALAR_OPS (node) = ops;
261 SLP_TREE_DEF_TYPE (node) = vect_external_def;
262 SLP_TREE_LANES (node) = ops.length ();
263 return node;
264 }
265
266 /* Create an SLP node for OPS. */
267
268 static slp_tree
269 vect_create_new_slp_node (vec<tree> ops)
270 {
271 return vect_create_new_slp_node (new _slp_tree, ops);
272 }
273
274
275 /* This structure is used in creation of an SLP tree. Each instance
276 corresponds to the same operand in a group of scalar stmts in an SLP
277 node. */
278 typedef struct _slp_oprnd_info
279 {
280 /* Def-stmts for the operands. */
281 vec<stmt_vec_info> def_stmts;
282 /* Operands. */
283 vec<tree> ops;
284 /* Information about the first statement: its vector def-type, its type,
285 the operand itself in case it's constant, whether it is a pattern
286 stmt, and its gather/scatter info. */
287 tree first_op_type;
288 enum vect_def_type first_dt;
289 bool any_pattern;
290 bool first_gs_p;
291 gather_scatter_info first_gs_info;
292 } *slp_oprnd_info;
293
294
295 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
296 operand. */
297 static vec<slp_oprnd_info>
298 vect_create_oprnd_info (int nops, int group_size)
299 {
300 int i;
301 slp_oprnd_info oprnd_info;
302 vec<slp_oprnd_info> oprnds_info;
303
304 oprnds_info.create (nops);
305 for (i = 0; i < nops; i++)
306 {
307 oprnd_info = XNEW (struct _slp_oprnd_info);
308 oprnd_info->def_stmts.create (group_size);
309 oprnd_info->ops.create (group_size);
310 oprnd_info->first_dt = vect_uninitialized_def;
311 oprnd_info->first_op_type = NULL_TREE;
312 oprnd_info->any_pattern = false;
313 oprnd_info->first_gs_p = false;
314 oprnds_info.quick_push (oprnd_info);
315 }
316
317 return oprnds_info;
318 }
319
320
321 /* Free operands info. */
322
323 static void
324 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
325 {
326 int i;
327 slp_oprnd_info oprnd_info;
328
329 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
330 {
331 oprnd_info->def_stmts.release ();
332 oprnd_info->ops.release ();
333 XDELETE (oprnd_info);
334 }
335
336 oprnds_info.release ();
337 }
338
339 /* Return the execution frequency of NODE (so that a higher value indicates
340 a "more important" node when optimizing for speed). */
341
342 static sreal
343 vect_slp_node_weight (slp_tree node)
344 {
345 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
346 basic_block bb = gimple_bb (stmt_info->stmt);
347 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
348 }
349
350 /* Return true if STMTS contains a pattern statement. */
351
352 static bool
353 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
354 {
355 stmt_vec_info stmt_info;
356 unsigned int i;
357 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
358 if (is_pattern_stmt_p (stmt_info))
359 return true;
360 return false;
361 }
362
363 /* Return true when all lanes in the external or constant NODE have
364 the same value. */
365
366 static bool
367 vect_slp_tree_uniform_p (slp_tree node)
368 {
369 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
370 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
371
372 /* Pre-existing vectors. */
373 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
374 return false;
375
376 unsigned i;
377 tree op, first = NULL_TREE;
378 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
379 if (!first)
380 first = op;
381 else if (!operand_equal_p (first, op, 0))
382 return false;
383
384 return true;
385 }
386
387 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
388 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
389 of the chain. */
390
391 int
392 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
393 stmt_vec_info first_stmt_info)
394 {
395 stmt_vec_info next_stmt_info = first_stmt_info;
396 int result = 0;
397
398 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
399 return -1;
400
401 do
402 {
403 if (next_stmt_info == stmt_info)
404 return result;
405 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
406 if (next_stmt_info)
407 result += DR_GROUP_GAP (next_stmt_info);
408 }
409 while (next_stmt_info);
410
411 return -1;
412 }
413
414 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
415 using the method implemented by duplicate_and_interleave. Return true
416 if so, returning the number of intermediate vectors in *NVECTORS_OUT
417 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
418 (if nonnull). */
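/* A sketch of the strategy used below: with NVECTORS = 1, 2, 4, ... try to
   fuse COUNT / NVECTORS consecutive scalar elements into a single integer;
   if the required integer mode, vector type or permutations are unavailable,
   double NVECTORS and retry, giving up once the fused element size can no
   longer be halved. */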
419
420 bool
421 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
422 tree elt_type, unsigned int *nvectors_out,
423 tree *vector_type_out,
424 tree *permutes)
425 {
426 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
427 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
428 return false;
429
430 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
431 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
432 unsigned int nvectors = 1;
433 for (;;)
434 {
435 scalar_int_mode int_mode;
436 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
437 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
438 {
439 /* Get the natural vector type for this SLP group size. */
440 tree int_type = build_nonstandard_integer_type
441 (GET_MODE_BITSIZE (int_mode), 1);
442 tree vector_type
443 = get_vectype_for_scalar_type (vinfo, int_type, count);
444 poly_int64 half_nelts;
445 if (vector_type
446 && VECTOR_MODE_P (TYPE_MODE (vector_type))
447 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
448 GET_MODE_SIZE (base_vector_mode))
449 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
450 2, &half_nelts))
451 {
452 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
453 together into elements of type INT_TYPE and using the result
454 to build NVECTORS vectors. */
455 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
456 vec_perm_builder sel1 (nelts, 2, 3);
457 vec_perm_builder sel2 (nelts, 2, 3);
458
459 for (unsigned int i = 0; i < 3; ++i)
460 {
461 sel1.quick_push (i);
462 sel1.quick_push (i + nelts);
463 sel2.quick_push (half_nelts + i);
464 sel2.quick_push (half_nelts + i + nelts);
465 }
466 vec_perm_indices indices1 (sel1, 2, nelts);
467 vec_perm_indices indices2 (sel2, 2, nelts);
468 machine_mode vmode = TYPE_MODE (vector_type);
469 if (can_vec_perm_const_p (vmode, vmode, indices1)
470 && can_vec_perm_const_p (vmode, vmode, indices2))
471 {
472 if (nvectors_out)
473 *nvectors_out = nvectors;
474 if (vector_type_out)
475 *vector_type_out = vector_type;
476 if (permutes)
477 {
478 permutes[0] = vect_gen_perm_mask_checked (vector_type,
479 indices1);
480 permutes[1] = vect_gen_perm_mask_checked (vector_type,
481 indices2);
482 }
483 return true;
484 }
485 }
486 }
487 if (!multiple_p (elt_bytes, 2, &elt_bytes))
488 return false;
489 nvectors *= 2;
490 }
491 }
492
493 /* Return true if DTA and DTB match. */
494
495 static bool
496 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
497 {
498 return (dta == dtb
499 || ((dta == vect_external_def || dta == vect_constant_def)
500 && (dtb == vect_external_def || dtb == vect_constant_def)));
501 }
502
503 static const int cond_expr_maps[3][5] = {
504 { 4, -1, -2, 1, 2 },
505 { 4, -2, -1, 1, 2 },
506 { 4, -1, -2, 2, 1 }
507 };
508 static const int arg0_map[] = { 1, 0 };
509 static const int arg1_map[] = { 1, 1 };
510 static const int arg2_map[] = { 1, 2 };
511 static const int arg1_arg4_map[] = { 2, 1, 4 };
512 static const int arg3_arg2_map[] = { 2, 3, 2 };
513 static const int op1_op0_map[] = { 2, 1, 0 };
514 static const int off_map[] = { 1, -3 };
515 static const int off_op0_map[] = { 2, -3, 0 };
516 static const int off_arg2_map[] = { 2, -3, 2 };
517 static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
518 static const int mask_call_maps[6][7] = {
519 { 1, 1, },
520 { 2, 1, 2, },
521 { 3, 1, 2, 3, },
522 { 4, 1, 2, 3, 4, },
523 { 5, 1, 2, 3, 4, 5, },
524 { 6, 1, 2, 3, 4, 5, 6 },
525 };
526
527 /* For most SLP statements, there is a one-to-one mapping between
528 gimple arguments and child nodes. If that is not true for STMT,
529 return an array that contains:
530
531 - the number of child nodes, followed by
532 - for each child node, the index of the argument associated with that node.
533 The special index -1 is the first operand of an embedded comparison and
534 the special index -2 is the second operand of an embedded comparison.
535 The special index -3 is the offset of a gather as analyzed by
536 vect_check_gather_scatter.
537
538 SWAP is as for vect_get_and_check_slp_defs. */
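/* For example, cond_expr_maps[0] above, { 4, -1, -2, 1, 2 }, describes four
   children: the two operands of the comparison embedded in argument 0,
   followed by gimple arguments 1 and 2; arg1_arg4_map, { 2, 1, 4 }, used for
   IFN_MASK_GATHER_LOAD below, describes two children built from call
   arguments 1 and 4. */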
539
540 static const int *
541 vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
542 unsigned char swap = 0)
543 {
544 if (auto assign = dyn_cast<const gassign *> (stmt))
545 {
546 if (gimple_assign_rhs_code (assign) == COND_EXPR
547 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
548 return cond_expr_maps[swap];
549 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
550 && swap)
551 return op1_op0_map;
552 if (gather_scatter_p)
553 return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
554 ? off_op0_map : off_map);
555 }
556 gcc_assert (!swap);
557 if (auto call = dyn_cast<const gcall *> (stmt))
558 {
559 if (gimple_call_internal_p (call))
560 switch (gimple_call_internal_fn (call))
561 {
562 case IFN_MASK_LOAD:
563 return gather_scatter_p ? off_arg2_map : arg2_map;
564
565 case IFN_GATHER_LOAD:
566 return arg1_map;
567
568 case IFN_MASK_GATHER_LOAD:
569 case IFN_MASK_LEN_GATHER_LOAD:
570 return arg1_arg4_map;
571
572 case IFN_MASK_STORE:
573 return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
574
575 case IFN_MASK_CALL:
576 {
577 unsigned nargs = gimple_call_num_args (call);
578 if (nargs >= 2 && nargs <= 7)
579 return mask_call_maps[nargs-2];
580 else
581 return nullptr;
582 }
583
584 case IFN_CLZ:
585 case IFN_CTZ:
586 return arg0_map;
587
588 default:
589 break;
590 }
591 }
592 return nullptr;
593 }
594
595 /* Return the SLP node child index for operand OP of STMT. */
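/* For example, an IFN_MASK_LOAD call uses arg2_map, so its operand 2 (the
   mask) maps to child index 0; for statements without an operand map the
   child index simply equals the operand number. */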
596
597 int
598 vect_slp_child_index_for_operand (const gimple *stmt, int op,
599 bool gather_scatter_p)
600 {
601 const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
602 if (!opmap)
603 return op;
604 for (int i = 1; i < 1 + opmap[0]; ++i)
605 if (opmap[i] == op)
606 return i - 1;
607 gcc_unreachable ();
608 }
609
610 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
611 they are of a valid type and that they match the defs of the first stmt of
612 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
613 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
614 indicates swap is required for cond_expr stmts. Specifically, SWAP
615 is 1 if STMT is cond and operands of comparison need to be swapped;
616 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
617
618 If there was a fatal error return -1; if the error could be corrected by
619 swapping operands of the parent node of this one, return 1; if everything
620 is ok return 0. */
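/* As an example of the operand swapping performed below: for a commutative
   operation with lanes x0 = a0 + 5 and x1 = 5 + a1 the second lane has its
   operands swapped so that internal and constant defs line up with those of
   the first lane. */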
621 static int
622 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
623 bool *skip_args,
624 vec<stmt_vec_info> stmts, unsigned stmt_num,
625 vec<slp_oprnd_info> *oprnds_info)
626 {
627 stmt_vec_info stmt_info = stmts[stmt_num];
628 tree oprnd;
629 unsigned int i, number_of_oprnds;
630 enum vect_def_type dt = vect_uninitialized_def;
631 slp_oprnd_info oprnd_info;
632 gather_scatter_info gs_info;
633 unsigned int gs_op = -1u;
634 unsigned int commutative_op = -1U;
635 bool first = stmt_num == 0;
636
637 if (!is_a<gcall *> (stmt_info->stmt)
638 && !is_a<gassign *> (stmt_info->stmt)
639 && !is_a<gphi *> (stmt_info->stmt))
640 return -1;
641
642 number_of_oprnds = gimple_num_args (stmt_info->stmt);
643 const int *map
644 = vect_get_operand_map (stmt_info->stmt,
645 STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
646 if (map)
647 number_of_oprnds = *map++;
648 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
649 {
650 if (gimple_call_internal_p (stmt))
651 {
652 internal_fn ifn = gimple_call_internal_fn (stmt);
653 commutative_op = first_commutative_argument (ifn);
654 }
655 }
656 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
657 {
658 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
659 commutative_op = 0;
660 }
661
662 bool swapped = (swap != 0);
663 bool backedge = false;
664 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
665 for (i = 0; i < number_of_oprnds; i++)
666 {
667 oprnd_info = (*oprnds_info)[i];
668 int opno = map ? map[i] : int (i);
669 if (opno == -3)
670 {
671 gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
672 if (!is_a <loop_vec_info> (vinfo)
673 || !vect_check_gather_scatter (stmt_info,
674 as_a <loop_vec_info> (vinfo),
675 first ? &oprnd_info->first_gs_info
676 : &gs_info))
677 return -1;
678
679 if (first)
680 {
681 oprnd_info->first_gs_p = true;
682 oprnd = oprnd_info->first_gs_info.offset;
683 }
684 else
685 {
686 gs_op = i;
687 oprnd = gs_info.offset;
688 }
689 }
690 else if (opno < 0)
691 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
692 else
693 {
694 oprnd = gimple_arg (stmt_info->stmt, opno);
695 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
696 {
697 edge e = gimple_phi_arg_edge (stmt, opno);
698 backedge = (is_a <bb_vec_info> (vinfo)
699 ? e->flags & EDGE_DFS_BACK
700 : dominated_by_p (CDI_DOMINATORS, e->src,
701 gimple_bb (stmt_info->stmt)));
702 }
703 }
704 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
705 oprnd = TREE_OPERAND (oprnd, 0);
706
707 stmt_vec_info def_stmt_info;
708 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
709 {
710 if (dump_enabled_p ())
711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
712 "Build SLP failed: can't analyze def for %T\n",
713 oprnd);
714
715 return -1;
716 }
717
718 if (skip_args[i])
719 {
720 oprnd_info->def_stmts.quick_push (NULL);
721 oprnd_info->ops.quick_push (NULL_TREE);
722 oprnd_info->first_dt = vect_uninitialized_def;
723 continue;
724 }
725
726 oprnd_info->def_stmts.quick_push (def_stmt_info);
727 oprnd_info->ops.quick_push (oprnd);
728
729 if (def_stmt_info
730 && is_pattern_stmt_p (def_stmt_info))
731 {
732 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
733 != def_stmt_info)
734 oprnd_info->any_pattern = true;
735 else
736 /* If we promote this to external use the original stmt def. */
737 oprnd_info->ops.last ()
738 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
739 }
740
741 /* If there's an extern def on a backedge make sure we can
742 code-generate at the region start.
743 ??? This is another case that could be fixed by adjusting
744 how we split the function but at the moment we'd have conflicting
745 goals there. */
746 if (backedge
747 && dts[i] == vect_external_def
748 && is_a <bb_vec_info> (vinfo)
749 && TREE_CODE (oprnd) == SSA_NAME
750 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
751 && !dominated_by_p (CDI_DOMINATORS,
752 as_a <bb_vec_info> (vinfo)->bbs[0],
753 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
754 {
755 if (dump_enabled_p ())
756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
757 "Build SLP failed: extern def %T only defined "
758 "on backedge\n", oprnd);
759 return -1;
760 }
761
762 if (first)
763 {
764 tree type = TREE_TYPE (oprnd);
765 dt = dts[i];
766
767 /* For the swapping logic below force vect_reduction_def
768 for the reduction op in a SLP reduction group. */
769 if (!STMT_VINFO_DATA_REF (stmt_info)
770 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
771 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
772 && def_stmt_info)
773 dts[i] = dt = vect_reduction_def;
774
775 /* Check the types of the definition. */
776 switch (dt)
777 {
778 case vect_external_def:
779 case vect_constant_def:
780 case vect_internal_def:
781 case vect_reduction_def:
782 case vect_induction_def:
783 case vect_nested_cycle:
784 case vect_first_order_recurrence:
785 break;
786
787 default:
788 /* FORNOW: Not supported. */
789 if (dump_enabled_p ())
790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
791 "Build SLP failed: illegal type of def %T\n",
792 oprnd);
793 return -1;
794 }
795
796 oprnd_info->first_dt = dt;
797 oprnd_info->first_op_type = type;
798 }
799 }
800 if (first)
801 return 0;
802
803 /* Now match the operand definition types to that of the first stmt. */
804 for (i = 0; i < number_of_oprnds;)
805 {
806 if (skip_args[i])
807 {
808 ++i;
809 continue;
810 }
811
812 oprnd_info = (*oprnds_info)[i];
813 dt = dts[i];
814 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
815 oprnd = oprnd_info->ops[stmt_num];
816 tree type = TREE_TYPE (oprnd);
817
818 if (!types_compatible_p (oprnd_info->first_op_type, type))
819 {
820 if (dump_enabled_p ())
821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
822 "Build SLP failed: different operand types\n");
823 return 1;
824 }
825
826 if ((gs_op == i) != oprnd_info->first_gs_p)
827 {
828 if (dump_enabled_p ())
829 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
830 "Build SLP failed: mixed gather and non-gather\n");
831 return 1;
832 }
833 else if (gs_op == i)
834 {
835 if (!operand_equal_p (oprnd_info->first_gs_info.base,
836 gs_info.base))
837 {
838 if (dump_enabled_p ())
839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
840 "Build SLP failed: different gather base\n");
841 return 1;
842 }
843 if (oprnd_info->first_gs_info.scale != gs_info.scale)
844 {
845 if (dump_enabled_p ())
846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
847 "Build SLP failed: different gather scale\n");
848 return 1;
849 }
850 }
851
852 /* Not first stmt of the group, check that the def-stmt/s match
853 the def-stmt/s of the first stmt. Allow different definition
854 types for reduction chains: the first stmt must be a
855 vect_reduction_def (a phi node), and the rest
856 end in the reduction chain. */
857 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
858 && !(oprnd_info->first_dt == vect_reduction_def
859 && !STMT_VINFO_DATA_REF (stmt_info)
860 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
861 && def_stmt_info
862 && !STMT_VINFO_DATA_REF (def_stmt_info)
863 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
864 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
865 || (!STMT_VINFO_DATA_REF (stmt_info)
866 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
867 && ((!def_stmt_info
868 || STMT_VINFO_DATA_REF (def_stmt_info)
869 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
870 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
871 != (oprnd_info->first_dt != vect_reduction_def))))
872 {
873 /* Try swapping operands if we got a mismatch. For BB
874 vectorization only in case it will clearly improve things. */
875 if (i == commutative_op && !swapped
876 && (!is_a <bb_vec_info> (vinfo)
877 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
878 dts[i+1])
879 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
880 || vect_def_types_match
881 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
882 {
883 if (dump_enabled_p ())
884 dump_printf_loc (MSG_NOTE, vect_location,
885 "trying swapped operands\n");
886 std::swap (dts[i], dts[i+1]);
887 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
888 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
889 std::swap ((*oprnds_info)[i]->ops[stmt_num],
890 (*oprnds_info)[i+1]->ops[stmt_num]);
891 /* After swapping some operands we lost track whether an
892 operand has any pattern defs so be conservative here. */
893 if ((*oprnds_info)[i]->any_pattern
894 || (*oprnds_info)[i+1]->any_pattern)
895 (*oprnds_info)[i]->any_pattern
896 = (*oprnds_info)[i+1]->any_pattern = true;
897 swapped = true;
898 continue;
899 }
900
901 if (is_a <bb_vec_info> (vinfo)
902 && !oprnd_info->any_pattern)
903 {
904 /* Now for commutative ops we should see whether we can
905 make the other operand match. */
906 if (dump_enabled_p ())
907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
908 "treating operand as external\n");
909 oprnd_info->first_dt = dt = vect_external_def;
910 }
911 else
912 {
913 if (dump_enabled_p ())
914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
915 "Build SLP failed: different types\n");
916 return 1;
917 }
918 }
919
920 /* Make sure to demote the overall operand to external. */
921 if (dt == vect_external_def)
922 oprnd_info->first_dt = vect_external_def;
923 /* For a SLP reduction chain we want to duplicate the reduction to
924 each of the chain members. That gets us a sane SLP graph (still
925 the stmts are not 100% correct wrt the initial values). */
926 else if ((dt == vect_internal_def
927 || dt == vect_reduction_def)
928 && oprnd_info->first_dt == vect_reduction_def
929 && !STMT_VINFO_DATA_REF (stmt_info)
930 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
931 && !STMT_VINFO_DATA_REF (def_stmt_info)
932 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
933 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
934 {
935 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
936 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
937 }
938
939 ++i;
940 }
941
942 /* Swap operands. */
943 if (swapped)
944 {
945 if (dump_enabled_p ())
946 dump_printf_loc (MSG_NOTE, vect_location,
947 "swapped operands to match def types in %G",
948 stmt_info->stmt);
949 }
950
951 return 0;
952 }
953
954 /* Return true if call statements CALL1 and CALL2 are similar enough
955 to be combined into the same SLP group. */
956
957 bool
958 compatible_calls_p (gcall *call1, gcall *call2)
959 {
960 unsigned int nargs = gimple_call_num_args (call1);
961 if (nargs != gimple_call_num_args (call2))
962 return false;
963
964 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
965 return false;
966
967 if (gimple_call_internal_p (call1))
968 {
969 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
970 TREE_TYPE (gimple_call_lhs (call2))))
971 return false;
972 for (unsigned int i = 0; i < nargs; ++i)
973 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
974 TREE_TYPE (gimple_call_arg (call2, i))))
975 return false;
976 }
977 else
978 {
979 if (!operand_equal_p (gimple_call_fn (call1),
980 gimple_call_fn (call2), 0))
981 return false;
982
983 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
984 return false;
985 }
986
987 /* Check that any unvectorized arguments are equal. */
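  /* For IFN_MASK_LOAD, for instance, only call argument 2 becomes an SLP
     child (see arg2_map above), so the remaining arguments (such as the
     pointer) must compare equal between the two calls. */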
988 if (const int *map = vect_get_operand_map (call1))
989 {
990 unsigned int nkept = *map++;
991 unsigned int mapi = 0;
992 for (unsigned int i = 0; i < nargs; ++i)
993 if (mapi < nkept && map[mapi] == int (i))
994 mapi += 1;
995 else if (!operand_equal_p (gimple_call_arg (call1, i),
996 gimple_call_arg (call2, i)))
997 return false;
998 }
999
1000 return true;
1001 }
1002
1003 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1004 caller's attempt to find the vector type in STMT_INFO with the narrowest
1005 element type. Return true if VECTYPE is nonnull and if it is valid
1006 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1007 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1008 vect_build_slp_tree. */
1009
1010 static bool
1011 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1012 unsigned int group_size,
1013 tree vectype, poly_uint64 *max_nunits)
1014 {
1015 if (!vectype)
1016 {
1017 if (dump_enabled_p ())
1018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1019 "Build SLP failed: unsupported data-type in %G\n",
1020 stmt_info->stmt);
1021 /* Fatal mismatch. */
1022 return false;
1023 }
1024
1025 /* If populating the vector type requires unrolling then fail
1026 before adjusting *max_nunits for basic-block vectorization. */
1027 if (is_a <bb_vec_info> (vinfo)
1028 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1029 {
1030 if (dump_enabled_p ())
1031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1032 "Build SLP failed: unrolling required "
1033 "in basic block SLP\n");
1034 /* Fatal mismatch. */
1035 return false;
1036 }
1037
1038 /* In case of multiple types we need to detect the smallest type. */
1039 vect_update_max_nunits (max_nunits, vectype);
1040 return true;
1041 }
1042
1043 /* Verify that the scalar stmts STMTS are isomorphic, do not require data
1044 permutation and are of supported types of operation. Return
1045 true if so, otherwise return false and indicate in *MATCHES
1046 which stmts are not isomorphic to the first one. If MATCHES[0]
1047 is false then this indicates the comparison could not be
1048 carried out or the stmts will never be vectorized by SLP.
1049
1050 Note COND_EXPR is possibly isomorphic to another one after swapping its
1051 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1052 the first stmt by swapping the two operands of comparison; set SWAP[i]
1053 to 2 if stmt I is isomorphic to the first stmt by inverting the code
1054 of comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
1055 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
1056
1057 static bool
1058 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1059 vec<stmt_vec_info> stmts, unsigned int group_size,
1060 poly_uint64 *max_nunits, bool *matches,
1061 bool *two_operators, tree *node_vectype)
1062 {
1063 unsigned int i;
1064 stmt_vec_info first_stmt_info = stmts[0];
1065 code_helper first_stmt_code = ERROR_MARK;
1066 code_helper alt_stmt_code = ERROR_MARK;
1067 code_helper rhs_code = ERROR_MARK;
1068 code_helper first_cond_code = ERROR_MARK;
1069 tree lhs;
1070 bool need_same_oprnds = false;
1071 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
1072 stmt_vec_info first_load = NULL, prev_first_load = NULL;
1073 bool first_stmt_ldst_p = false, ldst_p = false;
1074 bool first_stmt_phi_p = false, phi_p = false;
1075 bool maybe_soft_fail = false;
1076 tree soft_fail_nunits_vectype = NULL_TREE;
1077
1078 /* For every stmt in NODE find its def stmt/s. */
1079 stmt_vec_info stmt_info;
1080 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1081 {
1082 gimple *stmt = stmt_info->stmt;
1083 swap[i] = 0;
1084 matches[i] = false;
1085
1086 if (dump_enabled_p ())
1087 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1088
1089 /* Fail to vectorize statements marked as unvectorizable, throw
1090 or are volatile. */
1091 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1092 || stmt_can_throw_internal (cfun, stmt)
1093 || gimple_has_volatile_ops (stmt))
1094 {
1095 if (dump_enabled_p ())
1096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1097 "Build SLP failed: unvectorizable statement %G",
1098 stmt);
1099 /* ??? For BB vectorization we want to commute operands so as
1100 to shuffle all unvectorizable defs into one operand and have
1101 the other still vectorized. The following doesn't reliably
1102 work for this though, but it's the easiest we can do here. */
1103 if (is_a <bb_vec_info> (vinfo) && i != 0)
1104 continue;
1105 /* Fatal mismatch. */
1106 matches[0] = false;
1107 return false;
1108 }
1109
1110 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1111 lhs = gimple_get_lhs (stmt);
1112 if (lhs == NULL_TREE
1113 && (!call_stmt
1114 || !gimple_call_internal_p (stmt)
1115 || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1116 {
1117 if (dump_enabled_p ())
1118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1119 "Build SLP failed: not GIMPLE_ASSIGN nor "
1120 "GIMPLE_CALL %G", stmt);
1121 if (is_a <bb_vec_info> (vinfo) && i != 0)
1122 continue;
1123 /* Fatal mismatch. */
1124 matches[0] = false;
1125 return false;
1126 }
1127
1128 tree nunits_vectype;
1129 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1130 &nunits_vectype, group_size))
1131 {
1132 if (is_a <bb_vec_info> (vinfo) && i != 0)
1133 continue;
1134 /* Fatal mismatch. */
1135 matches[0] = false;
1136 return false;
1137 }
1138 /* Record nunits required but continue analysis, producing matches[]
1139 as if nunits was not an issue. This allows splitting of groups
1140 to happen. */
1141 if (nunits_vectype
1142 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1143 nunits_vectype, max_nunits))
1144 {
1145 gcc_assert (is_a <bb_vec_info> (vinfo));
1146 maybe_soft_fail = true;
1147 soft_fail_nunits_vectype = nunits_vectype;
1148 }
1149
1150 gcc_assert (vectype);
1151
1152 if (call_stmt)
1153 {
1154 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1155 if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1156 rhs_code = cfn;
1157 else
1158 rhs_code = CALL_EXPR;
1159
1160 if (cfn == CFN_MASK_LOAD
1161 || cfn == CFN_GATHER_LOAD
1162 || cfn == CFN_MASK_GATHER_LOAD
1163 || cfn == CFN_MASK_LEN_GATHER_LOAD)
1164 ldst_p = true;
1165 else if (cfn == CFN_MASK_STORE)
1166 {
1167 ldst_p = true;
1168 rhs_code = CFN_MASK_STORE;
1169 }
1170 else if ((cfn != CFN_LAST
1171 && cfn != CFN_MASK_CALL
1172 && internal_fn_p (cfn)
1173 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1174 || gimple_call_tail_p (call_stmt)
1175 || gimple_call_noreturn_p (call_stmt)
1176 || gimple_call_chain (call_stmt))
1177 {
1178 if (dump_enabled_p ())
1179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1180 "Build SLP failed: unsupported call type %G",
1181 (gimple *) call_stmt);
1182 if (is_a <bb_vec_info> (vinfo) && i != 0)
1183 continue;
1184 /* Fatal mismatch. */
1185 matches[0] = false;
1186 return false;
1187 }
1188 }
1189 else if (gimple_code (stmt) == GIMPLE_PHI)
1190 {
1191 rhs_code = ERROR_MARK;
1192 phi_p = true;
1193 }
1194 else
1195 {
1196 rhs_code = gimple_assign_rhs_code (stmt);
1197 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1198 }
1199
1200 /* Check the operation. */
1201 if (i == 0)
1202 {
1203 *node_vectype = vectype;
1204 first_stmt_code = rhs_code;
1205 first_stmt_ldst_p = ldst_p;
1206 first_stmt_phi_p = phi_p;
1207
1208 /* Shift arguments should be equal in all the packed stmts for a
1209 vector shift with scalar shift operand. */
1210 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1211 || rhs_code == LROTATE_EXPR
1212 || rhs_code == RROTATE_EXPR)
1213 {
1214 /* First see if we have a vector/vector shift. */
1215 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1216 {
1217 /* No vector/vector shift, try for a vector/scalar shift. */
1218 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1219 {
1220 if (dump_enabled_p ())
1221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1222 "Build SLP failed: "
1223 "op not supported by target.\n");
1224 if (is_a <bb_vec_info> (vinfo) && i != 0)
1225 continue;
1226 /* Fatal mismatch. */
1227 matches[0] = false;
1228 return false;
1229 }
1230 need_same_oprnds = true;
1231 first_op1 = gimple_assign_rhs2 (stmt);
1232 }
1233 }
1234 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1235 {
1236 need_same_oprnds = true;
1237 first_op1 = gimple_assign_rhs2 (stmt);
1238 }
1239 else if (!ldst_p
1240 && rhs_code == BIT_FIELD_REF)
1241 {
1242 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1243 if (!is_a <bb_vec_info> (vinfo)
1244 || TREE_CODE (vec) != SSA_NAME
1245 /* When the element types are not compatible we pun the
1246 source to the target vectype which requires equal size. */
1247 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1248 || !types_compatible_p (TREE_TYPE (vectype),
1249 TREE_TYPE (TREE_TYPE (vec))))
1250 && !operand_equal_p (TYPE_SIZE (vectype),
1251 TYPE_SIZE (TREE_TYPE (vec)))))
1252 {
1253 if (dump_enabled_p ())
1254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1255 "Build SLP failed: "
1256 "BIT_FIELD_REF not supported\n");
1257 /* Fatal mismatch. */
1258 matches[0] = false;
1259 return false;
1260 }
1261 }
1262 else if (rhs_code == CFN_DIV_POW2)
1263 {
1264 need_same_oprnds = true;
1265 first_op1 = gimple_call_arg (call_stmt, 1);
1266 }
1267 }
1268 else
1269 {
1270 if (first_stmt_code != rhs_code
1271 && alt_stmt_code == ERROR_MARK)
1272 alt_stmt_code = rhs_code;
1273 if ((first_stmt_code != rhs_code
1274 && (first_stmt_code != IMAGPART_EXPR
1275 || rhs_code != REALPART_EXPR)
1276 && (first_stmt_code != REALPART_EXPR
1277 || rhs_code != IMAGPART_EXPR)
1278 /* Handle mismatches in plus/minus by computing both
1279 and merging the results. */
1280 && !((first_stmt_code == PLUS_EXPR
1281 || first_stmt_code == MINUS_EXPR)
1282 && (alt_stmt_code == PLUS_EXPR
1283 || alt_stmt_code == MINUS_EXPR)
1284 && rhs_code == alt_stmt_code)
1285 && !(first_stmt_code.is_tree_code ()
1286 && rhs_code.is_tree_code ()
1287 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1288 == tcc_comparison)
1289 && (swap_tree_comparison (tree_code (first_stmt_code))
1290 == tree_code (rhs_code)))
1291 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1292 && (first_stmt_code == ARRAY_REF
1293 || first_stmt_code == BIT_FIELD_REF
1294 || first_stmt_code == INDIRECT_REF
1295 || first_stmt_code == COMPONENT_REF
1296 || first_stmt_code == MEM_REF)
1297 && (rhs_code == ARRAY_REF
1298 || rhs_code == BIT_FIELD_REF
1299 || rhs_code == INDIRECT_REF
1300 || rhs_code == COMPONENT_REF
1301 || rhs_code == MEM_REF)))
1302 || (ldst_p
1303 && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1304 != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1305 || (ldst_p
1306 && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1307 != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1308 || first_stmt_ldst_p != ldst_p
1309 || first_stmt_phi_p != phi_p)
1310 {
1311 if (dump_enabled_p ())
1312 {
1313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1314 "Build SLP failed: different operation "
1315 "in stmt %G", stmt);
1316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1317 "original stmt %G", first_stmt_info->stmt);
1318 }
1319 /* Mismatch. */
1320 continue;
1321 }
1322
1323 if (!ldst_p
1324 && first_stmt_code == BIT_FIELD_REF
1325 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1326 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1327 {
1328 if (dump_enabled_p ())
1329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1330 "Build SLP failed: different BIT_FIELD_REF "
1331 "arguments in %G", stmt);
1332 /* Mismatch. */
1333 continue;
1334 }
1335
1336 if (call_stmt
1337 && first_stmt_code != CFN_MASK_LOAD
1338 && first_stmt_code != CFN_MASK_STORE)
1339 {
1340 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1341 call_stmt))
1342 {
1343 if (dump_enabled_p ())
1344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1345 "Build SLP failed: different calls in %G",
1346 stmt);
1347 /* Mismatch. */
1348 continue;
1349 }
1350 }
1351
1352 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1353 && (gimple_bb (first_stmt_info->stmt)
1354 != gimple_bb (stmt_info->stmt)))
1355 {
1356 if (dump_enabled_p ())
1357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1358 "Build SLP failed: different BB for PHI "
1359 "or possibly trapping operation in %G", stmt);
1360 /* Mismatch. */
1361 continue;
1362 }
1363
1364 if (need_same_oprnds)
1365 {
1366 tree other_op1 = gimple_arg (stmt, 1);
1367 if (!operand_equal_p (first_op1, other_op1, 0))
1368 {
1369 if (dump_enabled_p ())
1370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1371 "Build SLP failed: different shift "
1372 "arguments in %G", stmt);
1373 /* Mismatch. */
1374 continue;
1375 }
1376 }
1377
1378 if (!types_compatible_p (vectype, *node_vectype))
1379 {
1380 if (dump_enabled_p ())
1381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1382 "Build SLP failed: different vector type "
1383 "in %G", stmt);
1384 /* Mismatch. */
1385 continue;
1386 }
1387 }
1388
1389 /* Grouped store or load. */
1390 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1391 {
1392 gcc_assert (ldst_p);
1393 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1394 {
1395 /* Store. */
1396 gcc_assert (rhs_code == CFN_MASK_STORE
1397 || REFERENCE_CLASS_P (lhs)
1398 || DECL_P (lhs));
1399 }
1400 else
1401 {
1402 /* Load. */
1403 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1404 if (prev_first_load)
1405 {
1406 /* Check that there are no loads from different interleaving
1407 chains in the same node. */
1408 if (prev_first_load != first_load)
1409 {
1410 if (dump_enabled_p ())
1411 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1412 vect_location,
1413 "Build SLP failed: different "
1414 "interleaving chains in one node %G",
1415 stmt);
1416 /* Mismatch. */
1417 continue;
1418 }
1419 }
1420 else
1421 prev_first_load = first_load;
1422 }
1423 }
1424 /* Non-grouped store or load. */
1425 else if (ldst_p)
1426 {
1427 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1428 && rhs_code != CFN_GATHER_LOAD
1429 && rhs_code != CFN_MASK_GATHER_LOAD
1430 && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1431 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1432 /* Non-grouped loads are handled as externals for BB
1433 vectorization. For loop vectorization we can handle
1434 splats the same way we handle single-element interleaving. */
1435 && (is_a <bb_vec_info> (vinfo)
1436 || stmt_info != first_stmt_info))
1437 {
1438 /* Not grouped load. */
1439 if (dump_enabled_p ())
1440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1441 "Build SLP failed: not grouped load %G", stmt);
1442
1443 if (i != 0)
1444 continue;
1445 /* Fatal mismatch. */
1446 matches[0] = false;
1447 return false;
1448 }
1449 }
1450 /* Not a memory operation. */
1451 else
1452 {
1453 if (!phi_p
1454 && rhs_code.is_tree_code ()
1455 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1456 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1457 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1458 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1459 && rhs_code != VIEW_CONVERT_EXPR
1460 && rhs_code != CALL_EXPR
1461 && rhs_code != BIT_FIELD_REF)
1462 {
1463 if (dump_enabled_p ())
1464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1465 "Build SLP failed: operation unsupported %G",
1466 stmt);
1467 if (is_a <bb_vec_info> (vinfo) && i != 0)
1468 continue;
1469 /* Fatal mismatch. */
1470 matches[0] = false;
1471 return false;
1472 }
1473
1474 if (rhs_code == COND_EXPR)
1475 {
1476 tree cond_expr = gimple_assign_rhs1 (stmt);
1477 enum tree_code cond_code = TREE_CODE (cond_expr);
1478 enum tree_code swap_code = ERROR_MARK;
1479 enum tree_code invert_code = ERROR_MARK;
1480
1481 if (i == 0)
1482 first_cond_code = TREE_CODE (cond_expr);
1483 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1484 {
1485 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1486 swap_code = swap_tree_comparison (cond_code);
1487 invert_code = invert_tree_comparison (cond_code, honor_nans);
1488 }
1489
1490 if (first_cond_code == cond_code)
1491 ;
1492 /* Isomorphic can be achieved by swapping. */
1493 else if (first_cond_code == swap_code)
1494 swap[i] = 1;
1495 /* Isomorphic can be achieved by inverting. */
1496 else if (first_cond_code == invert_code)
1497 swap[i] = 2;
1498 else
1499 {
1500 if (dump_enabled_p ())
1501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1502 "Build SLP failed: different"
1503 " operation %G", stmt);
1504 /* Mismatch. */
1505 continue;
1506 }
1507 }
1508
1509 if (rhs_code.is_tree_code ()
1510 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1511 && (swap_tree_comparison ((tree_code)first_stmt_code)
1512 == (tree_code)rhs_code))
1513 swap[i] = 1;
1514 }
1515
1516 matches[i] = true;
1517 }
1518
1519 for (i = 0; i < group_size; ++i)
1520 if (!matches[i])
1521 return false;
1522
1523 /* If we allowed a two-operation SLP node verify the target can cope
1524 with the permute we are going to use. */
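  /* For example, a group like { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } is
     accepted with PLUS_EXPR as the main and MINUS_EXPR as the alternate
     operation; both operations are then computed on all lanes and the
     results blended with a lane permutation (cf.
     vect_slp_build_two_operator_nodes below). */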
1525 if (alt_stmt_code != ERROR_MARK
1526 && (!alt_stmt_code.is_tree_code ()
1527 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1528 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1529 {
1530 *two_operators = true;
1531 }
1532
1533 if (maybe_soft_fail)
1534 {
1535 unsigned HOST_WIDE_INT const_nunits;
1536 if (!TYPE_VECTOR_SUBPARTS
1537 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1538 || const_nunits > group_size)
1539 matches[0] = false;
1540 else
1541 {
1542 /* With a constant number of vector elements, simulate a mismatch
1543 at the point we need to split. */
1544 unsigned tail = group_size & (const_nunits - 1);
1545 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1546 }
1547 return false;
1548 }
1549
1550 return true;
1551 }
1552
1553 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1554 Note we never remove apart from at destruction time so we do not
1555 need a special value for deleted that differs from empty. */
1556 struct bst_traits
1557 {
1558 typedef vec <stmt_vec_info> value_type;
1559 typedef vec <stmt_vec_info> compare_type;
1560 static inline hashval_t hash (value_type);
1561 static inline bool equal (value_type existing, value_type candidate);
1562 static inline bool is_empty (value_type x) { return !x.exists (); }
1563 static inline bool is_deleted (value_type x) { return !x.exists (); }
1564 static const bool empty_zero_p = true;
1565 static inline void mark_empty (value_type &x) { x.release (); }
1566 static inline void mark_deleted (value_type &x) { x.release (); }
1567 static inline void remove (value_type &x) { x.release (); }
1568 };
1569 inline hashval_t
1570 bst_traits::hash (value_type x)
1571 {
1572 inchash::hash h;
1573 for (unsigned i = 0; i < x.length (); ++i)
1574 h.add_int (gimple_uid (x[i]->stmt));
1575 return h.end ();
1576 }
1577 inline bool
1578 bst_traits::equal (value_type existing, value_type candidate)
1579 {
1580 if (existing.length () != candidate.length ())
1581 return false;
1582 for (unsigned i = 0; i < existing.length (); ++i)
1583 if (existing[i] != candidate[i])
1584 return false;
1585 return true;
1586 }
1587
1588 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1589 but then vec::insert does memmove and that's not compatible with
1590 std::pair. */
1591 struct chain_op_t
1592 {
1593 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1594 : code (code_), dt (dt_), op (op_) {}
1595 tree_code code;
1596 vect_def_type dt;
1597 tree op;
1598 };
1599
1600 /* Comparator for sorting associatable chains. */
1601
1602 static int
1603 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1604 {
1605 auto *op1 = (const chain_op_t *) op1_;
1606 auto *op2 = (const chain_op_t *) op2_;
1607 if (op1->dt != op2->dt)
1608 return (int)op1->dt - (int)op2->dt;
1609 return (int)op1->code - (int)op2->code;
1610 }
1611
1612 /* Linearize the associatable expression chain at START with the
1613 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1614 filling CHAIN with the result and using WORKLIST as intermediate storage.
1615 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1616 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1617 stmts, starting with START. */
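/* For example, with CODE == PLUS_EXPR a lane computing ((a - b) + c) - d is
   linearized into the chain entries (+, a), (-, b), (+, c), (-, d) (not
   necessarily in this order), each tagged with the vect_def_type of the
   respective operand. */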
1618
1619 static void
1620 vect_slp_linearize_chain (vec_info *vinfo,
1621 vec<std::pair<tree_code, gimple *> > &worklist,
1622 vec<chain_op_t> &chain,
1623 enum tree_code code, gimple *start,
1624 gimple *&code_stmt, gimple *&alt_code_stmt,
1625 vec<gimple *> *chain_stmts)
1626 {
1627 /* For each lane linearize the addition/subtraction (or other
1628 uniform associatable operation) expression tree. */
1629 worklist.safe_push (std::make_pair (code, start));
1630 while (!worklist.is_empty ())
1631 {
1632 auto entry = worklist.pop ();
1633 gassign *stmt = as_a <gassign *> (entry.second);
1634 enum tree_code in_code = entry.first;
1635 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1636 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1637 if (!code_stmt
1638 && gimple_assign_rhs_code (stmt) == code)
1639 code_stmt = stmt;
1640 else if (!alt_code_stmt
1641 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1642 alt_code_stmt = stmt;
1643 if (chain_stmts)
1644 chain_stmts->safe_push (stmt);
1645 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1646 {
1647 tree op = gimple_op (stmt, opnum);
1648 vect_def_type dt;
1649 stmt_vec_info def_stmt_info;
1650 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1651 gcc_assert (res);
1652 if (dt == vect_internal_def
1653 && is_pattern_stmt_p (def_stmt_info))
1654 op = gimple_get_lhs (def_stmt_info->stmt);
1655 gimple *use_stmt;
1656 use_operand_p use_p;
1657 if (dt == vect_internal_def
1658 && single_imm_use (op, &use_p, &use_stmt)
1659 && is_gimple_assign (def_stmt_info->stmt)
1660 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1661 || (code == PLUS_EXPR
1662 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1663 == MINUS_EXPR))))
1664 {
1665 tree_code op_def_code = this_code;
1666 if (op_def_code == MINUS_EXPR && opnum == 1)
1667 op_def_code = PLUS_EXPR;
1668 if (in_code == MINUS_EXPR)
1669 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1670 worklist.safe_push (std::make_pair (op_def_code,
1671 def_stmt_info->stmt));
1672 }
1673 else
1674 {
1675 tree_code op_def_code = this_code;
1676 if (op_def_code == MINUS_EXPR && opnum == 1)
1677 op_def_code = PLUS_EXPR;
1678 if (in_code == MINUS_EXPR)
1679 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1680 chain.safe_push (chain_op_t (op_def_code, dt, op));
1681 }
1682 }
1683 }
1684 }
1685
1686 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1687 simple_hashmap_traits <bst_traits, slp_tree> >
1688 scalar_stmts_to_slp_tree_map_t;
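/* This map caches the result of SLP discovery for a vector of scalar stmts:
   vect_build_slp_tree below re-uses an already built node (or its recorded
   failure, see _slp_tree::failed) instead of re-running discovery. */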
1689
1690 static slp_tree
1691 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1692 vec<stmt_vec_info> stmts, unsigned int group_size,
1693 poly_uint64 *max_nunits,
1694 bool *matches, unsigned *limit, unsigned *tree_size,
1695 scalar_stmts_to_slp_tree_map_t *bst_map);
1696
1697 static slp_tree
1698 vect_build_slp_tree (vec_info *vinfo,
1699 vec<stmt_vec_info> stmts, unsigned int group_size,
1700 poly_uint64 *max_nunits,
1701 bool *matches, unsigned *limit, unsigned *tree_size,
1702 scalar_stmts_to_slp_tree_map_t *bst_map)
1703 {
1704 if (slp_tree *leader = bst_map->get (stmts))
1705 {
1706 if (dump_enabled_p ())
1707 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1708 !(*leader)->failed ? "" : "failed ",
1709 (void *) *leader);
1710 if (!(*leader)->failed)
1711 {
1712 SLP_TREE_REF_COUNT (*leader)++;
1713 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1714 stmts.release ();
1715 return *leader;
1716 }
1717 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1718 return NULL;
1719 }
1720
1721 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1722 so we can pick up backedge destinations during discovery. */
1723 slp_tree res = new _slp_tree;
1724 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1725 SLP_TREE_SCALAR_STMTS (res) = stmts;
1726 bst_map->put (stmts.copy (), res);
1727
1728 if (*limit == 0)
1729 {
1730 if (dump_enabled_p ())
1731 dump_printf_loc (MSG_NOTE, vect_location,
1732 "SLP discovery limit exceeded\n");
1733 /* Mark the node invalid so we can detect those when still in use
1734 as backedge destinations. */
1735 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1736 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1737 res->failed = XNEWVEC (bool, group_size);
1738 memset (res->failed, 0, sizeof (bool) * group_size);
1739 memset (matches, 0, sizeof (bool) * group_size);
1740 return NULL;
1741 }
1742 --*limit;
1743
1744 if (dump_enabled_p ())
1745 dump_printf_loc (MSG_NOTE, vect_location,
1746 "starting SLP discovery for node %p\n", (void *) res);
1747
1748 poly_uint64 this_max_nunits = 1;
1749 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1750 &this_max_nunits,
1751 matches, limit, tree_size, bst_map);
1752 if (!res_)
1753 {
1754 if (dump_enabled_p ())
1755 dump_printf_loc (MSG_NOTE, vect_location,
1756 "SLP discovery for node %p failed\n", (void *) res);
1757 /* Mark the node invalid so we can detect those when still in use
1758 as backedge destinations. */
1759 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1760 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1761 res->failed = XNEWVEC (bool, group_size);
1762 if (flag_checking)
1763 {
1764 unsigned i;
1765 for (i = 0; i < group_size; ++i)
1766 if (!matches[i])
1767 break;
1768 gcc_assert (i < group_size);
1769 }
1770 memcpy (res->failed, matches, sizeof (bool) * group_size);
1771 }
1772 else
1773 {
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_NOTE, vect_location,
1776 "SLP discovery for node %p succeeded\n",
1777 (void *) res);
1778 gcc_assert (res_ == res);
1779 res->max_nunits = this_max_nunits;
1780 vect_update_max_nunits (max_nunits, this_max_nunits);
1781 /* Keep a reference for the bst_map use. */
1782 SLP_TREE_REF_COUNT (res)++;
1783 }
1784 return res_;
1785 }
1786
1787 /* Helper for building an associated SLP node chain. */
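/* The helper below builds the two-operator form: CHILD1 and CHILD2 both get
   OP0 and OP1 as children but use OPER1 resp. OPER2 as representative stmt,
   and PERM selects, per lane, the result of CHILD1 or CHILD2 according to
   LPERM. */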
1788
1789 static void
1790 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1791 slp_tree op0, slp_tree op1,
1792 stmt_vec_info oper1, stmt_vec_info oper2,
1793 vec<std::pair<unsigned, unsigned> > lperm)
1794 {
1795 unsigned group_size = SLP_TREE_LANES (op1);
1796
1797 slp_tree child1 = new _slp_tree;
1798 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1799 SLP_TREE_VECTYPE (child1) = vectype;
1800 SLP_TREE_LANES (child1) = group_size;
1801 SLP_TREE_CHILDREN (child1).create (2);
1802 SLP_TREE_CHILDREN (child1).quick_push (op0);
1803 SLP_TREE_CHILDREN (child1).quick_push (op1);
1804 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1805
1806 slp_tree child2 = new _slp_tree;
1807 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1808 SLP_TREE_VECTYPE (child2) = vectype;
1809 SLP_TREE_LANES (child2) = group_size;
1810 SLP_TREE_CHILDREN (child2).create (2);
1811 SLP_TREE_CHILDREN (child2).quick_push (op0);
1812 SLP_TREE_REF_COUNT (op0)++;
1813 SLP_TREE_CHILDREN (child2).quick_push (op1);
1814 SLP_TREE_REF_COUNT (op1)++;
1815 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1816
1817 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1818 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1819 SLP_TREE_VECTYPE (perm) = vectype;
1820 SLP_TREE_LANES (perm) = group_size;
1821 /* ??? We should set this NULL but that's not expected. */
1822 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1823 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1824 SLP_TREE_CHILDREN (perm).quick_push (child1);
1825 SLP_TREE_CHILDREN (perm).quick_push (child2);
1826 }
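
/* As an illustration (all names made up): for a two-lane group
{ x0 = a0 + b0, x1 = a1 - b1 } with operand nodes OP0 = { a0, a1 } and
OP1 = { b0, b1 }, CHILD1 computes the PLUS for both lanes, CHILD2 the
MINUS for both lanes, and PERM selects lane 0 from CHILD1 and lane 1
from CHILD2 via the lane permutation { (0, 0), (1, 1) }. */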
1827
1828 /* Recursively build an SLP tree for the group of scalar stmts STMTS.
1829 On success return the SLP node built for NODE; on failure return NULL
1830 and set MATCHES to indicate which lanes of the group matched up
1831 (def-stmts that are not isomorphic, require a data permutation or use
1832 an unsupported operation cause a mismatch). A false MATCHES[0]
1833 signals a fatal mismatch. */
1834
1835 static slp_tree
1836 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1837 vec<stmt_vec_info> stmts, unsigned int group_size,
1838 poly_uint64 *max_nunits,
1839 bool *matches, unsigned *limit, unsigned *tree_size,
1840 scalar_stmts_to_slp_tree_map_t *bst_map)
1841 {
1842 unsigned nops, i, this_tree_size = 0;
1843 poly_uint64 this_max_nunits = *max_nunits;
1844
1845 matches[0] = false;
1846
1847 stmt_vec_info stmt_info = stmts[0];
1848 if (!is_a<gcall *> (stmt_info->stmt)
1849 && !is_a<gassign *> (stmt_info->stmt)
1850 && !is_a<gphi *> (stmt_info->stmt))
1851 return NULL;
1852
1853 nops = gimple_num_args (stmt_info->stmt);
1854 if (const int *map = vect_get_operand_map (stmt_info->stmt,
1855 STMT_VINFO_GATHER_SCATTER_P
1856 (stmt_info)))
1857 nops = map[0];
1858
1859 /* If the SLP node is a PHI (induction or reduction), terminate
1860 the recursion. */
1861 bool *skip_args = XALLOCAVEC (bool, nops);
1862 memset (skip_args, 0, sizeof (bool) * nops);
1863 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1864 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1865 {
1866 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1867 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1868 group_size);
1869 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1870 max_nunits))
1871 return NULL;
1872
1873 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1874 if (def_type == vect_induction_def)
1875 {
1876 /* Induction PHIs are not cycles but walk the initial
1877 value. Only for inner loops though; for outer loops
1878 we need to pick up the value from the actual PHIs
1879 to more easily support peeling and epilogue vectorization. */
1880 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1881 if (!nested_in_vect_loop_p (loop, stmt_info))
1882 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1883 else
1884 loop = loop->inner;
1885 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1886 }
1887 else if (def_type == vect_reduction_def
1888 || def_type == vect_double_reduction_def
1889 || def_type == vect_nested_cycle
1890 || def_type == vect_first_order_recurrence)
1891 {
1892 /* Else def types have to match. */
1893 stmt_vec_info other_info;
1894 bool all_same = true;
1895 FOR_EACH_VEC_ELT (stmts, i, other_info)
1896 {
1897 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1898 return NULL;
1899 if (other_info != stmt_info)
1900 all_same = false;
1901 }
1902 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1903 /* Reduction initial values are not explicitly represented. */
1904 if (def_type != vect_first_order_recurrence
1905 && !nested_in_vect_loop_p (loop, stmt_info))
1906 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1907 /* Reduction chain backedge defs are filled manually.
1908 ??? Need a better way to identify a SLP reduction chain PHI.
1909 Or a better overall way to SLP match those. */
1910 if (all_same && def_type == vect_reduction_def)
1911 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1912 }
1913 else if (def_type != vect_internal_def)
1914 return NULL;
1915 }
1916
1917
1918 bool two_operators = false;
1919 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1920 tree vectype = NULL_TREE;
1921 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1922 &this_max_nunits, matches, &two_operators,
1923 &vectype))
1924 return NULL;
1925
1926 /* If the SLP node is a load, terminate the recursion unless masked. */
1927 if (STMT_VINFO_DATA_REF (stmt_info)
1928 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1929 {
1930 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1931 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1932 else
1933 {
1934 *max_nunits = this_max_nunits;
1935 (*tree_size)++;
1936 node = vect_create_new_slp_node (node, stmts, 0);
1937 SLP_TREE_VECTYPE (node) = vectype;
1938 /* And compute the load permutation. Whether it is actually
1939 a permutation depends on the unrolling factor which is
1940 decided later. */
1941 vec<unsigned> load_permutation;
1942 int j;
1943 stmt_vec_info load_info;
1944 load_permutation.create (group_size);
1945 stmt_vec_info first_stmt_info
1946 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1947 bool any_permute = false;
1948 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1949 {
1950 int load_place;
1951 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1952 load_place = vect_get_place_in_interleaving_chain
1953 (load_info, first_stmt_info);
1954 else
1955 load_place = 0;
1956 gcc_assert (load_place != -1);
1957 any_permute |= load_place != j;
1958 load_permutation.quick_push (load_place);
1959 }
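
/* As an illustration (array A made up): for a four-lane group of loads
{ A[1], A[0], A[3], A[2] } whose interleaving chain starts at A[0], the
load_permutation computed above is { 1, 0, 3, 2 } and any_permute is
true; loads appearing in chain order would give the identity
{ 0, 1, 2, 3 } with any_permute false. */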
1960
1961 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1962 {
1963 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1964 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1965 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
1966 || gimple_call_internal_p (stmt,
1967 IFN_MASK_LEN_GATHER_LOAD));
1968 load_permutation.release ();
1969 /* We cannot handle permuted masked loads, see PR114375. */
1970 if (any_permute
1971 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1972 && DR_GROUP_SIZE (first_stmt_info) != group_size)
1973 || STMT_VINFO_STRIDED_P (stmt_info))
1974 {
1975 matches[0] = false;
1976 return NULL;
1977 }
1978 }
1979 else
1980 {
1981 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1982 return node;
1983 }
1984 }
1985 }
1986 else if (gimple_assign_single_p (stmt_info->stmt)
1987 && !gimple_vuse (stmt_info->stmt)
1988 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1989 {
1990 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1991 the same SSA name vector, of a type compatible with vectype. */
1992 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1993 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1994 stmt_vec_info estmt_info;
1995 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1996 {
1997 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1998 tree bfref = gimple_assign_rhs1 (estmt);
1999 HOST_WIDE_INT lane;
2000 if (!known_eq (bit_field_size (bfref),
2001 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
2002 || !constant_multiple_p (bit_field_offset (bfref),
2003 bit_field_size (bfref), &lane))
2004 {
2005 lperm.release ();
2006 matches[0] = false;
2007 return NULL;
2008 }
2009 lperm.safe_push (std::make_pair (0, (unsigned)lane));
2010 }
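
/* As an illustration (vector and element size made up): for two extracts
_1 = BIT_FIELD_REF <v_2, 32, 32> and _3 = BIT_FIELD_REF <v_2, 32, 0>
from a V4SI vector v_2, the lanes computed above are 1 and 0 and lperm
becomes { (0, 1), (0, 0) }. */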
2011 slp_tree vnode = vect_create_new_slp_node (vNULL);
2012 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2013 /* ??? We record vectype here but we hide eventually necessary
2014 punning and instead rely on code generation to materialize
2015 VIEW_CONVERT_EXPRs as necessary. We instead should make
2016 this explicit somehow. */
2017 SLP_TREE_VECTYPE (vnode) = vectype;
2018 else
2019 {
2020 /* For different size but compatible elements we can still
2021 use VEC_PERM_EXPR without punning. */
2022 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2023 && types_compatible_p (TREE_TYPE (vectype),
2024 TREE_TYPE (TREE_TYPE (vec))));
2025 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2026 }
2027 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2028 unsigned HOST_WIDE_INT const_nunits;
2029 if (nunits.is_constant (&const_nunits))
2030 SLP_TREE_LANES (vnode) = const_nunits;
2031 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2032 /* We are always building a permutation node even if it is an identity
2033 permute to shield the rest of the vectorizer from the odd node
2034 representing an actual vector without any scalar ops.
2035 ??? We could hide it completely by making the permute node
2036 external? */
2037 node = vect_create_new_slp_node (node, stmts, 1);
2038 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2039 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2040 SLP_TREE_VECTYPE (node) = vectype;
2041 SLP_TREE_CHILDREN (node).quick_push (vnode);
2042 return node;
2043 }
2044 /* When discovery reaches an associatable operation see whether we can
2045 improve that to match up lanes in a way superior to the operand
2046 swapping code which at most looks at two defs.
2047 ??? For BB vectorization we cannot do the brute-force search
2048 for matching as we can succeed by means of builds from scalars
2049 and have no good way to "cost" one build against another. */
2050 else if (is_a <loop_vec_info> (vinfo)
2051 /* ??? We don't handle !vect_internal_def defs below. */
2052 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2053 && is_gimple_assign (stmt_info->stmt)
2054 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2055 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2056 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2057 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2058 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2059 {
2060 /* See if we have a chain of (mixed) adds or subtracts or other
2061 associatable ops. */
2062 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2063 if (code == MINUS_EXPR)
2064 code = PLUS_EXPR;
2065 stmt_vec_info other_op_stmt_info = NULL;
2066 stmt_vec_info op_stmt_info = NULL;
2067 unsigned chain_len = 0;
2068 auto_vec<chain_op_t> chain;
2069 auto_vec<std::pair<tree_code, gimple *> > worklist;
2070 auto_vec<vec<chain_op_t> > chains (group_size);
2071 auto_vec<slp_tree, 4> children;
2072 bool hard_fail = true;
2073 for (unsigned lane = 0; lane < group_size; ++lane)
2074 {
2075 /* For each lane linearize the addition/subtraction (or other
2076 uniform associatable operation) expression tree. */
2077 gimple *op_stmt = NULL, *other_op_stmt = NULL;
2078 vect_slp_linearize_chain (vinfo, worklist, chain, code,
2079 stmts[lane]->stmt, op_stmt, other_op_stmt,
2080 NULL);
2081 if (!op_stmt_info && op_stmt)
2082 op_stmt_info = vinfo->lookup_stmt (op_stmt);
2083 if (!other_op_stmt_info && other_op_stmt)
2084 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2085 if (chain.length () == 2)
2086 {
2087 /* In a chain of just two elements resort to the regular
2088 operand swapping scheme. If we run into a length
2089 mismatch still hard-FAIL. */
2090 if (chain_len == 0)
2091 hard_fail = false;
2092 else
2093 {
2094 matches[lane] = false;
2095 /* ??? We might want to process the other lanes, but
2096 make sure to not give false matching hints to the
2097 caller for lanes we did not process. */
2098 if (lane != group_size - 1)
2099 matches[0] = false;
2100 }
2101 break;
2102 }
2103 else if (chain_len == 0)
2104 chain_len = chain.length ();
2105 else if (chain.length () != chain_len)
2106 {
2107 /* ??? Here we could slip in magic to compensate with
2108 neutral operands. */
2109 matches[lane] = false;
2110 if (lane != group_size - 1)
2111 matches[0] = false;
2112 break;
2113 }
2114 chains.quick_push (chain.copy ());
2115 chain.truncate (0);
2116 }
2117 if (chains.length () == group_size)
2118 {
2119 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2120 if (!op_stmt_info)
2121 {
2122 hard_fail = false;
2123 goto out;
2124 }
2125 /* Now we have a set of chains with the same length. */
2126 /* 1. pre-sort according to def_type and operation. */
2127 for (unsigned lane = 0; lane < group_size; ++lane)
2128 chains[lane].stablesort (dt_sort_cmp, vinfo);
2129 if (dump_enabled_p ())
2130 {
2131 dump_printf_loc (MSG_NOTE, vect_location,
2132 "pre-sorted chains of %s\n",
2133 get_tree_code_name (code));
2134 for (unsigned lane = 0; lane < group_size; ++lane)
2135 {
2136 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2137 dump_printf (MSG_NOTE, "%s %T ",
2138 get_tree_code_name (chains[lane][opnum].code),
2139 chains[lane][opnum].op);
2140 dump_printf (MSG_NOTE, "\n");
2141 }
2142 }
2143 /* 2. try to build children nodes, associating as necessary. */
2144 for (unsigned n = 0; n < chain_len; ++n)
2145 {
2146 vect_def_type dt = chains[0][n].dt;
2147 unsigned lane;
2148 for (lane = 0; lane < group_size; ++lane)
2149 if (chains[lane][n].dt != dt)
2150 {
2151 if (dt == vect_constant_def
2152 && chains[lane][n].dt == vect_external_def)
2153 dt = vect_external_def;
2154 else if (dt == vect_external_def
2155 && chains[lane][n].dt == vect_constant_def)
2156 ;
2157 else
2158 break;
2159 }
2160 if (lane != group_size)
2161 {
2162 if (dump_enabled_p ())
2163 dump_printf_loc (MSG_NOTE, vect_location,
2164 "giving up on chain due to mismatched "
2165 "def types\n");
2166 matches[lane] = false;
2167 if (lane != group_size - 1)
2168 matches[0] = false;
2169 goto out;
2170 }
2171 if (dt == vect_constant_def
2172 || dt == vect_external_def)
2173 {
2174 /* Check whether we can build the invariant. If we can't
2175 we never will be able to. */
2176 tree type = TREE_TYPE (chains[0][n].op);
2177 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2178 && (TREE_CODE (type) == BOOLEAN_TYPE
2179 || !can_duplicate_and_interleave_p (vinfo, group_size,
2180 type)))
2181 {
2182 matches[0] = false;
2183 goto out;
2184 }
2185 vec<tree> ops;
2186 ops.create (group_size);
2187 for (lane = 0; lane < group_size; ++lane)
2188 ops.quick_push (chains[lane][n].op);
2189 slp_tree child = vect_create_new_slp_node (ops);
2190 SLP_TREE_DEF_TYPE (child) = dt;
2191 children.safe_push (child);
2192 }
2193 else if (dt != vect_internal_def)
2194 {
2195 /* Not sure, we might need sth special.
2196 gcc.dg/vect/pr96854.c,
2197 gfortran.dg/vect/fast-math-pr37021.f90
2198 and gfortran.dg/vect/pr61171.f trigger. */
2199 /* Soft-fail for now. */
2200 hard_fail = false;
2201 goto out;
2202 }
2203 else
2204 {
2205 vec<stmt_vec_info> op_stmts;
2206 op_stmts.create (group_size);
2207 slp_tree child = NULL;
2208 /* Brute-force our way. We have to consider a lane
2209 failing after fixing an earlier fail up in the
2210 SLP discovery recursion. So track the current
2211 permute per lane. */
2212 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2213 memset (perms, 0, sizeof (unsigned) * group_size);
2214 do
2215 {
2216 op_stmts.truncate (0);
2217 for (lane = 0; lane < group_size; ++lane)
2218 op_stmts.quick_push
2219 (vinfo->lookup_def (chains[lane][n].op));
2220 child = vect_build_slp_tree (vinfo, op_stmts,
2221 group_size, &this_max_nunits,
2222 matches, limit,
2223 &this_tree_size, bst_map);
2224 /* ??? We're likely getting too many fatal mismatches
2225 here so maybe we want to ignore them (but then we
2226 have no idea which lanes fatally mismatched). */
2227 if (child || !matches[0])
2228 break;
2229 /* Swap another lane we have not yet matched up into
2230 lanes that did not match. If we run out of
2231 permute possibilities for a lane terminate the
2232 search. */
2233 bool term = false;
2234 for (lane = 1; lane < group_size; ++lane)
2235 if (!matches[lane])
2236 {
2237 if (n + perms[lane] + 1 == chain_len)
2238 {
2239 term = true;
2240 break;
2241 }
2242 std::swap (chains[lane][n],
2243 chains[lane][n + perms[lane] + 1]);
2244 perms[lane]++;
2245 }
2246 if (term)
2247 break;
2248 }
2249 while (1);
2250 if (!child)
2251 {
2252 if (dump_enabled_p ())
2253 dump_printf_loc (MSG_NOTE, vect_location,
2254 "failed to match up op %d\n", n);
2255 op_stmts.release ();
2256 if (lane != group_size - 1)
2257 matches[0] = false;
2258 else
2259 matches[lane] = false;
2260 goto out;
2261 }
2262 if (dump_enabled_p ())
2263 {
2264 dump_printf_loc (MSG_NOTE, vect_location,
2265 "matched up op %d to\n", n);
2266 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2267 }
2268 children.safe_push (child);
2269 }
2270 }
2271 /* 3. build SLP nodes to combine the chain. */
2272 for (unsigned lane = 0; lane < group_size; ++lane)
2273 if (chains[lane][0].code != code)
2274 {
2275 /* See if there's any alternate all-PLUS entry. */
2276 unsigned n;
2277 for (n = 1; n < chain_len; ++n)
2278 {
2279 for (lane = 0; lane < group_size; ++lane)
2280 if (chains[lane][n].code != code)
2281 break;
2282 if (lane == group_size)
2283 break;
2284 }
2285 if (n != chain_len)
2286 {
2287 /* Swap that in at first position. */
2288 std::swap (children[0], children[n]);
2289 for (lane = 0; lane < group_size; ++lane)
2290 std::swap (chains[lane][0], chains[lane][n]);
2291 }
2292 else
2293 {
2294 /* ??? When this triggers and we end up with two
2295 vect_constant/external_def up-front things break (ICE)
2296 spectacularly finding an insertion place for the
2297 all-constant op. We should have a fully
2298 vect_internal_def operand though(?) so we can swap
2299 that into first place and then prepend the all-zero
2300 constant. */
2301 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_NOTE, vect_location,
2303 "inserting constant zero to compensate "
2304 "for (partially) negated first "
2305 "operand\n");
2306 chain_len++;
2307 for (lane = 0; lane < group_size; ++lane)
2308 chains[lane].safe_insert
2309 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2310 vec<tree> zero_ops;
2311 zero_ops.create (group_size);
2312 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2313 for (lane = 1; lane < group_size; ++lane)
2314 zero_ops.quick_push (zero_ops[0]);
2315 slp_tree zero = vect_create_new_slp_node (zero_ops);
2316 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2317 children.safe_insert (0, zero);
2318 }
2319 break;
2320 }
2321 for (unsigned i = 1; i < children.length (); ++i)
2322 {
2323 slp_tree op0 = children[i - 1];
2324 slp_tree op1 = children[i];
2325 bool this_two_op = false;
2326 for (unsigned lane = 0; lane < group_size; ++lane)
2327 if (chains[lane][i].code != chains[0][i].code)
2328 {
2329 this_two_op = true;
2330 break;
2331 }
2332 slp_tree child;
2333 if (i == children.length () - 1)
2334 child = vect_create_new_slp_node (node, stmts, 2);
2335 else
2336 child = vect_create_new_slp_node (2, ERROR_MARK);
2337 if (this_two_op)
2338 {
2339 vec<std::pair<unsigned, unsigned> > lperm;
2340 lperm.create (group_size);
2341 for (unsigned lane = 0; lane < group_size; ++lane)
2342 lperm.quick_push (std::make_pair
2343 (chains[lane][i].code != chains[0][i].code, lane));
2344 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2345 (chains[0][i].code == code
2346 ? op_stmt_info
2347 : other_op_stmt_info),
2348 (chains[0][i].code == code
2349 ? other_op_stmt_info
2350 : op_stmt_info),
2351 lperm);
2352 }
2353 else
2354 {
2355 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2356 SLP_TREE_VECTYPE (child) = vectype;
2357 SLP_TREE_LANES (child) = group_size;
2358 SLP_TREE_CHILDREN (child).quick_push (op0);
2359 SLP_TREE_CHILDREN (child).quick_push (op1);
2360 SLP_TREE_REPRESENTATIVE (child)
2361 = (chains[0][i].code == code
2362 ? op_stmt_info : other_op_stmt_info);
2363 }
2364 children[i] = child;
2365 }
2366 *tree_size += this_tree_size + 1;
2367 *max_nunits = this_max_nunits;
2368 while (!chains.is_empty ())
2369 chains.pop ().release ();
2370 return node;
2371 }
2372 out:
2373 while (!children.is_empty ())
2374 vect_free_slp_tree (children.pop ());
2375 while (!chains.is_empty ())
2376 chains.pop ().release ();
2377 /* Hard-fail, otherwise we might run into quadratic processing of the
2378 chains starting one stmt into the chain again. */
2379 if (hard_fail)
2380 return NULL;
2381 /* Fall thru to normal processing. */
2382 }
2383
2384 /* Get at the operands, verifying they are compatible. */
2385 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2386 slp_oprnd_info oprnd_info;
2387 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2388 {
2389 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2390 stmts, i, &oprnds_info);
2391 if (res != 0)
2392 matches[(res == -1) ? 0 : i] = false;
2393 if (!matches[0])
2394 break;
2395 }
2396 for (i = 0; i < group_size; ++i)
2397 if (!matches[i])
2398 {
2399 vect_free_oprnd_info (oprnds_info);
2400 return NULL;
2401 }
2402 swap = NULL;
2403
2404 auto_vec<slp_tree, 4> children;
2405
2406 stmt_info = stmts[0];
2407
2408 /* Create SLP_TREE nodes for the definition node/s. */
2409 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2410 {
2411 slp_tree child = nullptr;
2412 unsigned int j;
2413
2414 /* We're skipping certain operands from processing, for example
2415 outer loop reduction initial defs. */
2416 if (skip_args[i])
2417 {
2418 children.safe_push (NULL);
2419 continue;
2420 }
2421
2422 if (oprnd_info->first_dt == vect_uninitialized_def)
2423 {
2424 /* COND_EXPRs can end up with one operand too many if the condition
2425 is an SSA name. */
2426 gcc_assert (i == 3 && nops == 4);
2427 continue;
2428 }
2429
2430 if (is_a <bb_vec_info> (vinfo)
2431 && oprnd_info->first_dt == vect_internal_def
2432 && !oprnd_info->any_pattern)
2433 {
2434 /* For BB vectorization, if all defs are the same do not
2435 bother to continue the build along the single-lane
2436 graph but use a splat of the scalar value. */
2437 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2438 for (j = 1; j < group_size; ++j)
2439 if (oprnd_info->def_stmts[j] != first_def)
2440 break;
2441 if (j == group_size
2442 /* But avoid doing this for loads where we may be
2443 able to CSE things, unless the stmt is not
2444 vectorizable. */
2445 && (!STMT_VINFO_VECTORIZABLE (first_def)
2446 || !gimple_vuse (first_def->stmt)))
2447 {
2448 if (dump_enabled_p ())
2449 dump_printf_loc (MSG_NOTE, vect_location,
2450 "Using a splat of the uniform operand %G",
2451 first_def->stmt);
2452 oprnd_info->first_dt = vect_external_def;
2453 }
2454 }
2455
2456 if (oprnd_info->first_dt == vect_external_def
2457 || oprnd_info->first_dt == vect_constant_def)
2458 {
2459 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2460 {
2461 tree op0;
2462 tree uniform_val = op0 = oprnd_info->ops[0];
2463 for (j = 1; j < oprnd_info->ops.length (); ++j)
2464 if (!operand_equal_p (uniform_val, oprnd_info->ops[j]))
2465 {
2466 uniform_val = NULL_TREE;
2467 break;
2468 }
2469 if (!uniform_val
2470 && !can_duplicate_and_interleave_p (vinfo,
2471 oprnd_info->ops.length (),
2472 TREE_TYPE (op0)))
2473 {
2474 matches[j] = false;
2475 if (dump_enabled_p ())
2476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2477 "Build SLP failed: invalid type of def "
2478 "for variable-length SLP %T\n", op0);
2479 goto fail;
2480 }
2481 }
2482 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2483 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2484 oprnd_info->ops = vNULL;
2485 children.safe_push (invnode);
2486 continue;
2487 }
2488
2489 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2490 group_size, &this_max_nunits,
2491 matches, limit,
2492 &this_tree_size, bst_map)) != NULL)
2493 {
2494 oprnd_info->def_stmts = vNULL;
2495 children.safe_push (child);
2496 continue;
2497 }
2498
2499 /* If the SLP build for operand zero failed and operand zero
2500 and one can be commuted, try that for the scalar stmts
2501 that failed the match. */
2502 if (i == 0
2503 /* A first scalar stmt mismatch signals a fatal mismatch. */
2504 && matches[0]
2505 /* ??? For COND_EXPRs we can swap the comparison operands
2506 as well as the arms under some constraints. */
2507 && nops == 2
2508 && oprnds_info[1]->first_dt == vect_internal_def
2509 && is_gimple_assign (stmt_info->stmt)
2510 /* Swapping operands for reductions breaks assumptions later on. */
2511 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2512 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2513 {
2514 /* See whether we can swap the matching or the non-matching
2515 stmt operands. */
2516 bool swap_not_matching = true;
2517 do
2518 {
2519 for (j = 0; j < group_size; ++j)
2520 {
2521 if (matches[j] != !swap_not_matching)
2522 continue;
2523 stmt_vec_info stmt_info = stmts[j];
2524 /* Verify if we can swap operands of this stmt. */
2525 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2526 if (!stmt
2527 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2528 {
2529 if (!swap_not_matching)
2530 goto fail;
2531 swap_not_matching = false;
2532 break;
2533 }
2534 }
2535 }
2536 while (j != group_size);
2537
2538 /* Swap mismatched definition stmts. */
2539 if (dump_enabled_p ())
2540 dump_printf_loc (MSG_NOTE, vect_location,
2541 "Re-trying with swapped operands of stmts ");
2542 for (j = 0; j < group_size; ++j)
2543 if (matches[j] == !swap_not_matching)
2544 {
2545 std::swap (oprnds_info[0]->def_stmts[j],
2546 oprnds_info[1]->def_stmts[j]);
2547 std::swap (oprnds_info[0]->ops[j],
2548 oprnds_info[1]->ops[j]);
2549 if (dump_enabled_p ())
2550 dump_printf (MSG_NOTE, "%d ", j);
2551 }
2552 if (dump_enabled_p ())
2553 dump_printf (MSG_NOTE, "\n");
2554 /* After swapping some operands we lost track whether an
2555 operand has any pattern defs so be conservative here. */
2556 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2557 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2558 /* And try again with scratch 'matches' ... */
2559 bool *tem = XALLOCAVEC (bool, group_size);
2560 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2561 group_size, &this_max_nunits,
2562 tem, limit,
2563 &this_tree_size, bst_map)) != NULL)
2564 {
2565 oprnd_info->def_stmts = vNULL;
2566 children.safe_push (child);
2567 continue;
2568 }
2569 }
2570 fail:
2571
2572 /* If the SLP build failed and we analyze a basic-block
2573 simply treat nodes we fail to build as externally defined
2574 (and thus build vectors from the scalar defs).
2575 The cost model will reject outright expensive cases.
2576 ??? This doesn't treat cases where permutation ultimately
2577 fails (or we don't try permutation below). Ideally we'd
2578 even compute a permutation that will end up with the maximum
2579 SLP tree size... */
2580 if (is_a <bb_vec_info> (vinfo)
2581 /* ??? Rejecting patterns this way doesn't work. We'd have to
2582 do extra work to cancel the pattern so the uses see the
2583 scalar version. */
2584 && !is_pattern_stmt_p (stmt_info)
2585 && !oprnd_info->any_pattern)
2586 {
2587 /* But if there's a leading vector sized set of matching stmts
2588 fail here so we can split the group. This matches the condition
2589 vect_analyze_slp_instance uses. */
2590 /* ??? We might want to split here and combine the results to support
2591 multiple vector sizes better. */
2592 for (j = 0; j < group_size; ++j)
2593 if (!matches[j])
2594 break;
2595 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2596 {
2597 if (dump_enabled_p ())
2598 dump_printf_loc (MSG_NOTE, vect_location,
2599 "Building vector operands from scalars\n");
2600 this_tree_size++;
2601 child = vect_create_new_slp_node (oprnd_info->ops);
2602 children.safe_push (child);
2603 oprnd_info->ops = vNULL;
2604 continue;
2605 }
2606 }
2607
2608 gcc_assert (child == NULL);
2609 FOR_EACH_VEC_ELT (children, j, child)
2610 if (child)
2611 vect_free_slp_tree (child);
2612 vect_free_oprnd_info (oprnds_info);
2613 return NULL;
2614 }
2615
2616 vect_free_oprnd_info (oprnds_info);
2617
2618 /* If all children of this node are built up from uniform scalars, or
2619 building them requires more than one possibly expensive vector
2620 construction, throw the node away so that it is built up from scalars
2621 instead. The exception is the SLP node for the vector store. */
2622 if (is_a <bb_vec_info> (vinfo)
2623 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2624 /* ??? Rejecting patterns this way doesn't work. We'd have to
2625 do extra work to cancel the pattern so the uses see the
2626 scalar version. */
2627 && !is_pattern_stmt_p (stmt_info))
2628 {
2629 slp_tree child;
2630 unsigned j;
2631 bool all_uniform_p = true;
2632 unsigned n_vector_builds = 0;
2633 FOR_EACH_VEC_ELT (children, j, child)
2634 {
2635 if (!child)
2636 ;
2637 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2638 all_uniform_p = false;
2639 else if (!vect_slp_tree_uniform_p (child))
2640 {
2641 all_uniform_p = false;
2642 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2643 n_vector_builds++;
2644 }
2645 }
2646 if (all_uniform_p
2647 || n_vector_builds > 1
2648 || (n_vector_builds == children.length ()
2649 && is_a <gphi *> (stmt_info->stmt)))
2650 {
2651 /* Roll back. */
2652 matches[0] = false;
2653 FOR_EACH_VEC_ELT (children, j, child)
2654 if (child)
2655 vect_free_slp_tree (child);
2656
2657 if (dump_enabled_p ())
2658 dump_printf_loc (MSG_NOTE, vect_location,
2659 "Building parent vector operands from "
2660 "scalars instead\n");
2661 return NULL;
2662 }
2663 }
2664
2665 *tree_size += this_tree_size + 1;
2666 *max_nunits = this_max_nunits;
2667
2668 if (two_operators)
2669 {
2670 /* ??? We'd likely want to either cache in bst_map sth like
2671 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2672 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2673 explicit stmts to put in so the keying on 'stmts' doesn't
2674 work (but we have the same issue with nodes that use 'ops'). */
2675 slp_tree one = new _slp_tree;
2676 slp_tree two = new _slp_tree;
2677 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2678 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2679 SLP_TREE_VECTYPE (one) = vectype;
2680 SLP_TREE_VECTYPE (two) = vectype;
2681 SLP_TREE_CHILDREN (one).safe_splice (children);
2682 SLP_TREE_CHILDREN (two).safe_splice (children);
2683 slp_tree child;
2684 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2685 SLP_TREE_REF_COUNT (child)++;
2686
2687 /* Here we record the original defs since this
2688 node represents the final lane configuration. */
2689 node = vect_create_new_slp_node (node, stmts, 2);
2690 SLP_TREE_VECTYPE (node) = vectype;
2691 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2692 SLP_TREE_CHILDREN (node).quick_push (one);
2693 SLP_TREE_CHILDREN (node).quick_push (two);
2694 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2695 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2696 enum tree_code ocode = ERROR_MARK;
2697 stmt_vec_info ostmt_info;
2698 unsigned j = 0;
2699 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2700 {
2701 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2702 if (gimple_assign_rhs_code (ostmt) != code0)
2703 {
2704 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2705 ocode = gimple_assign_rhs_code (ostmt);
2706 j = i;
2707 }
2708 else
2709 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2710 }
2711 SLP_TREE_CODE (one) = code0;
2712 SLP_TREE_CODE (two) = ocode;
2713 SLP_TREE_LANES (one) = stmts.length ();
2714 SLP_TREE_LANES (two) = stmts.length ();
2715 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2716 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2717 return node;
2718 }
2719
2720 node = vect_create_new_slp_node (node, stmts, nops);
2721 SLP_TREE_VECTYPE (node) = vectype;
2722 SLP_TREE_CHILDREN (node).splice (children);
2723 return node;
2724 }
2725
2726 /* Dump a single SLP tree NODE. */
2727
2728 static void
2729 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2730 slp_tree node)
2731 {
2732 unsigned i, j;
2733 slp_tree child;
2734 stmt_vec_info stmt_info;
2735 tree op;
2736
2737 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2738 dump_user_location_t user_loc = loc.get_user_location ();
2739 dump_printf_loc (metadata, user_loc,
2740 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2741 ", refcnt=%u)",
2742 SLP_TREE_DEF_TYPE (node) == vect_external_def
2743 ? " (external)"
2744 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2745 ? " (constant)"
2746 : ""), (void *) node,
2747 estimated_poly_value (node->max_nunits),
2748 SLP_TREE_REF_COUNT (node));
2749 if (SLP_TREE_VECTYPE (node))
2750 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2751 dump_printf (metadata, "\n");
2752 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2753 {
2754 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2755 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2756 else
2757 dump_printf_loc (metadata, user_loc, "op template: %G",
2758 SLP_TREE_REPRESENTATIVE (node)->stmt);
2759 }
2760 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2761 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2762 dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
2763 STMT_VINFO_LIVE_P (stmt_info) ? "[l] " : "",
2764 i, stmt_info->stmt);
2765 else
2766 {
2767 dump_printf_loc (metadata, user_loc, "\t{ ");
2768 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2769 dump_printf (metadata, "%T%s ", op,
2770 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2771 dump_printf (metadata, "}\n");
2772 }
2773 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2774 {
2775 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2776 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2777 dump_printf (dump_kind, " %u", j);
2778 dump_printf (dump_kind, " }\n");
2779 }
2780 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2781 {
2782 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2783 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2784 dump_printf (dump_kind, " %u[%u]",
2785 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2786 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2787 dump_printf (dump_kind, " }\n");
2788 }
2789 if (SLP_TREE_CHILDREN (node).is_empty ())
2790 return;
2791 dump_printf_loc (metadata, user_loc, "\tchildren");
2792 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2793 dump_printf (dump_kind, " %p", (void *)child);
2794 dump_printf (dump_kind, "\n");
2795 }
2796
2797 DEBUG_FUNCTION void
2798 debug (slp_tree node)
2799 {
2800 debug_dump_context ctx;
2801 vect_print_slp_tree (MSG_NOTE,
2802 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2803 node);
2804 }
2805
2806 /* Recursive helper for the dot producer below. */
2807
2808 static void
2809 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2810 {
2811 if (visited.add (node))
2812 return;
2813
2814 fprintf (f, "\"%p\" [label=\"", (void *)node);
2815 vect_print_slp_tree (MSG_NOTE,
2816 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2817 node);
2818 fprintf (f, "\"];\n");
2819
2820
2821 for (slp_tree child : SLP_TREE_CHILDREN (node))
2822 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2823
2824 for (slp_tree child : SLP_TREE_CHILDREN (node))
2825 if (child)
2826 dot_slp_tree (f, child, visited);
2827 }
2828
2829 DEBUG_FUNCTION void
2830 dot_slp_tree (const char *fname, slp_tree node)
2831 {
2832 FILE *f = fopen (fname, "w");
2833 fprintf (f, "digraph {\n");
2834 fflush (f);
2835 {
2836 debug_dump_context ctx (f);
2837 hash_set<slp_tree> visited;
2838 dot_slp_tree (f, node, visited);
2839 }
2840 fflush (f);
2841 fprintf (f, "}\n");
2842 fclose (f);
2843 }
2844
2845 DEBUG_FUNCTION void
2846 dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
2847 {
2848 FILE *f = fopen (fname, "w");
2849 fprintf (f, "digraph {\n");
2850 fflush (f);
2851 {
2852 debug_dump_context ctx (f);
2853 hash_set<slp_tree> visited;
2854 for (auto inst : slp_instances)
2855 dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
2856 }
2857 fflush (f);
2858 fprintf (f, "}\n");
2859 fclose (f);
2860 }
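
/* The DEBUG_FUNCTION entry points above are meant to be called from a
debugger, e.g. "(gdb) call dot_slp_tree ("/tmp/slp.dot", node)"; the
resulting file can then be rendered with Graphviz, e.g.
"dot -Tpdf /tmp/slp.dot -o slp.pdf". */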
2861
2862 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2863
2864 static void
2865 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2866 slp_tree node, hash_set<slp_tree> &visited)
2867 {
2868 unsigned i;
2869 slp_tree child;
2870
2871 if (visited.add (node))
2872 return;
2873
2874 vect_print_slp_tree (dump_kind, loc, node);
2875
2876 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2877 if (child)
2878 vect_print_slp_graph (dump_kind, loc, child, visited);
2879 }
2880
2881 static void
2882 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2883 slp_tree entry)
2884 {
2885 hash_set<slp_tree> visited;
2886 vect_print_slp_graph (dump_kind, loc, entry, visited);
2887 }
2888
2889 /* Mark the tree rooted at NODE with PURE_SLP. */
2890
2891 static void
2892 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2893 {
2894 int i;
2895 stmt_vec_info stmt_info;
2896 slp_tree child;
2897
2898 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2899 return;
2900
2901 if (visited.add (node))
2902 return;
2903
2904 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2905 STMT_SLP_TYPE (stmt_info) = pure_slp;
2906
2907 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2908 if (child)
2909 vect_mark_slp_stmts (child, visited);
2910 }
2911
2912 static void
2913 vect_mark_slp_stmts (slp_tree node)
2914 {
2915 hash_set<slp_tree> visited;
2916 vect_mark_slp_stmts (node, visited);
2917 }
2918
2919 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2920
2921 static void
2922 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2923 {
2924 int i;
2925 stmt_vec_info stmt_info;
2926 slp_tree child;
2927
2928 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2929 return;
2930
2931 if (visited.add (node))
2932 return;
2933
2934 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2935 {
2936 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2937 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2938 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2939 }
2940
2941 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2942 if (child)
2943 vect_mark_slp_stmts_relevant (child, visited);
2944 }
2945
2946 static void
2947 vect_mark_slp_stmts_relevant (slp_tree node)
2948 {
2949 hash_set<slp_tree> visited;
2950 vect_mark_slp_stmts_relevant (node, visited);
2951 }
2952
2953
2954 /* Gather loads in the SLP graph rooted at NODE into the LOADS array. */
2955
2956 static void
2957 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2958 hash_set<slp_tree> &visited)
2959 {
2960 if (!node || visited.add (node))
2961 return;
2962
2963 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2964 return;
2965
2966 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
2967 {
2968 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2969 if (STMT_VINFO_DATA_REF (stmt_info)
2970 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2971 loads.safe_push (node);
2972 }
2973
2974 unsigned i;
2975 slp_tree child;
2976 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2977 vect_gather_slp_loads (loads, child, visited);
2978 }
2979
2980
2981 /* Find the last scalar stmt in SLP node NODE. */
2982
2983 stmt_vec_info
2984 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2985 {
2986 stmt_vec_info last = NULL;
2987 stmt_vec_info stmt_vinfo;
2988
2989 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2990 {
2991 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2992 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2993 }
2994
2995 return last;
2996 }
2997
2998 /* Find the first stmt in NODE. */
2999
3000 stmt_vec_info
3001 vect_find_first_scalar_stmt_in_slp (slp_tree node)
3002 {
3003 stmt_vec_info first = NULL;
3004 stmt_vec_info stmt_vinfo;
3005
3006 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3007 {
3008 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3009 if (!first
3010 || get_later_stmt (stmt_vinfo, first) == first)
3011 first = stmt_vinfo;
3012 }
3013
3014 return first;
3015 }
3016
3017 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
3018 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
3019 (also containing the first GROUP1_SIZE stmts, since stores are
3020 consecutive), the second containing the remainder.
3021 Return the first stmt in the second group. */
3022
3023 static stmt_vec_info
3024 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
3025 {
3026 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
3027 gcc_assert (group1_size > 0);
3028 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
3029 gcc_assert (group2_size > 0);
3030 DR_GROUP_SIZE (first_vinfo) = group1_size;
3031
3032 stmt_vec_info stmt_info = first_vinfo;
3033 for (unsigned i = group1_size; i > 1; i--)
3034 {
3035 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
3036 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3037 }
3038 /* STMT is now the last element of the first group. */
3039 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
3040 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
3041
3042 DR_GROUP_SIZE (group2) = group2_size;
3043 for (stmt_info = group2; stmt_info;
3044 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3045 {
3046 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3047 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3048 }
3049
3050 /* For the second group, the DR_GROUP_GAP is that before the original group,
3051 plus skipping over the first vector. */
3052 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3053
3054 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
3055 DR_GROUP_GAP (first_vinfo) += group2_size;
3056
3057 if (dump_enabled_p ())
3058 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3059 group1_size, group2_size);
3060
3061 return group2;
3062 }
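
/* As an illustration (sizes made up): splitting a store group of
DR_GROUP_SIZE 6 with GROUP1_SIZE 4 leaves the first group with
DR_GROUP_SIZE 4 and its DR_GROUP_GAP increased by 2, while the new
second group gets DR_GROUP_SIZE 2 and a DR_GROUP_GAP equal to the
original group's gap plus 4. */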
3063
3064 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3065 statements and a vector of NUNITS elements. */
3066
3067 static poly_uint64
3068 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3069 {
3070 return exact_div (common_multiple (nunits, group_size), group_size);
3071 }
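
/* As an illustration: for NUNITS 4 and GROUP_SIZE 6 the common multiple
is 12 and the unrolling factor is 12 / 6 = 2; when GROUP_SIZE divides
NUNITS (say 4 and 8) the factor is simply NUNITS / GROUP_SIZE. */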
3072
3073 /* Helper that checks to see if a node is a load node. */
3074
3075 static inline bool
3076 vect_is_slp_load_node (slp_tree root)
3077 {
3078 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
3079 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3080 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
3081 }
3082
3083
3084 /* Helper function of optimize_load_redistribution that performs the operation
3085 recursively. */
3086
3087 static slp_tree
3088 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3089 vec_info *vinfo, unsigned int group_size,
3090 hash_map<slp_tree, slp_tree> *load_map,
3091 slp_tree root)
3092 {
3093 if (slp_tree *leader = load_map->get (root))
3094 return *leader;
3095
3096 slp_tree node;
3097 unsigned i;
3098
3099 /* For now, we don't know anything about externals so do not do anything. */
3100 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3101 return NULL;
3102 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3103 {
3104 /* First convert this node into a load node, add it to the leaves
3105 list and flatten the lane permute into a load permute. If it's
3106 unneeded it will be elided later. */
3107 vec<stmt_vec_info> stmts;
3108 stmts.create (SLP_TREE_LANES (root));
3109 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3110 for (unsigned j = 0; j < lane_perm.length (); j++)
3111 {
3112 std::pair<unsigned, unsigned> perm = lane_perm[j];
3113 node = SLP_TREE_CHILDREN (root)[perm.first];
3114
3115 if (!vect_is_slp_load_node (node)
3116 || SLP_TREE_CHILDREN (node).exists ())
3117 {
3118 stmts.release ();
3119 goto next;
3120 }
3121
3122 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3123 }
3124
3125 if (dump_enabled_p ())
3126 dump_printf_loc (MSG_NOTE, vect_location,
3127 "converting stmts on permute node %p\n",
3128 (void *) root);
3129
3130 bool *matches = XALLOCAVEC (bool, group_size);
3131 poly_uint64 max_nunits = 1;
3132 unsigned tree_size = 0, limit = 1;
3133 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3134 matches, &limit, &tree_size, bst_map);
3135 if (!node)
3136 stmts.release ();
3137
3138 load_map->put (root, node);
3139 return node;
3140 }
3141
3142 next:
3143 load_map->put (root, NULL);
3144
3145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3146 {
3147 slp_tree value
3148 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3149 node);
3150 if (value)
3151 {
3152 SLP_TREE_REF_COUNT (value)++;
3153 SLP_TREE_CHILDREN (root)[i] = value;
3154 /* ??? We know the original leaves of the replaced nodes will
3155 be referenced by bst_map, only the permutes created by
3156 pattern matching are not. */
3157 if (SLP_TREE_REF_COUNT (node) == 1)
3158 load_map->remove (node);
3159 vect_free_slp_tree (node);
3160 }
3161 }
3162
3163 return NULL;
3164 }
3165
3166 /* Temporary workaround for loads not being CSEd during SLP build. This
3167 function will traverse the SLP tree rooted in ROOT and find VEC_PERM
3168 nodes that blend vectors from multiple nodes that all read from the
3169 same DR such that the final operation is equal to a permuted load. Such
3170 nodes are then directly converted into loads themselves. The new load
3171 nodes are CSEd using BST_MAP. */
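/* As an illustration (array A made up): a VEC_PERM node selecting lane 1
of a load node { A[0], A[1] } and lane 0 of a load node { A[2], A[3] }
is equal to a load of { A[1], A[2] } and is replaced by a new load node
built for exactly those two scalar stmts. */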
3172
3173 static void
3174 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3175 vec_info *vinfo, unsigned int group_size,
3176 hash_map<slp_tree, slp_tree> *load_map,
3177 slp_tree root)
3178 {
3179 slp_tree node;
3180 unsigned i;
3181
3182 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3183 {
3184 slp_tree value
3185 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3186 node);
3187 if (value)
3188 {
3189 SLP_TREE_REF_COUNT (value)++;
3190 SLP_TREE_CHILDREN (root)[i] = value;
3191 /* ??? We know the original leaves of the replaced nodes will
3192 be referenced by bst_map, only the permutes created by
3193 pattern matching are not. */
3194 if (SLP_TREE_REF_COUNT (node) == 1)
3195 load_map->remove (node);
3196 vect_free_slp_tree (node);
3197 }
3198 }
3199 }
3200
3201 /* Helper function of vect_match_slp_patterns.
3202
3203 Attempts to match patterns against the slp tree rooted in REF_NODE using
3204 VINFO. Patterns are matched in post-order traversal.
3205
3206 If any pattern matches, the node referenced by REF_NODE is updated in
3207 place and true is returned; otherwise false is returned. */
3208
3209 static bool
3210 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3211 slp_tree_to_load_perm_map_t *perm_cache,
3212 slp_compat_nodes_map_t *compat_cache,
3213 hash_set<slp_tree> *visited)
3214 {
3215 unsigned i;
3216 slp_tree node = *ref_node;
3217 bool found_p = false;
3218 if (!node || visited->add (node))
3219 return false;
3220
3221 slp_tree child;
3222 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3223 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3224 vinfo, perm_cache, compat_cache,
3225 visited);
3226
3227 for (unsigned x = 0; x < num__slp_patterns; x++)
3228 {
3229 vect_pattern *pattern
3230 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3231 if (pattern)
3232 {
3233 pattern->build (vinfo);
3234 delete pattern;
3235 found_p = true;
3236 }
3237 }
3238
3239 return found_p;
3240 }
3241
3242 /* Applies pattern matching to the SLP tree of INSTANCE using vec_info
3243 VINFO.
3244
3245 The tree is modified in place; true is returned if any pattern matched.
3246 Patterns are tried in order and multiple patterns may match. */
3247
3248 static bool
3249 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3250 hash_set<slp_tree> *visited,
3251 slp_tree_to_load_perm_map_t *perm_cache,
3252 slp_compat_nodes_map_t *compat_cache)
3253 {
3254 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3255 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3256
3257 if (dump_enabled_p ())
3258 dump_printf_loc (MSG_NOTE, vect_location,
3259 "Analyzing SLP tree %p for patterns\n",
3260 (void *) SLP_INSTANCE_TREE (instance));
3261
3262 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3263 visited);
3264 }
3265
3266 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3267 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3268 Return true if we could use IFN_STORE_LANES instead and if that appears
3269 to be the better approach. */
3270
3271 static bool
3272 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3273 unsigned int group_size,
3274 unsigned int new_group_size)
3275 {
3276 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3277 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3278 if (!vectype)
3279 return false;
3280 /* Allow the split if one of the two new groups would operate on full
3281 vectors *within* rather than across one scalar loop iteration.
3282 This is purely a heuristic, but it should work well for group
3283 sizes of 3 and 4, where the possible splits are:
3284
3285 3->2+1: OK if the vector has exactly two elements
3286 4->2+2: Likewise
3287 4->3+1: Less clear-cut. */
3288 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3289 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3290 return false;
3291 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
3292 }
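
/* As an illustration (using the cases above with a two-element vector):
for GROUP_SIZE 3 and NEW_GROUP_SIZE 2 the second multiple_p test holds,
so we return false and allow the 2+1 split; for GROUP_SIZE 4 and
NEW_GROUP_SIZE 3 neither test holds and we prefer IFN_STORE_LANES
whenever the target supports it for the group. */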
3293
3294 /* Analyze an SLP instance starting from a group of grouped stores. Call
3295 vect_build_slp_tree to build a tree of packed stmts if possible.
3296 Return FALSE if it's impossible to SLP any stmt in the loop. */
3297
3298 static bool
3299 vect_analyze_slp_instance (vec_info *vinfo,
3300 scalar_stmts_to_slp_tree_map_t *bst_map,
3301 stmt_vec_info stmt_info, slp_instance_kind kind,
3302 unsigned max_tree_size, unsigned *limit);
3303
3304 /* Analyze an SLP instance starting from SCALAR_STMTS, which form a group
3305 of kind KIND. Return true if successful. */
3306
3307 static bool
3308 vect_build_slp_instance (vec_info *vinfo,
3309 slp_instance_kind kind,
3310 vec<stmt_vec_info> &scalar_stmts,
3311 vec<stmt_vec_info> &root_stmt_infos,
3312 vec<tree> &remain,
3313 unsigned max_tree_size, unsigned *limit,
3314 scalar_stmts_to_slp_tree_map_t *bst_map,
3315 /* ??? We need stmt_info for group splitting. */
3316 stmt_vec_info stmt_info_)
3317 {
3318 if (kind == slp_inst_kind_ctor)
3319 {
3320 if (dump_enabled_p ())
3321 dump_printf_loc (MSG_NOTE, vect_location,
3322 "Analyzing vectorizable constructor: %G\n",
3323 root_stmt_infos[0]->stmt);
3324 }
3325
3326 if (dump_enabled_p ())
3327 {
3328 dump_printf_loc (MSG_NOTE, vect_location,
3329 "Starting SLP discovery for\n");
3330 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3331 dump_printf_loc (MSG_NOTE, vect_location,
3332 " %G", scalar_stmts[i]->stmt);
3333 }
3334
3335 /* Build the tree for the SLP instance. */
3336 unsigned int group_size = scalar_stmts.length ();
3337 bool *matches = XALLOCAVEC (bool, group_size);
3338 poly_uint64 max_nunits = 1;
3339 unsigned tree_size = 0;
3340 unsigned i;
3341 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3342 &max_nunits, matches, limit,
3343 &tree_size, bst_map);
3344 if (node != NULL)
3345 {
3346 /* Calculate the unrolling factor based on the smallest type. */
3347 poly_uint64 unrolling_factor
3348 = calculate_unrolling_factor (max_nunits, group_size);
3349
3350 if (maybe_ne (unrolling_factor, 1U)
3351 && is_a <bb_vec_info> (vinfo))
3352 {
3353 unsigned HOST_WIDE_INT const_max_nunits;
3354 if (!max_nunits.is_constant (&const_max_nunits)
3355 || const_max_nunits > group_size)
3356 {
3357 if (dump_enabled_p ())
3358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3359 "Build SLP failed: store group "
3360 "size not a multiple of the vector size "
3361 "in basic block SLP\n");
3362 vect_free_slp_tree (node);
3363 return false;
3364 }
3365 /* Fatal mismatch. */
3366 if (dump_enabled_p ())
3367 dump_printf_loc (MSG_NOTE, vect_location,
3368 "SLP discovery succeeded but node needs "
3369 "splitting\n");
3370 memset (matches, true, group_size);
3371 matches[group_size / const_max_nunits * const_max_nunits] = false;
3372 vect_free_slp_tree (node);
3373 }
3374 else
3375 {
3376 /* Create a new SLP instance. */
3377 slp_instance new_instance = XNEW (class _slp_instance);
3378 SLP_INSTANCE_TREE (new_instance) = node;
3379 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3380 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3381 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3382 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3383 SLP_INSTANCE_KIND (new_instance) = kind;
3384 new_instance->reduc_phis = NULL;
3385 new_instance->cost_vec = vNULL;
3386 new_instance->subgraph_entries = vNULL;
3387
3388 if (dump_enabled_p ())
3389 dump_printf_loc (MSG_NOTE, vect_location,
3390 "SLP size %u vs. limit %u.\n",
3391 tree_size, max_tree_size);
3392
3393 /* Fixup SLP reduction chains. */
3394 if (kind == slp_inst_kind_reduc_chain)
3395 {
3396 /* If this is a reduction chain with a conversion in front
3397 amend the SLP tree with a node for that. */
3398 gimple *scalar_def
3399 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3400 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3401 {
3402 /* Get at the conversion stmt - we know it's the single use
3403 of the last stmt of the reduction chain. */
3404 use_operand_p use_p;
3405 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3406 &use_p, &scalar_def);
3407 gcc_assert (r);
3408 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3409 next_info = vect_stmt_to_vectorize (next_info);
3410 scalar_stmts = vNULL;
3411 scalar_stmts.create (group_size);
3412 for (unsigned i = 0; i < group_size; ++i)
3413 scalar_stmts.quick_push (next_info);
3414 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3415 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3416 SLP_TREE_CHILDREN (conv).quick_push (node);
3417 SLP_INSTANCE_TREE (new_instance) = conv;
3418 /* We also have to fake this conversion stmt as SLP reduction
3419 group so we don't have to mess with too much code
3420 elsewhere. */
3421 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3422 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3423 }
3424 /* Fill the backedge child of the PHI SLP node. The
3425 general matching code cannot find it because the
3426 scalar code does not reflect how we vectorize the
3427 reduction. */
3428 use_operand_p use_p;
3429 imm_use_iterator imm_iter;
3430 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3431 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3432 gimple_get_lhs (scalar_def))
3433 /* There are exactly two non-debug uses, the reduction
3434 PHI and the loop-closed PHI node. */
3435 if (!is_gimple_debug (USE_STMT (use_p))
3436 && gimple_bb (USE_STMT (use_p)) == loop->header)
3437 {
3438 auto_vec<stmt_vec_info, 64> phis (group_size);
3439 stmt_vec_info phi_info
3440 = vinfo->lookup_stmt (USE_STMT (use_p));
3441 for (unsigned i = 0; i < group_size; ++i)
3442 phis.quick_push (phi_info);
3443 slp_tree *phi_node = bst_map->get (phis);
3444 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3445 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3446 = SLP_INSTANCE_TREE (new_instance);
3447 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3448 }
3449 }
3450
3451 vinfo->slp_instances.safe_push (new_instance);
3452
3453 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3454 the number of scalar stmts in the root in a few places.
3455 Verify that assumption holds. */
3456 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3457 .length () == group_size);
3458
3459 if (dump_enabled_p ())
3460 {
3461 dump_printf_loc (MSG_NOTE, vect_location,
3462 "Final SLP tree for instance %p:\n",
3463 (void *) new_instance);
3464 vect_print_slp_graph (MSG_NOTE, vect_location,
3465 SLP_INSTANCE_TREE (new_instance));
3466 }
3467
3468 return true;
3469 }
3470 }
3471 else
3472 {
3473 /* Failed to SLP. */
3474 /* Free the allocated memory. */
3475 scalar_stmts.release ();
3476 }
3477
3478 stmt_vec_info stmt_info = stmt_info_;
3479 /* Try to break the group up into pieces. */
3480 if (kind == slp_inst_kind_store)
3481 {
3482 /* ??? We could delay all the actual splitting of store-groups
3483 until after SLP discovery of the original group completed.
3484 Then we can recurse to vect_build_slp_instance directly. */
3485 for (i = 0; i < group_size; i++)
3486 if (!matches[i])
3487 break;
3488
3489 /* For basic block SLP, try to break the group up into multiples of
3490 a vector size. */
3491 if (is_a <bb_vec_info> (vinfo)
3492 && (i > 1 && i < group_size))
3493 {
3494 tree scalar_type
3495 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3496 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3497 1 << floor_log2 (i));
3498 unsigned HOST_WIDE_INT const_nunits;
3499 if (vectype
3500 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3501 {
3502 /* Split into two groups at the first vector boundary. */
3503 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3504 unsigned group1_size = i & ~(const_nunits - 1);
3505
3506 if (dump_enabled_p ())
3507 dump_printf_loc (MSG_NOTE, vect_location,
3508 "Splitting SLP group at stmt %u\n", i);
3509 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3510 group1_size);
3511 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3512 kind, max_tree_size,
3513 limit);
3514 /* Split the rest at the failure point and possibly
3515 re-analyze the remaining matching part if it has
3516 at least two lanes. */
3517 if (group1_size < i
3518 && (i + 1 < group_size
3519 || i - group1_size > 1))
3520 {
3521 stmt_vec_info rest2 = rest;
3522 rest = vect_split_slp_store_group (rest, i - group1_size);
3523 if (i - group1_size > 1)
3524 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3525 kind, max_tree_size,
3526 limit);
3527 }
3528 /* Re-analyze the non-matching tail if it has at least
3529 two lanes. */
3530 if (i + 1 < group_size)
3531 res |= vect_analyze_slp_instance (vinfo, bst_map,
3532 rest, kind, max_tree_size,
3533 limit);
3534 return res;
3535 }
3536 }
3537
3538 /* For loop vectorization, split into arbitrary pieces of size > 1. */
3539 if (is_a <loop_vec_info> (vinfo)
3540 && (i > 1 && i < group_size)
3541 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3542 {
3543 unsigned group1_size = i;
3544
3545 if (dump_enabled_p ())
3546 dump_printf_loc (MSG_NOTE, vect_location,
3547 "Splitting SLP group at stmt %u\n", i);
3548
3549 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3550 group1_size);
3551 /* Loop vectorization cannot handle gaps in stores; make sure
3552 the split group appears as strided. */
3553 STMT_VINFO_STRIDED_P (rest) = 1;
3554 DR_GROUP_GAP (rest) = 0;
3555 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3556 DR_GROUP_GAP (stmt_info) = 0;
3557
3558 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3559 kind, max_tree_size, limit);
3560 if (i + 1 < group_size)
3561 res |= vect_analyze_slp_instance (vinfo, bst_map,
3562 rest, kind, max_tree_size, limit);
3563
3564 return res;
3565 }
3566
3567 /* Even though the first vector did not all match, we might be able to SLP
3568 (some of) the remainder. FORNOW ignore this possibility. */
3569 }
3570
3571 /* Failed to SLP. */
3572 if (dump_enabled_p ())
3573 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3574 return false;
3575 }
3576
3577
3578 /* Analyze an SLP instance starting from a group of grouped stores. Call
3579 vect_build_slp_tree to build a tree of packed stmts if possible.
3580 Return FALSE if it's impossible to SLP any stmt in the loop. */
3581
3582 static bool
3583 vect_analyze_slp_instance (vec_info *vinfo,
3584 scalar_stmts_to_slp_tree_map_t *bst_map,
3585 stmt_vec_info stmt_info,
3586 slp_instance_kind kind,
3587 unsigned max_tree_size, unsigned *limit)
3588 {
3589 unsigned int i;
3590 vec<stmt_vec_info> scalar_stmts;
3591
3592 if (is_a <bb_vec_info> (vinfo))
3593 vect_location = stmt_info->stmt;
3594
3595 stmt_vec_info next_info = stmt_info;
3596 if (kind == slp_inst_kind_store)
3597 {
3598 /* Collect the stores and store them in scalar_stmts. */
3599 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3600 while (next_info)
3601 {
3602 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3603 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3604 }
3605 }
3606 else if (kind == slp_inst_kind_reduc_chain)
3607 {
3608 /* Collect the reduction stmts and store them in scalar_stmts. */
3609 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3610 while (next_info)
3611 {
3612 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3613 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3614 }
3615 /* Mark the first element of the reduction chain as reduction to properly
3616 transform the node. In the reduction analysis phase only the last
3617 element of the chain is marked as reduction. */
3618 STMT_VINFO_DEF_TYPE (stmt_info)
3619 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3620 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3621 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3622 }
3623 else if (kind == slp_inst_kind_reduc_group)
3624 {
3625 /* Collect reduction statements. */
3626 const vec<stmt_vec_info> &reductions
3627 = as_a <loop_vec_info> (vinfo)->reductions;
3628 scalar_stmts.create (reductions.length ());
3629 for (i = 0; reductions.iterate (i, &next_info); i++)
3630 if ((STMT_VINFO_RELEVANT_P (next_info)
3631 || STMT_VINFO_LIVE_P (next_info))
3632 /* ??? Make sure we didn't skip a conversion around a reduction
3633 path. In that case we'd have to reverse engineer that conversion
3634 stmt following the chain using reduc_idx and from the PHI
3635 using reduc_def. */
3636 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3637 scalar_stmts.quick_push (next_info);
3638 /* If fewer than two were relevant/live, there's nothing to SLP. */
3639 if (scalar_stmts.length () < 2)
3640 return false;
3641 }
3642 else
3643 gcc_unreachable ();
3644
3645 vec<stmt_vec_info> roots = vNULL;
3646 vec<tree> remain = vNULL;
3647 /* Build the tree for the SLP instance. */
3648 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3649 roots, remain,
3650 max_tree_size, limit, bst_map,
3651 kind == slp_inst_kind_store
3652 ? stmt_info : NULL);
3653
3654 /* ??? If this is slp_inst_kind_store and the above succeeded, here's
3655 where we should do store group splitting. */
3656
3657 return res;
3658 }
3659
3660 /* Check if there are stmts in the loop that can be vectorized using SLP.
3661 Build SLP trees of packed scalar stmts if SLP is possible. */
3662
3663 opt_result
3664 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3665 {
3666 unsigned int i;
3667 stmt_vec_info first_element;
3668 slp_instance instance;
3669
3670 DUMP_VECT_SCOPE ("vect_analyze_slp");
3671
3672 unsigned limit = max_tree_size;
3673
3674 scalar_stmts_to_slp_tree_map_t *bst_map
3675 = new scalar_stmts_to_slp_tree_map_t ();
3676
3677 /* Find SLP sequences starting from groups of grouped stores. */
3678 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3679 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3680 slp_inst_kind_store, max_tree_size, &limit);
3681
3682 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3683 {
3684 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3685 {
3686 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3687 /* Apply patterns. */
3688 for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
3689 bb_vinfo->roots[i].stmts[j]
3690 = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
3691 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3692 bb_vinfo->roots[i].stmts,
3693 bb_vinfo->roots[i].roots,
3694 bb_vinfo->roots[i].remain,
3695 max_tree_size, &limit, bst_map, NULL))
3696 {
3697 bb_vinfo->roots[i].stmts = vNULL;
3698 bb_vinfo->roots[i].roots = vNULL;
3699 bb_vinfo->roots[i].remain = vNULL;
3700 }
3701 }
3702 }
3703
3704 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3705 {
3706 /* Find SLP sequences starting from reduction chains. */
3707 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3708 if (! STMT_VINFO_RELEVANT_P (first_element)
3709 && ! STMT_VINFO_LIVE_P (first_element))
3710 ;
3711 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3712 slp_inst_kind_reduc_chain,
3713 max_tree_size, &limit))
3714 {
3715 /* Dissolve reduction chain group. */
3716 stmt_vec_info vinfo = first_element;
3717 stmt_vec_info last = NULL;
3718 while (vinfo)
3719 {
3720 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3721 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3722 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3723 last = vinfo;
3724 vinfo = next;
3725 }
3726 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3727 /* It can still be vectorized as part of an SLP reduction. */
3728 loop_vinfo->reductions.safe_push (last);
3729 }
3730
3731 /* Find SLP sequences starting from groups of reductions. */
3732 if (loop_vinfo->reductions.length () > 1)
3733 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3734 slp_inst_kind_reduc_group, max_tree_size,
3735 &limit);
3736 }
3737
3738 hash_set<slp_tree> visited_patterns;
3739 slp_tree_to_load_perm_map_t perm_cache;
3740 slp_compat_nodes_map_t compat_cache;
3741
3742 /* See if any patterns can be found in the SLP tree. */
3743 bool pattern_found = false;
3744 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3745 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3746 &visited_patterns, &perm_cache,
3747 &compat_cache);
3748
3749 /* If any were found, optimize permutations of loads. */
3750 if (pattern_found)
3751 {
3752 hash_map<slp_tree, slp_tree> load_map;
3753 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3754 {
3755 slp_tree root = SLP_INSTANCE_TREE (instance);
3756 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3757 &load_map, root);
3758 }
3759 }
3760
3761
3762
3763 /* The map keeps a reference on SLP nodes built; release that. */
3764 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3765 it != bst_map->end (); ++it)
3766 if ((*it).second)
3767 vect_free_slp_tree ((*it).second);
3768 delete bst_map;
3769
3770 if (pattern_found && dump_enabled_p ())
3771 {
3772 dump_printf_loc (MSG_NOTE, vect_location,
3773 "Pattern matched SLP tree\n");
3774 hash_set<slp_tree> visited;
3775 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3776 vect_print_slp_graph (MSG_NOTE, vect_location,
3777 SLP_INSTANCE_TREE (instance), visited);
3778 }
3779
3780 return opt_result::success ();
3781 }
3782
3783 /* Estimates the cost of inserting layout changes into the SLP graph.
3784 It can also say that the insertion is impossible. */
3785
3786 struct slpg_layout_cost
3787 {
3788 slpg_layout_cost () = default;
3789 slpg_layout_cost (sreal, bool);
3790
3791 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3792 bool is_possible () const { return depth != sreal::max (); }
3793
3794 bool operator== (const slpg_layout_cost &) const;
3795 bool operator!= (const slpg_layout_cost &) const;
3796
3797 bool is_better_than (const slpg_layout_cost &, bool) const;
3798
3799 void add_parallel_cost (const slpg_layout_cost &);
3800 void add_serial_cost (const slpg_layout_cost &);
3801 void split (unsigned int);
3802
3803 /* The longest sequence of layout changes needed during any traversal
3804 of the partition dag, weighted by execution frequency.
3805
3806 This is the most important metric when optimizing for speed, since
3807 it helps to ensure that we keep the number of operations on
3808 critical paths to a minimum. */
3809 sreal depth = 0;
3810
3811 /* An estimate of the total number of operations needed. It is weighted by
3812 execution frequency when optimizing for speed but not when optimizing for
3813 size. In order to avoid double-counting, a node with a fanout of N will
3814 distribute 1/N of its total cost to each successor.
3815
3816 This is the most important metric when optimizing for size, since
3817 it helps to keep the total number of operations to a minimum. */
3818 sreal total = 0;
3819 };
3820
3821 /* Construct costs for a node with weight WEIGHT. A higher weight
3822 indicates more frequent execution. IS_FOR_SIZE is true if we are
3823 optimizing for size rather than speed. */
3824
3825 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3826 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3827 {
3828 }
3829
3830 bool
3831 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3832 {
3833 return depth == other.depth && total == other.total;
3834 }
3835
3836 bool
3837 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3838 {
3839 return !operator== (other);
3840 }
3841
3842 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3843 true if we are optimizing for size rather than speed. */
3844
3845 bool
3846 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3847 bool is_for_size) const
3848 {
3849 if (is_for_size)
3850 {
3851 if (total != other.total)
3852 return total < other.total;
3853 return depth < other.depth;
3854 }
3855 else
3856 {
3857 if (depth != other.depth)
3858 return depth < other.depth;
3859 return total < other.total;
3860 }
3861 }
3862
3863 /* Increase the costs to account for something with cost INPUT_COST
3864 happening in parallel with the current costs. */
3865
3866 void
3867 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3868 {
3869 depth = std::max (depth, input_cost.depth);
3870 total += input_cost.total;
3871 }
3872
3873 /* Increase the costs to account for something with cost INPUT_COST
3874 happening in series with the current costs. */
3875
3876 void
3877 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3878 {
3879 depth += other.depth;
3880 total += other.total;
3881 }
3882
3883 /* Split the total cost among TIMES successors or predecessors. */
3884
3885 void
3886 slpg_layout_cost::split (unsigned int times)
3887 {
3888 if (times > 1)
3889 total /= times;
3890 }
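
/* Worked example (illustrative only, not part of the implementation):
   when optimizing for speed, combining two parallel inputs with costs
   { depth 2, total 2 } and { depth 5, total 1 } via add_parallel_cost
   gives { depth 5, total 3 }; adding a serial layout-change cost of
   { depth 1, total 1 } via add_serial_cost gives { depth 6, total 4 };
   and split (2) for two consumers halves only the total, giving
   { depth 6, total 2 }.  */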
3891
3892 /* Information about one node in the SLP graph, for use during
3893 vect_optimize_slp_pass. */
3894
3895 struct slpg_vertex
3896 {
3897 slpg_vertex (slp_tree node_) : node (node_) {}
3898
3899 /* The node itself. */
3900 slp_tree node;
3901
3902 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3903 partitions are flexible; they can have whichever layout consumers
3904 want them to have. */
3905 int partition = -1;
3906
3907 /* The number of nodes that directly use the result of this one
3908 (i.e. the number of nodes that count this one as a child). */
3909 unsigned int out_degree = 0;
3910
3911 /* The execution frequency of the node. */
3912 sreal weight = 0;
3913
3914 /* The total execution frequency of all nodes that directly use the
3915 result of this one. */
3916 sreal out_weight = 0;
3917 };
3918
3919 /* Information about one partition of the SLP graph, for use during
3920 vect_optimize_slp_pass. */
3921
3922 struct slpg_partition_info
3923 {
3924 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3925 of m_partitioned_nodes. */
3926 unsigned int node_begin = 0;
3927 unsigned int node_end = 0;
3928
3929 /* Which layout we've chosen to use for this partition, or -1 if
3930 we haven't picked one yet. */
3931 int layout = -1;
3932
3933 /* The number of predecessors and successors in the partition dag.
3934 The predecessors always have lower partition numbers and the
3935 successors always have higher partition numbers.
3936
3937 Note that the directions of these edges are not necessarily the
3938 same as in the data flow graph. For example, if an SCC has separate
3939 partitions for an inner loop and an outer loop, the inner loop's
3940 partition will have at least two incoming edges from the outer loop's
3941 partition: one for a live-in value and one for a live-out value.
3942 In data flow terms, one of these edges would also be from the outer loop
3943 to the inner loop, but the other would be in the opposite direction. */
3944 unsigned int in_degree = 0;
3945 unsigned int out_degree = 0;
3946 };
3947
3948 /* Information about the costs of using a particular layout for a
3949 particular partition. It can also say that the combination is
3950 impossible. */
3951
3952 struct slpg_partition_layout_costs
3953 {
3954 bool is_possible () const { return internal_cost.is_possible (); }
3955 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3956
3957 /* The costs inherited from predecessor partitions. */
3958 slpg_layout_cost in_cost;
3959
3960 /* The inherent cost of the layout within the node itself. For example,
3961 this is nonzero for a load if choosing a particular layout would require
3962 the load to permute the loaded elements. It is nonzero for a
3963 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3964 to full-vector moves. */
3965 slpg_layout_cost internal_cost;
3966
3967 /* The costs inherited from successor partitions. */
3968 slpg_layout_cost out_cost;
3969 };
3970
3971 /* This class tries to optimize the layout of vectors in order to avoid
3972 unnecessary shuffling. At the moment, the set of possible layouts are
3973 restricted to bijective permutations.
3974
3975 The goal of the pass depends on whether we're optimizing for size or
3976 for speed. When optimizing for size, the goal is to reduce the overall
3977 number of layout changes (including layout changes implied by things
3978 like load permutations). When optimizing for speed, the goal is to
3979 reduce the maximum latency attributable to layout changes on any
3980 non-cyclical path through the data flow graph.
3981
3982 For example, when optimizing a loop nest for speed, we will prefer
3983 to make layout changes outside of a loop rather than inside of a loop,
3984 and will prefer to make layout changes in parallel rather than serially,
3985 even if that increases the overall number of layout changes.
3986
3987 The high-level procedure is:
3988
3989 (1) Build a graph in which edges go from uses (parents) to definitions
3990 (children).
3991
3992 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3993
3994 (3) When optimizing for speed, partition the nodes in each SCC based
3995 on their containing cfg loop. When optimizing for size, treat
3996 each SCC as a single partition.
3997
3998 This gives us a dag of partitions. The goal is now to assign a
3999 layout to each partition.
4000
4001 (4) Construct a set of vector layouts that are worth considering.
4002 Record which nodes must keep their current layout.
4003
4004 (5) Perform a forward walk over the partition dag (from loads to stores)
4005 accumulating the "forward" cost of using each layout. When visiting
4006 each partition, assign a tentative choice of layout to the partition
4007 and use that choice when calculating the cost of using a different
4008 layout in successor partitions.
4009
4010 (6) Perform a backward walk over the partition dag (from stores to loads),
4011 accumulating the "backward" cost of using each layout. When visiting
4012 each partition, make a final choice of layout for that partition based
4013 on the accumulated forward costs (from (5)) and backward costs
4014 (from (6)).
4015
4016 (7) Apply the chosen layouts to the SLP graph.
4017
4018 For example, consider the SLP statements:
4019
4020 S1: a_1 = load
4021 loop:
4022 S2: a_2 = PHI<a_1, a_3>
4023 S3: b_1 = load
4024 S4: a_3 = a_2 + b_1
4025 exit:
4026 S5: a_4 = PHI<a_3>
4027 S6: store a_4
4028
4029 S2 and S4 form an SCC and are part of the same loop. Every other
4030 statement is in a singleton SCC. In this example there is a one-to-one
4031 mapping between SCCs and partitions and the partition dag looks like this:
4032
4033 S1 S3
4034 \ /
4035 S2+S4
4036 |
4037 S5
4038 |
4039 S6
4040
4041 S2, S3 and S4 will have a higher execution frequency than the other
4042 statements, so when optimizing for speed, the goal is to avoid any
4043 layout changes:
4044
4045 - within S3
4046 - within S2+S4
4047 - on the S3->S2+S4 edge
4048
4049 For example, if S3 was originally a reversing load, the goal of the
4050 pass is to make it an unreversed load and change the layout on the
4051 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
4052 on S1->S2+S4 and S5->S6 would also be acceptable.)
4053
4054 The difference between SCCs and partitions becomes important if we
4055 add an outer loop:
4056
4057 S1: a_1 = ...
4058 loop1:
4059 S2: a_2 = PHI<a_1, a_6>
4060 S3: b_1 = load
4061 S4: a_3 = a_2 + b_1
4062 loop2:
4063 S5: a_4 = PHI<a_3, a_5>
4064 S6: c_1 = load
4065 S7: a_5 = a_4 + c_1
4066 exit2:
4067 S8: a_6 = PHI<a_5>
4068 S9: store a_6
4069 exit1:
4070
4071 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
4072 for speed, we usually do not want restrictions in the outer loop to "infect"
4073 the decision for the inner loop. For example, if an outer-loop node
4074 in the SCC contains a statement with a fixed layout, that should not
4075 prevent the inner loop from using a different layout. Conversely,
4076 the inner loop should not dictate a layout to the outer loop: if the
4077 outer loop does a lot of computation, then it may not be efficient to
4078 do all of that computation in the inner loop's preferred layout.
4079
4080 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
4081 and S5+S7 (inner). We also try to arrange partitions so that:
4082
4083 - the partition for an outer loop comes before the partition for
4084 an inner loop
4085
4086 - if a sibling loop A dominates a sibling loop B, A's partition
4087 comes before B's
4088
4089 This gives the following partition dag for the example above:
4090
4091 S1 S3
4092 \ /
4093 S2+S4+S8 S6
4094 | \\ /
4095 | S5+S7
4096 |
4097 S9
4098
4099 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
4100 one for a reversal of the edge S7->S8.
4101
4102 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
4103 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
4104 preferred layout against the cost of changing the layout on entry to the
4105 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
4106
4107 Although this works well when optimizing for speed, it has the downside
4108 when optimizing for size that the choice of layout for S5+S7 is completely
4109 independent of S9, which lessens the chance of reducing the overall number
4110 of permutations. We therefore do not partition SCCs when optimizing
4111 for size.
4112
4113 To give a concrete example of the difference between optimizing
4114 for size and speed, consider:
4115
4116 a[0] = (b[1] << c[3]) - d[1];
4117 a[1] = (b[0] << c[2]) - d[0];
4118 a[2] = (b[3] << c[1]) - d[3];
4119 a[3] = (b[2] << c[0]) - d[2];
4120
4121 There are three different layouts here: one for a, one for b and d,
4122 and one for c. When optimizing for speed it is better to permute each
4123 of b, c and d into the order required by a, since those permutations
4124 happen in parallel. But when optimizing for size, it is better to:
4125
4126 - permute c into the same order as b
4127 - do the arithmetic
4128 - permute the result into the order required by a
4129
4130 This gives 2 permutations rather than 3. */
4131
4132 class vect_optimize_slp_pass
4133 {
4134 public:
4135 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
4136 void run ();
4137
4138 private:
4139 /* Graph building. */
4140 struct loop *containing_loop (slp_tree);
4141 bool is_cfg_latch_edge (graph_edge *);
4142 void build_vertices (hash_set<slp_tree> &, slp_tree);
4143 void build_vertices ();
4144 void build_graph ();
4145
4146 /* Partitioning. */
4147 void create_partitions ();
4148 template<typename T> void for_each_partition_edge (unsigned int, T);
4149
4150 /* Layout selection. */
4151 bool is_compatible_layout (slp_tree, unsigned int);
4152 int change_layout_cost (slp_tree, unsigned int, unsigned int);
4153 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
4154 unsigned int);
4155 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4156 int, unsigned int);
4157 int internal_node_cost (slp_tree, int, unsigned int);
4158 void start_choosing_layouts ();
4159
4160 /* Cost propagation. */
4161 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4162 unsigned int, unsigned int);
4163 slpg_layout_cost total_in_cost (unsigned int);
4164 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4165 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4166 void forward_pass ();
4167 void backward_pass ();
4168
4169 /* Rematerialization. */
4170 slp_tree get_result_with_layout (slp_tree, unsigned int);
4171 void materialize ();
4172
4173 /* Clean-up. */
4174 void remove_redundant_permutations ();
4175
4176 void dump ();
4177
4178 vec_info *m_vinfo;
4179
4180 /* True if we should optimize the graph for size, false if we should
4181 optimize it for speed. (It wouldn't be easy to make this decision
4182 more locally.) */
4183 bool m_optimize_size;
4184
4185 /* A graph of all SLP nodes, with edges leading from uses to definitions.
4186 In other words, a node's predecessors are its slp_tree parents and
4187 a node's successors are its slp_tree children. */
4188 graph *m_slpg = nullptr;
4189
4190 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
4191 auto_vec<slpg_vertex> m_vertices;
4192
4193 /* The list of all leaves of M_SLPG, such as external definitions, constants,
4194 and loads. */
4195 auto_vec<int> m_leafs;
4196
4197 /* This array has one entry for every vector layout that we're considering.
4198 Element 0 is null and indicates "no change". Other entries describe
4199 permutations that are inherent in the current graph and that we would
4200 like to reverse if possible.
4201
4202 For example, a permutation { 1, 2, 3, 0 } means that something has
4203 effectively been permuted in that way, such as a load group
4204 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4205 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4206 in order to put things "back" in order. */
4207 auto_vec<vec<unsigned> > m_perms;
4208
4209 /* A partitioning of the nodes for which a layout must be chosen.
4210 Each partition represents an <SCC, cfg loop> pair; that is,
4211 nodes in different SCCs belong to different partitions, and nodes
4212 within an SCC can be further partitioned according to a containing
4213 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4214
4215 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4216 from leaves (such as loads) to roots (such as stores).
4217
4218 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4219 auto_vec<slpg_partition_info> m_partitions;
4220
4221 /* The list of all nodes for which a layout must be chosen. Nodes for
4222 partition P come before the nodes for partition P+1. Nodes within a
4223 partition are in reverse postorder. */
4224 auto_vec<unsigned int> m_partitioned_nodes;
4225
4226 /* Index P * num-layouts + L contains the cost of using layout L
4227 for partition P. */
4228 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4229
4230 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4231 original output of node N adjusted to have layout L. */
4232 auto_vec<slp_tree> m_node_layouts;
4233 };
4234
4235 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4236 Also record whether we should optimize anything for speed rather
4237 than size. */
4238
4239 void
4240 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4241 slp_tree node)
4242 {
4243 unsigned i;
4244 slp_tree child;
4245
4246 if (visited.add (node))
4247 return;
4248
4249 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4250 {
4251 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4252 if (optimize_bb_for_speed_p (bb))
4253 m_optimize_size = false;
4254 }
4255
4256 node->vertex = m_vertices.length ();
4257 m_vertices.safe_push (slpg_vertex (node));
4258
4259 bool leaf = true;
4260 bool force_leaf = false;
4261 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4262 if (child)
4263 {
4264 leaf = false;
4265 build_vertices (visited, child);
4266 }
4267 else
4268 force_leaf = true;
4269 /* Since SLP discovery works along use-def edges, all cycles have an
4270 entry - but there's the exception of cycles where we do not handle
4271 the entry explicitly (but with a NULL SLP node), like some reductions
4272 and inductions. Force those SLP PHIs to act as leaves to make them
4273 backwards reachable. */
4274 if (leaf || force_leaf)
4275 m_leafs.safe_push (node->vertex);
4276 }
4277
4278 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4279
4280 void
4281 vect_optimize_slp_pass::build_vertices ()
4282 {
4283 hash_set<slp_tree> visited;
4284 unsigned i;
4285 slp_instance instance;
4286 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4287 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4288 }
4289
4290 /* Apply (reverse) bijective PERM to VEC. */
4291
4292 template <class T>
4293 static void
4294 vect_slp_permute (vec<unsigned> perm,
4295 vec<T> &vec, bool reverse)
4296 {
4297 auto_vec<T, 64> saved;
4298 saved.create (vec.length ());
4299 for (unsigned i = 0; i < vec.length (); ++i)
4300 saved.quick_push (vec[i]);
4301
4302 if (reverse)
4303 {
4304 for (unsigned i = 0; i < vec.length (); ++i)
4305 vec[perm[i]] = saved[i];
4306 for (unsigned i = 0; i < vec.length (); ++i)
4307 gcc_assert (vec[perm[i]] == saved[i]);
4308 }
4309 else
4310 {
4311 for (unsigned i = 0; i < vec.length (); ++i)
4312 vec[i] = saved[perm[i]];
4313 for (unsigned i = 0; i < vec.length (); ++i)
4314 gcc_assert (vec[i] == saved[perm[i]]);
4315 }
4316 }
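
/* Illustrative example (not part of the implementation): with
   PERM = { 1, 2, 3, 0 }, calling vect_slp_permute with REVERSE false
   on { a0, a1, a2, a3 } yields { a1, a2, a3, a0 }, since each element
   becomes vec[i] = saved[perm[i]]; calling it with REVERSE true on
   { a1, a2, a3, a0 } restores { a0, a1, a2, a3 }, since then
   vec[perm[i]] = saved[i].  The REVERSE form thus corresponds to
   applying the reverse permutation described above m_perms.  */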
4317
4318 /* Return the cfg loop that contains NODE. */
4319
4320 struct loop *
4321 vect_optimize_slp_pass::containing_loop (slp_tree node)
4322 {
4323 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4324 if (!rep)
4325 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4326 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4327 }
4328
4329 /* Return true if UD (an edge from a use to a definition) is associated
4330 with a loop latch edge in the cfg. */
4331
4332 bool
4333 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4334 {
4335 slp_tree use = m_vertices[ud->src].node;
4336 slp_tree def = m_vertices[ud->dest].node;
4337 if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
4338 || SLP_TREE_CODE (use) == VEC_PERM_EXPR)
4339 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4340 return false;
4341
4342 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4343 return (is_a<gphi *> (use_rep->stmt)
4344 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4345 && containing_loop (def) == containing_loop (use));
4346 }
4347
4348 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4349 a nonnull data field. */
4350
4351 void
4352 vect_optimize_slp_pass::build_graph ()
4353 {
4354 m_optimize_size = true;
4355 build_vertices ();
4356
4357 m_slpg = new_graph (m_vertices.length ());
4358 for (slpg_vertex &v : m_vertices)
4359 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4360 if (child)
4361 {
4362 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4363 if (is_cfg_latch_edge (ud))
4364 ud->data = this;
4365 }
4366 }
4367
4368 /* Return true if E corresponds to a loop latch edge in the cfg. */
4369
4370 static bool
4371 skip_cfg_latch_edges (graph_edge *e)
4372 {
4373 return e->data;
4374 }
4375
4376 /* Create the node partitions. */
4377
4378 void
4379 vect_optimize_slp_pass::create_partitions ()
4380 {
4381 /* Calculate a postorder of the graph, ignoring edges that correspond
4382 to natural latch edges in the cfg. Reading the vector from the end
4383 to the beginning gives the reverse postorder. */
4384 auto_vec<int> initial_rpo;
4385 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4386 false, NULL, skip_cfg_latch_edges);
4387 gcc_assert (initial_rpo.length () == m_vertices.length ());
4388
4389 /* Calculate the strongly connected components of the graph. */
4390 auto_vec<int> scc_grouping;
4391 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4392
4393 /* Create a new index order in which all nodes from the same SCC are
4394 consecutive. Use scc_pos to record the index of the first node in
4395 each SCC. */
4396 auto_vec<unsigned int> scc_pos (num_sccs);
4397 int last_component = -1;
4398 unsigned int node_count = 0;
4399 for (unsigned int node_i : scc_grouping)
4400 {
4401 if (last_component != m_slpg->vertices[node_i].component)
4402 {
4403 last_component = m_slpg->vertices[node_i].component;
4404 gcc_assert (last_component == int (scc_pos.length ()));
4405 scc_pos.quick_push (node_count);
4406 }
4407 node_count += 1;
4408 }
4409 gcc_assert (node_count == initial_rpo.length ()
4410 && last_component + 1 == int (num_sccs));
4411
4412 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4413 inside each SCC following the RPO we calculated above. The fact that
4414 we ignored natural latch edges when calculating the RPO should ensure
4415 that, for natural loop nests:
4416
4417 - the first node that we encounter in a cfg loop is the loop header phi
4418 - the loop header phis are in dominance order
4419
4420 Arranging for this is an optimization (see below) rather than a
4421 correctness issue. Unnatural loops with a tangled mess of backedges
4422 will still work correctly, but might give poorer results.
4423
4424 Also update scc_pos so that it gives 1 + the index of the last node
4425 in the SCC. */
4426 m_partitioned_nodes.safe_grow (node_count);
4427 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4428 {
4429 unsigned int node_i = initial_rpo[old_i];
4430 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4431 m_partitioned_nodes[new_i] = node_i;
4432 }
4433
4434 /* When optimizing for speed, partition each SCC based on the containing
4435 cfg loop. The order we constructed above should ensure that, for natural
4436 cfg loops, we'll create sub-SCC partitions for outer loops before
4437 the corresponding sub-SCC partitions for inner loops. Similarly,
4438 when one sibling loop A dominates another sibling loop B, we should
4439 create a sub-SCC partition for A before a sub-SCC partition for B.
4440
4441 As above, nothing depends for correctness on whether this achieves
4442 a natural nesting, but we should get better results when it does. */
4443 m_partitions.reserve (m_vertices.length ());
4444 unsigned int next_partition_i = 0;
4445 hash_map<struct loop *, int> loop_partitions;
4446 unsigned int rpo_begin = 0;
4447 unsigned int num_partitioned_nodes = 0;
4448 for (unsigned int rpo_end : scc_pos)
4449 {
4450 loop_partitions.empty ();
4451 unsigned int partition_i = next_partition_i;
4452 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4453 {
4454 /* Handle externals and constants optimistically throughout.
4455 But treat existing vectors as fixed since we do not handle
4456 permuting them. */
4457 unsigned int node_i = m_partitioned_nodes[rpo_i];
4458 auto &vertex = m_vertices[node_i];
4459 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4460 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4461 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4462 vertex.partition = -1;
4463 else
4464 {
4465 bool existed;
4466 if (m_optimize_size)
4467 existed = next_partition_i > partition_i;
4468 else
4469 {
4470 struct loop *loop = containing_loop (vertex.node);
4471 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4472 if (!existed)
4473 entry = next_partition_i;
4474 partition_i = entry;
4475 }
4476 if (!existed)
4477 {
4478 m_partitions.quick_push (slpg_partition_info ());
4479 next_partition_i += 1;
4480 }
4481 vertex.partition = partition_i;
4482 num_partitioned_nodes += 1;
4483 m_partitions[partition_i].node_end += 1;
4484 }
4485 }
4486 rpo_begin = rpo_end;
4487 }
4488
4489 /* Assign ranges of consecutive node indices to each partition,
4490 in partition order. Start with node_end being the same as
4491 node_begin so that the next loop can use it as a counter. */
4492 unsigned int node_begin = 0;
4493 for (auto &partition : m_partitions)
4494 {
4495 partition.node_begin = node_begin;
4496 node_begin += partition.node_end;
4497 partition.node_end = partition.node_begin;
4498 }
4499 gcc_assert (node_begin == num_partitioned_nodes);
4500
4501 /* Finally build the list of nodes in partition order. */
4502 m_partitioned_nodes.truncate (num_partitioned_nodes);
4503 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4504 {
4505 int partition_i = m_vertices[node_i].partition;
4506 if (partition_i >= 0)
4507 {
4508 unsigned int order_i = m_partitions[partition_i].node_end++;
4509 m_partitioned_nodes[order_i] = node_i;
4510 }
4511 }
4512 }
4513
4514 /* Look for edges from earlier partitions into node NODE_I and edges from
4515 node NODE_I into later partitions. Call:
4516
4517 FN (ud, other_node_i)
4518
4519 for each such use-to-def edge ud, where other_node_i is the node at the
4520 other end of the edge. */
4521
4522 template<typename T>
4523 void
4524 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4525 {
4526 int partition_i = m_vertices[node_i].partition;
4527 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4528 pred; pred = pred->pred_next)
4529 {
4530 int src_partition_i = m_vertices[pred->src].partition;
4531 if (src_partition_i >= 0 && src_partition_i != partition_i)
4532 fn (pred, pred->src);
4533 }
4534 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4535 succ; succ = succ->succ_next)
4536 {
4537 int dest_partition_i = m_vertices[succ->dest].partition;
4538 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4539 fn (succ, succ->dest);
4540 }
4541 }
4542
4543 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4544 that NODE would operate on. This test is independent of NODE's actual
4545 operation. */
4546
4547 bool
4548 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4549 unsigned int layout_i)
4550 {
4551 if (layout_i == 0)
4552 return true;
4553
4554 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4555 return false;
4556
4557 return true;
4558 }
4559
4560 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4561 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4562 layouts is incompatible with NODE or if the change is not possible for
4563 some other reason.
4564
4565 The properties taken from NODE include the number of lanes and the
4566 vector type. The actual operation doesn't matter. */
4567
4568 int
4569 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4570 unsigned int from_layout_i,
4571 unsigned int to_layout_i)
4572 {
4573 if (!is_compatible_layout (node, from_layout_i)
4574 || !is_compatible_layout (node, to_layout_i))
4575 return -1;
4576
4577 if (from_layout_i == to_layout_i)
4578 return 0;
4579
4580 auto_vec<slp_tree, 1> children (1);
4581 children.quick_push (node);
4582 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4583 if (from_layout_i > 0)
4584 for (unsigned int i : m_perms[from_layout_i])
4585 perm.quick_push ({ 0, i });
4586 else
4587 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4588 perm.quick_push ({ 0, i });
4589 if (to_layout_i > 0)
4590 vect_slp_permute (m_perms[to_layout_i], perm, true);
4591 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4592 children, false);
4593 if (count >= 0)
4594 return MAX (count, 1);
4595
4596 /* ??? In principle we could try changing via layout 0, giving two
4597 layout changes rather than 1. Doing that would require
4598 corresponding support in get_result_with_layout. */
4599 return -1;
4600 }
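
/* Illustrative example (not part of the implementation): with
   m_perms[FROM_LAYOUT_I] = { 1, 2, 3, 0 } and
   m_perms[TO_LAYOUT_I] = { 2, 3, 0, 1 }, the code above first builds
   the lane selection { 1, 2, 3, 0 } from the "from" layout and then
   reverse-permutes it by the "to" layout, so the permutation handed to
   vectorizable_slp_permutation_1 selects lanes { 3, 0, 1, 2 }.  */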
4601
4602 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4603
4604 inline slpg_partition_layout_costs &
4605 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4606 unsigned int layout_i)
4607 {
4608 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4609 }
4610
4611 /* Change PERM in one of two ways:
4612
4613 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4614 chosen for child I of NODE.
4615
4616 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4617
4618 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
4619
4620 void
4621 vect_optimize_slp_pass::
4622 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4623 int in_layout_i, unsigned int out_layout_i)
4624 {
4625 for (auto &entry : perm)
4626 {
4627 int this_in_layout_i = in_layout_i;
4628 if (this_in_layout_i < 0)
4629 {
4630 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4631 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4632 this_in_layout_i = m_partitions[in_partition_i].layout;
4633 }
4634 if (this_in_layout_i > 0)
4635 entry.second = m_perms[this_in_layout_i][entry.second];
4636 }
4637 if (out_layout_i > 0)
4638 vect_slp_permute (m_perms[out_layout_i], perm, true);
4639 }
4640
4641 /* Check whether the target allows NODE to be rearranged so that the node's
4642 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4643 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4644
4645 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4646 NODE can adapt to the layout changes that have (perhaps provisionally)
4647 been chosen for NODE's children, so that no extra permutations are
4648 needed on either the input or the output of NODE.
4649
4650 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4651 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4652
4653 IN_LAYOUT_I has no meaning for other types of node.
4654
4655 Keeping the node as-is is always valid. If the target doesn't appear
4656 to support the node as-is, but might realistically support other layouts,
4657 then layout 0 instead has the cost of a worst-case permutation. On the
4658 one hand, this ensures that every node has at least one valid layout,
4659 avoiding what would otherwise be an awkward special case. On the other,
4660 it still encourages the pass to change an invalid pre-existing layout
4661 choice into a valid one. */
4662
4663 int
4664 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4665 unsigned int out_layout_i)
4666 {
4667 const int fallback_cost = 1;
4668
4669 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4670 {
4671 auto_lane_permutation_t tmp_perm;
4672 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4673
4674 /* Check that the child nodes support the chosen layout. Checking
4675 the first child is enough, since any second child would have the
4676 same shape. */
4677 auto first_child = SLP_TREE_CHILDREN (node)[0];
4678 if (in_layout_i > 0
4679 && !is_compatible_layout (first_child, in_layout_i))
4680 return -1;
4681
4682 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4683 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4684 node, tmp_perm,
4685 SLP_TREE_CHILDREN (node),
4686 false);
4687 if (count < 0)
4688 {
4689 if (in_layout_i == 0 && out_layout_i == 0)
4690 {
4691 /* Use the fallback cost if the node could in principle support
4692 some nonzero layout for both the inputs and the outputs.
4693 Otherwise assume that the node will be rejected later
4694 and rebuilt from scalars. */
4695 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4696 return fallback_cost;
4697 return 0;
4698 }
4699 return -1;
4700 }
4701
4702 /* We currently have no way of telling whether the new layout is cheaper
4703 or more expensive than the old one. But at least in principle,
4704 it should be worth making zero permutations (whole-vector shuffles)
4705 cheaper than real permutations, in case the pass is able to remove
4706 the latter. */
4707 return count == 0 ? 0 : 1;
4708 }
4709
4710 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4711 if (rep
4712 && STMT_VINFO_DATA_REF (rep)
4713 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4714 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4715 {
4716 auto_load_permutation_t tmp_perm;
4717 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4718 if (out_layout_i > 0)
4719 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4720
4721 poly_uint64 vf = 1;
4722 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4723 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4724 unsigned int n_perms;
4725 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4726 nullptr, vf, true, false, &n_perms))
4727 {
4728 auto rep = SLP_TREE_REPRESENTATIVE (node);
4729 if (out_layout_i == 0)
4730 {
4731 /* Use the fallback cost if the load is an N-to-N permutation.
4732 Otherwise assume that the node will be rejected later
4733 and rebuilt from scalars. */
4734 if (STMT_VINFO_GROUPED_ACCESS (rep)
4735 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4736 == SLP_TREE_LANES (node)))
4737 return fallback_cost;
4738 return 0;
4739 }
4740 return -1;
4741 }
4742
4743 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4744 return n_perms == 0 ? 0 : 1;
4745 }
4746
4747 return 0;
4748 }
4749
4750 /* Decide which element layouts we should consider using. Calculate the
4751 weights associated with inserting layout changes on partition edges.
4752 Also mark partitions that cannot change layout, by setting their
4753 layout to zero. */
4754
4755 void
4756 vect_optimize_slp_pass::start_choosing_layouts ()
4757 {
4758 /* Used to assign unique permutation indices. */
4759 using perm_hash = unbounded_hashmap_traits<
4760 vec_free_hash_base<int_hash_base<unsigned>>,
4761 int_hash<int, -1, -2>
4762 >;
4763 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4764
4765 /* Layout 0 is "no change". */
4766 m_perms.safe_push (vNULL);
4767
4768 /* Create layouts from existing permutations. */
4769 auto_load_permutation_t tmp_perm;
4770 for (unsigned int node_i : m_partitioned_nodes)
4771 {
4772 /* Leaves also double as entries to the reverse graph. Allow the
4773 layout of those to be changed. */
4774 auto &vertex = m_vertices[node_i];
4775 auto &partition = m_partitions[vertex.partition];
4776 if (!m_slpg->vertices[node_i].succ)
4777 partition.layout = 0;
4778
4779 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4780 slp_tree node = vertex.node;
4781 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4782 slp_tree child;
4783 unsigned HOST_WIDE_INT imin, imax = 0;
4784 bool any_permute = false;
4785 tmp_perm.truncate (0);
4786 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4787 {
4788 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4789 unpermuted, record a layout that reverses this permutation.
4790
4791 We would need more work to cope with loads that are internally
4792 permuted and also have inputs (such as masks for
4793 IFN_MASK_LOADs). */
4794 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4795 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4796 {
4797 partition.layout = -1;
4798 continue;
4799 }
4800 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4801 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4802 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4803 }
4804 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4805 && SLP_TREE_CHILDREN (node).length () == 1
4806 && (child = SLP_TREE_CHILDREN (node)[0])
4807 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4808 .is_constant (&imin)))
4809 {
4810 /* If the child has the same vector size as this node,
4811 reversing the permutation can make the permutation a no-op.
4812 In other cases it can change a true permutation into a
4813 full-vector extract. */
4814 tmp_perm.reserve (SLP_TREE_LANES (node));
4815 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4816 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4817 }
4818 else
4819 continue;
4820
4821 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4822 {
4823 unsigned idx = tmp_perm[j];
4824 imin = MIN (imin, idx);
4825 imax = MAX (imax, idx);
4826 if (idx - tmp_perm[0] != j)
4827 any_permute = true;
4828 }
4829 /* If the span doesn't match, we'd disrupt VF computation; avoid
4830 that for now. */
4831 if (imax - imin + 1 != SLP_TREE_LANES (node))
4832 continue;
4833 /* If there's no permute, there's no need to split one out. In this case
4834 we can consider turning a load into a permuted load, if that
4835 turns out to be cheaper than alternatives. */
4836 if (!any_permute)
4837 {
4838 partition.layout = -1;
4839 continue;
4840 }
4841
4842 /* For now only handle true permutes, like
4843 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4844 when permuting constants and invariants while keeping the permute
4845 bijective. */
4846 auto_sbitmap load_index (SLP_TREE_LANES (node));
4847 bitmap_clear (load_index);
4848 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4849 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4850 unsigned j;
4851 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4852 if (!bitmap_bit_p (load_index, j))
4853 break;
4854 if (j != SLP_TREE_LANES (node))
4855 continue;
4856
4857 vec<unsigned> perm = vNULL;
4858 perm.safe_grow (SLP_TREE_LANES (node), true);
4859 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4860 perm[j] = tmp_perm[j] - imin;
4861
4862 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4863 {
4864 /* Continue to use existing layouts, but don't add any more. */
4865 int *entry = layout_ids.get (perm);
4866 partition.layout = entry ? *entry : 0;
4867 perm.release ();
4868 }
4869 else
4870 {
4871 bool existed;
4872 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4873 if (existed)
4874 perm.release ();
4875 else
4876 {
4877 layout_i = m_perms.length ();
4878 m_perms.safe_push (perm);
4879 }
4880 partition.layout = layout_i;
4881 }
4882 }
4883
4884 /* Initially assume that every layout is possible and has zero cost
4885 in every partition. */
4886 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4887 * m_perms.length ());
4888
4889 /* We have to mark outgoing permutations facing non-associating-reduction
4890 graph entries that are not represented as to be materialized.
4891 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4892 for (slp_instance instance : m_vinfo->slp_instances)
4893 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4894 {
4895 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4896 m_partitions[m_vertices[node_i].partition].layout = 0;
4897 }
4898 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4899 {
4900 stmt_vec_info stmt_info
4901 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4902 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4903 if (needs_fold_left_reduction_p (TREE_TYPE
4904 (gimple_get_lhs (stmt_info->stmt)),
4905 STMT_VINFO_REDUC_CODE (reduc_info)))
4906 {
4907 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4908 m_partitions[m_vertices[node_i].partition].layout = 0;
4909 }
4910 }
4911
4912 /* Check which layouts each node and partition can handle. Calculate the
4913 weights associated with inserting layout changes on edges. */
4914 for (unsigned int node_i : m_partitioned_nodes)
4915 {
4916 auto &vertex = m_vertices[node_i];
4917 auto &partition = m_partitions[vertex.partition];
4918 slp_tree node = vertex.node;
4919
4920 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4921 {
4922 vertex.weight = vect_slp_node_weight (node);
4923
4924 /* We do not handle stores with a permutation, so all
4925 incoming permutations must have been materialized.
4926
4927 We also don't handle masked grouped loads, which lack a
4928 permutation vector. In this case the memory locations
4929 form an implicit second input to the loads, on top of the
4930 explicit mask input, and the memory input's layout cannot
4931 be changed.
4932
4933 On the other hand, we do support permuting gather loads and
4934 masked gather loads, where each scalar load is independent
4935 of the others. This can be useful if the address/index input
4936 benefits from permutation. */
4937 if (STMT_VINFO_DATA_REF (rep)
4938 && STMT_VINFO_GROUPED_ACCESS (rep)
4939 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4940 partition.layout = 0;
4941
4942 /* We cannot change the layout of an operation that does not
4943 operate on its lanes independently. Note this is an explicit
4944 negative list since that's much shorter than the respective
4945 positive one, but it's critical to keep maintaining it. */
4946 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4947 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4948 {
4949 case CFN_COMPLEX_ADD_ROT90:
4950 case CFN_COMPLEX_ADD_ROT270:
4951 case CFN_COMPLEX_MUL:
4952 case CFN_COMPLEX_MUL_CONJ:
4953 case CFN_VEC_ADDSUB:
4954 case CFN_VEC_FMADDSUB:
4955 case CFN_VEC_FMSUBADD:
4956 partition.layout = 0;
4957 default:;
4958 }
4959 }
4960
4961 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4962 {
4963 auto &other_vertex = m_vertices[other_node_i];
4964
4965 /* Count the number of edges from earlier partitions and the number
4966 of edges to later partitions. */
4967 if (other_vertex.partition < vertex.partition)
4968 partition.in_degree += 1;
4969 else
4970 partition.out_degree += 1;
4971
4972 /* If the current node uses the result of OTHER_NODE_I, accumulate
4973 the effects of that. */
4974 if (ud->src == int (node_i))
4975 {
4976 other_vertex.out_weight += vertex.weight;
4977 other_vertex.out_degree += 1;
4978 }
4979 };
4980 for_each_partition_edge (node_i, process_edge);
4981 }
4982 }
4983
4984 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4985 its current (provisional) choice of layout. The inputs do not necessarily
4986 have the same layout as each other. */
4987
4988 slpg_layout_cost
4989 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4990 {
4991 auto &vertex = m_vertices[node_i];
4992 slpg_layout_cost cost;
4993 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4994 {
4995 auto &other_vertex = m_vertices[other_node_i];
4996 if (other_vertex.partition < vertex.partition)
4997 {
4998 auto &other_partition = m_partitions[other_vertex.partition];
4999 auto &other_costs = partition_layout_costs (other_vertex.partition,
5000 other_partition.layout);
5001 slpg_layout_cost this_cost = other_costs.in_cost;
5002 this_cost.add_serial_cost (other_costs.internal_cost);
5003 this_cost.split (other_partition.out_degree);
5004 cost.add_parallel_cost (this_cost);
5005 }
5006 };
5007 for_each_partition_edge (node_i, add_cost);
5008 return cost;
5009 }
5010
5011 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
5012 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
5013 slpg_layout_cost::impossible () if the change isn't possible. */
5014
5015 slpg_layout_cost
5016 vect_optimize_slp_pass::
5017 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
5018 unsigned int layout2_i)
5019 {
5020 auto &def_vertex = m_vertices[ud->dest];
5021 auto &use_vertex = m_vertices[ud->src];
5022 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
5023 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
5024 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
5025 use_layout_i);
5026 if (factor < 0)
5027 return slpg_layout_cost::impossible ();
5028
5029 /* We have a choice of putting the layout change at the site of the
5030 definition or at the site of the use. Prefer the former when
5031 optimizing for size or when the execution frequency of the
5032 definition is no greater than the combined execution frequencies of
5033 the uses. When putting the layout change at the site of the definition,
5034 divvy up the cost among all consumers. */
5035 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
5036 {
5037 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
5038 cost.split (def_vertex.out_degree);
5039 return cost;
5040 }
5041 return { use_vertex.weight * factor, m_optimize_size };
5042 }
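
/* Worked example (illustrative only): when optimizing for speed, if the
   definition has weight 2 and its three consumers have a combined
   out_weight of 6, the change is placed at the definition site; with a
   change_layout_cost factor of 1 this gives { depth 2, total 2 }, and
   split (3) distributes the total, returning { depth 2, total 2/3 }.  */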
5043
5044 /* UD represents a use-def link between FROM_NODE_I and a node in a later
5045 partition; FROM_NODE_I could be the definition node or the use node.
5046 The node at the other end of the link wants to use layout TO_LAYOUT_I.
5047 Return the cost of any necessary fix-ups on edge UD, or return
5048 slpg_layout_cost::impossible () if the change isn't possible.
5049
5050 At this point, FROM_NODE_I's partition has chosen the cheapest
5051 layout based on the information available so far, but this choice
5052 is only provisional. */
5053
5054 slpg_layout_cost
5055 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
5056 unsigned int to_layout_i)
5057 {
5058 auto &from_vertex = m_vertices[from_node_i];
5059 unsigned int from_partition_i = from_vertex.partition;
5060 slpg_partition_info &from_partition = m_partitions[from_partition_i];
5061 gcc_assert (from_partition.layout >= 0);
5062
5063 /* First calculate the cost on the assumption that FROM_PARTITION sticks
5064 with its current layout preference. */
5065 slpg_layout_cost cost = slpg_layout_cost::impossible ();
5066 auto edge_cost = edge_layout_cost (ud, from_node_i,
5067 from_partition.layout, to_layout_i);
5068 if (edge_cost.is_possible ())
5069 {
5070 auto &from_costs = partition_layout_costs (from_partition_i,
5071 from_partition.layout);
5072 cost = from_costs.in_cost;
5073 cost.add_serial_cost (from_costs.internal_cost);
5074 cost.split (from_partition.out_degree);
5075 cost.add_serial_cost (edge_cost);
5076 }
5077 else if (from_partition.layout == 0)
5078 /* We must allow the source partition to have layout 0 as a fallback,
5079 in case all other options turn out to be impossible. */
5080 return cost;
5081
5082 /* Take the minimum of that cost and the cost that applies if
5083 FROM_PARTITION instead switches to TO_LAYOUT_I. */
5084 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
5085 to_layout_i);
5086 if (direct_layout_costs.is_possible ())
5087 {
5088 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
5089 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
5090 direct_cost.split (from_partition.out_degree);
5091 if (!cost.is_possible ()
5092 || direct_cost.is_better_than (cost, m_optimize_size))
5093 cost = direct_cost;
5094 }
5095
5096 return cost;
5097 }
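
/* A worked example of the two options above, using totals only and
   made-up numbers: if FROM_PARTITION's in-cost plus internal cost under
   its current layout is 6, its out_degree is 2 and the fix-up on this
   edge costs 1, then sticking with the current layout costs 6 / 2 + 1 = 4.
   If its in-cost plus internal cost under TO_LAYOUT_I would be 5, then
   switching the whole partition costs 5 / 2 = 2.5 and is the cheaper of
   the two estimates.  */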
5098
5099 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
5100 partition; TO_NODE_I could be the definition node or the use node.
5101 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
5102 return the cost of any necessary fix-ups on edge UD, or
5103 slpg_layout_cost::impossible () if the choice cannot be made.
5104
5105 At this point, TO_NODE_I's partition has a fixed choice of layout. */
5106
5107 slpg_layout_cost
5108 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
5109 unsigned int from_layout_i)
5110 {
5111 auto &to_vertex = m_vertices[to_node_i];
5112 unsigned int to_partition_i = to_vertex.partition;
5113 slpg_partition_info &to_partition = m_partitions[to_partition_i];
5114 gcc_assert (to_partition.layout >= 0);
5115
5116 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
5117 adjusted for this input having layout FROM_LAYOUT_I. Assume that
5118 any other inputs keep their current choice of layout. */
5119 auto &to_costs = partition_layout_costs (to_partition_i,
5120 to_partition.layout);
5121 if (ud->src == int (to_node_i)
5122 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
5123 {
5124 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
5125 auto old_layout = from_partition.layout;
5126 from_partition.layout = from_layout_i;
5127 int factor = internal_node_cost (to_vertex.node, -1,
5128 to_partition.layout);
5129 from_partition.layout = old_layout;
5130 if (factor >= 0)
5131 {
5132 slpg_layout_cost cost = to_costs.out_cost;
5133 cost.add_serial_cost ({ to_vertex.weight * factor,
5134 m_optimize_size });
5135 cost.split (to_partition.in_degree);
5136 return cost;
5137 }
5138 }
5139
5140 /* Compute the cost if we insert any necessary layout change on edge UD. */
5141 auto edge_cost = edge_layout_cost (ud, to_node_i,
5142 to_partition.layout, from_layout_i);
5143 if (edge_cost.is_possible ())
5144 {
5145 slpg_layout_cost cost = to_costs.out_cost;
5146 cost.add_serial_cost (to_costs.internal_cost);
5147 cost.split (to_partition.in_degree);
5148 cost.add_serial_cost (edge_cost);
5149 return cost;
5150 }
5151
5152 return slpg_layout_cost::impossible ();
5153 }
5154
5155 /* Make a forward pass through the partitions, accumulating input costs.
5156 Make a tentative (provisional) choice of layout for each partition,
5157 ensuring that this choice still allows later partitions to keep
5158 their original layout. */
5159
5160 void
5161 vect_optimize_slp_pass::forward_pass ()
5162 {
5163 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5164 ++partition_i)
5165 {
5166 auto &partition = m_partitions[partition_i];
5167
5168 /* If the partition consists of a single VEC_PERM_EXPR, precompute
5169 the incoming cost that would apply if every predecessor partition
5170 keeps its current layout. This is used within the loop below. */
5171 slpg_layout_cost in_cost;
5172 slp_tree single_node = nullptr;
5173 if (partition.node_end == partition.node_begin + 1)
5174 {
5175 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5176 single_node = m_vertices[node_i].node;
5177 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5178 in_cost = total_in_cost (node_i);
5179 }
5180
5181 /* Go through the possible layouts. Decide which ones are valid
5182 for this partition and record which of the valid layouts has
5183 the lowest cost. */
5184 unsigned int min_layout_i = 0;
5185 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5186 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5187 {
5188 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5189 if (!layout_costs.is_possible ())
5190 continue;
5191
5192 /* If the recorded layout is already 0 then the layout cannot
5193 change. */
5194 if (partition.layout == 0 && layout_i != 0)
5195 {
5196 layout_costs.mark_impossible ();
5197 continue;
5198 }
5199
5200 bool is_possible = true;
5201 for (unsigned int order_i = partition.node_begin;
5202 order_i < partition.node_end; ++order_i)
5203 {
5204 unsigned int node_i = m_partitioned_nodes[order_i];
5205 auto &vertex = m_vertices[node_i];
5206
5207 /* Reject the layout if it is individually incompatible
5208 with any node in the partition. */
5209 if (!is_compatible_layout (vertex.node, layout_i))
5210 {
5211 is_possible = false;
5212 break;
5213 }
5214
5215 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5216 {
5217 auto &other_vertex = m_vertices[other_node_i];
5218 if (other_vertex.partition < vertex.partition)
5219 {
5220 /* Accumulate the incoming costs from earlier
5221 partitions, plus the cost of any layout changes
5222 on UD itself. */
5223 auto cost = forward_cost (ud, other_node_i, layout_i);
5224 if (!cost.is_possible ())
5225 is_possible = false;
5226 else
5227 layout_costs.in_cost.add_parallel_cost (cost);
5228 }
5229 else
5230 /* Reject the layout if it would make layout 0 impossible
5231 for later partitions. This amounts to testing that the
5232 target supports reversing the layout change on edges
5233 to later partitions.
5234
5235 In principle, it might be possible to push a layout
5236 change all the way down a graph, so that it never
5237 needs to be reversed and so that the target doesn't
5238 need to support the reverse operation. But it would
5239 be awkward to bail out if we hit a partition that
5240 does not support the new layout, especially since
5241 we are not dealing with a lattice. */
5242 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5243 layout_i).is_possible ();
5244 };
5245 for_each_partition_edge (node_i, add_cost);
5246
5247 /* Accumulate the cost of using LAYOUT_I within NODE,
5248 both for the inputs and the outputs. */
5249 int factor = internal_node_cost (vertex.node, layout_i,
5250 layout_i);
5251 if (factor < 0)
5252 {
5253 is_possible = false;
5254 break;
5255 }
5256 else if (factor)
5257 layout_costs.internal_cost.add_serial_cost
5258 ({ vertex.weight * factor, m_optimize_size });
5259 }
5260 if (!is_possible)
5261 {
5262 layout_costs.mark_impossible ();
5263 continue;
5264 }
5265
5266 /* Combine the incoming and partition-internal costs. */
5267 slpg_layout_cost combined_cost = layout_costs.in_cost;
5268 combined_cost.add_serial_cost (layout_costs.internal_cost);
5269
5270 /* If this partition consists of a single VEC_PERM_EXPR, see
5271 if the VEC_PERM_EXPR can be changed to support output layout
5272 LAYOUT_I while keeping all the provisional choices of input
5273 layout. */
5274 if (single_node
5275 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5276 {
5277 int factor = internal_node_cost (single_node, -1, layout_i);
5278 if (factor >= 0)
5279 {
5280 auto weight = m_vertices[single_node->vertex].weight;
5281 slpg_layout_cost internal_cost
5282 = { weight * factor, m_optimize_size };
5283
5284 slpg_layout_cost alt_cost = in_cost;
5285 alt_cost.add_serial_cost (internal_cost);
5286 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5287 {
5288 combined_cost = alt_cost;
5289 layout_costs.in_cost = in_cost;
5290 layout_costs.internal_cost = internal_cost;
5291 }
5292 }
5293 }
5294
5295 /* Record the layout with the lowest cost. Prefer layout 0 in
5296 the event of a tie between it and another layout. */
5297 if (!min_layout_cost.is_possible ()
5298 || combined_cost.is_better_than (min_layout_cost,
5299 m_optimize_size))
5300 {
5301 min_layout_i = layout_i;
5302 min_layout_cost = combined_cost;
5303 }
5304 }
5305
5306 /* This loop's handling of earlier partitions should ensure that
5307 choosing the original layout for the current partition is no
5308 less valid than it was in the original graph, even with the
5309 provisional layout choices for those earlier partitions. */
5310 gcc_assert (min_layout_cost.is_possible ());
5311 partition.layout = min_layout_i;
5312 }
5313 }
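
/* A minimal standalone sketch (not part of this file) of the selection
   made by the forward pass: for each partition, keep the cheapest layout
   among those still marked possible, preferring the lowest layout index
   (and hence layout 0) on ties.  Costs are reduced to a single double
   here; the real pass compares slpg_layout_cost values and also has to
   accumulate the incoming costs shown above.

     #include <vector>
     #include <limits>

     // cost[p][l] is the accumulated cost of giving partition p layout l,
     // with a negative entry meaning "impossible".
     static std::vector<int>
     choose_layouts (const std::vector<std::vector<double>> &cost)
     {
       std::vector<int> choice (cost.size (), 0);
       for (unsigned p = 0; p < cost.size (); ++p)
	 {
	   double best = std::numeric_limits<double>::infinity ();
	   for (unsigned l = 0; l < cost[p].size (); ++l)
	     if (cost[p][l] >= 0 && cost[p][l] < best)
	       {
		 best = cost[p][l];
		 choice[p] = l;
	       }
	 }
       return choice;
     }  */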
5314
5315 /* Make a backward pass through the partitions, accumulating output costs.
5316 Make a final choice of layout for each partition. */
5317
5318 void
5319 vect_optimize_slp_pass::backward_pass ()
5320 {
5321 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5322 {
5323 auto &partition = m_partitions[partition_i];
5324
5325 unsigned int min_layout_i = 0;
5326 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5327 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5328 {
5329 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5330 if (!layout_costs.is_possible ())
5331 continue;
5332
5333 /* Accumulate the costs from successor partitions. */
5334 bool is_possible = true;
5335 for (unsigned int order_i = partition.node_begin;
5336 order_i < partition.node_end; ++order_i)
5337 {
5338 unsigned int node_i = m_partitioned_nodes[order_i];
5339 auto &vertex = m_vertices[node_i];
5340 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5341 {
5342 auto &other_vertex = m_vertices[other_node_i];
5343 auto &other_partition = m_partitions[other_vertex.partition];
5344 if (other_vertex.partition > vertex.partition)
5345 {
5346 /* Accumulate the costs from later
5347 partitions, plus the cost of any layout changes
5348 on UD itself. */
5349 auto cost = backward_cost (ud, other_node_i, layout_i);
5350 if (!cost.is_possible ())
5351 is_possible = false;
5352 else
5353 layout_costs.out_cost.add_parallel_cost (cost);
5354 }
5355 else
5356 /* Make sure that earlier partitions can (if necessary
5357 or beneficial) keep the layout that they chose in
5358 the forward pass. This ensures that there is at
5359 least one valid choice of layout. */
5360 is_possible &= edge_layout_cost (ud, other_node_i,
5361 other_partition.layout,
5362 layout_i).is_possible ();
5363 };
5364 for_each_partition_edge (node_i, add_cost);
5365 }
5366 if (!is_possible)
5367 {
5368 layout_costs.mark_impossible ();
5369 continue;
5370 }
5371
5372 /* Locally combine the costs from the forward and backward passes.
5373 (This combined cost is not passed on, since that would lead
5374 to double counting.) */
5375 slpg_layout_cost combined_cost = layout_costs.in_cost;
5376 combined_cost.add_serial_cost (layout_costs.internal_cost);
5377 combined_cost.add_serial_cost (layout_costs.out_cost);
5378
5379 /* Record the layout with the lowest cost. Prefer layout 0 in
5380 the event of a tie between it and another layout. */
5381 if (!min_layout_cost.is_possible ()
5382 || combined_cost.is_better_than (min_layout_cost,
5383 m_optimize_size))
5384 {
5385 min_layout_i = layout_i;
5386 min_layout_cost = combined_cost;
5387 }
5388 }
5389
5390 gcc_assert (min_layout_cost.is_possible ());
5391 partition.layout = min_layout_i;
5392 }
5393 }
5394
5395 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5396 NODE already has the layout that was selected for its partition. */
5397
5398 slp_tree
5399 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5400 unsigned int to_layout_i)
5401 {
5402 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5403 slp_tree result = m_node_layouts[result_i];
5404 if (result)
5405 return result;
5406
5407 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5408 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5409 /* We can't permute vector defs in place. */
5410 && SLP_TREE_VEC_DEFS (node).is_empty ()))
5411 {
5412 /* If the vector is uniform or unchanged, there's nothing to do. */
5413 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5414 result = node;
5415 else
5416 {
5417 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5418 result = vect_create_new_slp_node (scalar_ops);
5419 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5420 }
5421 }
5422 else
5423 {
5424 unsigned int partition_i = m_vertices[node->vertex].partition;
5425 unsigned int from_layout_i = m_partitions[partition_i].layout;
5426 if (from_layout_i == to_layout_i)
5427 return node;
5428
5429 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5430 permutation instead of a serial one. Leave the new permutation
5431 in TMP_PERM on success. */
5432 auto_lane_permutation_t tmp_perm;
5433 unsigned int num_inputs = 1;
5434 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5435 {
5436 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5437 if (from_layout_i != 0)
5438 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5439 if (to_layout_i != 0)
5440 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5441 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5442 tmp_perm,
5443 SLP_TREE_CHILDREN (node),
5444 false) >= 0)
5445 num_inputs = SLP_TREE_CHILDREN (node).length ();
5446 else
5447 tmp_perm.truncate (0);
5448 }
5449
5450 if (dump_enabled_p ())
5451 {
5452 if (tmp_perm.length () > 0)
5453 dump_printf_loc (MSG_NOTE, vect_location,
5454 "duplicating permutation node %p with"
5455 " layout %d\n",
5456 (void *) node, to_layout_i);
5457 else
5458 dump_printf_loc (MSG_NOTE, vect_location,
5459 "inserting permutation node in place of %p\n",
5460 (void *) node);
5461 }
5462
5463 unsigned int num_lanes = SLP_TREE_LANES (node);
5464 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5465 if (SLP_TREE_SCALAR_STMTS (node).length ())
5466 {
5467 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5468 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5469 if (from_layout_i != 0)
5470 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5471 if (to_layout_i != 0)
5472 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5473 }
5474 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5475 SLP_TREE_LANES (result) = num_lanes;
5476 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5477 result->vertex = -1;
5478
5479 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5480 if (tmp_perm.length ())
5481 {
5482 lane_perm.safe_splice (tmp_perm);
5483 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5484 }
5485 else
5486 {
5487 lane_perm.create (num_lanes);
5488 for (unsigned j = 0; j < num_lanes; ++j)
5489 lane_perm.quick_push ({ 0, j });
5490 if (from_layout_i != 0)
5491 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5492 if (to_layout_i != 0)
5493 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5494 SLP_TREE_CHILDREN (result).safe_push (node);
5495 }
5496 for (slp_tree child : SLP_TREE_CHILDREN (result))
5497 child->refcnt++;
5498 }
5499 m_node_layouts[result_i] = result;
5500 return result;
5501 }
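
/* An illustrative standalone sketch (not part of this file, and using
   simplified types) of the identity-based lane permutation built in the
   fallback path above.  Here both layouts map an original lane to its
   position under that layout, and the result maps each output lane to
   the input lane holding the same original element; the real code goes
   through vect_slp_permute, whose direction conventions may differ.

     #include <vector>

     static std::vector<unsigned>
     layout_change_perm (const std::vector<unsigned> &from_layout,
			 const std::vector<unsigned> &to_layout)
     {
       std::vector<unsigned> perm (to_layout.size ());
       for (unsigned orig = 0; orig < to_layout.size (); ++orig)
	 perm[to_layout[orig]] = from_layout[orig];
       return perm;
     }  */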
5502
5503 /* Apply the chosen vector layouts to the SLP graph. */
5504
5505 void
5506 vect_optimize_slp_pass::materialize ()
5507 {
5508 /* We no longer need the costs, so avoid having two O(N * P) arrays
5509 live at the same time. */
5510 m_partition_layout_costs.release ();
5511 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5512
5513 auto_sbitmap fully_folded (m_vertices.length ());
5514 bitmap_clear (fully_folded);
5515 for (unsigned int node_i : m_partitioned_nodes)
5516 {
5517 auto &vertex = m_vertices[node_i];
5518 slp_tree node = vertex.node;
5519 int layout_i = m_partitions[vertex.partition].layout;
5520 gcc_assert (layout_i >= 0);
5521
5522 /* Rearrange the scalar statements to match the chosen layout. */
5523 if (layout_i > 0)
5524 vect_slp_permute (m_perms[layout_i],
5525 SLP_TREE_SCALAR_STMTS (node), true);
5526
5527 /* Update load and lane permutations. */
5528 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5529 {
5530 /* First try to absorb the input vector layouts. If that fails,
5531 force the inputs to have layout LAYOUT_I too. We checked that
5532 that was possible before deciding to use nonzero output layouts.
5533 (Note that at this stage we don't really have any guarantee that
5534 the target supports the original VEC_PERM_EXPR.) */
5535 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5536 auto_lane_permutation_t tmp_perm;
5537 tmp_perm.safe_splice (perm);
5538 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5539 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5540 tmp_perm,
5541 SLP_TREE_CHILDREN (node),
5542 false) >= 0)
5543 {
5544 if (dump_enabled_p ()
5545 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5546 perm.begin ()))
5547 dump_printf_loc (MSG_NOTE, vect_location,
5548 "absorbing input layouts into %p\n",
5549 (void *) node);
5550 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5551 bitmap_set_bit (fully_folded, node_i);
5552 }
5553 else
5554 {
5555 /* Not MSG_MISSED because it would make no sense to users. */
5556 if (dump_enabled_p ())
5557 dump_printf_loc (MSG_NOTE, vect_location,
5558 "failed to absorb input layouts into %p\n",
5559 (void *) node);
5560 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5561 }
5562 }
5563 else
5564 {
5565 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5566 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5567 if (layout_i > 0)
5568 /* ??? When we handle non-bijective permutes the idea
5569 is that we can force the load-permutation to be
5570 { min, min + 1, min + 2, ... max }. But then the
5571 scalar defs might no longer match the lane content
5572 which means wrong-code with live lane vectorization.
5573 So we possibly have to have NULL entries for those. */
5574 vect_slp_permute (m_perms[layout_i], load_perm, true);
5575 }
5576 }
5577
5578 /* Do this before any nodes disappear, since it involves a walk
5579 over the leaves. */
5580 remove_redundant_permutations ();
5581
5582 /* Replace each child with a correctly laid-out version. */
5583 for (unsigned int node_i : m_partitioned_nodes)
5584 {
5585 /* Skip nodes that have already been handled above. */
5586 if (bitmap_bit_p (fully_folded, node_i))
5587 continue;
5588
5589 auto &vertex = m_vertices[node_i];
5590 int in_layout_i = m_partitions[vertex.partition].layout;
5591 gcc_assert (in_layout_i >= 0);
5592
5593 unsigned j;
5594 slp_tree child;
5595 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5596 {
5597 if (!child)
5598 continue;
5599
5600 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5601 if (new_child != child)
5602 {
5603 vect_free_slp_tree (child);
5604 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5605 new_child->refcnt += 1;
5606 }
5607 }
5608 }
5609 }
5610
5611 /* Elide load permutations that are not necessary. Such permutations might
5612 be pre-existing, rather than created by the layout optimizations. */
5613
5614 void
5615 vect_optimize_slp_pass::remove_redundant_permutations ()
5616 {
5617 for (unsigned int node_i : m_leafs)
5618 {
5619 slp_tree node = m_vertices[node_i].node;
5620 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5621 continue;
5622
5623 /* In basic block vectorization we allow any subchain of an interleaving
5624 chain.
5625 FORNOW: not in loop SLP because of realignment complications. */
5626 if (is_a <bb_vec_info> (m_vinfo))
5627 {
5628 bool subchain_p = true;
5629 stmt_vec_info next_load_info = NULL;
5630 stmt_vec_info load_info;
5631 unsigned j;
5632 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5633 {
5634 if (j != 0
5635 && (next_load_info != load_info
5636 || DR_GROUP_GAP (load_info) != 1))
5637 {
5638 subchain_p = false;
5639 break;
5640 }
5641 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5642 }
5643 if (subchain_p)
5644 {
5645 SLP_TREE_LOAD_PERMUTATION (node).release ();
5646 continue;
5647 }
5648 }
5649 else
5650 {
5651 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5652 stmt_vec_info load_info;
5653 bool this_load_permuted = false;
5654 unsigned j;
5655 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5656 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5657 {
5658 this_load_permuted = true;
5659 break;
5660 }
5661 /* When this isn't a grouped access we know it's a single element
5662 and contiguous. */
5663 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5664 {
5665 if (!this_load_permuted
5666 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5667 || SLP_TREE_LANES (node) == 1))
5668 SLP_TREE_LOAD_PERMUTATION (node).release ();
5669 continue;
5670 }
5671 stmt_vec_info first_stmt_info
5672 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5673 if (!this_load_permuted
5674 /* The load requires permutation when unrolling exposes
5675 a gap either because the group is larger than the SLP
5676 group-size or because there is a gap between the groups. */
5677 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5678 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5679 && DR_GROUP_GAP (first_stmt_info) == 0)))
5680 {
5681 SLP_TREE_LOAD_PERMUTATION (node).release ();
5682 continue;
5683 }
5684 }
5685 }
5686 }
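
/* For example, a load permutation such as { 0, 1, 2, 3 } on a contiguous
   group selects the lanes in their natural order and can simply be
   released, whereas { 2, 3, 0, 1 } must be kept.  A standalone identity
   check might look like this (illustrative only; the real conditions
   above also account for groups, gaps and the unrolling factor):

     #include <vector>

     static bool
     identity_perm_p (const std::vector<unsigned> &perm)
     {
       for (unsigned i = 0; i < perm.size (); ++i)
	 if (perm[i] != i)
	   return false;
       return true;
     }  */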
5687
5688 /* Print the partition graph and layout information to the dump file. */
5689
5690 void
5691 vect_optimize_slp_pass::dump ()
5692 {
5693 dump_printf_loc (MSG_NOTE, vect_location,
5694 "SLP optimize permutations:\n");
5695 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5696 {
5697 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5698 const char *sep = "";
5699 for (unsigned int idx : m_perms[layout_i])
5700 {
5701 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5702 sep = ", ";
5703 }
5704 dump_printf (MSG_NOTE, " }\n");
5705 }
5706 dump_printf_loc (MSG_NOTE, vect_location,
5707 "SLP optimize partitions:\n");
5708 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5709 ++partition_i)
5710 {
5711 auto &partition = m_partitions[partition_i];
5712 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5713 dump_printf_loc (MSG_NOTE, vect_location,
5714 " partition %d (layout %d):\n",
5715 partition_i, partition.layout);
5716 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5717 for (unsigned int order_i = partition.node_begin;
5718 order_i < partition.node_end; ++order_i)
5719 {
5720 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5721 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5722 (void *) vertex.node);
5723 dump_printf_loc (MSG_NOTE, vect_location,
5724 " weight: %f\n",
5725 vertex.weight.to_double ());
5726 if (vertex.out_degree)
5727 dump_printf_loc (MSG_NOTE, vect_location,
5728 " out weight: %f (degree %d)\n",
5729 vertex.out_weight.to_double (),
5730 vertex.out_degree);
5731 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5732 dump_printf_loc (MSG_NOTE, vect_location,
5733 " op: VEC_PERM_EXPR\n");
5734 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5735 dump_printf_loc (MSG_NOTE, vect_location,
5736 " op template: %G", rep->stmt);
5737 }
5738 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5739 for (unsigned int order_i = partition.node_begin;
5740 order_i < partition.node_end; ++order_i)
5741 {
5742 unsigned int node_i = m_partitioned_nodes[order_i];
5743 auto &vertex = m_vertices[node_i];
5744 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5745 {
5746 auto &other_vertex = m_vertices[other_node_i];
5747 if (other_vertex.partition < vertex.partition)
5748 dump_printf_loc (MSG_NOTE, vect_location,
5749 " - %p [%d] --> %p\n",
5750 (void *) other_vertex.node,
5751 other_vertex.partition,
5752 (void *) vertex.node);
5753 else
5754 dump_printf_loc (MSG_NOTE, vect_location,
5755 " - %p --> [%d] %p\n",
5756 (void *) vertex.node,
5757 other_vertex.partition,
5758 (void *) other_vertex.node);
5759 };
5760 for_each_partition_edge (node_i, print_edge);
5761 }
5762
5763 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5764 {
5765 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5766 if (layout_costs.is_possible ())
5767 {
5768 dump_printf_loc (MSG_NOTE, vect_location,
5769 " layout %d:%s\n", layout_i,
5770 partition.layout == int (layout_i)
5771 ? " (*)" : "");
5772 slpg_layout_cost combined_cost = layout_costs.in_cost;
5773 combined_cost.add_serial_cost (layout_costs.internal_cost);
5774 combined_cost.add_serial_cost (layout_costs.out_cost);
5775 #define TEMPLATE "{depth: %f, total: %f}"
5776 dump_printf_loc (MSG_NOTE, vect_location,
5777 " " TEMPLATE "\n",
5778 layout_costs.in_cost.depth.to_double (),
5779 layout_costs.in_cost.total.to_double ());
5780 dump_printf_loc (MSG_NOTE, vect_location,
5781 " + " TEMPLATE "\n",
5782 layout_costs.internal_cost.depth.to_double (),
5783 layout_costs.internal_cost.total.to_double ());
5784 dump_printf_loc (MSG_NOTE, vect_location,
5785 " + " TEMPLATE "\n",
5786 layout_costs.out_cost.depth.to_double (),
5787 layout_costs.out_cost.total.to_double ());
5788 dump_printf_loc (MSG_NOTE, vect_location,
5789 " = " TEMPLATE "\n",
5790 combined_cost.depth.to_double (),
5791 combined_cost.total.to_double ());
5792 #undef TEMPLATE
5793 }
5794 else
5795 dump_printf_loc (MSG_NOTE, vect_location,
5796 " layout %d: rejected\n", layout_i);
5797 }
5798 }
5799 }
5800
5801 /* Main entry point for the SLP graph optimization pass. */
5802
5803 void
5804 vect_optimize_slp_pass::run ()
5805 {
5806 build_graph ();
5807 create_partitions ();
5808 start_choosing_layouts ();
5809 if (m_perms.length () > 1)
5810 {
5811 forward_pass ();
5812 backward_pass ();
5813 if (dump_enabled_p ())
5814 dump ();
5815 materialize ();
5816 while (!m_perms.is_empty ())
5817 m_perms.pop ().release ();
5818 }
5819 else
5820 remove_redundant_permutations ();
5821 free_graph (m_slpg);
5822 }
5823
5824 /* Optimize the SLP graph of VINFO. */
5825
5826 void
5827 vect_optimize_slp (vec_info *vinfo)
5828 {
5829 if (vinfo->slp_instances.is_empty ())
5830 return;
5831 vect_optimize_slp_pass (vinfo).run ();
5832 }
5833
5834 /* Gather loads reachable from the individual SLP graph entries. */
5835
5836 void
5837 vect_gather_slp_loads (vec_info *vinfo)
5838 {
5839 unsigned i;
5840 slp_instance instance;
5841 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5842 {
5843 hash_set<slp_tree> visited;
5844 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5845 SLP_INSTANCE_TREE (instance), visited);
5846 }
5847 }
5848
5849
5850 /* For each possible SLP instance decide whether to SLP it and calculate the
5851 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
5852 to SLP at least one instance. */
5853
5854 bool
5855 vect_make_slp_decision (loop_vec_info loop_vinfo)
5856 {
5857 unsigned int i;
5858 poly_uint64 unrolling_factor = 1;
5859 const vec<slp_instance> &slp_instances
5860 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5861 slp_instance instance;
5862 int decided_to_slp = 0;
5863
5864 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5865
5866 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5867 {
5868 /* FORNOW: SLP if you can. */
5869 /* All unroll factors have the form:
5870
5871 GET_MODE_SIZE (vinfo->vector_mode) * X
5872
5873 for some rational X, so they must have a common multiple. */
5874 unrolling_factor
5875 = force_common_multiple (unrolling_factor,
5876 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5877
5878 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5879 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5880 loop-based vectorization. Such stmts will be marked as HYBRID. */
5881 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5882 decided_to_slp++;
5883 }
5884
5885 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5886
5887 if (decided_to_slp && dump_enabled_p ())
5888 {
5889 dump_printf_loc (MSG_NOTE, vect_location,
5890 "Decided to SLP %d instances. Unrolling factor ",
5891 decided_to_slp);
5892 dump_dec (MSG_NOTE, unrolling_factor);
5893 dump_printf (MSG_NOTE, "\n");
5894 }
5895
5896 return (decided_to_slp > 0);
5897 }
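
/* As an illustration of the unrolling-factor computation above: if one
   instance needs unrolling factor 2 and another needs 8, the forced
   common multiple is 8; for factors 4 and 6 it would be 12.  The numbers
   here are made up, but each factor is of the vector-size-times-rational
   form described in the comment, so a common multiple always exists.  */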
5898
5899 /* Private data for vect_detect_hybrid_slp. */
5900 struct vdhs_data
5901 {
5902 loop_vec_info loop_vinfo;
5903 vec<stmt_vec_info> *worklist;
5904 };
5905
5906 /* Walker for walk_gimple_op. */
5907
5908 static tree
5909 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5910 {
5911 walk_stmt_info *wi = (walk_stmt_info *)data;
5912 vdhs_data *dat = (vdhs_data *)wi->info;
5913
5914 if (wi->is_lhs)
5915 return NULL_TREE;
5916
5917 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5918 if (!def_stmt_info)
5919 return NULL_TREE;
5920 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5921 if (PURE_SLP_STMT (def_stmt_info))
5922 {
5923 if (dump_enabled_p ())
5924 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5925 def_stmt_info->stmt);
5926 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5927 dat->worklist->safe_push (def_stmt_info);
5928 }
5929
5930 return NULL_TREE;
5931 }
5932
5933 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
5934 if so; otherwise push it to WORKLIST. */
5935
5936 static void
5937 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5938 vec<stmt_vec_info> &worklist,
5939 stmt_vec_info stmt_info)
5940 {
5941 if (dump_enabled_p ())
5942 dump_printf_loc (MSG_NOTE, vect_location,
5943 "Processing hybrid candidate : %G", stmt_info->stmt);
5944 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5945 imm_use_iterator iter2;
5946 ssa_op_iter iter1;
5947 use_operand_p use_p;
5948 def_operand_p def_p;
5949 bool any_def = false;
5950 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5951 {
5952 any_def = true;
5953 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5954 {
5955 if (is_gimple_debug (USE_STMT (use_p)))
5956 continue;
5957 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5958 /* An out-of-loop use means this is a loop_vect sink. */
5959 if (!use_info)
5960 {
5961 if (dump_enabled_p ())
5962 dump_printf_loc (MSG_NOTE, vect_location,
5963 "Found loop_vect sink: %G", stmt_info->stmt);
5964 worklist.safe_push (stmt_info);
5965 return;
5966 }
5967 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5968 {
5969 if (dump_enabled_p ())
5970 dump_printf_loc (MSG_NOTE, vect_location,
5971 "Found loop_vect use: %G", use_info->stmt);
5972 worklist.safe_push (stmt_info);
5973 return;
5974 }
5975 }
5976 }
5977 /* No def means this is a loop_vect sink. */
5978 if (!any_def)
5979 {
5980 if (dump_enabled_p ())
5981 dump_printf_loc (MSG_NOTE, vect_location,
5982 "Found loop_vect sink: %G", stmt_info->stmt);
5983 worklist.safe_push (stmt_info);
5984 return;
5985 }
5986 if (dump_enabled_p ())
5987 dump_printf_loc (MSG_NOTE, vect_location,
5988 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5989 STMT_SLP_TYPE (stmt_info) = pure_slp;
5990 }
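
/* An illustrative example (made-up GIMPLE) of the classification done by
   maybe_push_to_hybrid_worklist and the walker above:

     _1 = *p_2;          // lane of an SLP load node, already pure_slp
     t_3 = _1 + 1;       // not an SLP lane, but every use of t_3 is a
			 // pure-SLP stmt -> marked pure_slp here
     s_4 = _1 * w_5;     // s_4 is used after the loop -> loop_vect sink,
			 // pushed to WORKLIST

   Draining the worklist in vect_detect_hybrid_slp then walks the operands
   of s_4, finds that _1 is defined by a PURE_SLP statement and marks that
   definition hybrid, so it is vectorized both as part of the SLP graph
   and by loop-based vectorization.  */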
5991
5992 /* Find stmts that must be both vectorized and SLPed. */
5993
5994 void
5995 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5996 {
5997 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5998
5999 /* All stmts participating in SLP are marked pure_slp, all other
6000 stmts are loop_vect.
6001 First collect all loop_vect stmts into a worklist.
6002 With SLP patterns not all original scalar stmts appear in
6003 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
6004 Rectify this here by doing a backward walk over the IL, considering a
6005 stmt loop_vect only when it is used by a loop_vect stmt and otherwise
6006 marking it pure_slp. */
6007 auto_vec<stmt_vec_info> worklist;
6008 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
6009 {
6010 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
6011 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
6012 gsi_next (&gsi))
6013 {
6014 gphi *phi = gsi.phi ();
6015 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
6016 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
6017 maybe_push_to_hybrid_worklist (loop_vinfo,
6018 worklist, stmt_info);
6019 }
6020 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
6021 gsi_prev (&gsi))
6022 {
6023 gimple *stmt = gsi_stmt (gsi);
6024 if (is_gimple_debug (stmt))
6025 continue;
6026 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
6027 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
6028 {
6029 for (gimple_stmt_iterator gsi2
6030 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
6031 !gsi_end_p (gsi2); gsi_next (&gsi2))
6032 {
6033 stmt_vec_info patt_info
6034 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
6035 if (!STMT_SLP_TYPE (patt_info)
6036 && STMT_VINFO_RELEVANT (patt_info))
6037 maybe_push_to_hybrid_worklist (loop_vinfo,
6038 worklist, patt_info);
6039 }
6040 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6041 }
6042 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
6043 maybe_push_to_hybrid_worklist (loop_vinfo,
6044 worklist, stmt_info);
6045 }
6046 }
6047
6048 /* Now that we have a worklist of non-SLP stmts, follow use->def chains and
6049 mark any SLP vectorized stmt as hybrid.
6050 ??? We're visiting def stmts N times (once for each non-SLP and
6051 once for each hybrid-SLP use). */
6052 walk_stmt_info wi;
6053 vdhs_data dat;
6054 dat.worklist = &worklist;
6055 dat.loop_vinfo = loop_vinfo;
6056 memset (&wi, 0, sizeof (wi));
6057 wi.info = (void *)&dat;
6058 while (!worklist.is_empty ())
6059 {
6060 stmt_vec_info stmt_info = worklist.pop ();
6061 /* Since SSA operands are not set up for pattern stmts we need
6062 to use walk_gimple_op. */
6063 wi.is_lhs = 0;
6064 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
6065 /* For gather/scatter make sure to walk the offset operand, which
6066 can be a scaling and conversion away. */
6067 gather_scatter_info gs_info;
6068 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
6069 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
6070 {
6071 int dummy;
6072 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
6073 }
6074 }
6075 }
6076
6077
6078 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
6079
6080 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
6081 : vec_info (vec_info::bb, shared),
6082 bbs (_bbs),
6083 roots (vNULL)
6084 {
6085 for (unsigned i = 0; i < bbs.length (); ++i)
6086 {
6087 if (i != 0)
6088 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6089 gsi_next (&si))
6090 {
6091 gphi *phi = si.phi ();
6092 gimple_set_uid (phi, 0);
6093 add_stmt (phi);
6094 }
6095 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6096 !gsi_end_p (gsi); gsi_next (&gsi))
6097 {
6098 gimple *stmt = gsi_stmt (gsi);
6099 gimple_set_uid (stmt, 0);
6100 if (is_gimple_debug (stmt))
6101 continue;
6102 add_stmt (stmt);
6103 }
6104 }
6105 }
6106
6107
6108 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
6109 stmts in the basic block. */
6110
6111 _bb_vec_info::~_bb_vec_info ()
6112 {
6113 /* Reset region marker. */
6114 for (unsigned i = 0; i < bbs.length (); ++i)
6115 {
6116 if (i != 0)
6117 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6118 gsi_next (&si))
6119 {
6120 gphi *phi = si.phi ();
6121 gimple_set_uid (phi, -1);
6122 }
6123 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6124 !gsi_end_p (gsi); gsi_next (&gsi))
6125 {
6126 gimple *stmt = gsi_stmt (gsi);
6127 gimple_set_uid (stmt, -1);
6128 }
6129 }
6130
6131 for (unsigned i = 0; i < roots.length (); ++i)
6132 {
6133 roots[i].stmts.release ();
6134 roots[i].roots.release ();
6135 roots[i].remain.release ();
6136 }
6137 roots.release ();
6138 }
6139
6140 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
6141 given that its child nodes have already been processed and that
6142 their def types currently match their SLP node's def type. */
6143
6144 static bool
6145 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
6146 slp_instance node_instance,
6147 stmt_vector_for_cost *cost_vec)
6148 {
6149 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6150
6151 /* Calculate the number of vector statements to be created for the
6152 scalar stmts in this node. For SLP reductions it is equal to the
6153 number of vector statements in the children (which has already been
6154 calculated by the recursive call). Otherwise it is the number of
6155 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
6156 VF divided by the number of elements in a vector. */
6157 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6158 && !STMT_VINFO_DATA_REF (stmt_info)
6159 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6160 {
6161 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6162 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6163 {
6164 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6165 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6166 break;
6167 }
6168 }
6169 else
6170 {
6171 poly_uint64 vf;
6172 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6173 vf = loop_vinfo->vectorization_factor;
6174 else
6175 vf = 1;
6176 unsigned int group_size = SLP_TREE_LANES (node);
6177 tree vectype = SLP_TREE_VECTYPE (node);
6178 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6179 = vect_get_num_vectors (vf * group_size, vectype);
6180 }
6181
6182 /* Handle purely internal nodes. */
6183 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6184 {
6185 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6186 return false;
6187
6188 stmt_vec_info slp_stmt_info;
6189 unsigned int i;
6190 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6191 {
6192 if (STMT_VINFO_LIVE_P (slp_stmt_info)
6193 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6194 node_instance, i,
6195 false, cost_vec))
6196 return false;
6197 }
6198 return true;
6199 }
6200
6201 bool dummy;
6202 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6203 node, node_instance, cost_vec);
6204 }
6205
6206 /* Try to build NODE from scalars, returning true on success.
6207 NODE_INSTANCE is the SLP instance that contains NODE. */
6208
6209 static bool
6210 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6211 slp_instance node_instance)
6212 {
6213 stmt_vec_info stmt_info;
6214 unsigned int i;
6215
6216 if (!is_a <bb_vec_info> (vinfo)
6217 || node == SLP_INSTANCE_TREE (node_instance)
6218 || !SLP_TREE_SCALAR_STMTS (node).exists ()
6219 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6220 /* Force the mask use to be built from scalars instead. */
6221 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6222 return false;
6223
6224 if (dump_enabled_p ())
6225 dump_printf_loc (MSG_NOTE, vect_location,
6226 "Building vector operands of %p from scalars instead\n",
6227 (void *) node);
6228
6229 /* Don't remove and free the child nodes here, since they could be
6230 referenced by other structures. The analysis and scheduling phases
6231 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6232 unsigned int group_size = SLP_TREE_LANES (node);
6233 SLP_TREE_DEF_TYPE (node) = vect_external_def;
6234 /* Invariants get their vector type from the uses. */
6235 SLP_TREE_VECTYPE (node) = NULL_TREE;
6236 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6237 SLP_TREE_LOAD_PERMUTATION (node).release ();
6238 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6239 {
6240 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6241 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6242 }
6243 return true;
6244 }
6245
6246 /* Return true if all elements of the slice are the same. */
6247 bool
6248 vect_scalar_ops_slice::all_same_p () const
6249 {
6250 for (unsigned int i = 1; i < length; ++i)
6251 if (!operand_equal_p (op (0), op (i)))
6252 return false;
6253 return true;
6254 }
6255
6256 hashval_t
6257 vect_scalar_ops_slice_hash::hash (const value_type &s)
6258 {
6259 hashval_t hash = 0;
6260 for (unsigned i = 0; i < s.length; ++i)
6261 hash = iterative_hash_expr (s.op (i), hash);
6262 return hash;
6263 }
6264
6265 bool
6266 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6267 const compare_type &s2)
6268 {
6269 if (s1.length != s2.length)
6270 return false;
6271 for (unsigned i = 0; i < s1.length; ++i)
6272 if (!operand_equal_p (s1.op (i), s2.op (i)))
6273 return false;
6274 return true;
6275 }
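
/* A standalone sketch (not part of this file) of the slice hashing and
   comparison idea, with std::string standing in for tree operands and
   std::hash replacing iterative_hash_expr; the real code compares
   operands with operand_equal_p.

     #include <string>
     #include <vector>
     #include <functional>

     static size_t
     slice_hash (const std::vector<std::string> &ops,
		 unsigned start, unsigned len)
     {
       size_t h = 0;
       for (unsigned i = 0; i < len; ++i)
	 h = h * 131 + std::hash<std::string> () (ops[start + i]);
       return h;
     }

     static bool
     slice_equal (const std::vector<std::string> &ops,
		  unsigned start1, unsigned start2, unsigned len)
     {
       for (unsigned i = 0; i < len; ++i)
	 if (ops[start1 + i] != ops[start2 + i])
	   return false;
       return true;
     }  */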
6276
6277 /* Compute the prologue cost for invariant or constant operands represented
6278 by NODE. */
6279
6280 static void
6281 vect_prologue_cost_for_slp (slp_tree node,
6282 stmt_vector_for_cost *cost_vec)
6283 {
6284 /* There's a special case of an existing vector, which costs nothing. */
6285 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6286 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6287 return;
6288 /* Without looking at the actual initializer a vector of
6289 constants can be implemented as a load from the constant pool.
6290 When all elements are the same we can use a splat. */
6291 tree vectype = SLP_TREE_VECTYPE (node);
6292 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6293 unsigned HOST_WIDE_INT const_nunits;
6294 unsigned nelt_limit;
6295 auto ops = &SLP_TREE_SCALAR_OPS (node);
6296 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6297 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6298 && ! multiple_p (const_nunits, group_size))
6299 {
6300 nelt_limit = const_nunits;
6301 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6302 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6303 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6304 starts.quick_push (i * const_nunits);
6305 }
6306 else
6307 {
6308 /* If either the vector has variable length or the vectors
6309 are composed of repeated whole groups we only need to
6310 cost construction once. All vectors will be the same. */
6311 nelt_limit = group_size;
6312 starts.quick_push (0);
6313 }
6314 /* ??? We're just tracking whether vectors in a single node are the same.
6315 Ideally we'd do something more global. */
6316 bool passed = false;
6317 for (unsigned int start : starts)
6318 {
6319 vect_cost_for_stmt kind;
6320 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6321 kind = vector_load;
6322 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6323 kind = scalar_to_vec;
6324 else
6325 kind = vec_construct;
6326 /* The target cost hook has no idea which part of the SLP node
6327 we are costing so avoid passing it down more than once. Pass
6328 it to the first vec_construct or scalar_to_vec part since for those
6329 the x86 backend tries to account for GPR to XMM register moves. */
6330 record_stmt_cost (cost_vec, 1, kind,
6331 (kind != vector_load && !passed) ? node : nullptr,
6332 vectype, 0, vect_prologue);
6333 if (kind != vector_load)
6334 passed = true;
6335 }
6336 }
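
/* Some illustrative examples of the classification above: a constant node
   { 1, 2, 3, 4 } is costed as a single vector_load from the constant
   pool; an external node { x_1, x_1, x_1, x_1 } as one scalar_to_vec
   splat; and an external node { a_1, b_2, c_3, d_4 } as one
   vec_construct.  For { a_1, b_2, a_1, b_2 } built as two two-element
   vectors, the second slice is found in the set of already-costed slices
   and only one construction is costed.  */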
6337
6338 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6339 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
6340
6341 Return true if the operations are supported. */
6342
6343 static bool
6344 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6345 slp_instance node_instance,
6346 hash_set<slp_tree> &visited_set,
6347 vec<slp_tree> &visited_vec,
6348 stmt_vector_for_cost *cost_vec)
6349 {
6350 int i, j;
6351 slp_tree child;
6352
6353 /* Assume we can code-generate all invariants. */
6354 if (!node
6355 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6356 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6357 return true;
6358
6359 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6360 {
6361 if (dump_enabled_p ())
6362 dump_printf_loc (MSG_NOTE, vect_location,
6363 "Failed cyclic SLP reference in %p\n", (void *) node);
6364 return false;
6365 }
6366 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6367
6368 /* If we already analyzed the exact same set of scalar stmts we're done.
6369 We share the generated vector stmts for those. */
6370 if (visited_set.add (node))
6371 return true;
6372 visited_vec.safe_push (node);
6373
6374 bool res = true;
6375 unsigned visited_rec_start = visited_vec.length ();
6376 unsigned cost_vec_rec_start = cost_vec->length ();
6377 bool seen_non_constant_child = false;
6378 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6379 {
6380 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6381 visited_set, visited_vec,
6382 cost_vec);
6383 if (!res)
6384 break;
6385 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6386 seen_non_constant_child = true;
6387 }
6388 /* We're having difficulties scheduling nodes with just constant
6389 operands and no scalar stmts since we then cannot compute a stmt
6390 insertion place. */
6391 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6392 {
6393 if (dump_enabled_p ())
6394 dump_printf_loc (MSG_NOTE, vect_location,
6395 "Cannot vectorize all-constant op node %p\n",
6396 (void *) node);
6397 res = false;
6398 }
6399
6400 if (res)
6401 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6402 cost_vec);
6403 /* If analysis failed we have to pop all recursive visited nodes
6404 plus ourselves. */
6405 if (!res)
6406 {
6407 while (visited_vec.length () >= visited_rec_start)
6408 visited_set.remove (visited_vec.pop ());
6409 cost_vec->truncate (cost_vec_rec_start);
6410 }
6411
6412 /* When the node can be vectorized, cost the invariant nodes it references.
6413 This is not done in DFS order, to allow the referring node's
6414 vectorizable_* calls to nail down the invariant nodes' vector type
6415 and possibly unshare it if it needs a different vector type than
6416 other referrers. */
6417 if (res)
6418 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6419 if (child
6420 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6421 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6422 /* Perform usual caching, note code-generation still
6423 code-gens these nodes multiple times but we expect
6424 to CSE them later. */
6425 && !visited_set.add (child))
6426 {
6427 visited_vec.safe_push (child);
6428 /* ??? After auditing more code paths make a "default"
6429 and push the vector type from NODE to all children
6430 if it is not already set. */
6431 /* Compute the number of vectors to be generated. */
6432 tree vector_type = SLP_TREE_VECTYPE (child);
6433 if (!vector_type)
6434 {
6435 /* For shifts with a scalar argument we don't need
6436 to cost or code-generate anything.
6437 ??? Represent this more explicitly. */
6438 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6439 == shift_vec_info_type)
6440 && j == 1);
6441 continue;
6442 }
6443 unsigned group_size = SLP_TREE_LANES (child);
6444 poly_uint64 vf = 1;
6445 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6446 vf = loop_vinfo->vectorization_factor;
6447 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6448 = vect_get_num_vectors (vf * group_size, vector_type);
6449 /* And cost them. */
6450 vect_prologue_cost_for_slp (child, cost_vec);
6451 }
6452
6453 /* If this node or any of its children can't be vectorized, try pruning
6454 the tree here rather than felling the whole thing. */
6455 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6456 {
6457 /* We'll need to revisit this for invariant costing and number
6458 of vectorized stmt setting. */
6459 res = true;
6460 }
6461
6462 return res;
6463 }
6464
6465 /* Given a definition DEF, analyze whether it will have any live scalar use
6466 after performing the SLP vectorization described by BB_VINFO, and record
6467 the result in hash map SCALAR_USE_MAP as a cache for later fast checks.
6468 If the recursion DEPTH exceeds a limit, stop the analysis and make a
6469 conservative assumption. Return 0 if there is no scalar use, 1 if there
6470 is one, and -1 if the recursion limit was hit. */
6471
6472 static int
6473 vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
6474 hash_map<tree, int> &scalar_use_map,
6475 int depth = 0)
6476 {
6477 const int depth_limit = 2;
6478 imm_use_iterator use_iter;
6479 gimple *use_stmt;
6480
6481 if (int *res = scalar_use_map.get (def))
6482 return *res;
6483
6484 int scalar_use = 1;
6485
6486 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
6487 {
6488 if (is_gimple_debug (use_stmt))
6489 continue;
6490
6491 stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6492
6493 if (!use_stmt_info)
6494 break;
6495
6496 if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6497 continue;
6498
6499 /* Do not step forward when encountering a PHI statement, since it may
6500 involve a cyclic reference and cause infinite recursion. */
6501 if (gimple_code (use_stmt) == GIMPLE_PHI)
6502 break;
6503
6504 /* When pattern recognition is involved, a statement whose definition is
6505 consumed in some pattern may not be included in the final replacement
6506 pattern statements, and so would be skipped when building the SLP graph.
6507
6508 * Original
6509 char a_c = *(char *) a;
6510 char b_c = *(char *) b;
6511 unsigned short a_s = (unsigned short) a_c;
6512 int a_i = (int) a_s;
6513 int b_i = (int) b_c;
6514 int r_i = a_i - b_i;
6515
6516 * After pattern replacement
6517 a_s = (unsigned short) a_c;
6518 a_i = (int) a_s;
6519
6520 patt_b_s = (unsigned short) b_c; // b_i = (int) b_c
6521 patt_b_i = (int) patt_b_s; // b_i = (int) b_c
6522
6523 patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i
6524 patt_r_i = (int) patt_r_s; // r_i = a_i - b_i
6525
6526 The definitions of a_i (original statement) and b_i (pattern statement)
6527 are related to, but not actually part of, the widen_minus pattern.
6528 Vectorizing the pattern does not cause these definition statements to
6529 be marked as PURE_SLP. For this case, we need to recursively check
6530 whether their uses are all absorbed into vectorized code. There is an
6531 exception, however: a use may participate in a vectorized operation
6532 via an external SLP node containing that use as an element. The
6533 parameter "scalar_use_map" tags such SSA names as having a scalar
6534 use in advance. */
6535 tree lhs = gimple_get_lhs (use_stmt);
6536
6537 if (!lhs || TREE_CODE (lhs) != SSA_NAME)
6538 break;
6539
6540 if (depth_limit && depth >= depth_limit)
6541 return -1;
6542
6543 if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
6544 depth + 1)))
6545 break;
6546 }
6547
6548 if (end_imm_use_stmt_p (&use_iter))
6549 scalar_use = 0;
6550
6551 /* If recursion is limited, do not cache result for non-root defs. */
6552 if (!depth || scalar_use >= 0)
6553 {
6554 bool added = scalar_use_map.put (def, scalar_use);
6555 gcc_assert (!added);
6556 }
6557
6558 return scalar_use;
6559 }
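
/* Applying this to the pattern example in the comment inside the function:
   for DEF a_i, the only use is r_i = a_i - b_i, and that statement's
   vectorized form is the pure-SLP widen_minus pattern, so the walk skips
   it, runs out of uses and returns 0, meaning the scalar computation of
   a_i need not be kept.  If a_i also had a use outside the vectorized
   region (with no stmt_vec_info), the walk would stop there and return 1.  */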
6560
6561 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6562 region and that can be vectorized using vectorizable_live_operation
6563 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
6564 the scalar code computing them to be retained. */
6565
6566 static void
6567 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6568 slp_instance instance,
6569 stmt_vector_for_cost *cost_vec,
6570 hash_map<tree, int> &scalar_use_map,
6571 hash_set<stmt_vec_info> &svisited,
6572 hash_set<slp_tree> &visited)
6573 {
6574 if (visited.add (node))
6575 return;
6576
6577 unsigned i;
6578 stmt_vec_info stmt_info;
6579 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6580 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6581 {
6582 if (svisited.contains (stmt_info))
6583 continue;
6584 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6585 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6586 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6587 /* Only the pattern root stmt computes the original scalar value. */
6588 continue;
6589 bool mark_visited = true;
6590 gimple *orig_stmt = orig_stmt_info->stmt;
6591 ssa_op_iter op_iter;
6592 def_operand_p def_p;
6593 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6594 {
6595 if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
6596 scalar_use_map))
6597 {
6598 STMT_VINFO_LIVE_P (stmt_info) = true;
6599 if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
6600 instance, i, false, cost_vec))
6601 /* ??? So we know we can vectorize the live stmt from one SLP
6602 node. If we cannot do so from all or none consistently
6603 we'd have to record which SLP node (and lane) we want to
6604 use for the live operation. So make sure we can
6605 code-generate from all nodes. */
6606 mark_visited = false;
6607 else
6608 STMT_VINFO_LIVE_P (stmt_info) = false;
6609 }
6610
6611 /* We have to verify whether we can insert the lane extract
6612 before all uses. The following is a conservative approximation.
6613 We cannot put this into vectorizable_live_operation because
6614 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6615 doesn't work.
6616 Note that while the fact that we emit code for loads at the
6617 first load should make this a non-problem, leafs we construct
6618 from scalars are vectorized after the last scalar def.
6619 ??? If we'd actually compute the insert location during
6620 analysis we could use sth less conservative than the last
6621 scalar stmt in the node for the dominance check. */
6622 /* ??? What remains is "live" uses in vector CTORs in the same
6623 SLP graph which is where those uses can end up code-generated
6624 right after their definition instead of close to their original
6625 use. But that would restrict us to code-generating lane-extracts
6626 from the latest stmt in a node. So we compensate for this
6627 during code-generation, simply not replacing uses for those
6628 hopefully rare cases. */
6629 imm_use_iterator use_iter;
6630 gimple *use_stmt;
6631 stmt_vec_info use_stmt_info;
6632
6633 if (STMT_VINFO_LIVE_P (stmt_info))
6634 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6635 if (!is_gimple_debug (use_stmt)
6636 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6637 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6638 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6639 {
6640 if (dump_enabled_p ())
6641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6642 "Cannot determine insertion place for "
6643 "lane extract\n");
6644 STMT_VINFO_LIVE_P (stmt_info) = false;
6645 mark_visited = true;
6646 }
6647 }
6648 if (mark_visited)
6649 svisited.add (stmt_info);
6650 }
6651
6652 slp_tree child;
6653 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6654 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6655 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
6656 scalar_use_map, svisited, visited);
6657 }
6658
6659 /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
6660 are live outside of the basic-block vectorized region and that can be
6661 vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
6662
6663 static void
6664 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
6665 {
6666 if (bb_vinfo->slp_instances.is_empty ())
6667 return;
6668
6669 hash_set<stmt_vec_info> svisited;
6670 hash_set<slp_tree> visited;
6671 hash_map<tree, int> scalar_use_map;
6672 auto_vec<slp_tree> worklist;
6673
6674 for (slp_instance instance : bb_vinfo->slp_instances)
6675 {
6676 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
6677 for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
6678 if (TREE_CODE (op) == SSA_NAME)
6679 scalar_use_map.put (op, 1);
6680 if (!visited.add (SLP_INSTANCE_TREE (instance)))
6681 worklist.safe_push (SLP_INSTANCE_TREE (instance));
6682 }
6683
6684 do
6685 {
6686 slp_tree node = worklist.pop ();
6687
6688 if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
6689 {
6690 for (tree op : SLP_TREE_SCALAR_OPS (node))
6691 if (TREE_CODE (op) == SSA_NAME)
6692 scalar_use_map.put (op, 1);
6693 }
6694 else
6695 {
6696 for (slp_tree child : SLP_TREE_CHILDREN (node))
6697 if (child && !visited.add (child))
6698 worklist.safe_push (child);
6699 }
6700 }
6701 while (!worklist.is_empty ());
6702
6703 visited.empty ();
6704
6705 for (slp_instance instance : bb_vinfo->slp_instances)
6706 {
6707 vect_location = instance->location ();
6708 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6709 instance, &instance->cost_vec,
6710 scalar_use_map, svisited, visited);
6711 }
6712 }
6713
6714 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6715
6716 static bool
6717 vectorizable_bb_reduc_epilogue (slp_instance instance,
6718 stmt_vector_for_cost *cost_vec)
6719 {
6720 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6721 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6722 if (reduc_code == MINUS_EXPR)
6723 reduc_code = PLUS_EXPR;
6724 internal_fn reduc_fn;
6725 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6726 if (!vectype
6727 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6728 || reduc_fn == IFN_LAST
6729 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6730 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6731 TREE_TYPE (vectype)))
6732 {
6733 if (dump_enabled_p ())
6734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6735 "not vectorized: basic block reduction epilogue "
6736 "operation unsupported.\n");
6737 return false;
6738 }
6739
6740 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6741 cost log2 vector operations plus shuffles and one extraction. */
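  /* Illustrative numbers only: for a four-element vectype,
     vect_nunits_for_cost returns 4, so STEPS is floor_log2 (4) == 2 and we
     record two vector_stmt, two vec_perm and one vec_to_scalar cost below.  */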
6742 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6743 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6744 vectype, 0, vect_body);
6745 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6746 vectype, 0, vect_body);
6747 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6748 vectype, 0, vect_body);
6749
6750 /* Since we replace all stmts of a possibly longer scalar reduction
6751 chain account for the extra scalar stmts for that. */
6752 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
6753 instance->root_stmts[0], 0, vect_body);
6754 return true;
6755 }
6756
6757 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6758 and recurse to children. */
6759
6760 static void
6761 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6762 hash_set<slp_tree> &visited)
6763 {
6764 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6765 || visited.add (node))
6766 return;
6767
6768 stmt_vec_info stmt;
6769 unsigned i;
6770 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6771 roots.remove (vect_orig_stmt (stmt));
6772
6773 slp_tree child;
6774 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6775 if (child)
6776 vect_slp_prune_covered_roots (child, roots, visited);
6777 }
6778
6779 /* Analyze statements in SLP instances of VINFO. Return true if the
6780 operations are supported. */
6781
6782 bool
6783 vect_slp_analyze_operations (vec_info *vinfo)
6784 {
6785 slp_instance instance;
6786 int i;
6787
6788 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6789
6790 hash_set<slp_tree> visited;
6791 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6792 {
6793 auto_vec<slp_tree> visited_vec;
6794 stmt_vector_for_cost cost_vec;
6795 cost_vec.create (2);
6796 if (is_a <bb_vec_info> (vinfo))
6797 vect_location = instance->location ();
6798 if (!vect_slp_analyze_node_operations (vinfo,
6799 SLP_INSTANCE_TREE (instance),
6800 instance, visited, visited_vec,
6801 &cost_vec)
6802 /* CTOR instances require vectorized defs for the SLP tree root. */
6803 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6804 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6805 != vect_internal_def
6806 /* Make sure we vectorized with the expected type. */
6807 || !useless_type_conversion_p
6808 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6809 (instance->root_stmts[0]->stmt))),
6810 TREE_TYPE (SLP_TREE_VECTYPE
6811 (SLP_INSTANCE_TREE (instance))))))
6812 /* Check we can vectorize the reduction. */
6813 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6814 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6815 {
6816 slp_tree node = SLP_INSTANCE_TREE (instance);
6817 stmt_vec_info stmt_info;
6818 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6819 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6820 else
6821 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6822 if (dump_enabled_p ())
6823 dump_printf_loc (MSG_NOTE, vect_location,
6824 "removing SLP instance operations starting from: %G",
6825 stmt_info->stmt);
6826 vect_free_slp_instance (instance);
6827 vinfo->slp_instances.ordered_remove (i);
6828 cost_vec.release ();
6829 while (!visited_vec.is_empty ())
6830 visited.remove (visited_vec.pop ());
6831 }
6832 else
6833 {
6834 i++;
6835 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6836 {
6837 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6838 cost_vec.release ();
6839 }
6840 else
6841 /* For BB vectorization remember the SLP graph entry
6842 cost for later. */
6843 instance->cost_vec = cost_vec;
6844 }
6845 }
6846
6847 /* Now look for SLP instances with a root that are covered by other
6848 instances and remove them. */
6849 hash_set<stmt_vec_info> roots;
6850 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6851 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6852 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6853 if (!roots.is_empty ())
6854 {
6855 visited.empty ();
6856 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6857 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6858 visited);
6859 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6860 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6861 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6862 {
6863 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6864 if (dump_enabled_p ())
6865 dump_printf_loc (MSG_NOTE, vect_location,
6866 "removing SLP instance operations starting "
6867 "from: %G", root->stmt);
6868 vect_free_slp_instance (instance);
6869 vinfo->slp_instances.ordered_remove (i);
6870 }
6871 else
6872 ++i;
6873 }
6874
6875 /* Compute vectorizable live stmts. */
6876 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6877 vect_bb_slp_mark_live_stmts (bb_vinfo);
6878
6879 return !vinfo->slp_instances.is_empty ();
6880 }
6881
6882 /* Get the SLP instance leader from INSTANCE_LEADER, thereby transitively
6883 closing any intermediate chain (union-find style path compression). */
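/* A minimal illustration with hypothetical instances A, B and C: for a
   recorded chain A -> B -> C -> C this returns C and rewrites the entries
   for A and B to point directly at C.  */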
6884
6885 static slp_instance
6886 get_ultimate_leader (slp_instance instance,
6887 hash_map<slp_instance, slp_instance> &instance_leader)
6888 {
6889 auto_vec<slp_instance *, 8> chain;
6890 slp_instance *tem;
6891 while (*(tem = instance_leader.get (instance)) != instance)
6892 {
6893 chain.safe_push (tem);
6894 instance = *tem;
6895 }
6896 while (!chain.is_empty ())
6897 *chain.pop () = instance;
6898 return instance;
6899 }
6900
6901 namespace {
6902 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6903 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6904 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6905
6906 INSTANCE_LEADER is as for get_ultimate_leader. */
6907
6908 template<typename T>
6909 bool
6910 vect_map_to_instance (slp_instance instance, T key,
6911 hash_map<T, slp_instance> &key_to_instance,
6912 hash_map<slp_instance, slp_instance> &instance_leader)
6913 {
6914 bool existed_p;
6915 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6916 if (!existed_p)
6917 ;
6918 else if (key_instance != instance)
6919 {
6920 /* If we're running into a previously marked key make us the
6921 leader of the current ultimate leader. This keeps the
6922 leader chain acyclic and works even when the current instance
6923 connects two previously independent graph parts. */
6924 slp_instance key_leader
6925 = get_ultimate_leader (key_instance, instance_leader);
6926 if (key_leader != instance)
6927 instance_leader.put (key_leader, instance);
6928 }
6929 key_instance = instance;
6930 return existed_p;
6931 }
6932 }
6933
6934 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6935
6936 static void
6937 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6938 slp_instance instance, slp_tree node,
6939 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6940 hash_map<slp_tree, slp_instance> &node_to_instance,
6941 hash_map<slp_instance, slp_instance> &instance_leader)
6942 {
6943 stmt_vec_info stmt_info;
6944 unsigned i;
6945
6946 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6947 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6948 instance_leader);
6949
6950 if (vect_map_to_instance (instance, node, node_to_instance,
6951 instance_leader))
6952 return;
6953
6954 slp_tree child;
6955 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6956 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6957 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6958 node_to_instance, instance_leader);
6959 }
6960
6961 /* Partition the SLP graph into pieces that can be costed independently. */
6962
6963 static void
6964 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6965 {
6966 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6967
6968 /* First walk the SLP graph assigning each involved scalar stmt a
6969 corresponding SLP graph entry and upon visiting a previously
6970 marked stmt, make the stmt's leader the current SLP graph entry. */
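  /* As an illustration (hypothetical instances): if instances I1 and I2 both
     contain scalar stmt S, visiting S again while walking I2 records I1's
     current ultimate leader as being led by I2, so both instances end up in
     the same subgraph below.  */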
6971 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6972 hash_map<slp_tree, slp_instance> node_to_instance;
6973 hash_map<slp_instance, slp_instance> instance_leader;
6974 slp_instance instance;
6975 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6976 {
6977 instance_leader.put (instance, instance);
6978 vect_bb_partition_graph_r (bb_vinfo,
6979 instance, SLP_INSTANCE_TREE (instance),
6980 stmt_to_instance, node_to_instance,
6981 instance_leader);
6982 }
6983
6984 /* Then collect entries to each independent subgraph. */
6985 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6986 {
6987 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6988 leader->subgraph_entries.safe_push (instance);
6989 if (dump_enabled_p ()
6990 && leader != instance)
6991 dump_printf_loc (MSG_NOTE, vect_location,
6992 "instance %p is leader of %p\n",
6993 (void *) leader, (void *) instance);
6994 }
6995 }
6996
6997 /* Compute the set of scalar stmts participating in internal and external
6998 nodes. */
6999
7000 static void
7001 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
7002 hash_set<slp_tree> &visited,
7003 hash_set<stmt_vec_info> &vstmts,
7004 hash_set<stmt_vec_info> &estmts)
7005 {
7006 int i;
7007 stmt_vec_info stmt_info;
7008 slp_tree child;
7009
7010 if (visited.add (node))
7011 return;
7012
7013 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
7014 {
7015 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7016 vstmts.add (stmt_info);
7017
7018 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7019 if (child)
7020 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
7021 vstmts, estmts);
7022 }
7023 else
7024 for (tree def : SLP_TREE_SCALAR_OPS (node))
7025 {
7026 stmt_vec_info def_stmt = vinfo->lookup_def (def);
7027 if (def_stmt)
7028 estmts.add (def_stmt);
7029 }
7030 }
7031
7032
7033 /* Compute the scalar cost of the SLP node NODE and its children
7034 and record it in COST_VEC. Do not account defs that are marked in LIFE
7035 and update LIFE according to uses of NODE. */
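/* A set bit LIFE[i] means the scalar stmt of lane i is (or has become) live
   outside the vectorized parts and is therefore not counted; the loop below
   marks further lanes live when it finds a non-vectorized use of their
   defs.  */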
7036
7037 static void
7038 vect_bb_slp_scalar_cost (vec_info *vinfo,
7039 slp_tree node, vec<bool, va_heap> *life,
7040 stmt_vector_for_cost *cost_vec,
7041 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
7042 hash_set<slp_tree> &visited)
7043 {
7044 unsigned i;
7045 stmt_vec_info stmt_info;
7046 slp_tree child;
7047
7048 if (visited.add (node))
7049 return;
7050
7051 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7052 {
7053 ssa_op_iter op_iter;
7054 def_operand_p def_p;
7055
7056 if ((*life)[i])
7057 continue;
7058
7059 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
7060 gimple *orig_stmt = orig_stmt_info->stmt;
7061
7062 /* If there is a non-vectorized use of the defs then the scalar
7063 stmt is kept live in which case we do not account it or any
7064 required defs in the SLP children in the scalar cost. This
7065 way we make the vectorization more costly when compared to
7066 the scalar cost. */
7067 if (!STMT_VINFO_LIVE_P (stmt_info))
7068 {
7069 auto_vec<gimple *, 8> worklist;
7070 hash_set<gimple *> *worklist_visited = NULL;
7071 worklist.quick_push (orig_stmt);
7072 do
7073 {
7074 gimple *work_stmt = worklist.pop ();
7075 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
7076 {
7077 imm_use_iterator use_iter;
7078 gimple *use_stmt;
7079 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
7080 DEF_FROM_PTR (def_p))
7081 if (!is_gimple_debug (use_stmt))
7082 {
7083 stmt_vec_info use_stmt_info
7084 = vinfo->lookup_stmt (use_stmt);
7085 if (!use_stmt_info
7086 || !vectorized_scalar_stmts.contains (use_stmt_info))
7087 {
7088 if (use_stmt_info
7089 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
7090 {
7091 /* For stmts participating in patterns we have
7092 to check its uses recursively. */
7093 if (!worklist_visited)
7094 worklist_visited = new hash_set<gimple *> ();
7095 if (!worklist_visited->add (use_stmt))
7096 worklist.safe_push (use_stmt);
7097 continue;
7098 }
7099 (*life)[i] = true;
7100 goto next_lane;
7101 }
7102 }
7103 }
7104 }
7105 while (!worklist.is_empty ());
7106 next_lane:
7107 if (worklist_visited)
7108 delete worklist_visited;
7109 if ((*life)[i])
7110 continue;
7111 }
7112
7113 /* Count scalar stmts only once. */
7114 if (gimple_visited_p (orig_stmt))
7115 continue;
7116 gimple_set_visited (orig_stmt, true);
7117
7118 vect_cost_for_stmt kind;
7119 if (STMT_VINFO_DATA_REF (orig_stmt_info))
7120 {
7121 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
7122 kind = scalar_load;
7123 else
7124 kind = scalar_store;
7125 }
7126 else if (vect_nop_conversion_p (orig_stmt_info))
7127 continue;
7128 /* For single-argument PHIs assume coalescing which means zero cost
7129 for the scalar and the vector PHIs. This avoids artificially
7130 favoring the vector path (but may pessimize it in some cases). */
7131 else if (is_a <gphi *> (orig_stmt_info->stmt)
7132 && gimple_phi_num_args
7133 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
7134 continue;
7135 else
7136 kind = scalar_stmt;
7137 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
7138 SLP_TREE_VECTYPE (node), 0, vect_body);
7139 }
7140
7141 auto_vec<bool, 20> subtree_life;
7142 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7143 {
7144 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7145 {
7146 /* Do not directly pass LIFE to the recursive call, copy it to
7147 confine changes in the callee to the current child/subtree. */
7148 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7149 {
7150 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
7151 for (unsigned j = 0;
7152 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
7153 {
7154 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
7155 if (perm.first == i)
7156 subtree_life[perm.second] = (*life)[j];
7157 }
7158 }
7159 else
7160 {
7161 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
7162 subtree_life.safe_splice (*life);
7163 }
7164 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
7165 vectorized_scalar_stmts, visited);
7166 subtree_life.truncate (0);
7167 }
7168 }
7169 }
7170
7171 /* Comparator for the loop-index sorted cost vectors. */
7172
7173 static int
7174 li_cost_vec_cmp (const void *a_, const void *b_)
7175 {
7176 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
7177 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
7178 if (a->first < b->first)
7179 return -1;
7180 else if (a->first == b->first)
7181 return 0;
7182 return 1;
7183 }
7184
7185 /* Check if vectorization of the basic block is profitable for the
7186 subgraph denoted by SLP_INSTANCES. */
7187
7188 static bool
7189 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
7190 vec<slp_instance> slp_instances,
7191 loop_p orig_loop)
7192 {
7193 slp_instance instance;
7194 int i;
7195 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
7196 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
7197
7198 if (dump_enabled_p ())
7199 {
7200 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
7201 hash_set<slp_tree> visited;
7202 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7203 vect_print_slp_graph (MSG_NOTE, vect_location,
7204 SLP_INSTANCE_TREE (instance), visited);
7205 }
7206
7207 /* Compute the set of scalar stmts we know will go away 'locally' when
7208 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
7209 not accurate for nodes promoted extern late or for scalar stmts that
7210 are used both in extern defs and in vectorized defs. */
7211 hash_set<stmt_vec_info> vectorized_scalar_stmts;
7212 hash_set<stmt_vec_info> scalar_stmts_in_externs;
7213 hash_set<slp_tree> visited;
7214 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7215 {
7216 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
7217 SLP_INSTANCE_TREE (instance),
7218 visited,
7219 vectorized_scalar_stmts,
7220 scalar_stmts_in_externs);
7221 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
7222 vectorized_scalar_stmts.add (rstmt);
7223 }
7224 /* Scalar stmts used as defs in external nodes need to be preserved, so
7225 remove them from vectorized_scalar_stmts. */
7226 for (stmt_vec_info stmt : scalar_stmts_in_externs)
7227 vectorized_scalar_stmts.remove (stmt);
7228
7229 /* Calculate scalar cost and sum the cost for the vector stmts
7230 previously collected. */
7231 stmt_vector_for_cost scalar_costs = vNULL;
7232 stmt_vector_for_cost vector_costs = vNULL;
7233 visited.empty ();
7234 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7235 {
7236 auto_vec<bool, 20> life;
7237 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
7238 true);
7239 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7240 record_stmt_cost (&scalar_costs,
7241 SLP_INSTANCE_ROOT_STMTS (instance).length (),
7242 scalar_stmt,
7243 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
7244 vect_bb_slp_scalar_cost (bb_vinfo,
7245 SLP_INSTANCE_TREE (instance),
7246 &life, &scalar_costs, vectorized_scalar_stmts,
7247 visited);
7248 vector_costs.safe_splice (instance->cost_vec);
7249 instance->cost_vec.release ();
7250 }
7251
7252 if (dump_enabled_p ())
7253 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
7254
7255 /* When costing non-loop vectorization we need to consider each covered
7256 loop independently and make sure vectorization is profitable. For
7257 now we assume a loop may not be entered or may execute an arbitrary
7258 number of iterations (??? static information can provide more
7259 precise info here) which means we can simply cost each containing
7260 loop's stmts separately. */
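  /* Illustrative example: if the region spans both a loop preheader and the
     loop body, the stmts are keyed by their containing loop's number and the
     two parts are costed and compared separately below.  */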
7261
7262 /* First produce cost vectors sorted by loop index. */
7263 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7264 li_scalar_costs (scalar_costs.length ());
7265 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7266 li_vector_costs (vector_costs.length ());
7267 stmt_info_for_cost *cost;
7268 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7269 {
7270 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7271 li_scalar_costs.quick_push (std::make_pair (l, cost));
7272 }
7273 /* Use an arbitrary used loop as fallback in case the first vector_costs
7274 entry does not have a stmt_info associated with it. */
7275 unsigned l = li_scalar_costs[0].first;
7276 FOR_EACH_VEC_ELT (vector_costs, i, cost)
7277 {
7278 /* We inherit the loop from the previous COST; invariants, externals and
7279 extracts immediately follow the cost for the related stmt. */
7280 if (cost->stmt_info)
7281 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7282 li_vector_costs.quick_push (std::make_pair (l, cost));
7283 }
7284 li_scalar_costs.qsort (li_cost_vec_cmp);
7285 li_vector_costs.qsort (li_cost_vec_cmp);
7286
7287 /* Now cost the portions individually. */
7288 unsigned vi = 0;
7289 unsigned si = 0;
7290 bool profitable = true;
7291 while (si < li_scalar_costs.length ()
7292 && vi < li_vector_costs.length ())
7293 {
7294 unsigned sl = li_scalar_costs[si].first;
7295 unsigned vl = li_vector_costs[vi].first;
7296 if (sl != vl)
7297 {
7298 if (dump_enabled_p ())
7299 dump_printf_loc (MSG_NOTE, vect_location,
7300 "Scalar %d and vector %d loop part do not "
7301 "match up, skipping scalar part\n", sl, vl);
7302 /* Skip the scalar part, assuming zero cost on the vector side. */
7303 do
7304 {
7305 si++;
7306 }
7307 while (si < li_scalar_costs.length ()
7308 && li_scalar_costs[si].first == sl);
7309 continue;
7310 }
7311
7312 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7313 do
7314 {
7315 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7316 si++;
7317 }
7318 while (si < li_scalar_costs.length ()
7319 && li_scalar_costs[si].first == sl);
7320 unsigned dummy;
7321 finish_cost (scalar_target_cost_data, nullptr,
7322 &dummy, &scalar_cost, &dummy);
7323
7324 /* Complete the target-specific vector cost calculation. */
7325 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7326 do
7327 {
7328 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7329 vi++;
7330 }
7331 while (vi < li_vector_costs.length ()
7332 && li_vector_costs[vi].first == vl);
7333 finish_cost (vect_target_cost_data, scalar_target_cost_data,
7334 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7335 delete scalar_target_cost_data;
7336 delete vect_target_cost_data;
7337
7338 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7339
7340 if (dump_enabled_p ())
7341 {
7342 dump_printf_loc (MSG_NOTE, vect_location,
7343 "Cost model analysis for part in loop %d:\n", sl);
7344 dump_printf (MSG_NOTE, " Vector cost: %d\n",
7345 vec_inside_cost + vec_outside_cost);
7346 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7347 }
7348
7349 /* Vectorization is profitable if its cost is no more than the cost of the
7350 scalar version. Note that we err on the vector side for equal cost because
7351 the cost estimate is otherwise quite pessimistic (constant uses are
7352 free on the scalar side but cost a load on the vector side for
7353 example). */
7354 if (vec_outside_cost + vec_inside_cost > scalar_cost)
7355 {
7356 profitable = false;
7357 break;
7358 }
7359 }
7360 if (profitable && vi < li_vector_costs.length ())
7361 {
7362 if (dump_enabled_p ())
7363 dump_printf_loc (MSG_NOTE, vect_location,
7364 "Excess vector cost for part in loop %d:\n",
7365 li_vector_costs[vi].first);
7366 profitable = false;
7367 }
7368
7369 /* Unset visited flag. This is delayed when the subgraph is profitable
7370 and we process the loop for remaining unvectorized if-converted code. */
7371 if (!orig_loop || !profitable)
7372 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7373 gimple_set_visited (cost->stmt_info->stmt, false);
7374
7375 scalar_costs.release ();
7376 vector_costs.release ();
7377
7378 return profitable;
7379 }
7380
7381 /* qsort comparator for lane defs. */
7382
7383 static int
7384 vld_cmp (const void *a_, const void *b_)
7385 {
7386 auto *a = (const std::pair<unsigned, tree> *)a_;
7387 auto *b = (const std::pair<unsigned, tree> *)b_;
7388 return a->first - b->first;
7389 }
7390
7391 /* Return true if USE_STMT is a vector lane insert into VEC and set
7392 *THIS_LANE to the lane number that is set. */
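/* Hypothetical GIMPLE example: for
     vec_2 = BIT_INSERT_EXPR <vec_1, s_3, 64>
   with 32-bit vector elements this matches VEC == vec_1 and sets *THIS_LANE
   to 2.  */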
7393
7394 static bool
7395 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7396 {
7397 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7398 if (!use_ass
7399 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7400 || (vec
7401 ? gimple_assign_rhs1 (use_ass) != vec
7402 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7403 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7404 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7405 || !constant_multiple_p
7406 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7407 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7408 this_lane))
7409 return false;
7410 return true;
7411 }
7412
7413 /* Find any vectorizable constructors and add them to the grouped_store
7414 array. */
7415
7416 static void
7417 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7418 {
7419 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7420 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7421 !gsi_end_p (gsi); gsi_next (&gsi))
7422 {
7423 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7424 if (!assign)
7425 continue;
7426
7427 tree rhs = gimple_assign_rhs1 (assign);
7428 enum tree_code code = gimple_assign_rhs_code (assign);
7429 use_operand_p use_p;
7430 gimple *use_stmt;
7431 if (code == CONSTRUCTOR)
7432 {
7433 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7434 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7435 CONSTRUCTOR_NELTS (rhs))
7436 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7437 || uniform_vector_p (rhs))
7438 continue;
7439
7440 unsigned j;
7441 tree val;
7442 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7443 if (TREE_CODE (val) != SSA_NAME
7444 || !bb_vinfo->lookup_def (val))
7445 break;
7446 if (j != CONSTRUCTOR_NELTS (rhs))
7447 continue;
7448
7449 vec<stmt_vec_info> roots = vNULL;
7450 roots.safe_push (bb_vinfo->lookup_stmt (assign));
7451 vec<stmt_vec_info> stmts;
7452 stmts.create (CONSTRUCTOR_NELTS (rhs));
7453 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7454 stmts.quick_push
7455 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7456 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7457 stmts, roots));
7458 }
7459 else if (code == BIT_INSERT_EXPR
7460 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7461 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7462 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7463 && integer_zerop (gimple_assign_rhs3 (assign))
7464 && useless_type_conversion_p
7465 (TREE_TYPE (TREE_TYPE (rhs)),
7466 TREE_TYPE (gimple_assign_rhs2 (assign)))
7467 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7468 {
7469 /* We start to match on insert to lane zero but since the
7470 inserts need not be ordered we'd have to search both
7471 the def and the use chains. */
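	      /* A hypothetical chain we can match here, assuming a four-lane
		 vector with 32-bit elements:
		   v_1 = BIT_INSERT_EXPR <v_0, a_0, 0>;
		   v_2 = BIT_INSERT_EXPR <v_1, a_1, 32>;
		   v_3 = BIT_INSERT_EXPR <v_2, a_2, 64>;
		   v_4 = BIT_INSERT_EXPR <v_3, a_3, 96>;
		 Following single uses from the lane-zero insert (and the def
		 chain of its vector operand when lanes remain) collects one
		 scalar def per lane.  */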
7472 tree vectype = TREE_TYPE (rhs);
7473 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7474 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7475 auto_sbitmap lanes (nlanes);
7476 bitmap_clear (lanes);
7477 bitmap_set_bit (lanes, 0);
7478 tree def = gimple_assign_lhs (assign);
7479 lane_defs.quick_push
7480 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7481 unsigned lanes_found = 1;
7482 /* Start with the use chains, the last stmt will be the root. */
7483 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7484 vec<stmt_vec_info> roots = vNULL;
7485 roots.safe_push (last);
7486 do
7487 {
7488 use_operand_p use_p;
7489 gimple *use_stmt;
7490 if (!single_imm_use (def, &use_p, &use_stmt))
7491 break;
7492 unsigned this_lane;
7493 if (!bb_vinfo->lookup_stmt (use_stmt)
7494 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7495 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7496 break;
7497 if (bitmap_bit_p (lanes, this_lane))
7498 break;
7499 lanes_found++;
7500 bitmap_set_bit (lanes, this_lane);
7501 gassign *use_ass = as_a <gassign *> (use_stmt);
7502 lane_defs.quick_push (std::make_pair
7503 (this_lane, gimple_assign_rhs2 (use_ass)));
7504 last = bb_vinfo->lookup_stmt (use_ass);
7505 roots.safe_push (last);
7506 def = gimple_assign_lhs (use_ass);
7507 }
7508 while (lanes_found < nlanes);
7509 if (roots.length () > 1)
7510 std::swap (roots[0], roots[roots.length () - 1]);
7511 if (lanes_found < nlanes)
7512 {
7513 /* Now search the def chain. */
7514 def = gimple_assign_rhs1 (assign);
7515 do
7516 {
7517 if (TREE_CODE (def) != SSA_NAME
7518 || !has_single_use (def))
7519 break;
7520 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7521 unsigned this_lane;
7522 if (!bb_vinfo->lookup_stmt (def_stmt)
7523 || !vect_slp_is_lane_insert (def_stmt,
7524 NULL_TREE, &this_lane)
7525 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7526 break;
7527 if (bitmap_bit_p (lanes, this_lane))
7528 break;
7529 lanes_found++;
7530 bitmap_set_bit (lanes, this_lane);
7531 lane_defs.quick_push (std::make_pair
7532 (this_lane,
7533 gimple_assign_rhs2 (def_stmt)));
7534 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7535 def = gimple_assign_rhs1 (def_stmt);
7536 }
7537 while (lanes_found < nlanes);
7538 }
7539 if (lanes_found == nlanes)
7540 {
7541 /* Sort lane_defs by lane index and register the root. */
7542 lane_defs.qsort (vld_cmp);
7543 vec<stmt_vec_info> stmts;
7544 stmts.create (nlanes);
7545 for (unsigned i = 0; i < nlanes; ++i)
7546 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7547 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7548 stmts, roots));
7549 }
7550 else
7551 roots.release ();
7552 }
7553 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7554 && (associative_tree_code (code) || code == MINUS_EXPR)
7555 /* ??? This pessimizes a two-element reduction. PR54400.
7556 ??? In-order reduction could be handled if we only
7557 traverse one operand chain in vect_slp_linearize_chain. */
7558 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7559 /* Ops with constants at the tail can be stripped here. */
7560 && TREE_CODE (rhs) == SSA_NAME
7561 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7562 /* Should be the chain end. */
7563 && (!single_imm_use (gimple_assign_lhs (assign),
7564 &use_p, &use_stmt)
7565 || !is_gimple_assign (use_stmt)
7566 || (gimple_assign_rhs_code (use_stmt) != code
7567 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7568 || (gimple_assign_rhs_code (use_stmt)
7569 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7570 {
7571 /* We start the match at the end of a possible association
7572 chain. */
7573 auto_vec<chain_op_t> chain;
7574 auto_vec<std::pair<tree_code, gimple *> > worklist;
7575 auto_vec<gimple *> chain_stmts;
7576 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7577 if (code == MINUS_EXPR)
7578 code = PLUS_EXPR;
7579 internal_fn reduc_fn;
7580 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7581 || reduc_fn == IFN_LAST)
7582 continue;
7583 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7584 /* ??? */
7585 code_stmt, alt_code_stmt, &chain_stmts);
7586 if (chain.length () > 1)
7587 {
7588 /* Sort the chain according to def_type and operation. */
7589 chain.sort (dt_sort_cmp, bb_vinfo);
7590 /* ??? Now we'd want to strip externals and constants
7591 but record those to be handled in the epilogue. */
7592 /* ??? For now do not allow mixing ops or externs/constants. */
7593 bool invalid = false;
7594 unsigned remain_cnt = 0;
7595 unsigned last_idx = 0;
7596 for (unsigned i = 0; i < chain.length (); ++i)
7597 {
7598 if (chain[i].code != code)
7599 {
7600 invalid = true;
7601 break;
7602 }
7603 if (chain[i].dt != vect_internal_def
7604 /* Avoid stmts where the def is not the LHS, like
7605 ASMs. */
7606 || (gimple_get_lhs (bb_vinfo->lookup_def
7607 (chain[i].op)->stmt)
7608 != chain[i].op))
7609 remain_cnt++;
7610 else
7611 last_idx = i;
7612 }
7613 /* Make sure to have an even number of lanes as we later do
7614 all-or-nothing discovery, not trying to split further. */
7615 if ((chain.length () - remain_cnt) & 1)
7616 remain_cnt++;
7617 if (!invalid && chain.length () - remain_cnt > 1)
7618 {
7619 vec<stmt_vec_info> stmts;
7620 vec<tree> remain = vNULL;
7621 stmts.create (chain.length ());
7622 if (remain_cnt > 0)
7623 remain.create (remain_cnt);
7624 for (unsigned i = 0; i < chain.length (); ++i)
7625 {
7626 stmt_vec_info stmt_info;
7627 if (chain[i].dt == vect_internal_def
7628 && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
7629 gimple_get_lhs (stmt_info->stmt) == chain[i].op)
7630 && (i != last_idx
7631 || (stmts.length () & 1)))
7632 stmts.quick_push (stmt_info);
7633 else
7634 remain.quick_push (chain[i].op);
7635 }
7636 vec<stmt_vec_info> roots;
7637 roots.create (chain_stmts.length ());
7638 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7639 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7640 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7641 stmts, roots, remain));
7642 }
7643 }
7644 }
7645 }
7646 }
7647
7648 /* Walk the grouped store chains and replace entries with their
7649 pattern variant if any. */
7650
7651 static void
7652 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7653 {
7654 stmt_vec_info first_element;
7655 unsigned i;
7656
7657 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7658 {
7659 /* We also have CTORs in this array. */
7660 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7661 continue;
7662 if (STMT_VINFO_IN_PATTERN_P (first_element))
7663 {
7664 stmt_vec_info orig = first_element;
7665 first_element = STMT_VINFO_RELATED_STMT (first_element);
7666 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7667 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7668 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7669 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7670 vinfo->grouped_stores[i] = first_element;
7671 }
7672 stmt_vec_info prev = first_element;
7673 while (DR_GROUP_NEXT_ELEMENT (prev))
7674 {
7675 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7676 if (STMT_VINFO_IN_PATTERN_P (elt))
7677 {
7678 stmt_vec_info orig = elt;
7679 elt = STMT_VINFO_RELATED_STMT (elt);
7680 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7681 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7682 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7683 }
7684 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7685 prev = elt;
7686 }
7687 }
7688 }
7689
7690 /* Check if the region described by BB_VINFO can be vectorized, returning
7691 true if so. When returning false, set FATAL to true if the same failure
7692 would prevent vectorization at other vector sizes, false if it is still
7693 worth trying other sizes. N_STMTS is the number of statements in the
7694 region. */
7695
7696 static bool
7697 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7698 vec<int> *dataref_groups)
7699 {
7700 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7701
7702 slp_instance instance;
7703 int i;
7704 poly_uint64 min_vf = 2;
7705
7706 /* The first group of checks is independent of the vector size. */
7707 fatal = true;
7708
7709 /* Analyze the data references. */
7710
7711 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7712 {
7713 if (dump_enabled_p ())
7714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7715 "not vectorized: unhandled data-ref in basic "
7716 "block.\n");
7717 return false;
7718 }
7719
7720 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7721 {
7722 if (dump_enabled_p ())
7723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7724 "not vectorized: unhandled data access in "
7725 "basic block.\n");
7726 return false;
7727 }
7728
7729 vect_slp_check_for_roots (bb_vinfo);
7730
7731 /* If there are no grouped stores and no constructors in the region
7732 there is no need to continue with pattern recog as vect_analyze_slp
7733 will fail anyway. */
7734 if (bb_vinfo->grouped_stores.is_empty ()
7735 && bb_vinfo->roots.is_empty ())
7736 {
7737 if (dump_enabled_p ())
7738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7739 "not vectorized: no grouped stores in "
7740 "basic block.\n");
7741 return false;
7742 }
7743
7744 /* Whereas the rest of the analysis below depends on the vector size in some way. */
7745 fatal = false;
7746
7747 vect_pattern_recog (bb_vinfo);
7748
7749 /* Update store groups from pattern processing. */
7750 vect_fixup_store_groups_with_patterns (bb_vinfo);
7751
7752 /* Check the SLP opportunities in the basic block, analyze and build SLP
7753 trees. */
7754 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7755 {
7756 if (dump_enabled_p ())
7757 {
7758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7759 "Failed to SLP the basic block.\n");
7760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7761 "not vectorized: failed to find SLP opportunities "
7762 "in basic block.\n");
7763 }
7764 return false;
7765 }
7766
7767 /* Optimize permutations. */
7768 vect_optimize_slp (bb_vinfo);
7769
7770 /* Gather the loads reachable from the SLP graph entries. */
7771 vect_gather_slp_loads (bb_vinfo);
7772
7773 vect_record_base_alignments (bb_vinfo);
7774
7775 /* Analyze and verify the alignment of data references and the
7776 dependence in the SLP instances. */
7777 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7778 {
7779 vect_location = instance->location ();
7780 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7781 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7782 {
7783 slp_tree node = SLP_INSTANCE_TREE (instance);
7784 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7785 if (dump_enabled_p ())
7786 dump_printf_loc (MSG_NOTE, vect_location,
7787 "removing SLP instance operations starting from: %G",
7788 stmt_info->stmt);
7789 vect_free_slp_instance (instance);
7790 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7791 continue;
7792 }
7793
7794 /* Mark all the statements that we want to vectorize as pure SLP and
7795 relevant. */
7796 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7797 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7798 unsigned j;
7799 stmt_vec_info root;
7800 /* Likewise consider instance root stmts as vectorized. */
7801 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7802 STMT_SLP_TYPE (root) = pure_slp;
7803
7804 i++;
7805 }
7806 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7807 return false;
7808
7809 if (!vect_slp_analyze_operations (bb_vinfo))
7810 {
7811 if (dump_enabled_p ())
7812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7813 "not vectorized: bad operation in basic block.\n");
7814 return false;
7815 }
7816
7817 vect_bb_partition_graph (bb_vinfo);
7818
7819 return true;
7820 }
7821
7822 /* Subroutine of vect_slp_bbs. Try to vectorize the statements for all
7823 basic blocks in BBS, returning true on success.
7824 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7825
7826 static bool
7827 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7828 vec<int> *dataref_groups, unsigned int n_stmts,
7829 loop_p orig_loop)
7830 {
7831 bb_vec_info bb_vinfo;
7832 auto_vector_modes vector_modes;
7833
7834 /* Autodetect first vector size we try. */
7835 machine_mode next_vector_mode = VOIDmode;
7836 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7837 unsigned int mode_i = 0;
7838
7839 vec_info_shared shared;
7840
7841 machine_mode autodetected_vector_mode = VOIDmode;
7842 while (1)
7843 {
7844 bool vectorized = false;
7845 bool fatal = false;
7846 bb_vinfo = new _bb_vec_info (bbs, &shared);
7847
7848 bool first_time_p = shared.datarefs.is_empty ();
7849 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7850 if (first_time_p)
7851 bb_vinfo->shared->save_datarefs ();
7852 else
7853 bb_vinfo->shared->check_datarefs ();
7854 bb_vinfo->vector_mode = next_vector_mode;
7855
7856 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7857 {
7858 if (dump_enabled_p ())
7859 {
7860 dump_printf_loc (MSG_NOTE, vect_location,
7861 "***** Analysis succeeded with vector mode"
7862 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7863 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7864 }
7865
7866 bb_vinfo->shared->check_datarefs ();
7867
7868 bool force_clear = false;
7869 auto_vec<slp_instance> profitable_subgraphs;
7870 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7871 {
7872 if (instance->subgraph_entries.is_empty ())
7873 continue;
7874
7875 dump_user_location_t saved_vect_location = vect_location;
7876 vect_location = instance->location ();
7877 if (!unlimited_cost_model (NULL)
7878 && !vect_bb_vectorization_profitable_p
7879 (bb_vinfo, instance->subgraph_entries, orig_loop))
7880 {
7881 if (dump_enabled_p ())
7882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7883 "not vectorized: vectorization is not "
7884 "profitable.\n");
7885 vect_location = saved_vect_location;
7886 continue;
7887 }
7888
7889 vect_location = saved_vect_location;
7890 if (!dbg_cnt (vect_slp))
7891 {
7892 force_clear = true;
7893 continue;
7894 }
7895
7896 profitable_subgraphs.safe_push (instance);
7897 }
7898
7899 /* When we're vectorizing an if-converted loop body make sure
7900 we vectorized all if-converted code. */
7901 if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
7902 {
7903 gcc_assert (bb_vinfo->bbs.length () == 1);
7904 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7905 !gsi_end_p (gsi); gsi_next (&gsi))
7906 {
7907 /* The costing above left us with DCEable vectorized scalar
7908 stmts having the visited flag set on profitable
7909 subgraphs. Do the delayed clearing of the flag here. */
7910 if (gimple_visited_p (gsi_stmt (gsi)))
7911 {
7912 gimple_set_visited (gsi_stmt (gsi), false);
7913 continue;
7914 }
7915 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7916 continue;
7917
7918 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7919 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7920 {
7921 if (!profitable_subgraphs.is_empty ()
7922 && dump_enabled_p ())
7923 dump_printf_loc (MSG_NOTE, vect_location,
7924 "not profitable because of "
7925 "unprofitable if-converted scalar "
7926 "code\n");
7927 profitable_subgraphs.truncate (0);
7928 }
7929 }
7930 }
7931
7932 /* Finally schedule the profitable subgraphs. */
7933 for (slp_instance instance : profitable_subgraphs)
7934 {
7935 if (!vectorized && dump_enabled_p ())
7936 dump_printf_loc (MSG_NOTE, vect_location,
7937 "Basic block will be vectorized "
7938 "using SLP\n");
7939 vectorized = true;
7940
7941 /* Dump before scheduling as store vectorization will remove
7942 the original stores and mess with the instance tree
7943 so querying its location will eventually ICE. */
7944 if (flag_checking)
7945 for (slp_instance sub : instance->subgraph_entries)
7946 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7947 unsigned HOST_WIDE_INT bytes;
7948 if (dump_enabled_p ())
7949 for (slp_instance sub : instance->subgraph_entries)
7950 {
7951 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7952 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7953 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7954 sub->location (),
7955 "basic block part vectorized using %wu "
7956 "byte vectors\n", bytes);
7957 else
7958 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7959 sub->location (),
7960 "basic block part vectorized using "
7961 "variable length vectors\n");
7962 }
7963
7964 dump_user_location_t saved_vect_location = vect_location;
7965 vect_location = instance->location ();
7966
7967 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7968
7969 vect_location = saved_vect_location;
7970 }
7971 }
7972 else
7973 {
7974 if (dump_enabled_p ())
7975 dump_printf_loc (MSG_NOTE, vect_location,
7976 "***** Analysis failed with vector mode %s\n",
7977 GET_MODE_NAME (bb_vinfo->vector_mode));
7978 }
7979
7980 if (mode_i == 0)
7981 autodetected_vector_mode = bb_vinfo->vector_mode;
7982
7983 if (!fatal)
7984 while (mode_i < vector_modes.length ()
7985 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7986 {
7987 if (dump_enabled_p ())
7988 dump_printf_loc (MSG_NOTE, vect_location,
7989 "***** The result for vector mode %s would"
7990 " be the same\n",
7991 GET_MODE_NAME (vector_modes[mode_i]));
7992 mode_i += 1;
7993 }
7994
7995 delete bb_vinfo;
7996
7997 if (mode_i < vector_modes.length ()
7998 && VECTOR_MODE_P (autodetected_vector_mode)
7999 && (related_vector_mode (vector_modes[mode_i],
8000 GET_MODE_INNER (autodetected_vector_mode))
8001 == autodetected_vector_mode)
8002 && (related_vector_mode (autodetected_vector_mode,
8003 GET_MODE_INNER (vector_modes[mode_i]))
8004 == vector_modes[mode_i]))
8005 {
8006 if (dump_enabled_p ())
8007 dump_printf_loc (MSG_NOTE, vect_location,
8008 "***** Skipping vector mode %s, which would"
8009 " repeat the analysis for %s\n",
8010 GET_MODE_NAME (vector_modes[mode_i]),
8011 GET_MODE_NAME (autodetected_vector_mode));
8012 mode_i += 1;
8013 }
8014
8015 if (vectorized
8016 || mode_i == vector_modes.length ()
8017 || autodetected_vector_mode == VOIDmode
8018 /* If vect_slp_analyze_bb_1 signaled that analysis for all
8019 vector sizes will fail do not bother iterating. */
8020 || fatal)
8021 return vectorized;
8022
8023 /* Try the next biggest vector size. */
8024 next_vector_mode = vector_modes[mode_i++];
8025 if (dump_enabled_p ())
8026 dump_printf_loc (MSG_NOTE, vect_location,
8027 "***** Re-trying analysis with vector mode %s\n",
8028 GET_MODE_NAME (next_vector_mode));
8029 }
8030 }
8031
8032
8033 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
8034 true if anything in the basic blocks was vectorized. */
8035
8036 static bool
8037 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
8038 {
8039 vec<data_reference_p> datarefs = vNULL;
8040 auto_vec<int> dataref_groups;
8041 int insns = 0;
8042 int current_group = 0;
8043
8044 for (unsigned i = 0; i < bbs.length (); i++)
8045 {
8046 basic_block bb = bbs[i];
8047 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
8048 gsi_next (&gsi))
8049 {
8050 gimple *stmt = gsi_stmt (gsi);
8051 if (is_gimple_debug (stmt))
8052 continue;
8053
8054 insns++;
8055
8056 if (gimple_location (stmt) != UNKNOWN_LOCATION)
8057 vect_location = stmt;
8058
8059 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
8060 &dataref_groups, current_group))
8061 ++current_group;
8062 }
8063 /* New BBs always start a new DR group. */
8064 ++current_group;
8065 }
8066
8067 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
8068 }
8069
8070 /* Special entry for the BB vectorizer. Analyze and transform a single
8071 if-converted BB with ORIG_LOOPs body being the not if-converted
8072 representation. Returns true if anything in the basic-block was
8073 vectorized. */
8074
8075 bool
8076 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
8077 {
8078 auto_vec<basic_block> bbs;
8079 bbs.safe_push (bb);
8080 return vect_slp_bbs (bbs, orig_loop);
8081 }
8082
8083 /* Main entry for the BB vectorizer. Analyze and transform FUN, returns
8084 true if anything in the function was vectorized. */
8085
8086 bool
8087 vect_slp_function (function *fun)
8088 {
8089 bool r = false;
8090 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
8091 auto_bitmap exit_bbs;
8092 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
8093 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
8094 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
8095 true, rpo, NULL);
8096
8097 /* For the moment split the function into pieces to avoid making
8098 the iteration on the vector mode moot. Split at points we know
8099 to not handle well, which are CFG merges (SLP discovery doesn't
8100 handle non-loop-header PHIs) and loop exits. Since pattern
8101 recog requires reverse iteration to visit uses before defs
8102 simply chop RPO into pieces. */
8103 auto_vec<basic_block> bbs;
8104 for (unsigned i = 0; i < n; i++)
8105 {
8106 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
8107 bool split = false;
8108
8109 /* Split when a BB is not dominated by the first block. */
8110 if (!bbs.is_empty ()
8111 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
8112 {
8113 if (dump_enabled_p ())
8114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8115 "splitting region at dominance boundary bb%d\n",
8116 bb->index);
8117 split = true;
8118 }
8119 /* Split when the loop determined by the first block
8120 is exited. This is because we eventually insert
8121 invariants at region begin. */
8122 else if (!bbs.is_empty ()
8123 && bbs[0]->loop_father != bb->loop_father
8124 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
8125 {
8126 if (dump_enabled_p ())
8127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8128 "splitting region at loop %d exit at bb%d\n",
8129 bbs[0]->loop_father->num, bb->index);
8130 split = true;
8131 }
8132 else if (!bbs.is_empty ()
8133 && bb->loop_father->header == bb
8134 && bb->loop_father->dont_vectorize)
8135 {
8136 if (dump_enabled_p ())
8137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8138 "splitting region at dont-vectorize loop %d "
8139 "entry at bb%d\n",
8140 bb->loop_father->num, bb->index);
8141 split = true;
8142 }
8143
8144 if (split && !bbs.is_empty ())
8145 {
8146 r |= vect_slp_bbs (bbs, NULL);
8147 bbs.truncate (0);
8148 }
8149
8150 if (bbs.is_empty ())
8151 {
8152 /* We need to be able to insert at the head of the region which
8153 we cannot do for a region starting with a returns-twice call. */
8154 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
8155 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
8156 {
8157 if (dump_enabled_p ())
8158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8159 "skipping bb%d as start of region as it "
8160 "starts with returns-twice call\n",
8161 bb->index);
8162 continue;
8163 }
8164 /* If the loop this BB belongs to is marked as not to be vectorized
8165 honor that also for BB vectorization. */
8166 if (bb->loop_father->dont_vectorize)
8167 continue;
8168 }
8169
8170 bbs.safe_push (bb);
8171
8172 /* When we have a stmt ending this block that defines a
8173 value, inserting after it for a vector containing its definition
8174 would require inserting on edges. Avoid this for now. */
8175 if (gimple *last = *gsi_last_bb (bb))
8176 if (gimple_get_lhs (last)
8177 && is_ctrl_altering_stmt (last))
8178 {
8179 if (dump_enabled_p ())
8180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8181 "splitting region at control altering "
8182 "definition %G", last);
8183 r |= vect_slp_bbs (bbs, NULL);
8184 bbs.truncate (0);
8185 }
8186 }
8187
8188 if (!bbs.is_empty ())
8189 r |= vect_slp_bbs (bbs, NULL);
8190
8191 free (rpo);
8192
8193 return r;
8194 }
8195
8196 /* Build a variable-length vector in which the elements in ELTS are repeated
8197 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
8198 RESULTS and add any new instructions to SEQ.
8199
8200 The approach we use is:
8201
8202 (1) Find a vector mode VM with integer elements of mode IM.
8203
8204 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8205 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
8206 from small vectors to IM.
8207
8208 (3) Duplicate each ELTS'[I] into a vector of mode VM.
8209
8210 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
8211 correct byte contents.
8212
8213 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
8214
8215 We try to find the largest IM for which this sequence works, in order
8216 to cut down on the number of interleaves. */
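/* An illustration under assumed (target-dependent) modes: with
   ELTS = {a, b, c, d} of SImode, step (1) might pick IM = DImode; step (2)
   then view-converts {a, b} and {c, d} to DImode values ab and cd; step (3)
   broadcasts those as {ab, ab, ...} and {cd, cd, ...}; step (4) interleaves
   them into {ab, cd, ab, cd, ...}; and step (5) view-converts the result
   back to VECTOR_TYPE, which then repeats {a, b, c, d} as required.  */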
8217
8218 void
8219 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
8220 const vec<tree> &elts, unsigned int nresults,
8221 vec<tree> &results)
8222 {
8223 unsigned int nelts = elts.length ();
8224 tree element_type = TREE_TYPE (vector_type);
8225
8226 /* (1) Find a vector mode VM with integer elements of mode IM. */
8227 unsigned int nvectors = 1;
8228 tree new_vector_type;
8229 tree permutes[2];
8230 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
8231 &nvectors, &new_vector_type,
8232 permutes))
8233 gcc_unreachable ();
8234
8235 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
8236 unsigned int partial_nelts = nelts / nvectors;
8237 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
8238
8239 tree_vector_builder partial_elts;
8240 auto_vec<tree, 32> pieces (nvectors * 2);
8241 pieces.quick_grow_cleared (nvectors * 2);
8242 for (unsigned int i = 0; i < nvectors; ++i)
8243 {
8244 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8245 ELTS' has mode IM. */
8246 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
8247 for (unsigned int j = 0; j < partial_nelts; ++j)
8248 partial_elts.quick_push (elts[i * partial_nelts + j]);
8249 tree t = gimple_build_vector (seq, &partial_elts);
8250 t = gimple_build (seq, VIEW_CONVERT_EXPR,
8251 TREE_TYPE (new_vector_type), t);
8252
8253 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
8254 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
8255 }
8256
8257 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
8258 correct byte contents.
8259
8260 Conceptually, we need to repeat the following operation log2(nvectors)
8261 times, where hi_start = nvectors / 2:
8262
8263 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
8264 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
8265
8266 However, if each input repeats every N elements and the VF is
8267 a multiple of N * 2, the HI result is the same as the LO result.
8268 This will be true for the first N1 iterations of the outer loop,
8269 followed by N2 iterations for which both the LO and HI results
8270 are needed. I.e.:
8271
8272 N1 + N2 = log2(nvectors)
8273
8274 Each "N1 iteration" doubles the number of redundant vectors and the
8275 effect of the process as a whole is to have a sequence of nvectors/2**N1
8276 vectors that repeats 2**N1 times. Rather than generate these redundant
8277 vectors, we halve the number of vectors for each N1 iteration. */
8278 unsigned int in_start = 0;
8279 unsigned int out_start = nvectors;
8280 unsigned int new_nvectors = nvectors;
8281 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
8282 {
8283 unsigned int hi_start = new_nvectors / 2;
8284 unsigned int out_i = 0;
8285 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
8286 {
8287 if ((in_i & 1) != 0
8288 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
8289 2 * in_repeat))
8290 continue;
8291
8292 tree output = make_ssa_name (new_vector_type);
8293 tree input1 = pieces[in_start + (in_i / 2)];
8294 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
8295 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
8296 input1, input2,
8297 permutes[in_i & 1]);
8298 gimple_seq_add_stmt (seq, stmt);
8299 pieces[out_start + out_i] = output;
8300 out_i += 1;
8301 }
8302 std::swap (in_start, out_start);
8303 new_nvectors = out_i;
8304 }
8305
8306 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
8307 results.reserve (nresults);
8308 for (unsigned int i = 0; i < nresults; ++i)
8309 if (i < new_nvectors)
8310 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
8311 pieces[in_start + i]));
8312 else
8313 results.quick_push (results[i - new_nvectors]);
8314 }
8315
8316
8317 /* For constant and loop invariant defs in OP_NODE this function creates
8318 vector defs that will be used in the vectorized stmts and stores them
8319 to SLP_TREE_VEC_DEFS of OP_NODE. */
8320
8321 static void
8322 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8323 {
8324 unsigned HOST_WIDE_INT nunits;
8325 tree vec_cst;
8326 unsigned j, number_of_places_left_in_vector;
8327 tree vector_type;
8328 tree vop;
8329 int group_size = op_node->ops.length ();
8330 unsigned int vec_num, i;
8331 unsigned number_of_copies = 1;
8332 bool constant_p;
8333 gimple_seq ctor_seq = NULL;
8334 auto_vec<tree, 16> permute_results;
8335
8336 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
8337 vector_type = SLP_TREE_VECTYPE (op_node);
8338
8339 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8340 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
8341 auto_vec<tree> voprnds (number_of_vectors);
8342
8343 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8344 the created vectors. It is greater than 1 if unrolling is performed.
8345
8346 For example, we have two scalar operands, s1 and s2 (e.g., group of
8347 strided accesses of size two), while NUNITS is four (i.e., four scalars
8348 of this type can be packed in a vector). The output vector will contain
8349 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
8350 will be 2).
8351
8352 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8353 containing the operands.
8354
8355 For example, NUNITS is four as before, and the group size is 8
8356 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
8357 {s5, s6, s7, s8}. */
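/* In the two examples above the computation below gives NUMBER_OF_COPIES
   = 4 * 1 / 2 == 2 for {s1, s2, s1, s2} and = 4 * 2 / 8 == 1 for the pair
   {s1, s2, s3, s4} {s5, s6, s7, s8}, assuming one and two vector stmts
   are required, respectively. */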
8358
8359 /* When using duplicate_and_interleave, we just need one element for
8360 each scalar statement. */
8361 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
8362 nunits = group_size;
8363
8364 number_of_copies = nunits * number_of_vectors / group_size;
8365
8366 number_of_places_left_in_vector = nunits;
8367 constant_p = true;
8368 tree uniform_elt = NULL_TREE;
8369 tree_vector_builder elts (vector_type, nunits, 1);
8370 elts.quick_grow (nunits);
8371 stmt_vec_info insert_after = NULL;
8372 for (j = 0; j < number_of_copies; j++)
8373 {
8374 tree op;
8375 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
8376 {
8377 /* Create 'vect_ = {op0,op1,...,opn}'. */
8378 tree orig_op = op;
8379 if (number_of_places_left_in_vector == nunits)
8380 uniform_elt = op;
8381 else if (uniform_elt && operand_equal_p (uniform_elt, op))
8382 op = elts[number_of_places_left_in_vector];
8383 else
8384 uniform_elt = NULL_TREE;
8385 number_of_places_left_in_vector--;
8386 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8387 {
8388 if (CONSTANT_CLASS_P (op))
8389 {
8390 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8391 {
8392 /* Can't use VIEW_CONVERT_EXPR for booleans because
8393 of possibly different sizes of scalar value and
8394 vector element. */
8395 if (integer_zerop (op))
8396 op = build_int_cst (TREE_TYPE (vector_type), 0);
8397 else if (integer_onep (op))
8398 op = build_all_ones_cst (TREE_TYPE (vector_type));
8399 else
8400 gcc_unreachable ();
8401 }
8402 else
8403 op = fold_unary (VIEW_CONVERT_EXPR,
8404 TREE_TYPE (vector_type), op);
8405 gcc_assert (op && CONSTANT_CLASS_P (op));
8406 }
8407 else
8408 {
8409 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8410 gimple *init_stmt;
8411 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8412 {
8413 tree true_val
8414 = build_all_ones_cst (TREE_TYPE (vector_type));
8415 tree false_val
8416 = build_zero_cst (TREE_TYPE (vector_type));
8417 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8418 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8419 op, true_val,
8420 false_val);
8421 }
8422 else
8423 {
8424 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8425 op);
8426 init_stmt
8427 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8428 op);
8429 }
8430 gimple_seq_add_stmt (&ctor_seq, init_stmt);
8431 op = new_temp;
8432 }
8433 }
8434 elts[number_of_places_left_in_vector] = op;
8435 if (!CONSTANT_CLASS_P (op))
8436 constant_p = false;
8437 /* For BB vectorization we have to compute an insert location
8438 when a def is inside the analyzed region since we cannot
8439 simply insert at the BB start in this case. */
8440 stmt_vec_info opdef;
8441 if (TREE_CODE (orig_op) == SSA_NAME
8442 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8443 && is_a <bb_vec_info> (vinfo)
8444 && (opdef = vinfo->lookup_def (orig_op)))
8445 {
8446 if (!insert_after)
8447 insert_after = opdef;
8448 else
8449 insert_after = get_later_stmt (insert_after, opdef);
8450 }
8451
8452 if (number_of_places_left_in_vector == 0)
8453 {
8454 auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
8455 if (uniform_elt)
8456 vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
8457 elts[0]);
8458 else if (constant_p
8459 ? multiple_p (type_nunits, nunits)
8460 : known_eq (type_nunits, nunits))
8461 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8462 else
8463 {
8464 if (permute_results.is_empty ())
8465 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8466 elts, number_of_vectors,
8467 permute_results);
8468 vec_cst = permute_results[number_of_vectors - j - 1];
8469 }
8470 if (!gimple_seq_empty_p (ctor_seq))
8471 {
8472 if (insert_after)
8473 {
8474 gimple_stmt_iterator gsi;
8475 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8476 {
8477 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8478 gsi_insert_seq_before (&gsi, ctor_seq,
8479 GSI_CONTINUE_LINKING);
8480 }
8481 else if (!stmt_ends_bb_p (insert_after->stmt))
8482 {
8483 gsi = gsi_for_stmt (insert_after->stmt);
8484 gsi_insert_seq_after (&gsi, ctor_seq,
8485 GSI_CONTINUE_LINKING);
8486 }
8487 else
8488 {
8489 /* When we want to insert after a def where the
8490 defining stmt throws then insert on the fallthru
8491 edge. */
8492 edge e = find_fallthru_edge
8493 (gimple_bb (insert_after->stmt)->succs);
8494 basic_block new_bb
8495 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8496 gcc_assert (!new_bb);
8497 }
8498 }
8499 else
8500 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8501 ctor_seq = NULL;
8502 }
8503 voprnds.quick_push (vec_cst);
8504 insert_after = NULL;
8505 number_of_places_left_in_vector = nunits;
8506 constant_p = true;
8507 elts.new_vector (vector_type, nunits, 1);
8508 elts.quick_grow (nunits);
8509 }
8510 }
8511 }
8512
8513 /* Since the vectors are created in the reverse order, we should invert
8514 them. */
8515 vec_num = voprnds.length ();
8516 for (j = vec_num; j != 0; j--)
8517 {
8518 vop = voprnds[j - 1];
8519 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8520 }
8521
8522 /* In case that VF is greater than the unrolling factor needed for the SLP
8523 group of stmts, NUMBER_OF_VECTORS to be created is greater than
8524 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8525 to replicate the vectors. */
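/* E.g. with two distinct defs v0 v1 (VEC_NUM == 2) and NUMBER_OF_VECTORS
   == 6 the loop below extends the defs to v0 v1 v0 v1 v0 v1. */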
8526 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8527 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8528 i++)
8529 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8530 }
8531
8532 /* Get the Ith vectorized definition from SLP_NODE. */
8533
8534 tree
8535 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8536 {
8537 return SLP_TREE_VEC_DEFS (slp_node)[i];
8538 }
8539
8540 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8541
8542 void
8543 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8544 {
8545 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8546 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8547 }
8548
8549 /* Get N vectorized definitions for SLP_NODE. */
8550
8551 void
8552 vect_get_slp_defs (vec_info *,
8553 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8554 {
8555 if (n == -1U)
8556 n = SLP_TREE_CHILDREN (slp_node).length ();
8557
8558 for (unsigned i = 0; i < n; ++i)
8559 {
8560 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8561 vec<tree> vec_defs = vNULL;
8562 vect_get_slp_defs (child, &vec_defs);
8563 vec_oprnds->quick_push (vec_defs);
8564 }
8565 }
8566
8567 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8568 - PERM gives the permutation that the caller wants to use for NODE,
8569 which might be different from SLP_LOAD_PERMUTATION.
8570 - DUMP_P controls whether the function dumps information. */
8571
8572 static bool
8573 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8574 load_permutation_t &perm,
8575 const vec<tree> &dr_chain,
8576 gimple_stmt_iterator *gsi, poly_uint64 vf,
8577 bool analyze_only, bool dump_p,
8578 unsigned *n_perms, unsigned int *n_loads,
8579 bool dce_chain)
8580 {
8581 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8582 int vec_index = 0;
8583 tree vectype = SLP_TREE_VECTYPE (node);
8584 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8585 unsigned int mask_element;
8586 unsigned dr_group_size;
8587 machine_mode mode;
8588
8589 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8590 dr_group_size = 1;
8591 else
8592 {
8593 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8594 dr_group_size = DR_GROUP_SIZE (stmt_info);
8595 }
8596
8597 mode = TYPE_MODE (vectype);
8598 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8599 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8600
8601 /* Initialize the vect stmts of NODE to properly insert the generated
8602 stmts later. */
8603 if (! analyze_only)
8604 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8605 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8606
8607 /* Generate permutation masks for every NODE. Number of masks for each NODE
8608 is equal to GROUP_SIZE.
8609 E.g., we have a group of three nodes with three loads from the same
8610 location in each node, and the vector size is 4. I.e., we have a
8611 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8612 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8613 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8614 ...
8615
8616 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8617 The last mask is illegal since we assume two operands for the permute
8618 operation, and the mask element values can't be outside that range.
8619 Hence, the last mask must be converted into {2,5,5,5}.
8620 For the first two permutations we need the first and the second input
8621 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8622 we need the second and the third vectors: {b1,c1,a2,b2} and
8623 {c2,a3,b3,c3}. */
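/* ({6,9,9,9} picks a2 and a3, which live in the second and third vectors,
   i.e. lanes 4-11; rebasing onto the two vectors actually fed to the
   permute subtracts one vector length (4), giving {2,5,5,5}.) */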
8624
8625 int vect_stmts_counter = 0;
8626 unsigned int index = 0;
8627 int first_vec_index = -1;
8628 int second_vec_index = -1;
8629 bool noop_p = true;
8630 *n_perms = 0;
8631
8632 vec_perm_builder mask;
8633 unsigned int nelts_to_build;
8634 unsigned int nvectors_per_build;
8635 unsigned int in_nlanes;
8636 bool repeating_p = (group_size == dr_group_size
8637 && multiple_p (nunits, group_size));
8638 if (repeating_p)
8639 {
8640 /* A single vector contains a whole number of copies of the node, so:
8641 (a) all permutes can use the same mask; and
8642 (b) the permutes only need a single vector input. */
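/* E.g. for GROUP_SIZE == 2 and a load permutation of {1, 0} the single
   mask built below is encoded as {1, 0, 3, 2, 5, 4} (two patterns of
   three elements), i.e. a swap-adjacent-lanes permute needing only a
   single input vector; the same mask is reused for all NSTMTS outputs. */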
8643 mask.new_vector (nunits, group_size, 3);
8644 nelts_to_build = mask.encoded_nelts ();
8645 /* It's possible to obtain zero nstmts during analyze_only, so make
8646 it at least one to ensure the later computation for n_perms
8647 proceeds. */
8648 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8649 in_nlanes = dr_group_size * 3;
8650 }
8651 else
8652 {
8653 /* We need to construct a separate mask for each vector statement. */
8654 unsigned HOST_WIDE_INT const_nunits, const_vf;
8655 if (!nunits.is_constant (&const_nunits)
8656 || !vf.is_constant (&const_vf))
8657 return false;
8658 mask.new_vector (const_nunits, const_nunits, 1);
8659 nelts_to_build = const_vf * group_size;
8660 nvectors_per_build = 1;
8661 in_nlanes = const_vf * dr_group_size;
8662 }
8663 auto_sbitmap used_in_lanes (in_nlanes);
8664 bitmap_clear (used_in_lanes);
8665 auto_bitmap used_defs;
8666
8667 unsigned int count = mask.encoded_nelts ();
8668 mask.quick_grow (count);
8669 vec_perm_indices indices;
8670
8671 for (unsigned int j = 0; j < nelts_to_build; j++)
8672 {
8673 unsigned int iter_num = j / group_size;
8674 unsigned int stmt_num = j % group_size;
8675 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8676 bitmap_set_bit (used_in_lanes, i);
8677 if (repeating_p)
8678 {
8679 first_vec_index = 0;
8680 mask_element = i;
8681 }
8682 else
8683 {
8684 /* Enforced before the loop when !repeating_p. */
8685 unsigned int const_nunits = nunits.to_constant ();
8686 vec_index = i / const_nunits;
8687 mask_element = i % const_nunits;
8688 if (vec_index == first_vec_index
8689 || first_vec_index == -1)
8690 {
8691 first_vec_index = vec_index;
8692 }
8693 else if (vec_index == second_vec_index
8694 || second_vec_index == -1)
8695 {
8696 second_vec_index = vec_index;
8697 mask_element += const_nunits;
8698 }
8699 else
8700 {
8701 if (dump_p)
8702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8703 "permutation requires at "
8704 "least three vectors %G",
8705 stmt_info->stmt);
8706 gcc_assert (analyze_only);
8707 return false;
8708 }
8709
8710 gcc_assert (mask_element < 2 * const_nunits);
8711 }
8712
8713 if (mask_element != index)
8714 noop_p = false;
8715 mask[index++] = mask_element;
8716
8717 if (index == count)
8718 {
8719 if (!noop_p)
8720 {
8721 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8722 if (!can_vec_perm_const_p (mode, mode, indices))
8723 {
8724 if (dump_p)
8725 {
8726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8727 "unsupported vect permute { ");
8728 for (i = 0; i < count; ++i)
8729 {
8730 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8731 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8732 }
8733 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8734 }
8735 gcc_assert (analyze_only);
8736 return false;
8737 }
8738
8739 tree mask_vec = NULL_TREE;
8740 if (!analyze_only)
8741 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8742
8743 if (second_vec_index == -1)
8744 second_vec_index = first_vec_index;
8745
8746 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8747 {
8748 ++*n_perms;
8749 if (analyze_only)
8750 continue;
8751 /* Generate the permute statement if necessary. */
8752 tree first_vec = dr_chain[first_vec_index + ri];
8753 tree second_vec = dr_chain[second_vec_index + ri];
8754 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8755 tree perm_dest
8756 = vect_create_destination_var (gimple_assign_lhs (stmt),
8757 vectype);
8758 perm_dest = make_ssa_name (perm_dest);
8759 gimple *perm_stmt
8760 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8761 second_vec, mask_vec);
8762 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8763 gsi);
8764 if (dce_chain)
8765 {
8766 bitmap_set_bit (used_defs, first_vec_index + ri);
8767 bitmap_set_bit (used_defs, second_vec_index + ri);
8768 }
8769
8770 /* Store the vector statement in NODE. */
8771 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8772 }
8773 }
8774 else if (!analyze_only)
8775 {
8776 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8777 {
8778 tree first_vec = dr_chain[first_vec_index + ri];
8779 /* If mask was NULL_TREE generate the requested
8780 identity transform. */
8781 if (dce_chain)
8782 bitmap_set_bit (used_defs, first_vec_index + ri);
8783
8784 /* Store the vector statement in NODE. */
8785 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8786 }
8787 }
8788
8789 index = 0;
8790 first_vec_index = -1;
8791 second_vec_index = -1;
8792 noop_p = true;
8793 }
8794 }
8795
8796 if (n_loads)
8797 {
8798 if (repeating_p)
8799 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8800 else
8801 {
8802 /* Enforced above when !repeating_p. */
8803 unsigned int const_nunits = nunits.to_constant ();
8804 *n_loads = 0;
8805 bool load_seen = false;
8806 for (unsigned i = 0; i < in_nlanes; ++i)
8807 {
8808 if (i % const_nunits == 0)
8809 {
8810 if (load_seen)
8811 *n_loads += 1;
8812 load_seen = false;
8813 }
8814 if (bitmap_bit_p (used_in_lanes, i))
8815 load_seen = true;
8816 }
8817 if (load_seen)
8818 *n_loads += 1;
8819 }
8820 }
8821
8822 if (dce_chain)
8823 for (unsigned i = 0; i < dr_chain.length (); ++i)
8824 if (!bitmap_bit_p (used_defs, i))
8825 {
8826 tree def = dr_chain[i];
8827 do
8828 {
8829 gimple *stmt = SSA_NAME_DEF_STMT (def);
8830 if (is_gimple_assign (stmt)
8831 && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
8832 || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
8833 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
8834 else
8835 def = NULL;
8836 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8837 gsi_remove (&rgsi, true);
8838 release_defs (stmt);
8839 }
8840 while (def);
8841 }
8842
8843 return true;
8844 }
8845
8846 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8847 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8848 permute statements for the SLP node NODE. Store the number of vector
8849 permute instructions in *N_PERMS and the number of vector load
8850 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8851 that were not needed. */
8852
8853 bool
8854 vect_transform_slp_perm_load (vec_info *vinfo,
8855 slp_tree node, const vec<tree> &dr_chain,
8856 gimple_stmt_iterator *gsi, poly_uint64 vf,
8857 bool analyze_only, unsigned *n_perms,
8858 unsigned int *n_loads, bool dce_chain)
8859 {
8860 return vect_transform_slp_perm_load_1 (vinfo, node,
8861 SLP_TREE_LOAD_PERMUTATION (node),
8862 dr_chain, gsi, vf, analyze_only,
8863 dump_enabled_p (), n_perms, n_loads,
8864 dce_chain);
8865 }
8866
8867 /* Produce the next vector result for SLP permutation NODE by adding a vector
8868 statement at GSI. If MASK_VEC is nonnull, add:
8869
8870 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8871
8872 otherwise add:
8873
8874 <new SSA name> = FIRST_DEF. */
8875
8876 static void
8877 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8878 slp_tree node, tree first_def, tree second_def,
8879 tree mask_vec, poly_uint64 identity_offset)
8880 {
8881 tree vectype = SLP_TREE_VECTYPE (node);
8882
8883 /* ??? We SLP match existing vector element extracts but
8884 allow punning which we need to re-instantiate at uses
8885 but have no good way of explicitly representing. */
8886 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8887 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8888 {
8889 gassign *conv_stmt
8890 = gimple_build_assign (make_ssa_name (vectype),
8891 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8892 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8893 first_def = gimple_assign_lhs (conv_stmt);
8894 }
8895 gassign *perm_stmt;
8896 tree perm_dest = make_ssa_name (vectype);
8897 if (mask_vec)
8898 {
8899 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8900 TYPE_SIZE (vectype))
8901 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8902 {
8903 gassign *conv_stmt
8904 = gimple_build_assign (make_ssa_name (vectype),
8905 build1 (VIEW_CONVERT_EXPR,
8906 vectype, second_def));
8907 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8908 second_def = gimple_assign_lhs (conv_stmt);
8909 }
8910 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8911 first_def, second_def,
8912 mask_vec);
8913 }
8914 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8915 {
8916 /* For identity permutes we still need to handle the case
8917 of offsetted extracts or concats. */
8918 unsigned HOST_WIDE_INT c;
8919 auto first_def_nunits
8920 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8921 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8922 {
8923 unsigned HOST_WIDE_INT elsz
8924 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8925 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8926 TYPE_SIZE (vectype),
8927 bitsize_int (identity_offset * elsz));
8928 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8929 }
8930 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8931 first_def_nunits, &c) && c == 2)
8932 {
8933 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8934 NULL_TREE, second_def);
8935 perm_stmt = gimple_build_assign (perm_dest, ctor);
8936 }
8937 else
8938 gcc_unreachable ();
8939 }
8940 else
8941 {
8942 /* We need a copy here in case the def was external. */
8943 perm_stmt = gimple_build_assign (perm_dest, first_def);
8944 }
8945 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8946 /* Store the vector statement in NODE. */
8947 node->push_vec_def (perm_stmt);
8948 }
8949
8950 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8951 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8952 If GSI is nonnull, emit the permutation there.
8953
8954 When GSI is null, the only purpose of NODE is to give properties
8955 of the result, such as the vector type and number of SLP lanes.
8956 The node does not need to be a VEC_PERM_EXPR.
8957
8958 If the target supports the operation, return the number of individual
8959 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8960 dump file if DUMP_P is true. */
8961
8962 static int
8963 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8964 slp_tree node, lane_permutation_t &perm,
8965 vec<slp_tree> &children, bool dump_p)
8966 {
8967 tree vectype = SLP_TREE_VECTYPE (node);
8968
8969 /* ??? We currently only support all same vector input types
8970 while the SLP IL should really do a concat + select and thus accept
8971 arbitrary mismatches. */
8972 slp_tree child;
8973 unsigned i;
8974 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8975 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8976 tree op_vectype = NULL_TREE;
8977 FOR_EACH_VEC_ELT (children, i, child)
8978 if (SLP_TREE_VECTYPE (child))
8979 {
8980 op_vectype = SLP_TREE_VECTYPE (child);
8981 break;
8982 }
8983 if (!op_vectype)
8984 op_vectype = vectype;
8985 FOR_EACH_VEC_ELT (children, i, child)
8986 {
8987 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8988 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8989 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8990 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8991 {
8992 if (dump_p)
8993 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8994 "Unsupported vector types in lane permutation\n");
8995 return -1;
8996 }
8997 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8998 repeating_p = false;
8999 }
9000
9001 gcc_assert (perm.length () == SLP_TREE_LANES (node));
9002 if (dump_p)
9003 {
9004 dump_printf_loc (MSG_NOTE, vect_location,
9005 "vectorizing permutation");
9006 for (unsigned i = 0; i < perm.length (); ++i)
9007 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
9008 if (repeating_p)
9009 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
9010 dump_printf (MSG_NOTE, "\n");
9011 }
9012
9013 /* REPEATING_P is true if every output vector is guaranteed to use the
9014 same permute vector. We can handle that case for both variable-length
9015 and constant-length vectors, but we only handle other cases for
9016 constant-length vectors.
9017
9018 Set:
9019
9020 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
9021 mask vector that we want to build.
9022
9023 - NCOPIES to the number of copies of PERM that we need in order
9024 to build the necessary permute mask vectors.
9025
9026 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
9027 for each permute mask vector. This is only relevant when GSI is
9028 nonnull. */
9029 uint64_t npatterns;
9030 unsigned nelts_per_pattern;
9031 uint64_t ncopies;
9032 unsigned noutputs_per_mask;
9033 if (repeating_p)
9034 {
9035 /* We need a single permute mask vector that has the form:
9036
9037 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
9038
9039 In other words, the original n-element permute in PERM is
9040 "unrolled" to fill a full vector. The stepped vector encoding
9041 that we use for permutes requires 3n elements. */
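/* E.g. a two-lane permute { X1, X2 } is encoded with NPATTERNS == 2 and
   NELTS_PER_PATTERN == 3 as the six leading elements
   { X1, X2, X1 + 2, X2 + 2, X1 + 4, X2 + 4 }. */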
9042 npatterns = SLP_TREE_LANES (node);
9043 nelts_per_pattern = ncopies = 3;
9044 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9045 }
9046 else
9047 {
9048 /* Calculate every element of every permute mask vector explicitly,
9049 instead of relying on the pattern described above. */
9050 if (!nunits.is_constant (&npatterns)
9051 || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
9052 return -1;
9053 nelts_per_pattern = ncopies = 1;
9054 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
9055 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
9056 return -1;
9057 noutputs_per_mask = 1;
9058 }
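/* E.g. in the non-repeating case with a V4SI result (NUNITS == 4), two
   node lanes and a constant VF of 4 this gives NPATTERNS == 4,
   NELTS_PER_PATTERN == 1, NCOPIES == 4 and NOUTPUTS_PER_MASK == 1,
   i.e. OLANES == 8 lanes spread over two explicitly built four-element
   permute masks. */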
9059 unsigned olanes = ncopies * SLP_TREE_LANES (node);
9060 gcc_assert (repeating_p || multiple_p (olanes, nunits));
9061
9062 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
9063 from the { SLP operand, scalar lane } permutation as recorded in the
9064 SLP node as an intermediate step. This part should already work
9065 with SLP children with an arbitrary number of lanes. */
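/* E.g. for the four-lane blend [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   of two V4SI children the first copy becomes
   { { 0, 0 }, 0 } { { 1, 0 }, 1 } { { 0, 0 }, 2 } { { 1, 0 }, 3 },
   printed as vops0[0][0] vops1[0][1] vops0[0][2] vops1[0][3] below. */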
9066 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
9067 auto_vec<unsigned> active_lane;
9068 vperm.create (olanes);
9069 active_lane.safe_grow_cleared (children.length (), true);
9070 for (unsigned i = 0; i < ncopies; ++i)
9071 {
9072 for (unsigned pi = 0; pi < perm.length (); ++pi)
9073 {
9074 std::pair<unsigned, unsigned> p = perm[pi];
9075 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
9076 if (repeating_p)
9077 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
9078 else
9079 {
9080 /* We checked above that the vectors are constant-length. */
9081 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
9082 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
9083 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
9084 vperm.quick_push ({{p.first, vi}, vl});
9085 }
9086 }
9087 /* Advance to the next group. */
9088 for (unsigned j = 0; j < children.length (); ++j)
9089 active_lane[j] += SLP_TREE_LANES (children[j]);
9090 }
9091
9092 if (dump_p)
9093 {
9094 dump_printf_loc (MSG_NOTE, vect_location,
9095 "vectorizing permutation");
9096 for (unsigned i = 0; i < perm.length (); ++i)
9097 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
9098 if (repeating_p)
9099 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
9100 dump_printf (MSG_NOTE, "\n");
9101 dump_printf_loc (MSG_NOTE, vect_location, "as");
9102 for (unsigned i = 0; i < vperm.length (); ++i)
9103 {
9104 if (i != 0
9105 && (repeating_p
9106 ? multiple_p (i, npatterns)
9107 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
9108 dump_printf (MSG_NOTE, ",");
9109 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
9110 vperm[i].first.first, vperm[i].first.second,
9111 vperm[i].second);
9112 }
9113 dump_printf (MSG_NOTE, "\n");
9114 }
9115
9116 /* We can only handle two-vector permutes, everything else should
9117 be lowered on the SLP level. The following is closely inspired
9118 by vect_transform_slp_perm_load and is supposed to eventually
9119 replace it.
9120 ??? As intermediate step do code-gen in the SLP tree representation
9121 somehow? */
9122 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
9123 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
9124 unsigned int index = 0;
9125 poly_uint64 mask_element;
9126 vec_perm_builder mask;
9127 mask.new_vector (nunits, npatterns, nelts_per_pattern);
9128 unsigned int count = mask.encoded_nelts ();
9129 mask.quick_grow (count);
9130 vec_perm_indices indices;
9131 unsigned nperms = 0;
9132 for (unsigned i = 0; i < vperm.length (); ++i)
9133 {
9134 mask_element = vperm[i].second;
9135 if (first_vec.first == -1U
9136 || first_vec == vperm[i].first)
9137 first_vec = vperm[i].first;
9138 else if (second_vec.first == -1U
9139 || second_vec == vperm[i].first)
9140 {
9141 second_vec = vperm[i].first;
9142 mask_element += nunits;
9143 }
9144 else
9145 {
9146 if (dump_p)
9147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9148 "permutation requires at "
9149 "least three vectors\n");
9150 gcc_assert (!gsi);
9151 return -1;
9152 }
9153
9154 mask[index++] = mask_element;
9155
9156 if (index == count)
9157 {
9158 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
9159 TYPE_VECTOR_SUBPARTS (op_vectype));
9160 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
9161 && constant_multiple_p (mask[0], nunits));
9162 machine_mode vmode = TYPE_MODE (vectype);
9163 machine_mode op_vmode = TYPE_MODE (op_vectype);
9164 unsigned HOST_WIDE_INT c;
9165 if ((!identity_p
9166 && !can_vec_perm_const_p (vmode, op_vmode, indices))
9167 || (identity_p
9168 && !known_le (nunits,
9169 TYPE_VECTOR_SUBPARTS (op_vectype))
9170 && (!constant_multiple_p (nunits,
9171 TYPE_VECTOR_SUBPARTS (op_vectype),
9172 &c) || c != 2)))
9173 {
9174 if (dump_p)
9175 {
9176 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
9177 vect_location,
9178 "unsupported vect permute { ");
9179 for (i = 0; i < count; ++i)
9180 {
9181 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
9182 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
9183 }
9184 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
9185 }
9186 gcc_assert (!gsi);
9187 return -1;
9188 }
9189
9190 if (!identity_p)
9191 nperms++;
9192 if (gsi)
9193 {
9194 if (second_vec.first == -1U)
9195 second_vec = first_vec;
9196
9197 slp_tree
9198 first_node = children[first_vec.first],
9199 second_node = children[second_vec.first];
9200
9201 tree mask_vec = NULL_TREE;
9202 if (!identity_p)
9203 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
9204
9205 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
9206 {
9207 tree first_def
9208 = vect_get_slp_vect_def (first_node,
9209 first_vec.second + vi);
9210 tree second_def
9211 = vect_get_slp_vect_def (second_node,
9212 second_vec.second + vi);
9213 vect_add_slp_permutation (vinfo, gsi, node, first_def,
9214 second_def, mask_vec, mask[0]);
9215 }
9216 }
9217
9218 index = 0;
9219 first_vec = std::make_pair (-1U, -1U);
9220 second_vec = std::make_pair (-1U, -1U);
9221 }
9222 }
9223
9224 return nperms;
9225 }
9226
9227 /* Vectorize the SLP permutations in NODE as specified
9228 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
9229 child number and lane number.
9230 Interleaving of two two-lane two-child SLP subtrees (not supported):
9231 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
9232 A blend of two four-lane two-child SLP subtrees:
9233 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
9234 Highpart of a four-lane one-child SLP subtree (not supported):
9235 [ { 0, 2 }, { 0, 3 } ]
9236 Where currently only a subset is supported by the code generation below. */
9237
9238 static bool
9239 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
9240 slp_tree node, stmt_vector_for_cost *cost_vec)
9241 {
9242 tree vectype = SLP_TREE_VECTYPE (node);
9243 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
9244 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
9245 SLP_TREE_CHILDREN (node),
9246 dump_enabled_p ());
9247 if (nperms < 0)
9248 return false;
9249
9250 if (!gsi)
9251 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
9252
9253 return true;
9254 }
9255
9256 /* Vectorize SLP NODE. */
9257
9258 static void
9259 vect_schedule_slp_node (vec_info *vinfo,
9260 slp_tree node, slp_instance instance)
9261 {
9262 gimple_stmt_iterator si;
9263 int i;
9264 slp_tree child;
9265
9266 /* Vectorize externals and constants. */
9267 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
9268 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
9269 {
9270 /* ??? vectorizable_shift can end up using a scalar operand which is
9271 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
9272 node in this case. */
9273 if (!SLP_TREE_VECTYPE (node))
9274 return;
9275
9276 /* There are two reasons vector defs might already exist. The first
9277 is that we are vectorizing an existing vector def. The second is
9278 when performing BB vectorization shared constant/external nodes
9279 are not split apart during partitioning so during the code-gen
9280 DFS walk we can end up visiting them twice. */
9281 if (! SLP_TREE_VEC_DEFS (node).exists ())
9282 vect_create_constant_vectors (vinfo, node);
9283 return;
9284 }
9285
9286 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
9287
9288 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
9289
9290 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
9291 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
9292
9293 if (dump_enabled_p ())
9294 dump_printf_loc (MSG_NOTE, vect_location,
9295 "------>vectorizing SLP node starting from: %G",
9296 stmt_info->stmt);
9297
9298 if (STMT_VINFO_DATA_REF (stmt_info)
9299 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9300 {
9301 /* Vectorized loads go before the first scalar load to make it
9302 ready early, vectorized stores go before the last scalar
9303 stmt which is where all uses are ready. */
9304 stmt_vec_info last_stmt_info = NULL;
9305 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
9306 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
9307 else /* DR_IS_WRITE */
9308 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
9309 si = gsi_for_stmt (last_stmt_info->stmt);
9310 }
9311 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
9312 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
9313 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
9314 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9315 {
9316 /* For PHI node vectorization we do not use the insertion iterator. */
9317 si = gsi_none ();
9318 }
9319 else
9320 {
9321 /* Emit other stmts after the children's vectorized defs, which is the
9322 earliest possible place. */
9323 gimple *last_stmt = NULL;
9324 if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
9325 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
9326 || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
9327 {
9328 /* But avoid scheduling internal defs outside of the loop when
9329 we might have only implicitly tracked loop mask/len defs. */
9330 gimple_stmt_iterator si
9331 = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
9332 last_stmt = *si;
9333 }
9334 bool seen_vector_def = false;
9335 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9336 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9337 {
9338 /* For fold-left reductions we are retaining the scalar
9339 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
9340 set so the representation isn't perfect. Resort to the
9341 last scalar def here. */
9342 if (SLP_TREE_VEC_DEFS (child).is_empty ())
9343 {
9344 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
9345 == cycle_phi_info_type);
9346 gphi *phi = as_a <gphi *>
9347 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
9348 if (!last_stmt
9349 || vect_stmt_dominates_stmt_p (last_stmt, phi))
9350 last_stmt = phi;
9351 }
9352 /* We are emitting all vectorized stmts in the same place and
9353 the last one is the last.
9354 ??? Unless we have a load permutation applied and that
9355 figures to re-use an earlier generated load. */
9356 unsigned j;
9357 tree vdef;
9358 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9359 {
9360 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9361 if (!last_stmt
9362 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9363 last_stmt = vstmt;
9364 }
9365 }
9366 else if (!SLP_TREE_VECTYPE (child))
9367 {
9368 /* For externals without a vectype the scalar defs are used unvectorized. */
9369 unsigned j;
9370 tree def;
9371 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9372 if (TREE_CODE (def) == SSA_NAME
9373 && !SSA_NAME_IS_DEFAULT_DEF (def))
9374 {
9375 gimple *stmt = SSA_NAME_DEF_STMT (def);
9376 if (!last_stmt
9377 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9378 last_stmt = stmt;
9379 }
9380 }
9381 else
9382 {
9383 /* For externals we have to look at all defs since their
9384 insertion place is decided per vector. But beware
9385 of pre-existing vectors where we need to make sure
9386 we do not insert before the region boundary. */
9387 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9388 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9389 seen_vector_def = true;
9390 else
9391 {
9392 unsigned j;
9393 tree vdef;
9394 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9395 if (TREE_CODE (vdef) == SSA_NAME
9396 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9397 {
9398 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9399 if (!last_stmt
9400 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9401 last_stmt = vstmt;
9402 }
9403 }
9404 }
9405 /* This can happen when all children are pre-existing vectors or
9406 constants. */
9407 if (!last_stmt)
9408 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9409 if (!last_stmt)
9410 {
9411 gcc_assert (seen_vector_def);
9412 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9413 }
9414 else if (is_ctrl_altering_stmt (last_stmt))
9415 {
9416 /* We split regions to vectorize at control altering stmts
9417 with a definition so this must be an external which
9418 we can insert at the start of the region. */
9419 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9420 }
9421 else if (is_a <bb_vec_info> (vinfo)
9422 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9423 && gimple_could_trap_p (stmt_info->stmt))
9424 {
9425 /* We've constrained possibly trapping operations to all come
9426 from the same basic-block; even if vectorized defs would allow earlier
9427 scheduling, still force vectorized stmts to the original block.
9428 This is only necessary for BB vectorization since for loop vect
9429 all operations are in a single BB and scalar stmt based
9430 placement doesn't play well with epilogue vectorization. */
9431 gcc_assert (dominated_by_p (CDI_DOMINATORS,
9432 gimple_bb (stmt_info->stmt),
9433 gimple_bb (last_stmt)));
9434 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9435 }
9436 else if (is_a <gphi *> (last_stmt))
9437 si = gsi_after_labels (gimple_bb (last_stmt));
9438 else
9439 {
9440 si = gsi_for_stmt (last_stmt);
9441 gsi_next (&si);
9442 }
9443 }
9444
9445 /* Handle purely internal nodes. */
9446 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9447 {
9448 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
9449 be shared with different SLP nodes (but usually it's the same
9450 operation apart from the case the stmt is only there for denoting
9451 the actual scalar lane defs ...). So do not call vect_transform_stmt
9452 but open-code it here (partly). */
9453 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9454 gcc_assert (done);
9455 stmt_vec_info slp_stmt_info;
9456 unsigned int i;
9457 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9458 if (STMT_VINFO_LIVE_P (slp_stmt_info))
9459 {
9460 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9461 instance, i, true, NULL);
9462 gcc_assert (done);
9463 }
9464 }
9465 else
9466 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9467 }
9468
9469 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
9470 For loop vectorization this is done in vectorizable_call, but for SLP
9471 it needs to be deferred until end of vect_schedule_slp, because multiple
9472 SLP instances may refer to the same scalar stmt. */
9473
9474 static void
9475 vect_remove_slp_scalar_calls (vec_info *vinfo,
9476 slp_tree node, hash_set<slp_tree> &visited)
9477 {
9478 gimple *new_stmt;
9479 gimple_stmt_iterator gsi;
9480 int i;
9481 slp_tree child;
9482 tree lhs;
9483 stmt_vec_info stmt_info;
9484
9485 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9486 return;
9487
9488 if (visited.add (node))
9489 return;
9490
9491 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9492 vect_remove_slp_scalar_calls (vinfo, child, visited);
9493
9494 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9495 {
9496 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9497 if (!stmt || gimple_bb (stmt) == NULL)
9498 continue;
9499 if (is_pattern_stmt_p (stmt_info)
9500 || !PURE_SLP_STMT (stmt_info))
9501 continue;
9502 lhs = gimple_call_lhs (stmt);
9503 if (lhs)
9504 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9505 else
9506 {
9507 new_stmt = gimple_build_nop ();
9508 unlink_stmt_vdef (stmt_info->stmt);
9509 }
9510 gsi = gsi_for_stmt (stmt);
9511 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9512 if (lhs)
9513 SSA_NAME_DEF_STMT (lhs) = new_stmt;
9514 }
9515 }
9516
9517 static void
9518 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9519 {
9520 hash_set<slp_tree> visited;
9521 vect_remove_slp_scalar_calls (vinfo, node, visited);
9522 }
9523
9524 /* Vectorize the instance root. */
9525
9526 void
9527 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9528 {
9529 gassign *rstmt = NULL;
9530
9531 if (instance->kind == slp_inst_kind_ctor)
9532 {
9533 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9534 {
9535 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9536 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9537 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9538 TREE_TYPE (vect_lhs)))
9539 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9540 vect_lhs);
9541 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9542 }
9543 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9544 {
9545 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9546 tree child_def;
9547 int j;
9548 vec<constructor_elt, va_gc> *v;
9549 vec_alloc (v, nelts);
9550
9551 /* A CTOR can handle V16HI composition from VNx8HI so we
9552 do not need to convert vector elements if the types
9553 do not match. */
9554 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9555 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9556 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9557 tree rtype
9558 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9559 tree r_constructor = build_constructor (rtype, v);
9560 rstmt = gimple_build_assign (lhs, r_constructor);
9561 }
9562 }
9563 else if (instance->kind == slp_inst_kind_bb_reduc)
9564 {
9565 /* Largely inspired by reduction chain epilogue handling in
9566 vect_create_epilog_for_reduction. */
9567 vec<tree> vec_defs = vNULL;
9568 vect_get_slp_defs (node, &vec_defs);
9569 enum tree_code reduc_code
9570 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9571 /* ??? We actually have to reflect signs somewhere. */
9572 if (reduc_code == MINUS_EXPR)
9573 reduc_code = PLUS_EXPR;
9574 gimple_seq epilogue = NULL;
9575 /* We may end up with more than one vector result, reduce them
9576 to one vector. */
9577 tree vec_def = vec_defs[0];
9578 tree vectype = TREE_TYPE (vec_def);
9579 tree compute_vectype = vectype;
9580 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9581 && TYPE_OVERFLOW_UNDEFINED (vectype)
9582 && operation_can_overflow (reduc_code));
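/* Signed overflow in the re-associated additions below would be undefined,
   so in that case compute the reduction in the corresponding unsigned type
   and convert back at the end. */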
9583 if (pun_for_overflow_p)
9584 {
9585 compute_vectype = unsigned_type_for (vectype);
9586 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9587 compute_vectype, vec_def);
9588 }
9589 for (unsigned i = 1; i < vec_defs.length (); ++i)
9590 {
9591 tree def = vec_defs[i];
9592 if (pun_for_overflow_p)
9593 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9594 compute_vectype, def);
9595 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9596 vec_def, def);
9597 }
9598 vec_defs.release ();
9599 /* ??? Support other schemes than direct internal fn. */
9600 internal_fn reduc_fn;
9601 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9602 || reduc_fn == IFN_LAST)
9603 gcc_unreachable ();
9604 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9605 TREE_TYPE (compute_vectype), vec_def);
9606 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9607 {
9608 tree rem_def = NULL_TREE;
9609 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9610 {
9611 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9612 if (!rem_def)
9613 rem_def = def;
9614 else
9615 rem_def = gimple_build (&epilogue, reduc_code,
9616 TREE_TYPE (scalar_def),
9617 rem_def, def);
9618 }
9619 scalar_def = gimple_build (&epilogue, reduc_code,
9620 TREE_TYPE (scalar_def),
9621 scalar_def, rem_def);
9622 }
9623 scalar_def = gimple_convert (&epilogue,
9624 TREE_TYPE (vectype), scalar_def);
9625 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9626 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9627 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9628 update_stmt (gsi_stmt (rgsi));
9629 return;
9630 }
9631 else
9632 gcc_unreachable ();
9633
9634 gcc_assert (rstmt);
9635
9636 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9637 gsi_replace (&rgsi, rstmt, true);
9638 }
9639
9640 struct slp_scc_info
9641 {
9642 bool on_stack;
9643 int dfs;
9644 int lowlink;
9645 };
9646
9647 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
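/* The DFS/LOWLINK bookkeeping follows Tarjan's SCC algorithm: a node whose
   lowlink equals its own DFS number is the root of an SCC formed by the
   nodes above it on STACK. */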
9648
9649 static void
9650 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9651 hash_map<slp_tree, slp_scc_info> &scc_info,
9652 int &maxdfs, vec<slp_tree> &stack)
9653 {
9654 bool existed_p;
9655 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9656 gcc_assert (!existed_p);
9657 info->dfs = maxdfs;
9658 info->lowlink = maxdfs;
9659 maxdfs++;
9660
9661 /* Leaf. */
9662 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9663 {
9664 info->on_stack = false;
9665 vect_schedule_slp_node (vinfo, node, instance);
9666 return;
9667 }
9668
9669 info->on_stack = true;
9670 stack.safe_push (node);
9671
9672 unsigned i;
9673 slp_tree child;
9674 /* DFS recurse. */
9675 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9676 {
9677 if (!child)
9678 continue;
9679 slp_scc_info *child_info = scc_info.get (child);
9680 if (!child_info)
9681 {
9682 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9683 /* Recursion might have re-allocated the node. */
9684 info = scc_info.get (node);
9685 child_info = scc_info.get (child);
9686 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9687 }
9688 else if (child_info->on_stack)
9689 info->lowlink = MIN (info->lowlink, child_info->dfs);
9690 }
9691 if (info->lowlink != info->dfs)
9692 return;
9693
9694 auto_vec<slp_tree, 4> phis_to_fixup;
9695
9696 /* Singleton. */
9697 if (stack.last () == node)
9698 {
9699 stack.pop ();
9700 info->on_stack = false;
9701 vect_schedule_slp_node (vinfo, node, instance);
9702 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9703 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9704 phis_to_fixup.quick_push (node);
9705 }
9706 else
9707 {
9708 /* SCC. */
9709 int last_idx = stack.length () - 1;
9710 while (stack[last_idx] != node)
9711 last_idx--;
9712 /* We can break the cycle at PHIs that have at least one child
9713 code generated. Then we could re-start the DFS walk until
9714 all nodes in the SCC are covered (we might have new entries
9715 for only back-reachable nodes). But it's simpler to just
9716 iterate and schedule those that are ready. */
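/* E.g. for a simple reduction the SCC is { PHI, ADD }: the PHI is ready
   because its preheader child is already generated, scheduling it makes
   the ADD ready, and the PHI's backedge argument is filled in by the
   fixup loop below. */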
9717 unsigned todo = stack.length () - last_idx;
9718 do
9719 {
9720 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9721 {
9722 slp_tree entry = stack[idx];
9723 if (!entry)
9724 continue;
9725 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9726 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9727 bool ready = !phi;
9728 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9729 if (!child)
9730 {
9731 gcc_assert (phi);
9732 ready = true;
9733 break;
9734 }
9735 else if (scc_info.get (child)->on_stack)
9736 {
9737 if (!phi)
9738 {
9739 ready = false;
9740 break;
9741 }
9742 }
9743 else
9744 {
9745 if (phi)
9746 {
9747 ready = true;
9748 break;
9749 }
9750 }
9751 if (ready)
9752 {
9753 vect_schedule_slp_node (vinfo, entry, instance);
9754 scc_info.get (entry)->on_stack = false;
9755 stack[idx] = NULL;
9756 todo--;
9757 if (phi)
9758 phis_to_fixup.safe_push (entry);
9759 }
9760 }
9761 }
9762 while (todo != 0);
9763
9764 /* Pop the SCC. */
9765 stack.truncate (last_idx);
9766 }
9767
9768 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9769 slp_tree phi_node;
9770 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9771 {
9772 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9773 edge_iterator ei;
9774 edge e;
9775 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9776 {
9777 unsigned dest_idx = e->dest_idx;
9778 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9779 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9780 continue;
9781 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9782 /* Simply fill all args. */
9783 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9784 != vect_first_order_recurrence)
9785 for (unsigned i = 0; i < n; ++i)
9786 {
9787 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9788 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9789 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9790 e, gimple_phi_arg_location (phi, dest_idx));
9791 }
9792 else
9793 {
9794 /* Unless it is a first order recurrence which needs
9795 args filled in for both the PHI node and the permutes. */
9796 gimple *perm
9797 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9798 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9799 add_phi_arg (as_a <gphi *> (rphi),
9800 vect_get_slp_vect_def (child, n - 1),
9801 e, gimple_phi_arg_location (phi, dest_idx));
9802 for (unsigned i = 0; i < n; ++i)
9803 {
9804 gimple *perm
9805 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9806 if (i > 0)
9807 gimple_assign_set_rhs1 (perm,
9808 vect_get_slp_vect_def (child, i - 1));
9809 gimple_assign_set_rhs2 (perm,
9810 vect_get_slp_vect_def (child, i));
9811 update_stmt (perm);
9812 }
9813 }
9814 }
9815 }
9816 }
9817
9818 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9819
9820 void
9821 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9822 {
9823 slp_instance instance;
9824 unsigned int i;
9825
9826 hash_map<slp_tree, slp_scc_info> scc_info;
9827 int maxdfs = 0;
9828 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9829 {
9830 slp_tree node = SLP_INSTANCE_TREE (instance);
9831 if (dump_enabled_p ())
9832 {
9833 dump_printf_loc (MSG_NOTE, vect_location,
9834 "Vectorizing SLP tree:\n");
9835 /* ??? Dump all? */
9836 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9837 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9838 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9839 vect_print_slp_graph (MSG_NOTE, vect_location,
9840 SLP_INSTANCE_TREE (instance));
9841 }
9842 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
9843 have a PHI be the node breaking the cycle. */
9844 auto_vec<slp_tree> stack;
9845 if (!scc_info.get (node))
9846 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9847
9848 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9849 vectorize_slp_instance_root_stmt (node, instance);
9850
9851 if (dump_enabled_p ())
9852 dump_printf_loc (MSG_NOTE, vect_location,
9853 "vectorizing stmts using SLP.\n");
9854 }
9855
9856 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9857 {
9858 slp_tree root = SLP_INSTANCE_TREE (instance);
9859 stmt_vec_info store_info;
9860 unsigned int j;
9861
9862 /* Remove scalar call stmts. Do not do this for basic-block
9863 vectorization as not all uses may be vectorized.
9864 ??? Why should this be necessary? DCE should be able to
9865 remove the stmts itself.
9866 ??? For BB vectorization we can as well remove scalar
9867 stmts starting from the SLP tree root if they have no
9868 uses. */
9869 if (is_a <loop_vec_info> (vinfo))
9870 vect_remove_slp_scalar_calls (vinfo, root);
9871
9872 /* Remove the vectorized stores' original scalar stmts. */
9873 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9874 {
9875 if (!STMT_VINFO_DATA_REF (store_info)
9876 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9877 break;
9878
9879 store_info = vect_orig_stmt (store_info);
9880 /* Free the attached stmt_vec_info and remove the stmt. */
9881 vinfo->remove_stmt (store_info);
9882
9883 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
9884 to not crash in vect_free_slp_tree later. */
9885 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9886 SLP_TREE_REPRESENTATIVE (root) = NULL;
9887 }
9888 }
9889 }