gcc/tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
55
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
70
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
73
74 void
75 vect_slp_init (void)
76 {
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
78 }
79
80 void
81 vect_slp_fini (void)
82 {
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
87 }
88
89 void *
90 _slp_tree::operator new (size_t n)
91 {
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
94 }
95
96 void
97 _slp_tree::operator delete (void *node, size_t n)
98 {
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
101 }
102
103
104 /* Initialize a SLP node. */
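/* Note: the constructor below also links each new node into the global list
   headed by slp_first_node (and the destructor unlinks it again), so that
   vect_slp_fini can release any nodes still live when vectorization ends. */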
105
106 _slp_tree::_slp_tree ()
107 {
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_DEFS (this) = vNULL;
116 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
117 SLP_TREE_CHILDREN (this) = vNULL;
118 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
119 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
120 SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
121 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
122 SLP_TREE_CODE (this) = ERROR_MARK;
123 SLP_TREE_VECTYPE (this) = NULL_TREE;
124 SLP_TREE_REPRESENTATIVE (this) = NULL;
125 SLP_TREE_REF_COUNT (this) = 1;
126 this->failed = NULL;
127 this->max_nunits = 1;
128 this->lanes = 0;
129 }
130
131 /* Tear down a SLP node. */
132
133 _slp_tree::~_slp_tree ()
134 {
135 if (this->prev_node)
136 this->prev_node->next_node = this->next_node;
137 else
138 slp_first_node = this->next_node;
139 if (this->next_node)
140 this->next_node->prev_node = this->prev_node;
141 SLP_TREE_CHILDREN (this).release ();
142 SLP_TREE_SCALAR_STMTS (this).release ();
143 SLP_TREE_SCALAR_OPS (this).release ();
144 SLP_TREE_VEC_DEFS (this).release ();
145 SLP_TREE_LOAD_PERMUTATION (this).release ();
146 SLP_TREE_LANE_PERMUTATION (this).release ();
147 SLP_TREE_SIMD_CLONE_INFO (this).release ();
148 if (this->failed)
149 free (failed);
150 }
151
152 /* Push the single SSA definition in DEF to the vector of vector defs. */
153
154 void
155 _slp_tree::push_vec_def (gimple *def)
156 {
157 if (gphi *phi = dyn_cast <gphi *> (def))
158 vec_defs.quick_push (gimple_phi_result (phi));
159 else
160 {
161 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
162 vec_defs.quick_push (get_def_from_ptr (defop));
163 }
164 }
165
166 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
167
168 void
169 vect_free_slp_tree (slp_tree node)
170 {
171 int i;
172 slp_tree child;
173
174 if (--SLP_TREE_REF_COUNT (node) != 0)
175 return;
176
177 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
178 if (child)
179 vect_free_slp_tree (child);
180
181 /* If the node defines any SLP-only patterns then those patterns are no
182 longer valid and should be removed. */
183 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
184 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
185 {
186 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
187 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
188 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
189 }
190
191 delete node;
192 }
193
194 /* Return a location suitable for dumps related to the SLP instance. */
195
196 dump_user_location_t
197 _slp_instance::location () const
198 {
199 if (!root_stmts.is_empty ())
200 return root_stmts[0]->stmt;
201 else
202 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
203 }
204
205
206 /* Free the memory allocated for the SLP instance. */
207
208 void
209 vect_free_slp_instance (slp_instance instance)
210 {
211 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
212 SLP_INSTANCE_LOADS (instance).release ();
213 SLP_INSTANCE_ROOT_STMTS (instance).release ();
214 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
215 instance->subgraph_entries.release ();
216 instance->cost_vec.release ();
217 free (instance);
218 }
219
220
221 /* Create an SLP node for SCALAR_STMTS. */
222
223 slp_tree
224 vect_create_new_slp_node (unsigned nops, tree_code code)
225 {
226 slp_tree node = new _slp_tree;
227 SLP_TREE_SCALAR_STMTS (node) = vNULL;
228 SLP_TREE_CHILDREN (node).create (nops);
229 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
230 SLP_TREE_CODE (node) = code;
231 return node;
232 }
233 /* Create an SLP node for SCALAR_STMTS. */
234
235 static slp_tree
236 vect_create_new_slp_node (slp_tree node,
237 vec<stmt_vec_info> scalar_stmts, unsigned nops)
238 {
239 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
240 SLP_TREE_CHILDREN (node).create (nops);
241 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
242 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
243 SLP_TREE_LANES (node) = scalar_stmts.length ();
244 return node;
245 }
246
247 /* Create an SLP node for SCALAR_STMTS. */
248
249 static slp_tree
250 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
251 {
252 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
253 }
254
255 /* Create an SLP node for OPS. */
256
257 static slp_tree
258 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
259 {
260 SLP_TREE_SCALAR_OPS (node) = ops;
261 SLP_TREE_DEF_TYPE (node) = vect_external_def;
262 SLP_TREE_LANES (node) = ops.length ();
263 return node;
264 }
265
266 /* Create an SLP node for OPS. */
267
268 static slp_tree
269 vect_create_new_slp_node (vec<tree> ops)
270 {
271 return vect_create_new_slp_node (new _slp_tree, ops);
272 }
273
274
275 /* This structure is used in creation of an SLP tree. Each instance
276 corresponds to the same operand in a group of scalar stmts in an SLP
277 node. */
278 typedef struct _slp_oprnd_info
279 {
280 /* Def-stmts for the operands. */
281 vec<stmt_vec_info> def_stmts;
282 /* Operands. */
283 vec<tree> ops;
284 /* Information about the first statement: its vector def-type, its type,
285 the operand itself in case it's constant, whether it is a pattern
286 stmt, and its gather/scatter info. */
287 tree first_op_type;
288 enum vect_def_type first_dt;
289 bool any_pattern;
290 bool first_gs_p;
291 gather_scatter_info first_gs_info;
292 } *slp_oprnd_info;
293
294
295 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
296 operand. */
297 static vec<slp_oprnd_info>
298 vect_create_oprnd_info (int nops, int group_size)
299 {
300 int i;
301 slp_oprnd_info oprnd_info;
302 vec<slp_oprnd_info> oprnds_info;
303
304 oprnds_info.create (nops);
305 for (i = 0; i < nops; i++)
306 {
307 oprnd_info = XNEW (struct _slp_oprnd_info);
308 oprnd_info->def_stmts.create (group_size);
309 oprnd_info->ops.create (group_size);
310 oprnd_info->first_dt = vect_uninitialized_def;
311 oprnd_info->first_op_type = NULL_TREE;
312 oprnd_info->any_pattern = false;
313 oprnd_info->first_gs_p = false;
314 oprnds_info.quick_push (oprnd_info);
315 }
316
317 return oprnds_info;
318 }
319
320
321 /* Free operands info. */
322
323 static void
324 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
325 {
326 int i;
327 slp_oprnd_info oprnd_info;
328
329 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
330 {
331 oprnd_info->def_stmts.release ();
332 oprnd_info->ops.release ();
333 XDELETE (oprnd_info);
334 }
335
336 oprnds_info.release ();
337 }
338
339 /* Return the execution frequency of NODE (so that a higher value indicates
340 a "more important" node when optimizing for speed). */
341
342 static sreal
343 vect_slp_node_weight (slp_tree node)
344 {
345 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
346 basic_block bb = gimple_bb (stmt_info->stmt);
347 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
348 }
349
350 /* Return true if STMTS contains a pattern statement. */
351
352 static bool
353 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
354 {
355 stmt_vec_info stmt_info;
356 unsigned int i;
357 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
358 if (is_pattern_stmt_p (stmt_info))
359 return true;
360 return false;
361 }
362
363 /* Return true when all lanes in the external or constant NODE have
364 the same value. */
365
366 static bool
367 vect_slp_tree_uniform_p (slp_tree node)
368 {
369 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
370 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
371
372 /* Pre-existing vectors. */
373 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
374 return false;
375
376 unsigned i;
377 tree op, first = NULL_TREE;
378 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
379 if (!first)
380 first = op;
381 else if (!operand_equal_p (first, op, 0))
382 return false;
383
384 return true;
385 }
386
387 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
388 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
389 of the chain. */
390
391 int
392 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
393 stmt_vec_info first_stmt_info)
394 {
395 stmt_vec_info next_stmt_info = first_stmt_info;
396 int result = 0;
397
398 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
399 return -1;
400
401 do
402 {
403 if (next_stmt_info == stmt_info)
404 return result;
405 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
406 if (next_stmt_info)
407 result += DR_GROUP_GAP (next_stmt_info);
408 }
409 while (next_stmt_info);
410
411 return -1;
412 }
413
414 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
415 using the method implemented by duplicate_and_interleave. Return true
416 if so, returning the number of intermediate vectors in *NVECTORS_OUT
417 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
418 (if nonnull). */
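/* A sketch of the strategy used below: with NVECTORS = 1, 2, 4, ... try to
   fuse COUNT / NVECTORS consecutive scalar elements into a single integer;
   if the required integer mode, vector type or permutations are unavailable,
   double NVECTORS and retry, giving up once the fused element size can no
   longer be halved. */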
419
420 bool
421 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
422 tree elt_type, unsigned int *nvectors_out,
423 tree *vector_type_out,
424 tree *permutes)
425 {
426 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
427 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
428 return false;
429
430 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
431 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
432 unsigned int nvectors = 1;
433 for (;;)
434 {
435 scalar_int_mode int_mode;
436 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
437 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
438 {
439 /* Get the natural vector type for this SLP group size. */
440 tree int_type = build_nonstandard_integer_type
441 (GET_MODE_BITSIZE (int_mode), 1);
442 tree vector_type
443 = get_vectype_for_scalar_type (vinfo, int_type, count);
444 poly_int64 half_nelts;
445 if (vector_type
446 && VECTOR_MODE_P (TYPE_MODE (vector_type))
447 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
448 GET_MODE_SIZE (base_vector_mode))
449 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
450 2, &half_nelts))
451 {
452 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
453 together into elements of type INT_TYPE and using the result
454 to build NVECTORS vectors. */
455 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
456 vec_perm_builder sel1 (nelts, 2, 3);
457 vec_perm_builder sel2 (nelts, 2, 3);
458
459 for (unsigned int i = 0; i < 3; ++i)
460 {
461 sel1.quick_push (i);
462 sel1.quick_push (i + nelts);
463 sel2.quick_push (half_nelts + i);
464 sel2.quick_push (half_nelts + i + nelts);
465 }
466 vec_perm_indices indices1 (sel1, 2, nelts);
467 vec_perm_indices indices2 (sel2, 2, nelts);
468 machine_mode vmode = TYPE_MODE (vector_type);
469 if (can_vec_perm_const_p (vmode, vmode, indices1)
470 && can_vec_perm_const_p (vmode, vmode, indices2))
471 {
472 if (nvectors_out)
473 *nvectors_out = nvectors;
474 if (vector_type_out)
475 *vector_type_out = vector_type;
476 if (permutes)
477 {
478 permutes[0] = vect_gen_perm_mask_checked (vector_type,
479 indices1);
480 permutes[1] = vect_gen_perm_mask_checked (vector_type,
481 indices2);
482 }
483 return true;
484 }
485 }
486 }
487 if (!multiple_p (elt_bytes, 2, &elt_bytes))
488 return false;
489 nvectors *= 2;
490 }
491 }
492
493 /* Return true if DTA and DTB match. */
494
495 static bool
496 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
497 {
498 return (dta == dtb
499 || ((dta == vect_external_def || dta == vect_constant_def)
500 && (dtb == vect_external_def || dtb == vect_constant_def)));
501 }
502
503 static const int cond_expr_maps[3][5] = {
504 { 4, -1, -2, 1, 2 },
505 { 4, -2, -1, 1, 2 },
506 { 4, -1, -2, 2, 1 }
507 };
508 static const int arg0_map[] = { 1, 0 };
509 static const int arg1_map[] = { 1, 1 };
510 static const int arg2_map[] = { 1, 2 };
511 static const int arg1_arg4_map[] = { 2, 1, 4 };
512 static const int arg3_arg2_map[] = { 2, 3, 2 };
513 static const int op1_op0_map[] = { 2, 1, 0 };
514 static const int off_map[] = { 1, -3 };
515 static const int off_op0_map[] = { 2, -3, 0 };
516 static const int off_arg2_map[] = { 2, -3, 2 };
517 static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
518 static const int mask_call_maps[6][7] = {
519 { 1, 1, },
520 { 2, 1, 2, },
521 { 3, 1, 2, 3, },
522 { 4, 1, 2, 3, 4, },
523 { 5, 1, 2, 3, 4, 5, },
524 { 6, 1, 2, 3, 4, 5, 6 },
525 };
526
527 /* For most SLP statements, there is a one-to-one mapping between
528 gimple arguments and child nodes. If that is not true for STMT,
529 return an array that contains:
530
531 - the number of child nodes, followed by
532 - for each child node, the index of the argument associated with that node.
533 The special index -1 is the first operand of an embedded comparison and
534 the special index -2 is the second operand of an embedded comparison.
535 The special index -3 is the offset of a gather as analyzed by
536 vect_check_gather_scatter.
537
538 SWAP is as for vect_get_and_check_slp_defs. */
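/* For example, cond_expr_maps[0] above, { 4, -1, -2, 1, 2 }, describes four
   children: the two operands of the comparison embedded in argument 0,
   followed by gimple arguments 1 and 2; arg1_arg4_map, { 2, 1, 4 }, used for
   IFN_MASK_GATHER_LOAD below, describes two children built from call
   arguments 1 and 4. */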
539
540 static const int *
541 vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
542 unsigned char swap = 0)
543 {
544 if (auto assign = dyn_cast<const gassign *> (stmt))
545 {
546 if (gimple_assign_rhs_code (assign) == COND_EXPR
547 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
548 return cond_expr_maps[swap];
549 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
550 && swap)
551 return op1_op0_map;
552 if (gather_scatter_p)
553 return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
554 ? off_op0_map : off_map);
555 }
556 gcc_assert (!swap);
557 if (auto call = dyn_cast<const gcall *> (stmt))
558 {
559 if (gimple_call_internal_p (call))
560 switch (gimple_call_internal_fn (call))
561 {
562 case IFN_MASK_LOAD:
563 return gather_scatter_p ? off_arg2_map : arg2_map;
564
565 case IFN_GATHER_LOAD:
566 return arg1_map;
567
568 case IFN_MASK_GATHER_LOAD:
569 case IFN_MASK_LEN_GATHER_LOAD:
570 return arg1_arg4_map;
571
572 case IFN_MASK_STORE:
573 return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
574
575 case IFN_MASK_CALL:
576 {
577 unsigned nargs = gimple_call_num_args (call);
578 if (nargs >= 2 && nargs <= 7)
579 return mask_call_maps[nargs-2];
580 else
581 return nullptr;
582 }
583
584 case IFN_CLZ:
585 case IFN_CTZ:
586 return arg0_map;
587
588 default:
589 break;
590 }
591 }
592 return nullptr;
593 }
594
595 /* Return the SLP node child index for operand OP of STMT. */
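/* For example, an IFN_MASK_LOAD call uses arg2_map, so its operand 2 (the
   mask) maps to child index 0; for statements without an operand map the
   child index simply equals the operand number. */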
596
597 int
598 vect_slp_child_index_for_operand (const gimple *stmt, int op,
599 bool gather_scatter_p)
600 {
601 const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
602 if (!opmap)
603 return op;
604 for (int i = 1; i < 1 + opmap[0]; ++i)
605 if (opmap[i] == op)
606 return i - 1;
607 gcc_unreachable ();
608 }
609
610 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
611 they are of a valid type and that they match the defs of the first stmt of
612 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
613 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
614 indicates swap is required for cond_expr stmts. Specifically, SWAP
615 is 1 if STMT is cond and operands of comparison need to be swapped;
616 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
617
618 If there was a fatal error return -1; if the error could be corrected by
619 swapping operands of the parent node of this one, return 1; if everything
620 is ok return 0. */
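/* As an example of the operand swapping performed below: for a commutative
   operation with lanes x0 = a0 + 5 and x1 = 5 + a1 the second lane has its
   operands swapped so that internal and constant defs line up with those of
   the first lane. */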
621 static int
622 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
623 bool *skip_args,
624 vec<stmt_vec_info> stmts, unsigned stmt_num,
625 vec<slp_oprnd_info> *oprnds_info)
626 {
627 stmt_vec_info stmt_info = stmts[stmt_num];
628 tree oprnd;
629 unsigned int i, number_of_oprnds;
630 enum vect_def_type dt = vect_uninitialized_def;
631 slp_oprnd_info oprnd_info;
632 gather_scatter_info gs_info;
633 unsigned int gs_op = -1u;
634 unsigned int commutative_op = -1U;
635 bool first = stmt_num == 0;
636
637 if (!is_a<gcall *> (stmt_info->stmt)
638 && !is_a<gassign *> (stmt_info->stmt)
639 && !is_a<gphi *> (stmt_info->stmt))
640 return -1;
641
642 number_of_oprnds = gimple_num_args (stmt_info->stmt);
643 const int *map
644 = vect_get_operand_map (stmt_info->stmt,
645 STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
646 if (map)
647 number_of_oprnds = *map++;
648 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
649 {
650 if (gimple_call_internal_p (stmt))
651 {
652 internal_fn ifn = gimple_call_internal_fn (stmt);
653 commutative_op = first_commutative_argument (ifn);
654 }
655 }
656 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
657 {
658 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
659 commutative_op = 0;
660 }
661
662 bool swapped = (swap != 0);
663 bool backedge = false;
664 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
665 for (i = 0; i < number_of_oprnds; i++)
666 {
667 oprnd_info = (*oprnds_info)[i];
668 int opno = map ? map[i] : int (i);
669 if (opno == -3)
670 {
671 gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
672 if (!is_a <loop_vec_info> (vinfo)
673 || !vect_check_gather_scatter (stmt_info,
674 as_a <loop_vec_info> (vinfo),
675 first ? &oprnd_info->first_gs_info
676 : &gs_info))
677 return -1;
678
679 if (first)
680 {
681 oprnd_info->first_gs_p = true;
682 oprnd = oprnd_info->first_gs_info.offset;
683 }
684 else
685 {
686 gs_op = i;
687 oprnd = gs_info.offset;
688 }
689 }
690 else if (opno < 0)
691 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
692 else
693 {
694 oprnd = gimple_arg (stmt_info->stmt, opno);
695 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
696 {
697 edge e = gimple_phi_arg_edge (stmt, opno);
698 backedge = (is_a <bb_vec_info> (vinfo)
699 ? e->flags & EDGE_DFS_BACK
700 : dominated_by_p (CDI_DOMINATORS, e->src,
701 gimple_bb (stmt_info->stmt)));
702 }
703 }
704 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
705 oprnd = TREE_OPERAND (oprnd, 0);
706
707 stmt_vec_info def_stmt_info;
708 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
709 {
710 if (dump_enabled_p ())
711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
712 "Build SLP failed: can't analyze def for %T\n",
713 oprnd);
714
715 return -1;
716 }
717
718 if (skip_args[i])
719 {
720 oprnd_info->def_stmts.quick_push (NULL);
721 oprnd_info->ops.quick_push (NULL_TREE);
722 oprnd_info->first_dt = vect_uninitialized_def;
723 continue;
724 }
725
726 oprnd_info->def_stmts.quick_push (def_stmt_info);
727 oprnd_info->ops.quick_push (oprnd);
728
729 if (def_stmt_info
730 && is_pattern_stmt_p (def_stmt_info))
731 {
732 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
733 != def_stmt_info)
734 oprnd_info->any_pattern = true;
735 else
736 /* If we promote this to external use the original stmt def. */
737 oprnd_info->ops.last ()
738 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
739 }
740
741 /* If there's an extern def on a backedge make sure we can
742 code-generate at the region start.
743 ??? This is another case that could be fixed by adjusting
744 how we split the function but at the moment we'd have conflicting
745 goals there. */
746 if (backedge
747 && dts[i] == vect_external_def
748 && is_a <bb_vec_info> (vinfo)
749 && TREE_CODE (oprnd) == SSA_NAME
750 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
751 && !dominated_by_p (CDI_DOMINATORS,
752 as_a <bb_vec_info> (vinfo)->bbs[0],
753 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
754 {
755 if (dump_enabled_p ())
756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
757 "Build SLP failed: extern def %T only defined "
758 "on backedge\n", oprnd);
759 return -1;
760 }
761
762 if (first)
763 {
764 tree type = TREE_TYPE (oprnd);
765 dt = dts[i];
766
767 /* For the swapping logic below force vect_reduction_def
768 for the reduction op in a SLP reduction group. */
769 if (!STMT_VINFO_DATA_REF (stmt_info)
770 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
771 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
772 && def_stmt_info)
773 dts[i] = dt = vect_reduction_def;
774
775 /* Check the types of the definition. */
776 switch (dt)
777 {
778 case vect_external_def:
779 case vect_constant_def:
780 case vect_internal_def:
781 case vect_reduction_def:
782 case vect_induction_def:
783 case vect_nested_cycle:
784 case vect_first_order_recurrence:
785 break;
786
787 default:
788 /* FORNOW: Not supported. */
789 if (dump_enabled_p ())
790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
791 "Build SLP failed: illegal type of def %T\n",
792 oprnd);
793 return -1;
794 }
795
796 oprnd_info->first_dt = dt;
797 oprnd_info->first_op_type = type;
798 }
799 }
800 if (first)
801 return 0;
802
803 /* Now match the operand definition types to that of the first stmt. */
804 for (i = 0; i < number_of_oprnds;)
805 {
806 if (skip_args[i])
807 {
808 ++i;
809 continue;
810 }
811
812 oprnd_info = (*oprnds_info)[i];
813 dt = dts[i];
814 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
815 oprnd = oprnd_info->ops[stmt_num];
816 tree type = TREE_TYPE (oprnd);
817
818 if (!types_compatible_p (oprnd_info->first_op_type, type))
819 {
820 if (dump_enabled_p ())
821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
822 "Build SLP failed: different operand types\n");
823 return 1;
824 }
825
826 if ((gs_op == i) != oprnd_info->first_gs_p)
827 {
828 if (dump_enabled_p ())
829 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
830 "Build SLP failed: mixed gather and non-gather\n");
831 return 1;
832 }
833 else if (gs_op == i)
834 {
835 if (!operand_equal_p (oprnd_info->first_gs_info.base,
836 gs_info.base))
837 {
838 if (dump_enabled_p ())
839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
840 "Build SLP failed: different gather base\n");
841 return 1;
842 }
843 if (oprnd_info->first_gs_info.scale != gs_info.scale)
844 {
845 if (dump_enabled_p ())
846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
847 "Build SLP failed: different gather scale\n");
848 return 1;
849 }
850 }
851
852 /* Not first stmt of the group, check that the def-stmt/s match
853 the def-stmt/s of the first stmt. Allow different definition
854 types for reduction chains: the first stmt must be a
855 vect_reduction_def (a phi node), and the rest
856 end in the reduction chain. */
857 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
858 && !(oprnd_info->first_dt == vect_reduction_def
859 && !STMT_VINFO_DATA_REF (stmt_info)
860 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
861 && def_stmt_info
862 && !STMT_VINFO_DATA_REF (def_stmt_info)
863 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
864 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
865 || (!STMT_VINFO_DATA_REF (stmt_info)
866 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
867 && ((!def_stmt_info
868 || STMT_VINFO_DATA_REF (def_stmt_info)
869 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
870 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
871 != (oprnd_info->first_dt != vect_reduction_def))))
872 {
873 /* Try swapping operands if we got a mismatch. For BB
874 vectorization only in case it will clearly improve things. */
875 if (i == commutative_op && !swapped
876 && (!is_a <bb_vec_info> (vinfo)
877 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
878 dts[i+1])
879 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
880 || vect_def_types_match
881 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
882 {
883 if (dump_enabled_p ())
884 dump_printf_loc (MSG_NOTE, vect_location,
885 "trying swapped operands\n");
886 std::swap (dts[i], dts[i+1]);
887 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
888 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
889 std::swap ((*oprnds_info)[i]->ops[stmt_num],
890 (*oprnds_info)[i+1]->ops[stmt_num]);
891 /* After swapping some operands we lost track whether an
892 operand has any pattern defs so be conservative here. */
893 if ((*oprnds_info)[i]->any_pattern
894 || (*oprnds_info)[i+1]->any_pattern)
895 (*oprnds_info)[i]->any_pattern
896 = (*oprnds_info)[i+1]->any_pattern = true;
897 swapped = true;
898 continue;
899 }
900
901 if (is_a <bb_vec_info> (vinfo)
902 && !oprnd_info->any_pattern)
903 {
904 /* Now for commutative ops we should see whether we can
905 make the other operand match. */
906 if (dump_enabled_p ())
907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
908 "treating operand as external\n");
909 oprnd_info->first_dt = dt = vect_external_def;
910 }
911 else
912 {
913 if (dump_enabled_p ())
914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
915 "Build SLP failed: different types\n");
916 return 1;
917 }
918 }
919
920 /* Make sure to demote the overall operand to external. */
921 if (dt == vect_external_def)
922 oprnd_info->first_dt = vect_external_def;
923 /* For a SLP reduction chain we want to duplicate the reduction to
924 each of the chain members. That gets us a sane SLP graph (still
925 the stmts are not 100% correct wrt the initial values). */
926 else if ((dt == vect_internal_def
927 || dt == vect_reduction_def)
928 && oprnd_info->first_dt == vect_reduction_def
929 && !STMT_VINFO_DATA_REF (stmt_info)
930 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
931 && !STMT_VINFO_DATA_REF (def_stmt_info)
932 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
933 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
934 {
935 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
936 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
937 }
938
939 ++i;
940 }
941
942 /* Swap operands. */
943 if (swapped)
944 {
945 if (dump_enabled_p ())
946 dump_printf_loc (MSG_NOTE, vect_location,
947 "swapped operands to match def types in %G",
948 stmt_info->stmt);
949 }
950
951 return 0;
952 }
953
954 /* Return true if call statements CALL1 and CALL2 are similar enough
955 to be combined into the same SLP group. */
956
957 bool
958 compatible_calls_p (gcall *call1, gcall *call2)
959 {
960 unsigned int nargs = gimple_call_num_args (call1);
961 if (nargs != gimple_call_num_args (call2))
962 return false;
963
964 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
965 return false;
966
967 if (gimple_call_internal_p (call1))
968 {
969 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
970 TREE_TYPE (gimple_call_lhs (call2))))
971 return false;
972 for (unsigned int i = 0; i < nargs; ++i)
973 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
974 TREE_TYPE (gimple_call_arg (call2, i))))
975 return false;
976 }
977 else
978 {
979 if (!operand_equal_p (gimple_call_fn (call1),
980 gimple_call_fn (call2), 0))
981 return false;
982
983 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
984 return false;
985 }
986
987 /* Check that any unvectorized arguments are equal. */
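  /* For IFN_MASK_LOAD, for instance, only call argument 2 becomes an SLP
     child (see arg2_map above), so the remaining arguments (such as the
     pointer) must compare equal between the two calls. */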
988 if (const int *map = vect_get_operand_map (call1))
989 {
990 unsigned int nkept = *map++;
991 unsigned int mapi = 0;
992 for (unsigned int i = 0; i < nargs; ++i)
993 if (mapi < nkept && map[mapi] == int (i))
994 mapi += 1;
995 else if (!operand_equal_p (gimple_call_arg (call1, i),
996 gimple_call_arg (call2, i)))
997 return false;
998 }
999
1000 return true;
1001 }
1002
1003 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1004 caller's attempt to find the vector type in STMT_INFO with the narrowest
1005 element type. Return true if VECTYPE is nonnull and if it is valid
1006 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1007 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1008 vect_build_slp_tree. */
1009
1010 static bool
1011 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1012 unsigned int group_size,
1013 tree vectype, poly_uint64 *max_nunits)
1014 {
1015 if (!vectype)
1016 {
1017 if (dump_enabled_p ())
1018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1019 "Build SLP failed: unsupported data-type in %G\n",
1020 stmt_info->stmt);
1021 /* Fatal mismatch. */
1022 return false;
1023 }
1024
1025 /* If populating the vector type requires unrolling then fail
1026 before adjusting *max_nunits for basic-block vectorization. */
1027 if (is_a <bb_vec_info> (vinfo)
1028 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1029 {
1030 if (dump_enabled_p ())
1031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1032 "Build SLP failed: unrolling required "
1033 "in basic block SLP\n");
1034 /* Fatal mismatch. */
1035 return false;
1036 }
1037
1038 /* In case of multiple types we need to detect the smallest type. */
1039 vect_update_max_nunits (max_nunits, vectype);
1040 return true;
1041 }
1042
1043 /* Verify that the scalar stmts STMTS are isomorphic, do not require data
1044 permutation and are of supported types of operation. Return
1045 true if so, otherwise return false and indicate in *MATCHES
1046 which stmts are not isomorphic to the first one. If MATCHES[0]
1047 is false then this indicates the comparison could not be
1048 carried out or the stmts will never be vectorized by SLP.
1049
1050 Note COND_EXPR is possibly isomorphic to another one after swapping its
1051 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1052 the first stmt by swapping the two operands of comparison; set SWAP[i]
1053 to 2 if stmt I is isomorphic to the first stmt by inverting the code
1054 of comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
1055 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
1056
1057 static bool
1058 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1059 vec<stmt_vec_info> stmts, unsigned int group_size,
1060 poly_uint64 *max_nunits, bool *matches,
1061 bool *two_operators, tree *node_vectype)
1062 {
1063 unsigned int i;
1064 stmt_vec_info first_stmt_info = stmts[0];
1065 code_helper first_stmt_code = ERROR_MARK;
1066 code_helper alt_stmt_code = ERROR_MARK;
1067 code_helper rhs_code = ERROR_MARK;
1068 code_helper first_cond_code = ERROR_MARK;
1069 tree lhs;
1070 bool need_same_oprnds = false;
1071 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
1072 stmt_vec_info first_load = NULL, prev_first_load = NULL;
1073 bool first_stmt_ldst_p = false, ldst_p = false;
1074 bool first_stmt_phi_p = false, phi_p = false;
1075 bool maybe_soft_fail = false;
1076 tree soft_fail_nunits_vectype = NULL_TREE;
1077
1078 /* For every stmt in NODE find its def stmt/s. */
1079 stmt_vec_info stmt_info;
1080 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1081 {
1082 gimple *stmt = stmt_info->stmt;
1083 swap[i] = 0;
1084 matches[i] = false;
1085
1086 if (dump_enabled_p ())
1087 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1088
1089 /* Fail to vectorize statements marked as unvectorizable, throw
1090 or are volatile. */
1091 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1092 || stmt_can_throw_internal (cfun, stmt)
1093 || gimple_has_volatile_ops (stmt))
1094 {
1095 if (dump_enabled_p ())
1096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1097 "Build SLP failed: unvectorizable statement %G",
1098 stmt);
1099 /* ??? For BB vectorization we want to commute operands so as
1100 to shuffle all unvectorizable defs into one operand and have
1101 the other still vectorized. The following doesn't reliably
1102 work for this though, but it's the easiest we can do here. */
1103 if (is_a <bb_vec_info> (vinfo) && i != 0)
1104 continue;
1105 /* Fatal mismatch. */
1106 matches[0] = false;
1107 return false;
1108 }
1109
1110 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1111 lhs = gimple_get_lhs (stmt);
1112 if (lhs == NULL_TREE
1113 && (!call_stmt
1114 || !gimple_call_internal_p (stmt)
1115 || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1116 {
1117 if (dump_enabled_p ())
1118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1119 "Build SLP failed: not GIMPLE_ASSIGN nor "
1120 "GIMPLE_CALL %G", stmt);
1121 if (is_a <bb_vec_info> (vinfo) && i != 0)
1122 continue;
1123 /* Fatal mismatch. */
1124 matches[0] = false;
1125 return false;
1126 }
1127
1128 tree nunits_vectype;
1129 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1130 &nunits_vectype, group_size))
1131 {
1132 if (is_a <bb_vec_info> (vinfo) && i != 0)
1133 continue;
1134 /* Fatal mismatch. */
1135 matches[0] = false;
1136 return false;
1137 }
1138 /* Record nunits required but continue analysis, producing matches[]
1139 as if nunits was not an issue. This allows splitting of groups
1140 to happen. */
1141 if (nunits_vectype
1142 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1143 nunits_vectype, max_nunits))
1144 {
1145 gcc_assert (is_a <bb_vec_info> (vinfo));
1146 maybe_soft_fail = true;
1147 soft_fail_nunits_vectype = nunits_vectype;
1148 }
1149
1150 gcc_assert (vectype);
1151
1152 if (call_stmt)
1153 {
1154 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1155 if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1156 rhs_code = cfn;
1157 else
1158 rhs_code = CALL_EXPR;
1159
1160 if (cfn == CFN_MASK_LOAD
1161 || cfn == CFN_GATHER_LOAD
1162 || cfn == CFN_MASK_GATHER_LOAD
1163 || cfn == CFN_MASK_LEN_GATHER_LOAD)
1164 ldst_p = true;
1165 else if (cfn == CFN_MASK_STORE)
1166 {
1167 ldst_p = true;
1168 rhs_code = CFN_MASK_STORE;
1169 }
1170 else if ((cfn != CFN_LAST
1171 && cfn != CFN_MASK_CALL
1172 && internal_fn_p (cfn)
1173 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1174 || gimple_call_tail_p (call_stmt)
1175 || gimple_call_noreturn_p (call_stmt)
1176 || gimple_call_chain (call_stmt))
1177 {
1178 if (dump_enabled_p ())
1179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1180 "Build SLP failed: unsupported call type %G",
1181 (gimple *) call_stmt);
1182 if (is_a <bb_vec_info> (vinfo) && i != 0)
1183 continue;
1184 /* Fatal mismatch. */
1185 matches[0] = false;
1186 return false;
1187 }
1188 }
1189 else if (gimple_code (stmt) == GIMPLE_PHI)
1190 {
1191 rhs_code = ERROR_MARK;
1192 phi_p = true;
1193 }
1194 else
1195 {
1196 rhs_code = gimple_assign_rhs_code (stmt);
1197 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1198 }
1199
1200 /* Check the operation. */
1201 if (i == 0)
1202 {
1203 *node_vectype = vectype;
1204 first_stmt_code = rhs_code;
1205 first_stmt_ldst_p = ldst_p;
1206 first_stmt_phi_p = phi_p;
1207
1208 /* Shift arguments should be equal in all the packed stmts for a
1209 vector shift with scalar shift operand. */
1210 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1211 || rhs_code == LROTATE_EXPR
1212 || rhs_code == RROTATE_EXPR)
1213 {
1214 /* First see if we have a vector/vector shift. */
1215 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1216 {
1217 /* No vector/vector shift, try for a vector/scalar shift. */
1218 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1219 {
1220 if (dump_enabled_p ())
1221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1222 "Build SLP failed: "
1223 "op not supported by target.\n");
1224 if (is_a <bb_vec_info> (vinfo) && i != 0)
1225 continue;
1226 /* Fatal mismatch. */
1227 matches[0] = false;
1228 return false;
1229 }
1230 need_same_oprnds = true;
1231 first_op1 = gimple_assign_rhs2 (stmt);
1232 }
1233 }
1234 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1235 {
1236 need_same_oprnds = true;
1237 first_op1 = gimple_assign_rhs2 (stmt);
1238 }
1239 else if (!ldst_p
1240 && rhs_code == BIT_FIELD_REF)
1241 {
1242 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1243 if (!is_a <bb_vec_info> (vinfo)
1244 || TREE_CODE (vec) != SSA_NAME
1245 /* When the element types are not compatible we pun the
1246 source to the target vectype which requires equal size. */
1247 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1248 || !types_compatible_p (TREE_TYPE (vectype),
1249 TREE_TYPE (TREE_TYPE (vec))))
1250 && !operand_equal_p (TYPE_SIZE (vectype),
1251 TYPE_SIZE (TREE_TYPE (vec)))))
1252 {
1253 if (dump_enabled_p ())
1254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1255 "Build SLP failed: "
1256 "BIT_FIELD_REF not supported\n");
1257 /* Fatal mismatch. */
1258 matches[0] = false;
1259 return false;
1260 }
1261 }
1262 else if (rhs_code == CFN_DIV_POW2)
1263 {
1264 need_same_oprnds = true;
1265 first_op1 = gimple_call_arg (call_stmt, 1);
1266 }
1267 }
1268 else
1269 {
1270 if (first_stmt_code != rhs_code
1271 && alt_stmt_code == ERROR_MARK)
1272 alt_stmt_code = rhs_code;
1273 if ((first_stmt_code != rhs_code
1274 && (first_stmt_code != IMAGPART_EXPR
1275 || rhs_code != REALPART_EXPR)
1276 && (first_stmt_code != REALPART_EXPR
1277 || rhs_code != IMAGPART_EXPR)
1278 /* Handle mismatches in plus/minus by computing both
1279 and merging the results. */
1280 && !((first_stmt_code == PLUS_EXPR
1281 || first_stmt_code == MINUS_EXPR)
1282 && (alt_stmt_code == PLUS_EXPR
1283 || alt_stmt_code == MINUS_EXPR)
1284 && rhs_code == alt_stmt_code)
1285 && !(first_stmt_code.is_tree_code ()
1286 && rhs_code.is_tree_code ()
1287 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1288 == tcc_comparison)
1289 && (swap_tree_comparison (tree_code (first_stmt_code))
1290 == tree_code (rhs_code)))
1291 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1292 && (first_stmt_code == ARRAY_REF
1293 || first_stmt_code == BIT_FIELD_REF
1294 || first_stmt_code == INDIRECT_REF
1295 || first_stmt_code == COMPONENT_REF
1296 || first_stmt_code == MEM_REF)
1297 && (rhs_code == ARRAY_REF
1298 || rhs_code == BIT_FIELD_REF
1299 || rhs_code == INDIRECT_REF
1300 || rhs_code == COMPONENT_REF
1301 || rhs_code == MEM_REF)))
1302 || (ldst_p
1303 && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1304 != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1305 || (ldst_p
1306 && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1307 != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1308 || first_stmt_ldst_p != ldst_p
1309 || first_stmt_phi_p != phi_p)
1310 {
1311 if (dump_enabled_p ())
1312 {
1313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1314 "Build SLP failed: different operation "
1315 "in stmt %G", stmt);
1316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1317 "original stmt %G", first_stmt_info->stmt);
1318 }
1319 /* Mismatch. */
1320 continue;
1321 }
1322
1323 if (!ldst_p
1324 && first_stmt_code == BIT_FIELD_REF
1325 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1326 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1327 {
1328 if (dump_enabled_p ())
1329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1330 "Build SLP failed: different BIT_FIELD_REF "
1331 "arguments in %G", stmt);
1332 /* Mismatch. */
1333 continue;
1334 }
1335
1336 if (call_stmt
1337 && first_stmt_code != CFN_MASK_LOAD
1338 && first_stmt_code != CFN_MASK_STORE)
1339 {
1340 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1341 call_stmt))
1342 {
1343 if (dump_enabled_p ())
1344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1345 "Build SLP failed: different calls in %G",
1346 stmt);
1347 /* Mismatch. */
1348 continue;
1349 }
1350 }
1351
1352 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1353 && (gimple_bb (first_stmt_info->stmt)
1354 != gimple_bb (stmt_info->stmt)))
1355 {
1356 if (dump_enabled_p ())
1357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1358 "Build SLP failed: different BB for PHI "
1359 "or possibly trapping operation in %G", stmt);
1360 /* Mismatch. */
1361 continue;
1362 }
1363
1364 if (need_same_oprnds)
1365 {
1366 tree other_op1 = gimple_arg (stmt, 1);
1367 if (!operand_equal_p (first_op1, other_op1, 0))
1368 {
1369 if (dump_enabled_p ())
1370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1371 "Build SLP failed: different shift "
1372 "arguments in %G", stmt);
1373 /* Mismatch. */
1374 continue;
1375 }
1376 }
1377
1378 if (!types_compatible_p (vectype, *node_vectype))
1379 {
1380 if (dump_enabled_p ())
1381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1382 "Build SLP failed: different vector type "
1383 "in %G", stmt);
1384 /* Mismatch. */
1385 continue;
1386 }
1387 }
1388
1389 /* Grouped store or load. */
1390 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1391 {
1392 gcc_assert (ldst_p);
1393 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1394 {
1395 /* Store. */
1396 gcc_assert (rhs_code == CFN_MASK_STORE
1397 || REFERENCE_CLASS_P (lhs)
1398 || DECL_P (lhs));
1399 }
1400 else
1401 {
1402 /* Load. */
1403 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1404 if (prev_first_load)
1405 {
1406 /* Check that there are no loads from different interleaving
1407 chains in the same node. */
1408 if (prev_first_load != first_load)
1409 {
1410 if (dump_enabled_p ())
1411 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1412 vect_location,
1413 "Build SLP failed: different "
1414 "interleaving chains in one node %G",
1415 stmt);
1416 /* Mismatch. */
1417 continue;
1418 }
1419 }
1420 else
1421 prev_first_load = first_load;
1422 }
1423 }
1424 /* Non-grouped store or load. */
1425 else if (ldst_p)
1426 {
1427 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1428 && rhs_code != CFN_GATHER_LOAD
1429 && rhs_code != CFN_MASK_GATHER_LOAD
1430 && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1431 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1432 /* Non-grouped loads are handled as externals for BB
1433 vectorization. For loop vectorization we can handle
1434 splats the same way we handle single-element interleaving. */
1435 && (is_a <bb_vec_info> (vinfo)
1436 || stmt_info != first_stmt_info))
1437 {
1438 /* Not grouped load. */
1439 if (dump_enabled_p ())
1440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1441 "Build SLP failed: not grouped load %G", stmt);
1442
1443 if (i != 0)
1444 continue;
1445 /* Fatal mismatch. */
1446 matches[0] = false;
1447 return false;
1448 }
1449 }
1450 /* Not a memory operation. */
1451 else
1452 {
1453 if (!phi_p
1454 && rhs_code.is_tree_code ()
1455 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1456 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1457 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1458 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1459 && rhs_code != VIEW_CONVERT_EXPR
1460 && rhs_code != CALL_EXPR
1461 && rhs_code != BIT_FIELD_REF)
1462 {
1463 if (dump_enabled_p ())
1464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1465 "Build SLP failed: operation unsupported %G",
1466 stmt);
1467 if (is_a <bb_vec_info> (vinfo) && i != 0)
1468 continue;
1469 /* Fatal mismatch. */
1470 matches[0] = false;
1471 return false;
1472 }
1473
1474 if (rhs_code == COND_EXPR)
1475 {
1476 tree cond_expr = gimple_assign_rhs1 (stmt);
1477 enum tree_code cond_code = TREE_CODE (cond_expr);
1478 enum tree_code swap_code = ERROR_MARK;
1479 enum tree_code invert_code = ERROR_MARK;
1480
1481 if (i == 0)
1482 first_cond_code = TREE_CODE (cond_expr);
1483 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1484 {
1485 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1486 swap_code = swap_tree_comparison (cond_code);
1487 invert_code = invert_tree_comparison (cond_code, honor_nans);
1488 }
1489
1490 if (first_cond_code == cond_code)
1491 ;
1492 /* Isomorphic can be achieved by swapping. */
1493 else if (first_cond_code == swap_code)
1494 swap[i] = 1;
1495 /* Isomorphic can be achieved by inverting. */
1496 else if (first_cond_code == invert_code)
1497 swap[i] = 2;
1498 else
1499 {
1500 if (dump_enabled_p ())
1501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1502 "Build SLP failed: different"
1503 " operation %G", stmt);
1504 /* Mismatch. */
1505 continue;
1506 }
1507 }
1508
1509 if (rhs_code.is_tree_code ()
1510 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1511 && (swap_tree_comparison ((tree_code)first_stmt_code)
1512 == (tree_code)rhs_code))
1513 swap[i] = 1;
1514 }
1515
1516 matches[i] = true;
1517 }
1518
1519 for (i = 0; i < group_size; ++i)
1520 if (!matches[i])
1521 return false;
1522
1523 /* If we allowed a two-operation SLP node verify the target can cope
1524 with the permute we are going to use. */
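  /* For example, a group like { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } is
     accepted with PLUS_EXPR as the main and MINUS_EXPR as the alternate
     operation; both operations are then computed on all lanes and the
     results blended with a lane permutation (cf.
     vect_slp_build_two_operator_nodes below). */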
1525 if (alt_stmt_code != ERROR_MARK
1526 && (!alt_stmt_code.is_tree_code ()
1527 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1528 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1529 {
1530 *two_operators = true;
1531 }
1532
1533 if (maybe_soft_fail)
1534 {
1535 unsigned HOST_WIDE_INT const_nunits;
1536 if (!TYPE_VECTOR_SUBPARTS
1537 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1538 || const_nunits > group_size)
1539 matches[0] = false;
1540 else
1541 {
1542 /* With a constant number of vector elements, simulate a mismatch
1543 at the point we need to split. */
1544 unsigned tail = group_size & (const_nunits - 1);
1545 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1546 }
1547 return false;
1548 }
1549
1550 return true;
1551 }
1552
1553 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1554 Note we never remove apart from at destruction time so we do not
1555 need a special value for deleted that differs from empty. */
1556 struct bst_traits
1557 {
1558 typedef vec <stmt_vec_info> value_type;
1559 typedef vec <stmt_vec_info> compare_type;
1560 static inline hashval_t hash (value_type);
1561 static inline bool equal (value_type existing, value_type candidate);
1562 static inline bool is_empty (value_type x) { return !x.exists (); }
1563 static inline bool is_deleted (value_type x) { return !x.exists (); }
1564 static const bool empty_zero_p = true;
1565 static inline void mark_empty (value_type &x) { x.release (); }
1566 static inline void mark_deleted (value_type &x) { x.release (); }
1567 static inline void remove (value_type &x) { x.release (); }
1568 };
1569 inline hashval_t
1570 bst_traits::hash (value_type x)
1571 {
1572 inchash::hash h;
1573 for (unsigned i = 0; i < x.length (); ++i)
1574 h.add_int (gimple_uid (x[i]->stmt));
1575 return h.end ();
1576 }
1577 inline bool
1578 bst_traits::equal (value_type existing, value_type candidate)
1579 {
1580 if (existing.length () != candidate.length ())
1581 return false;
1582 for (unsigned i = 0; i < existing.length (); ++i)
1583 if (existing[i] != candidate[i])
1584 return false;
1585 return true;
1586 }
1587
1588 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1589 but then vec::insert does memmove and that's not compatible with
1590 std::pair. */
1591 struct chain_op_t
1592 {
1593 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1594 : code (code_), dt (dt_), op (op_) {}
1595 tree_code code;
1596 vect_def_type dt;
1597 tree op;
1598 };
1599
1600 /* Comparator for sorting associatable chains. */
1601
1602 static int
1603 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1604 {
1605 auto *op1 = (const chain_op_t *) op1_;
1606 auto *op2 = (const chain_op_t *) op2_;
1607 if (op1->dt != op2->dt)
1608 return (int)op1->dt - (int)op2->dt;
1609 return (int)op1->code - (int)op2->code;
1610 }
1611
1612 /* Linearize the associatable expression chain at START with the
1613 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1614 filling CHAIN with the result and using WORKLIST as intermediate storage.
1615 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1616 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1617 stmts, starting with START. */
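/* For example, with CODE == PLUS_EXPR a lane computing ((a - b) + c) - d is
   linearized into the chain entries (+, a), (-, b), (+, c), (-, d) (not
   necessarily in this order), each tagged with the vect_def_type of the
   respective operand. */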
1618
1619 static void
1620 vect_slp_linearize_chain (vec_info *vinfo,
1621 vec<std::pair<tree_code, gimple *> > &worklist,
1622 vec<chain_op_t> &chain,
1623 enum tree_code code, gimple *start,
1624 gimple *&code_stmt, gimple *&alt_code_stmt,
1625 vec<gimple *> *chain_stmts)
1626 {
1627 /* For each lane linearize the addition/subtraction (or other
1628 uniform associatable operation) expression tree. */
1629 worklist.safe_push (std::make_pair (code, start));
1630 while (!worklist.is_empty ())
1631 {
1632 auto entry = worklist.pop ();
1633 gassign *stmt = as_a <gassign *> (entry.second);
1634 enum tree_code in_code = entry.first;
1635 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1636 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1637 if (!code_stmt
1638 && gimple_assign_rhs_code (stmt) == code)
1639 code_stmt = stmt;
1640 else if (!alt_code_stmt
1641 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1642 alt_code_stmt = stmt;
1643 if (chain_stmts)
1644 chain_stmts->safe_push (stmt);
1645 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1646 {
1647 tree op = gimple_op (stmt, opnum);
1648 vect_def_type dt;
1649 stmt_vec_info def_stmt_info;
1650 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1651 gcc_assert (res);
1652 if (dt == vect_internal_def
1653 && is_pattern_stmt_p (def_stmt_info))
1654 op = gimple_get_lhs (def_stmt_info->stmt);
1655 gimple *use_stmt;
1656 use_operand_p use_p;
1657 if (dt == vect_internal_def
1658 && single_imm_use (op, &use_p, &use_stmt)
1659 && is_gimple_assign (def_stmt_info->stmt)
1660 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1661 || (code == PLUS_EXPR
1662 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1663 == MINUS_EXPR))))
1664 {
1665 tree_code op_def_code = this_code;
1666 if (op_def_code == MINUS_EXPR && opnum == 1)
1667 op_def_code = PLUS_EXPR;
1668 if (in_code == MINUS_EXPR)
1669 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1670 worklist.safe_push (std::make_pair (op_def_code,
1671 def_stmt_info->stmt));
1672 }
1673 else
1674 {
1675 tree_code op_def_code = this_code;
1676 if (op_def_code == MINUS_EXPR && opnum == 1)
1677 op_def_code = PLUS_EXPR;
1678 if (in_code == MINUS_EXPR)
1679 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1680 chain.safe_push (chain_op_t (op_def_code, dt, op));
1681 }
1682 }
1683 }
1684 }
1685
1686 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1687 simple_hashmap_traits <bst_traits, slp_tree> >
1688 scalar_stmts_to_slp_tree_map_t;
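/* This map caches the result of SLP discovery for a vector of scalar stmts:
   vect_build_slp_tree below re-uses an already built node (or its recorded
   failure, see _slp_tree::failed) instead of re-running discovery. */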
1689
1690 static slp_tree
1691 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1692 vec<stmt_vec_info> stmts, unsigned int group_size,
1693 poly_uint64 *max_nunits,
1694 bool *matches, unsigned *limit, unsigned *tree_size,
1695 scalar_stmts_to_slp_tree_map_t *bst_map);
1696
1697 static slp_tree
1698 vect_build_slp_tree (vec_info *vinfo,
1699 vec<stmt_vec_info> stmts, unsigned int group_size,
1700 poly_uint64 *max_nunits,
1701 bool *matches, unsigned *limit, unsigned *tree_size,
1702 scalar_stmts_to_slp_tree_map_t *bst_map)
1703 {
1704 if (slp_tree *leader = bst_map->get (stmts))
1705 {
1706 if (dump_enabled_p ())
1707 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1708 !(*leader)->failed ? "" : "failed ",
1709 (void *) *leader);
1710 if (!(*leader)->failed)
1711 {
1712 SLP_TREE_REF_COUNT (*leader)++;
1713 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1714 stmts.release ();
1715 return *leader;
1716 }
1717 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1718 return NULL;
1719 }
1720
1721 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1722 so we can pick up backedge destinations during discovery. */
1723 slp_tree res = new _slp_tree;
1724 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1725 SLP_TREE_SCALAR_STMTS (res) = stmts;
1726 bst_map->put (stmts.copy (), res);
1727
1728 if (*limit == 0)
1729 {
1730 if (dump_enabled_p ())
1731 dump_printf_loc (MSG_NOTE, vect_location,
1732 "SLP discovery limit exceeded\n");
1733 /* Mark the node invalid so we can detect those when still in use
1734 as backedge destinations. */
1735 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1736 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1737 res->failed = XNEWVEC (bool, group_size);
1738 memset (res->failed, 0, sizeof (bool) * group_size);
1739 memset (matches, 0, sizeof (bool) * group_size);
1740 return NULL;
1741 }
1742 --*limit;
1743
1744 if (dump_enabled_p ())
1745 dump_printf_loc (MSG_NOTE, vect_location,
1746 "starting SLP discovery for node %p\n", (void *) res);
1747
1748 poly_uint64 this_max_nunits = 1;
1749 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1750 &this_max_nunits,
1751 matches, limit, tree_size, bst_map);
1752 if (!res_)
1753 {
1754 if (dump_enabled_p ())
1755 dump_printf_loc (MSG_NOTE, vect_location,
1756 "SLP discovery for node %p failed\n", (void *) res);
1757 /* Mark the node invalid so we can detect those when still in use
1758 as backedge destinations. */
1759 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1760 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1761 res->failed = XNEWVEC (bool, group_size);
1762 if (flag_checking)
1763 {
1764 unsigned i;
1765 for (i = 0; i < group_size; ++i)
1766 if (!matches[i])
1767 break;
1768 gcc_assert (i < group_size);
1769 }
1770 memcpy (res->failed, matches, sizeof (bool) * group_size);
1771 }
1772 else
1773 {
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_NOTE, vect_location,
1776 "SLP discovery for node %p succeeded\n",
1777 (void *) res);
1778 gcc_assert (res_ == res);
1779 res->max_nunits = this_max_nunits;
1780 vect_update_max_nunits (max_nunits, this_max_nunits);
1781 /* Keep a reference for the bst_map use. */
1782 SLP_TREE_REF_COUNT (res)++;
1783 }
1784 return res_;
1785 }
1786
1787 /* Helper for building an associated SLP node chain. */
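/* The helper below builds the two-operator form: CHILD1 and CHILD2 both get
   OP0 and OP1 as children but use OPER1 resp. OPER2 as representative stmt,
   and PERM selects, per lane, the result of CHILD1 or CHILD2 according to
   LPERM. */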
1788
1789 static void
1790 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1791 slp_tree op0, slp_tree op1,
1792 stmt_vec_info oper1, stmt_vec_info oper2,
1793 vec<std::pair<unsigned, unsigned> > lperm)
1794 {
1795 unsigned group_size = SLP_TREE_LANES (op1);
1796
1797 slp_tree child1 = new _slp_tree;
1798 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1799 SLP_TREE_VECTYPE (child1) = vectype;
1800 SLP_TREE_LANES (child1) = group_size;
1801 SLP_TREE_CHILDREN (child1).create (2);
1802 SLP_TREE_CHILDREN (child1).quick_push (op0);
1803 SLP_TREE_CHILDREN (child1).quick_push (op1);
1804 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1805
1806 slp_tree child2 = new _slp_tree;
1807 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1808 SLP_TREE_VECTYPE (child2) = vectype;
1809 SLP_TREE_LANES (child2) = group_size;
1810 SLP_TREE_CHILDREN (child2).create (2);
1811 SLP_TREE_CHILDREN (child2).quick_push (op0);
1812 SLP_TREE_REF_COUNT (op0)++;
1813 SLP_TREE_CHILDREN (child2).quick_push (op1);
1814 SLP_TREE_REF_COUNT (op1)++;
1815 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1816
1817 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1818 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1819 SLP_TREE_VECTYPE (perm) = vectype;
1820 SLP_TREE_LANES (perm) = group_size;
1821 /* ??? We should set this NULL but that's not expected. */
1822 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1823 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1824 SLP_TREE_CHILDREN (perm).quick_push (child1);
1825 SLP_TREE_CHILDREN (perm).quick_push (child2);
1826 }
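
/* As an illustration (all names made up): for a two-lane group
{ x0 = a0 + b0, x1 = a1 - b1 } with operand nodes OP0 = { a0, a1 } and
OP1 = { b0, b1 }, CHILD1 computes the PLUS for both lanes, CHILD2 the
MINUS for both lanes, and PERM selects lane 0 from CHILD1 and lane 1
from CHILD2 via the lane permutation { (0, 0), (1, 1) }. */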
1827
1828 /* Recursively build an SLP tree for the group of scalar stmts STMTS.
1829 On success return the SLP node built for NODE; on failure return NULL
1830 and set MATCHES to indicate which lanes of the group matched up
1831 (def-stmts that are not isomorphic, require a data permutation or use
1832 an unsupported operation cause a mismatch). A false MATCHES[0]
1833 signals a fatal mismatch. */
1834
1835 static slp_tree
1836 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1837 vec<stmt_vec_info> stmts, unsigned int group_size,
1838 poly_uint64 *max_nunits,
1839 bool *matches, unsigned *limit, unsigned *tree_size,
1840 scalar_stmts_to_slp_tree_map_t *bst_map)
1841 {
1842 unsigned nops, i, this_tree_size = 0;
1843 poly_uint64 this_max_nunits = *max_nunits;
1844
1845 matches[0] = false;
1846
1847 stmt_vec_info stmt_info = stmts[0];
1848 if (!is_a<gcall *> (stmt_info->stmt)
1849 && !is_a<gassign *> (stmt_info->stmt)
1850 && !is_a<gphi *> (stmt_info->stmt))
1851 return NULL;
1852
1853 nops = gimple_num_args (stmt_info->stmt);
1854 if (const int *map = vect_get_operand_map (stmt_info->stmt,
1855 STMT_VINFO_GATHER_SCATTER_P
1856 (stmt_info)))
1857 nops = map[0];
1858
1859 /* If the SLP node is a PHI (induction or reduction), terminate
1860 the recursion. */
1861 bool *skip_args = XALLOCAVEC (bool, nops);
1862 memset (skip_args, 0, sizeof (bool) * nops);
1863 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1864 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1865 {
1866 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1867 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1868 group_size);
1869 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1870 max_nunits))
1871 return NULL;
1872
1873 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1874 if (def_type == vect_induction_def)
1875 {
1876 /* Induction PHIs are not cycles but walk the initial
1877 value. Only for inner loops though; for outer loops
1878 we need to pick up the value from the actual PHIs
1879 to more easily support peeling and epilogue vectorization. */
1880 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1881 if (!nested_in_vect_loop_p (loop, stmt_info))
1882 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1883 else
1884 loop = loop->inner;
1885 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1886 }
1887 else if (def_type == vect_reduction_def
1888 || def_type == vect_double_reduction_def
1889 || def_type == vect_nested_cycle
1890 || def_type == vect_first_order_recurrence)
1891 {
1892 /* Else def types have to match. */
1893 stmt_vec_info other_info;
1894 bool all_same = true;
1895 FOR_EACH_VEC_ELT (stmts, i, other_info)
1896 {
1897 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1898 return NULL;
1899 if (other_info != stmt_info)
1900 all_same = false;
1901 }
1902 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1903 /* Reduction initial values are not explicitly represented. */
1904 if (def_type != vect_first_order_recurrence
1905 && !nested_in_vect_loop_p (loop, stmt_info))
1906 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1907 /* Reduction chain backedge defs are filled manually.
1908 ??? Need a better way to identify a SLP reduction chain PHI.
1909 Or a better overall way to SLP match those. */
1910 if (all_same && def_type == vect_reduction_def)
1911 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1912 }
1913 else if (def_type != vect_internal_def)
1914 return NULL;
1915 }
1916
1917
1918 bool two_operators = false;
1919 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1920 tree vectype = NULL_TREE;
1921 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1922 &this_max_nunits, matches, &two_operators,
1923 &vectype))
1924 return NULL;
1925
1926 /* If the SLP node is a load, terminate the recursion unless masked. */
1927 if (STMT_VINFO_DATA_REF (stmt_info)
1928 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1929 {
1930 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1931 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1932 else
1933 {
1934 *max_nunits = this_max_nunits;
1935 (*tree_size)++;
1936 node = vect_create_new_slp_node (node, stmts, 0);
1937 SLP_TREE_VECTYPE (node) = vectype;
1938 /* And compute the load permutation. Whether it is actually
1939 a permutation depends on the unrolling factor which is
1940 decided later. */
1941 vec<unsigned> load_permutation;
1942 int j;
1943 stmt_vec_info load_info;
1944 load_permutation.create (group_size);
1945 stmt_vec_info first_stmt_info
1946 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1947 bool any_permute = false;
1948 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1949 {
1950 int load_place;
1951 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1952 load_place = vect_get_place_in_interleaving_chain
1953 (load_info, first_stmt_info);
1954 else
1955 load_place = 0;
1956 gcc_assert (load_place != -1);
1957 any_permute |= load_place != j;
1958 load_permutation.quick_push (load_place);
1959 }
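
/* As an illustration (array A made up): for a four-lane group of loads
{ A[1], A[0], A[3], A[2] } whose interleaving chain starts at A[0], the
load_permutation computed above is { 1, 0, 3, 2 } and any_permute is
true; loads appearing in chain order would give the identity
{ 0, 1, 2, 3 } with any_permute false. */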
1960
1961 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1962 {
1963 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1964 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1965 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
1966 || gimple_call_internal_p (stmt,
1967 IFN_MASK_LEN_GATHER_LOAD));
1968 load_permutation.release ();
1969 /* We cannot handle permuted masked loads, see PR114375. */
1970 if (any_permute
1971 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1972 && DR_GROUP_SIZE (first_stmt_info) != group_size)
1973 || STMT_VINFO_STRIDED_P (stmt_info))
1974 {
1975 matches[0] = false;
1976 return NULL;
1977 }
1978 }
1979 else
1980 {
1981 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1982 return node;
1983 }
1984 }
1985 }
1986 else if (gimple_assign_single_p (stmt_info->stmt)
1987 && !gimple_vuse (stmt_info->stmt)
1988 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1989 {
1990 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1991 the same SSA name vector, of a type compatible with vectype. */
1992 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1993 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1994 stmt_vec_info estmt_info;
1995 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1996 {
1997 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1998 tree bfref = gimple_assign_rhs1 (estmt);
1999 HOST_WIDE_INT lane;
2000 if (!known_eq (bit_field_size (bfref),
2001 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
2002 || !constant_multiple_p (bit_field_offset (bfref),
2003 bit_field_size (bfref), &lane))
2004 {
2005 lperm.release ();
2006 matches[0] = false;
2007 return NULL;
2008 }
2009 lperm.safe_push (std::make_pair (0, (unsigned)lane));
2010 }
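
/* As an illustration (vector and element size made up): for two extracts
_1 = BIT_FIELD_REF <v_2, 32, 32> and _3 = BIT_FIELD_REF <v_2, 32, 0>
from a V4SI vector v_2, the lanes computed above are 1 and 0 and lperm
becomes { (0, 1), (0, 0) }. */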
2011 slp_tree vnode = vect_create_new_slp_node (vNULL);
2012 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2013 /* ??? We record vectype here but we hide eventually necessary
2014 punning and instead rely on code generation to materialize
2015 VIEW_CONVERT_EXPRs as necessary. We instead should make
2016 this explicit somehow. */
2017 SLP_TREE_VECTYPE (vnode) = vectype;
2018 else
2019 {
2020 /* For different size but compatible elements we can still
2021 use VEC_PERM_EXPR without punning. */
2022 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2023 && types_compatible_p (TREE_TYPE (vectype),
2024 TREE_TYPE (TREE_TYPE (vec))));
2025 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2026 }
2027 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2028 unsigned HOST_WIDE_INT const_nunits;
2029 if (nunits.is_constant (&const_nunits))
2030 SLP_TREE_LANES (vnode) = const_nunits;
2031 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2032 /* We are always building a permutation node even if it is an identity
2033 permute to shield the rest of the vectorizer from the odd node
2034 representing an actual vector without any scalar ops.
2035 ??? We could hide it completely by making the permute node
2036 external? */
2037 node = vect_create_new_slp_node (node, stmts, 1);
2038 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2039 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2040 SLP_TREE_VECTYPE (node) = vectype;
2041 SLP_TREE_CHILDREN (node).quick_push (vnode);
2042 return node;
2043 }
2044 /* When discovery reaches an associatable operation see whether we can
2045 improve that to match up lanes in a way superior to the operand
2046 swapping code which at most looks at two defs.
2047 ??? For BB vectorization we cannot do the brute-force search
2048 for matching as we can succeed by means of builds from scalars
2049 and have no good way to "cost" one build against another. */
2050 else if (is_a <loop_vec_info> (vinfo)
2051 /* ??? We don't handle !vect_internal_def defs below. */
2052 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2053 && is_gimple_assign (stmt_info->stmt)
2054 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2055 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2056 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2057 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2058 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2059 {
2060 /* See if we have a chain of (mixed) adds or subtracts or other
2061 associatable ops. */
2062 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2063 if (code == MINUS_EXPR)
2064 code = PLUS_EXPR;
2065 stmt_vec_info other_op_stmt_info = NULL;
2066 stmt_vec_info op_stmt_info = NULL;
2067 unsigned chain_len = 0;
2068 auto_vec<chain_op_t> chain;
2069 auto_vec<std::pair<tree_code, gimple *> > worklist;
2070 auto_vec<vec<chain_op_t> > chains (group_size);
2071 auto_vec<slp_tree, 4> children;
2072 bool hard_fail = true;
2073 for (unsigned lane = 0; lane < group_size; ++lane)
2074 {
2075 /* For each lane linearize the addition/subtraction (or other
2076 uniform associatable operation) expression tree. */
2077 gimple *op_stmt = NULL, *other_op_stmt = NULL;
2078 vect_slp_linearize_chain (vinfo, worklist, chain, code,
2079 stmts[lane]->stmt, op_stmt, other_op_stmt,
2080 NULL);
2081 if (!op_stmt_info && op_stmt)
2082 op_stmt_info = vinfo->lookup_stmt (op_stmt);
2083 if (!other_op_stmt_info && other_op_stmt)
2084 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2085 if (chain.length () == 2)
2086 {
2087 /* In a chain of just two elements resort to the regular
2088 operand swapping scheme. If we run into a length
2089 mismatch still hard-FAIL. */
2090 if (chain_len == 0)
2091 hard_fail = false;
2092 else
2093 {
2094 matches[lane] = false;
2095 /* ??? We might want to process the other lanes, but
2096 make sure to not give false matching hints to the
2097 caller for lanes we did not process. */
2098 if (lane != group_size - 1)
2099 matches[0] = false;
2100 }
2101 break;
2102 }
2103 else if (chain_len == 0)
2104 chain_len = chain.length ();
2105 else if (chain.length () != chain_len)
2106 {
2107 /* ??? Here we could slip in magic to compensate with
2108 neutral operands. */
2109 matches[lane] = false;
2110 if (lane != group_size - 1)
2111 matches[0] = false;
2112 break;
2113 }
2114 chains.quick_push (chain.copy ());
2115 chain.truncate (0);
2116 }
2117 if (chains.length () == group_size)
2118 {
2119 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2120 if (!op_stmt_info)
2121 {
2122 hard_fail = false;
2123 goto out;
2124 }
2125 /* Now we have a set of chains with the same length. */
2126 /* 1. pre-sort according to def_type and operation. */
2127 for (unsigned lane = 0; lane < group_size; ++lane)
2128 chains[lane].stablesort (dt_sort_cmp, vinfo);
2129 if (dump_enabled_p ())
2130 {
2131 dump_printf_loc (MSG_NOTE, vect_location,
2132 "pre-sorted chains of %s\n",
2133 get_tree_code_name (code));
2134 for (unsigned lane = 0; lane < group_size; ++lane)
2135 {
2136 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2137 dump_printf (MSG_NOTE, "%s %T ",
2138 get_tree_code_name (chains[lane][opnum].code),
2139 chains[lane][opnum].op);
2140 dump_printf (MSG_NOTE, "\n");
2141 }
2142 }
2143 /* 2. try to build children nodes, associating as necessary. */
2144 for (unsigned n = 0; n < chain_len; ++n)
2145 {
2146 vect_def_type dt = chains[0][n].dt;
2147 unsigned lane;
2148 for (lane = 0; lane < group_size; ++lane)
2149 if (chains[lane][n].dt != dt)
2150 {
2151 if (dt == vect_constant_def
2152 && chains[lane][n].dt == vect_external_def)
2153 dt = vect_external_def;
2154 else if (dt == vect_external_def
2155 && chains[lane][n].dt == vect_constant_def)
2156 ;
2157 else
2158 break;
2159 }
2160 if (lane != group_size)
2161 {
2162 if (dump_enabled_p ())
2163 dump_printf_loc (MSG_NOTE, vect_location,
2164 "giving up on chain due to mismatched "
2165 "def types\n");
2166 matches[lane] = false;
2167 if (lane != group_size - 1)
2168 matches[0] = false;
2169 goto out;
2170 }
2171 if (dt == vect_constant_def
2172 || dt == vect_external_def)
2173 {
2174 /* Check whether we can build the invariant. If we can't
2175 we never will be able to. */
2176 tree type = TREE_TYPE (chains[0][n].op);
2177 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2178 && (TREE_CODE (type) == BOOLEAN_TYPE
2179 || !can_duplicate_and_interleave_p (vinfo, group_size,
2180 type)))
2181 {
2182 matches[0] = false;
2183 goto out;
2184 }
2185 vec<tree> ops;
2186 ops.create (group_size);
2187 for (lane = 0; lane < group_size; ++lane)
2188 ops.quick_push (chains[lane][n].op);
2189 slp_tree child = vect_create_new_slp_node (ops);
2190 SLP_TREE_DEF_TYPE (child) = dt;
2191 children.safe_push (child);
2192 }
2193 else if (dt != vect_internal_def)
2194 {
2195 /* Not sure, we might need sth special.
2196 gcc.dg/vect/pr96854.c,
2197 gfortran.dg/vect/fast-math-pr37021.f90
2198 and gfortran.dg/vect/pr61171.f trigger. */
2199 /* Soft-fail for now. */
2200 hard_fail = false;
2201 goto out;
2202 }
2203 else
2204 {
2205 vec<stmt_vec_info> op_stmts;
2206 op_stmts.create (group_size);
2207 slp_tree child = NULL;
2208 /* Brute-force our way. We have to consider a lane
2209 failing after fixing an earlier fail up in the
2210 SLP discovery recursion. So track the current
2211 permute per lane. */
2212 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2213 memset (perms, 0, sizeof (unsigned) * group_size);
2214 do
2215 {
2216 op_stmts.truncate (0);
2217 for (lane = 0; lane < group_size; ++lane)
2218 op_stmts.quick_push
2219 (vinfo->lookup_def (chains[lane][n].op));
2220 child = vect_build_slp_tree (vinfo, op_stmts,
2221 group_size, &this_max_nunits,
2222 matches, limit,
2223 &this_tree_size, bst_map);
2224 /* ??? We're likely getting too many fatal mismatches
2225 here so maybe we want to ignore them (but then we
2226 have no idea which lanes fatally mismatched). */
2227 if (child || !matches[0])
2228 break;
2229 /* Swap another lane we have not yet matched up into
2230 lanes that did not match. If we run out of
2231 permute possibilities for a lane terminate the
2232 search. */
2233 bool term = false;
2234 for (lane = 1; lane < group_size; ++lane)
2235 if (!matches[lane])
2236 {
2237 if (n + perms[lane] + 1 == chain_len)
2238 {
2239 term = true;
2240 break;
2241 }
2242 std::swap (chains[lane][n],
2243 chains[lane][n + perms[lane] + 1]);
2244 perms[lane]++;
2245 }
2246 if (term)
2247 break;
2248 }
2249 while (1);
2250 if (!child)
2251 {
2252 if (dump_enabled_p ())
2253 dump_printf_loc (MSG_NOTE, vect_location,
2254 "failed to match up op %d\n", n);
2255 op_stmts.release ();
2256 if (lane != group_size - 1)
2257 matches[0] = false;
2258 else
2259 matches[lane] = false;
2260 goto out;
2261 }
2262 if (dump_enabled_p ())
2263 {
2264 dump_printf_loc (MSG_NOTE, vect_location,
2265 "matched up op %d to\n", n);
2266 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2267 }
2268 children.safe_push (child);
2269 }
2270 }
2271 /* 3. build SLP nodes to combine the chain. */
2272 for (unsigned lane = 0; lane < group_size; ++lane)
2273 if (chains[lane][0].code != code)
2274 {
2275 /* See if there's any alternate all-PLUS entry. */
2276 unsigned n;
2277 for (n = 1; n < chain_len; ++n)
2278 {
2279 for (lane = 0; lane < group_size; ++lane)
2280 if (chains[lane][n].code != code)
2281 break;
2282 if (lane == group_size)
2283 break;
2284 }
2285 if (n != chain_len)
2286 {
2287 /* Swap that in at first position. */
2288 std::swap (children[0], children[n]);
2289 for (lane = 0; lane < group_size; ++lane)
2290 std::swap (chains[lane][0], chains[lane][n]);
2291 }
2292 else
2293 {
2294 /* ??? When this triggers and we end up with two
2295 vect_constant/external_def up-front things break (ICE)
2296 spectacularly finding an insertion place for the
2297 all-constant op. We should have a fully
2298 vect_internal_def operand though(?) so we can swap
2299 that into first place and then prepend the all-zero
2300 constant. */
2301 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_NOTE, vect_location,
2303 "inserting constant zero to compensate "
2304 "for (partially) negated first "
2305 "operand\n");
2306 chain_len++;
2307 for (lane = 0; lane < group_size; ++lane)
2308 chains[lane].safe_insert
2309 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2310 vec<tree> zero_ops;
2311 zero_ops.create (group_size);
2312 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2313 for (lane = 1; lane < group_size; ++lane)
2314 zero_ops.quick_push (zero_ops[0]);
2315 slp_tree zero = vect_create_new_slp_node (zero_ops);
2316 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2317 children.safe_insert (0, zero);
2318 }
2319 break;
2320 }
2321 for (unsigned i = 1; i < children.length (); ++i)
2322 {
2323 slp_tree op0 = children[i - 1];
2324 slp_tree op1 = children[i];
2325 bool this_two_op = false;
2326 for (unsigned lane = 0; lane < group_size; ++lane)
2327 if (chains[lane][i].code != chains[0][i].code)
2328 {
2329 this_two_op = true;
2330 break;
2331 }
2332 slp_tree child;
2333 if (i == children.length () - 1)
2334 child = vect_create_new_slp_node (node, stmts, 2);
2335 else
2336 child = vect_create_new_slp_node (2, ERROR_MARK);
2337 if (this_two_op)
2338 {
2339 vec<std::pair<unsigned, unsigned> > lperm;
2340 lperm.create (group_size);
2341 for (unsigned lane = 0; lane < group_size; ++lane)
2342 lperm.quick_push (std::make_pair
2343 (chains[lane][i].code != chains[0][i].code, lane));
2344 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2345 (chains[0][i].code == code
2346 ? op_stmt_info
2347 : other_op_stmt_info),
2348 (chains[0][i].code == code
2349 ? other_op_stmt_info
2350 : op_stmt_info),
2351 lperm);
2352 }
2353 else
2354 {
2355 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2356 SLP_TREE_VECTYPE (child) = vectype;
2357 SLP_TREE_LANES (child) = group_size;
2358 SLP_TREE_CHILDREN (child).quick_push (op0);
2359 SLP_TREE_CHILDREN (child).quick_push (op1);
2360 SLP_TREE_REPRESENTATIVE (child)
2361 = (chains[0][i].code == code
2362 ? op_stmt_info : other_op_stmt_info);
2363 }
2364 children[i] = child;
2365 }
2366 *tree_size += this_tree_size + 1;
2367 *max_nunits = this_max_nunits;
2368 while (!chains.is_empty ())
2369 chains.pop ().release ();
2370 return node;
2371 }
2372 out:
2373 while (!children.is_empty ())
2374 vect_free_slp_tree (children.pop ());
2375 while (!chains.is_empty ())
2376 chains.pop ().release ();
2377 /* Hard-fail, otherwise we might run into quadratic processing of the
2378 chains starting one stmt into the chain again. */
2379 if (hard_fail)
2380 return NULL;
2381 /* Fall thru to normal processing. */
2382 }
2383
2384 /* Get at the operands, verifying they are compatible. */
2385 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2386 slp_oprnd_info oprnd_info;
2387 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2388 {
2389 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2390 stmts, i, &oprnds_info);
2391 if (res != 0)
2392 matches[(res == -1) ? 0 : i] = false;
2393 if (!matches[0])
2394 break;
2395 }
2396 for (i = 0; i < group_size; ++i)
2397 if (!matches[i])
2398 {
2399 vect_free_oprnd_info (oprnds_info);
2400 return NULL;
2401 }
2402 swap = NULL;
2403
2404 auto_vec<slp_tree, 4> children;
2405
2406 stmt_info = stmts[0];
2407
2408 /* Create SLP_TREE nodes for the definition node/s. */
2409 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2410 {
2411 slp_tree child = nullptr;
2412 unsigned int j;
2413
2414 /* We're skipping certain operands from processing, for example
2415 outer loop reduction initial defs. */
2416 if (skip_args[i])
2417 {
2418 children.safe_push (NULL);
2419 continue;
2420 }
2421
2422 if (oprnd_info->first_dt == vect_uninitialized_def)
2423 {
2424 /* COND_EXPRs can end up with one operand too many if the condition
2425 is an SSA name. */
2426 gcc_assert (i == 3 && nops == 4);
2427 continue;
2428 }
2429
2430 if (is_a <bb_vec_info> (vinfo)
2431 && oprnd_info->first_dt == vect_internal_def
2432 && !oprnd_info->any_pattern)
2433 {
2434 /* For BB vectorization, if all defs are the same do not
2435 bother to continue the build along the single-lane
2436 graph but use a splat of the scalar value. */
2437 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2438 for (j = 1; j < group_size; ++j)
2439 if (oprnd_info->def_stmts[j] != first_def)
2440 break;
2441 if (j == group_size
2442 /* But avoid doing this for loads where we may be
2443 able to CSE things, unless the stmt is not
2444 vectorizable. */
2445 && (!STMT_VINFO_VECTORIZABLE (first_def)
2446 || !gimple_vuse (first_def->stmt)))
2447 {
2448 if (dump_enabled_p ())
2449 dump_printf_loc (MSG_NOTE, vect_location,
2450 "Using a splat of the uniform operand %G",
2451 first_def->stmt);
2452 oprnd_info->first_dt = vect_external_def;
2453 }
2454 }
2455
2456 if (oprnd_info->first_dt == vect_external_def
2457 || oprnd_info->first_dt == vect_constant_def)
2458 {
2459 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2460 {
2461 tree op0;
2462 tree uniform_val = op0 = oprnd_info->ops[0];
2463 for (j = 1; j < oprnd_info->ops.length (); ++j)
2464 if (!operand_equal_p (uniform_val, oprnd_info->ops[j]))
2465 {
2466 uniform_val = NULL_TREE;
2467 break;
2468 }
2469 if (!uniform_val
2470 && !can_duplicate_and_interleave_p (vinfo,
2471 oprnd_info->ops.length (),
2472 TREE_TYPE (op0)))
2473 {
2474 matches[j] = false;
2475 if (dump_enabled_p ())
2476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2477 "Build SLP failed: invalid type of def "
2478 "for variable-length SLP %T\n", op0);
2479 goto fail;
2480 }
2481 }
2482 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2483 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2484 oprnd_info->ops = vNULL;
2485 children.safe_push (invnode);
2486 continue;
2487 }
2488
2489 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2490 group_size, &this_max_nunits,
2491 matches, limit,
2492 &this_tree_size, bst_map)) != NULL)
2493 {
2494 oprnd_info->def_stmts = vNULL;
2495 children.safe_push (child);
2496 continue;
2497 }
2498
2499 /* If the SLP build for operand zero failed and operand zero
2500 and one can be commuted, try that for the scalar stmts
2501 that failed the match. */
2502 if (i == 0
2503 /* A first scalar stmt mismatch signals a fatal mismatch. */
2504 && matches[0]
2505 /* ??? For COND_EXPRs we can swap the comparison operands
2506 as well as the arms under some constraints. */
2507 && nops == 2
2508 && oprnds_info[1]->first_dt == vect_internal_def
2509 && is_gimple_assign (stmt_info->stmt)
2510 /* Swapping operands for reductions breaks assumptions later on. */
2511 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2512 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2513 {
2514 /* See whether we can swap the matching or the non-matching
2515 stmt operands. */
2516 bool swap_not_matching = true;
2517 do
2518 {
2519 for (j = 0; j < group_size; ++j)
2520 {
2521 if (matches[j] != !swap_not_matching)
2522 continue;
2523 stmt_vec_info stmt_info = stmts[j];
2524 /* Verify if we can swap operands of this stmt. */
2525 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2526 if (!stmt
2527 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2528 {
2529 if (!swap_not_matching)
2530 goto fail;
2531 swap_not_matching = false;
2532 break;
2533 }
2534 }
2535 }
2536 while (j != group_size);
2537
2538 /* Swap mismatched definition stmts. */
2539 if (dump_enabled_p ())
2540 dump_printf_loc (MSG_NOTE, vect_location,
2541 "Re-trying with swapped operands of stmts ");
2542 for (j = 0; j < group_size; ++j)
2543 if (matches[j] == !swap_not_matching)
2544 {
2545 std::swap (oprnds_info[0]->def_stmts[j],
2546 oprnds_info[1]->def_stmts[j]);
2547 std::swap (oprnds_info[0]->ops[j],
2548 oprnds_info[1]->ops[j]);
2549 if (dump_enabled_p ())
2550 dump_printf (MSG_NOTE, "%d ", j);
2551 }
2552 if (dump_enabled_p ())
2553 dump_printf (MSG_NOTE, "\n");
2554 /* After swapping some operands we lost track whether an
2555 operand has any pattern defs so be conservative here. */
2556 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2557 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2558 /* And try again with scratch 'matches' ... */
2559 bool *tem = XALLOCAVEC (bool, group_size);
2560 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2561 group_size, &this_max_nunits,
2562 tem, limit,
2563 &this_tree_size, bst_map)) != NULL)
2564 {
2565 oprnd_info->def_stmts = vNULL;
2566 children.safe_push (child);
2567 continue;
2568 }
2569 }
2570 fail:
2571
2572 /* If the SLP build failed and we analyze a basic-block
2573 simply treat nodes we fail to build as externally defined
2574 (and thus build vectors from the scalar defs).
2575 The cost model will reject outright expensive cases.
2576 ??? This doesn't treat cases where permutation ultimately
2577 fails (or we don't try permutation below). Ideally we'd
2578 even compute a permutation that will end up with the maximum
2579 SLP tree size... */
2580 if (is_a <bb_vec_info> (vinfo)
2581 /* ??? Rejecting patterns this way doesn't work. We'd have to
2582 do extra work to cancel the pattern so the uses see the
2583 scalar version. */
2584 && !is_pattern_stmt_p (stmt_info)
2585 && !oprnd_info->any_pattern)
2586 {
2587 /* But if there's a leading vector sized set of matching stmts
2588 fail here so we can split the group. This matches the condition
2589 vect_analyze_slp_instance uses. */
2590 /* ??? We might want to split here and combine the results to support
2591 multiple vector sizes better. */
2592 for (j = 0; j < group_size; ++j)
2593 if (!matches[j])
2594 break;
2595 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2596 {
2597 if (dump_enabled_p ())
2598 dump_printf_loc (MSG_NOTE, vect_location,
2599 "Building vector operands from scalars\n");
2600 this_tree_size++;
2601 child = vect_create_new_slp_node (oprnd_info->ops);
2602 children.safe_push (child);
2603 oprnd_info->ops = vNULL;
2604 continue;
2605 }
2606 }
2607
2608 gcc_assert (child == NULL);
2609 FOR_EACH_VEC_ELT (children, j, child)
2610 if (child)
2611 vect_free_slp_tree (child);
2612 vect_free_oprnd_info (oprnds_info);
2613 return NULL;
2614 }
2615
2616 vect_free_oprnd_info (oprnds_info);
2617
2618 /* If all children of this node are built up from uniform scalars, or
2619 building them requires more than one possibly expensive vector
2620 construction, throw the node away so that it is built up from scalars
2621 instead. The exception is the SLP node for the vector store. */
2622 if (is_a <bb_vec_info> (vinfo)
2623 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2624 /* ??? Rejecting patterns this way doesn't work. We'd have to
2625 do extra work to cancel the pattern so the uses see the
2626 scalar version. */
2627 && !is_pattern_stmt_p (stmt_info))
2628 {
2629 slp_tree child;
2630 unsigned j;
2631 bool all_uniform_p = true;
2632 unsigned n_vector_builds = 0;
2633 FOR_EACH_VEC_ELT (children, j, child)
2634 {
2635 if (!child)
2636 ;
2637 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2638 all_uniform_p = false;
2639 else if (!vect_slp_tree_uniform_p (child))
2640 {
2641 all_uniform_p = false;
2642 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2643 n_vector_builds++;
2644 }
2645 }
2646 if (all_uniform_p
2647 || n_vector_builds > 1
2648 || (n_vector_builds == children.length ()
2649 && is_a <gphi *> (stmt_info->stmt)))
2650 {
2651 /* Roll back. */
2652 matches[0] = false;
2653 FOR_EACH_VEC_ELT (children, j, child)
2654 if (child)
2655 vect_free_slp_tree (child);
2656
2657 if (dump_enabled_p ())
2658 dump_printf_loc (MSG_NOTE, vect_location,
2659 "Building parent vector operands from "
2660 "scalars instead\n");
2661 return NULL;
2662 }
2663 }
2664
2665 *tree_size += this_tree_size + 1;
2666 *max_nunits = this_max_nunits;
2667
2668 if (two_operators)
2669 {
2670 /* ??? We'd likely want to either cache in bst_map sth like
2671 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2672 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2673 explicit stmts to put in so the keying on 'stmts' doesn't
2674 work (but we have the same issue with nodes that use 'ops'). */
2675 slp_tree one = new _slp_tree;
2676 slp_tree two = new _slp_tree;
2677 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2678 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2679 SLP_TREE_VECTYPE (one) = vectype;
2680 SLP_TREE_VECTYPE (two) = vectype;
2681 SLP_TREE_CHILDREN (one).safe_splice (children);
2682 SLP_TREE_CHILDREN (two).safe_splice (children);
2683 slp_tree child;
2684 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2685 SLP_TREE_REF_COUNT (child)++;
2686
2687 /* Here we record the original defs since this
2688 node represents the final lane configuration. */
2689 node = vect_create_new_slp_node (node, stmts, 2);
2690 SLP_TREE_VECTYPE (node) = vectype;
2691 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2692 SLP_TREE_CHILDREN (node).quick_push (one);
2693 SLP_TREE_CHILDREN (node).quick_push (two);
2694 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2695 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2696 enum tree_code ocode = ERROR_MARK;
2697 stmt_vec_info ostmt_info;
2698 unsigned j = 0;
2699 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2700 {
2701 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2702 if (gimple_assign_rhs_code (ostmt) != code0)
2703 {
2704 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2705 ocode = gimple_assign_rhs_code (ostmt);
2706 j = i;
2707 }
2708 else
2709 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2710 }
2711 SLP_TREE_CODE (one) = code0;
2712 SLP_TREE_CODE (two) = ocode;
2713 SLP_TREE_LANES (one) = stmts.length ();
2714 SLP_TREE_LANES (two) = stmts.length ();
2715 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2716 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2717 return node;
2718 }
2719
2720 node = vect_create_new_slp_node (node, stmts, nops);
2721 SLP_TREE_VECTYPE (node) = vectype;
2722 SLP_TREE_CHILDREN (node).splice (children);
2723 return node;
2724 }
2725
2726 /* Dump a single SLP tree NODE. */
2727
2728 static void
2729 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2730 slp_tree node)
2731 {
2732 unsigned i, j;
2733 slp_tree child;
2734 stmt_vec_info stmt_info;
2735 tree op;
2736
2737 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2738 dump_user_location_t user_loc = loc.get_user_location ();
2739 dump_printf_loc (metadata, user_loc,
2740 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2741 ", refcnt=%u)",
2742 SLP_TREE_DEF_TYPE (node) == vect_external_def
2743 ? " (external)"
2744 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2745 ? " (constant)"
2746 : ""), (void *) node,
2747 estimated_poly_value (node->max_nunits),
2748 SLP_TREE_REF_COUNT (node));
2749 if (SLP_TREE_VECTYPE (node))
2750 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2751 dump_printf (metadata, "\n");
2752 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2753 {
2754 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2755 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2756 else
2757 dump_printf_loc (metadata, user_loc, "op template: %G",
2758 SLP_TREE_REPRESENTATIVE (node)->stmt);
2759 }
2760 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2761 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2762 dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
2763 STMT_VINFO_LIVE_P (stmt_info) ? "[l] " : "",
2764 i, stmt_info->stmt);
2765 else
2766 {
2767 dump_printf_loc (metadata, user_loc, "\t{ ");
2768 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2769 dump_printf (metadata, "%T%s ", op,
2770 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2771 dump_printf (metadata, "}\n");
2772 }
2773 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2774 {
2775 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2776 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2777 dump_printf (dump_kind, " %u", j);
2778 dump_printf (dump_kind, " }\n");
2779 }
2780 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2781 {
2782 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2783 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2784 dump_printf (dump_kind, " %u[%u]",
2785 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2786 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2787 dump_printf (dump_kind, " }\n");
2788 }
2789 if (SLP_TREE_CHILDREN (node).is_empty ())
2790 return;
2791 dump_printf_loc (metadata, user_loc, "\tchildren");
2792 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2793 dump_printf (dump_kind, " %p", (void *)child);
2794 dump_printf (dump_kind, "\n");
2795 }
2796
2797 DEBUG_FUNCTION void
2798 debug (slp_tree node)
2799 {
2800 debug_dump_context ctx;
2801 vect_print_slp_tree (MSG_NOTE,
2802 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2803 node);
2804 }
2805
2806 /* Recursive helper for the dot producer below. */
2807
2808 static void
2809 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2810 {
2811 if (visited.add (node))
2812 return;
2813
2814 fprintf (f, "\"%p\" [label=\"", (void *)node);
2815 vect_print_slp_tree (MSG_NOTE,
2816 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2817 node);
2818 fprintf (f, "\"];\n");
2819
2820
2821 for (slp_tree child : SLP_TREE_CHILDREN (node))
2822 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2823
2824 for (slp_tree child : SLP_TREE_CHILDREN (node))
2825 if (child)
2826 dot_slp_tree (f, child, visited);
2827 }
2828
2829 DEBUG_FUNCTION void
2830 dot_slp_tree (const char *fname, slp_tree node)
2831 {
2832 FILE *f = fopen (fname, "w");
2833 fprintf (f, "digraph {\n");
2834 fflush (f);
2835 {
2836 debug_dump_context ctx (f);
2837 hash_set<slp_tree> visited;
2838 dot_slp_tree (f, node, visited);
2839 }
2840 fflush (f);
2841 fprintf (f, "}\n");
2842 fclose (f);
2843 }
2844
2845 DEBUG_FUNCTION void
2846 dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
2847 {
2848 FILE *f = fopen (fname, "w");
2849 fprintf (f, "digraph {\n");
2850 fflush (f);
2851 {
2852 debug_dump_context ctx (f);
2853 hash_set<slp_tree> visited;
2854 for (auto inst : slp_instances)
2855 dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
2856 }
2857 fflush (f);
2858 fprintf (f, "}\n");
2859 fclose (f);
2860 }
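
/* The DEBUG_FUNCTION entry points above are meant to be called from a
debugger, e.g. "(gdb) call dot_slp_tree ("/tmp/slp.dot", node)"; the
resulting file can then be rendered with Graphviz, e.g.
"dot -Tpdf /tmp/slp.dot -o slp.pdf". */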
2861
2862 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2863
2864 static void
2865 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2866 slp_tree node, hash_set<slp_tree> &visited)
2867 {
2868 unsigned i;
2869 slp_tree child;
2870
2871 if (visited.add (node))
2872 return;
2873
2874 vect_print_slp_tree (dump_kind, loc, node);
2875
2876 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2877 if (child)
2878 vect_print_slp_graph (dump_kind, loc, child, visited);
2879 }
2880
2881 static void
2882 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2883 slp_tree entry)
2884 {
2885 hash_set<slp_tree> visited;
2886 vect_print_slp_graph (dump_kind, loc, entry, visited);
2887 }
2888
2889 /* Mark the tree rooted at NODE with PURE_SLP. */
2890
2891 static void
2892 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2893 {
2894 int i;
2895 stmt_vec_info stmt_info;
2896 slp_tree child;
2897
2898 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2899 return;
2900
2901 if (visited.add (node))
2902 return;
2903
2904 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2905 STMT_SLP_TYPE (stmt_info) = pure_slp;
2906
2907 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2908 if (child)
2909 vect_mark_slp_stmts (child, visited);
2910 }
2911
2912 static void
2913 vect_mark_slp_stmts (slp_tree node)
2914 {
2915 hash_set<slp_tree> visited;
2916 vect_mark_slp_stmts (node, visited);
2917 }
2918
2919 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2920
2921 static void
2922 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2923 {
2924 int i;
2925 stmt_vec_info stmt_info;
2926 slp_tree child;
2927
2928 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2929 return;
2930
2931 if (visited.add (node))
2932 return;
2933
2934 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2935 {
2936 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2937 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2938 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2939 }
2940
2941 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2942 if (child)
2943 vect_mark_slp_stmts_relevant (child, visited);
2944 }
2945
2946 static void
2947 vect_mark_slp_stmts_relevant (slp_tree node)
2948 {
2949 hash_set<slp_tree> visited;
2950 vect_mark_slp_stmts_relevant (node, visited);
2951 }
2952
2953
2954 /* Gather loads in the SLP graph rooted at NODE into the LOADS array. */
2955
2956 static void
2957 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2958 hash_set<slp_tree> &visited)
2959 {
2960 if (!node || visited.add (node))
2961 return;
2962
2963 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2964 return;
2965
2966 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
2967 {
2968 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2969 if (STMT_VINFO_DATA_REF (stmt_info)
2970 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2971 loads.safe_push (node);
2972 }
2973
2974 unsigned i;
2975 slp_tree child;
2976 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2977 vect_gather_slp_loads (loads, child, visited);
2978 }
2979
2980
2981 /* Find the last scalar stmt in SLP node NODE. */
2982
2983 stmt_vec_info
2984 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2985 {
2986 stmt_vec_info last = NULL;
2987 stmt_vec_info stmt_vinfo;
2988
2989 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2990 {
2991 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2992 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2993 }
2994
2995 return last;
2996 }
2997
2998 /* Find the first stmt in NODE. */
2999
3000 stmt_vec_info
3001 vect_find_first_scalar_stmt_in_slp (slp_tree node)
3002 {
3003 stmt_vec_info first = NULL;
3004 stmt_vec_info stmt_vinfo;
3005
3006 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3007 {
3008 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3009 if (!first
3010 || get_later_stmt (stmt_vinfo, first) == first)
3011 first = stmt_vinfo;
3012 }
3013
3014 return first;
3015 }
3016
3017 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
3018 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
3019 (also containing the first GROUP1_SIZE stmts, since stores are
3020 consecutive), the second containing the remainder.
3021 Return the first stmt in the second group. */
3022
3023 static stmt_vec_info
3024 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
3025 {
3026 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
3027 gcc_assert (group1_size > 0);
3028 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
3029 gcc_assert (group2_size > 0);
3030 DR_GROUP_SIZE (first_vinfo) = group1_size;
3031
3032 stmt_vec_info stmt_info = first_vinfo;
3033 for (unsigned i = group1_size; i > 1; i--)
3034 {
3035 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
3036 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3037 }
3038 /* STMT is now the last element of the first group. */
3039 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
3040 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
3041
3042 DR_GROUP_SIZE (group2) = group2_size;
3043 for (stmt_info = group2; stmt_info;
3044 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3045 {
3046 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3047 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3048 }
3049
3050 /* For the second group, the DR_GROUP_GAP is that before the original group,
3051 plus skipping over the first vector. */
3052 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3053
3054 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
3055 DR_GROUP_GAP (first_vinfo) += group2_size;
3056
3057 if (dump_enabled_p ())
3058 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3059 group1_size, group2_size);
3060
3061 return group2;
3062 }
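
/* As an illustration (sizes made up): splitting a store group of
DR_GROUP_SIZE 6 with GROUP1_SIZE 4 leaves the first group with
DR_GROUP_SIZE 4 and its DR_GROUP_GAP increased by 2, while the new
second group gets DR_GROUP_SIZE 2 and a DR_GROUP_GAP equal to the
original group's gap plus 4. */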
3063
3064 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3065 statements and a vector of NUNITS elements. */
3066
3067 static poly_uint64
3068 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3069 {
3070 return exact_div (common_multiple (nunits, group_size), group_size);
3071 }
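
/* As an illustration: for NUNITS 4 and GROUP_SIZE 6 the common multiple
is 12 and the unrolling factor is 12 / 6 = 2; when GROUP_SIZE divides
NUNITS (say 4 and 8) the factor is simply NUNITS / GROUP_SIZE. */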
3072
3073 /* Helper that checks to see if a node is a load node. */
3074
3075 static inline bool
3076 vect_is_slp_load_node (slp_tree root)
3077 {
3078 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
3079 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3080 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
3081 }
3082
3083
3084 /* Helper function of optimize_load_redistribution that performs the operation
3085 recursively. */
3086
3087 static slp_tree
3088 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3089 vec_info *vinfo, unsigned int group_size,
3090 hash_map<slp_tree, slp_tree> *load_map,
3091 slp_tree root)
3092 {
3093 if (slp_tree *leader = load_map->get (root))
3094 return *leader;
3095
3096 slp_tree node;
3097 unsigned i;
3098
3099 /* For now, we don't know anything about externals so do not do anything. */
3100 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3101 return NULL;
3102 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3103 {
3104 /* First convert this node into a load node, add it to the leaves
3105 list and flatten the lane permute into a load permute. If it's
3106 unneeded it will be elided later. */
3107 vec<stmt_vec_info> stmts;
3108 stmts.create (SLP_TREE_LANES (root));
3109 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3110 for (unsigned j = 0; j < lane_perm.length (); j++)
3111 {
3112 std::pair<unsigned, unsigned> perm = lane_perm[j];
3113 node = SLP_TREE_CHILDREN (root)[perm.first];
3114
3115 if (!vect_is_slp_load_node (node)
3116 || SLP_TREE_CHILDREN (node).exists ())
3117 {
3118 stmts.release ();
3119 goto next;
3120 }
3121
3122 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3123 }
3124
3125 if (dump_enabled_p ())
3126 dump_printf_loc (MSG_NOTE, vect_location,
3127 "converting stmts on permute node %p\n",
3128 (void *) root);
3129
3130 bool *matches = XALLOCAVEC (bool, group_size);
3131 poly_uint64 max_nunits = 1;
3132 unsigned tree_size = 0, limit = 1;
3133 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3134 matches, &limit, &tree_size, bst_map);
3135 if (!node)
3136 stmts.release ();
3137
3138 load_map->put (root, node);
3139 return node;
3140 }
3141
3142 next:
3143 load_map->put (root, NULL);
3144
3145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3146 {
3147 slp_tree value
3148 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3149 node);
3150 if (value)
3151 {
3152 SLP_TREE_REF_COUNT (value)++;
3153 SLP_TREE_CHILDREN (root)[i] = value;
3154 /* ??? We know the original leaves of the replaced nodes will
3155 be referenced by bst_map, only the permutes created by
3156 pattern matching are not. */
3157 if (SLP_TREE_REF_COUNT (node) == 1)
3158 load_map->remove (node);
3159 vect_free_slp_tree (node);
3160 }
3161 }
3162
3163 return NULL;
3164 }
3165
3166 /* Temporary workaround for loads not being CSEd during SLP build. This
3167 function will traverse the SLP tree rooted in ROOT and find VEC_PERM
3168 nodes that blend vectors from multiple nodes that all read from the
3169 same DR such that the final operation is equal to a permuted load. Such
3170 nodes are then directly converted into loads themselves. The new load
3171 nodes are CSEd using BST_MAP. */
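/* As an illustration (array A made up): a VEC_PERM node selecting lane 1
of a load node { A[0], A[1] } and lane 0 of a load node { A[2], A[3] }
is equal to a load of { A[1], A[2] } and is replaced by a new load node
built for exactly those two scalar stmts. */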
3172
3173 static void
3174 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3175 vec_info *vinfo, unsigned int group_size,
3176 hash_map<slp_tree, slp_tree> *load_map,
3177 slp_tree root)
3178 {
3179 slp_tree node;
3180 unsigned i;
3181
3182 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3183 {
3184 slp_tree value
3185 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3186 node);
3187 if (value)
3188 {
3189 SLP_TREE_REF_COUNT (value)++;
3190 SLP_TREE_CHILDREN (root)[i] = value;
3191 /* ??? We know the original leaves of the replaced nodes will
3192 be referenced by bst_map, only the permutes created by
3193 pattern matching are not. */
3194 if (SLP_TREE_REF_COUNT (node) == 1)
3195 load_map->remove (node);
3196 vect_free_slp_tree (node);
3197 }
3198 }
3199 }
3200
3201 /* Helper function of vect_match_slp_patterns.
3202
3203 Attempts to match patterns against the slp tree rooted in REF_NODE using
3204 VINFO. Patterns are matched in post-order traversal.
3205
3206 If any pattern matches, the node referenced by REF_NODE is updated in
3207 place and true is returned; otherwise false is returned. */
3208
3209 static bool
3210 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3211 slp_tree_to_load_perm_map_t *perm_cache,
3212 slp_compat_nodes_map_t *compat_cache,
3213 hash_set<slp_tree> *visited)
3214 {
3215 unsigned i;
3216 slp_tree node = *ref_node;
3217 bool found_p = false;
3218 if (!node || visited->add (node))
3219 return false;
3220
3221 slp_tree child;
3222 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3223 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3224 vinfo, perm_cache, compat_cache,
3225 visited);
3226
3227 for (unsigned x = 0; x < num__slp_patterns; x++)
3228 {
3229 vect_pattern *pattern
3230 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3231 if (pattern)
3232 {
3233 pattern->build (vinfo);
3234 delete pattern;
3235 found_p = true;
3236 }
3237 }
3238
3239 return found_p;
3240 }
3241
3242 /* Applies pattern matching to the SLP tree of INSTANCE using vec_info
3243 VINFO.
3244
3245 The tree is modified in place; true is returned if any pattern matched.
3246 Patterns are tried in order and multiple patterns may match. */
3247
3248 static bool
3249 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3250 hash_set<slp_tree> *visited,
3251 slp_tree_to_load_perm_map_t *perm_cache,
3252 slp_compat_nodes_map_t *compat_cache)
3253 {
3254 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3255 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3256
3257 if (dump_enabled_p ())
3258 dump_printf_loc (MSG_NOTE, vect_location,
3259 "Analyzing SLP tree %p for patterns\n",
3260 (void *) SLP_INSTANCE_TREE (instance));
3261
3262 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3263 visited);
3264 }
3265
3266 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3267 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3268 Return true if we could use IFN_STORE_LANES instead and if that appears
3269 to be the better approach. */
3270
3271 static bool
3272 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3273 unsigned int group_size,
3274 unsigned int new_group_size)
3275 {
3276 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3277 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3278 if (!vectype)
3279 return false;
3280 /* Allow the split if one of the two new groups would operate on full
3281 vectors *within* rather than across one scalar loop iteration.
3282 This is purely a heuristic, but it should work well for group
3283 sizes of 3 and 4, where the possible splits are:
3284
3285 3->2+1: OK if the vector has exactly two elements
3286 4->2+2: Likewise
3287 4->3+1: Less clear-cut. */
3288 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3289 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3290 return false;
3291 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
3292 }
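
/* As an illustration (using the cases above with a two-element vector):
for GROUP_SIZE 3 and NEW_GROUP_SIZE 2 the second multiple_p test holds,
so we return false and allow the 2+1 split; for GROUP_SIZE 4 and
NEW_GROUP_SIZE 3 neither test holds and we prefer IFN_STORE_LANES
whenever the target supports it for the group. */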
3293
3294 /* Analyze an SLP instance starting from a group of grouped stores. Call
3295 vect_build_slp_tree to build a tree of packed stmts if possible.
3296 Return FALSE if it's impossible to SLP any stmt in the loop. */
3297
3298 static bool
3299 vect_analyze_slp_instance (vec_info *vinfo,
3300 scalar_stmts_to_slp_tree_map_t *bst_map,
3301 stmt_vec_info stmt_info, slp_instance_kind kind,
3302 unsigned max_tree_size, unsigned *limit);
3303
3304 /* Analyze an SLP instance starting from SCALAR_STMTS, which form a group
3305 of kind KIND. Return true if successful. */
3306
3307 static bool
3308 vect_build_slp_instance (vec_info *vinfo,
3309 slp_instance_kind kind,
3310 vec<stmt_vec_info> &scalar_stmts,
3311 vec<stmt_vec_info> &root_stmt_infos,
3312 vec<tree> &remain,
3313 unsigned max_tree_size, unsigned *limit,
3314 scalar_stmts_to_slp_tree_map_t *bst_map,
3315 /* ??? We need stmt_info for group splitting. */
3316 stmt_vec_info stmt_info_)
3317 {
3318 if (kind == slp_inst_kind_ctor)
3319 {
3320 if (dump_enabled_p ())
3321 dump_printf_loc (MSG_NOTE, vect_location,
3322 "Analyzing vectorizable constructor: %G\n",
3323 root_stmt_infos[0]->stmt);
3324 }
3325
3326 if (dump_enabled_p ())
3327 {
3328 dump_printf_loc (MSG_NOTE, vect_location,
3329 "Starting SLP discovery for\n");
3330 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3331 dump_printf_loc (MSG_NOTE, vect_location,
3332 " %G", scalar_stmts[i]->stmt);
3333 }
3334
3335 /* Build the tree for the SLP instance. */
3336 unsigned int group_size = scalar_stmts.length ();
3337 bool *matches = XALLOCAVEC (bool, group_size);
3338 poly_uint64 max_nunits = 1;
3339 unsigned tree_size = 0;
3340 unsigned i;
3341 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3342 &max_nunits, matches, limit,
3343 &tree_size, bst_map);
3344 if (node != NULL)
3345 {
3346 /* Calculate the unrolling factor based on the smallest type. */
3347 poly_uint64 unrolling_factor
3348 = calculate_unrolling_factor (max_nunits, group_size);
3349
3350 if (maybe_ne (unrolling_factor, 1U)
3351 && is_a <bb_vec_info> (vinfo))
3352 {
3353 unsigned HOST_WIDE_INT const_max_nunits;
3354 if (!max_nunits.is_constant (&const_max_nunits)
3355 || const_max_nunits > group_size)
3356 {
3357 if (dump_enabled_p ())
3358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3359 "Build SLP failed: store group "
3360 "size not a multiple of the vector size "
3361 "in basic block SLP\n");
3362 vect_free_slp_tree (node);
3363 return false;
3364 }
3365 /* Fatal mismatch. */
3366 if (dump_enabled_p ())
3367 dump_printf_loc (MSG_NOTE, vect_location,
3368 "SLP discovery succeeded but node needs "
3369 "splitting\n");
3370 memset (matches, true, group_size);
3371 matches[group_size / const_max_nunits * const_max_nunits] = false;
3372 vect_free_slp_tree (node);
3373 }
3374 else
3375 {
3376 /* Create a new SLP instance. */
3377 slp_instance new_instance = XNEW (class _slp_instance);
3378 SLP_INSTANCE_TREE (new_instance) = node;
3379 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3380 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3381 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3382 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3383 SLP_INSTANCE_KIND (new_instance) = kind;
3384 new_instance->reduc_phis = NULL;
3385 new_instance->cost_vec = vNULL;
3386 new_instance->subgraph_entries = vNULL;
3387
3388 if (dump_enabled_p ())
3389 dump_printf_loc (MSG_NOTE, vect_location,
3390 "SLP size %u vs. limit %u.\n",
3391 tree_size, max_tree_size);
3392
3393 /* Fixup SLP reduction chains. */
3394 if (kind == slp_inst_kind_reduc_chain)
3395 {
3396 /* If this is a reduction chain with a conversion in front
3397 amend the SLP tree with a node for that. */
3398 gimple *scalar_def
3399 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3400 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3401 {
3402 /* Get at the conversion stmt - we know it's the single use
3403 of the last stmt of the reduction chain. */
3404 use_operand_p use_p;
3405 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3406 &use_p, &scalar_def);
3407 gcc_assert (r);
3408 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3409 next_info = vect_stmt_to_vectorize (next_info);
3410 scalar_stmts = vNULL;
3411 scalar_stmts.create (group_size);
3412 for (unsigned i = 0; i < group_size; ++i)
3413 scalar_stmts.quick_push (next_info);
3414 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3415 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3416 SLP_TREE_CHILDREN (conv).quick_push (node);
3417 SLP_INSTANCE_TREE (new_instance) = conv;
3418 /* We also have to fake this conversion stmt as SLP reduction
3419 group so we don't have to mess with too much code
3420 elsewhere. */
3421 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3422 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3423 }
3424 /* Fill the backedge child of the PHI SLP node. The
3425 general matching code cannot find it because the
3426 scalar code does not reflect how we vectorize the
3427 reduction. */
3428 use_operand_p use_p;
3429 imm_use_iterator imm_iter;
3430 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3431 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3432 gimple_get_lhs (scalar_def))
3433 /* There are exactly two non-debug uses, the reduction
3434 PHI and the loop-closed PHI node. */
3435 if (!is_gimple_debug (USE_STMT (use_p))
3436 && gimple_bb (USE_STMT (use_p)) == loop->header)
3437 {
3438 auto_vec<stmt_vec_info, 64> phis (group_size);
3439 stmt_vec_info phi_info
3440 = vinfo->lookup_stmt (USE_STMT (use_p));
3441 for (unsigned i = 0; i < group_size; ++i)
3442 phis.quick_push (phi_info);
3443 slp_tree *phi_node = bst_map->get (phis);
3444 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3445 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3446 = SLP_INSTANCE_TREE (new_instance);
3447 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3448 }
3449 }
3450
3451 vinfo->slp_instances.safe_push (new_instance);
3452
3453 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3454 the number of scalar stmts in the root in a few places.
3455 Verify that assumption holds. */
3456 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3457 .length () == group_size);
3458
3459 if (dump_enabled_p ())
3460 {
3461 dump_printf_loc (MSG_NOTE, vect_location,
3462 "Final SLP tree for instance %p:\n",
3463 (void *) new_instance);
3464 vect_print_slp_graph (MSG_NOTE, vect_location,
3465 SLP_INSTANCE_TREE (new_instance));
3466 }
3467
3468 return true;
3469 }
3470 }
3471 else
3472 {
3473 /* Failed to SLP. */
3474 /* Free the allocated memory. */
3475 scalar_stmts.release ();
3476 }
3477
3478 stmt_vec_info stmt_info = stmt_info_;
3479 /* Try to break the group up into pieces. */
3480 if (kind == slp_inst_kind_store)
3481 {
3482 /* ??? We could delay all the actual splitting of store-groups
3483 until after SLP discovery of the original group completed.
3484 Then we can recurse to vect_build_slp_instance directly. */
3485 for (i = 0; i < group_size; i++)
3486 if (!matches[i])
3487 break;
3488
3489 /* For basic block SLP, try to break the group up into multiples of
3490 a vector size. */
3491 if (is_a <bb_vec_info> (vinfo)
3492 && (i > 1 && i < group_size))
3493 {
3494 tree scalar_type
3495 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3496 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3497 1 << floor_log2 (i));
3498 unsigned HOST_WIDE_INT const_nunits;
3499 if (vectype
3500 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3501 {
3502 /* Split into two groups at the first vector boundary. */
3503 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3504 unsigned group1_size = i & ~(const_nunits - 1);
3505
3506 if (dump_enabled_p ())
3507 dump_printf_loc (MSG_NOTE, vect_location,
3508 "Splitting SLP group at stmt %u\n", i);
3509 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3510 group1_size);
3511 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3512 kind, max_tree_size,
3513 limit);
3514 /* Split the rest at the failure point and possibly
3515 re-analyze the remaining matching part if it has
3516 at least two lanes. */
3517 if (group1_size < i
3518 && (i + 1 < group_size
3519 || i - group1_size > 1))
3520 {
3521 stmt_vec_info rest2 = rest;
3522 rest = vect_split_slp_store_group (rest, i - group1_size);
3523 if (i - group1_size > 1)
3524 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3525 kind, max_tree_size,
3526 limit);
3527 }
3528 /* Re-analyze the non-matching tail if it has at least
3529 two lanes. */
3530 if (i + 1 < group_size)
3531 res |= vect_analyze_slp_instance (vinfo, bst_map,
3532 rest, kind, max_tree_size,
3533 limit);
3534 return res;
3535 }
3536 }
3537
3538 /* For loop vectorization, split into arbitrary pieces of size > 1. */
3539 if (is_a <loop_vec_info> (vinfo)
3540 && (i > 1 && i < group_size)
3541 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3542 {
3543 unsigned group1_size = i;
3544
3545 if (dump_enabled_p ())
3546 dump_printf_loc (MSG_NOTE, vect_location,
3547 "Splitting SLP group at stmt %u\n", i);
3548
3549 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3550 group1_size);
3551 /* Loop vectorization cannot handle gaps in stores; make sure
3552 the split group appears as strided. */
3553 STMT_VINFO_STRIDED_P (rest) = 1;
3554 DR_GROUP_GAP (rest) = 0;
3555 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3556 DR_GROUP_GAP (stmt_info) = 0;
3557
3558 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3559 kind, max_tree_size, limit);
3560 if (i + 1 < group_size)
3561 res |= vect_analyze_slp_instance (vinfo, bst_map,
3562 rest, kind, max_tree_size, limit);
3563
3564 return res;
3565 }
3566
3567 /* Even though the first vector did not all match, we might be able to SLP
3568 (some of) the remainder. FORNOW ignore this possibility. */
3569 }
3570
3571 /* Failed to SLP. */
3572 if (dump_enabled_p ())
3573 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3574 return false;
3575 }
3576
3577
3578 /* Analyze an SLP instance starting from a group of grouped stores. Call
3579 vect_build_slp_tree to build a tree of packed stmts if possible.
3580 Return FALSE if it's impossible to SLP any stmt in the loop. */
3581
3582 static bool
3583 vect_analyze_slp_instance (vec_info *vinfo,
3584 scalar_stmts_to_slp_tree_map_t *bst_map,
3585 stmt_vec_info stmt_info,
3586 slp_instance_kind kind,
3587 unsigned max_tree_size, unsigned *limit)
3588 {
3589 unsigned int i;
3590 vec<stmt_vec_info> scalar_stmts;
3591
3592 if (is_a <bb_vec_info> (vinfo))
3593 vect_location = stmt_info->stmt;
3594
3595 stmt_vec_info next_info = stmt_info;
3596 if (kind == slp_inst_kind_store)
3597 {
3598 /* Collect the stores and store them in scalar_stmts. */
3599 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3600 while (next_info)
3601 {
3602 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3603 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3604 }
3605 }
3606 else if (kind == slp_inst_kind_reduc_chain)
3607 {
3608 /* Collect the reduction stmts and store them in scalar_stmts. */
3609 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3610 while (next_info)
3611 {
3612 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3613 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3614 }
3615 /* Mark the first element of the reduction chain as reduction to properly
3616 transform the node. In the reduction analysis phase only the last
3617 element of the chain is marked as reduction. */
3618 STMT_VINFO_DEF_TYPE (stmt_info)
3619 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3620 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3621 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3622 }
3623 else if (kind == slp_inst_kind_reduc_group)
3624 {
3625 /* Collect reduction statements. */
3626 const vec<stmt_vec_info> &reductions
3627 = as_a <loop_vec_info> (vinfo)->reductions;
3628 scalar_stmts.create (reductions.length ());
3629 for (i = 0; reductions.iterate (i, &next_info); i++)
3630 if ((STMT_VINFO_RELEVANT_P (next_info)
3631 || STMT_VINFO_LIVE_P (next_info))
3632 /* ??? Make sure we didn't skip a conversion around a reduction
3633 path. In that case we'd have to reverse engineer that conversion
3634 stmt following the chain using reduc_idx and from the PHI
3635 using reduc_def. */
3636 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3637 scalar_stmts.quick_push (next_info);
3638 /* If fewer than two were relevant/live, there's nothing to SLP. */
3639 if (scalar_stmts.length () < 2)
3640 return false;
3641 }
3642 else
3643 gcc_unreachable ();
3644
3645 vec<stmt_vec_info> roots = vNULL;
3646 vec<tree> remain = vNULL;
3647 /* Build the tree for the SLP instance. */
3648 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3649 roots, remain,
3650 max_tree_size, limit, bst_map,
3651 kind == slp_inst_kind_store
3652 ? stmt_info : NULL);
3653
3654 /* ??? If this is slp_inst_kind_store and the above succeeded, here's
3655 where we should do store group splitting. */
3656
3657 return res;
3658 }
3659
3660 /* Check if there are stmts in the loop that can be vectorized using SLP.
3661 Build SLP trees of packed scalar stmts if SLP is possible. */
3662
3663 opt_result
3664 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3665 {
3666 unsigned int i;
3667 stmt_vec_info first_element;
3668 slp_instance instance;
3669
3670 DUMP_VECT_SCOPE ("vect_analyze_slp");
3671
3672 unsigned limit = max_tree_size;
3673
3674 scalar_stmts_to_slp_tree_map_t *bst_map
3675 = new scalar_stmts_to_slp_tree_map_t ();
3676
3677 /* Find SLP sequences starting from groups of grouped stores. */
3678 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3679 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3680 slp_inst_kind_store, max_tree_size, &limit);
3681
3682 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3683 {
3684 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3685 {
3686 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3687 /* Apply patterns. */
3688 for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
3689 bb_vinfo->roots[i].stmts[j]
3690 = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
3691 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3692 bb_vinfo->roots[i].stmts,
3693 bb_vinfo->roots[i].roots,
3694 bb_vinfo->roots[i].remain,
3695 max_tree_size, &limit, bst_map, NULL))
3696 {
3697 bb_vinfo->roots[i].stmts = vNULL;
3698 bb_vinfo->roots[i].roots = vNULL;
3699 bb_vinfo->roots[i].remain = vNULL;
3700 }
3701 }
3702 }
3703
3704 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3705 {
3706 /* Find SLP sequences starting from reduction chains. */
3707 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3708 if (! STMT_VINFO_RELEVANT_P (first_element)
3709 && ! STMT_VINFO_LIVE_P (first_element))
3710 ;
3711 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3712 slp_inst_kind_reduc_chain,
3713 max_tree_size, &limit))
3714 {
3715 /* Dissolve reduction chain group. */
3716 stmt_vec_info vinfo = first_element;
3717 stmt_vec_info last = NULL;
3718 while (vinfo)
3719 {
3720 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3721 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3722 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3723 last = vinfo;
3724 vinfo = next;
3725 }
3726 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3727 /* It can still be vectorized as part of an SLP reduction. */
3728 loop_vinfo->reductions.safe_push (last);
3729 }
3730
3731 /* Find SLP sequences starting from groups of reductions. */
3732 if (loop_vinfo->reductions.length () > 1)
3733 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3734 slp_inst_kind_reduc_group, max_tree_size,
3735 &limit);
3736 }
3737
3738 hash_set<slp_tree> visited_patterns;
3739 slp_tree_to_load_perm_map_t perm_cache;
3740 slp_compat_nodes_map_t compat_cache;
3741
3742 /* See if any patterns can be found in the SLP tree. */
3743 bool pattern_found = false;
3744 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3745 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3746 &visited_patterns, &perm_cache,
3747 &compat_cache);
3748
3749 /* If any were found, optimize permutations of loads. */
3750 if (pattern_found)
3751 {
3752 hash_map<slp_tree, slp_tree> load_map;
3753 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3754 {
3755 slp_tree root = SLP_INSTANCE_TREE (instance);
3756 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3757 &load_map, root);
3758 }
3759 }
3760
3761
3762
3763 /* The map keeps a reference on SLP nodes built; release that. */
3764 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3765 it != bst_map->end (); ++it)
3766 if ((*it).second)
3767 vect_free_slp_tree ((*it).second);
3768 delete bst_map;
3769
3770 if (pattern_found && dump_enabled_p ())
3771 {
3772 dump_printf_loc (MSG_NOTE, vect_location,
3773 "Pattern matched SLP tree\n");
3774 hash_set<slp_tree> visited;
3775 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3776 vect_print_slp_graph (MSG_NOTE, vect_location,
3777 SLP_INSTANCE_TREE (instance), visited);
3778 }
3779
3780 return opt_result::success ();
3781 }
3782
3783 /* Estimates the cost of inserting layout changes into the SLP graph.
3784 It can also say that the insertion is impossible. */
3785
3786 struct slpg_layout_cost
3787 {
3788 slpg_layout_cost () = default;
3789 slpg_layout_cost (sreal, bool);
3790
3791 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3792 bool is_possible () const { return depth != sreal::max (); }
3793
3794 bool operator== (const slpg_layout_cost &) const;
3795 bool operator!= (const slpg_layout_cost &) const;
3796
3797 bool is_better_than (const slpg_layout_cost &, bool) const;
3798
3799 void add_parallel_cost (const slpg_layout_cost &);
3800 void add_serial_cost (const slpg_layout_cost &);
3801 void split (unsigned int);
3802
3803 /* The longest sequence of layout changes needed during any traversal
3804 of the partition dag, weighted by execution frequency.
3805
3806 This is the most important metric when optimizing for speed, since
3807 it helps to ensure that we keep the number of operations on
3808 critical paths to a minimum. */
3809 sreal depth = 0;
3810
3811 /* An estimate of the total number of operations needed. It is weighted by
3812 execution frequency when optimizing for speed but not when optimizing for
3813 size. In order to avoid double-counting, a node with a fanout of N will
3814 distribute 1/N of its total cost to each successor.
3815
3816 This is the most important metric when optimizing for size, since
3817 it helps to keep the total number of operations to a minimum. */
3818 sreal total = 0;
3819 };
3820
3821 /* Construct costs for a node with weight WEIGHT. A higher weight
3822 indicates more frequent execution. IS_FOR_SIZE is true if we are
3823 optimizing for size rather than speed. */
3824
3825 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3826 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3827 {
3828 }
3829
3830 bool
3831 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3832 {
3833 return depth == other.depth && total == other.total;
3834 }
3835
3836 bool
3837 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3838 {
3839 return !operator== (other);
3840 }
3841
3842 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3843 true if we are optimizing for size rather than speed. */
3844
3845 bool
3846 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3847 bool is_for_size) const
3848 {
3849 if (is_for_size)
3850 {
3851 if (total != other.total)
3852 return total < other.total;
3853 return depth < other.depth;
3854 }
3855 else
3856 {
3857 if (depth != other.depth)
3858 return depth < other.depth;
3859 return total < other.total;
3860 }
3861 }
3862
3863 /* Increase the costs to account for something with cost INPUT_COST
3864 happening in parallel with the current costs. */
3865
3866 void
3867 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3868 {
3869 depth = std::max (depth, input_cost.depth);
3870 total += input_cost.total;
3871 }
3872
3873 /* Increase the costs to account for something with cost INPUT_COST
3874 happening in series with the current costs. */
3875
3876 void
3877 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3878 {
3879 depth += other.depth;
3880 total += other.total;
3881 }
3882
3883 /* Split the total cost among TIMES successors or predecessors. */
3884
3885 void
3886 slpg_layout_cost::split (unsigned int times)
3887 {
3888 if (times > 1)
3889 total /= times;
3890 }
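
/* Worked example (illustrative only, not part of the implementation):
   when optimizing for speed, combining two parallel inputs with costs
   { depth 2, total 2 } and { depth 5, total 1 } via add_parallel_cost
   gives { depth 5, total 3 }; adding a serial layout-change cost of
   { depth 1, total 1 } via add_serial_cost gives { depth 6, total 4 };
   and split (2) for two consumers halves only the total, giving
   { depth 6, total 2 }.  */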
3891
3892 /* Information about one node in the SLP graph, for use during
3893 vect_optimize_slp_pass. */
3894
3895 struct slpg_vertex
3896 {
3897 slpg_vertex (slp_tree node_) : node (node_) {}
3898
3899 /* The node itself. */
3900 slp_tree node;
3901
3902 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3903 partitions are flexible; they can have whichever layout consumers
3904 want them to have. */
3905 int partition = -1;
3906
3907 /* The number of nodes that directly use the result of this one
3908 (i.e. the number of nodes that count this one as a child). */
3909 unsigned int out_degree = 0;
3910
3911 /* The execution frequency of the node. */
3912 sreal weight = 0;
3913
3914 /* The total execution frequency of all nodes that directly use the
3915 result of this one. */
3916 sreal out_weight = 0;
3917 };
3918
3919 /* Information about one partition of the SLP graph, for use during
3920 vect_optimize_slp_pass. */
3921
3922 struct slpg_partition_info
3923 {
3924 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3925 of m_partitioned_nodes. */
3926 unsigned int node_begin = 0;
3927 unsigned int node_end = 0;
3928
3929 /* Which layout we've chosen to use for this partition, or -1 if
3930 we haven't picked one yet. */
3931 int layout = -1;
3932
3933 /* The number of predecessors and successors in the partition dag.
3934 The predecessors always have lower partition numbers and the
3935 successors always have higher partition numbers.
3936
3937 Note that the directions of these edges are not necessarily the
3938 same as in the data flow graph. For example, if an SCC has separate
3939 partitions for an inner loop and an outer loop, the inner loop's
3940 partition will have at least two incoming edges from the outer loop's
3941 partition: one for a live-in value and one for a live-out value.
3942 In data flow terms, one of these edges would also be from the outer loop
3943 to the inner loop, but the other would be in the opposite direction. */
3944 unsigned int in_degree = 0;
3945 unsigned int out_degree = 0;
3946 };
3947
3948 /* Information about the costs of using a particular layout for a
3949 particular partition. It can also say that the combination is
3950 impossible. */
3951
3952 struct slpg_partition_layout_costs
3953 {
3954 bool is_possible () const { return internal_cost.is_possible (); }
3955 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3956
3957 /* The costs inherited from predecessor partitions. */
3958 slpg_layout_cost in_cost;
3959
3960 /* The inherent cost of the layout within the node itself. For example,
3961 this is nonzero for a load if choosing a particular layout would require
3962 the load to permute the loaded elements. It is nonzero for a
3963 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3964 to full-vector moves. */
3965 slpg_layout_cost internal_cost;
3966
3967 /* The costs inherited from successor partitions. */
3968 slpg_layout_cost out_cost;
3969 };
3970
3971 /* This class tries to optimize the layout of vectors in order to avoid
3972 unnecessary shuffling. At the moment, the set of possible layouts are
3973 restricted to bijective permutations.
3974
3975 The goal of the pass depends on whether we're optimizing for size or
3976 for speed. When optimizing for size, the goal is to reduce the overall
3977 number of layout changes (including layout changes implied by things
3978 like load permutations). When optimizing for speed, the goal is to
3979 reduce the maximum latency attributable to layout changes on any
3980 non-cyclical path through the data flow graph.
3981
3982 For example, when optimizing a loop nest for speed, we will prefer
3983 to make layout changes outside of a loop rather than inside of a loop,
3984 and will prefer to make layout changes in parallel rather than serially,
3985 even if that increases the overall number of layout changes.
3986
3987 The high-level procedure is:
3988
3989 (1) Build a graph in which edges go from uses (parents) to definitions
3990 (children).
3991
3992 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3993
3994 (3) When optimizing for speed, partition the nodes in each SCC based
3995 on their containing cfg loop. When optimizing for size, treat
3996 each SCC as a single partition.
3997
3998 This gives us a dag of partitions. The goal is now to assign a
3999 layout to each partition.
4000
4001 (4) Construct a set of vector layouts that are worth considering.
4002 Record which nodes must keep their current layout.
4003
4004 (5) Perform a forward walk over the partition dag (from loads to stores)
4005 accumulating the "forward" cost of using each layout. When visiting
4006 each partition, assign a tentative choice of layout to the partition
4007 and use that choice when calculating the cost of using a different
4008 layout in successor partitions.
4009
4010 (6) Perform a backward walk over the partition dag (from stores to loads),
4011 accumulating the "backward" cost of using each layout. When visiting
4012 each partition, make a final choice of layout for that partition based
4013 on the accumulated forward costs (from (5)) and backward costs
4014 (from (6)).
4015
4016 (7) Apply the chosen layouts to the SLP graph.
4017
4018 For example, consider the SLP statements:
4019
4020 S1: a_1 = load
4021 loop:
4022 S2: a_2 = PHI<a_1, a_3>
4023 S3: b_1 = load
4024 S4: a_3 = a_2 + b_1
4025 exit:
4026 S5: a_4 = PHI<a_3>
4027 S6: store a_4
4028
4029 S2 and S4 form an SCC and are part of the same loop. Every other
4030 statement is in a singleton SCC. In this example there is a one-to-one
4031 mapping between SCCs and partitions and the partition dag looks like this:
4032
4033 S1 S3
4034 \ /
4035 S2+S4
4036 |
4037 S5
4038 |
4039 S6
4040
4041 S2, S3 and S4 will have a higher execution frequency than the other
4042 statements, so when optimizing for speed, the goal is to avoid any
4043 layout changes:
4044
4045 - within S3
4046 - within S2+S4
4047 - on the S3->S2+S4 edge
4048
4049 For example, if S3 was originally a reversing load, the goal of the
4050 pass is to make it an unreversed load and change the layout on the
4051 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
4052 on S1->S2+S4 and S5->S6 would also be acceptable.)
4053
4054 The difference between SCCs and partitions becomes important if we
4055 add an outer loop:
4056
4057 S1: a_1 = ...
4058 loop1:
4059 S2: a_2 = PHI<a_1, a_6>
4060 S3: b_1 = load
4061 S4: a_3 = a_2 + b_1
4062 loop2:
4063 S5: a_4 = PHI<a_3, a_5>
4064 S6: c_1 = load
4065 S7: a_5 = a_4 + c_1
4066 exit2:
4067 S8: a_6 = PHI<a_5>
4068 S9: store a_6
4069 exit1:
4070
4071 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
4072 for speed, we usually do not want restrictions in the outer loop to "infect"
4073 the decision for the inner loop. For example, if an outer-loop node
4074 in the SCC contains a statement with a fixed layout, that should not
4075 prevent the inner loop from using a different layout. Conversely,
4076 the inner loop should not dictate a layout to the outer loop: if the
4077 outer loop does a lot of computation, then it may not be efficient to
4078 do all of that computation in the inner loop's preferred layout.
4079
4080 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
4081 and S5+S7 (inner). We also try to arrange partitions so that:
4082
4083 - the partition for an outer loop comes before the partition for
4084 an inner loop
4085
4086 - if a sibling loop A dominates a sibling loop B, A's partition
4087 comes before B's
4088
4089 This gives the following partition dag for the example above:
4090
4091 S1 S3
4092 \ /
4093 S2+S4+S8 S6
4094 | \\ /
4095 | S5+S7
4096 |
4097 S9
4098
4099 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
4100 one for a reversal of the edge S7->S8.
4101
4102 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
4103 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
4104 preferred layout against the cost of changing the layout on entry to the
4105 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
4106
4107 Although this works well when optimizing for speed, it has the downside
4108 when optimizing for size that the choice of layout for S5+S7 is completely
4109 independent of S9, which lessens the chance of reducing the overall number
4110 of permutations. We therefore do not partition SCCs when optimizing
4111 for size.
4112
4113 To give a concrete example of the difference between optimizing
4114 for size and speed, consider:
4115
4116 a[0] = (b[1] << c[3]) - d[1];
4117 a[1] = (b[0] << c[2]) - d[0];
4118 a[2] = (b[3] << c[1]) - d[3];
4119 a[3] = (b[2] << c[0]) - d[2];
4120
4121 There are three different layouts here: one for a, one for b and d,
4122 and one for c. When optimizing for speed it is better to permute each
4123 of b, c and d into the order required by a, since those permutations
4124 happen in parallel. But when optimizing for size, it is better to:
4125
4126 - permute c into the same order as b
4127 - do the arithmetic
4128 - permute the result into the order required by a
4129
4130 This gives 2 permutations rather than 3. */
4131
4132 class vect_optimize_slp_pass
4133 {
4134 public:
4135 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
4136 void run ();
4137
4138 private:
4139 /* Graph building. */
4140 struct loop *containing_loop (slp_tree);
4141 bool is_cfg_latch_edge (graph_edge *);
4142 void build_vertices (hash_set<slp_tree> &, slp_tree);
4143 void build_vertices ();
4144 void build_graph ();
4145
4146 /* Partitioning. */
4147 void create_partitions ();
4148 template<typename T> void for_each_partition_edge (unsigned int, T);
4149
4150 /* Layout selection. */
4151 bool is_compatible_layout (slp_tree, unsigned int);
4152 int change_layout_cost (slp_tree, unsigned int, unsigned int);
4153 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
4154 unsigned int);
4155 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4156 int, unsigned int);
4157 int internal_node_cost (slp_tree, int, unsigned int);
4158 void start_choosing_layouts ();
4159
4160 /* Cost propagation. */
4161 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4162 unsigned int, unsigned int);
4163 slpg_layout_cost total_in_cost (unsigned int);
4164 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4165 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4166 void forward_pass ();
4167 void backward_pass ();
4168
4169 /* Rematerialization. */
4170 slp_tree get_result_with_layout (slp_tree, unsigned int);
4171 void materialize ();
4172
4173 /* Clean-up. */
4174 void remove_redundant_permutations ();
4175
4176 void dump ();
4177
4178 vec_info *m_vinfo;
4179
4180 /* True if we should optimize the graph for size, false if we should
4181 optimize it for speed. (It wouldn't be easy to make this decision
4182 more locally.) */
4183 bool m_optimize_size;
4184
4185 /* A graph of all SLP nodes, with edges leading from uses to definitions.
4186 In other words, a node's predecessors are its slp_tree parents and
4187 a node's successors are its slp_tree children. */
4188 graph *m_slpg = nullptr;
4189
4190 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
4191 auto_vec<slpg_vertex> m_vertices;
4192
4193 /* The list of all leaves of M_SLPG, such as external definitions, constants,
4194 and loads. */
4195 auto_vec<int> m_leafs;
4196
4197 /* This array has one entry for every vector layout that we're considering.
4198 Element 0 is null and indicates "no change". Other entries describe
4199 permutations that are inherent in the current graph and that we would
4200 like to reverse if possible.
4201
4202 For example, a permutation { 1, 2, 3, 0 } means that something has
4203 effectively been permuted in that way, such as a load group
4204 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4205 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4206 in order to put things "back" in order. */
4207 auto_vec<vec<unsigned> > m_perms;
4208
4209 /* A partitioning of the nodes for which a layout must be chosen.
4210 Each partition represents an <SCC, cfg loop> pair; that is,
4211 nodes in different SCCs belong to different partitions, and nodes
4212 within an SCC can be further partitioned according to a containing
4213 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4214
4215 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4216 from leaves (such as loads) to roots (such as stores).
4217
4218 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4219 auto_vec<slpg_partition_info> m_partitions;
4220
4221 /* The list of all nodes for which a layout must be chosen. Nodes for
4222 partition P come before the nodes for partition P+1. Nodes within a
4223 partition are in reverse postorder. */
4224 auto_vec<unsigned int> m_partitioned_nodes;
4225
4226 /* Index P * num-layouts + L contains the cost of using layout L
4227 for partition P. */
4228 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4229
4230 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4231 original output of node N adjusted to have layout L. */
4232 auto_vec<slp_tree> m_node_layouts;
4233 };
4234
4235 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4236 Also record whether we should optimize anything for speed rather
4237 than size. */
4238
4239 void
4240 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4241 slp_tree node)
4242 {
4243 unsigned i;
4244 slp_tree child;
4245
4246 if (visited.add (node))
4247 return;
4248
4249 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4250 {
4251 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4252 if (optimize_bb_for_speed_p (bb))
4253 m_optimize_size = false;
4254 }
4255
4256 node->vertex = m_vertices.length ();
4257 m_vertices.safe_push (slpg_vertex (node));
4258
4259 bool leaf = true;
4260 bool force_leaf = false;
4261 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4262 if (child)
4263 {
4264 leaf = false;
4265 build_vertices (visited, child);
4266 }
4267 else
4268 force_leaf = true;
4269 /* Since SLP discovery works along use-def edges, all cycles have an
4270 entry - but there's the exception of cycles where we do not handle
4271 the entry explicitly (but with a NULL SLP node), like some reductions
4272 and inductions. Force those SLP PHIs to act as leaves to make them
4273 backwards reachable. */
4274 if (leaf || force_leaf)
4275 m_leafs.safe_push (node->vertex);
4276 }
4277
4278 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4279
4280 void
4281 vect_optimize_slp_pass::build_vertices ()
4282 {
4283 hash_set<slp_tree> visited;
4284 unsigned i;
4285 slp_instance instance;
4286 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4287 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4288 }
4289
4290 /* Apply (reverse) bijective PERM to VEC. */
4291
4292 template <class T>
4293 static void
4294 vect_slp_permute (vec<unsigned> perm,
4295 vec<T> &vec, bool reverse)
4296 {
4297 auto_vec<T, 64> saved;
4298 saved.create (vec.length ());
4299 for (unsigned i = 0; i < vec.length (); ++i)
4300 saved.quick_push (vec[i]);
4301
4302 if (reverse)
4303 {
4304 for (unsigned i = 0; i < vec.length (); ++i)
4305 vec[perm[i]] = saved[i];
4306 for (unsigned i = 0; i < vec.length (); ++i)
4307 gcc_assert (vec[perm[i]] == saved[i]);
4308 }
4309 else
4310 {
4311 for (unsigned i = 0; i < vec.length (); ++i)
4312 vec[i] = saved[perm[i]];
4313 for (unsigned i = 0; i < vec.length (); ++i)
4314 gcc_assert (vec[i] == saved[perm[i]]);
4315 }
4316 }
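
/* Illustrative example (not part of the implementation): with
   PERM = { 1, 2, 3, 0 }, calling vect_slp_permute with REVERSE false
   on { a0, a1, a2, a3 } yields { a1, a2, a3, a0 }, since each element
   becomes vec[i] = saved[perm[i]]; calling it with REVERSE true on
   { a1, a2, a3, a0 } restores { a0, a1, a2, a3 }, since then
   vec[perm[i]] = saved[i].  The REVERSE form thus corresponds to
   applying the reverse permutation described above m_perms.  */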
4317
4318 /* Return the cfg loop that contains NODE. */
4319
4320 struct loop *
4321 vect_optimize_slp_pass::containing_loop (slp_tree node)
4322 {
4323 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4324 if (!rep)
4325 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4326 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4327 }
4328
4329 /* Return true if UD (an edge from a use to a definition) is associated
4330 with a loop latch edge in the cfg. */
4331
4332 bool
4333 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4334 {
4335 slp_tree use = m_vertices[ud->src].node;
4336 slp_tree def = m_vertices[ud->dest].node;
4337 if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
4338 || SLP_TREE_CODE (use) == VEC_PERM_EXPR)
4339 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4340 return false;
4341
4342 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4343 return (is_a<gphi *> (use_rep->stmt)
4344 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4345 && containing_loop (def) == containing_loop (use));
4346 }
4347
4348 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4349 a nonnull data field. */
4350
4351 void
4352 vect_optimize_slp_pass::build_graph ()
4353 {
4354 m_optimize_size = true;
4355 build_vertices ();
4356
4357 m_slpg = new_graph (m_vertices.length ());
4358 for (slpg_vertex &v : m_vertices)
4359 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4360 if (child)
4361 {
4362 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4363 if (is_cfg_latch_edge (ud))
4364 ud->data = this;
4365 }
4366 }
4367
4368 /* Return true if E corresponds to a loop latch edge in the cfg. */
4369
4370 static bool
4371 skip_cfg_latch_edges (graph_edge *e)
4372 {
4373 return e->data;
4374 }
4375
4376 /* Create the node partitions. */
4377
4378 void
4379 vect_optimize_slp_pass::create_partitions ()
4380 {
4381 /* Calculate a postorder of the graph, ignoring edges that correspond
4382 to natural latch edges in the cfg. Reading the vector from the end
4383 to the beginning gives the reverse postorder. */
4384 auto_vec<int> initial_rpo;
4385 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4386 false, NULL, skip_cfg_latch_edges);
4387 gcc_assert (initial_rpo.length () == m_vertices.length ());
4388
4389 /* Calculate the strongly connected components of the graph. */
4390 auto_vec<int> scc_grouping;
4391 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4392
4393 /* Create a new index order in which all nodes from the same SCC are
4394 consecutive. Use scc_pos to record the index of the first node in
4395 each SCC. */
4396 auto_vec<unsigned int> scc_pos (num_sccs);
4397 int last_component = -1;
4398 unsigned int node_count = 0;
4399 for (unsigned int node_i : scc_grouping)
4400 {
4401 if (last_component != m_slpg->vertices[node_i].component)
4402 {
4403 last_component = m_slpg->vertices[node_i].component;
4404 gcc_assert (last_component == int (scc_pos.length ()));
4405 scc_pos.quick_push (node_count);
4406 }
4407 node_count += 1;
4408 }
4409 gcc_assert (node_count == initial_rpo.length ()
4410 && last_component + 1 == int (num_sccs));
4411
4412 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4413 inside each SCC following the RPO we calculated above. The fact that
4414 we ignored natural latch edges when calculating the RPO should ensure
4415 that, for natural loop nests:
4416
4417 - the first node that we encounter in a cfg loop is the loop header phi
4418 - the loop header phis are in dominance order
4419
4420 Arranging for this is an optimization (see below) rather than a
4421 correctness issue. Unnatural loops with a tangled mess of backedges
4422 will still work correctly, but might give poorer results.
4423
4424 Also update scc_pos so that it gives 1 + the index of the last node
4425 in the SCC. */
4426 m_partitioned_nodes.safe_grow (node_count);
4427 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4428 {
4429 unsigned int node_i = initial_rpo[old_i];
4430 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4431 m_partitioned_nodes[new_i] = node_i;
4432 }
4433
4434 /* When optimizing for speed, partition each SCC based on the containing
4435 cfg loop. The order we constructed above should ensure that, for natural
4436 cfg loops, we'll create sub-SCC partitions for outer loops before
4437 the corresponding sub-SCC partitions for inner loops. Similarly,
4438 when one sibling loop A dominates another sibling loop B, we should
4439 create a sub-SCC partition for A before a sub-SCC partition for B.
4440
4441 As above, nothing depends for correctness on whether this achieves
4442 a natural nesting, but we should get better results when it does. */
4443 m_partitions.reserve (m_vertices.length ());
4444 unsigned int next_partition_i = 0;
4445 hash_map<struct loop *, int> loop_partitions;
4446 unsigned int rpo_begin = 0;
4447 unsigned int num_partitioned_nodes = 0;
4448 for (unsigned int rpo_end : scc_pos)
4449 {
4450 loop_partitions.empty ();
4451 unsigned int partition_i = next_partition_i;
4452 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4453 {
4454 /* Handle externals and constants optimistically throughout.
4455 But treat existing vectors as fixed since we do not handle
4456 permuting them. */
4457 unsigned int node_i = m_partitioned_nodes[rpo_i];
4458 auto &vertex = m_vertices[node_i];
4459 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4460 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4461 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4462 vertex.partition = -1;
4463 else
4464 {
4465 bool existed;
4466 if (m_optimize_size)
4467 existed = next_partition_i > partition_i;
4468 else
4469 {
4470 struct loop *loop = containing_loop (vertex.node);
4471 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4472 if (!existed)
4473 entry = next_partition_i;
4474 partition_i = entry;
4475 }
4476 if (!existed)
4477 {
4478 m_partitions.quick_push (slpg_partition_info ());
4479 next_partition_i += 1;
4480 }
4481 vertex.partition = partition_i;
4482 num_partitioned_nodes += 1;
4483 m_partitions[partition_i].node_end += 1;
4484 }
4485 }
4486 rpo_begin = rpo_end;
4487 }
4488
4489 /* Assign ranges of consecutive node indices to each partition,
4490 in partition order. Start with node_end being the same as
4491 node_begin so that the next loop can use it as a counter. */
4492 unsigned int node_begin = 0;
4493 for (auto &partition : m_partitions)
4494 {
4495 partition.node_begin = node_begin;
4496 node_begin += partition.node_end;
4497 partition.node_end = partition.node_begin;
4498 }
4499 gcc_assert (node_begin == num_partitioned_nodes);
4500
4501 /* Finally build the list of nodes in partition order. */
4502 m_partitioned_nodes.truncate (num_partitioned_nodes);
4503 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4504 {
4505 int partition_i = m_vertices[node_i].partition;
4506 if (partition_i >= 0)
4507 {
4508 unsigned int order_i = m_partitions[partition_i].node_end++;
4509 m_partitioned_nodes[order_i] = node_i;
4510 }
4511 }
4512 }
4513
4514 /* Look for edges from earlier partitions into node NODE_I and edges from
4515 node NODE_I into later partitions. Call:
4516
4517 FN (ud, other_node_i)
4518
4519 for each such use-to-def edge ud, where other_node_i is the node at the
4520 other end of the edge. */
4521
4522 template<typename T>
4523 void
4524 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4525 {
4526 int partition_i = m_vertices[node_i].partition;
4527 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4528 pred; pred = pred->pred_next)
4529 {
4530 int src_partition_i = m_vertices[pred->src].partition;
4531 if (src_partition_i >= 0 && src_partition_i != partition_i)
4532 fn (pred, pred->src);
4533 }
4534 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4535 succ; succ = succ->succ_next)
4536 {
4537 int dest_partition_i = m_vertices[succ->dest].partition;
4538 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4539 fn (succ, succ->dest);
4540 }
4541 }
4542
4543 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4544 that NODE would operate on. This test is independent of NODE's actual
4545 operation. */
4546
4547 bool
4548 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4549 unsigned int layout_i)
4550 {
4551 if (layout_i == 0)
4552 return true;
4553
4554 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4555 return false;
4556
4557 return true;
4558 }
4559
4560 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4561 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4562 layouts is incompatible with NODE or if the change is not possible for
4563 some other reason.
4564
4565 The properties taken from NODE include the number of lanes and the
4566 vector type. The actual operation doesn't matter. */
4567
4568 int
4569 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4570 unsigned int from_layout_i,
4571 unsigned int to_layout_i)
4572 {
4573 if (!is_compatible_layout (node, from_layout_i)
4574 || !is_compatible_layout (node, to_layout_i))
4575 return -1;
4576
4577 if (from_layout_i == to_layout_i)
4578 return 0;
4579
4580 auto_vec<slp_tree, 1> children (1);
4581 children.quick_push (node);
4582 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4583 if (from_layout_i > 0)
4584 for (unsigned int i : m_perms[from_layout_i])
4585 perm.quick_push ({ 0, i });
4586 else
4587 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4588 perm.quick_push ({ 0, i });
4589 if (to_layout_i > 0)
4590 vect_slp_permute (m_perms[to_layout_i], perm, true);
4591 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4592 children, false);
4593 if (count >= 0)
4594 return MAX (count, 1);
4595
4596 /* ??? In principle we could try changing via layout 0, giving two
4597 layout changes rather than 1. Doing that would require
4598 corresponding support in get_result_with_layout. */
4599 return -1;
4600 }
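
/* Illustrative example (not part of the implementation): with
   m_perms[FROM_LAYOUT_I] = { 1, 2, 3, 0 } and
   m_perms[TO_LAYOUT_I] = { 2, 3, 0, 1 }, the code above first builds
   the lane selection { 1, 2, 3, 0 } from the "from" layout and then
   reverse-permutes it by the "to" layout, so the permutation handed to
   vectorizable_slp_permutation_1 selects lanes { 3, 0, 1, 2 }.  */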
4601
4602 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4603
4604 inline slpg_partition_layout_costs &
4605 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4606 unsigned int layout_i)
4607 {
4608 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4609 }
4610
4611 /* Change PERM in one of two ways:
4612
4613 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4614 chosen for child I of NODE.
4615
4616 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4617
4618 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
4619
4620 void
4621 vect_optimize_slp_pass::
4622 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4623 int in_layout_i, unsigned int out_layout_i)
4624 {
4625 for (auto &entry : perm)
4626 {
4627 int this_in_layout_i = in_layout_i;
4628 if (this_in_layout_i < 0)
4629 {
4630 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4631 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4632 this_in_layout_i = m_partitions[in_partition_i].layout;
4633 }
4634 if (this_in_layout_i > 0)
4635 entry.second = m_perms[this_in_layout_i][entry.second];
4636 }
4637 if (out_layout_i > 0)
4638 vect_slp_permute (m_perms[out_layout_i], perm, true);
4639 }
4640
4641 /* Check whether the target allows NODE to be rearranged so that the node's
4642 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4643 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4644
4645 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4646 NODE can adapt to the layout changes that have (perhaps provisionally)
4647 been chosen for NODE's children, so that no extra permutations are
4648 needed on either the input or the output of NODE.
4649
4650 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4651 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4652
4653 IN_LAYOUT_I has no meaning for other types of node.
4654
4655 Keeping the node as-is is always valid. If the target doesn't appear
4656 to support the node as-is, but might realistically support other layouts,
4657 then layout 0 instead has the cost of a worst-case permutation. On the
4658 one hand, this ensures that every node has at least one valid layout,
4659 avoiding what would otherwise be an awkward special case. On the other,
4660 it still encourages the pass to change an invalid pre-existing layout
4661 choice into a valid one. */
4662
4663 int
4664 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4665 unsigned int out_layout_i)
4666 {
4667 const int fallback_cost = 1;
4668
4669 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4670 {
4671 auto_lane_permutation_t tmp_perm;
4672 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4673
4674 /* Check that the child nodes support the chosen layout. Checking
4675 the first child is enough, since any second child would have the
4676 same shape. */
4677 auto first_child = SLP_TREE_CHILDREN (node)[0];
4678 if (in_layout_i > 0
4679 && !is_compatible_layout (first_child, in_layout_i))
4680 return -1;
4681
4682 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4683 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4684 node, tmp_perm,
4685 SLP_TREE_CHILDREN (node),
4686 false);
4687 if (count < 0)
4688 {
4689 if (in_layout_i == 0 && out_layout_i == 0)
4690 {
4691 /* Use the fallback cost if the node could in principle support
4692 some nonzero layout for both the inputs and the outputs.
4693 Otherwise assume that the node will be rejected later
4694 and rebuilt from scalars. */
4695 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4696 return fallback_cost;
4697 return 0;
4698 }
4699 return -1;
4700 }
4701
4702 /* We currently have no way of telling whether the new layout is cheaper
4703 or more expensive than the old one. But at least in principle,
4704 it should be worth making zero permutations (whole-vector shuffles)
4705 cheaper than real permutations, in case the pass is able to remove
4706 the latter. */
4707 return count == 0 ? 0 : 1;
4708 }
4709
4710 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4711 if (rep
4712 && STMT_VINFO_DATA_REF (rep)
4713 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4714 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4715 {
4716 auto_load_permutation_t tmp_perm;
4717 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4718 if (out_layout_i > 0)
4719 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4720
4721 poly_uint64 vf = 1;
4722 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4723 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4724 unsigned int n_perms;
4725 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4726 nullptr, vf, true, false, &n_perms))
4727 {
4728 auto rep = SLP_TREE_REPRESENTATIVE (node);
4729 if (out_layout_i == 0)
4730 {
4731 /* Use the fallback cost if the load is an N-to-N permutation.
4732 Otherwise assume that the node will be rejected later
4733 and rebuilt from scalars. */
4734 if (STMT_VINFO_GROUPED_ACCESS (rep)
4735 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4736 == SLP_TREE_LANES (node)))
4737 return fallback_cost;
4738 return 0;
4739 }
4740 return -1;
4741 }
4742
4743 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4744 return n_perms == 0 ? 0 : 1;
4745 }
4746
4747 return 0;
4748 }
4749
4750 /* Decide which element layouts we should consider using. Calculate the
4751 weights associated with inserting layout changes on partition edges.
4752 Also mark partitions that cannot change layout, by setting their
4753 layout to zero. */
4754
4755 void
4756 vect_optimize_slp_pass::start_choosing_layouts ()
4757 {
4758 /* Used to assign unique permutation indices. */
4759 using perm_hash = unbounded_hashmap_traits<
4760 vec_free_hash_base<int_hash_base<unsigned>>,
4761 int_hash<int, -1, -2>
4762 >;
4763 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4764
4765 /* Layout 0 is "no change". */
4766 m_perms.safe_push (vNULL);
4767
4768 /* Create layouts from existing permutations. */
4769 auto_load_permutation_t tmp_perm;
4770 for (unsigned int node_i : m_partitioned_nodes)
4771 {
4772 /* Leaves also double as entries to the reverse graph. Allow the
4773 layout of those to be changed. */
4774 auto &vertex = m_vertices[node_i];
4775 auto &partition = m_partitions[vertex.partition];
4776 if (!m_slpg->vertices[node_i].succ)
4777 partition.layout = 0;
4778
4779 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4780 slp_tree node = vertex.node;
4781 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4782 slp_tree child;
4783 unsigned HOST_WIDE_INT imin, imax = 0;
4784 bool any_permute = false;
4785 tmp_perm.truncate (0);
4786 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4787 {
4788 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4789 unpermuted, record a layout that reverses this permutation.
4790
4791 We would need more work to cope with loads that are internally
4792 permuted and also have inputs (such as masks for
4793 IFN_MASK_LOADs). */
4794 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4795 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4796 {
4797 partition.layout = -1;
4798 continue;
4799 }
4800 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4801 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4802 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4803 }
4804 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4805 && SLP_TREE_CHILDREN (node).length () == 1
4806 && (child = SLP_TREE_CHILDREN (node)[0])
4807 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4808 .is_constant (&imin)))
4809 {
4810 /* If the child has the same vector size as this node,
4811 reversing the permutation can make the permutation a no-op.
4812 In other cases it can change a true permutation into a
4813 full-vector extract. */
4814 tmp_perm.reserve (SLP_TREE_LANES (node));
4815 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4816 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4817 }
4818 else
4819 continue;
4820
4821 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4822 {
4823 unsigned idx = tmp_perm[j];
4824 imin = MIN (imin, idx);
4825 imax = MAX (imax, idx);
4826 if (idx - tmp_perm[0] != j)
4827 any_permute = true;
4828 }
4829 /* If the span doesn't match, we'd disrupt VF computation; avoid
4830 that for now. */
4831 if (imax - imin + 1 != SLP_TREE_LANES (node))
4832 continue;
4833 /* If there's no permute, there's no need to split one out. In this case
4834 we can consider turning a load into a permuted load, if that
4835 turns out to be cheaper than alternatives. */
4836 if (!any_permute)
4837 {
4838 partition.layout = -1;
4839 continue;
4840 }
4841
4842 /* For now only handle true permutes, like
4843 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4844 when permuting constants and invariants while keeping the permute
4845 bijective. */
4846 auto_sbitmap load_index (SLP_TREE_LANES (node));
4847 bitmap_clear (load_index);
4848 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4849 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4850 unsigned j;
4851 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4852 if (!bitmap_bit_p (load_index, j))
4853 break;
4854 if (j != SLP_TREE_LANES (node))
4855 continue;
4856
4857 vec<unsigned> perm = vNULL;
4858 perm.safe_grow (SLP_TREE_LANES (node), true);
4859 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4860 perm[j] = tmp_perm[j] - imin;
4861
4862 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4863 {
4864 /* Continue to use existing layouts, but don't add any more. */
4865 int *entry = layout_ids.get (perm);
4866 partition.layout = entry ? *entry : 0;
4867 perm.release ();
4868 }
4869 else
4870 {
4871 bool existed;
4872 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4873 if (existed)
4874 perm.release ();
4875 else
4876 {
4877 layout_i = m_perms.length ();
4878 m_perms.safe_push (perm);
4879 }
4880 partition.layout = layout_i;
4881 }
4882 }
4883
4884 /* Initially assume that every layout is possible and has zero cost
4885 in every partition. */
4886 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4887 * m_perms.length ());
4888
4889 /* We have to mark outgoing permutations facing non-associating-reduction
4890 graph entries that are not represented as to be materialized.
4891 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4892 for (slp_instance instance : m_vinfo->slp_instances)
4893 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4894 {
4895 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4896 m_partitions[m_vertices[node_i].partition].layout = 0;
4897 }
4898 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4899 {
4900 stmt_vec_info stmt_info
4901 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4902 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4903 if (needs_fold_left_reduction_p (TREE_TYPE
4904 (gimple_get_lhs (stmt_info->stmt)),
4905 STMT_VINFO_REDUC_CODE (reduc_info)))
4906 {
4907 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4908 m_partitions[m_vertices[node_i].partition].layout = 0;
4909 }
4910 }
4911
4912 /* Check which layouts each node and partition can handle. Calculate the
4913 weights associated with inserting layout changes on edges. */
4914 for (unsigned int node_i : m_partitioned_nodes)
4915 {
4916 auto &vertex = m_vertices[node_i];
4917 auto &partition = m_partitions[vertex.partition];
4918 slp_tree node = vertex.node;
4919
4920 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4921 {
4922 vertex.weight = vect_slp_node_weight (node);
4923
4924 /* We do not handle stores with a permutation, so all
4925 incoming permutations must have been materialized.
4926
4927 We also don't handle masked grouped loads, which lack a
4928 permutation vector. In this case the memory locations
4929 form an implicit second input to the loads, on top of the
4930 explicit mask input, and the memory input's layout cannot
4931 be changed.
4932
4933 On the other hand, we do support permuting gather loads and
4934 masked gather loads, where each scalar load is independent
4935 of the others. This can be useful if the address/index input
4936 benefits from permutation. */
4937 if (STMT_VINFO_DATA_REF (rep)
4938 && STMT_VINFO_GROUPED_ACCESS (rep)
4939 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4940 partition.layout = 0;
4941
4942 /* We cannot change the layout of an operation that does not
4943 operate on its lanes independently. Note this is an explicit
4944 negative list since that's much shorter than the respective
4945 positive one, but it's critical to keep maintaining it. */
4946 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4947 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4948 {
4949 case CFN_COMPLEX_ADD_ROT90:
4950 case CFN_COMPLEX_ADD_ROT270:
4951 case CFN_COMPLEX_MUL:
4952 case CFN_COMPLEX_MUL_CONJ:
4953 case CFN_VEC_ADDSUB:
4954 case CFN_VEC_FMADDSUB:
4955 case CFN_VEC_FMSUBADD:
4956 partition.layout = 0;
4957 default:;
4958 }
4959 }
4960
4961 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4962 {
4963 auto &other_vertex = m_vertices[other_node_i];
4964
4965 /* Count the number of edges from earlier partitions and the number
4966 of edges to later partitions. */
4967 if (other_vertex.partition < vertex.partition)
4968 partition.in_degree += 1;
4969 else
4970 partition.out_degree += 1;
4971
4972 /* If the current node uses the result of OTHER_NODE_I, accumulate
4973 the effects of that. */
4974 if (ud->src == int (node_i))
4975 {
4976 other_vertex.out_weight += vertex.weight;
4977 other_vertex.out_degree += 1;
4978 }
4979 };
4980 for_each_partition_edge (node_i, process_edge);
4981 }
4982 }
4983
4984 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4985 its current (provisional) choice of layout. The inputs do not necessarily
4986 have the same layout as each other. */
4987
4988 slpg_layout_cost
4989 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4990 {
4991 auto &vertex = m_vertices[node_i];
4992 slpg_layout_cost cost;
4993 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4994 {
4995 auto &other_vertex = m_vertices[other_node_i];
4996 if (other_vertex.partition < vertex.partition)
4997 {
4998 auto &other_partition = m_partitions[other_vertex.partition];
4999 auto &other_costs = partition_layout_costs (other_vertex.partition,
5000 other_partition.layout);
5001 slpg_layout_cost this_cost = other_costs.in_cost;
5002 this_cost.add_serial_cost (other_costs.internal_cost);
5003 this_cost.split (other_partition.out_degree);
5004 cost.add_parallel_cost (this_cost);
5005 }
5006 };
5007 for_each_partition_edge (node_i, add_cost);
5008 return cost;
5009 }
5010
5011 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
5012 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
5013 slpg_layout_cost::impossible () if the change isn't possible. */
5014
5015 slpg_layout_cost
5016 vect_optimize_slp_pass::
5017 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
5018 unsigned int layout2_i)
5019 {
5020 auto &def_vertex = m_vertices[ud->dest];
5021 auto &use_vertex = m_vertices[ud->src];
5022 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
5023 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
5024 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
5025 use_layout_i);
5026 if (factor < 0)
5027 return slpg_layout_cost::impossible ();
5028
5029 /* We have a choice of putting the layout change at the site of the
5030 definition or at the site of the use. Prefer the former when
5031 optimizing for size or when the execution frequency of the
5032 definition is no greater than the combined execution frequencies of
5033 the uses. When putting the layout change at the site of the definition,
5034 divvy up the cost among all consumers. */
5035 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
5036 {
5037 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
5038 cost.split (def_vertex.out_degree);
5039 return cost;
5040 }
5041 return { use_vertex.weight * factor, m_optimize_size };
5042 }
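
/* Worked example (illustrative only): when optimizing for speed, if the
   definition has weight 2 and its three consumers have a combined
   out_weight of 6, the change is placed at the definition site; with a
   change_layout_cost factor of 1 this gives { depth 2, total 2 }, and
   split (3) distributes the total, returning { depth 2, total 2/3 }.  */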
5043
5044 /* UD represents a use-def link between FROM_NODE_I and a node in a later
5045 partition; FROM_NODE_I could be the definition node or the use node.
5046 The node at the other end of the link wants to use layout TO_LAYOUT_I.
5047 Return the cost of any necessary fix-ups on edge UD, or return
5048 slpg_layout_cost::impossible () if the change isn't possible.
5049
5050 At this point, FROM_NODE_I's partition has chosen the cheapest
5051 layout based on the information available so far, but this choice
5052 is only provisional. */
5053
5054 slpg_layout_cost
5055 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
5056 unsigned int to_layout_i)
5057 {
5058 auto &from_vertex = m_vertices[from_node_i];
5059 unsigned int from_partition_i = from_vertex.partition;
5060 slpg_partition_info &from_partition = m_partitions[from_partition_i];
5061 gcc_assert (from_partition.layout >= 0);
5062
5063 /* First calculate the cost on the assumption that FROM_PARTITION sticks
5064 with its current layout preference. */
5065 slpg_layout_cost cost = slpg_layout_cost::impossible ();
5066 auto edge_cost = edge_layout_cost (ud, from_node_i,
5067 from_partition.layout, to_layout_i);
5068 if (edge_cost.is_possible ())
5069 {
5070 auto &from_costs = partition_layout_costs (from_partition_i,
5071 from_partition.layout);
5072 cost = from_costs.in_cost;
5073 cost.add_serial_cost (from_costs.internal_cost);
5074 cost.split (from_partition.out_degree);
5075 cost.add_serial_cost (edge_cost);
5076 }
5077 else if (from_partition.layout == 0)
5078 /* We must allow the source partition to have layout 0 as a fallback,
5079 in case all other options turn out to be impossible. */
5080 return cost;
5081
5082 /* Take the minimum of that cost and the cost that applies if
5083 FROM_PARTITION instead switches to TO_LAYOUT_I. */
5084 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
5085 to_layout_i);
5086 if (direct_layout_costs.is_possible ())
5087 {
5088 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
5089 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
5090 direct_cost.split (from_partition.out_degree);
5091 if (!cost.is_possible ()
5092 || direct_cost.is_better_than (cost, m_optimize_size))
5093 cost = direct_cost;
5094 }
5095
5096 return cost;
5097 }
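
/* A worked example of the two options above, using totals only and
   made-up numbers: if FROM_PARTITION's in-cost plus internal cost under
   its current layout is 6, its out_degree is 2 and the fix-up on this
   edge costs 1, then sticking with the current layout costs 6 / 2 + 1 = 4.
   If its in-cost plus internal cost under TO_LAYOUT_I would be 5, then
   switching the whole partition costs 5 / 2 = 2.5 and is the cheaper of
   the two estimates.  */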
5098
5099 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
5100 partition; TO_NODE_I could be the definition node or the use node.
5101 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
5102 return the cost of any necessary fix-ups on edge UD, or
5103 slpg_layout_cost::impossible () if the choice cannot be made.
5104
5105 At this point, TO_NODE_I's partition has a fixed choice of layout. */
5106
5107 slpg_layout_cost
5108 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
5109 unsigned int from_layout_i)
5110 {
5111 auto &to_vertex = m_vertices[to_node_i];
5112 unsigned int to_partition_i = to_vertex.partition;
5113 slpg_partition_info &to_partition = m_partitions[to_partition_i];
5114 gcc_assert (to_partition.layout >= 0);
5115
5116 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
5117 adjusted for this input having layout FROM_LAYOUT_I. Assume that
5118 any other inputs keep their current choice of layout. */
5119 auto &to_costs = partition_layout_costs (to_partition_i,
5120 to_partition.layout);
5121 if (ud->src == int (to_node_i)
5122 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
5123 {
5124 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
5125 auto old_layout = from_partition.layout;
5126 from_partition.layout = from_layout_i;
5127 int factor = internal_node_cost (to_vertex.node, -1,
5128 to_partition.layout);
5129 from_partition.layout = old_layout;
5130 if (factor >= 0)
5131 {
5132 slpg_layout_cost cost = to_costs.out_cost;
5133 cost.add_serial_cost ({ to_vertex.weight * factor,
5134 m_optimize_size });
5135 cost.split (to_partition.in_degree);
5136 return cost;
5137 }
5138 }
5139
5140 /* Compute the cost if we insert any necessary layout change on edge UD. */
5141 auto edge_cost = edge_layout_cost (ud, to_node_i,
5142 to_partition.layout, from_layout_i);
5143 if (edge_cost.is_possible ())
5144 {
5145 slpg_layout_cost cost = to_costs.out_cost;
5146 cost.add_serial_cost (to_costs.internal_cost);
5147 cost.split (to_partition.in_degree);
5148 cost.add_serial_cost (edge_cost);
5149 return cost;
5150 }
5151
5152 return slpg_layout_cost::impossible ();
5153 }
5154
5155 /* Make a forward pass through the partitions, accumulating input costs.
5156 Make a tentative (provisional) choice of layout for each partition,
5157 ensuring that this choice still allows later partitions to keep
5158 their original layout. */
5159
5160 void
5161 vect_optimize_slp_pass::forward_pass ()
5162 {
5163 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5164 ++partition_i)
5165 {
5166 auto &partition = m_partitions[partition_i];
5167
5168 /* If the partition consists of a single VEC_PERM_EXPR, precompute
5169 the incoming cost that would apply if every predecessor partition
5170 keeps its current layout. This is used within the loop below. */
5171 slpg_layout_cost in_cost;
5172 slp_tree single_node = nullptr;
5173 if (partition.node_end == partition.node_begin + 1)
5174 {
5175 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5176 single_node = m_vertices[node_i].node;
5177 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5178 in_cost = total_in_cost (node_i);
5179 }
5180
5181 /* Go through the possible layouts. Decide which ones are valid
5182 for this partition and record which of the valid layouts has
5183 the lowest cost. */
5184 unsigned int min_layout_i = 0;
5185 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5186 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5187 {
5188 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5189 if (!layout_costs.is_possible ())
5190 continue;
5191
5192 /* If the recorded layout is already 0 then the layout cannot
5193 change. */
5194 if (partition.layout == 0 && layout_i != 0)
5195 {
5196 layout_costs.mark_impossible ();
5197 continue;
5198 }
5199
5200 bool is_possible = true;
5201 for (unsigned int order_i = partition.node_begin;
5202 order_i < partition.node_end; ++order_i)
5203 {
5204 unsigned int node_i = m_partitioned_nodes[order_i];
5205 auto &vertex = m_vertices[node_i];
5206
5207 /* Reject the layout if it is individually incompatible
5208 with any node in the partition. */
5209 if (!is_compatible_layout (vertex.node, layout_i))
5210 {
5211 is_possible = false;
5212 break;
5213 }
5214
5215 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5216 {
5217 auto &other_vertex = m_vertices[other_node_i];
5218 if (other_vertex.partition < vertex.partition)
5219 {
5220 /* Accumulate the incoming costs from earlier
5221 partitions, plus the cost of any layout changes
5222 on UD itself. */
5223 auto cost = forward_cost (ud, other_node_i, layout_i);
5224 if (!cost.is_possible ())
5225 is_possible = false;
5226 else
5227 layout_costs.in_cost.add_parallel_cost (cost);
5228 }
5229 else
5230 /* Reject the layout if it would make layout 0 impossible
5231 for later partitions. This amounts to testing that the
5232 target supports reversing the layout change on edges
5233 to later partitions.
5234
5235 In principle, it might be possible to push a layout
5236 change all the way down a graph, so that it never
5237 needs to be reversed and so that the target doesn't
5238 need to support the reverse operation. But it would
5239 be awkward to bail out if we hit a partition that
5240 does not support the new layout, especially since
5241 we are not dealing with a lattice. */
5242 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5243 layout_i).is_possible ();
5244 };
5245 for_each_partition_edge (node_i, add_cost);
5246
5247 /* Accumulate the cost of using LAYOUT_I within NODE,
5248 both for the inputs and the outputs. */
5249 int factor = internal_node_cost (vertex.node, layout_i,
5250 layout_i);
5251 if (factor < 0)
5252 {
5253 is_possible = false;
5254 break;
5255 }
5256 else if (factor)
5257 layout_costs.internal_cost.add_serial_cost
5258 ({ vertex.weight * factor, m_optimize_size });
5259 }
5260 if (!is_possible)
5261 {
5262 layout_costs.mark_impossible ();
5263 continue;
5264 }
5265
5266 /* Combine the incoming and partition-internal costs. */
5267 slpg_layout_cost combined_cost = layout_costs.in_cost;
5268 combined_cost.add_serial_cost (layout_costs.internal_cost);
5269
5270 /* If this partition consists of a single VEC_PERM_EXPR, see
5271 if the VEC_PERM_EXPR can be changed to support output layout
5272 LAYOUT_I while keeping all the provisional choices of input
5273 layout. */
5274 if (single_node
5275 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5276 {
5277 int factor = internal_node_cost (single_node, -1, layout_i);
5278 if (factor >= 0)
5279 {
5280 auto weight = m_vertices[single_node->vertex].weight;
5281 slpg_layout_cost internal_cost
5282 = { weight * factor, m_optimize_size };
5283
5284 slpg_layout_cost alt_cost = in_cost;
5285 alt_cost.add_serial_cost (internal_cost);
5286 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5287 {
5288 combined_cost = alt_cost;
5289 layout_costs.in_cost = in_cost;
5290 layout_costs.internal_cost = internal_cost;
5291 }
5292 }
5293 }
5294
5295 /* Record the layout with the lowest cost. Prefer layout 0 in
5296 the event of a tie between it and another layout. */
5297 if (!min_layout_cost.is_possible ()
5298 || combined_cost.is_better_than (min_layout_cost,
5299 m_optimize_size))
5300 {
5301 min_layout_i = layout_i;
5302 min_layout_cost = combined_cost;
5303 }
5304 }
5305
5306 /* This loop's handling of earlier partitions should ensure that
5307 choosing the original layout for the current partition is no
5308 less valid than it was in the original graph, even with the
5309 provisional layout choices for those earlier partitions. */
5310 gcc_assert (min_layout_cost.is_possible ());
5311 partition.layout = min_layout_i;
5312 }
5313 }
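
/* A minimal standalone sketch (not part of this file) of the selection
   made by the forward pass: for each partition, keep the cheapest layout
   among those still marked possible, preferring the lowest layout index
   (and hence layout 0) on ties.  Costs are reduced to a single double
   here; the real pass compares slpg_layout_cost values and also has to
   accumulate the incoming costs shown above.

     #include <vector>
     #include <limits>

     // cost[p][l] is the accumulated cost of giving partition p layout l,
     // with a negative entry meaning "impossible".
     static std::vector<int>
     choose_layouts (const std::vector<std::vector<double>> &cost)
     {
       std::vector<int> choice (cost.size (), 0);
       for (unsigned p = 0; p < cost.size (); ++p)
	 {
	   double best = std::numeric_limits<double>::infinity ();
	   for (unsigned l = 0; l < cost[p].size (); ++l)
	     if (cost[p][l] >= 0 && cost[p][l] < best)
	       {
		 best = cost[p][l];
		 choice[p] = l;
	       }
	 }
       return choice;
     }  */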
5314
5315 /* Make a backward pass through the partitions, accumulating output costs.
5316 Make a final choice of layout for each partition. */
5317
5318 void
5319 vect_optimize_slp_pass::backward_pass ()
5320 {
5321 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5322 {
5323 auto &partition = m_partitions[partition_i];
5324
5325 unsigned int min_layout_i = 0;
5326 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5327 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5328 {
5329 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5330 if (!layout_costs.is_possible ())
5331 continue;
5332
5333 /* Accumulate the costs from successor partitions. */
5334 bool is_possible = true;
5335 for (unsigned int order_i = partition.node_begin;
5336 order_i < partition.node_end; ++order_i)
5337 {
5338 unsigned int node_i = m_partitioned_nodes[order_i];
5339 auto &vertex = m_vertices[node_i];
5340 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5341 {
5342 auto &other_vertex = m_vertices[other_node_i];
5343 auto &other_partition = m_partitions[other_vertex.partition];
5344 if (other_vertex.partition > vertex.partition)
5345 {
5346 /* Accumulate the costs from later
5347 partitions, plus the cost of any layout changes
5348 on UD itself. */
5349 auto cost = backward_cost (ud, other_node_i, layout_i);
5350 if (!cost.is_possible ())
5351 is_possible = false;
5352 else
5353 layout_costs.out_cost.add_parallel_cost (cost);
5354 }
5355 else
5356 /* Make sure that earlier partitions can (if necessary
5357 or beneficial) keep the layout that they chose in
5358 the forward pass. This ensures that there is at
5359 least one valid choice of layout. */
5360 is_possible &= edge_layout_cost (ud, other_node_i,
5361 other_partition.layout,
5362 layout_i).is_possible ();
5363 };
5364 for_each_partition_edge (node_i, add_cost);
5365 }
5366 if (!is_possible)
5367 {
5368 layout_costs.mark_impossible ();
5369 continue;
5370 }
5371
5372 /* Locally combine the costs from the forward and backward passes.
5373 (This combined cost is not passed on, since that would lead
5374 to double counting.) */
5375 slpg_layout_cost combined_cost = layout_costs.in_cost;
5376 combined_cost.add_serial_cost (layout_costs.internal_cost);
5377 combined_cost.add_serial_cost (layout_costs.out_cost);
5378
5379 /* Record the layout with the lowest cost. Prefer layout 0 in
5380 the event of a tie between it and another layout. */
5381 if (!min_layout_cost.is_possible ()
5382 || combined_cost.is_better_than (min_layout_cost,
5383 m_optimize_size))
5384 {
5385 min_layout_i = layout_i;
5386 min_layout_cost = combined_cost;
5387 }
5388 }
5389
5390 gcc_assert (min_layout_cost.is_possible ());
5391 partition.layout = min_layout_i;
5392 }
5393 }
5394
5395 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5396 NODE already has the layout that was selected for its partition. */
5397
5398 slp_tree
5399 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5400 unsigned int to_layout_i)
5401 {
5402 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5403 slp_tree result = m_node_layouts[result_i];
5404 if (result)
5405 return result;
5406
5407 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5408 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5409 /* We can't permute vector defs in place. */
5410 && SLP_TREE_VEC_DEFS (node).is_empty ()))
5411 {
5412 /* If the vector is uniform or unchanged, there's nothing to do. */
5413 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5414 result = node;
5415 else
5416 {
5417 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5418 result = vect_create_new_slp_node (scalar_ops);
5419 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5420 }
5421 }
5422 else
5423 {
5424 unsigned int partition_i = m_vertices[node->vertex].partition;
5425 unsigned int from_layout_i = m_partitions[partition_i].layout;
5426 if (from_layout_i == to_layout_i)
5427 return node;
5428
5429 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5430 permutation instead of a serial one. Leave the new permutation
5431 in TMP_PERM on success. */
5432 auto_lane_permutation_t tmp_perm;
5433 unsigned int num_inputs = 1;
5434 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5435 {
5436 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5437 if (from_layout_i != 0)
5438 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5439 if (to_layout_i != 0)
5440 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5441 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5442 tmp_perm,
5443 SLP_TREE_CHILDREN (node),
5444 false) >= 0)
5445 num_inputs = SLP_TREE_CHILDREN (node).length ();
5446 else
5447 tmp_perm.truncate (0);
5448 }
5449
5450 if (dump_enabled_p ())
5451 {
5452 if (tmp_perm.length () > 0)
5453 dump_printf_loc (MSG_NOTE, vect_location,
5454 "duplicating permutation node %p with"
5455 " layout %d\n",
5456 (void *) node, to_layout_i);
5457 else
5458 dump_printf_loc (MSG_NOTE, vect_location,
5459 "inserting permutation node in place of %p\n",
5460 (void *) node);
5461 }
5462
5463 unsigned int num_lanes = SLP_TREE_LANES (node);
5464 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5465 if (SLP_TREE_SCALAR_STMTS (node).length ())
5466 {
5467 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5468 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5469 if (from_layout_i != 0)
5470 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5471 if (to_layout_i != 0)
5472 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5473 }
5474 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5475 SLP_TREE_LANES (result) = num_lanes;
5476 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5477 result->vertex = -1;
5478
5479 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5480 if (tmp_perm.length ())
5481 {
5482 lane_perm.safe_splice (tmp_perm);
5483 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5484 }
5485 else
5486 {
5487 lane_perm.create (num_lanes);
5488 for (unsigned j = 0; j < num_lanes; ++j)
5489 lane_perm.quick_push ({ 0, j });
5490 if (from_layout_i != 0)
5491 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5492 if (to_layout_i != 0)
5493 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5494 SLP_TREE_CHILDREN (result).safe_push (node);
5495 }
5496 for (slp_tree child : SLP_TREE_CHILDREN (result))
5497 child->refcnt++;
5498 }
5499 m_node_layouts[result_i] = result;
5500 return result;
5501 }
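
/* An illustrative standalone sketch (not part of this file, and using
   simplified types) of the identity-based lane permutation built in the
   fallback path above.  Here both layouts map an original lane to its
   position under that layout, and the result maps each output lane to
   the input lane holding the same original element; the real code goes
   through vect_slp_permute, whose direction conventions may differ.

     #include <vector>

     static std::vector<unsigned>
     layout_change_perm (const std::vector<unsigned> &from_layout,
			 const std::vector<unsigned> &to_layout)
     {
       std::vector<unsigned> perm (to_layout.size ());
       for (unsigned orig = 0; orig < to_layout.size (); ++orig)
	 perm[to_layout[orig]] = from_layout[orig];
       return perm;
     }  */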
5502
5503 /* Apply the chosen vector layouts to the SLP graph. */
5504
5505 void
5506 vect_optimize_slp_pass::materialize ()
5507 {
5508 /* We no longer need the costs, so avoid having two O(N * P) arrays
5509 live at the same time. */
5510 m_partition_layout_costs.release ();
5511 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5512
5513 auto_sbitmap fully_folded (m_vertices.length ());
5514 bitmap_clear (fully_folded);
5515 for (unsigned int node_i : m_partitioned_nodes)
5516 {
5517 auto &vertex = m_vertices[node_i];
5518 slp_tree node = vertex.node;
5519 int layout_i = m_partitions[vertex.partition].layout;
5520 gcc_assert (layout_i >= 0);
5521
5522 /* Rearrange the scalar statements to match the chosen layout. */
5523 if (layout_i > 0)
5524 vect_slp_permute (m_perms[layout_i],
5525 SLP_TREE_SCALAR_STMTS (node), true);
5526
5527 /* Update load and lane permutations. */
5528 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5529 {
5530 /* First try to absorb the input vector layouts. If that fails,
5531 force the inputs to have layout LAYOUT_I too. We checked that
5532 that was possible before deciding to use nonzero output layouts.
5533 (Note that at this stage we don't really have any guarantee that
5534 the target supports the original VEC_PERM_EXPR.) */
5535 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5536 auto_lane_permutation_t tmp_perm;
5537 tmp_perm.safe_splice (perm);
5538 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5539 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5540 tmp_perm,
5541 SLP_TREE_CHILDREN (node),
5542 false) >= 0)
5543 {
5544 if (dump_enabled_p ()
5545 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5546 perm.begin ()))
5547 dump_printf_loc (MSG_NOTE, vect_location,
5548 "absorbing input layouts into %p\n",
5549 (void *) node);
5550 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5551 bitmap_set_bit (fully_folded, node_i);
5552 }
5553 else
5554 {
5555 /* Not MSG_MISSED because it would make no sense to users. */
5556 if (dump_enabled_p ())
5557 dump_printf_loc (MSG_NOTE, vect_location,
5558 "failed to absorb input layouts into %p\n",
5559 (void *) node);
5560 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5561 }
5562 }
5563 else
5564 {
5565 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5566 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5567 if (layout_i > 0)
5568 /* ??? When we handle non-bijective permutes the idea
5569 is that we can force the load-permutation to be
5570 { min, min + 1, min + 2, ... max }. But then the
5571 scalar defs might no longer match the lane content
5572 which means wrong-code with live lane vectorization.
5573 So we possibly have to have NULL entries for those. */
5574 vect_slp_permute (m_perms[layout_i], load_perm, true);
5575 }
5576 }
5577
5578 /* Do this before any nodes disappear, since it involves a walk
5579 over the leaves. */
5580 remove_redundant_permutations ();
5581
5582 /* Replace each child with a correctly laid-out version. */
5583 for (unsigned int node_i : m_partitioned_nodes)
5584 {
5585 /* Skip nodes that have already been handled above. */
5586 if (bitmap_bit_p (fully_folded, node_i))
5587 continue;
5588
5589 auto &vertex = m_vertices[node_i];
5590 int in_layout_i = m_partitions[vertex.partition].layout;
5591 gcc_assert (in_layout_i >= 0);
5592
5593 unsigned j;
5594 slp_tree child;
5595 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5596 {
5597 if (!child)
5598 continue;
5599
5600 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5601 if (new_child != child)
5602 {
5603 vect_free_slp_tree (child);
5604 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5605 new_child->refcnt += 1;
5606 }
5607 }
5608 }
5609 }
5610
5611 /* Elide load permutations that are not necessary. Such permutations might
5612 be pre-existing, rather than created by the layout optimizations. */
5613
5614 void
5615 vect_optimize_slp_pass::remove_redundant_permutations ()
5616 {
5617 for (unsigned int node_i : m_leafs)
5618 {
5619 slp_tree node = m_vertices[node_i].node;
5620 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5621 continue;
5622
5623 /* In basic block vectorization we allow any subchain of an interleaving
5624 chain.
5625 FORNOW: not in loop SLP because of realignment complications. */
5626 if (is_a <bb_vec_info> (m_vinfo))
5627 {
5628 bool subchain_p = true;
5629 stmt_vec_info next_load_info = NULL;
5630 stmt_vec_info load_info;
5631 unsigned j;
5632 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5633 {
5634 if (j != 0
5635 && (next_load_info != load_info
5636 || DR_GROUP_GAP (load_info) != 1))
5637 {
5638 subchain_p = false;
5639 break;
5640 }
5641 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5642 }
5643 if (subchain_p)
5644 {
5645 SLP_TREE_LOAD_PERMUTATION (node).release ();
5646 continue;
5647 }
5648 }
5649 else
5650 {
5651 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5652 stmt_vec_info load_info;
5653 bool this_load_permuted = false;
5654 unsigned j;
5655 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5656 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5657 {
5658 this_load_permuted = true;
5659 break;
5660 }
5661 /* When this isn't a grouped access we know it's a single element
5662 and contiguous. */
5663 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5664 {
5665 if (!this_load_permuted
5666 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5667 || SLP_TREE_LANES (node) == 1))
5668 SLP_TREE_LOAD_PERMUTATION (node).release ();
5669 continue;
5670 }
5671 stmt_vec_info first_stmt_info
5672 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5673 if (!this_load_permuted
5674 /* The load requires permutation when unrolling exposes
5675 a gap either because the group is larger than the SLP
5676 group-size or because there is a gap between the groups. */
5677 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5678 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5679 && DR_GROUP_GAP (first_stmt_info) == 0)))
5680 {
5681 SLP_TREE_LOAD_PERMUTATION (node).release ();
5682 continue;
5683 }
5684 }
5685 }
5686 }
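
/* For example, a load permutation such as { 0, 1, 2, 3 } on a contiguous
   group selects the lanes in their natural order and can simply be
   released, whereas { 2, 3, 0, 1 } must be kept.  A standalone identity
   check might look like this (illustrative only; the real conditions
   above also account for groups, gaps and the unrolling factor):

     #include <vector>

     static bool
     identity_perm_p (const std::vector<unsigned> &perm)
     {
       for (unsigned i = 0; i < perm.size (); ++i)
	 if (perm[i] != i)
	   return false;
       return true;
     }  */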
5687
5688 /* Print the partition graph and layout information to the dump file. */
5689
5690 void
5691 vect_optimize_slp_pass::dump ()
5692 {
5693 dump_printf_loc (MSG_NOTE, vect_location,
5694 "SLP optimize permutations:\n");
5695 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5696 {
5697 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5698 const char *sep = "";
5699 for (unsigned int idx : m_perms[layout_i])
5700 {
5701 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5702 sep = ", ";
5703 }
5704 dump_printf (MSG_NOTE, " }\n");
5705 }
5706 dump_printf_loc (MSG_NOTE, vect_location,
5707 "SLP optimize partitions:\n");
5708 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5709 ++partition_i)
5710 {
5711 auto &partition = m_partitions[partition_i];
5712 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5713 dump_printf_loc (MSG_NOTE, vect_location,
5714 " partition %d (layout %d):\n",
5715 partition_i, partition.layout);
5716 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5717 for (unsigned int order_i = partition.node_begin;
5718 order_i < partition.node_end; ++order_i)
5719 {
5720 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5721 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5722 (void *) vertex.node);
5723 dump_printf_loc (MSG_NOTE, vect_location,
5724 " weight: %f\n",
5725 vertex.weight.to_double ());
5726 if (vertex.out_degree)
5727 dump_printf_loc (MSG_NOTE, vect_location,
5728 " out weight: %f (degree %d)\n",
5729 vertex.out_weight.to_double (),
5730 vertex.out_degree);
5731 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5732 dump_printf_loc (MSG_NOTE, vect_location,
5733 " op: VEC_PERM_EXPR\n");
5734 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5735 dump_printf_loc (MSG_NOTE, vect_location,
5736 " op template: %G", rep->stmt);
5737 }
5738 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5739 for (unsigned int order_i = partition.node_begin;
5740 order_i < partition.node_end; ++order_i)
5741 {
5742 unsigned int node_i = m_partitioned_nodes[order_i];
5743 auto &vertex = m_vertices[node_i];
5744 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5745 {
5746 auto &other_vertex = m_vertices[other_node_i];
5747 if (other_vertex.partition < vertex.partition)
5748 dump_printf_loc (MSG_NOTE, vect_location,
5749 " - %p [%d] --> %p\n",
5750 (void *) other_vertex.node,
5751 other_vertex.partition,
5752 (void *) vertex.node);
5753 else
5754 dump_printf_loc (MSG_NOTE, vect_location,
5755 " - %p --> [%d] %p\n",
5756 (void *) vertex.node,
5757 other_vertex.partition,
5758 (void *) other_vertex.node);
5759 };
5760 for_each_partition_edge (node_i, print_edge);
5761 }
5762
5763 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5764 {
5765 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5766 if (layout_costs.is_possible ())
5767 {
5768 dump_printf_loc (MSG_NOTE, vect_location,
5769 " layout %d:%s\n", layout_i,
5770 partition.layout == int (layout_i)
5771 ? " (*)" : "");
5772 slpg_layout_cost combined_cost = layout_costs.in_cost;
5773 combined_cost.add_serial_cost (layout_costs.internal_cost);
5774 combined_cost.add_serial_cost (layout_costs.out_cost);
5775 #define TEMPLATE "{depth: %f, total: %f}"
5776 dump_printf_loc (MSG_NOTE, vect_location,
5777 " " TEMPLATE "\n",
5778 layout_costs.in_cost.depth.to_double (),
5779 layout_costs.in_cost.total.to_double ());
5780 dump_printf_loc (MSG_NOTE, vect_location,
5781 " + " TEMPLATE "\n",
5782 layout_costs.internal_cost.depth.to_double (),
5783 layout_costs.internal_cost.total.to_double ());
5784 dump_printf_loc (MSG_NOTE, vect_location,
5785 " + " TEMPLATE "\n",
5786 layout_costs.out_cost.depth.to_double (),
5787 layout_costs.out_cost.total.to_double ());
5788 dump_printf_loc (MSG_NOTE, vect_location,
5789 " = " TEMPLATE "\n",
5790 combined_cost.depth.to_double (),
5791 combined_cost.total.to_double ());
5792 #undef TEMPLATE
5793 }
5794 else
5795 dump_printf_loc (MSG_NOTE, vect_location,
5796 " layout %d: rejected\n", layout_i);
5797 }
5798 }
5799 }
5800
5801 /* Main entry point for the SLP graph optimization pass. */
5802
5803 void
5804 vect_optimize_slp_pass::run ()
5805 {
5806 build_graph ();
5807 create_partitions ();
5808 start_choosing_layouts ();
5809 if (m_perms.length () > 1)
5810 {
5811 forward_pass ();
5812 backward_pass ();
5813 if (dump_enabled_p ())
5814 dump ();
5815 materialize ();
5816 while (!m_perms.is_empty ())
5817 m_perms.pop ().release ();
5818 }
5819 else
5820 remove_redundant_permutations ();
5821 free_graph (m_slpg);
5822 }
5823
5824 /* Optimize the SLP graph of VINFO. */
5825
5826 void
5827 vect_optimize_slp (vec_info *vinfo)
5828 {
5829 if (vinfo->slp_instances.is_empty ())
5830 return;
5831 vect_optimize_slp_pass (vinfo).run ();
5832 }
5833
5834 /* Gather loads reachable from the individual SLP graph entries. */
5835
5836 void
5837 vect_gather_slp_loads (vec_info *vinfo)
5838 {
5839 unsigned i;
5840 slp_instance instance;
5841 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5842 {
5843 hash_set<slp_tree> visited;
5844 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5845 SLP_INSTANCE_TREE (instance), visited);
5846 }
5847 }
5848
5849
5850 /* For each possible SLP instance decide whether to SLP it and calculate the
5851 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
5852 to SLP at least one instance. */
5853
5854 bool
5855 vect_make_slp_decision (loop_vec_info loop_vinfo)
5856 {
5857 unsigned int i;
5858 poly_uint64 unrolling_factor = 1;
5859 const vec<slp_instance> &slp_instances
5860 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5861 slp_instance instance;
5862 int decided_to_slp = 0;
5863
5864 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5865
5866 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5867 {
5868 /* FORNOW: SLP if you can. */
5869 /* All unroll factors have the form:
5870
5871 GET_MODE_SIZE (vinfo->vector_mode) * X
5872
5873 for some rational X, so they must have a common multiple. */
5874 unrolling_factor
5875 = force_common_multiple (unrolling_factor,
5876 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5877
5878 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5879 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5880 loop-based vectorization. Such stmts will be marked as HYBRID. */
5881 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5882 decided_to_slp++;
5883 }
5884
5885 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5886
5887 if (decided_to_slp && dump_enabled_p ())
5888 {
5889 dump_printf_loc (MSG_NOTE, vect_location,
5890 "Decided to SLP %d instances. Unrolling factor ",
5891 decided_to_slp);
5892 dump_dec (MSG_NOTE, unrolling_factor);
5893 dump_printf (MSG_NOTE, "\n");
5894 }
5895
5896 return (decided_to_slp > 0);
5897 }
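
/* As an illustration of the unrolling-factor computation above: if one
   instance needs unrolling factor 2 and another needs 8, the forced
   common multiple is 8; for factors 4 and 6 it would be 12.  The numbers
   here are made up, but each factor is of the vector-size-times-rational
   form described in the comment, so a common multiple always exists.  */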
5898
5899 /* Private data for vect_detect_hybrid_slp. */
5900 struct vdhs_data
5901 {
5902 loop_vec_info loop_vinfo;
5903 vec<stmt_vec_info> *worklist;
5904 };
5905
5906 /* Walker for walk_gimple_op. */
5907
5908 static tree
5909 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5910 {
5911 walk_stmt_info *wi = (walk_stmt_info *)data;
5912 vdhs_data *dat = (vdhs_data *)wi->info;
5913
5914 if (wi->is_lhs)
5915 return NULL_TREE;
5916
5917 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5918 if (!def_stmt_info)
5919 return NULL_TREE;
5920 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5921 if (PURE_SLP_STMT (def_stmt_info))
5922 {
5923 if (dump_enabled_p ())
5924 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5925 def_stmt_info->stmt);
5926 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5927 dat->worklist->safe_push (def_stmt_info);
5928 }
5929
5930 return NULL_TREE;
5931 }
5932
5933 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
5934 if so; otherwise push it to WORKLIST. */
5935
5936 static void
5937 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5938 vec<stmt_vec_info> &worklist,
5939 stmt_vec_info stmt_info)
5940 {
5941 if (dump_enabled_p ())
5942 dump_printf_loc (MSG_NOTE, vect_location,
5943 "Processing hybrid candidate : %G", stmt_info->stmt);
5944 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5945 imm_use_iterator iter2;
5946 ssa_op_iter iter1;
5947 use_operand_p use_p;
5948 def_operand_p def_p;
5949 bool any_def = false;
5950 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5951 {
5952 any_def = true;
5953 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5954 {
5955 if (is_gimple_debug (USE_STMT (use_p)))
5956 continue;
5957 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5958 /* An out-of-loop use means this is a loop_vect sink. */
5959 if (!use_info)
5960 {
5961 if (dump_enabled_p ())
5962 dump_printf_loc (MSG_NOTE, vect_location,
5963 "Found loop_vect sink: %G", stmt_info->stmt);
5964 worklist.safe_push (stmt_info);
5965 return;
5966 }
5967 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5968 {
5969 if (dump_enabled_p ())
5970 dump_printf_loc (MSG_NOTE, vect_location,
5971 "Found loop_vect use: %G", use_info->stmt);
5972 worklist.safe_push (stmt_info);
5973 return;
5974 }
5975 }
5976 }
5977 /* No def means this is a loop_vect sink. */
5978 if (!any_def)
5979 {
5980 if (dump_enabled_p ())
5981 dump_printf_loc (MSG_NOTE, vect_location,
5982 "Found loop_vect sink: %G", stmt_info->stmt);
5983 worklist.safe_push (stmt_info);
5984 return;
5985 }
5986 if (dump_enabled_p ())
5987 dump_printf_loc (MSG_NOTE, vect_location,
5988 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5989 STMT_SLP_TYPE (stmt_info) = pure_slp;
5990 }
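
/* An illustrative example (made-up GIMPLE) of the classification done by
   maybe_push_to_hybrid_worklist and the walker above:

     _1 = *p_2;          // lane of an SLP load node, already pure_slp
     t_3 = _1 + 1;       // not an SLP lane, but every use of t_3 is a
			 // pure-SLP stmt -> marked pure_slp here
     s_4 = _1 * w_5;     // s_4 is used after the loop -> loop_vect sink,
			 // pushed to WORKLIST

   Draining the worklist in vect_detect_hybrid_slp then walks the operands
   of s_4, finds that _1 is defined by a PURE_SLP statement and marks that
   definition hybrid, so it is vectorized both as part of the SLP graph
   and by loop-based vectorization.  */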
5991
5992 /* Find stmts that must be both vectorized and SLPed. */
5993
5994 void
5995 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5996 {
5997 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5998
5999 /* All stmts participating in SLP are marked pure_slp, all other
6000 stmts are loop_vect.
6001 First collect all loop_vect stmts into a worklist.
6002 With SLP patterns not all original scalar stmts appear in
6003 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
6004 Rectify this here by doing a backward walk over the IL, considering a
6005 stmt loop_vect only when it is used by a loop_vect stmt and otherwise
6006 marking it pure_slp. */
6007 auto_vec<stmt_vec_info> worklist;
6008 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
6009 {
6010 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
6011 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
6012 gsi_next (&gsi))
6013 {
6014 gphi *phi = gsi.phi ();
6015 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
6016 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
6017 maybe_push_to_hybrid_worklist (loop_vinfo,
6018 worklist, stmt_info);
6019 }
6020 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
6021 gsi_prev (&gsi))
6022 {
6023 gimple *stmt = gsi_stmt (gsi);
6024 if (is_gimple_debug (stmt))
6025 continue;
6026 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
6027 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
6028 {
6029 for (gimple_stmt_iterator gsi2
6030 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
6031 !gsi_end_p (gsi2); gsi_next (&gsi2))
6032 {
6033 stmt_vec_info patt_info
6034 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
6035 if (!STMT_SLP_TYPE (patt_info)
6036 && STMT_VINFO_RELEVANT (patt_info))
6037 maybe_push_to_hybrid_worklist (loop_vinfo,
6038 worklist, patt_info);
6039 }
6040 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6041 }
6042 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
6043 maybe_push_to_hybrid_worklist (loop_vinfo,
6044 worklist, stmt_info);
6045 }
6046 }
6047
6048 /* Now that we have a worklist of non-SLP stmts, follow use->def chains and
6049 mark any SLP vectorized stmt as hybrid.
6050 ??? We're visiting def stmts N times (once for each non-SLP and
6051 once for each hybrid-SLP use). */
6052 walk_stmt_info wi;
6053 vdhs_data dat;
6054 dat.worklist = &worklist;
6055 dat.loop_vinfo = loop_vinfo;
6056 memset (&wi, 0, sizeof (wi));
6057 wi.info = (void *)&dat;
6058 while (!worklist.is_empty ())
6059 {
6060 stmt_vec_info stmt_info = worklist.pop ();
6061 /* Since SSA operands are not set up for pattern stmts we need
6062 to use walk_gimple_op. */
6063 wi.is_lhs = 0;
6064 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
6065 /* For gather/scatter make sure to walk the offset operand, which
6066 can be a scaling and conversion away. */
6067 gather_scatter_info gs_info;
6068 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
6069 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
6070 {
6071 int dummy;
6072 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
6073 }
6074 }
6075 }
6076
6077
6078 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
6079
6080 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
6081 : vec_info (vec_info::bb, shared),
6082 bbs (_bbs),
6083 roots (vNULL)
6084 {
6085 for (unsigned i = 0; i < bbs.length (); ++i)
6086 {
6087 if (i != 0)
6088 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6089 gsi_next (&si))
6090 {
6091 gphi *phi = si.phi ();
6092 gimple_set_uid (phi, 0);
6093 add_stmt (phi);
6094 }
6095 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6096 !gsi_end_p (gsi); gsi_next (&gsi))
6097 {
6098 gimple *stmt = gsi_stmt (gsi);
6099 gimple_set_uid (stmt, 0);
6100 if (is_gimple_debug (stmt))
6101 continue;
6102 add_stmt (stmt);
6103 }
6104 }
6105 }
6106
6107
6108 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
6109 stmts in the basic block. */
6110
6111 _bb_vec_info::~_bb_vec_info ()
6112 {
6113 /* Reset region marker. */
6114 for (unsigned i = 0; i < bbs.length (); ++i)
6115 {
6116 if (i != 0)
6117 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6118 gsi_next (&si))
6119 {
6120 gphi *phi = si.phi ();
6121 gimple_set_uid (phi, -1);
6122 }
6123 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6124 !gsi_end_p (gsi); gsi_next (&gsi))
6125 {
6126 gimple *stmt = gsi_stmt (gsi);
6127 gimple_set_uid (stmt, -1);
6128 }
6129 }
6130
6131 for (unsigned i = 0; i < roots.length (); ++i)
6132 {
6133 roots[i].stmts.release ();
6134 roots[i].roots.release ();
6135 roots[i].remain.release ();
6136 }
6137 roots.release ();
6138 }
6139
6140 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
6141 given that its child nodes have already been processed and that
6142 their def types currently match their SLP node's def type. */
6143
6144 static bool
6145 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
6146 slp_instance node_instance,
6147 stmt_vector_for_cost *cost_vec)
6148 {
6149 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6150
6151 /* Calculate the number of vector statements to be created for the
6152 scalar stmts in this node. For SLP reductions it is equal to the
6153 number of vector statements in the children (which has already been
6154 calculated by the recursive call). Otherwise it is the number of
6155 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
6156 VF divided by the number of elements in a vector. */
6157 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6158 && !STMT_VINFO_DATA_REF (stmt_info)
6159 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6160 {
6161 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6162 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6163 {
6164 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6165 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6166 break;
6167 }
6168 }
6169 else
6170 {
6171 poly_uint64 vf;
6172 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6173 vf = loop_vinfo->vectorization_factor;
6174 else
6175 vf = 1;
6176 unsigned int group_size = SLP_TREE_LANES (node);
6177 tree vectype = SLP_TREE_VECTYPE (node);
6178 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6179 = vect_get_num_vectors (vf * group_size, vectype);
6180 }
6181
6182 /* Handle purely internal nodes. */
6183 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6184 {
6185 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6186 return false;
6187
6188 stmt_vec_info slp_stmt_info;
6189 unsigned int i;
6190 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6191 {
6192 if (STMT_VINFO_LIVE_P (slp_stmt_info)
6193 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6194 node_instance, i,
6195 false, cost_vec))
6196 return false;
6197 }
6198 return true;
6199 }
6200
6201 bool dummy;
6202 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6203 node, node_instance, cost_vec);
6204 }
6205
6206 /* Try to build NODE from scalars, returning true on success.
6207 NODE_INSTANCE is the SLP instance that contains NODE. */
6208
6209 static bool
6210 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6211 slp_instance node_instance)
6212 {
6213 stmt_vec_info stmt_info;
6214 unsigned int i;
6215
6216 if (!is_a <bb_vec_info> (vinfo)
6217 || node == SLP_INSTANCE_TREE (node_instance)
6218 || !SLP_TREE_SCALAR_STMTS (node).exists ()
6219 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6220 /* Force the mask use to be built from scalars instead. */
6221 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6222 return false;
6223
6224 if (dump_enabled_p ())
6225 dump_printf_loc (MSG_NOTE, vect_location,
6226 "Building vector operands of %p from scalars instead\n",
6227 (void *) node);
6228
6229 /* Don't remove and free the child nodes here, since they could be
6230 referenced by other structures. The analysis and scheduling phases
6231 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6232 unsigned int group_size = SLP_TREE_LANES (node);
6233 SLP_TREE_DEF_TYPE (node) = vect_external_def;
6234 /* Invariants get their vector type from the uses. */
6235 SLP_TREE_VECTYPE (node) = NULL_TREE;
6236 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6237 SLP_TREE_LOAD_PERMUTATION (node).release ();
6238 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6239 {
6240 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6241 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6242 }
6243 return true;
6244 }
6245
6246 /* Return true if all elements of the slice are the same. */
6247 bool
6248 vect_scalar_ops_slice::all_same_p () const
6249 {
6250 for (unsigned int i = 1; i < length; ++i)
6251 if (!operand_equal_p (op (0), op (i)))
6252 return false;
6253 return true;
6254 }
6255
6256 hashval_t
6257 vect_scalar_ops_slice_hash::hash (const value_type &s)
6258 {
6259 hashval_t hash = 0;
6260 for (unsigned i = 0; i < s.length; ++i)
6261 hash = iterative_hash_expr (s.op (i), hash);
6262 return hash;
6263 }
6264
6265 bool
6266 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6267 const compare_type &s2)
6268 {
6269 if (s1.length != s2.length)
6270 return false;
6271 for (unsigned i = 0; i < s1.length; ++i)
6272 if (!operand_equal_p (s1.op (i), s2.op (i)))
6273 return false;
6274 return true;
6275 }
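
/* A standalone sketch (not part of this file) of the slice hashing and
   comparison idea, with std::string standing in for tree operands and
   std::hash replacing iterative_hash_expr; the real code compares
   operands with operand_equal_p.

     #include <string>
     #include <vector>
     #include <functional>

     static size_t
     slice_hash (const std::vector<std::string> &ops,
		 unsigned start, unsigned len)
     {
       size_t h = 0;
       for (unsigned i = 0; i < len; ++i)
	 h = h * 131 + std::hash<std::string> () (ops[start + i]);
       return h;
     }

     static bool
     slice_equal (const std::vector<std::string> &ops,
		  unsigned start1, unsigned start2, unsigned len)
     {
       for (unsigned i = 0; i < len; ++i)
	 if (ops[start1 + i] != ops[start2 + i])
	   return false;
       return true;
     }  */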
6276
6277 /* Compute the prologue cost for invariant or constant operands represented
6278 by NODE. */
6279
6280 static void
6281 vect_prologue_cost_for_slp (slp_tree node,
6282 stmt_vector_for_cost *cost_vec)
6283 {
6284 /* There's a special case of an existing vector, which costs nothing. */
6285 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6286 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6287 return;
6288 /* Without looking at the actual initializer a vector of
6289 constants can be implemented as a load from the constant pool.
6290 When all elements are the same we can use a splat. */
6291 tree vectype = SLP_TREE_VECTYPE (node);
6292 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6293 unsigned HOST_WIDE_INT const_nunits;
6294 unsigned nelt_limit;
6295 auto ops = &SLP_TREE_SCALAR_OPS (node);
6296 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6297 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6298 && ! multiple_p (const_nunits, group_size))
6299 {
6300 nelt_limit = const_nunits;
6301 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6302 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6303 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6304 starts.quick_push (i * const_nunits);
6305 }
6306 else
6307 {
6308 /* If either the vector has variable length or the vectors
6309 are composed of repeated whole groups we only need to
6310 cost construction once. All vectors will be the same. */
6311 nelt_limit = group_size;
6312 starts.quick_push (0);
6313 }
6314 /* ??? We're just tracking whether vectors in a single node are the same.
6315 Ideally we'd do something more global. */
6316 bool passed = false;
6317 for (unsigned int start : starts)
6318 {
6319 vect_cost_for_stmt kind;
6320 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6321 kind = vector_load;
6322 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6323 kind = scalar_to_vec;
6324 else
6325 kind = vec_construct;
6326 /* The target cost hook has no idea which part of the SLP node
6327 we are costing so avoid passing it down more than once. Pass
6328 it to the first vec_construct or scalar_to_vec part since for those
6329 the x86 backend tries to account for GPR to XMM register moves. */
6330 record_stmt_cost (cost_vec, 1, kind,
6331 (kind != vector_load && !passed) ? node : nullptr,
6332 vectype, 0, vect_prologue);
6333 if (kind != vector_load)
6334 passed = true;
6335 }
6336 }
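
/* Some illustrative examples of the classification above: a constant node
   { 1, 2, 3, 4 } is costed as a single vector_load from the constant
   pool; an external node { x_1, x_1, x_1, x_1 } as one scalar_to_vec
   splat; and an external node { a_1, b_2, c_3, d_4 } as one
   vec_construct.  For { a_1, b_2, a_1, b_2 } built as two two-element
   vectors, the second slice is found in the set of already-costed slices
   and only one construction is costed.  */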
6337
6338 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6339 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
6340
6341 Return true if the operations are supported. */
6342
6343 static bool
6344 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6345 slp_instance node_instance,
6346 hash_set<slp_tree> &visited_set,
6347 vec<slp_tree> &visited_vec,
6348 stmt_vector_for_cost *cost_vec)
6349 {
6350 int i, j;
6351 slp_tree child;
6352
6353 /* Assume we can code-generate all invariants. */
6354 if (!node
6355 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6356 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6357 return true;
6358
6359 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6360 {
6361 if (dump_enabled_p ())
6362 dump_printf_loc (MSG_NOTE, vect_location,
6363 "Failed cyclic SLP reference in %p\n", (void *) node);
6364 return false;
6365 }
6366 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6367
6368 /* If we already analyzed the exact same set of scalar stmts we're done.
6369 We share the generated vector stmts for those. */
6370 if (visited_set.add (node))
6371 return true;
6372 visited_vec.safe_push (node);
6373
6374 bool res = true;
6375 unsigned visited_rec_start = visited_vec.length ();
6376 unsigned cost_vec_rec_start = cost_vec->length ();
6377 bool seen_non_constant_child = false;
6378 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6379 {
6380 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6381 visited_set, visited_vec,
6382 cost_vec);
6383 if (!res)
6384 break;
6385 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6386 seen_non_constant_child = true;
6387 }
6388 /* We're having difficulties scheduling nodes with just constant
6389 operands and no scalar stmts since we then cannot compute a stmt
6390 insertion place. */
6391 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6392 {
6393 if (dump_enabled_p ())
6394 dump_printf_loc (MSG_NOTE, vect_location,
6395 "Cannot vectorize all-constant op node %p\n",
6396 (void *) node);
6397 res = false;
6398 }
6399
6400 if (res)
6401 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6402 cost_vec);
6403 /* If analysis failed we have to pop all recursive visited nodes
6404 plus ourselves. */
6405 if (!res)
6406 {
6407 while (visited_vec.length () >= visited_rec_start)
6408 visited_set.remove (visited_vec.pop ());
6409 cost_vec->truncate (cost_vec_rec_start);
6410 }
6411
6412 /* When the node can be vectorized, cost the invariant nodes it references.
6413 This is not done in DFS order, to allow the referring node's
6414 vectorizable_* calls to nail down the invariant nodes' vector type
6415 and possibly unshare it if it needs a different vector type than
6416 other referrers. */
6417 if (res)
6418 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6419 if (child
6420 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6421 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6422 /* Perform usual caching, note code-generation still
6423 code-gens these nodes multiple times but we expect
6424 to CSE them later. */
6425 && !visited_set.add (child))
6426 {
6427 visited_vec.safe_push (child);
6428 /* ??? After auditing more code paths make a "default"
6429 and push the vector type from NODE to all children
6430 if it is not already set. */
6431 /* Compute the number of vectors to be generated. */
6432 tree vector_type = SLP_TREE_VECTYPE (child);
6433 if (!vector_type)
6434 {
6435 /* For shifts with a scalar argument we don't need
6436 to cost or code-generate anything.
6437 ??? Represent this more explicitly. */
6438 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6439 == shift_vec_info_type)
6440 && j == 1);
6441 continue;
6442 }
6443 unsigned group_size = SLP_TREE_LANES (child);
6444 poly_uint64 vf = 1;
6445 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6446 vf = loop_vinfo->vectorization_factor;
6447 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6448 = vect_get_num_vectors (vf * group_size, vector_type);
6449 /* And cost them. */
6450 vect_prologue_cost_for_slp (child, cost_vec);
6451 }
6452
6453 /* If this node or any of its children can't be vectorized, try pruning
6454 the tree here rather than felling the whole thing. */
6455 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6456 {
6457 /* We'll need to revisit this for invariant costing and number
6458 of vectorized stmt setting. */
6459 res = true;
6460 }
6461
6462 return res;
6463 }
6464
6465 /* Given a definition DEF, analyze whether it will have any live scalar use
6466 after performing the SLP vectorization described by BB_VINFO, and record
6467 the result in hash map SCALAR_USE_MAP as a cache for later fast checks.
6468 If the recursion DEPTH exceeds a limit, stop the analysis and make a
6469 conservative assumption. Return 0 if there is no scalar use, 1 if there
6470 is one, and -1 if the recursion limit was hit. */
6471
6472 static int
6473 vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
6474 hash_map<tree, int> &scalar_use_map,
6475 int depth = 0)
6476 {
6477 const int depth_limit = 2;
6478 imm_use_iterator use_iter;
6479 gimple *use_stmt;
6480
6481 if (int *res = scalar_use_map.get (def))
6482 return *res;
6483
6484 int scalar_use = 1;
6485
6486 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
6487 {
6488 if (is_gimple_debug (use_stmt))
6489 continue;
6490
6491 stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6492
6493 if (!use_stmt_info)
6494 break;
6495
6496 if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6497 continue;
6498
6499 /* Do not step forward when encountering a PHI statement, since it may
6500 involve a cyclic reference and cause infinite recursion. */
6501 if (gimple_code (use_stmt) == GIMPLE_PHI)
6502 break;
6503
6504 /* When pattern recognition is involved, a statement whose definition is
6505 consumed in some pattern may not be included in the final replacement
6506 pattern statements, and so would be skipped when building the SLP graph.
6507
6508 * Original
6509 char a_c = *(char *) a;
6510 char b_c = *(char *) b;
6511 unsigned short a_s = (unsigned short) a_c;
6512 int a_i = (int) a_s;
6513 int b_i = (int) b_c;
6514 int r_i = a_i - b_i;
6515
6516 * After pattern replacement
6517 a_s = (unsigned short) a_c;
6518 a_i = (int) a_s;
6519
6520 patt_b_s = (unsigned short) b_c; // b_i = (int) b_c
6521 patt_b_i = (int) patt_b_s; // b_i = (int) b_c
6522
6523 patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i
6524 patt_r_i = (int) patt_r_s; // r_i = a_i - b_i
6525
6526 The definitions of a_i (original statement) and b_i (pattern statement)
6527 are related to, but not actually part of, the widen_minus pattern.
6528 Vectorizing the pattern does not cause these definition statements to
6529 be marked as PURE_SLP. For this case, we need to recursively check
6530 whether their uses are all absorbed into vectorized code. There is an
6531 exception, however: a use may participate in a vectorized operation
6532 via an external SLP node containing that use as an element. The
6533 parameter "scalar_use_map" tags such SSA names as having a scalar
6534 use in advance. */
6535 tree lhs = gimple_get_lhs (use_stmt);
6536
6537 if (!lhs || TREE_CODE (lhs) != SSA_NAME)
6538 break;
6539
6540 if (depth_limit && depth >= depth_limit)
6541 return -1;
6542
6543 if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
6544 depth + 1)))
6545 break;
6546 }
6547
6548 if (end_imm_use_stmt_p (&use_iter))
6549 scalar_use = 0;
6550
6551 /* If recursion is limited, do not cache result for non-root defs. */
6552 if (!depth || scalar_use >= 0)
6553 {
6554 bool added = scalar_use_map.put (def, scalar_use);
6555 gcc_assert (!added);
6556 }
6557
6558 return scalar_use;
6559 }
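
/* Applying this to the pattern example in the comment inside the function:
   for DEF a_i, the only use is r_i = a_i - b_i, and that statement's
   vectorized form is the pure-SLP widen_minus pattern, so the walk skips
   it, runs out of uses and returns 0, meaning the scalar computation of
   a_i need not be kept.  If a_i also had a use outside the vectorized
   region (with no stmt_vec_info), the walk would stop there and return 1.  */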
6560
6561 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6562 region and that can be vectorized using vectorizable_live_operation
6563 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
6564 the scalar code computing them to be retained. */
6565
6566 static void
6567 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6568 slp_instance instance,
6569 stmt_vector_for_cost *cost_vec,
6570 hash_map<tree, int> &scalar_use_map,
6571 hash_set<stmt_vec_info> &svisited,
6572 hash_set<slp_tree> &visited)
6573 {
6574 if (visited.add (node))
6575 return;
6576
6577 unsigned i;
6578 stmt_vec_info stmt_info;
6579 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6580 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6581 {
6582 if (svisited.contains (stmt_info))
6583 continue;
6584 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6585 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6586 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6587 /* Only the pattern root stmt computes the original scalar value. */
6588 continue;
6589 bool mark_visited = true;
6590 gimple *orig_stmt = orig_stmt_info->stmt;
6591 ssa_op_iter op_iter;
6592 def_operand_p def_p;
6593 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6594 {
6595 if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
6596 scalar_use_map))
6597 {
6598 STMT_VINFO_LIVE_P (stmt_info) = true;
6599 if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
6600 instance, i, false, cost_vec))
6601 /* ??? So we know we can vectorize the live stmt from one SLP
6602 node. If we cannot do so from all or none consistently
6603 we'd have to record which SLP node (and lane) we want to
6604 use for the live operation. So make sure we can
6605 code-generate from all nodes. */
6606 mark_visited = false;
6607 else
6608 STMT_VINFO_LIVE_P (stmt_info) = false;
6609 }
6610
6611 /* We have to verify whether we can insert the lane extract
6612 before all uses. The following is a conservative approximation.
6613 We cannot put this into vectorizable_live_operation because
6614 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6615 doesn't work.
6616 Note that while the fact that we emit code for loads at the
6617 first load should make this a non-problem, leafs we construct
6618 from scalars are vectorized after the last scalar def.
6619 ??? If we'd actually compute the insert location during
6620 analysis we could use sth less conservative than the last
6621 scalar stmt in the node for the dominance check. */
6622 /* ??? What remains is "live" uses in vector CTORs in the same
6623 SLP graph which is where those uses can end up code-generated
6624 right after their definition instead of close to their original
6625 use. But that would restrict us to code-generating lane-extracts
6626 from the latest stmt in a node. So we compensate for this
6627 during code-generation, simply not replacing uses for those
6628 hopefully rare cases. */
6629 imm_use_iterator use_iter;
6630 gimple *use_stmt;
6631 stmt_vec_info use_stmt_info;
6632
6633 if (STMT_VINFO_LIVE_P (stmt_info))
6634 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6635 if (!is_gimple_debug (use_stmt)
6636 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6637 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6638 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6639 {
6640 if (dump_enabled_p ())
6641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6642 "Cannot determine insertion place for "
6643 "lane extract\n");
6644 STMT_VINFO_LIVE_P (stmt_info) = false;
6645 mark_visited = true;
6646 }
6647 }
6648 if (mark_visited)
6649 svisited.add (stmt_info);
6650 }
6651
6652 slp_tree child;
6653 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6654 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6655 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
6656 scalar_use_map, svisited, visited);
6657 }
6658
6659 /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
6660 are live outside of the basic-block vectorized region and that can be
6661 vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
6662
6663 static void
6664 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
6665 {
6666 if (bb_vinfo->slp_instances.is_empty ())
6667 return;
6668
6669 hash_set<stmt_vec_info> svisited;
6670 hash_set<slp_tree> visited;
6671 hash_map<tree, int> scalar_use_map;
6672 auto_vec<slp_tree> worklist;
6673
6674 for (slp_instance instance : bb_vinfo->slp_instances)
6675 {
6676 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
6677 for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
6678 if (TREE_CODE (op) == SSA_NAME)
6679 scalar_use_map.put (op, 1);
6680 if (!visited.add (SLP_INSTANCE_TREE (instance)))
6681 worklist.safe_push (SLP_INSTANCE_TREE (instance));
6682 }
6683
6684 do
6685 {
6686 slp_tree node = worklist.pop ();
6687
6688 if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
6689 {
6690 for (tree op : SLP_TREE_SCALAR_OPS (node))
6691 if (TREE_CODE (op) == SSA_NAME)
6692 scalar_use_map.put (op, 1);
6693 }
6694 else
6695 {
6696 for (slp_tree child : SLP_TREE_CHILDREN (node))
6697 if (child && !visited.add (child))
6698 worklist.safe_push (child);
6699 }
6700 }
6701 while (!worklist.is_empty ());
6702
6703 visited.empty ();
6704
6705 for (slp_instance instance : bb_vinfo->slp_instances)
6706 {
6707 vect_location = instance->location ();
6708 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6709 instance, &instance->cost_vec,
6710 scalar_use_map, svisited, visited);
6711 }
6712 }
6713
6714 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6715
6716 static bool
6717 vectorizable_bb_reduc_epilogue (slp_instance instance,
6718 stmt_vector_for_cost *cost_vec)
6719 {
6720 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6721 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6722 if (reduc_code == MINUS_EXPR)
6723 reduc_code = PLUS_EXPR;
6724 internal_fn reduc_fn;
6725 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6726 if (!vectype
6727 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6728 || reduc_fn == IFN_LAST
6729 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6730 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6731 TREE_TYPE (vectype)))
6732 {
6733 if (dump_enabled_p ())
6734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6735 "not vectorized: basic block reduction epilogue "
6736 "operation unsupported.\n");
6737 return false;
6738 }
6739
6740 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6741 cost log2 vector operations plus shuffles and one extraction. */
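  /* Illustrative numbers only: for a four-element vectype,
     vect_nunits_for_cost returns 4, so STEPS is floor_log2 (4) == 2 and we
     record two vector_stmt, two vec_perm and one vec_to_scalar cost below.  */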
6742 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6743 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6744 vectype, 0, vect_body);
6745 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6746 vectype, 0, vect_body);
6747 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6748 vectype, 0, vect_body);
6749
6750 /* Since we replace all stmts of a possibly longer scalar reduction
6751 chain account for the extra scalar stmts for that. */
6752 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
6753 instance->root_stmts[0], 0, vect_body);
6754 return true;
6755 }
6756
6757 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6758 and recurse to children. */
6759
6760 static void
6761 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6762 hash_set<slp_tree> &visited)
6763 {
6764 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6765 || visited.add (node))
6766 return;
6767
6768 stmt_vec_info stmt;
6769 unsigned i;
6770 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6771 roots.remove (vect_orig_stmt (stmt));
6772
6773 slp_tree child;
6774 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6775 if (child)
6776 vect_slp_prune_covered_roots (child, roots, visited);
6777 }
6778
6779 /* Analyze statements in SLP instances of VINFO. Return true if the
6780 operations are supported. */
6781
6782 bool
6783 vect_slp_analyze_operations (vec_info *vinfo)
6784 {
6785 slp_instance instance;
6786 int i;
6787
6788 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6789
6790 hash_set<slp_tree> visited;
6791 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6792 {
6793 auto_vec<slp_tree> visited_vec;
6794 stmt_vector_for_cost cost_vec;
6795 cost_vec.create (2);
6796 if (is_a <bb_vec_info> (vinfo))
6797 vect_location = instance->location ();
6798 if (!vect_slp_analyze_node_operations (vinfo,
6799 SLP_INSTANCE_TREE (instance),
6800 instance, visited, visited_vec,
6801 &cost_vec)
6802 /* CTOR instances require vectorized defs for the SLP tree root. */
6803 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6804 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6805 != vect_internal_def
6806 /* Make sure we vectorized with the expected type. */
6807 || !useless_type_conversion_p
6808 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6809 (instance->root_stmts[0]->stmt))),
6810 TREE_TYPE (SLP_TREE_VECTYPE
6811 (SLP_INSTANCE_TREE (instance))))))
6812 /* Check we can vectorize the reduction. */
6813 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6814 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6815 {
6816 slp_tree node = SLP_INSTANCE_TREE (instance);
6817 stmt_vec_info stmt_info;
6818 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6819 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6820 else
6821 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6822 if (dump_enabled_p ())
6823 dump_printf_loc (MSG_NOTE, vect_location,
6824 "removing SLP instance operations starting from: %G",
6825 stmt_info->stmt);
6826 vect_free_slp_instance (instance);
6827 vinfo->slp_instances.ordered_remove (i);
6828 cost_vec.release ();
6829 while (!visited_vec.is_empty ())
6830 visited.remove (visited_vec.pop ());
6831 }
6832 else
6833 {
6834 i++;
6835 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6836 {
6837 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6838 cost_vec.release ();
6839 }
6840 else
6841 /* For BB vectorization remember the SLP graph entry
6842 cost for later. */
6843 instance->cost_vec = cost_vec;
6844 }
6845 }
6846
6847 /* Now look for SLP instances with a root that are covered by other
6848 instances and remove them. */
6849 hash_set<stmt_vec_info> roots;
6850 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6851 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6852 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6853 if (!roots.is_empty ())
6854 {
6855 visited.empty ();
6856 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6857 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6858 visited);
6859 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6860 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6861 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6862 {
6863 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6864 if (dump_enabled_p ())
6865 dump_printf_loc (MSG_NOTE, vect_location,
6866 "removing SLP instance operations starting "
6867 "from: %G", root->stmt);
6868 vect_free_slp_instance (instance);
6869 vinfo->slp_instances.ordered_remove (i);
6870 }
6871 else
6872 ++i;
6873 }
6874
6875 /* Compute vectorizable live stmts. */
6876 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6877 vect_bb_slp_mark_live_stmts (bb_vinfo);
6878
6879 return !vinfo->slp_instances.is_empty ();
6880 }
6881
6882 /* Get the SLP instance leader from INSTANCE_LEADER, thereby transitively
6883 closing any intermediate chain (union-find style path compression). */
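/* A minimal illustration with hypothetical instances A, B and C: for a
   recorded chain A -> B -> C -> C this returns C and rewrites the entries
   for A and B to point directly at C.  */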
6884
6885 static slp_instance
6886 get_ultimate_leader (slp_instance instance,
6887 hash_map<slp_instance, slp_instance> &instance_leader)
6888 {
6889 auto_vec<slp_instance *, 8> chain;
6890 slp_instance *tem;
6891 while (*(tem = instance_leader.get (instance)) != instance)
6892 {
6893 chain.safe_push (tem);
6894 instance = *tem;
6895 }
6896 while (!chain.is_empty ())
6897 *chain.pop () = instance;
6898 return instance;
6899 }
6900
6901 namespace {
6902 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6903 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6904 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6905
6906 INSTANCE_LEADER is as for get_ultimate_leader. */
6907
6908 template<typename T>
6909 bool
6910 vect_map_to_instance (slp_instance instance, T key,
6911 hash_map<T, slp_instance> &key_to_instance,
6912 hash_map<slp_instance, slp_instance> &instance_leader)
6913 {
6914 bool existed_p;
6915 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6916 if (!existed_p)
6917 ;
6918 else if (key_instance != instance)
6919 {
6920 /* If we're running into a previously marked key make us the
6921 leader of the current ultimate leader. This keeps the
6922 leader chain acyclic and works even when the current instance
6923 connects two previously independent graph parts. */
6924 slp_instance key_leader
6925 = get_ultimate_leader (key_instance, instance_leader);
6926 if (key_leader != instance)
6927 instance_leader.put (key_leader, instance);
6928 }
6929 key_instance = instance;
6930 return existed_p;
6931 }
6932 }
6933
6934 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6935
6936 static void
6937 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6938 slp_instance instance, slp_tree node,
6939 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6940 hash_map<slp_tree, slp_instance> &node_to_instance,
6941 hash_map<slp_instance, slp_instance> &instance_leader)
6942 {
6943 stmt_vec_info stmt_info;
6944 unsigned i;
6945
6946 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6947 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6948 instance_leader);
6949
6950 if (vect_map_to_instance (instance, node, node_to_instance,
6951 instance_leader))
6952 return;
6953
6954 slp_tree child;
6955 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6956 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6957 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6958 node_to_instance, instance_leader);
6959 }
6960
6961 /* Partition the SLP graph into pieces that can be costed independently. */
6962
6963 static void
6964 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6965 {
6966 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6967
6968 /* First walk the SLP graph assigning each involved scalar stmt a
6969 corresponding SLP graph entry and upon visiting a previously
6970 marked stmt, make the stmt's leader the current SLP graph entry. */
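  /* As an illustration (hypothetical instances): if instances I1 and I2 both
     contain scalar stmt S, visiting S again while walking I2 records I1's
     current ultimate leader as being led by I2, so both instances end up in
     the same subgraph below.  */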
6971 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6972 hash_map<slp_tree, slp_instance> node_to_instance;
6973 hash_map<slp_instance, slp_instance> instance_leader;
6974 slp_instance instance;
6975 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6976 {
6977 instance_leader.put (instance, instance);
6978 vect_bb_partition_graph_r (bb_vinfo,
6979 instance, SLP_INSTANCE_TREE (instance),
6980 stmt_to_instance, node_to_instance,
6981 instance_leader);
6982 }
6983
6984 /* Then collect entries to each independent subgraph. */
6985 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6986 {
6987 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6988 leader->subgraph_entries.safe_push (instance);
6989 if (dump_enabled_p ()
6990 && leader != instance)
6991 dump_printf_loc (MSG_NOTE, vect_location,
6992 "instance %p is leader of %p\n",
6993 (void *) leader, (void *) instance);
6994 }
6995 }
6996
6997 /* Compute the set of scalar stmts participating in internal and external
6998 nodes. */
6999
7000 static void
7001 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
7002 hash_set<slp_tree> &visited,
7003 hash_set<stmt_vec_info> &vstmts,
7004 hash_set<stmt_vec_info> &estmts)
7005 {
7006 int i;
7007 stmt_vec_info stmt_info;
7008 slp_tree child;
7009
7010 if (visited.add (node))
7011 return;
7012
7013 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
7014 {
7015 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7016 vstmts.add (stmt_info);
7017
7018 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7019 if (child)
7020 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
7021 vstmts, estmts);
7022 }
7023 else
7024 for (tree def : SLP_TREE_SCALAR_OPS (node))
7025 {
7026 stmt_vec_info def_stmt = vinfo->lookup_def (def);
7027 if (def_stmt)
7028 estmts.add (def_stmt);
7029 }
7030 }
7031
7032
7033 /* Compute the scalar cost of the SLP node NODE and its children
7034 and record it in COST_VEC. Do not account defs that are marked in LIFE
7035 and update LIFE according to uses of NODE. */
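/* A set bit LIFE[i] means the scalar stmt of lane i is (or has become) live
   outside the vectorized parts and is therefore not counted; the loop below
   marks further lanes live when it finds a non-vectorized use of their
   defs.  */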
7036
7037 static void
7038 vect_bb_slp_scalar_cost (vec_info *vinfo,
7039 slp_tree node, vec<bool, va_heap> *life,
7040 stmt_vector_for_cost *cost_vec,
7041 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
7042 hash_set<slp_tree> &visited)
7043 {
7044 unsigned i;
7045 stmt_vec_info stmt_info;
7046 slp_tree child;
7047
7048 if (visited.add (node))
7049 return;
7050
7051 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7052 {
7053 ssa_op_iter op_iter;
7054 def_operand_p def_p;
7055
7056 if ((*life)[i])
7057 continue;
7058
7059 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
7060 gimple *orig_stmt = orig_stmt_info->stmt;
7061
7062 /* If there is a non-vectorized use of the defs then the scalar
7063 stmt is kept live in which case we do not account it or any
7064 required defs in the SLP children in the scalar cost. This
7065 way we make the vectorization more costly when compared to
7066 the scalar cost. */
7067 if (!STMT_VINFO_LIVE_P (stmt_info))
7068 {
7069 auto_vec<gimple *, 8> worklist;
7070 hash_set<gimple *> *worklist_visited = NULL;
7071 worklist.quick_push (orig_stmt);
7072 do
7073 {
7074 gimple *work_stmt = worklist.pop ();
7075 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
7076 {
7077 imm_use_iterator use_iter;
7078 gimple *use_stmt;
7079 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
7080 DEF_FROM_PTR (def_p))
7081 if (!is_gimple_debug (use_stmt))
7082 {
7083 stmt_vec_info use_stmt_info
7084 = vinfo->lookup_stmt (use_stmt);
7085 if (!use_stmt_info
7086 || !vectorized_scalar_stmts.contains (use_stmt_info))
7087 {
7088 if (use_stmt_info
7089 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
7090 {
7091 /* For stmts participating in patterns we have
7092 to check its uses recursively. */
7093 if (!worklist_visited)
7094 worklist_visited = new hash_set<gimple *> ();
7095 if (!worklist_visited->add (use_stmt))
7096 worklist.safe_push (use_stmt);
7097 continue;
7098 }
7099 (*life)[i] = true;
7100 goto next_lane;
7101 }
7102 }
7103 }
7104 }
7105 while (!worklist.is_empty ());
7106 next_lane:
7107 if (worklist_visited)
7108 delete worklist_visited;
7109 if ((*life)[i])
7110 continue;
7111 }
7112
7113 /* Count scalar stmts only once. */
7114 if (gimple_visited_p (orig_stmt))
7115 continue;
7116 gimple_set_visited (orig_stmt, true);
7117
7118 vect_cost_for_stmt kind;
7119 if (STMT_VINFO_DATA_REF (orig_stmt_info))
7120 {
7121 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
7122 kind = scalar_load;
7123 else
7124 kind = scalar_store;
7125 }
7126 else if (vect_nop_conversion_p (orig_stmt_info))
7127 continue;
7128 /* For single-argument PHIs assume coalescing which means zero cost
7129 for the scalar and the vector PHIs. This avoids artificially
7130 favoring the vector path (but may pessimize it in some cases). */
7131 else if (is_a <gphi *> (orig_stmt_info->stmt)
7132 && gimple_phi_num_args
7133 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
7134 continue;
7135 else
7136 kind = scalar_stmt;
7137 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
7138 SLP_TREE_VECTYPE (node), 0, vect_body);
7139 }
7140
7141 auto_vec<bool, 20> subtree_life;
7142 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7143 {
7144 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7145 {
7146 /* Do not directly pass LIFE to the recursive call, copy it to
7147 confine changes in the callee to the current child/subtree. */
7148 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7149 {
7150 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
7151 for (unsigned j = 0;
7152 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
7153 {
7154 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
7155 if (perm.first == i)
7156 subtree_life[perm.second] = (*life)[j];
7157 }
7158 }
7159 else
7160 {
7161 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
7162 subtree_life.safe_splice (*life);
7163 }
7164 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
7165 vectorized_scalar_stmts, visited);
7166 subtree_life.truncate (0);
7167 }
7168 }
7169 }
7170
7171 /* Comparator for the loop-index sorted cost vectors. */
7172
7173 static int
7174 li_cost_vec_cmp (const void *a_, const void *b_)
7175 {
7176 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
7177 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
7178 if (a->first < b->first)
7179 return -1;
7180 else if (a->first == b->first)
7181 return 0;
7182 return 1;
7183 }
7184
7185 /* Check if vectorization of the basic block is profitable for the
7186 subgraph denoted by SLP_INSTANCES. */
7187
7188 static bool
7189 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
7190 vec<slp_instance> slp_instances,
7191 loop_p orig_loop)
7192 {
7193 slp_instance instance;
7194 int i;
7195 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
7196 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
7197
7198 if (dump_enabled_p ())
7199 {
7200 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
7201 hash_set<slp_tree> visited;
7202 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7203 vect_print_slp_graph (MSG_NOTE, vect_location,
7204 SLP_INSTANCE_TREE (instance), visited);
7205 }
7206
7207 /* Compute the set of scalar stmts we know will go away 'locally' when
7208 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
7209 not accurate for nodes promoted extern late or for scalar stmts that
7210 are used both in extern defs and in vectorized defs. */
7211 hash_set<stmt_vec_info> vectorized_scalar_stmts;
7212 hash_set<stmt_vec_info> scalar_stmts_in_externs;
7213 hash_set<slp_tree> visited;
7214 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7215 {
7216 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
7217 SLP_INSTANCE_TREE (instance),
7218 visited,
7219 vectorized_scalar_stmts,
7220 scalar_stmts_in_externs);
7221 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
7222 vectorized_scalar_stmts.add (rstmt);
7223 }
7224 /* Scalar stmts used as defs in external nodes need to be preserved, so
7225 remove them from vectorized_scalar_stmts. */
7226 for (stmt_vec_info stmt : scalar_stmts_in_externs)
7227 vectorized_scalar_stmts.remove (stmt);
7228
7229 /* Calculate scalar cost and sum the cost for the vector stmts
7230 previously collected. */
7231 stmt_vector_for_cost scalar_costs = vNULL;
7232 stmt_vector_for_cost vector_costs = vNULL;
7233 visited.empty ();
7234 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7235 {
7236 auto_vec<bool, 20> life;
7237 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
7238 true);
7239 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7240 record_stmt_cost (&scalar_costs,
7241 SLP_INSTANCE_ROOT_STMTS (instance).length (),
7242 scalar_stmt,
7243 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
7244 vect_bb_slp_scalar_cost (bb_vinfo,
7245 SLP_INSTANCE_TREE (instance),
7246 &life, &scalar_costs, vectorized_scalar_stmts,
7247 visited);
7248 vector_costs.safe_splice (instance->cost_vec);
7249 instance->cost_vec.release ();
7250 }
7251
7252 if (dump_enabled_p ())
7253 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
7254
7255 /* When costing non-loop vectorization we need to consider each covered
7256 loop independently and make sure vectorization is profitable. For
7257 now we assume a loop may not be entered or may execute an arbitrary
7258 number of iterations (??? static information can provide more
7259 precise info here) which means we can simply cost each containing
7260 loop's stmts separately. */
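  /* Illustrative example: if the region spans both a loop preheader and the
     loop body, the stmts are keyed by their containing loop's number and the
     two parts are costed and compared separately below.  */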
7261
7262 /* First produce cost vectors sorted by loop index. */
7263 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7264 li_scalar_costs (scalar_costs.length ());
7265 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7266 li_vector_costs (vector_costs.length ());
7267 stmt_info_for_cost *cost;
7268 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7269 {
7270 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7271 li_scalar_costs.quick_push (std::make_pair (l, cost));
7272 }
7273 /* Use an arbitrary used loop as fallback in case the first vector_costs
7274 entry does not have a stmt_info associated with it. */
7275 unsigned l = li_scalar_costs[0].first;
7276 FOR_EACH_VEC_ELT (vector_costs, i, cost)
7277 {
7278 /* We inherit the loop from the previous COST; invariants, externals and
7279 extracts immediately follow the cost for the related stmt. */
7280 if (cost->stmt_info)
7281 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7282 li_vector_costs.quick_push (std::make_pair (l, cost));
7283 }
7284 li_scalar_costs.qsort (li_cost_vec_cmp);
7285 li_vector_costs.qsort (li_cost_vec_cmp);
7286
7287 /* Now cost the portions individually. */
7288 unsigned vi = 0;
7289 unsigned si = 0;
7290 bool profitable = true;
7291 while (si < li_scalar_costs.length ()
7292 && vi < li_vector_costs.length ())
7293 {
7294 unsigned sl = li_scalar_costs[si].first;
7295 unsigned vl = li_vector_costs[vi].first;
7296 if (sl != vl)
7297 {
7298 if (dump_enabled_p ())
7299 dump_printf_loc (MSG_NOTE, vect_location,
7300 "Scalar %d and vector %d loop part do not "
7301 "match up, skipping scalar part\n", sl, vl);
7302 /* Skip the scalar part, assuming zero cost on the vector side. */
7303 do
7304 {
7305 si++;
7306 }
7307 while (si < li_scalar_costs.length ()
7308 && li_scalar_costs[si].first == sl);
7309 continue;
7310 }
7311
7312 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7313 do
7314 {
7315 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7316 si++;
7317 }
7318 while (si < li_scalar_costs.length ()
7319 && li_scalar_costs[si].first == sl);
7320 unsigned dummy;
7321 finish_cost (scalar_target_cost_data, nullptr,
7322 &dummy, &scalar_cost, &dummy);
7323
7324 /* Complete the target-specific vector cost calculation. */
7325 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7326 do
7327 {
7328 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7329 vi++;
7330 }
7331 while (vi < li_vector_costs.length ()
7332 && li_vector_costs[vi].first == vl);
7333 finish_cost (vect_target_cost_data, scalar_target_cost_data,
7334 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7335 delete scalar_target_cost_data;
7336 delete vect_target_cost_data;
7337
7338 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7339
7340 if (dump_enabled_p ())
7341 {
7342 dump_printf_loc (MSG_NOTE, vect_location,
7343 "Cost model analysis for part in loop %d:\n", sl);
7344 dump_printf (MSG_NOTE, " Vector cost: %d\n",
7345 vec_inside_cost + vec_outside_cost);
7346 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7347 }
7348
7349 /* Vectorization is profitable if its cost is no more than the cost of the
7350 scalar version. Note that we err on the vector side for equal cost because
7351 the cost estimate is otherwise quite pessimistic (constant uses are
7352 free on the scalar side but cost a load on the vector side for
7353 example). */
7354 if (vec_outside_cost + vec_inside_cost > scalar_cost)
7355 {
7356 profitable = false;
7357 break;
7358 }
7359 }
7360 if (profitable && vi < li_vector_costs.length ())
7361 {
7362 if (dump_enabled_p ())
7363 dump_printf_loc (MSG_NOTE, vect_location,
7364 "Excess vector cost for part in loop %d:\n",
7365 li_vector_costs[vi].first);
7366 profitable = false;
7367 }
7368
7369 /* Unset visited flag. This is delayed when the subgraph is profitable
7370 and we process the loop for remaining unvectorized if-converted code. */
7371 if (!orig_loop || !profitable)
7372 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7373 gimple_set_visited (cost->stmt_info->stmt, false);
7374
7375 scalar_costs.release ();
7376 vector_costs.release ();
7377
7378 return profitable;
7379 }
7380
7381 /* qsort comparator for lane defs. */
7382
7383 static int
7384 vld_cmp (const void *a_, const void *b_)
7385 {
7386 auto *a = (const std::pair<unsigned, tree> *)a_;
7387 auto *b = (const std::pair<unsigned, tree> *)b_;
7388 return a->first - b->first;
7389 }
7390
7391 /* Return true if USE_STMT is a vector lane insert into VEC and set
7392 *THIS_LANE to the lane number that is set. */
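/* Hypothetical GIMPLE example: for
     vec_2 = BIT_INSERT_EXPR <vec_1, s_3, 64>
   with 32-bit vector elements this matches VEC == vec_1 and sets *THIS_LANE
   to 2.  */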
7393
7394 static bool
7395 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7396 {
7397 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7398 if (!use_ass
7399 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7400 || (vec
7401 ? gimple_assign_rhs1 (use_ass) != vec
7402 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7403 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7404 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7405 || !constant_multiple_p
7406 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7407 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7408 this_lane))
7409 return false;
7410 return true;
7411 }
7412
7413 /* Find any vectorizable constructors and add them to the grouped_store
7414 array. */
7415
7416 static void
7417 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7418 {
7419 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7420 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7421 !gsi_end_p (gsi); gsi_next (&gsi))
7422 {
7423 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7424 if (!assign)
7425 continue;
7426
7427 tree rhs = gimple_assign_rhs1 (assign);
7428 enum tree_code code = gimple_assign_rhs_code (assign);
7429 use_operand_p use_p;
7430 gimple *use_stmt;
7431 if (code == CONSTRUCTOR)
7432 {
7433 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7434 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7435 CONSTRUCTOR_NELTS (rhs))
7436 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7437 || uniform_vector_p (rhs))
7438 continue;
7439
7440 unsigned j;
7441 tree val;
7442 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7443 if (TREE_CODE (val) != SSA_NAME
7444 || !bb_vinfo->lookup_def (val))
7445 break;
7446 if (j != CONSTRUCTOR_NELTS (rhs))
7447 continue;
7448
7449 vec<stmt_vec_info> roots = vNULL;
7450 roots.safe_push (bb_vinfo->lookup_stmt (assign));
7451 vec<stmt_vec_info> stmts;
7452 stmts.create (CONSTRUCTOR_NELTS (rhs));
7453 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7454 stmts.quick_push
7455 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7456 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7457 stmts, roots));
7458 }
7459 else if (code == BIT_INSERT_EXPR
7460 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7461 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7462 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7463 && integer_zerop (gimple_assign_rhs3 (assign))
7464 && useless_type_conversion_p
7465 (TREE_TYPE (TREE_TYPE (rhs)),
7466 TREE_TYPE (gimple_assign_rhs2 (assign)))
7467 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7468 {
7469 /* We start to match on insert to lane zero but since the
7470 inserts need not be ordered we'd have to search both
7471 the def and the use chains. */
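	      /* A hypothetical chain we can match here, assuming a four-lane
		 vector with 32-bit elements:
		   v_1 = BIT_INSERT_EXPR <v_0, a_0, 0>;
		   v_2 = BIT_INSERT_EXPR <v_1, a_1, 32>;
		   v_3 = BIT_INSERT_EXPR <v_2, a_2, 64>;
		   v_4 = BIT_INSERT_EXPR <v_3, a_3, 96>;
		 Following single uses from the lane-zero insert (and the def
		 chain of its vector operand when lanes remain) collects one
		 scalar def per lane.  */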
7472 tree vectype = TREE_TYPE (rhs);
7473 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7474 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7475 auto_sbitmap lanes (nlanes);
7476 bitmap_clear (lanes);
7477 bitmap_set_bit (lanes, 0);
7478 tree def = gimple_assign_lhs (assign);
7479 lane_defs.quick_push
7480 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7481 unsigned lanes_found = 1;
7482 /* Start with the use chains, the last stmt will be the root. */
7483 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7484 vec<stmt_vec_info> roots = vNULL;
7485 roots.safe_push (last);
7486 do
7487 {
7488 use_operand_p use_p;
7489 gimple *use_stmt;
7490 if (!single_imm_use (def, &use_p, &use_stmt))
7491 break;
7492 unsigned this_lane;
7493 if (!bb_vinfo->lookup_stmt (use_stmt)
7494 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7495 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7496 break;
7497 if (bitmap_bit_p (lanes, this_lane))
7498 break;
7499 lanes_found++;
7500 bitmap_set_bit (lanes, this_lane);
7501 gassign *use_ass = as_a <gassign *> (use_stmt);
7502 lane_defs.quick_push (std::make_pair
7503 (this_lane, gimple_assign_rhs2 (use_ass)));
7504 last = bb_vinfo->lookup_stmt (use_ass);
7505 roots.safe_push (last);
7506 def = gimple_assign_lhs (use_ass);
7507 }
7508 while (lanes_found < nlanes);
7509 if (roots.length () > 1)
7510 std::swap (roots[0], roots[roots.length () - 1]);
7511 if (lanes_found < nlanes)
7512 {
7513 /* Now search the def chain. */
7514 def = gimple_assign_rhs1 (assign);
7515 do
7516 {
7517 if (TREE_CODE (def) != SSA_NAME
7518 || !has_single_use (def))
7519 break;
7520 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7521 unsigned this_lane;
7522 if (!bb_vinfo->lookup_stmt (def_stmt)
7523 || !vect_slp_is_lane_insert (def_stmt,
7524 NULL_TREE, &this_lane)
7525 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7526 break;
7527 if (bitmap_bit_p (lanes, this_lane))
7528 break;
7529 lanes_found++;
7530 bitmap_set_bit (lanes, this_lane);
7531 lane_defs.quick_push (std::make_pair
7532 (this_lane,
7533 gimple_assign_rhs2 (def_stmt)));
7534 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7535 def = gimple_assign_rhs1 (def_stmt);
7536 }
7537 while (lanes_found < nlanes);
7538 }
7539 if (lanes_found == nlanes)
7540 {
7541 /* Sort lane_defs by lane index and register the root. */
7542 lane_defs.qsort (vld_cmp);
7543 vec<stmt_vec_info> stmts;
7544 stmts.create (nlanes);
7545 for (unsigned i = 0; i < nlanes; ++i)
7546 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7547 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7548 stmts, roots));
7549 }
7550 else
7551 roots.release ();
7552 }
7553 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7554 && (associative_tree_code (code) || code == MINUS_EXPR)
7555 /* ??? This pessimizes a two-element reduction. PR54400.
7556 ??? In-order reduction could be handled if we only
7557 traverse one operand chain in vect_slp_linearize_chain. */
7558 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7559 /* Ops with constants at the tail can be stripped here. */
7560 && TREE_CODE (rhs) == SSA_NAME
7561 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7562 /* Should be the chain end. */
7563 && (!single_imm_use (gimple_assign_lhs (assign),
7564 &use_p, &use_stmt)
7565 || !is_gimple_assign (use_stmt)
7566 || (gimple_assign_rhs_code (use_stmt) != code
7567 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7568 || (gimple_assign_rhs_code (use_stmt)
7569 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7570 {
7571 /* We start the match at the end of a possible association
7572 chain. */
7573 auto_vec<chain_op_t> chain;
7574 auto_vec<std::pair<tree_code, gimple *> > worklist;
7575 auto_vec<gimple *> chain_stmts;
7576 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7577 if (code == MINUS_EXPR)
7578 code = PLUS_EXPR;
7579 internal_fn reduc_fn;
7580 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7581 || reduc_fn == IFN_LAST)
7582 continue;
7583 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7584 /* ??? */
7585 code_stmt, alt_code_stmt, &chain_stmts);
7586 if (chain.length () > 1)
7587 {
7588 /* Sort the chain according to def_type and operation. */
7589 chain.sort (dt_sort_cmp, bb_vinfo);
7590 /* ??? Now we'd want to strip externals and constants
7591 but record those to be handled in the epilogue. */
7592 /* ??? For now do not allow mixing ops or externs/constants. */
7593 bool invalid = false;
7594 unsigned remain_cnt = 0;
7595 unsigned last_idx = 0;
7596 for (unsigned i = 0; i < chain.length (); ++i)
7597 {
7598 if (chain[i].code != code)
7599 {
7600 invalid = true;
7601 break;
7602 }
7603 if (chain[i].dt != vect_internal_def
7604 /* Avoid stmts where the def is not the LHS, like
7605 ASMs. */
7606 || (gimple_get_lhs (bb_vinfo->lookup_def
7607 (chain[i].op)->stmt)
7608 != chain[i].op))
7609 remain_cnt++;
7610 else
7611 last_idx = i;
7612 }
7613 /* Make sure to have an even number of lanes as we later do
7614 all-or-nothing discovery, not trying to split further. */
7615 if ((chain.length () - remain_cnt) & 1)
7616 remain_cnt++;
7617 if (!invalid && chain.length () - remain_cnt > 1)
7618 {
7619 vec<stmt_vec_info> stmts;
7620 vec<tree> remain = vNULL;
7621 stmts.create (chain.length ());
7622 if (remain_cnt > 0)
7623 remain.create (remain_cnt);
7624 for (unsigned i = 0; i < chain.length (); ++i)
7625 {
7626 stmt_vec_info stmt_info;
7627 if (chain[i].dt == vect_internal_def
7628 && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
7629 gimple_get_lhs (stmt_info->stmt) == chain[i].op)
7630 && (i != last_idx
7631 || (stmts.length () & 1)))
7632 stmts.quick_push (stmt_info);
7633 else
7634 remain.quick_push (chain[i].op);
7635 }
7636 vec<stmt_vec_info> roots;
7637 roots.create (chain_stmts.length ());
7638 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7639 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7640 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7641 stmts, roots, remain));
7642 }
7643 }
7644 }
7645 }
7646 }
7647
7648 /* Walk the grouped store chains and replace entries with their
7649 pattern variant if any. */
7650
7651 static void
7652 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7653 {
7654 stmt_vec_info first_element;
7655 unsigned i;
7656
7657 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7658 {
7659 /* We also have CTORs in this array. */
7660 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7661 continue;
7662 if (STMT_VINFO_IN_PATTERN_P (first_element))
7663 {
7664 stmt_vec_info orig = first_element;
7665 first_element = STMT_VINFO_RELATED_STMT (first_element);
7666 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7667 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7668 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7669 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7670 vinfo->grouped_stores[i] = first_element;
7671 }
7672 stmt_vec_info prev = first_element;
7673 while (DR_GROUP_NEXT_ELEMENT (prev))
7674 {
7675 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7676 if (STMT_VINFO_IN_PATTERN_P (elt))
7677 {
7678 stmt_vec_info orig = elt;
7679 elt = STMT_VINFO_RELATED_STMT (elt);
7680 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7681 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7682 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7683 }
7684 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7685 prev = elt;
7686 }
7687 }
7688 }
7689
7690 /* Check if the region described by BB_VINFO can be vectorized, returning
7691 true if so. When returning false, set FATAL to true if the same failure
7692 would prevent vectorization at other vector sizes, false if it is still
7693 worth trying other sizes. N_STMTS is the number of statements in the
7694 region. */
7695
7696 static bool
7697 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7698 vec<int> *dataref_groups)
7699 {
7700 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7701
7702 slp_instance instance;
7703 int i;
7704 poly_uint64 min_vf = 2;
7705
7706 /* The first group of checks is independent of the vector size. */
7707 fatal = true;
7708
7709 /* Analyze the data references. */
7710
7711 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7712 {
7713 if (dump_enabled_p ())
7714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7715 "not vectorized: unhandled data-ref in basic "
7716 "block.\n");
7717 return false;
7718 }
7719
7720 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7721 {
7722 if (dump_enabled_p ())
7723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7724 "not vectorized: unhandled data access in "
7725 "basic block.\n");
7726 return false;
7727 }
7728
7729 vect_slp_check_for_roots (bb_vinfo);
7730
7731 /* If there are no grouped stores and no constructors in the region
7732 there is no need to continue with pattern recog as vect_analyze_slp
7733 will fail anyway. */
7734 if (bb_vinfo->grouped_stores.is_empty ()
7735 && bb_vinfo->roots.is_empty ())
7736 {
7737 if (dump_enabled_p ())
7738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7739 "not vectorized: no grouped stores in "
7740 "basic block.\n");
7741 return false;
7742 }
7743
7744 /* Whereas the rest of the analysis below depends on the vector size in some way. */
7745 fatal = false;
7746
7747 vect_pattern_recog (bb_vinfo);
7748
7749 /* Update store groups from pattern processing. */
7750 vect_fixup_store_groups_with_patterns (bb_vinfo);
7751
7752 /* Check the SLP opportunities in the basic block, analyze and build SLP
7753 trees. */
7754 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7755 {
7756 if (dump_enabled_p ())
7757 {
7758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7759 "Failed to SLP the basic block.\n");
7760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7761 "not vectorized: failed to find SLP opportunities "
7762 "in basic block.\n");
7763 }
7764 return false;
7765 }
7766
7767 /* Optimize permutations. */
7768 vect_optimize_slp (bb_vinfo);
7769
7770 /* Gather the loads reachable from the SLP graph entries. */
7771 vect_gather_slp_loads (bb_vinfo);
7772
7773 vect_record_base_alignments (bb_vinfo);
7774
7775 /* Analyze and verify the alignment of data references and the
7776 dependence in the SLP instances. */
7777 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7778 {
7779 vect_location = instance->location ();
7780 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7781 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7782 {
7783 slp_tree node = SLP_INSTANCE_TREE (instance);
7784 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7785 if (dump_enabled_p ())
7786 dump_printf_loc (MSG_NOTE, vect_location,
7787 "removing SLP instance operations starting from: %G",
7788 stmt_info->stmt);
7789 vect_free_slp_instance (instance);
7790 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7791 continue;
7792 }
7793
7794 /* Mark all the statements that we want to vectorize as pure SLP and
7795 relevant. */
7796 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7797 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7798 unsigned j;
7799 stmt_vec_info root;
7800 /* Likewise consider instance root stmts as vectorized. */
7801 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7802 STMT_SLP_TYPE (root) = pure_slp;
7803
7804 i++;
7805 }
7806 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7807 return false;
7808
7809 if (!vect_slp_analyze_operations (bb_vinfo))
7810 {
7811 if (dump_enabled_p ())
7812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7813 "not vectorized: bad operation in basic block.\n");
7814 return false;
7815 }
7816
7817 vect_bb_partition_graph (bb_vinfo);
7818
7819 return true;
7820 }
7821
7822 /* Subroutine of vect_slp_bbs. Try to vectorize the statements for all
7823 basic blocks in BBS, returning true on success.
7824 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7825
7826 static bool
7827 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7828 vec<int> *dataref_groups, unsigned int n_stmts,
7829 loop_p orig_loop)
7830 {
7831 bb_vec_info bb_vinfo;
7832 auto_vector_modes vector_modes;
7833
7834 /* Autodetect first vector size we try. */
7835 machine_mode next_vector_mode = VOIDmode;
7836 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7837 unsigned int mode_i = 0;
7838
7839 vec_info_shared shared;
7840
7841 machine_mode autodetected_vector_mode = VOIDmode;
7842 while (1)
7843 {
7844 bool vectorized = false;
7845 bool fatal = false;
7846 bb_vinfo = new _bb_vec_info (bbs, &shared);
7847
7848 bool first_time_p = shared.datarefs.is_empty ();
7849 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7850 if (first_time_p)
7851 bb_vinfo->shared->save_datarefs ();
7852 else
7853 bb_vinfo->shared->check_datarefs ();
7854 bb_vinfo->vector_mode = next_vector_mode;
7855
7856 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7857 {
7858 if (dump_enabled_p ())
7859 {
7860 dump_printf_loc (MSG_NOTE, vect_location,
7861 "***** Analysis succeeded with vector mode"
7862 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7863 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7864 }
7865
7866 bb_vinfo->shared->check_datarefs ();
7867
7868 bool force_clear = false;
7869 auto_vec<slp_instance> profitable_subgraphs;
7870 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7871 {
7872 if (instance->subgraph_entries.is_empty ())
7873 continue;
7874
7875 dump_user_location_t saved_vect_location = vect_location;
7876 vect_location = instance->location ();
7877 if (!unlimited_cost_model (NULL)
7878 && !vect_bb_vectorization_profitable_p
7879 (bb_vinfo, instance->subgraph_entries, orig_loop))
7880 {
7881 if (dump_enabled_p ())
7882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7883 "not vectorized: vectorization is not "
7884 "profitable.\n");
7885 vect_location = saved_vect_location;
7886 continue;
7887 }
7888
7889 vect_location = saved_vect_location;
7890 if (!dbg_cnt (vect_slp))
7891 {
7892 force_clear = true;
7893 continue;
7894 }
7895
7896 profitable_subgraphs.safe_push (instance);
7897 }
7898
7899 /* When we're vectorizing an if-converted loop body make sure
7900 we vectorized all if-converted code. */
7901 if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
7902 {
7903 gcc_assert (bb_vinfo->bbs.length () == 1);
7904 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7905 !gsi_end_p (gsi); gsi_next (&gsi))
7906 {
7907 /* The costing above left us with DCEable vectorized scalar
7908 stmts having the visited flag set on profitable
7909 subgraphs. Do the delayed clearing of the flag here. */
7910 if (gimple_visited_p (gsi_stmt (gsi)))
7911 {
7912 gimple_set_visited (gsi_stmt (gsi), false);
7913 continue;
7914 }
7915 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7916 continue;
7917
7918 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7919 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7920 {
7921 if (!profitable_subgraphs.is_empty ()
7922 && dump_enabled_p ())
7923 dump_printf_loc (MSG_NOTE, vect_location,
7924 "not profitable because of "
7925 "unprofitable if-converted scalar "
7926 "code\n");
7927 profitable_subgraphs.truncate (0);
7928 }
7929 }
7930 }
7931
7932 /* Finally schedule the profitable subgraphs. */
7933 for (slp_instance instance : profitable_subgraphs)
7934 {
7935 if (!vectorized && dump_enabled_p ())
7936 dump_printf_loc (MSG_NOTE, vect_location,
7937 "Basic block will be vectorized "
7938 "using SLP\n");
7939 vectorized = true;
7940
7941 /* Dump before scheduling as store vectorization will remove
7942 the original stores and mess with the instance tree
7943 so querying its location will eventually ICE. */
7944 if (flag_checking)
7945 for (slp_instance sub : instance->subgraph_entries)
7946 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7947 unsigned HOST_WIDE_INT bytes;
7948 if (dump_enabled_p ())
7949 for (slp_instance sub : instance->subgraph_entries)
7950 {
7951 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7952 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7953 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7954 sub->location (),
7955 "basic block part vectorized using %wu "
7956 "byte vectors\n", bytes);
7957 else
7958 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7959 sub->location (),
7960 "basic block part vectorized using "
7961 "variable length vectors\n");
7962 }
7963
7964 dump_user_location_t saved_vect_location = vect_location;
7965 vect_location = instance->location ();
7966
7967 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7968
7969 vect_location = saved_vect_location;
7970 }
7971 }
7972 else
7973 {
7974 if (dump_enabled_p ())
7975 dump_printf_loc (MSG_NOTE, vect_location,
7976 "***** Analysis failed with vector mode %s\n",
7977 GET_MODE_NAME (bb_vinfo->vector_mode));
7978 }
7979
7980 if (mode_i == 0)
7981 autodetected_vector_mode = bb_vinfo->vector_mode;
7982
7983 if (!fatal)
7984 while (mode_i < vector_modes.length ()
7985 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7986 {
7987 if (dump_enabled_p ())
7988 dump_printf_loc (MSG_NOTE, vect_location,
7989 "***** The result for vector mode %s would"
7990 " be the same\n",
7991 GET_MODE_NAME (vector_modes[mode_i]));
7992 mode_i += 1;
7993 }
7994
7995 delete bb_vinfo;
7996
7997 if (mode_i < vector_modes.length ()
7998 && VECTOR_MODE_P (autodetected_vector_mode)
7999 && (related_vector_mode (vector_modes[mode_i],
8000 GET_MODE_INNER (autodetected_vector_mode))
8001 == autodetected_vector_mode)
8002 && (related_vector_mode (autodetected_vector_mode,
8003 GET_MODE_INNER (vector_modes[mode_i]))
8004 == vector_modes[mode_i]))
8005 {
8006 if (dump_enabled_p ())
8007 dump_printf_loc (MSG_NOTE, vect_location,
8008 "***** Skipping vector mode %s, which would"
8009 " repeat the analysis for %s\n",
8010 GET_MODE_NAME (vector_modes[mode_i]),
8011 GET_MODE_NAME (autodetected_vector_mode));
8012 mode_i += 1;
8013 }
8014
8015 if (vectorized
8016 || mode_i == vector_modes.length ()
8017 || autodetected_vector_mode == VOIDmode
8018 /* If vect_slp_analyze_bb_1 signaled that analysis for all
8019 vector sizes will fail do not bother iterating. */
8020 || fatal)
8021 return vectorized;
8022
8023 /* Try the next biggest vector size. */
8024 next_vector_mode = vector_modes[mode_i++];
8025 if (dump_enabled_p ())
8026 dump_printf_loc (MSG_NOTE, vect_location,
8027 "***** Re-trying analysis with vector mode %s\n",
8028 GET_MODE_NAME (next_vector_mode));
8029 }
8030 }
8031
8032
8033 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
8034 true if anything in the basic blocks was vectorized. */
8035
8036 static bool
8037 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
8038 {
8039 vec<data_reference_p> datarefs = vNULL;
8040 auto_vec<int> dataref_groups;
8041 int insns = 0;
8042 int current_group = 0;
8043
8044 for (unsigned i = 0; i < bbs.length (); i++)
8045 {
8046 basic_block bb = bbs[i];
8047 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
8048 gsi_next (&gsi))
8049 {
8050 gimple *stmt = gsi_stmt (gsi);
8051 if (is_gimple_debug (stmt))
8052 continue;
8053
8054 insns++;
8055
8056 if (gimple_location (stmt) != UNKNOWN_LOCATION)
8057 vect_location = stmt;
8058
8059 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
8060 &dataref_groups, current_group))
8061 ++current_group;
8062 }
8063 /* New BBs always start a new DR group. */
8064 ++current_group;
8065 }
8066
8067 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
8068 }
8069
8070 /* Special entry for the BB vectorizer. Analyze and transform a single
8071 if-converted BB with ORIG_LOOPs body being the not if-converted
8072 representation. Returns true if anything in the basic-block was
8073 vectorized. */
8074
8075 bool
8076 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
8077 {
8078 auto_vec<basic_block> bbs;
8079 bbs.safe_push (bb);
8080 return vect_slp_bbs (bbs, orig_loop);
8081 }
8082
8083 /* Main entry for the BB vectorizer. Analyze and transform FUN, returns
8084 true if anything in the function was vectorized. */
8085
8086 bool
8087 vect_slp_function (function *fun)
8088 {
8089 bool r = false;
8090 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
8091 auto_bitmap exit_bbs;
8092 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
8093 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
8094 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
8095 true, rpo, NULL);
8096
8097 /* For the moment split the function into pieces to avoid making
8098 the iteration on the vector mode moot. Split at points we know
8099 to not handle well, which are CFG merges (SLP discovery doesn't
8100 handle non-loop-header PHIs) and loop exits. Since pattern
8101 recog requires reverse iteration to visit uses before defs
8102 simply chop RPO into pieces. */
8103 auto_vec<basic_block> bbs;
8104 for (unsigned i = 0; i < n; i++)
8105 {
8106 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
8107 bool split = false;
8108
8109 /* Split when a BB is not dominated by the first block. */
8110 if (!bbs.is_empty ()
8111 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
8112 {
8113 if (dump_enabled_p ())
8114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8115 "splitting region at dominance boundary bb%d\n",
8116 bb->index);
8117 split = true;
8118 }
8119 /* Split when the loop determined by the first block
8120 is exited. This is because we eventually insert
8121 invariants at region begin. */
8122 else if (!bbs.is_empty ()
8123 && bbs[0]->loop_father != bb->loop_father
8124 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
8125 {
8126 if (dump_enabled_p ())
8127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8128 "splitting region at loop %d exit at bb%d\n",
8129 bbs[0]->loop_father->num, bb->index);
8130 split = true;
8131 }
8132 else if (!bbs.is_empty ()
8133 && bb->loop_father->header == bb
8134 && bb->loop_father->dont_vectorize)
8135 {
8136 if (dump_enabled_p ())
8137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8138 "splitting region at dont-vectorize loop %d "
8139 "entry at bb%d\n",
8140 bb->loop_father->num, bb->index);
8141 split = true;
8142 }
8143
8144 if (split && !bbs.is_empty ())
8145 {
8146 r |= vect_slp_bbs (bbs, NULL);
8147 bbs.truncate (0);
8148 }
8149
8150 if (bbs.is_empty ())
8151 {
8152 /* We need to be able to insert at the head of the region which
8153 we cannot do for a region starting with a returns-twice call. */
8154 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
8155 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
8156 {
8157 if (dump_enabled_p ())
8158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8159 "skipping bb%d as start of region as it "
8160 "starts with returns-twice call\n",
8161 bb->index);
8162 continue;
8163 }
8164 /* If the loop this BB belongs to is marked as not to be vectorized
8165 honor that also for BB vectorization. */
8166 if (bb->loop_father->dont_vectorize)
8167 continue;
8168 }
8169
8170 bbs.safe_push (bb);
8171
8172 /* When we have a stmt ending this block that defines a
8173 value, inserting after it for a vector containing its definition
8174 would require inserting on edges. Avoid this for now. */
8175 if (gimple *last = *gsi_last_bb (bb))
8176 if (gimple_get_lhs (last)
8177 && is_ctrl_altering_stmt (last))
8178 {
8179 if (dump_enabled_p ())
8180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8181 "splitting region at control altering "
8182 "definition %G", last);
8183 r |= vect_slp_bbs (bbs, NULL);
8184 bbs.truncate (0);
8185 }
8186 }
8187
8188 if (!bbs.is_empty ())
8189 r |= vect_slp_bbs (bbs, NULL);
8190
8191 free (rpo);
8192
8193 return r;
8194 }
8195
8196 /* Build a variable-length vector in which the elements in ELTS are repeated
8197 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
8198 RESULTS and add any new instructions to SEQ.
8199
8200 The approach we use is:
8201
8202 (1) Find a vector mode VM with integer elements of mode IM.
8203
8204 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8205 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
8206 from small vectors to IM.
8207
8208 (3) Duplicate each ELTS'[I] into a vector of mode VM.
8209
8210 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
8211 correct byte contents.
8212
8213 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
8214
8215 We try to find the largest IM for which this sequence works, in order
8216 to cut down on the number of interleaves. */
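/* An illustration under assumed (target-dependent) modes: with
   ELTS = {a, b, c, d} of SImode, step (1) might pick IM = DImode; step (2)
   then view-converts {a, b} and {c, d} to DImode values ab and cd; step (3)
   broadcasts those as {ab, ab, ...} and {cd, cd, ...}; step (4) interleaves
   them into {ab, cd, ab, cd, ...}; and step (5) view-converts the result
   back to VECTOR_TYPE, which then repeats {a, b, c, d} as required.  */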
8217
8218 void
8219 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
8220 const vec<tree> &elts, unsigned int nresults,
8221 vec<tree> &results)
8222 {
8223 unsigned int nelts = elts.length ();
8224 tree element_type = TREE_TYPE (vector_type);
8225
8226 /* (1) Find a vector mode VM with integer elements of mode IM. */
8227 unsigned int nvectors = 1;
8228 tree new_vector_type;
8229 tree permutes[2];
8230 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
8231 &nvectors, &new_vector_type,
8232 permutes))
8233 gcc_unreachable ();
8234
8235 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
8236 unsigned int partial_nelts = nelts / nvectors;
8237 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
8238
8239 tree_vector_builder partial_elts;
8240 auto_vec<tree, 32> pieces (nvectors * 2);
8241 pieces.quick_grow_cleared (nvectors * 2);
8242 for (unsigned int i = 0; i < nvectors; ++i)
8243 {
8244 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8245 ELTS' has mode IM. */
8246 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
8247 for (unsigned int j = 0; j < partial_nelts; ++j)
8248 partial_elts.quick_push (elts[i * partial_nelts + j]);
8249 tree t = gimple_build_vector (seq, &partial_elts);
8250 t = gimple_build (seq, VIEW_CONVERT_EXPR,
8251 TREE_TYPE (new_vector_type), t);
8252
8253 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
8254 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
8255 }
8256
8257 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
8258 correct byte contents.
8259
8260 Conceptually, we need to repeat the following operation log2(nvectors)
8261 times, where hi_start = nvectors / 2:
8262
8263 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
8264 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
8265
8266 However, if each input repeats every N elements and the VF is
8267 a multiple of N * 2, the HI result is the same as the LO result.
8268 This will be true for the first N1 iterations of the outer loop,
8269 followed by N2 iterations for which both the LO and HI results
8270 are needed. I.e.:
8271
8272 N1 + N2 = log2(nvectors)
8273
8274 Each "N1 iteration" doubles the number of redundant vectors and the
8275 effect of the process as a whole is to have a sequence of nvectors/2**N1
8276 vectors that repeats 2**N1 times. Rather than generate these redundant
8277 vectors, we halve the number of vectors for each N1 iteration. */
8278 unsigned int in_start = 0;
8279 unsigned int out_start = nvectors;
8280 unsigned int new_nvectors = nvectors;
8281 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
8282 {
8283 unsigned int hi_start = new_nvectors / 2;
8284 unsigned int out_i = 0;
8285 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
8286 {
8287 if ((in_i & 1) != 0
8288 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
8289 2 * in_repeat))
8290 continue;
8291
8292 tree output = make_ssa_name (new_vector_type);
8293 tree input1 = pieces[in_start + (in_i / 2)];
8294 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
8295 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
8296 input1, input2,
8297 permutes[in_i & 1]);
8298 gimple_seq_add_stmt (seq, stmt);
8299 pieces[out_start + out_i] = output;
8300 out_i += 1;
8301 }
8302 std::swap (in_start, out_start);
8303 new_nvectors = out_i;
8304 }
8305
8306 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
8307 results.reserve (nresults);
8308 for (unsigned int i = 0; i < nresults; ++i)
8309 if (i < new_nvectors)
8310 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
8311 pieces[in_start + i]));
8312 else
8313 results.quick_push (results[i - new_nvectors]);
8314 }
8315
8316
8317 /* For constant and loop invariant defs in OP_NODE this function creates
8318 vector defs that will be used in the vectorized stmts and stores them
8319 to SLP_TREE_VEC_DEFS of OP_NODE. */
8320
8321 static void
8322 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8323 {
8324 unsigned HOST_WIDE_INT nunits;
8325 tree vec_cst;
8326 unsigned j, number_of_places_left_in_vector;
8327 tree vector_type;
8328 tree vop;
8329 int group_size = op_node->ops.length ();
8330 unsigned int vec_num, i;
8331 unsigned number_of_copies = 1;
8332 bool constant_p;
8333 gimple_seq ctor_seq = NULL;
8334 auto_vec<tree, 16> permute_results;
8335
8336 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
8337 vector_type = SLP_TREE_VECTYPE (op_node);
8338
8339 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8340 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
8341 auto_vec<tree> voprnds (number_of_vectors);
8342
8343 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8344 the created vectors. It is greater than 1 if unrolling is performed.
8345
8346 For example, we have two scalar operands, s1 and s2 (e.g., group of
8347 strided accesses of size two), while NUNITS is four (i.e., four scalars
8348 of this type can be packed in a vector). The output vector will contain
8349 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
8350 will be 2).
8351
8352 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8353 containing the operands.
8354
8355 For example, NUNITS is four as before, and the group size is 8
8356 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
8357 {s5, s6, s7, s8}. */
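/* In the two examples above the computation below gives NUMBER_OF_COPIES
   = 4 * 1 / 2 == 2 for {s1, s2, s1, s2} and = 4 * 2 / 8 == 1 for the pair
   {s1, s2, s3, s4} {s5, s6, s7, s8}, assuming one and two vector stmts
   are required, respectively. */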
8358
8359 /* When using duplicate_and_interleave, we just need one element for
8360 each scalar statement. */
8361 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
8362 nunits = group_size;
8363
8364 number_of_copies = nunits * number_of_vectors / group_size;
8365
8366 number_of_places_left_in_vector = nunits;
8367 constant_p = true;
8368 tree uniform_elt = NULL_TREE;
8369 tree_vector_builder elts (vector_type, nunits, 1);
8370 elts.quick_grow (nunits);
8371 stmt_vec_info insert_after = NULL;
8372 for (j = 0; j < number_of_copies; j++)
8373 {
8374 tree op;
8375 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
8376 {
8377 /* Create 'vect_ = {op0,op1,...,opn}'. */
8378 tree orig_op = op;
8379 if (number_of_places_left_in_vector == nunits)
8380 uniform_elt = op;
8381 else if (uniform_elt && operand_equal_p (uniform_elt, op))
8382 op = elts[number_of_places_left_in_vector];
8383 else
8384 uniform_elt = NULL_TREE;
8385 number_of_places_left_in_vector--;
8386 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8387 {
8388 if (CONSTANT_CLASS_P (op))
8389 {
8390 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8391 {
8392 /* Can't use VIEW_CONVERT_EXPR for booleans because
8393 of possibly different sizes of scalar value and
8394 vector element. */
8395 if (integer_zerop (op))
8396 op = build_int_cst (TREE_TYPE (vector_type), 0);
8397 else if (integer_onep (op))
8398 op = build_all_ones_cst (TREE_TYPE (vector_type));
8399 else
8400 gcc_unreachable ();
8401 }
8402 else
8403 op = fold_unary (VIEW_CONVERT_EXPR,
8404 TREE_TYPE (vector_type), op);
8405 gcc_assert (op && CONSTANT_CLASS_P (op));
8406 }
8407 else
8408 {
8409 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8410 gimple *init_stmt;
8411 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8412 {
8413 tree true_val
8414 = build_all_ones_cst (TREE_TYPE (vector_type));
8415 tree false_val
8416 = build_zero_cst (TREE_TYPE (vector_type));
8417 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8418 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8419 op, true_val,
8420 false_val);
8421 }
8422 else
8423 {
8424 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8425 op);
8426 init_stmt
8427 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8428 op);
8429 }
8430 gimple_seq_add_stmt (&ctor_seq, init_stmt);
8431 op = new_temp;
8432 }
8433 }
8434 elts[number_of_places_left_in_vector] = op;
8435 if (!CONSTANT_CLASS_P (op))
8436 constant_p = false;
8437 /* For BB vectorization we have to compute an insert location
8438 when a def is inside the analyzed region since we cannot
8439 simply insert at the BB start in this case. */
8440 stmt_vec_info opdef;
8441 if (TREE_CODE (orig_op) == SSA_NAME
8442 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8443 && is_a <bb_vec_info> (vinfo)
8444 && (opdef = vinfo->lookup_def (orig_op)))
8445 {
8446 if (!insert_after)
8447 insert_after = opdef;
8448 else
8449 insert_after = get_later_stmt (insert_after, opdef);
8450 }
8451
8452 if (number_of_places_left_in_vector == 0)
8453 {
8454 auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
8455 if (uniform_elt)
8456 vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
8457 elts[0]);
8458 else if (constant_p
8459 ? multiple_p (type_nunits, nunits)
8460 : known_eq (type_nunits, nunits))
8461 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8462 else
8463 {
8464 if (permute_results.is_empty ())
8465 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8466 elts, number_of_vectors,
8467 permute_results);
8468 vec_cst = permute_results[number_of_vectors - j - 1];
8469 }
8470 if (!gimple_seq_empty_p (ctor_seq))
8471 {
8472 if (insert_after)
8473 {
8474 gimple_stmt_iterator gsi;
8475 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8476 {
8477 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8478 gsi_insert_seq_before (&gsi, ctor_seq,
8479 GSI_CONTINUE_LINKING);
8480 }
8481 else if (!stmt_ends_bb_p (insert_after->stmt))
8482 {
8483 gsi = gsi_for_stmt (insert_after->stmt);
8484 gsi_insert_seq_after (&gsi, ctor_seq,
8485 GSI_CONTINUE_LINKING);
8486 }
8487 else
8488 {
8489 /* When we want to insert after a def where the
8490 defining stmt throws then insert on the fallthru
8491 edge. */
8492 edge e = find_fallthru_edge
8493 (gimple_bb (insert_after->stmt)->succs);
8494 basic_block new_bb
8495 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8496 gcc_assert (!new_bb);
8497 }
8498 }
8499 else
8500 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8501 ctor_seq = NULL;
8502 }
8503 voprnds.quick_push (vec_cst);
8504 insert_after = NULL;
8505 number_of_places_left_in_vector = nunits;
8506 constant_p = true;
8507 elts.new_vector (vector_type, nunits, 1);
8508 elts.quick_grow (nunits);
8509 }
8510 }
8511 }
8512
8513 /* Since the vectors are created in the reverse order, we should invert
8514 them. */
8515 vec_num = voprnds.length ();
8516 for (j = vec_num; j != 0; j--)
8517 {
8518 vop = voprnds[j - 1];
8519 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8520 }
8521
8522 /* In case that VF is greater than the unrolling factor needed for the SLP
8523 group of stmts, NUMBER_OF_VECTORS to be created is greater than
8524 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8525 to replicate the vectors. */
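/* E.g. with two distinct defs v0 v1 (VEC_NUM == 2) and NUMBER_OF_VECTORS
   == 6 the loop below extends the defs to v0 v1 v0 v1 v0 v1. */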
8526 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8527 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8528 i++)
8529 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8530 }
8531
8532 /* Get the Ith vectorized definition from SLP_NODE. */
8533
8534 tree
8535 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8536 {
8537 return SLP_TREE_VEC_DEFS (slp_node)[i];
8538 }
8539
8540 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8541
8542 void
8543 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8544 {
8545 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8546 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8547 }
8548
8549 /* Get N vectorized definitions for SLP_NODE. */
8550
8551 void
8552 vect_get_slp_defs (vec_info *,
8553 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8554 {
8555 if (n == -1U)
8556 n = SLP_TREE_CHILDREN (slp_node).length ();
8557
8558 for (unsigned i = 0; i < n; ++i)
8559 {
8560 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8561 vec<tree> vec_defs = vNULL;
8562 vect_get_slp_defs (child, &vec_defs);
8563 vec_oprnds->quick_push (vec_defs);
8564 }
8565 }
8566
8567 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8568 - PERM gives the permutation that the caller wants to use for NODE,
8569 which might be different from SLP_LOAD_PERMUTATION.
8570 - DUMP_P controls whether the function dumps information. */
8571
8572 static bool
8573 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8574 load_permutation_t &perm,
8575 const vec<tree> &dr_chain,
8576 gimple_stmt_iterator *gsi, poly_uint64 vf,
8577 bool analyze_only, bool dump_p,
8578 unsigned *n_perms, unsigned int *n_loads,
8579 bool dce_chain)
8580 {
8581 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8582 int vec_index = 0;
8583 tree vectype = SLP_TREE_VECTYPE (node);
8584 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8585 unsigned int mask_element;
8586 unsigned dr_group_size;
8587 machine_mode mode;
8588
8589 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8590 dr_group_size = 1;
8591 else
8592 {
8593 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8594 dr_group_size = DR_GROUP_SIZE (stmt_info);
8595 }
8596
8597 mode = TYPE_MODE (vectype);
8598 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8599 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8600
8601 /* Initialize the vect stmts of NODE to properly insert the generated
8602 stmts later. */
8603 if (! analyze_only)
8604 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8605 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8606
8607 /* Generate permutation masks for every NODE. Number of masks for each NODE
8608 is equal to GROUP_SIZE.
8609 E.g., we have a group of three nodes with three loads from the same
8610 location in each node, and the vector size is 4. I.e., we have a
8611 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8612 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8613 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8614 ...
8615
8616 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8617 The last mask is illegal since we assume two operands for the permute
8618 operation, and the mask element values can't be outside that range.
8619 Hence, the last mask must be converted into {2,5,5,5}.
8620 For the first two permutations we need the first and the second input
8621 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8622 we need the second and the third vectors: {b1,c1,a2,b2} and
8623 {c2,a3,b3,c3}. */
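/* ({6,9,9,9} picks a2 and a3, which live in the second and third vectors,
   i.e. lanes 4-11; rebasing onto the two vectors actually fed to the
   permute subtracts one vector length (4), giving {2,5,5,5}.) */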
8624
8625 int vect_stmts_counter = 0;
8626 unsigned int index = 0;
8627 int first_vec_index = -1;
8628 int second_vec_index = -1;
8629 bool noop_p = true;
8630 *n_perms = 0;
8631
8632 vec_perm_builder mask;
8633 unsigned int nelts_to_build;
8634 unsigned int nvectors_per_build;
8635 unsigned int in_nlanes;
8636 bool repeating_p = (group_size == dr_group_size
8637 && multiple_p (nunits, group_size));
8638 if (repeating_p)
8639 {
8640 /* A single vector contains a whole number of copies of the node, so:
8641 (a) all permutes can use the same mask; and
8642 (b) the permutes only need a single vector input. */
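/* E.g. for GROUP_SIZE == 2 and a load permutation of {1, 0} the single
   mask built below is encoded as {1, 0, 3, 2, 5, 4} (two patterns of
   three elements), i.e. a swap-adjacent-lanes permute needing only a
   single input vector; the same mask is reused for all NSTMTS outputs. */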
8643 mask.new_vector (nunits, group_size, 3);
8644 nelts_to_build = mask.encoded_nelts ();
8645 /* It's possible to obtain zero nstmts during analyze_only, so make
8646 it at least one to ensure the later computation for n_perms
8647 proceeds. */
8648 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8649 in_nlanes = dr_group_size * 3;
8650 }
8651 else
8652 {
8653 /* We need to construct a separate mask for each vector statement. */
8654 unsigned HOST_WIDE_INT const_nunits, const_vf;
8655 if (!nunits.is_constant (&const_nunits)
8656 || !vf.is_constant (&const_vf))
8657 return false;
8658 mask.new_vector (const_nunits, const_nunits, 1);
8659 nelts_to_build = const_vf * group_size;
8660 nvectors_per_build = 1;
8661 in_nlanes = const_vf * dr_group_size;
8662 }
8663 auto_sbitmap used_in_lanes (in_nlanes);
8664 bitmap_clear (used_in_lanes);
8665 auto_bitmap used_defs;
8666
8667 unsigned int count = mask.encoded_nelts ();
8668 mask.quick_grow (count);
8669 vec_perm_indices indices;
8670
8671 for (unsigned int j = 0; j < nelts_to_build; j++)
8672 {
8673 unsigned int iter_num = j / group_size;
8674 unsigned int stmt_num = j % group_size;
8675 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8676 bitmap_set_bit (used_in_lanes, i);
8677 if (repeating_p)
8678 {
8679 first_vec_index = 0;
8680 mask_element = i;
8681 }
8682 else
8683 {
8684 /* Enforced before the loop when !repeating_p. */
8685 unsigned int const_nunits = nunits.to_constant ();
8686 vec_index = i / const_nunits;
8687 mask_element = i % const_nunits;
8688 if (vec_index == first_vec_index
8689 || first_vec_index == -1)
8690 {
8691 first_vec_index = vec_index;
8692 }
8693 else if (vec_index == second_vec_index
8694 || second_vec_index == -1)
8695 {
8696 second_vec_index = vec_index;
8697 mask_element += const_nunits;
8698 }
8699 else
8700 {
8701 if (dump_p)
8702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8703 "permutation requires at "
8704 "least three vectors %G",
8705 stmt_info->stmt);
8706 gcc_assert (analyze_only);
8707 return false;
8708 }
8709
8710 gcc_assert (mask_element < 2 * const_nunits);
8711 }
8712
8713 if (mask_element != index)
8714 noop_p = false;
8715 mask[index++] = mask_element;
8716
8717 if (index == count)
8718 {
8719 if (!noop_p)
8720 {
8721 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8722 if (!can_vec_perm_const_p (mode, mode, indices))
8723 {
8724 if (dump_p)
8725 {
8726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8727 "unsupported vect permute { ");
8728 for (i = 0; i < count; ++i)
8729 {
8730 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8731 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8732 }
8733 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8734 }
8735 gcc_assert (analyze_only);
8736 return false;
8737 }
8738
8739 tree mask_vec = NULL_TREE;
8740 if (!analyze_only)
8741 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8742
8743 if (second_vec_index == -1)
8744 second_vec_index = first_vec_index;
8745
8746 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8747 {
8748 ++*n_perms;
8749 if (analyze_only)
8750 continue;
8751 /* Generate the permute statement if necessary. */
8752 tree first_vec = dr_chain[first_vec_index + ri];
8753 tree second_vec = dr_chain[second_vec_index + ri];
8754 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8755 tree perm_dest
8756 = vect_create_destination_var (gimple_assign_lhs (stmt),
8757 vectype);
8758 perm_dest = make_ssa_name (perm_dest);
8759 gimple *perm_stmt
8760 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8761 second_vec, mask_vec);
8762 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8763 gsi);
8764 if (dce_chain)
8765 {
8766 bitmap_set_bit (used_defs, first_vec_index + ri);
8767 bitmap_set_bit (used_defs, second_vec_index + ri);
8768 }
8769
8770 /* Store the vector statement in NODE. */
8771 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8772 }
8773 }
8774 else if (!analyze_only)
8775 {
8776 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8777 {
8778 tree first_vec = dr_chain[first_vec_index + ri];
8779 /* If mask was NULL_TREE generate the requested
8780 identity transform. */
8781 if (dce_chain)
8782 bitmap_set_bit (used_defs, first_vec_index + ri);
8783
8784 /* Store the vector statement in NODE. */
8785 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8786 }
8787 }
8788
8789 index = 0;
8790 first_vec_index = -1;
8791 second_vec_index = -1;
8792 noop_p = true;
8793 }
8794 }
8795
8796 if (n_loads)
8797 {
8798 if (repeating_p)
8799 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8800 else
8801 {
8802 /* Enforced above when !repeating_p. */
8803 unsigned int const_nunits = nunits.to_constant ();
8804 *n_loads = 0;
8805 bool load_seen = false;
8806 for (unsigned i = 0; i < in_nlanes; ++i)
8807 {
8808 if (i % const_nunits == 0)
8809 {
8810 if (load_seen)
8811 *n_loads += 1;
8812 load_seen = false;
8813 }
8814 if (bitmap_bit_p (used_in_lanes, i))
8815 load_seen = true;
8816 }
8817 if (load_seen)
8818 *n_loads += 1;
8819 }
8820 }
8821
8822 if (dce_chain)
8823 for (unsigned i = 0; i < dr_chain.length (); ++i)
8824 if (!bitmap_bit_p (used_defs, i))
8825 {
8826 tree def = dr_chain[i];
8827 do
8828 {
8829 gimple *stmt = SSA_NAME_DEF_STMT (def);
8830 if (is_gimple_assign (stmt)
8831 && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
8832 || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
8833 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
8834 else
8835 def = NULL;
8836 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8837 gsi_remove (&rgsi, true);
8838 release_defs (stmt);
8839 }
8840 while (def);
8841 }
8842
8843 return true;
8844 }
8845
8846 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8847 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8848 permute statements for the SLP node NODE. Store the number of vector
8849 permute instructions in *N_PERMS and the number of vector load
8850 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8851 that were not needed. */
8852
8853 bool
8854 vect_transform_slp_perm_load (vec_info *vinfo,
8855 slp_tree node, const vec<tree> &dr_chain,
8856 gimple_stmt_iterator *gsi, poly_uint64 vf,
8857 bool analyze_only, unsigned *n_perms,
8858 unsigned int *n_loads, bool dce_chain)
8859 {
8860 return vect_transform_slp_perm_load_1 (vinfo, node,
8861 SLP_TREE_LOAD_PERMUTATION (node),
8862 dr_chain, gsi, vf, analyze_only,
8863 dump_enabled_p (), n_perms, n_loads,
8864 dce_chain);
8865 }
8866
8867 /* Produce the next vector result for SLP permutation NODE by adding a vector
8868 statement at GSI. If MASK_VEC is nonnull, add:
8869
8870 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8871
8872 otherwise add:
8873
8874 <new SSA name> = FIRST_DEF. */
8875
8876 static void
8877 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8878 slp_tree node, tree first_def, tree second_def,
8879 tree mask_vec, poly_uint64 identity_offset)
8880 {
8881 tree vectype = SLP_TREE_VECTYPE (node);
8882
8883 /* ??? We SLP match existing vector element extracts but
8884 allow punning which we need to re-instantiate at uses
8885 but have no good way of explicitly representing. */
8886 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8887 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8888 {
8889 gassign *conv_stmt
8890 = gimple_build_assign (make_ssa_name (vectype),
8891 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8892 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8893 first_def = gimple_assign_lhs (conv_stmt);
8894 }
8895 gassign *perm_stmt;
8896 tree perm_dest = make_ssa_name (vectype);
8897 if (mask_vec)
8898 {
8899 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8900 TYPE_SIZE (vectype))
8901 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8902 {
8903 gassign *conv_stmt
8904 = gimple_build_assign (make_ssa_name (vectype),
8905 build1 (VIEW_CONVERT_EXPR,
8906 vectype, second_def));
8907 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8908 second_def = gimple_assign_lhs (conv_stmt);
8909 }
8910 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8911 first_def, second_def,
8912 mask_vec);
8913 }
8914 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8915 {
8916 /* For identity permutes we still need to handle the case
8917 of offsetted extracts or concats. */
8918 unsigned HOST_WIDE_INT c;
8919 auto first_def_nunits
8920 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8921 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8922 {
8923 unsigned HOST_WIDE_INT elsz
8924 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8925 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8926 TYPE_SIZE (vectype),
8927 bitsize_int (identity_offset * elsz));
8928 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8929 }
8930 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8931 first_def_nunits, &c) && c == 2)
8932 {
8933 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8934 NULL_TREE, second_def);
8935 perm_stmt = gimple_build_assign (perm_dest, ctor);
8936 }
8937 else
8938 gcc_unreachable ();
8939 }
8940 else
8941 {
8942 /* We need a copy here in case the def was external. */
8943 perm_stmt = gimple_build_assign (perm_dest, first_def);
8944 }
8945 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8946 /* Store the vector statement in NODE. */
8947 node->push_vec_def (perm_stmt);
8948 }
8949
8950 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8951 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8952 If GSI is nonnull, emit the permutation there.
8953
8954 When GSI is null, the only purpose of NODE is to give properties
8955 of the result, such as the vector type and number of SLP lanes.
8956 The node does not need to be a VEC_PERM_EXPR.
8957
8958 If the target supports the operation, return the number of individual
8959 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8960 dump file if DUMP_P is true. */
8961
8962 static int
8963 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8964 slp_tree node, lane_permutation_t &perm,
8965 vec<slp_tree> &children, bool dump_p)
8966 {
8967 tree vectype = SLP_TREE_VECTYPE (node);
8968
8969 /* ??? We currently only support all same vector input types
8970 while the SLP IL should really do a concat + select and thus accept
8971 arbitrary mismatches. */
8972 slp_tree child;
8973 unsigned i;
8974 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8975 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8976 tree op_vectype = NULL_TREE;
8977 FOR_EACH_VEC_ELT (children, i, child)
8978 if (SLP_TREE_VECTYPE (child))
8979 {
8980 op_vectype = SLP_TREE_VECTYPE (child);
8981 break;
8982 }
8983 if (!op_vectype)
8984 op_vectype = vectype;
8985 FOR_EACH_VEC_ELT (children, i, child)
8986 {
8987 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8988 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8989 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8990 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8991 {
8992 if (dump_p)
8993 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8994 "Unsupported vector types in lane permutation\n");
8995 return -1;
8996 }
8997 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8998 repeating_p = false;
8999 }
9000
9001 gcc_assert (perm.length () == SLP_TREE_LANES (node));
9002 if (dump_p)
9003 {
9004 dump_printf_loc (MSG_NOTE, vect_location,
9005 "vectorizing permutation");
9006 for (unsigned i = 0; i < perm.length (); ++i)
9007 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
9008 if (repeating_p)
9009 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
9010 dump_printf (MSG_NOTE, "\n");
9011 }
9012
9013 /* REPEATING_P is true if every output vector is guaranteed to use the
9014 same permute vector. We can handle that case for both variable-length
9015 and constant-length vectors, but we only handle other cases for
9016 constant-length vectors.
9017
9018 Set:
9019
9020 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
9021 mask vector that we want to build.
9022
9023 - NCOPIES to the number of copies of PERM that we need in order
9024 to build the necessary permute mask vectors.
9025
9026 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
9027 for each permute mask vector. This is only relevant when GSI is
9028 nonnull. */
9029 uint64_t npatterns;
9030 unsigned nelts_per_pattern;
9031 uint64_t ncopies;
9032 unsigned noutputs_per_mask;
9033 if (repeating_p)
9034 {
9035 /* We need a single permute mask vector that has the form:
9036
9037 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
9038
9039 In other words, the original n-element permute in PERM is
9040 "unrolled" to fill a full vector. The stepped vector encoding
9041 that we use for permutes requires 3n elements. */
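/* E.g. a two-lane permute { X1, X2 } is encoded with NPATTERNS == 2 and
   NELTS_PER_PATTERN == 3 as the six leading elements
   { X1, X2, X1 + 2, X2 + 2, X1 + 4, X2 + 4 }. */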
9042 npatterns = SLP_TREE_LANES (node);
9043 nelts_per_pattern = ncopies = 3;
9044 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9045 }
9046 else
9047 {
9048 /* Calculate every element of every permute mask vector explicitly,
9049 instead of relying on the pattern described above. */
9050 if (!nunits.is_constant (&npatterns)
9051 || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
9052 return -1;
9053 nelts_per_pattern = ncopies = 1;
9054 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
9055 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
9056 return -1;
9057 noutputs_per_mask = 1;
9058 }
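/* E.g. in the non-repeating case with a V4SI result (NUNITS == 4), two
   node lanes and a constant VF of 4 this gives NPATTERNS == 4,
   NELTS_PER_PATTERN == 1, NCOPIES == 4 and NOUTPUTS_PER_MASK == 1,
   i.e. OLANES == 8 lanes spread over two explicitly built four-element
   permute masks. */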
9059 unsigned olanes = ncopies * SLP_TREE_LANES (node);
9060 gcc_assert (repeating_p || multiple_p (olanes, nunits));
9061
9062 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
9063 from the { SLP operand, scalar lane } permutation as recorded in the
9064 SLP node as an intermediate step. This part should already work
9065 with SLP children with an arbitrary number of lanes. */
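/* E.g. for the four-lane blend [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   of two V4SI children the first copy becomes
   { { 0, 0 }, 0 } { { 1, 0 }, 1 } { { 0, 0 }, 2 } { { 1, 0 }, 3 },
   printed as vops0[0][0] vops1[0][1] vops0[0][2] vops1[0][3] below. */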
9066 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
9067 auto_vec<unsigned> active_lane;
9068 vperm.create (olanes);
9069 active_lane.safe_grow_cleared (children.length (), true);
9070 for (unsigned i = 0; i < ncopies; ++i)
9071 {
9072 for (unsigned pi = 0; pi < perm.length (); ++pi)
9073 {
9074 std::pair<unsigned, unsigned> p = perm[pi];
9075 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
9076 if (repeating_p)
9077 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
9078 else
9079 {
9080 /* We checked above that the vectors are constant-length. */
9081 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
9082 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
9083 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
9084 vperm.quick_push ({{p.first, vi}, vl});
9085 }
9086 }
9087 /* Advance to the next group. */
9088 for (unsigned j = 0; j < children.length (); ++j)
9089 active_lane[j] += SLP_TREE_LANES (children[j]);
9090 }
9091
9092 if (dump_p)
9093 {
9094 dump_printf_loc (MSG_NOTE, vect_location,
9095 "vectorizing permutation");
9096 for (unsigned i = 0; i < perm.length (); ++i)
9097 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
9098 if (repeating_p)
9099 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
9100 dump_printf (MSG_NOTE, "\n");
9101 dump_printf_loc (MSG_NOTE, vect_location, "as");
9102 for (unsigned i = 0; i < vperm.length (); ++i)
9103 {
9104 if (i != 0
9105 && (repeating_p
9106 ? multiple_p (i, npatterns)
9107 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
9108 dump_printf (MSG_NOTE, ",");
9109 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
9110 vperm[i].first.first, vperm[i].first.second,
9111 vperm[i].second);
9112 }
9113 dump_printf (MSG_NOTE, "\n");
9114 }
9115
9116 /* We can only handle two-vector permutes, everything else should
9117 be lowered on the SLP level. The following is closely inspired
9118 by vect_transform_slp_perm_load and is supposed to eventually
9119 replace it.
9120 ??? As intermediate step do code-gen in the SLP tree representation
9121 somehow? */
9122 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
9123 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
9124 unsigned int index = 0;
9125 poly_uint64 mask_element;
9126 vec_perm_builder mask;
9127 mask.new_vector (nunits, npatterns, nelts_per_pattern);
9128 unsigned int count = mask.encoded_nelts ();
9129 mask.quick_grow (count);
9130 vec_perm_indices indices;
9131 unsigned nperms = 0;
9132 for (unsigned i = 0; i < vperm.length (); ++i)
9133 {
9134 mask_element = vperm[i].second;
9135 if (first_vec.first == -1U
9136 || first_vec == vperm[i].first)
9137 first_vec = vperm[i].first;
9138 else if (second_vec.first == -1U
9139 || second_vec == vperm[i].first)
9140 {
9141 second_vec = vperm[i].first;
9142 mask_element += nunits;
9143 }
9144 else
9145 {
9146 if (dump_p)
9147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9148 "permutation requires at "
9149 "least three vectors\n");
9150 gcc_assert (!gsi);
9151 return -1;
9152 }
9153
9154 mask[index++] = mask_element;
9155
9156 if (index == count)
9157 {
9158 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
9159 TYPE_VECTOR_SUBPARTS (op_vectype));
9160 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
9161 && constant_multiple_p (mask[0], nunits));
9162 machine_mode vmode = TYPE_MODE (vectype);
9163 machine_mode op_vmode = TYPE_MODE (op_vectype);
9164 unsigned HOST_WIDE_INT c;
9165 if ((!identity_p
9166 && !can_vec_perm_const_p (vmode, op_vmode, indices))
9167 || (identity_p
9168 && !known_le (nunits,
9169 TYPE_VECTOR_SUBPARTS (op_vectype))
9170 && (!constant_multiple_p (nunits,
9171 TYPE_VECTOR_SUBPARTS (op_vectype),
9172 &c) || c != 2)))
9173 {
9174 if (dump_p)
9175 {
9176 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
9177 vect_location,
9178 "unsupported vect permute { ");
9179 for (i = 0; i < count; ++i)
9180 {
9181 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
9182 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
9183 }
9184 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
9185 }
9186 gcc_assert (!gsi);
9187 return -1;
9188 }
9189
9190 if (!identity_p)
9191 nperms++;
9192 if (gsi)
9193 {
9194 if (second_vec.first == -1U)
9195 second_vec = first_vec;
9196
9197 slp_tree
9198 first_node = children[first_vec.first],
9199 second_node = children[second_vec.first];
9200
9201 tree mask_vec = NULL_TREE;
9202 if (!identity_p)
9203 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
9204
9205 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
9206 {
9207 tree first_def
9208 = vect_get_slp_vect_def (first_node,
9209 first_vec.second + vi);
9210 tree second_def
9211 = vect_get_slp_vect_def (second_node,
9212 second_vec.second + vi);
9213 vect_add_slp_permutation (vinfo, gsi, node, first_def,
9214 second_def, mask_vec, mask[0]);
9215 }
9216 }
9217
9218 index = 0;
9219 first_vec = std::make_pair (-1U, -1U);
9220 second_vec = std::make_pair (-1U, -1U);
9221 }
9222 }
9223
9224 return nperms;
9225 }
9226
9227 /* Vectorize the SLP permutations in NODE as specified
9228 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
9229 child number and lane number.
9230 Interleaving of two two-lane two-child SLP subtrees (not supported):
9231 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
9232 A blend of two four-lane two-child SLP subtrees:
9233 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
9234 Highpart of a four-lane one-child SLP subtree (not supported):
9235 [ { 0, 2 }, { 0, 3 } ]
9236 Where currently only a subset is supported by the code generation below. */
9237
9238 static bool
9239 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
9240 slp_tree node, stmt_vector_for_cost *cost_vec)
9241 {
9242 tree vectype = SLP_TREE_VECTYPE (node);
9243 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
9244 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
9245 SLP_TREE_CHILDREN (node),
9246 dump_enabled_p ());
9247 if (nperms < 0)
9248 return false;
9249
9250 if (!gsi)
9251 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
9252
9253 return true;
9254 }
9255
9256 /* Vectorize SLP NODE. */
9257
9258 static void
9259 vect_schedule_slp_node (vec_info *vinfo,
9260 slp_tree node, slp_instance instance)
9261 {
9262 gimple_stmt_iterator si;
9263 int i;
9264 slp_tree child;
9265
9266 /* Vectorize externals and constants. */
9267 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
9268 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
9269 {
9270 /* ??? vectorizable_shift can end up using a scalar operand which is
9271 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
9272 node in this case. */
9273 if (!SLP_TREE_VECTYPE (node))
9274 return;
9275
9276 /* There are two reasons vector defs might already exist. The first
9277 is that we are vectorizing an existing vector def. The second is
9278 when performing BB vectorization shared constant/external nodes
9279 are not split apart during partitioning so during the code-gen
9280 DFS walk we can end up visiting them twice. */
9281 if (! SLP_TREE_VEC_DEFS (node).exists ())
9282 vect_create_constant_vectors (vinfo, node);
9283 return;
9284 }
9285
9286 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
9287
9288 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
9289
9290 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
9291 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
9292
9293 if (dump_enabled_p ())
9294 dump_printf_loc (MSG_NOTE, vect_location,
9295 "------>vectorizing SLP node starting from: %G",
9296 stmt_info->stmt);
9297
9298 if (STMT_VINFO_DATA_REF (stmt_info)
9299 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9300 {
9301 /* Vectorized loads go before the first scalar load to make it
9302 ready early, vectorized stores go before the last scalar
9303 stmt which is where all uses are ready. */
9304 stmt_vec_info last_stmt_info = NULL;
9305 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
9306 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
9307 else /* DR_IS_WRITE */
9308 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
9309 si = gsi_for_stmt (last_stmt_info->stmt);
9310 }
9311 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
9312 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
9313 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
9314 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9315 {
9316 /* For PHI node vectorization we do not use the insertion iterator. */
9317 si = gsi_none ();
9318 }
9319 else
9320 {
9321 /* Emit other stmts after the children's vectorized defs, which is the
9322 earliest possible place. */
9323 gimple *last_stmt = NULL;
9324 if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
9325 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
9326 || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
9327 {
9328 /* But avoid scheduling internal defs outside of the loop when
9329 we might have only implicitly tracked loop mask/len defs. */
9330 gimple_stmt_iterator si
9331 = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
9332 last_stmt = *si;
9333 }
9334 bool seen_vector_def = false;
9335 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9336 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9337 {
9338 /* For fold-left reductions we are retaining the scalar
9339 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
9340 set so the representation isn't perfect. Resort to the
9341 last scalar def here. */
9342 if (SLP_TREE_VEC_DEFS (child).is_empty ())
9343 {
9344 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
9345 == cycle_phi_info_type);
9346 gphi *phi = as_a <gphi *>
9347 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
9348 if (!last_stmt
9349 || vect_stmt_dominates_stmt_p (last_stmt, phi))
9350 last_stmt = phi;
9351 }
9352 /* We are emitting all vectorized stmts in the same place and
9353 the last one is the last.
9354 ??? Unless we have a load permutation applied and that
9355 figures to re-use an earlier generated load. */
9356 unsigned j;
9357 tree vdef;
9358 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9359 {
9360 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9361 if (!last_stmt
9362 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9363 last_stmt = vstmt;
9364 }
9365 }
9366 else if (!SLP_TREE_VECTYPE (child))
9367 {
9368 /* For externals without a vectype the scalar defs are used unvectorized. */
9369 unsigned j;
9370 tree def;
9371 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9372 if (TREE_CODE (def) == SSA_NAME
9373 && !SSA_NAME_IS_DEFAULT_DEF (def))
9374 {
9375 gimple *stmt = SSA_NAME_DEF_STMT (def);
9376 if (!last_stmt
9377 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9378 last_stmt = stmt;
9379 }
9380 }
9381 else
9382 {
9383 /* For externals we have to look at all defs since their
9384 insertion place is decided per vector. But beware
9385 of pre-existing vectors where we need to make sure
9386 we do not insert before the region boundary. */
9387 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9388 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9389 seen_vector_def = true;
9390 else
9391 {
9392 unsigned j;
9393 tree vdef;
9394 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9395 if (TREE_CODE (vdef) == SSA_NAME
9396 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9397 {
9398 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9399 if (!last_stmt
9400 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9401 last_stmt = vstmt;
9402 }
9403 }
9404 }
9405 /* This can happen when all children are pre-existing vectors or
9406 constants. */
9407 if (!last_stmt)
9408 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9409 if (!last_stmt)
9410 {
9411 gcc_assert (seen_vector_def);
9412 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9413 }
9414 else if (is_ctrl_altering_stmt (last_stmt))
9415 {
9416 /* We split regions to vectorize at control altering stmts
9417 with a definition so this must be an external which
9418 we can insert at the start of the region. */
9419 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9420 }
9421 else if (is_a <bb_vec_info> (vinfo)
9422 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9423 && gimple_could_trap_p (stmt_info->stmt))
9424 {
9425 /* We've constrained possibly trapping operations to all come
9426 from the same basic-block; even if vectorized defs would allow earlier
9427 scheduling, still force vectorized stmts to the original block.
9428 This is only necessary for BB vectorization since for loop vect
9429 all operations are in a single BB and scalar stmt based
9430 placement doesn't play well with epilogue vectorization. */
9431 gcc_assert (dominated_by_p (CDI_DOMINATORS,
9432 gimple_bb (stmt_info->stmt),
9433 gimple_bb (last_stmt)));
9434 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9435 }
9436 else if (is_a <gphi *> (last_stmt))
9437 si = gsi_after_labels (gimple_bb (last_stmt));
9438 else
9439 {
9440 si = gsi_for_stmt (last_stmt);
9441 gsi_next (&si);
9442 }
9443 }
9444
9445 /* Handle purely internal nodes. */
9446 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9447 {
9448 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
9449 be shared with different SLP nodes (but usually it's the same
9450 operation apart from the case the stmt is only there for denoting
9451 the actual scalar lane defs ...). So do not call vect_transform_stmt
9452 but open-code it here (partly). */
9453 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9454 gcc_assert (done);
9455 stmt_vec_info slp_stmt_info;
9456 unsigned int i;
9457 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9458 if (STMT_VINFO_LIVE_P (slp_stmt_info))
9459 {
9460 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9461 instance, i, true, NULL);
9462 gcc_assert (done);
9463 }
9464 }
9465 else
9466 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9467 }
9468
9469 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
9470 For loop vectorization this is done in vectorizable_call, but for SLP
9471 it needs to be deferred until end of vect_schedule_slp, because multiple
9472 SLP instances may refer to the same scalar stmt. */
9473
9474 static void
9475 vect_remove_slp_scalar_calls (vec_info *vinfo,
9476 slp_tree node, hash_set<slp_tree> &visited)
9477 {
9478 gimple *new_stmt;
9479 gimple_stmt_iterator gsi;
9480 int i;
9481 slp_tree child;
9482 tree lhs;
9483 stmt_vec_info stmt_info;
9484
9485 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9486 return;
9487
9488 if (visited.add (node))
9489 return;
9490
9491 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9492 vect_remove_slp_scalar_calls (vinfo, child, visited);
9493
9494 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9495 {
9496 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9497 if (!stmt || gimple_bb (stmt) == NULL)
9498 continue;
9499 if (is_pattern_stmt_p (stmt_info)
9500 || !PURE_SLP_STMT (stmt_info))
9501 continue;
9502 lhs = gimple_call_lhs (stmt);
9503 if (lhs)
9504 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9505 else
9506 {
9507 new_stmt = gimple_build_nop ();
9508 unlink_stmt_vdef (stmt_info->stmt);
9509 }
9510 gsi = gsi_for_stmt (stmt);
9511 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9512 if (lhs)
9513 SSA_NAME_DEF_STMT (lhs) = new_stmt;
9514 }
9515 }
9516
9517 static void
9518 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9519 {
9520 hash_set<slp_tree> visited;
9521 vect_remove_slp_scalar_calls (vinfo, node, visited);
9522 }
9523
9524 /* Vectorize the instance root. */
9525
9526 void
9527 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9528 {
9529 gassign *rstmt = NULL;
9530
9531 if (instance->kind == slp_inst_kind_ctor)
9532 {
9533 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9534 {
9535 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9536 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9537 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9538 TREE_TYPE (vect_lhs)))
9539 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9540 vect_lhs);
9541 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9542 }
9543 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9544 {
9545 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9546 tree child_def;
9547 int j;
9548 vec<constructor_elt, va_gc> *v;
9549 vec_alloc (v, nelts);
9550
9551 /* A CTOR can handle V16HI composition from VNx8HI so we
9552 do not need to convert vector elements if the types
9553 do not match. */
9554 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9555 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9556 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9557 tree rtype
9558 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9559 tree r_constructor = build_constructor (rtype, v);
9560 rstmt = gimple_build_assign (lhs, r_constructor);
9561 }
9562 }
9563 else if (instance->kind == slp_inst_kind_bb_reduc)
9564 {
9565 /* Largely inspired by reduction chain epilogue handling in
9566 vect_create_epilog_for_reduction. */
9567 vec<tree> vec_defs = vNULL;
9568 vect_get_slp_defs (node, &vec_defs);
9569 enum tree_code reduc_code
9570 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9571 /* ??? We actually have to reflect signs somewhere. */
9572 if (reduc_code == MINUS_EXPR)
9573 reduc_code = PLUS_EXPR;
9574 gimple_seq epilogue = NULL;
9575 /* We may end up with more than one vector result, reduce them
9576 to one vector. */
9577 tree vec_def = vec_defs[0];
9578 tree vectype = TREE_TYPE (vec_def);
9579 tree compute_vectype = vectype;
9580 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9581 && TYPE_OVERFLOW_UNDEFINED (vectype)
9582 && operation_can_overflow (reduc_code));
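/* Signed overflow in the re-associated additions below would be undefined,
   so in that case compute the reduction in the corresponding unsigned type
   and convert back at the end. */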
9583 if (pun_for_overflow_p)
9584 {
9585 compute_vectype = unsigned_type_for (vectype);
9586 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9587 compute_vectype, vec_def);
9588 }
9589 for (unsigned i = 1; i < vec_defs.length (); ++i)
9590 {
9591 tree def = vec_defs[i];
9592 if (pun_for_overflow_p)
9593 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9594 compute_vectype, def);
9595 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9596 vec_def, def);
9597 }
9598 vec_defs.release ();
9599 /* ??? Support other schemes than direct internal fn. */
9600 internal_fn reduc_fn;
9601 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9602 || reduc_fn == IFN_LAST)
9603 gcc_unreachable ();
9604 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9605 TREE_TYPE (compute_vectype), vec_def);
9606 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9607 {
9608 tree rem_def = NULL_TREE;
9609 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9610 {
9611 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9612 if (!rem_def)
9613 rem_def = def;
9614 else
9615 rem_def = gimple_build (&epilogue, reduc_code,
9616 TREE_TYPE (scalar_def),
9617 rem_def, def);
9618 }
9619 scalar_def = gimple_build (&epilogue, reduc_code,
9620 TREE_TYPE (scalar_def),
9621 scalar_def, rem_def);
9622 }
9623 scalar_def = gimple_convert (&epilogue,
9624 TREE_TYPE (vectype), scalar_def);
9625 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9626 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9627 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9628 update_stmt (gsi_stmt (rgsi));
9629 return;
9630 }
9631 else
9632 gcc_unreachable ();
9633
9634 gcc_assert (rstmt);
9635
9636 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9637 gsi_replace (&rgsi, rstmt, true);
9638 }
9639
9640 struct slp_scc_info
9641 {
9642 bool on_stack;
9643 int dfs;
9644 int lowlink;
9645 };
9646
9647 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
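/* The DFS/LOWLINK bookkeeping follows Tarjan's SCC algorithm: a node whose
   lowlink equals its own DFS number is the root of an SCC formed by the
   nodes above it on STACK. */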
9648
9649 static void
9650 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9651 hash_map<slp_tree, slp_scc_info> &scc_info,
9652 int &maxdfs, vec<slp_tree> &stack)
9653 {
9654 bool existed_p;
9655 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9656 gcc_assert (!existed_p);
9657 info->dfs = maxdfs;
9658 info->lowlink = maxdfs;
9659 maxdfs++;
9660
9661 /* Leaf. */
9662 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9663 {
9664 info->on_stack = false;
9665 vect_schedule_slp_node (vinfo, node, instance);
9666 return;
9667 }
9668
9669 info->on_stack = true;
9670 stack.safe_push (node);
9671
9672 unsigned i;
9673 slp_tree child;
9674 /* DFS recurse. */
9675 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9676 {
9677 if (!child)
9678 continue;
9679 slp_scc_info *child_info = scc_info.get (child);
9680 if (!child_info)
9681 {
9682 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9683 /* Recursion might have re-allocated the node. */
9684 info = scc_info.get (node);
9685 child_info = scc_info.get (child);
9686 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9687 }
9688 else if (child_info->on_stack)
9689 info->lowlink = MIN (info->lowlink, child_info->dfs);
9690 }
9691 if (info->lowlink != info->dfs)
9692 return;
9693
9694 auto_vec<slp_tree, 4> phis_to_fixup;
9695
9696 /* Singleton. */
9697 if (stack.last () == node)
9698 {
9699 stack.pop ();
9700 info->on_stack = false;
9701 vect_schedule_slp_node (vinfo, node, instance);
9702 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9703 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9704 phis_to_fixup.quick_push (node);
9705 }
9706 else
9707 {
9708 /* SCC. */
9709 int last_idx = stack.length () - 1;
9710 while (stack[last_idx] != node)
9711 last_idx--;
9712 /* We can break the cycle at PHIs that have at least one child
9713 code generated. Then we could re-start the DFS walk until
9714 all nodes in the SCC are covered (we might have new entries
9715 for only back-reachable nodes). But it's simpler to just
9716 iterate and schedule those that are ready. */
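/* E.g. for a simple reduction the SCC is { PHI, ADD }: the PHI is ready
   because its preheader child is already generated, scheduling it makes
   the ADD ready, and the PHI's backedge argument is filled in by the
   fixup loop below. */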
9717 unsigned todo = stack.length () - last_idx;
9718 do
9719 {
9720 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9721 {
9722 slp_tree entry = stack[idx];
9723 if (!entry)
9724 continue;
9725 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9726 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9727 bool ready = !phi;
9728 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9729 if (!child)
9730 {
9731 gcc_assert (phi);
9732 ready = true;
9733 break;
9734 }
9735 else if (scc_info.get (child)->on_stack)
9736 {
9737 if (!phi)
9738 {
9739 ready = false;
9740 break;
9741 }
9742 }
9743 else
9744 {
9745 if (phi)
9746 {
9747 ready = true;
9748 break;
9749 }
9750 }
9751 if (ready)
9752 {
9753 vect_schedule_slp_node (vinfo, entry, instance);
9754 scc_info.get (entry)->on_stack = false;
9755 stack[idx] = NULL;
9756 todo--;
9757 if (phi)
9758 phis_to_fixup.safe_push (entry);
9759 }
9760 }
9761 }
9762 while (todo != 0);
9763
9764 /* Pop the SCC. */
9765 stack.truncate (last_idx);
9766 }
9767
9768 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9769 slp_tree phi_node;
9770 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9771 {
9772 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9773 edge_iterator ei;
9774 edge e;
9775 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9776 {
9777 unsigned dest_idx = e->dest_idx;
9778 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9779 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9780 continue;
9781 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9782 /* Simply fill all args. */
9783 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9784 != vect_first_order_recurrence)
9785 for (unsigned i = 0; i < n; ++i)
9786 {
9787 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9788 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9789 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9790 e, gimple_phi_arg_location (phi, dest_idx));
9791 }
9792 else
9793 {
9794 /* Unless it is a first order recurrence which needs
9795 args filled in for both the PHI node and the permutes. */
9796 gimple *perm
9797 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9798 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9799 add_phi_arg (as_a <gphi *> (rphi),
9800 vect_get_slp_vect_def (child, n - 1),
9801 e, gimple_phi_arg_location (phi, dest_idx));
9802 for (unsigned i = 0; i < n; ++i)
9803 {
9804 gimple *perm
9805 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9806 if (i > 0)
9807 gimple_assign_set_rhs1 (perm,
9808 vect_get_slp_vect_def (child, i - 1));
9809 gimple_assign_set_rhs2 (perm,
9810 vect_get_slp_vect_def (child, i));
9811 update_stmt (perm);
9812 }
9813 }
9814 }
9815 }
9816 }
9817
9818 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9819
9820 void
9821 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9822 {
9823 slp_instance instance;
9824 unsigned int i;
9825
9826 hash_map<slp_tree, slp_scc_info> scc_info;
9827 int maxdfs = 0;
9828 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9829 {
9830 slp_tree node = SLP_INSTANCE_TREE (instance);
9831 if (dump_enabled_p ())
9832 {
9833 dump_printf_loc (MSG_NOTE, vect_location,
9834 "Vectorizing SLP tree:\n");
9835 /* ??? Dump all? */
9836 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9837 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9838 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9839 vect_print_slp_graph (MSG_NOTE, vect_location,
9840 SLP_INSTANCE_TREE (instance));
9841 }
9842 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
9843 have a PHI be the node breaking the cycle. */
9844 auto_vec<slp_tree> stack;
9845 if (!scc_info.get (node))
9846 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9847
9848 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9849 vectorize_slp_instance_root_stmt (node, instance);
9850
9851 if (dump_enabled_p ())
9852 dump_printf_loc (MSG_NOTE, vect_location,
9853 "vectorizing stmts using SLP.\n");
9854 }
9855
9856 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9857 {
9858 slp_tree root = SLP_INSTANCE_TREE (instance);
9859 stmt_vec_info store_info;
9860 unsigned int j;
9861
9862 /* Remove scalar call stmts. Do not do this for basic-block
9863 vectorization as not all uses may be vectorized.
9864 ??? Why should this be necessary? DCE should be able to
9865 remove the stmts itself.
9866 ??? For BB vectorization we can as well remove scalar
9867 stmts starting from the SLP tree root if they have no
9868 uses. */
9869 if (is_a <loop_vec_info> (vinfo))
9870 vect_remove_slp_scalar_calls (vinfo, root);
9871
9872 /* Remove the vectorized stores' original scalar stmts. */
9873 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9874 {
9875 if (!STMT_VINFO_DATA_REF (store_info)
9876 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9877 break;
9878
9879 store_info = vect_orig_stmt (store_info);
9880 /* Free the attached stmt_vec_info and remove the stmt. */
9881 vinfo->remove_stmt (store_info);
9882
9883 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
9884 to not crash in vect_free_slp_tree later. */
9885 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9886 SLP_TREE_REPRESENTATIVE (root) = NULL;
9887 }
9888 }
9889 }