]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/tree-vect-slp.cc
tree-optimization/114736 - SLP DFS walk issue
[thirdparty/gcc.git] / gcc / tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
55
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
70
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
73
74 void
75 vect_slp_init (void)
76 {
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
78 }
79
80 void
81 vect_slp_fini (void)
82 {
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
87 }
88
89 void *
90 _slp_tree::operator new (size_t n)
91 {
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
94 }
95
96 void
97 _slp_tree::operator delete (void *node, size_t n)
98 {
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
101 }
102
103
104 /* Initialize a SLP node. */
105
106 _slp_tree::_slp_tree ()
107 {
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_DEFS (this) = vNULL;
116 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
117 SLP_TREE_CHILDREN (this) = vNULL;
118 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
119 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
120 SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
121 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
122 SLP_TREE_CODE (this) = ERROR_MARK;
123 SLP_TREE_VECTYPE (this) = NULL_TREE;
124 SLP_TREE_REPRESENTATIVE (this) = NULL;
125 SLP_TREE_REF_COUNT (this) = 1;
126 this->failed = NULL;
127 this->max_nunits = 1;
128 this->lanes = 0;
129 }
130
131 /* Tear down a SLP node. */
132
133 _slp_tree::~_slp_tree ()
134 {
135 if (this->prev_node)
136 this->prev_node->next_node = this->next_node;
137 else
138 slp_first_node = this->next_node;
139 if (this->next_node)
140 this->next_node->prev_node = this->prev_node;
141 SLP_TREE_CHILDREN (this).release ();
142 SLP_TREE_SCALAR_STMTS (this).release ();
143 SLP_TREE_SCALAR_OPS (this).release ();
144 SLP_TREE_VEC_DEFS (this).release ();
145 SLP_TREE_LOAD_PERMUTATION (this).release ();
146 SLP_TREE_LANE_PERMUTATION (this).release ();
147 SLP_TREE_SIMD_CLONE_INFO (this).release ();
148 if (this->failed)
149 free (failed);
150 }
151
152 /* Push the single SSA definition in DEF to the vector of vector defs. */
153
154 void
155 _slp_tree::push_vec_def (gimple *def)
156 {
157 if (gphi *phi = dyn_cast <gphi *> (def))
158 vec_defs.quick_push (gimple_phi_result (phi));
159 else
160 {
161 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
162 vec_defs.quick_push (get_def_from_ptr (defop));
163 }
164 }
165
166 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
167
168 void
169 vect_free_slp_tree (slp_tree node)
170 {
171 int i;
172 slp_tree child;
173
174 if (--SLP_TREE_REF_COUNT (node) != 0)
175 return;
176
177 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
178 if (child)
179 vect_free_slp_tree (child);
180
181 /* If the node defines any SLP only patterns then those patterns are no
182 longer valid and should be removed. */
183 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
184 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
185 {
186 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
187 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
188 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
189 }
190
191 delete node;
192 }
193
194 /* Return a location suitable for dumpings related to the SLP instance. */
195
196 dump_user_location_t
197 _slp_instance::location () const
198 {
199 if (!root_stmts.is_empty ())
200 return root_stmts[0]->stmt;
201 else
202 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
203 }
204
205
206 /* Free the memory allocated for the SLP instance. */
207
208 void
209 vect_free_slp_instance (slp_instance instance)
210 {
211 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
212 SLP_INSTANCE_LOADS (instance).release ();
213 SLP_INSTANCE_ROOT_STMTS (instance).release ();
214 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
215 instance->subgraph_entries.release ();
216 instance->cost_vec.release ();
217 free (instance);
218 }
219
220
221 /* Create an SLP node for SCALAR_STMTS. */
222
223 slp_tree
224 vect_create_new_slp_node (unsigned nops, tree_code code)
225 {
226 slp_tree node = new _slp_tree;
227 SLP_TREE_SCALAR_STMTS (node) = vNULL;
228 SLP_TREE_CHILDREN (node).create (nops);
229 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
230 SLP_TREE_CODE (node) = code;
231 return node;
232 }
233 /* Create an SLP node for SCALAR_STMTS. */
234
235 static slp_tree
236 vect_create_new_slp_node (slp_tree node,
237 vec<stmt_vec_info> scalar_stmts, unsigned nops)
238 {
239 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
240 SLP_TREE_CHILDREN (node).create (nops);
241 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
242 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
243 SLP_TREE_LANES (node) = scalar_stmts.length ();
244 return node;
245 }
246
247 /* Create an SLP node for SCALAR_STMTS. */
248
249 static slp_tree
250 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
251 {
252 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
253 }
254
255 /* Create an SLP node for OPS. */
256
257 static slp_tree
258 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
259 {
260 SLP_TREE_SCALAR_OPS (node) = ops;
261 SLP_TREE_DEF_TYPE (node) = vect_external_def;
262 SLP_TREE_LANES (node) = ops.length ();
263 return node;
264 }
265
266 /* Create an SLP node for OPS. */
267
268 static slp_tree
269 vect_create_new_slp_node (vec<tree> ops)
270 {
271 return vect_create_new_slp_node (new _slp_tree, ops);
272 }
273
274
275 /* This structure is used in creation of an SLP tree. Each instance
276 corresponds to the same operand in a group of scalar stmts in an SLP
277 node. */
278 typedef struct _slp_oprnd_info
279 {
280 /* Def-stmts for the operands. */
281 vec<stmt_vec_info> def_stmts;
282 /* Operands. */
283 vec<tree> ops;
284 /* Information about the first statement, its vector def-type, type, the
285 operand itself in case it's constant, and an indication if it's a pattern
286 stmt and gather/scatter info. */
287 tree first_op_type;
288 enum vect_def_type first_dt;
289 bool any_pattern;
290 bool first_gs_p;
291 gather_scatter_info first_gs_info;
292 } *slp_oprnd_info;
293
294
295 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
296 operand. */
297 static vec<slp_oprnd_info>
298 vect_create_oprnd_info (int nops, int group_size)
299 {
300 int i;
301 slp_oprnd_info oprnd_info;
302 vec<slp_oprnd_info> oprnds_info;
303
304 oprnds_info.create (nops);
305 for (i = 0; i < nops; i++)
306 {
307 oprnd_info = XNEW (struct _slp_oprnd_info);
308 oprnd_info->def_stmts.create (group_size);
309 oprnd_info->ops.create (group_size);
310 oprnd_info->first_dt = vect_uninitialized_def;
311 oprnd_info->first_op_type = NULL_TREE;
312 oprnd_info->any_pattern = false;
313 oprnd_info->first_gs_p = false;
314 oprnds_info.quick_push (oprnd_info);
315 }
316
317 return oprnds_info;
318 }
319
320
321 /* Free operands info. */
322
323 static void
324 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
325 {
326 int i;
327 slp_oprnd_info oprnd_info;
328
329 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
330 {
331 oprnd_info->def_stmts.release ();
332 oprnd_info->ops.release ();
333 XDELETE (oprnd_info);
334 }
335
336 oprnds_info.release ();
337 }
338
339 /* Return the execution frequency of NODE (so that a higher value indicates
340 a "more important" node when optimizing for speed). */
341
342 static sreal
343 vect_slp_node_weight (slp_tree node)
344 {
345 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
346 basic_block bb = gimple_bb (stmt_info->stmt);
347 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
348 }
349
350 /* Return true if STMTS contains a pattern statement. */
351
352 static bool
353 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
354 {
355 stmt_vec_info stmt_info;
356 unsigned int i;
357 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
358 if (is_pattern_stmt_p (stmt_info))
359 return true;
360 return false;
361 }
362
363 /* Return true when all lanes in the external or constant NODE have
364 the same value. */
365
366 static bool
367 vect_slp_tree_uniform_p (slp_tree node)
368 {
369 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
370 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
371
372 /* Pre-exsting vectors. */
373 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
374 return false;
375
376 unsigned i;
377 tree op, first = NULL_TREE;
378 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
379 if (!first)
380 first = op;
381 else if (!operand_equal_p (first, op, 0))
382 return false;
383
384 return true;
385 }
386
387 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
388 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
389 of the chain. */
390
391 int
392 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
393 stmt_vec_info first_stmt_info)
394 {
395 stmt_vec_info next_stmt_info = first_stmt_info;
396 int result = 0;
397
398 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
399 return -1;
400
401 do
402 {
403 if (next_stmt_info == stmt_info)
404 return result;
405 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
406 if (next_stmt_info)
407 result += DR_GROUP_GAP (next_stmt_info);
408 }
409 while (next_stmt_info);
410
411 return -1;
412 }
413
414 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
415 using the method implemented by duplicate_and_interleave. Return true
416 if so, returning the number of intermediate vectors in *NVECTORS_OUT
417 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
418 (if nonnull). */
419
420 bool
421 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
422 tree elt_type, unsigned int *nvectors_out,
423 tree *vector_type_out,
424 tree *permutes)
425 {
426 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
427 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
428 return false;
429
430 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
431 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
432 unsigned int nvectors = 1;
433 for (;;)
434 {
435 scalar_int_mode int_mode;
436 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
437 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
438 {
439 /* Get the natural vector type for this SLP group size. */
440 tree int_type = build_nonstandard_integer_type
441 (GET_MODE_BITSIZE (int_mode), 1);
442 tree vector_type
443 = get_vectype_for_scalar_type (vinfo, int_type, count);
444 poly_int64 half_nelts;
445 if (vector_type
446 && VECTOR_MODE_P (TYPE_MODE (vector_type))
447 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
448 GET_MODE_SIZE (base_vector_mode))
449 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
450 2, &half_nelts))
451 {
452 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
453 together into elements of type INT_TYPE and using the result
454 to build NVECTORS vectors. */
455 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
456 vec_perm_builder sel1 (nelts, 2, 3);
457 vec_perm_builder sel2 (nelts, 2, 3);
458
459 for (unsigned int i = 0; i < 3; ++i)
460 {
461 sel1.quick_push (i);
462 sel1.quick_push (i + nelts);
463 sel2.quick_push (half_nelts + i);
464 sel2.quick_push (half_nelts + i + nelts);
465 }
466 vec_perm_indices indices1 (sel1, 2, nelts);
467 vec_perm_indices indices2 (sel2, 2, nelts);
468 machine_mode vmode = TYPE_MODE (vector_type);
469 if (can_vec_perm_const_p (vmode, vmode, indices1)
470 && can_vec_perm_const_p (vmode, vmode, indices2))
471 {
472 if (nvectors_out)
473 *nvectors_out = nvectors;
474 if (vector_type_out)
475 *vector_type_out = vector_type;
476 if (permutes)
477 {
478 permutes[0] = vect_gen_perm_mask_checked (vector_type,
479 indices1);
480 permutes[1] = vect_gen_perm_mask_checked (vector_type,
481 indices2);
482 }
483 return true;
484 }
485 }
486 }
487 if (!multiple_p (elt_bytes, 2, &elt_bytes))
488 return false;
489 nvectors *= 2;
490 }
491 }
492
493 /* Return true if DTA and DTB match. */
494
495 static bool
496 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
497 {
498 return (dta == dtb
499 || ((dta == vect_external_def || dta == vect_constant_def)
500 && (dtb == vect_external_def || dtb == vect_constant_def)));
501 }
502
503 static const int cond_expr_maps[3][5] = {
504 { 4, -1, -2, 1, 2 },
505 { 4, -2, -1, 1, 2 },
506 { 4, -1, -2, 2, 1 }
507 };
508 static const int arg0_map[] = { 1, 0 };
509 static const int arg1_map[] = { 1, 1 };
510 static const int arg2_map[] = { 1, 2 };
511 static const int arg1_arg4_map[] = { 2, 1, 4 };
512 static const int arg3_arg2_map[] = { 2, 3, 2 };
513 static const int op1_op0_map[] = { 2, 1, 0 };
514 static const int off_map[] = { 1, -3 };
515 static const int off_op0_map[] = { 2, -3, 0 };
516 static const int off_arg2_map[] = { 2, -3, 2 };
517 static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
518 static const int mask_call_maps[6][7] = {
519 { 1, 1, },
520 { 2, 1, 2, },
521 { 3, 1, 2, 3, },
522 { 4, 1, 2, 3, 4, },
523 { 5, 1, 2, 3, 4, 5, },
524 { 6, 1, 2, 3, 4, 5, 6 },
525 };
526
527 /* For most SLP statements, there is a one-to-one mapping between
528 gimple arguments and child nodes. If that is not true for STMT,
529 return an array that contains:
530
531 - the number of child nodes, followed by
532 - for each child node, the index of the argument associated with that node.
533 The special index -1 is the first operand of an embedded comparison and
534 the special index -2 is the second operand of an embedded comparison.
535 The special indes -3 is the offset of a gather as analyzed by
536 vect_check_gather_scatter.
537
538 SWAP is as for vect_get_and_check_slp_defs. */
539
540 static const int *
541 vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
542 unsigned char swap = 0)
543 {
544 if (auto assign = dyn_cast<const gassign *> (stmt))
545 {
546 if (gimple_assign_rhs_code (assign) == COND_EXPR
547 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
548 return cond_expr_maps[swap];
549 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
550 && swap)
551 return op1_op0_map;
552 if (gather_scatter_p)
553 return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
554 ? off_op0_map : off_map);
555 }
556 gcc_assert (!swap);
557 if (auto call = dyn_cast<const gcall *> (stmt))
558 {
559 if (gimple_call_internal_p (call))
560 switch (gimple_call_internal_fn (call))
561 {
562 case IFN_MASK_LOAD:
563 return gather_scatter_p ? off_arg2_map : arg2_map;
564
565 case IFN_GATHER_LOAD:
566 return arg1_map;
567
568 case IFN_MASK_GATHER_LOAD:
569 case IFN_MASK_LEN_GATHER_LOAD:
570 return arg1_arg4_map;
571
572 case IFN_MASK_STORE:
573 return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
574
575 case IFN_MASK_CALL:
576 {
577 unsigned nargs = gimple_call_num_args (call);
578 if (nargs >= 2 && nargs <= 7)
579 return mask_call_maps[nargs-2];
580 else
581 return nullptr;
582 }
583
584 case IFN_CLZ:
585 case IFN_CTZ:
586 return arg0_map;
587
588 default:
589 break;
590 }
591 }
592 return nullptr;
593 }
594
595 /* Return the SLP node child index for operand OP of STMT. */
596
597 int
598 vect_slp_child_index_for_operand (const gimple *stmt, int op,
599 bool gather_scatter_p)
600 {
601 const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
602 if (!opmap)
603 return op;
604 for (int i = 1; i < 1 + opmap[0]; ++i)
605 if (opmap[i] == op)
606 return i - 1;
607 gcc_unreachable ();
608 }
609
610 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
611 they are of a valid type and that they match the defs of the first stmt of
612 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
613 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
614 indicates swap is required for cond_expr stmts. Specifically, SWAP
615 is 1 if STMT is cond and operands of comparison need to be swapped;
616 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
617
618 If there was a fatal error return -1; if the error could be corrected by
619 swapping operands of father node of this one, return 1; if everything is
620 ok return 0. */
621 static int
622 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
623 bool *skip_args,
624 vec<stmt_vec_info> stmts, unsigned stmt_num,
625 vec<slp_oprnd_info> *oprnds_info)
626 {
627 stmt_vec_info stmt_info = stmts[stmt_num];
628 tree oprnd;
629 unsigned int i, number_of_oprnds;
630 enum vect_def_type dt = vect_uninitialized_def;
631 slp_oprnd_info oprnd_info;
632 gather_scatter_info gs_info;
633 unsigned int gs_op = -1u;
634 unsigned int commutative_op = -1U;
635 bool first = stmt_num == 0;
636
637 if (!is_a<gcall *> (stmt_info->stmt)
638 && !is_a<gassign *> (stmt_info->stmt)
639 && !is_a<gphi *> (stmt_info->stmt))
640 return -1;
641
642 number_of_oprnds = gimple_num_args (stmt_info->stmt);
643 const int *map
644 = vect_get_operand_map (stmt_info->stmt,
645 STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
646 if (map)
647 number_of_oprnds = *map++;
648 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
649 {
650 if (gimple_call_internal_p (stmt))
651 {
652 internal_fn ifn = gimple_call_internal_fn (stmt);
653 commutative_op = first_commutative_argument (ifn);
654 }
655 }
656 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
657 {
658 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
659 commutative_op = 0;
660 }
661
662 bool swapped = (swap != 0);
663 bool backedge = false;
664 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
665 for (i = 0; i < number_of_oprnds; i++)
666 {
667 oprnd_info = (*oprnds_info)[i];
668 int opno = map ? map[i] : int (i);
669 if (opno == -3)
670 {
671 gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
672 if (!is_a <loop_vec_info> (vinfo)
673 || !vect_check_gather_scatter (stmt_info,
674 as_a <loop_vec_info> (vinfo),
675 first ? &oprnd_info->first_gs_info
676 : &gs_info))
677 return -1;
678
679 if (first)
680 {
681 oprnd_info->first_gs_p = true;
682 oprnd = oprnd_info->first_gs_info.offset;
683 }
684 else
685 {
686 gs_op = i;
687 oprnd = gs_info.offset;
688 }
689 }
690 else if (opno < 0)
691 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
692 else
693 {
694 oprnd = gimple_arg (stmt_info->stmt, opno);
695 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
696 {
697 edge e = gimple_phi_arg_edge (stmt, opno);
698 backedge = (is_a <bb_vec_info> (vinfo)
699 ? e->flags & EDGE_DFS_BACK
700 : dominated_by_p (CDI_DOMINATORS, e->src,
701 gimple_bb (stmt_info->stmt)));
702 }
703 }
704 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
705 oprnd = TREE_OPERAND (oprnd, 0);
706
707 stmt_vec_info def_stmt_info;
708 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
709 {
710 if (dump_enabled_p ())
711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
712 "Build SLP failed: can't analyze def for %T\n",
713 oprnd);
714
715 return -1;
716 }
717
718 if (skip_args[i])
719 {
720 oprnd_info->def_stmts.quick_push (NULL);
721 oprnd_info->ops.quick_push (NULL_TREE);
722 oprnd_info->first_dt = vect_uninitialized_def;
723 continue;
724 }
725
726 oprnd_info->def_stmts.quick_push (def_stmt_info);
727 oprnd_info->ops.quick_push (oprnd);
728
729 if (def_stmt_info
730 && is_pattern_stmt_p (def_stmt_info))
731 {
732 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
733 != def_stmt_info)
734 oprnd_info->any_pattern = true;
735 else
736 /* If we promote this to external use the original stmt def. */
737 oprnd_info->ops.last ()
738 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
739 }
740
741 /* If there's a extern def on a backedge make sure we can
742 code-generate at the region start.
743 ??? This is another case that could be fixed by adjusting
744 how we split the function but at the moment we'd have conflicting
745 goals there. */
746 if (backedge
747 && dts[i] == vect_external_def
748 && is_a <bb_vec_info> (vinfo)
749 && TREE_CODE (oprnd) == SSA_NAME
750 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
751 && !dominated_by_p (CDI_DOMINATORS,
752 as_a <bb_vec_info> (vinfo)->bbs[0],
753 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
754 {
755 if (dump_enabled_p ())
756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
757 "Build SLP failed: extern def %T only defined "
758 "on backedge\n", oprnd);
759 return -1;
760 }
761
762 if (first)
763 {
764 tree type = TREE_TYPE (oprnd);
765 dt = dts[i];
766
767 /* For the swapping logic below force vect_reduction_def
768 for the reduction op in a SLP reduction group. */
769 if (!STMT_VINFO_DATA_REF (stmt_info)
770 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
771 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
772 && def_stmt_info)
773 dts[i] = dt = vect_reduction_def;
774
775 /* Check the types of the definition. */
776 switch (dt)
777 {
778 case vect_external_def:
779 case vect_constant_def:
780 case vect_internal_def:
781 case vect_reduction_def:
782 case vect_induction_def:
783 case vect_nested_cycle:
784 case vect_first_order_recurrence:
785 break;
786
787 default:
788 /* FORNOW: Not supported. */
789 if (dump_enabled_p ())
790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
791 "Build SLP failed: illegal type of def %T\n",
792 oprnd);
793 return -1;
794 }
795
796 oprnd_info->first_dt = dt;
797 oprnd_info->first_op_type = type;
798 }
799 }
800 if (first)
801 return 0;
802
803 /* Now match the operand definition types to that of the first stmt. */
804 for (i = 0; i < number_of_oprnds;)
805 {
806 if (skip_args[i])
807 {
808 ++i;
809 continue;
810 }
811
812 oprnd_info = (*oprnds_info)[i];
813 dt = dts[i];
814 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
815 oprnd = oprnd_info->ops[stmt_num];
816 tree type = TREE_TYPE (oprnd);
817
818 if (!types_compatible_p (oprnd_info->first_op_type, type))
819 {
820 if (dump_enabled_p ())
821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
822 "Build SLP failed: different operand types\n");
823 return 1;
824 }
825
826 if ((gs_op == i) != oprnd_info->first_gs_p)
827 {
828 if (dump_enabled_p ())
829 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
830 "Build SLP failed: mixed gather and non-gather\n");
831 return 1;
832 }
833 else if (gs_op == i)
834 {
835 if (!operand_equal_p (oprnd_info->first_gs_info.base,
836 gs_info.base))
837 {
838 if (dump_enabled_p ())
839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
840 "Build SLP failed: different gather base\n");
841 return 1;
842 }
843 if (oprnd_info->first_gs_info.scale != gs_info.scale)
844 {
845 if (dump_enabled_p ())
846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
847 "Build SLP failed: different gather scale\n");
848 return 1;
849 }
850 }
851
852 /* Not first stmt of the group, check that the def-stmt/s match
853 the def-stmt/s of the first stmt. Allow different definition
854 types for reduction chains: the first stmt must be a
855 vect_reduction_def (a phi node), and the rest
856 end in the reduction chain. */
857 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
858 && !(oprnd_info->first_dt == vect_reduction_def
859 && !STMT_VINFO_DATA_REF (stmt_info)
860 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
861 && def_stmt_info
862 && !STMT_VINFO_DATA_REF (def_stmt_info)
863 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
864 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
865 || (!STMT_VINFO_DATA_REF (stmt_info)
866 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
867 && ((!def_stmt_info
868 || STMT_VINFO_DATA_REF (def_stmt_info)
869 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
870 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
871 != (oprnd_info->first_dt != vect_reduction_def))))
872 {
873 /* Try swapping operands if we got a mismatch. For BB
874 vectorization only in case it will clearly improve things. */
875 if (i == commutative_op && !swapped
876 && (!is_a <bb_vec_info> (vinfo)
877 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
878 dts[i+1])
879 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
880 || vect_def_types_match
881 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
882 {
883 if (dump_enabled_p ())
884 dump_printf_loc (MSG_NOTE, vect_location,
885 "trying swapped operands\n");
886 std::swap (dts[i], dts[i+1]);
887 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
888 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
889 std::swap ((*oprnds_info)[i]->ops[stmt_num],
890 (*oprnds_info)[i+1]->ops[stmt_num]);
891 swapped = true;
892 continue;
893 }
894
895 if (is_a <bb_vec_info> (vinfo)
896 && !oprnd_info->any_pattern)
897 {
898 /* Now for commutative ops we should see whether we can
899 make the other operand matching. */
900 if (dump_enabled_p ())
901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
902 "treating operand as external\n");
903 oprnd_info->first_dt = dt = vect_external_def;
904 }
905 else
906 {
907 if (dump_enabled_p ())
908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
909 "Build SLP failed: different types\n");
910 return 1;
911 }
912 }
913
914 /* Make sure to demote the overall operand to external. */
915 if (dt == vect_external_def)
916 oprnd_info->first_dt = vect_external_def;
917 /* For a SLP reduction chain we want to duplicate the reduction to
918 each of the chain members. That gets us a sane SLP graph (still
919 the stmts are not 100% correct wrt the initial values). */
920 else if ((dt == vect_internal_def
921 || dt == vect_reduction_def)
922 && oprnd_info->first_dt == vect_reduction_def
923 && !STMT_VINFO_DATA_REF (stmt_info)
924 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
925 && !STMT_VINFO_DATA_REF (def_stmt_info)
926 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
927 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
928 {
929 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
930 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
931 }
932
933 ++i;
934 }
935
936 /* Swap operands. */
937 if (swapped)
938 {
939 if (dump_enabled_p ())
940 dump_printf_loc (MSG_NOTE, vect_location,
941 "swapped operands to match def types in %G",
942 stmt_info->stmt);
943 }
944
945 return 0;
946 }
947
948 /* Return true if call statements CALL1 and CALL2 are similar enough
949 to be combined into the same SLP group. */
950
951 bool
952 compatible_calls_p (gcall *call1, gcall *call2)
953 {
954 unsigned int nargs = gimple_call_num_args (call1);
955 if (nargs != gimple_call_num_args (call2))
956 return false;
957
958 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
959 return false;
960
961 if (gimple_call_internal_p (call1))
962 {
963 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
964 TREE_TYPE (gimple_call_lhs (call2))))
965 return false;
966 for (unsigned int i = 0; i < nargs; ++i)
967 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
968 TREE_TYPE (gimple_call_arg (call2, i))))
969 return false;
970 }
971 else
972 {
973 if (!operand_equal_p (gimple_call_fn (call1),
974 gimple_call_fn (call2), 0))
975 return false;
976
977 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
978 return false;
979 }
980
981 /* Check that any unvectorized arguments are equal. */
982 if (const int *map = vect_get_operand_map (call1))
983 {
984 unsigned int nkept = *map++;
985 unsigned int mapi = 0;
986 for (unsigned int i = 0; i < nargs; ++i)
987 if (mapi < nkept && map[mapi] == int (i))
988 mapi += 1;
989 else if (!operand_equal_p (gimple_call_arg (call1, i),
990 gimple_call_arg (call2, i)))
991 return false;
992 }
993
994 return true;
995 }
996
997 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
998 caller's attempt to find the vector type in STMT_INFO with the narrowest
999 element type. Return true if VECTYPE is nonnull and if it is valid
1000 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1001 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1002 vect_build_slp_tree. */
1003
1004 static bool
1005 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1006 unsigned int group_size,
1007 tree vectype, poly_uint64 *max_nunits)
1008 {
1009 if (!vectype)
1010 {
1011 if (dump_enabled_p ())
1012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1013 "Build SLP failed: unsupported data-type in %G\n",
1014 stmt_info->stmt);
1015 /* Fatal mismatch. */
1016 return false;
1017 }
1018
1019 /* If populating the vector type requires unrolling then fail
1020 before adjusting *max_nunits for basic-block vectorization. */
1021 if (is_a <bb_vec_info> (vinfo)
1022 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1023 {
1024 if (dump_enabled_p ())
1025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1026 "Build SLP failed: unrolling required "
1027 "in basic block SLP\n");
1028 /* Fatal mismatch. */
1029 return false;
1030 }
1031
1032 /* In case of multiple types we need to detect the smallest type. */
1033 vect_update_max_nunits (max_nunits, vectype);
1034 return true;
1035 }
1036
1037 /* Verify if the scalar stmts STMTS are isomorphic, require data
1038 permutation or are of unsupported types of operation. Return
1039 true if they are, otherwise return false and indicate in *MATCHES
1040 which stmts are not isomorphic to the first one. If MATCHES[0]
1041 is false then this indicates the comparison could not be
1042 carried out or the stmts will never be vectorized by SLP.
1043
1044 Note COND_EXPR is possibly isomorphic to another one after swapping its
1045 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1046 the first stmt by swapping the two operands of comparison; set SWAP[i]
1047 to 2 if stmt I is isormorphic to the first stmt by inverting the code
1048 of comparison. Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
1049 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
1050
1051 static bool
1052 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1053 vec<stmt_vec_info> stmts, unsigned int group_size,
1054 poly_uint64 *max_nunits, bool *matches,
1055 bool *two_operators, tree *node_vectype)
1056 {
1057 unsigned int i;
1058 stmt_vec_info first_stmt_info = stmts[0];
1059 code_helper first_stmt_code = ERROR_MARK;
1060 code_helper alt_stmt_code = ERROR_MARK;
1061 code_helper rhs_code = ERROR_MARK;
1062 code_helper first_cond_code = ERROR_MARK;
1063 tree lhs;
1064 bool need_same_oprnds = false;
1065 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
1066 stmt_vec_info first_load = NULL, prev_first_load = NULL;
1067 bool first_stmt_ldst_p = false, ldst_p = false;
1068 bool first_stmt_phi_p = false, phi_p = false;
1069 bool maybe_soft_fail = false;
1070 tree soft_fail_nunits_vectype = NULL_TREE;
1071
1072 /* For every stmt in NODE find its def stmt/s. */
1073 stmt_vec_info stmt_info;
1074 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1075 {
1076 gimple *stmt = stmt_info->stmt;
1077 swap[i] = 0;
1078 matches[i] = false;
1079
1080 if (dump_enabled_p ())
1081 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1082
1083 /* Fail to vectorize statements marked as unvectorizable, throw
1084 or are volatile. */
1085 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1086 || stmt_can_throw_internal (cfun, stmt)
1087 || gimple_has_volatile_ops (stmt))
1088 {
1089 if (dump_enabled_p ())
1090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1091 "Build SLP failed: unvectorizable statement %G",
1092 stmt);
1093 /* ??? For BB vectorization we want to commutate operands in a way
1094 to shuffle all unvectorizable defs into one operand and have
1095 the other still vectorized. The following doesn't reliably
1096 work for this though but it's the easiest we can do here. */
1097 if (is_a <bb_vec_info> (vinfo) && i != 0)
1098 continue;
1099 /* Fatal mismatch. */
1100 matches[0] = false;
1101 return false;
1102 }
1103
1104 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1105 lhs = gimple_get_lhs (stmt);
1106 if (lhs == NULL_TREE
1107 && (!call_stmt
1108 || !gimple_call_internal_p (stmt)
1109 || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1110 {
1111 if (dump_enabled_p ())
1112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113 "Build SLP failed: not GIMPLE_ASSIGN nor "
1114 "GIMPLE_CALL %G", stmt);
1115 if (is_a <bb_vec_info> (vinfo) && i != 0)
1116 continue;
1117 /* Fatal mismatch. */
1118 matches[0] = false;
1119 return false;
1120 }
1121
1122 tree nunits_vectype;
1123 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1124 &nunits_vectype, group_size))
1125 {
1126 if (is_a <bb_vec_info> (vinfo) && i != 0)
1127 continue;
1128 /* Fatal mismatch. */
1129 matches[0] = false;
1130 return false;
1131 }
1132 /* Record nunits required but continue analysis, producing matches[]
1133 as if nunits was not an issue. This allows splitting of groups
1134 to happen. */
1135 if (nunits_vectype
1136 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1137 nunits_vectype, max_nunits))
1138 {
1139 gcc_assert (is_a <bb_vec_info> (vinfo));
1140 maybe_soft_fail = true;
1141 soft_fail_nunits_vectype = nunits_vectype;
1142 }
1143
1144 gcc_assert (vectype);
1145
1146 if (call_stmt)
1147 {
1148 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1149 if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1150 rhs_code = cfn;
1151 else
1152 rhs_code = CALL_EXPR;
1153
1154 if (cfn == CFN_MASK_LOAD
1155 || cfn == CFN_GATHER_LOAD
1156 || cfn == CFN_MASK_GATHER_LOAD
1157 || cfn == CFN_MASK_LEN_GATHER_LOAD)
1158 ldst_p = true;
1159 else if (cfn == CFN_MASK_STORE)
1160 {
1161 ldst_p = true;
1162 rhs_code = CFN_MASK_STORE;
1163 }
1164 else if ((cfn != CFN_LAST
1165 && cfn != CFN_MASK_CALL
1166 && internal_fn_p (cfn)
1167 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1168 || gimple_call_tail_p (call_stmt)
1169 || gimple_call_noreturn_p (call_stmt)
1170 || gimple_call_chain (call_stmt))
1171 {
1172 if (dump_enabled_p ())
1173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1174 "Build SLP failed: unsupported call type %G",
1175 (gimple *) call_stmt);
1176 if (is_a <bb_vec_info> (vinfo) && i != 0)
1177 continue;
1178 /* Fatal mismatch. */
1179 matches[0] = false;
1180 return false;
1181 }
1182 }
1183 else if (gimple_code (stmt) == GIMPLE_PHI)
1184 {
1185 rhs_code = ERROR_MARK;
1186 phi_p = true;
1187 }
1188 else
1189 {
1190 rhs_code = gimple_assign_rhs_code (stmt);
1191 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1192 }
1193
1194 /* Check the operation. */
1195 if (i == 0)
1196 {
1197 *node_vectype = vectype;
1198 first_stmt_code = rhs_code;
1199 first_stmt_ldst_p = ldst_p;
1200 first_stmt_phi_p = phi_p;
1201
1202 /* Shift arguments should be equal in all the packed stmts for a
1203 vector shift with scalar shift operand. */
1204 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1205 || rhs_code == LROTATE_EXPR
1206 || rhs_code == RROTATE_EXPR)
1207 {
1208 /* First see if we have a vector/vector shift. */
1209 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1210 {
1211 /* No vector/vector shift, try for a vector/scalar shift. */
1212 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1213 {
1214 if (dump_enabled_p ())
1215 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1216 "Build SLP failed: "
1217 "op not supported by target.\n");
1218 if (is_a <bb_vec_info> (vinfo) && i != 0)
1219 continue;
1220 /* Fatal mismatch. */
1221 matches[0] = false;
1222 return false;
1223 }
1224 need_same_oprnds = true;
1225 first_op1 = gimple_assign_rhs2 (stmt);
1226 }
1227 }
1228 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1229 {
1230 need_same_oprnds = true;
1231 first_op1 = gimple_assign_rhs2 (stmt);
1232 }
1233 else if (!ldst_p
1234 && rhs_code == BIT_FIELD_REF)
1235 {
1236 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1237 if (!is_a <bb_vec_info> (vinfo)
1238 || TREE_CODE (vec) != SSA_NAME
1239 /* When the element types are not compatible we pun the
1240 source to the target vectype which requires equal size. */
1241 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1242 || !types_compatible_p (TREE_TYPE (vectype),
1243 TREE_TYPE (TREE_TYPE (vec))))
1244 && !operand_equal_p (TYPE_SIZE (vectype),
1245 TYPE_SIZE (TREE_TYPE (vec)))))
1246 {
1247 if (dump_enabled_p ())
1248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249 "Build SLP failed: "
1250 "BIT_FIELD_REF not supported\n");
1251 /* Fatal mismatch. */
1252 matches[0] = false;
1253 return false;
1254 }
1255 }
1256 else if (rhs_code == CFN_DIV_POW2)
1257 {
1258 need_same_oprnds = true;
1259 first_op1 = gimple_call_arg (call_stmt, 1);
1260 }
1261 }
1262 else
1263 {
1264 if (first_stmt_code != rhs_code
1265 && alt_stmt_code == ERROR_MARK)
1266 alt_stmt_code = rhs_code;
1267 if ((first_stmt_code != rhs_code
1268 && (first_stmt_code != IMAGPART_EXPR
1269 || rhs_code != REALPART_EXPR)
1270 && (first_stmt_code != REALPART_EXPR
1271 || rhs_code != IMAGPART_EXPR)
1272 /* Handle mismatches in plus/minus by computing both
1273 and merging the results. */
1274 && !((first_stmt_code == PLUS_EXPR
1275 || first_stmt_code == MINUS_EXPR)
1276 && (alt_stmt_code == PLUS_EXPR
1277 || alt_stmt_code == MINUS_EXPR)
1278 && rhs_code == alt_stmt_code)
1279 && !(first_stmt_code.is_tree_code ()
1280 && rhs_code.is_tree_code ()
1281 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1282 == tcc_comparison)
1283 && (swap_tree_comparison (tree_code (first_stmt_code))
1284 == tree_code (rhs_code)))
1285 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1286 && (first_stmt_code == ARRAY_REF
1287 || first_stmt_code == BIT_FIELD_REF
1288 || first_stmt_code == INDIRECT_REF
1289 || first_stmt_code == COMPONENT_REF
1290 || first_stmt_code == MEM_REF)
1291 && (rhs_code == ARRAY_REF
1292 || rhs_code == BIT_FIELD_REF
1293 || rhs_code == INDIRECT_REF
1294 || rhs_code == COMPONENT_REF
1295 || rhs_code == MEM_REF)))
1296 || (ldst_p
1297 && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1298 != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1299 || (ldst_p
1300 && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1301 != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1302 || first_stmt_ldst_p != ldst_p
1303 || first_stmt_phi_p != phi_p)
1304 {
1305 if (dump_enabled_p ())
1306 {
1307 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1308 "Build SLP failed: different operation "
1309 "in stmt %G", stmt);
1310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1311 "original stmt %G", first_stmt_info->stmt);
1312 }
1313 /* Mismatch. */
1314 continue;
1315 }
1316
1317 if (!ldst_p
1318 && first_stmt_code == BIT_FIELD_REF
1319 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1320 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1321 {
1322 if (dump_enabled_p ())
1323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1324 "Build SLP failed: different BIT_FIELD_REF "
1325 "arguments in %G", stmt);
1326 /* Mismatch. */
1327 continue;
1328 }
1329
1330 if (call_stmt
1331 && first_stmt_code != CFN_MASK_LOAD
1332 && first_stmt_code != CFN_MASK_STORE)
1333 {
1334 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1335 call_stmt))
1336 {
1337 if (dump_enabled_p ())
1338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1339 "Build SLP failed: different calls in %G",
1340 stmt);
1341 /* Mismatch. */
1342 continue;
1343 }
1344 }
1345
1346 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1347 && (gimple_bb (first_stmt_info->stmt)
1348 != gimple_bb (stmt_info->stmt)))
1349 {
1350 if (dump_enabled_p ())
1351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1352 "Build SLP failed: different BB for PHI "
1353 "or possibly trapping operation in %G", stmt);
1354 /* Mismatch. */
1355 continue;
1356 }
1357
1358 if (need_same_oprnds)
1359 {
1360 tree other_op1 = gimple_arg (stmt, 1);
1361 if (!operand_equal_p (first_op1, other_op1, 0))
1362 {
1363 if (dump_enabled_p ())
1364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1365 "Build SLP failed: different shift "
1366 "arguments in %G", stmt);
1367 /* Mismatch. */
1368 continue;
1369 }
1370 }
1371
1372 if (!types_compatible_p (vectype, *node_vectype))
1373 {
1374 if (dump_enabled_p ())
1375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1376 "Build SLP failed: different vector type "
1377 "in %G", stmt);
1378 /* Mismatch. */
1379 continue;
1380 }
1381 }
1382
1383 /* Grouped store or load. */
1384 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1385 {
1386 gcc_assert (ldst_p);
1387 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1388 {
1389 /* Store. */
1390 gcc_assert (rhs_code == CFN_MASK_STORE
1391 || REFERENCE_CLASS_P (lhs)
1392 || DECL_P (lhs));
1393 }
1394 else
1395 {
1396 /* Load. */
1397 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1398 if (prev_first_load)
1399 {
1400 /* Check that there are no loads from different interleaving
1401 chains in the same node. */
1402 if (prev_first_load != first_load)
1403 {
1404 if (dump_enabled_p ())
1405 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1406 vect_location,
1407 "Build SLP failed: different "
1408 "interleaving chains in one node %G",
1409 stmt);
1410 /* Mismatch. */
1411 continue;
1412 }
1413 }
1414 else
1415 prev_first_load = first_load;
1416 }
1417 }
1418 /* Non-grouped store or load. */
1419 else if (ldst_p)
1420 {
1421 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1422 && rhs_code != CFN_GATHER_LOAD
1423 && rhs_code != CFN_MASK_GATHER_LOAD
1424 && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1425 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1426 /* Not grouped loads are handled as externals for BB
1427 vectorization. For loop vectorization we can handle
1428 splats the same we handle single element interleaving. */
1429 && (is_a <bb_vec_info> (vinfo)
1430 || stmt_info != first_stmt_info))
1431 {
1432 /* Not grouped load. */
1433 if (dump_enabled_p ())
1434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1435 "Build SLP failed: not grouped load %G", stmt);
1436
1437 if (i != 0)
1438 continue;
1439 /* Fatal mismatch. */
1440 matches[0] = false;
1441 return false;
1442 }
1443 }
1444 /* Not memory operation. */
1445 else
1446 {
1447 if (!phi_p
1448 && rhs_code.is_tree_code ()
1449 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1450 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1451 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1452 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1453 && rhs_code != VIEW_CONVERT_EXPR
1454 && rhs_code != CALL_EXPR
1455 && rhs_code != BIT_FIELD_REF)
1456 {
1457 if (dump_enabled_p ())
1458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1459 "Build SLP failed: operation unsupported %G",
1460 stmt);
1461 if (is_a <bb_vec_info> (vinfo) && i != 0)
1462 continue;
1463 /* Fatal mismatch. */
1464 matches[0] = false;
1465 return false;
1466 }
1467
1468 if (rhs_code == COND_EXPR)
1469 {
1470 tree cond_expr = gimple_assign_rhs1 (stmt);
1471 enum tree_code cond_code = TREE_CODE (cond_expr);
1472 enum tree_code swap_code = ERROR_MARK;
1473 enum tree_code invert_code = ERROR_MARK;
1474
1475 if (i == 0)
1476 first_cond_code = TREE_CODE (cond_expr);
1477 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1478 {
1479 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1480 swap_code = swap_tree_comparison (cond_code);
1481 invert_code = invert_tree_comparison (cond_code, honor_nans);
1482 }
1483
1484 if (first_cond_code == cond_code)
1485 ;
1486 /* Isomorphic can be achieved by swapping. */
1487 else if (first_cond_code == swap_code)
1488 swap[i] = 1;
1489 /* Isomorphic can be achieved by inverting. */
1490 else if (first_cond_code == invert_code)
1491 swap[i] = 2;
1492 else
1493 {
1494 if (dump_enabled_p ())
1495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1496 "Build SLP failed: different"
1497 " operation %G", stmt);
1498 /* Mismatch. */
1499 continue;
1500 }
1501 }
1502
1503 if (rhs_code.is_tree_code ()
1504 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1505 && (swap_tree_comparison ((tree_code)first_stmt_code)
1506 == (tree_code)rhs_code))
1507 swap[i] = 1;
1508 }
1509
1510 matches[i] = true;
1511 }
1512
1513 for (i = 0; i < group_size; ++i)
1514 if (!matches[i])
1515 return false;
1516
1517 /* If we allowed a two-operation SLP node verify the target can cope
1518 with the permute we are going to use. */
1519 if (alt_stmt_code != ERROR_MARK
1520 && (!alt_stmt_code.is_tree_code ()
1521 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1522 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1523 {
1524 *two_operators = true;
1525 }
1526
1527 if (maybe_soft_fail)
1528 {
1529 unsigned HOST_WIDE_INT const_nunits;
1530 if (!TYPE_VECTOR_SUBPARTS
1531 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1532 || const_nunits > group_size)
1533 matches[0] = false;
1534 else
1535 {
1536 /* With constant vector elements simulate a mismatch at the
1537 point we need to split. */
1538 unsigned tail = group_size & (const_nunits - 1);
1539 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1540 }
1541 return false;
1542 }
1543
1544 return true;
1545 }
1546
1547 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1548 Note we never remove apart from at destruction time so we do not
1549 need a special value for deleted that differs from empty. */
1550 struct bst_traits
1551 {
1552 typedef vec <stmt_vec_info> value_type;
1553 typedef vec <stmt_vec_info> compare_type;
1554 static inline hashval_t hash (value_type);
1555 static inline bool equal (value_type existing, value_type candidate);
1556 static inline bool is_empty (value_type x) { return !x.exists (); }
1557 static inline bool is_deleted (value_type x) { return !x.exists (); }
1558 static const bool empty_zero_p = true;
1559 static inline void mark_empty (value_type &x) { x.release (); }
1560 static inline void mark_deleted (value_type &x) { x.release (); }
1561 static inline void remove (value_type &x) { x.release (); }
1562 };
1563 inline hashval_t
1564 bst_traits::hash (value_type x)
1565 {
1566 inchash::hash h;
1567 for (unsigned i = 0; i < x.length (); ++i)
1568 h.add_int (gimple_uid (x[i]->stmt));
1569 return h.end ();
1570 }
1571 inline bool
1572 bst_traits::equal (value_type existing, value_type candidate)
1573 {
1574 if (existing.length () != candidate.length ())
1575 return false;
1576 for (unsigned i = 0; i < existing.length (); ++i)
1577 if (existing[i] != candidate[i])
1578 return false;
1579 return true;
1580 }
1581
1582 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1583 but then vec::insert does memmove and that's not compatible with
1584 std::pair. */
1585 struct chain_op_t
1586 {
1587 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1588 : code (code_), dt (dt_), op (op_) {}
1589 tree_code code;
1590 vect_def_type dt;
1591 tree op;
1592 };
1593
1594 /* Comparator for sorting associatable chains. */
1595
1596 static int
1597 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1598 {
1599 auto *op1 = (const chain_op_t *) op1_;
1600 auto *op2 = (const chain_op_t *) op2_;
1601 if (op1->dt != op2->dt)
1602 return (int)op1->dt - (int)op2->dt;
1603 return (int)op1->code - (int)op2->code;
1604 }
1605
1606 /* Linearize the associatable expression chain at START with the
1607 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1608 filling CHAIN with the result and using WORKLIST as intermediate storage.
1609 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1610 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1611 stmts, starting with START. */
1612
1613 static void
1614 vect_slp_linearize_chain (vec_info *vinfo,
1615 vec<std::pair<tree_code, gimple *> > &worklist,
1616 vec<chain_op_t> &chain,
1617 enum tree_code code, gimple *start,
1618 gimple *&code_stmt, gimple *&alt_code_stmt,
1619 vec<gimple *> *chain_stmts)
1620 {
1621 /* For each lane linearize the addition/subtraction (or other
1622 uniform associatable operation) expression tree. */
1623 worklist.safe_push (std::make_pair (code, start));
1624 while (!worklist.is_empty ())
1625 {
1626 auto entry = worklist.pop ();
1627 gassign *stmt = as_a <gassign *> (entry.second);
1628 enum tree_code in_code = entry.first;
1629 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1630 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1631 if (!code_stmt
1632 && gimple_assign_rhs_code (stmt) == code)
1633 code_stmt = stmt;
1634 else if (!alt_code_stmt
1635 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1636 alt_code_stmt = stmt;
1637 if (chain_stmts)
1638 chain_stmts->safe_push (stmt);
1639 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1640 {
1641 tree op = gimple_op (stmt, opnum);
1642 vect_def_type dt;
1643 stmt_vec_info def_stmt_info;
1644 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1645 gcc_assert (res);
1646 if (dt == vect_internal_def
1647 && is_pattern_stmt_p (def_stmt_info))
1648 op = gimple_get_lhs (def_stmt_info->stmt);
1649 gimple *use_stmt;
1650 use_operand_p use_p;
1651 if (dt == vect_internal_def
1652 && single_imm_use (op, &use_p, &use_stmt)
1653 && is_gimple_assign (def_stmt_info->stmt)
1654 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1655 || (code == PLUS_EXPR
1656 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1657 == MINUS_EXPR))))
1658 {
1659 tree_code op_def_code = this_code;
1660 if (op_def_code == MINUS_EXPR && opnum == 1)
1661 op_def_code = PLUS_EXPR;
1662 if (in_code == MINUS_EXPR)
1663 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1664 worklist.safe_push (std::make_pair (op_def_code,
1665 def_stmt_info->stmt));
1666 }
1667 else
1668 {
1669 tree_code op_def_code = this_code;
1670 if (op_def_code == MINUS_EXPR && opnum == 1)
1671 op_def_code = PLUS_EXPR;
1672 if (in_code == MINUS_EXPR)
1673 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1674 chain.safe_push (chain_op_t (op_def_code, dt, op));
1675 }
1676 }
1677 }
1678 }
1679
1680 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1681 simple_hashmap_traits <bst_traits, slp_tree> >
1682 scalar_stmts_to_slp_tree_map_t;
1683
1684 static slp_tree
1685 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1686 vec<stmt_vec_info> stmts, unsigned int group_size,
1687 poly_uint64 *max_nunits,
1688 bool *matches, unsigned *limit, unsigned *tree_size,
1689 scalar_stmts_to_slp_tree_map_t *bst_map);
1690
1691 static slp_tree
1692 vect_build_slp_tree (vec_info *vinfo,
1693 vec<stmt_vec_info> stmts, unsigned int group_size,
1694 poly_uint64 *max_nunits,
1695 bool *matches, unsigned *limit, unsigned *tree_size,
1696 scalar_stmts_to_slp_tree_map_t *bst_map)
1697 {
1698 if (slp_tree *leader = bst_map->get (stmts))
1699 {
1700 if (dump_enabled_p ())
1701 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1702 !(*leader)->failed ? "" : "failed ",
1703 (void *) *leader);
1704 if (!(*leader)->failed)
1705 {
1706 SLP_TREE_REF_COUNT (*leader)++;
1707 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1708 stmts.release ();
1709 return *leader;
1710 }
1711 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1712 return NULL;
1713 }
1714
1715 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1716 so we can pick up backedge destinations during discovery. */
1717 slp_tree res = new _slp_tree;
1718 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1719 SLP_TREE_SCALAR_STMTS (res) = stmts;
1720 bst_map->put (stmts.copy (), res);
1721
1722 if (*limit == 0)
1723 {
1724 if (dump_enabled_p ())
1725 dump_printf_loc (MSG_NOTE, vect_location,
1726 "SLP discovery limit exceeded\n");
1727 /* Mark the node invalid so we can detect those when still in use
1728 as backedge destinations. */
1729 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1730 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1731 res->failed = XNEWVEC (bool, group_size);
1732 memset (res->failed, 0, sizeof (bool) * group_size);
1733 memset (matches, 0, sizeof (bool) * group_size);
1734 return NULL;
1735 }
1736 --*limit;
1737
1738 if (dump_enabled_p ())
1739 dump_printf_loc (MSG_NOTE, vect_location,
1740 "starting SLP discovery for node %p\n", (void *) res);
1741
1742 poly_uint64 this_max_nunits = 1;
1743 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1744 &this_max_nunits,
1745 matches, limit, tree_size, bst_map);
1746 if (!res_)
1747 {
1748 if (dump_enabled_p ())
1749 dump_printf_loc (MSG_NOTE, vect_location,
1750 "SLP discovery for node %p failed\n", (void *) res);
1751 /* Mark the node invalid so we can detect those when still in use
1752 as backedge destinations. */
1753 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1754 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1755 res->failed = XNEWVEC (bool, group_size);
1756 if (flag_checking)
1757 {
1758 unsigned i;
1759 for (i = 0; i < group_size; ++i)
1760 if (!matches[i])
1761 break;
1762 gcc_assert (i < group_size);
1763 }
1764 memcpy (res->failed, matches, sizeof (bool) * group_size);
1765 }
1766 else
1767 {
1768 if (dump_enabled_p ())
1769 dump_printf_loc (MSG_NOTE, vect_location,
1770 "SLP discovery for node %p succeeded\n",
1771 (void *) res);
1772 gcc_assert (res_ == res);
1773 res->max_nunits = this_max_nunits;
1774 vect_update_max_nunits (max_nunits, this_max_nunits);
1775 /* Keep a reference for the bst_map use. */
1776 SLP_TREE_REF_COUNT (res)++;
1777 }
1778 return res_;
1779 }
1780
1781 /* Helper for building an associated SLP node chain. */
1782
1783 static void
1784 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1785 slp_tree op0, slp_tree op1,
1786 stmt_vec_info oper1, stmt_vec_info oper2,
1787 vec<std::pair<unsigned, unsigned> > lperm)
1788 {
1789 unsigned group_size = SLP_TREE_LANES (op1);
1790
1791 slp_tree child1 = new _slp_tree;
1792 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1793 SLP_TREE_VECTYPE (child1) = vectype;
1794 SLP_TREE_LANES (child1) = group_size;
1795 SLP_TREE_CHILDREN (child1).create (2);
1796 SLP_TREE_CHILDREN (child1).quick_push (op0);
1797 SLP_TREE_CHILDREN (child1).quick_push (op1);
1798 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1799
1800 slp_tree child2 = new _slp_tree;
1801 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1802 SLP_TREE_VECTYPE (child2) = vectype;
1803 SLP_TREE_LANES (child2) = group_size;
1804 SLP_TREE_CHILDREN (child2).create (2);
1805 SLP_TREE_CHILDREN (child2).quick_push (op0);
1806 SLP_TREE_REF_COUNT (op0)++;
1807 SLP_TREE_CHILDREN (child2).quick_push (op1);
1808 SLP_TREE_REF_COUNT (op1)++;
1809 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1810
1811 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1812 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1813 SLP_TREE_VECTYPE (perm) = vectype;
1814 SLP_TREE_LANES (perm) = group_size;
1815 /* ??? We should set this NULL but that's not expected. */
1816 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1817 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1818 SLP_TREE_CHILDREN (perm).quick_push (child1);
1819 SLP_TREE_CHILDREN (perm).quick_push (child2);
1820 }
1821
1822 /* Recursively build an SLP tree starting from NODE.
1823 Fail (and return a value not equal to zero) if def-stmts are not
1824 isomorphic, require data permutation or are of unsupported types of
1825 operation. Otherwise, return 0.
1826 The value returned is the depth in the SLP tree where a mismatch
1827 was found. */
1828
1829 static slp_tree
1830 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1831 vec<stmt_vec_info> stmts, unsigned int group_size,
1832 poly_uint64 *max_nunits,
1833 bool *matches, unsigned *limit, unsigned *tree_size,
1834 scalar_stmts_to_slp_tree_map_t *bst_map)
1835 {
1836 unsigned nops, i, this_tree_size = 0;
1837 poly_uint64 this_max_nunits = *max_nunits;
1838
1839 matches[0] = false;
1840
1841 stmt_vec_info stmt_info = stmts[0];
1842 if (!is_a<gcall *> (stmt_info->stmt)
1843 && !is_a<gassign *> (stmt_info->stmt)
1844 && !is_a<gphi *> (stmt_info->stmt))
1845 return NULL;
1846
1847 nops = gimple_num_args (stmt_info->stmt);
1848 if (const int *map = vect_get_operand_map (stmt_info->stmt,
1849 STMT_VINFO_GATHER_SCATTER_P
1850 (stmt_info)))
1851 nops = map[0];
1852
1853 /* If the SLP node is a PHI (induction or reduction), terminate
1854 the recursion. */
1855 bool *skip_args = XALLOCAVEC (bool, nops);
1856 memset (skip_args, 0, sizeof (bool) * nops);
1857 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1858 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1859 {
1860 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1861 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1862 group_size);
1863 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1864 max_nunits))
1865 return NULL;
1866
1867 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1868 if (def_type == vect_induction_def)
1869 {
1870 /* Induction PHIs are not cycles but walk the initial
1871 value. Only for inner loops through, for outer loops
1872 we need to pick up the value from the actual PHIs
1873 to more easily support peeling and epilogue vectorization. */
1874 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1875 if (!nested_in_vect_loop_p (loop, stmt_info))
1876 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1877 else
1878 loop = loop->inner;
1879 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1880 }
1881 else if (def_type == vect_reduction_def
1882 || def_type == vect_double_reduction_def
1883 || def_type == vect_nested_cycle
1884 || def_type == vect_first_order_recurrence)
1885 {
1886 /* Else def types have to match. */
1887 stmt_vec_info other_info;
1888 bool all_same = true;
1889 FOR_EACH_VEC_ELT (stmts, i, other_info)
1890 {
1891 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1892 return NULL;
1893 if (other_info != stmt_info)
1894 all_same = false;
1895 }
1896 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1897 /* Reduction initial values are not explicitely represented. */
1898 if (def_type != vect_first_order_recurrence
1899 && !nested_in_vect_loop_p (loop, stmt_info))
1900 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1901 /* Reduction chain backedge defs are filled manually.
1902 ??? Need a better way to identify a SLP reduction chain PHI.
1903 Or a better overall way to SLP match those. */
1904 if (all_same && def_type == vect_reduction_def)
1905 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1906 }
1907 else if (def_type != vect_internal_def)
1908 return NULL;
1909 }
1910
1911
1912 bool two_operators = false;
1913 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1914 tree vectype = NULL_TREE;
1915 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1916 &this_max_nunits, matches, &two_operators,
1917 &vectype))
1918 return NULL;
1919
1920 /* If the SLP node is a load, terminate the recursion unless masked. */
1921 if (STMT_VINFO_DATA_REF (stmt_info)
1922 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1923 {
1924 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1925 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1926 else
1927 {
1928 *max_nunits = this_max_nunits;
1929 (*tree_size)++;
1930 node = vect_create_new_slp_node (node, stmts, 0);
1931 SLP_TREE_VECTYPE (node) = vectype;
1932 /* And compute the load permutation. Whether it is actually
1933 a permutation depends on the unrolling factor which is
1934 decided later. */
1935 vec<unsigned> load_permutation;
1936 int j;
1937 stmt_vec_info load_info;
1938 load_permutation.create (group_size);
1939 stmt_vec_info first_stmt_info
1940 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1941 bool any_permute = false;
1942 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1943 {
1944 int load_place;
1945 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1946 load_place = vect_get_place_in_interleaving_chain
1947 (load_info, first_stmt_info);
1948 else
1949 load_place = 0;
1950 gcc_assert (load_place != -1);
1951 any_permute |= load_place != j;
1952 load_permutation.quick_push (load_place);
1953 }
1954
1955 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1956 {
1957 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1958 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1959 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
1960 || gimple_call_internal_p (stmt,
1961 IFN_MASK_LEN_GATHER_LOAD));
1962 load_permutation.release ();
1963 /* We cannot handle permuted masked loads, see PR114375. */
1964 if (any_permute
1965 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1966 && DR_GROUP_SIZE (first_stmt_info) != group_size)
1967 || STMT_VINFO_STRIDED_P (stmt_info))
1968 {
1969 matches[0] = false;
1970 return NULL;
1971 }
1972 }
1973 else
1974 {
1975 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1976 return node;
1977 }
1978 }
1979 }
1980 else if (gimple_assign_single_p (stmt_info->stmt)
1981 && !gimple_vuse (stmt_info->stmt)
1982 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1983 {
1984 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1985 the same SSA name vector of a compatible type to vectype. */
1986 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1987 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1988 stmt_vec_info estmt_info;
1989 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1990 {
1991 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1992 tree bfref = gimple_assign_rhs1 (estmt);
1993 HOST_WIDE_INT lane;
1994 if (!known_eq (bit_field_size (bfref),
1995 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1996 || !constant_multiple_p (bit_field_offset (bfref),
1997 bit_field_size (bfref), &lane))
1998 {
1999 lperm.release ();
2000 matches[0] = false;
2001 return NULL;
2002 }
2003 lperm.safe_push (std::make_pair (0, (unsigned)lane));
2004 }
2005 slp_tree vnode = vect_create_new_slp_node (vNULL);
2006 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2007 /* ??? We record vectype here but we hide eventually necessary
2008 punning and instead rely on code generation to materialize
2009 VIEW_CONVERT_EXPRs as necessary. We instead should make
2010 this explicit somehow. */
2011 SLP_TREE_VECTYPE (vnode) = vectype;
2012 else
2013 {
2014 /* For different size but compatible elements we can still
2015 use VEC_PERM_EXPR without punning. */
2016 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2017 && types_compatible_p (TREE_TYPE (vectype),
2018 TREE_TYPE (TREE_TYPE (vec))));
2019 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2020 }
2021 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2022 unsigned HOST_WIDE_INT const_nunits;
2023 if (nunits.is_constant (&const_nunits))
2024 SLP_TREE_LANES (vnode) = const_nunits;
2025 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2026 /* We are always building a permutation node even if it is an identity
2027 permute to shield the rest of the vectorizer from the odd node
2028 representing an actual vector without any scalar ops.
2029 ??? We could hide it completely with making the permute node
2030 external? */
2031 node = vect_create_new_slp_node (node, stmts, 1);
2032 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2033 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2034 SLP_TREE_VECTYPE (node) = vectype;
2035 SLP_TREE_CHILDREN (node).quick_push (vnode);
2036 return node;
2037 }
2038 /* When discovery reaches an associatable operation see whether we can
2039 improve that to match up lanes in a way superior to the operand
2040 swapping code which at most looks at two defs.
2041 ??? For BB vectorization we cannot do the brute-force search
2042 for matching as we can succeed by means of builds from scalars
2043 and have no good way to "cost" one build against another. */
2044 else if (is_a <loop_vec_info> (vinfo)
2045 /* ??? We don't handle !vect_internal_def defs below. */
2046 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2047 && is_gimple_assign (stmt_info->stmt)
2048 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2049 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2050 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2051 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2052 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2053 {
2054 /* See if we have a chain of (mixed) adds or subtracts or other
2055 associatable ops. */
2056 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2057 if (code == MINUS_EXPR)
2058 code = PLUS_EXPR;
2059 stmt_vec_info other_op_stmt_info = NULL;
2060 stmt_vec_info op_stmt_info = NULL;
2061 unsigned chain_len = 0;
2062 auto_vec<chain_op_t> chain;
2063 auto_vec<std::pair<tree_code, gimple *> > worklist;
2064 auto_vec<vec<chain_op_t> > chains (group_size);
2065 auto_vec<slp_tree, 4> children;
2066 bool hard_fail = true;
2067 for (unsigned lane = 0; lane < group_size; ++lane)
2068 {
2069 /* For each lane linearize the addition/subtraction (or other
2070 uniform associatable operation) expression tree. */
2071 gimple *op_stmt = NULL, *other_op_stmt = NULL;
2072 vect_slp_linearize_chain (vinfo, worklist, chain, code,
2073 stmts[lane]->stmt, op_stmt, other_op_stmt,
2074 NULL);
2075 if (!op_stmt_info && op_stmt)
2076 op_stmt_info = vinfo->lookup_stmt (op_stmt);
2077 if (!other_op_stmt_info && other_op_stmt)
2078 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2079 if (chain.length () == 2)
2080 {
2081 /* In a chain of just two elements resort to the regular
2082 operand swapping scheme. If we run into a length
2083 mismatch still hard-FAIL. */
2084 if (chain_len == 0)
2085 hard_fail = false;
2086 else
2087 {
2088 matches[lane] = false;
2089 /* ??? We might want to process the other lanes, but
2090 make sure to not give false matching hints to the
2091 caller for lanes we did not process. */
2092 if (lane != group_size - 1)
2093 matches[0] = false;
2094 }
2095 break;
2096 }
2097 else if (chain_len == 0)
2098 chain_len = chain.length ();
2099 else if (chain.length () != chain_len)
2100 {
2101 /* ??? Here we could slip in magic to compensate with
2102 neutral operands. */
2103 matches[lane] = false;
2104 if (lane != group_size - 1)
2105 matches[0] = false;
2106 break;
2107 }
2108 chains.quick_push (chain.copy ());
2109 chain.truncate (0);
2110 }
2111 if (chains.length () == group_size)
2112 {
2113 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2114 if (!op_stmt_info)
2115 {
2116 hard_fail = false;
2117 goto out;
2118 }
2119 /* Now we have a set of chains with the same length. */
2120 /* 1. pre-sort according to def_type and operation. */
2121 for (unsigned lane = 0; lane < group_size; ++lane)
2122 chains[lane].stablesort (dt_sort_cmp, vinfo);
2123 if (dump_enabled_p ())
2124 {
2125 dump_printf_loc (MSG_NOTE, vect_location,
2126 "pre-sorted chains of %s\n",
2127 get_tree_code_name (code));
2128 for (unsigned lane = 0; lane < group_size; ++lane)
2129 {
2130 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2131 dump_printf (MSG_NOTE, "%s %T ",
2132 get_tree_code_name (chains[lane][opnum].code),
2133 chains[lane][opnum].op);
2134 dump_printf (MSG_NOTE, "\n");
2135 }
2136 }
2137 /* 2. try to build children nodes, associating as necessary. */
2138 for (unsigned n = 0; n < chain_len; ++n)
2139 {
2140 vect_def_type dt = chains[0][n].dt;
2141 unsigned lane;
2142 for (lane = 0; lane < group_size; ++lane)
2143 if (chains[lane][n].dt != dt)
2144 {
2145 if (dt == vect_constant_def
2146 && chains[lane][n].dt == vect_external_def)
2147 dt = vect_external_def;
2148 else if (dt == vect_external_def
2149 && chains[lane][n].dt == vect_constant_def)
2150 ;
2151 else
2152 break;
2153 }
2154 if (lane != group_size)
2155 {
2156 if (dump_enabled_p ())
2157 dump_printf_loc (MSG_NOTE, vect_location,
2158 "giving up on chain due to mismatched "
2159 "def types\n");
2160 matches[lane] = false;
2161 if (lane != group_size - 1)
2162 matches[0] = false;
2163 goto out;
2164 }
2165 if (dt == vect_constant_def
2166 || dt == vect_external_def)
2167 {
2168 /* Check whether we can build the invariant. If we can't
2169 we never will be able to. */
2170 tree type = TREE_TYPE (chains[0][n].op);
2171 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2172 && (TREE_CODE (type) == BOOLEAN_TYPE
2173 || !can_duplicate_and_interleave_p (vinfo, group_size,
2174 type)))
2175 {
2176 matches[0] = false;
2177 goto out;
2178 }
2179 vec<tree> ops;
2180 ops.create (group_size);
2181 for (lane = 0; lane < group_size; ++lane)
2182 ops.quick_push (chains[lane][n].op);
2183 slp_tree child = vect_create_new_slp_node (ops);
2184 SLP_TREE_DEF_TYPE (child) = dt;
2185 children.safe_push (child);
2186 }
2187 else if (dt != vect_internal_def)
2188 {
2189 /* Not sure, we might need sth special.
2190 gcc.dg/vect/pr96854.c,
2191 gfortran.dg/vect/fast-math-pr37021.f90
2192 and gfortran.dg/vect/pr61171.f trigger. */
2193 /* Soft-fail for now. */
2194 hard_fail = false;
2195 goto out;
2196 }
2197 else
2198 {
2199 vec<stmt_vec_info> op_stmts;
2200 op_stmts.create (group_size);
2201 slp_tree child = NULL;
2202 /* Brute-force our way. We have to consider a lane
2203 failing after fixing an earlier fail up in the
2204 SLP discovery recursion. So track the current
2205 permute per lane. */
2206 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2207 memset (perms, 0, sizeof (unsigned) * group_size);
2208 do
2209 {
2210 op_stmts.truncate (0);
2211 for (lane = 0; lane < group_size; ++lane)
2212 op_stmts.quick_push
2213 (vinfo->lookup_def (chains[lane][n].op));
2214 child = vect_build_slp_tree (vinfo, op_stmts,
2215 group_size, &this_max_nunits,
2216 matches, limit,
2217 &this_tree_size, bst_map);
2218 /* ??? We're likely getting too many fatal mismatches
2219 here so maybe we want to ignore them (but then we
2220 have no idea which lanes fatally mismatched). */
2221 if (child || !matches[0])
2222 break;
2223 /* Swap another lane we have not yet matched up into
2224 lanes that did not match. If we run out of
2225 permute possibilities for a lane terminate the
2226 search. */
2227 bool term = false;
2228 for (lane = 1; lane < group_size; ++lane)
2229 if (!matches[lane])
2230 {
2231 if (n + perms[lane] + 1 == chain_len)
2232 {
2233 term = true;
2234 break;
2235 }
2236 std::swap (chains[lane][n],
2237 chains[lane][n + perms[lane] + 1]);
2238 perms[lane]++;
2239 }
2240 if (term)
2241 break;
2242 }
2243 while (1);
2244 if (!child)
2245 {
2246 if (dump_enabled_p ())
2247 dump_printf_loc (MSG_NOTE, vect_location,
2248 "failed to match up op %d\n", n);
2249 op_stmts.release ();
2250 if (lane != group_size - 1)
2251 matches[0] = false;
2252 else
2253 matches[lane] = false;
2254 goto out;
2255 }
2256 if (dump_enabled_p ())
2257 {
2258 dump_printf_loc (MSG_NOTE, vect_location,
2259 "matched up op %d to\n", n);
2260 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2261 }
2262 children.safe_push (child);
2263 }
2264 }
2265 /* 3. build SLP nodes to combine the chain. */
2266 for (unsigned lane = 0; lane < group_size; ++lane)
2267 if (chains[lane][0].code != code)
2268 {
2269 /* See if there's any alternate all-PLUS entry. */
2270 unsigned n;
2271 for (n = 1; n < chain_len; ++n)
2272 {
2273 for (lane = 0; lane < group_size; ++lane)
2274 if (chains[lane][n].code != code)
2275 break;
2276 if (lane == group_size)
2277 break;
2278 }
2279 if (n != chain_len)
2280 {
2281 /* Swap that in at first position. */
2282 std::swap (children[0], children[n]);
2283 for (lane = 0; lane < group_size; ++lane)
2284 std::swap (chains[lane][0], chains[lane][n]);
2285 }
2286 else
2287 {
2288 /* ??? When this triggers and we end up with two
2289 vect_constant/external_def up-front things break (ICE)
2290 spectacularly finding an insertion place for the
2291 all-constant op. We should have a fully
2292 vect_internal_def operand though(?) so we can swap
2293 that into first place and then prepend the all-zero
2294 constant. */
2295 if (dump_enabled_p ())
2296 dump_printf_loc (MSG_NOTE, vect_location,
2297 "inserting constant zero to compensate "
2298 "for (partially) negated first "
2299 "operand\n");
2300 chain_len++;
2301 for (lane = 0; lane < group_size; ++lane)
2302 chains[lane].safe_insert
2303 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2304 vec<tree> zero_ops;
2305 zero_ops.create (group_size);
2306 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2307 for (lane = 1; lane < group_size; ++lane)
2308 zero_ops.quick_push (zero_ops[0]);
2309 slp_tree zero = vect_create_new_slp_node (zero_ops);
2310 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2311 children.safe_insert (0, zero);
2312 }
2313 break;
2314 }
2315 for (unsigned i = 1; i < children.length (); ++i)
2316 {
2317 slp_tree op0 = children[i - 1];
2318 slp_tree op1 = children[i];
2319 bool this_two_op = false;
2320 for (unsigned lane = 0; lane < group_size; ++lane)
2321 if (chains[lane][i].code != chains[0][i].code)
2322 {
2323 this_two_op = true;
2324 break;
2325 }
2326 slp_tree child;
2327 if (i == children.length () - 1)
2328 child = vect_create_new_slp_node (node, stmts, 2);
2329 else
2330 child = vect_create_new_slp_node (2, ERROR_MARK);
2331 if (this_two_op)
2332 {
2333 vec<std::pair<unsigned, unsigned> > lperm;
2334 lperm.create (group_size);
2335 for (unsigned lane = 0; lane < group_size; ++lane)
2336 lperm.quick_push (std::make_pair
2337 (chains[lane][i].code != chains[0][i].code, lane));
2338 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2339 (chains[0][i].code == code
2340 ? op_stmt_info
2341 : other_op_stmt_info),
2342 (chains[0][i].code == code
2343 ? other_op_stmt_info
2344 : op_stmt_info),
2345 lperm);
2346 }
2347 else
2348 {
2349 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2350 SLP_TREE_VECTYPE (child) = vectype;
2351 SLP_TREE_LANES (child) = group_size;
2352 SLP_TREE_CHILDREN (child).quick_push (op0);
2353 SLP_TREE_CHILDREN (child).quick_push (op1);
2354 SLP_TREE_REPRESENTATIVE (child)
2355 = (chains[0][i].code == code
2356 ? op_stmt_info : other_op_stmt_info);
2357 }
2358 children[i] = child;
2359 }
2360 *tree_size += this_tree_size + 1;
2361 *max_nunits = this_max_nunits;
2362 while (!chains.is_empty ())
2363 chains.pop ().release ();
2364 return node;
2365 }
2366 out:
2367 while (!children.is_empty ())
2368 vect_free_slp_tree (children.pop ());
2369 while (!chains.is_empty ())
2370 chains.pop ().release ();
2371 /* Hard-fail, otherwise we might run into quadratic processing of the
2372 chains starting one stmt into the chain again. */
2373 if (hard_fail)
2374 return NULL;
2375 /* Fall thru to normal processing. */
2376 }
2377
2378 /* Get at the operands, verifying they are compatible. */
2379 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2380 slp_oprnd_info oprnd_info;
2381 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2382 {
2383 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2384 stmts, i, &oprnds_info);
2385 if (res != 0)
2386 matches[(res == -1) ? 0 : i] = false;
2387 if (!matches[0])
2388 break;
2389 }
2390 for (i = 0; i < group_size; ++i)
2391 if (!matches[i])
2392 {
2393 vect_free_oprnd_info (oprnds_info);
2394 return NULL;
2395 }
2396 swap = NULL;
2397
2398 auto_vec<slp_tree, 4> children;
2399
2400 stmt_info = stmts[0];
2401
2402 /* Create SLP_TREE nodes for the definition node/s. */
2403 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2404 {
2405 slp_tree child = nullptr;
2406 unsigned int j;
2407
2408 /* We're skipping certain operands from processing, for example
2409 outer loop reduction initial defs. */
2410 if (skip_args[i])
2411 {
2412 children.safe_push (NULL);
2413 continue;
2414 }
2415
2416 if (oprnd_info->first_dt == vect_uninitialized_def)
2417 {
2418 /* COND_EXPR have one too many eventually if the condition
2419 is a SSA name. */
2420 gcc_assert (i == 3 && nops == 4);
2421 continue;
2422 }
2423
2424 if (is_a <bb_vec_info> (vinfo)
2425 && oprnd_info->first_dt == vect_internal_def
2426 && !oprnd_info->any_pattern)
2427 {
2428 /* For BB vectorization, if all defs are the same do not
2429 bother to continue the build along the single-lane
2430 graph but use a splat of the scalar value. */
2431 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2432 for (j = 1; j < group_size; ++j)
2433 if (oprnd_info->def_stmts[j] != first_def)
2434 break;
2435 if (j == group_size
2436 /* But avoid doing this for loads where we may be
2437 able to CSE things, unless the stmt is not
2438 vectorizable. */
2439 && (!STMT_VINFO_VECTORIZABLE (first_def)
2440 || !gimple_vuse (first_def->stmt)))
2441 {
2442 if (dump_enabled_p ())
2443 dump_printf_loc (MSG_NOTE, vect_location,
2444 "Using a splat of the uniform operand %G",
2445 first_def->stmt);
2446 oprnd_info->first_dt = vect_external_def;
2447 }
2448 }
2449
2450 if (oprnd_info->first_dt == vect_external_def
2451 || oprnd_info->first_dt == vect_constant_def)
2452 {
2453 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2454 {
2455 tree op0;
2456 tree uniform_val = op0 = oprnd_info->ops[0];
2457 for (j = 1; j < oprnd_info->ops.length (); ++j)
2458 if (!operand_equal_p (uniform_val, oprnd_info->ops[j]))
2459 {
2460 uniform_val = NULL_TREE;
2461 break;
2462 }
2463 if (!uniform_val
2464 && !can_duplicate_and_interleave_p (vinfo,
2465 oprnd_info->ops.length (),
2466 TREE_TYPE (op0)))
2467 {
2468 matches[j] = false;
2469 if (dump_enabled_p ())
2470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2471 "Build SLP failed: invalid type of def "
2472 "for variable-length SLP %T\n", op0);
2473 goto fail;
2474 }
2475 }
2476 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2477 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2478 oprnd_info->ops = vNULL;
2479 children.safe_push (invnode);
2480 continue;
2481 }
2482
2483 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2484 group_size, &this_max_nunits,
2485 matches, limit,
2486 &this_tree_size, bst_map)) != NULL)
2487 {
2488 oprnd_info->def_stmts = vNULL;
2489 children.safe_push (child);
2490 continue;
2491 }
2492
2493 /* If the SLP build for operand zero failed and operand zero
2494 and one can be commutated try that for the scalar stmts
2495 that failed the match. */
2496 if (i == 0
2497 /* A first scalar stmt mismatch signals a fatal mismatch. */
2498 && matches[0]
2499 /* ??? For COND_EXPRs we can swap the comparison operands
2500 as well as the arms under some constraints. */
2501 && nops == 2
2502 && oprnds_info[1]->first_dt == vect_internal_def
2503 && is_gimple_assign (stmt_info->stmt)
2504 /* Swapping operands for reductions breaks assumptions later on. */
2505 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2506 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2507 {
2508 /* See whether we can swap the matching or the non-matching
2509 stmt operands. */
2510 bool swap_not_matching = true;
2511 do
2512 {
2513 for (j = 0; j < group_size; ++j)
2514 {
2515 if (matches[j] != !swap_not_matching)
2516 continue;
2517 stmt_vec_info stmt_info = stmts[j];
2518 /* Verify if we can swap operands of this stmt. */
2519 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2520 if (!stmt
2521 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2522 {
2523 if (!swap_not_matching)
2524 goto fail;
2525 swap_not_matching = false;
2526 break;
2527 }
2528 }
2529 }
2530 while (j != group_size);
2531
2532 /* Swap mismatched definition stmts. */
2533 if (dump_enabled_p ())
2534 dump_printf_loc (MSG_NOTE, vect_location,
2535 "Re-trying with swapped operands of stmts ");
2536 for (j = 0; j < group_size; ++j)
2537 if (matches[j] == !swap_not_matching)
2538 {
2539 std::swap (oprnds_info[0]->def_stmts[j],
2540 oprnds_info[1]->def_stmts[j]);
2541 std::swap (oprnds_info[0]->ops[j],
2542 oprnds_info[1]->ops[j]);
2543 if (dump_enabled_p ())
2544 dump_printf (MSG_NOTE, "%d ", j);
2545 }
2546 if (dump_enabled_p ())
2547 dump_printf (MSG_NOTE, "\n");
2548 /* After swapping some operands we lost track whether an
2549 operand has any pattern defs so be conservative here. */
2550 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2551 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2552 /* And try again with scratch 'matches' ... */
2553 bool *tem = XALLOCAVEC (bool, group_size);
2554 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2555 group_size, &this_max_nunits,
2556 tem, limit,
2557 &this_tree_size, bst_map)) != NULL)
2558 {
2559 oprnd_info->def_stmts = vNULL;
2560 children.safe_push (child);
2561 continue;
2562 }
2563 }
2564 fail:
2565
2566 /* If the SLP build failed and we analyze a basic-block
2567 simply treat nodes we fail to build as externally defined
2568 (and thus build vectors from the scalar defs).
2569 The cost model will reject outright expensive cases.
2570 ??? This doesn't treat cases where permutation ultimatively
2571 fails (or we don't try permutation below). Ideally we'd
2572 even compute a permutation that will end up with the maximum
2573 SLP tree size... */
2574 if (is_a <bb_vec_info> (vinfo)
2575 /* ??? Rejecting patterns this way doesn't work. We'd have to
2576 do extra work to cancel the pattern so the uses see the
2577 scalar version. */
2578 && !is_pattern_stmt_p (stmt_info)
2579 && !oprnd_info->any_pattern)
2580 {
2581 /* But if there's a leading vector sized set of matching stmts
2582 fail here so we can split the group. This matches the condition
2583 vect_analyze_slp_instance uses. */
2584 /* ??? We might want to split here and combine the results to support
2585 multiple vector sizes better. */
2586 for (j = 0; j < group_size; ++j)
2587 if (!matches[j])
2588 break;
2589 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2590 {
2591 if (dump_enabled_p ())
2592 dump_printf_loc (MSG_NOTE, vect_location,
2593 "Building vector operands from scalars\n");
2594 this_tree_size++;
2595 child = vect_create_new_slp_node (oprnd_info->ops);
2596 children.safe_push (child);
2597 oprnd_info->ops = vNULL;
2598 continue;
2599 }
2600 }
2601
2602 gcc_assert (child == NULL);
2603 FOR_EACH_VEC_ELT (children, j, child)
2604 if (child)
2605 vect_free_slp_tree (child);
2606 vect_free_oprnd_info (oprnds_info);
2607 return NULL;
2608 }
2609
2610 vect_free_oprnd_info (oprnds_info);
2611
2612 /* If we have all children of a child built up from uniform scalars
2613 or does more than one possibly expensive vector construction then
2614 just throw that away, causing it built up from scalars.
2615 The exception is the SLP node for the vector store. */
2616 if (is_a <bb_vec_info> (vinfo)
2617 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2618 /* ??? Rejecting patterns this way doesn't work. We'd have to
2619 do extra work to cancel the pattern so the uses see the
2620 scalar version. */
2621 && !is_pattern_stmt_p (stmt_info))
2622 {
2623 slp_tree child;
2624 unsigned j;
2625 bool all_uniform_p = true;
2626 unsigned n_vector_builds = 0;
2627 FOR_EACH_VEC_ELT (children, j, child)
2628 {
2629 if (!child)
2630 ;
2631 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2632 all_uniform_p = false;
2633 else if (!vect_slp_tree_uniform_p (child))
2634 {
2635 all_uniform_p = false;
2636 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2637 n_vector_builds++;
2638 }
2639 }
2640 if (all_uniform_p
2641 || n_vector_builds > 1
2642 || (n_vector_builds == children.length ()
2643 && is_a <gphi *> (stmt_info->stmt)))
2644 {
2645 /* Roll back. */
2646 matches[0] = false;
2647 FOR_EACH_VEC_ELT (children, j, child)
2648 if (child)
2649 vect_free_slp_tree (child);
2650
2651 if (dump_enabled_p ())
2652 dump_printf_loc (MSG_NOTE, vect_location,
2653 "Building parent vector operands from "
2654 "scalars instead\n");
2655 return NULL;
2656 }
2657 }
2658
2659 *tree_size += this_tree_size + 1;
2660 *max_nunits = this_max_nunits;
2661
2662 if (two_operators)
2663 {
2664 /* ??? We'd likely want to either cache in bst_map sth like
2665 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2666 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2667 explicit stmts to put in so the keying on 'stmts' doesn't
2668 work (but we have the same issue with nodes that use 'ops'). */
2669 slp_tree one = new _slp_tree;
2670 slp_tree two = new _slp_tree;
2671 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2672 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2673 SLP_TREE_VECTYPE (one) = vectype;
2674 SLP_TREE_VECTYPE (two) = vectype;
2675 SLP_TREE_CHILDREN (one).safe_splice (children);
2676 SLP_TREE_CHILDREN (two).safe_splice (children);
2677 slp_tree child;
2678 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2679 SLP_TREE_REF_COUNT (child)++;
2680
2681 /* Here we record the original defs since this
2682 node represents the final lane configuration. */
2683 node = vect_create_new_slp_node (node, stmts, 2);
2684 SLP_TREE_VECTYPE (node) = vectype;
2685 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2686 SLP_TREE_CHILDREN (node).quick_push (one);
2687 SLP_TREE_CHILDREN (node).quick_push (two);
2688 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2689 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2690 enum tree_code ocode = ERROR_MARK;
2691 stmt_vec_info ostmt_info;
2692 unsigned j = 0;
2693 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2694 {
2695 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2696 if (gimple_assign_rhs_code (ostmt) != code0)
2697 {
2698 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2699 ocode = gimple_assign_rhs_code (ostmt);
2700 j = i;
2701 }
2702 else
2703 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2704 }
2705 SLP_TREE_CODE (one) = code0;
2706 SLP_TREE_CODE (two) = ocode;
2707 SLP_TREE_LANES (one) = stmts.length ();
2708 SLP_TREE_LANES (two) = stmts.length ();
2709 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2710 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2711 return node;
2712 }
2713
2714 node = vect_create_new_slp_node (node, stmts, nops);
2715 SLP_TREE_VECTYPE (node) = vectype;
2716 SLP_TREE_CHILDREN (node).splice (children);
2717 return node;
2718 }
2719
2720 /* Dump a single SLP tree NODE. */
2721
2722 static void
2723 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2724 slp_tree node)
2725 {
2726 unsigned i, j;
2727 slp_tree child;
2728 stmt_vec_info stmt_info;
2729 tree op;
2730
2731 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2732 dump_user_location_t user_loc = loc.get_user_location ();
2733 dump_printf_loc (metadata, user_loc,
2734 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2735 ", refcnt=%u)",
2736 SLP_TREE_DEF_TYPE (node) == vect_external_def
2737 ? " (external)"
2738 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2739 ? " (constant)"
2740 : ""), (void *) node,
2741 estimated_poly_value (node->max_nunits),
2742 SLP_TREE_REF_COUNT (node));
2743 if (SLP_TREE_VECTYPE (node))
2744 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2745 dump_printf (metadata, "\n");
2746 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2747 {
2748 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2749 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2750 else
2751 dump_printf_loc (metadata, user_loc, "op template: %G",
2752 SLP_TREE_REPRESENTATIVE (node)->stmt);
2753 }
2754 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2755 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2756 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2757 else
2758 {
2759 dump_printf_loc (metadata, user_loc, "\t{ ");
2760 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2761 dump_printf (metadata, "%T%s ", op,
2762 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2763 dump_printf (metadata, "}\n");
2764 }
2765 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2766 {
2767 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2768 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2769 dump_printf (dump_kind, " %u", j);
2770 dump_printf (dump_kind, " }\n");
2771 }
2772 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2773 {
2774 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2775 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2776 dump_printf (dump_kind, " %u[%u]",
2777 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2778 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2779 dump_printf (dump_kind, " }\n");
2780 }
2781 if (SLP_TREE_CHILDREN (node).is_empty ())
2782 return;
2783 dump_printf_loc (metadata, user_loc, "\tchildren");
2784 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2785 dump_printf (dump_kind, " %p", (void *)child);
2786 dump_printf (dump_kind, "\n");
2787 }
2788
2789 DEBUG_FUNCTION void
2790 debug (slp_tree node)
2791 {
2792 debug_dump_context ctx;
2793 vect_print_slp_tree (MSG_NOTE,
2794 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2795 node);
2796 }
2797
2798 /* Recursive helper for the dot producer below. */
2799
2800 static void
2801 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2802 {
2803 if (visited.add (node))
2804 return;
2805
2806 fprintf (f, "\"%p\" [label=\"", (void *)node);
2807 vect_print_slp_tree (MSG_NOTE,
2808 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2809 node);
2810 fprintf (f, "\"];\n");
2811
2812
2813 for (slp_tree child : SLP_TREE_CHILDREN (node))
2814 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2815
2816 for (slp_tree child : SLP_TREE_CHILDREN (node))
2817 if (child)
2818 dot_slp_tree (f, child, visited);
2819 }
2820
2821 DEBUG_FUNCTION void
2822 dot_slp_tree (const char *fname, slp_tree node)
2823 {
2824 FILE *f = fopen (fname, "w");
2825 fprintf (f, "digraph {\n");
2826 fflush (f);
2827 {
2828 debug_dump_context ctx (f);
2829 hash_set<slp_tree> visited;
2830 dot_slp_tree (f, node, visited);
2831 }
2832 fflush (f);
2833 fprintf (f, "}\n");
2834 fclose (f);
2835 }
2836
2837 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2838
2839 static void
2840 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2841 slp_tree node, hash_set<slp_tree> &visited)
2842 {
2843 unsigned i;
2844 slp_tree child;
2845
2846 if (visited.add (node))
2847 return;
2848
2849 vect_print_slp_tree (dump_kind, loc, node);
2850
2851 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2852 if (child)
2853 vect_print_slp_graph (dump_kind, loc, child, visited);
2854 }
2855
2856 static void
2857 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2858 slp_tree entry)
2859 {
2860 hash_set<slp_tree> visited;
2861 vect_print_slp_graph (dump_kind, loc, entry, visited);
2862 }
2863
2864 /* Mark the tree rooted at NODE with PURE_SLP. */
2865
2866 static void
2867 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2868 {
2869 int i;
2870 stmt_vec_info stmt_info;
2871 slp_tree child;
2872
2873 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2874 return;
2875
2876 if (visited.add (node))
2877 return;
2878
2879 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2880 STMT_SLP_TYPE (stmt_info) = pure_slp;
2881
2882 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2883 if (child)
2884 vect_mark_slp_stmts (child, visited);
2885 }
2886
2887 static void
2888 vect_mark_slp_stmts (slp_tree node)
2889 {
2890 hash_set<slp_tree> visited;
2891 vect_mark_slp_stmts (node, visited);
2892 }
2893
2894 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2895
2896 static void
2897 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2898 {
2899 int i;
2900 stmt_vec_info stmt_info;
2901 slp_tree child;
2902
2903 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2904 return;
2905
2906 if (visited.add (node))
2907 return;
2908
2909 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2910 {
2911 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2912 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2913 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2914 }
2915
2916 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2917 if (child)
2918 vect_mark_slp_stmts_relevant (child, visited);
2919 }
2920
2921 static void
2922 vect_mark_slp_stmts_relevant (slp_tree node)
2923 {
2924 hash_set<slp_tree> visited;
2925 vect_mark_slp_stmts_relevant (node, visited);
2926 }
2927
2928
2929 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2930
2931 static void
2932 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2933 hash_set<slp_tree> &visited)
2934 {
2935 if (!node || visited.add (node))
2936 return;
2937
2938 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2939 return;
2940
2941 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
2942 {
2943 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2944 if (STMT_VINFO_DATA_REF (stmt_info)
2945 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2946 loads.safe_push (node);
2947 }
2948
2949 unsigned i;
2950 slp_tree child;
2951 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2952 vect_gather_slp_loads (loads, child, visited);
2953 }
2954
2955
2956 /* Find the last store in SLP INSTANCE. */
2957
2958 stmt_vec_info
2959 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2960 {
2961 stmt_vec_info last = NULL;
2962 stmt_vec_info stmt_vinfo;
2963
2964 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2965 {
2966 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2967 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2968 }
2969
2970 return last;
2971 }
2972
2973 /* Find the first stmt in NODE. */
2974
2975 stmt_vec_info
2976 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2977 {
2978 stmt_vec_info first = NULL;
2979 stmt_vec_info stmt_vinfo;
2980
2981 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2982 {
2983 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2984 if (!first
2985 || get_later_stmt (stmt_vinfo, first) == first)
2986 first = stmt_vinfo;
2987 }
2988
2989 return first;
2990 }
2991
2992 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2993 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2994 (also containing the first GROUP1_SIZE stmts, since stores are
2995 consecutive), the second containing the remainder.
2996 Return the first stmt in the second group. */
2997
2998 static stmt_vec_info
2999 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
3000 {
3001 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
3002 gcc_assert (group1_size > 0);
3003 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
3004 gcc_assert (group2_size > 0);
3005 DR_GROUP_SIZE (first_vinfo) = group1_size;
3006
3007 stmt_vec_info stmt_info = first_vinfo;
3008 for (unsigned i = group1_size; i > 1; i--)
3009 {
3010 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
3011 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3012 }
3013 /* STMT is now the last element of the first group. */
3014 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
3015 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
3016
3017 DR_GROUP_SIZE (group2) = group2_size;
3018 for (stmt_info = group2; stmt_info;
3019 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3020 {
3021 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3022 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3023 }
3024
3025 /* For the second group, the DR_GROUP_GAP is that before the original group,
3026 plus skipping over the first vector. */
3027 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3028
3029 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
3030 DR_GROUP_GAP (first_vinfo) += group2_size;
3031
3032 if (dump_enabled_p ())
3033 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3034 group1_size, group2_size);
3035
3036 return group2;
3037 }
3038
3039 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3040 statements and a vector of NUNITS elements. */
3041
3042 static poly_uint64
3043 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3044 {
3045 return exact_div (common_multiple (nunits, group_size), group_size);
3046 }
3047
3048 /* Helper that checks to see if a node is a load node. */
3049
3050 static inline bool
3051 vect_is_slp_load_node (slp_tree root)
3052 {
3053 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
3054 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3055 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
3056 }
3057
3058
3059 /* Helper function of optimize_load_redistribution that performs the operation
3060 recursively. */
3061
3062 static slp_tree
3063 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3064 vec_info *vinfo, unsigned int group_size,
3065 hash_map<slp_tree, slp_tree> *load_map,
3066 slp_tree root)
3067 {
3068 if (slp_tree *leader = load_map->get (root))
3069 return *leader;
3070
3071 slp_tree node;
3072 unsigned i;
3073
3074 /* For now, we don't know anything about externals so do not do anything. */
3075 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3076 return NULL;
3077 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3078 {
3079 /* First convert this node into a load node and add it to the leaves
3080 list and flatten the permute from a lane to a load one. If it's
3081 unneeded it will be elided later. */
3082 vec<stmt_vec_info> stmts;
3083 stmts.create (SLP_TREE_LANES (root));
3084 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3085 for (unsigned j = 0; j < lane_perm.length (); j++)
3086 {
3087 std::pair<unsigned, unsigned> perm = lane_perm[j];
3088 node = SLP_TREE_CHILDREN (root)[perm.first];
3089
3090 if (!vect_is_slp_load_node (node)
3091 || SLP_TREE_CHILDREN (node).exists ())
3092 {
3093 stmts.release ();
3094 goto next;
3095 }
3096
3097 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3098 }
3099
3100 if (dump_enabled_p ())
3101 dump_printf_loc (MSG_NOTE, vect_location,
3102 "converting stmts on permute node %p\n",
3103 (void *) root);
3104
3105 bool *matches = XALLOCAVEC (bool, group_size);
3106 poly_uint64 max_nunits = 1;
3107 unsigned tree_size = 0, limit = 1;
3108 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3109 matches, &limit, &tree_size, bst_map);
3110 if (!node)
3111 stmts.release ();
3112
3113 load_map->put (root, node);
3114 return node;
3115 }
3116
3117 next:
3118 load_map->put (root, NULL);
3119
3120 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3121 {
3122 slp_tree value
3123 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3124 node);
3125 if (value)
3126 {
3127 SLP_TREE_REF_COUNT (value)++;
3128 SLP_TREE_CHILDREN (root)[i] = value;
3129 /* ??? We know the original leafs of the replaced nodes will
3130 be referenced by bst_map, only the permutes created by
3131 pattern matching are not. */
3132 if (SLP_TREE_REF_COUNT (node) == 1)
3133 load_map->remove (node);
3134 vect_free_slp_tree (node);
3135 }
3136 }
3137
3138 return NULL;
3139 }
3140
3141 /* Temporary workaround for loads not being CSEd during SLP build. This
3142 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
3143 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3144 same DR such that the final operation is equal to a permuted load. Such
3145 NODES are then directly converted into LOADS themselves. The nodes are
3146 CSEd using BST_MAP. */
3147
3148 static void
3149 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3150 vec_info *vinfo, unsigned int group_size,
3151 hash_map<slp_tree, slp_tree> *load_map,
3152 slp_tree root)
3153 {
3154 slp_tree node;
3155 unsigned i;
3156
3157 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3158 {
3159 slp_tree value
3160 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3161 node);
3162 if (value)
3163 {
3164 SLP_TREE_REF_COUNT (value)++;
3165 SLP_TREE_CHILDREN (root)[i] = value;
3166 /* ??? We know the original leafs of the replaced nodes will
3167 be referenced by bst_map, only the permutes created by
3168 pattern matching are not. */
3169 if (SLP_TREE_REF_COUNT (node) == 1)
3170 load_map->remove (node);
3171 vect_free_slp_tree (node);
3172 }
3173 }
3174 }
3175
3176 /* Helper function of vect_match_slp_patterns.
3177
3178 Attempts to match patterns against the slp tree rooted in REF_NODE using
3179 VINFO. Patterns are matched in post-order traversal.
3180
3181 If matching is successful the value in REF_NODE is updated and returned, if
3182 not then it is returned unchanged. */
3183
3184 static bool
3185 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3186 slp_tree_to_load_perm_map_t *perm_cache,
3187 slp_compat_nodes_map_t *compat_cache,
3188 hash_set<slp_tree> *visited)
3189 {
3190 unsigned i;
3191 slp_tree node = *ref_node;
3192 bool found_p = false;
3193 if (!node || visited->add (node))
3194 return false;
3195
3196 slp_tree child;
3197 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3198 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3199 vinfo, perm_cache, compat_cache,
3200 visited);
3201
3202 for (unsigned x = 0; x < num__slp_patterns; x++)
3203 {
3204 vect_pattern *pattern
3205 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3206 if (pattern)
3207 {
3208 pattern->build (vinfo);
3209 delete pattern;
3210 found_p = true;
3211 }
3212 }
3213
3214 return found_p;
3215 }
3216
3217 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3218 vec_info VINFO.
3219
3220 The modified tree is returned. Patterns are tried in order and multiple
3221 patterns may match. */
3222
3223 static bool
3224 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3225 hash_set<slp_tree> *visited,
3226 slp_tree_to_load_perm_map_t *perm_cache,
3227 slp_compat_nodes_map_t *compat_cache)
3228 {
3229 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3230 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3231
3232 if (dump_enabled_p ())
3233 dump_printf_loc (MSG_NOTE, vect_location,
3234 "Analyzing SLP tree %p for patterns\n",
3235 (void *) SLP_INSTANCE_TREE (instance));
3236
3237 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3238 visited);
3239 }
3240
3241 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3242 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3243 Return true if we could use IFN_STORE_LANES instead and if that appears
3244 to be the better approach. */
3245
3246 static bool
3247 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3248 unsigned int group_size,
3249 unsigned int new_group_size)
3250 {
3251 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3252 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3253 if (!vectype)
3254 return false;
3255 /* Allow the split if one of the two new groups would operate on full
3256 vectors *within* rather than across one scalar loop iteration.
3257 This is purely a heuristic, but it should work well for group
3258 sizes of 3 and 4, where the possible splits are:
3259
3260 3->2+1: OK if the vector has exactly two elements
3261 4->2+2: Likewise
3262 4->3+1: Less clear-cut. */
3263 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3264 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3265 return false;
3266 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
3267 }
3268
3269 /* Analyze an SLP instance starting from a group of grouped stores. Call
3270 vect_build_slp_tree to build a tree of packed stmts if possible.
3271 Return FALSE if it's impossible to SLP any stmt in the loop. */
3272
3273 static bool
3274 vect_analyze_slp_instance (vec_info *vinfo,
3275 scalar_stmts_to_slp_tree_map_t *bst_map,
3276 stmt_vec_info stmt_info, slp_instance_kind kind,
3277 unsigned max_tree_size, unsigned *limit);
3278
3279 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3280 of KIND. Return true if successful. */
3281
3282 static bool
3283 vect_build_slp_instance (vec_info *vinfo,
3284 slp_instance_kind kind,
3285 vec<stmt_vec_info> &scalar_stmts,
3286 vec<stmt_vec_info> &root_stmt_infos,
3287 vec<tree> &remain,
3288 unsigned max_tree_size, unsigned *limit,
3289 scalar_stmts_to_slp_tree_map_t *bst_map,
3290 /* ??? We need stmt_info for group splitting. */
3291 stmt_vec_info stmt_info_)
3292 {
3293 if (kind == slp_inst_kind_ctor)
3294 {
3295 if (dump_enabled_p ())
3296 dump_printf_loc (MSG_NOTE, vect_location,
3297 "Analyzing vectorizable constructor: %G\n",
3298 root_stmt_infos[0]->stmt);
3299 }
3300
3301 if (dump_enabled_p ())
3302 {
3303 dump_printf_loc (MSG_NOTE, vect_location,
3304 "Starting SLP discovery for\n");
3305 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3306 dump_printf_loc (MSG_NOTE, vect_location,
3307 " %G", scalar_stmts[i]->stmt);
3308 }
3309
3310 /* Build the tree for the SLP instance. */
3311 unsigned int group_size = scalar_stmts.length ();
3312 bool *matches = XALLOCAVEC (bool, group_size);
3313 poly_uint64 max_nunits = 1;
3314 unsigned tree_size = 0;
3315 unsigned i;
3316 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3317 &max_nunits, matches, limit,
3318 &tree_size, bst_map);
3319 if (node != NULL)
3320 {
3321 /* Calculate the unrolling factor based on the smallest type. */
3322 poly_uint64 unrolling_factor
3323 = calculate_unrolling_factor (max_nunits, group_size);
3324
3325 if (maybe_ne (unrolling_factor, 1U)
3326 && is_a <bb_vec_info> (vinfo))
3327 {
3328 unsigned HOST_WIDE_INT const_max_nunits;
3329 if (!max_nunits.is_constant (&const_max_nunits)
3330 || const_max_nunits > group_size)
3331 {
3332 if (dump_enabled_p ())
3333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3334 "Build SLP failed: store group "
3335 "size not a multiple of the vector size "
3336 "in basic block SLP\n");
3337 vect_free_slp_tree (node);
3338 return false;
3339 }
3340 /* Fatal mismatch. */
3341 if (dump_enabled_p ())
3342 dump_printf_loc (MSG_NOTE, vect_location,
3343 "SLP discovery succeeded but node needs "
3344 "splitting\n");
3345 memset (matches, true, group_size);
3346 matches[group_size / const_max_nunits * const_max_nunits] = false;
3347 vect_free_slp_tree (node);
3348 }
3349 else
3350 {
3351 /* Create a new SLP instance. */
3352 slp_instance new_instance = XNEW (class _slp_instance);
3353 SLP_INSTANCE_TREE (new_instance) = node;
3354 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3355 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3356 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3357 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3358 SLP_INSTANCE_KIND (new_instance) = kind;
3359 new_instance->reduc_phis = NULL;
3360 new_instance->cost_vec = vNULL;
3361 new_instance->subgraph_entries = vNULL;
3362
3363 if (dump_enabled_p ())
3364 dump_printf_loc (MSG_NOTE, vect_location,
3365 "SLP size %u vs. limit %u.\n",
3366 tree_size, max_tree_size);
3367
3368 /* Fixup SLP reduction chains. */
3369 if (kind == slp_inst_kind_reduc_chain)
3370 {
3371 /* If this is a reduction chain with a conversion in front
3372 amend the SLP tree with a node for that. */
3373 gimple *scalar_def
3374 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3375 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3376 {
3377 /* Get at the conversion stmt - we know it's the single use
3378 of the last stmt of the reduction chain. */
3379 use_operand_p use_p;
3380 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3381 &use_p, &scalar_def);
3382 gcc_assert (r);
3383 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3384 next_info = vect_stmt_to_vectorize (next_info);
3385 scalar_stmts = vNULL;
3386 scalar_stmts.create (group_size);
3387 for (unsigned i = 0; i < group_size; ++i)
3388 scalar_stmts.quick_push (next_info);
3389 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3390 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3391 SLP_TREE_CHILDREN (conv).quick_push (node);
3392 SLP_INSTANCE_TREE (new_instance) = conv;
3393 /* We also have to fake this conversion stmt as SLP reduction
3394 group so we don't have to mess with too much code
3395 elsewhere. */
3396 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3397 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3398 }
3399 /* Fill the backedge child of the PHI SLP node. The
3400 general matching code cannot find it because the
3401 scalar code does not reflect how we vectorize the
3402 reduction. */
3403 use_operand_p use_p;
3404 imm_use_iterator imm_iter;
3405 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3406 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3407 gimple_get_lhs (scalar_def))
3408 /* There are exactly two non-debug uses, the reduction
3409 PHI and the loop-closed PHI node. */
3410 if (!is_gimple_debug (USE_STMT (use_p))
3411 && gimple_bb (USE_STMT (use_p)) == loop->header)
3412 {
3413 auto_vec<stmt_vec_info, 64> phis (group_size);
3414 stmt_vec_info phi_info
3415 = vinfo->lookup_stmt (USE_STMT (use_p));
3416 for (unsigned i = 0; i < group_size; ++i)
3417 phis.quick_push (phi_info);
3418 slp_tree *phi_node = bst_map->get (phis);
3419 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3420 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3421 = SLP_INSTANCE_TREE (new_instance);
3422 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3423 }
3424 }
3425
3426 vinfo->slp_instances.safe_push (new_instance);
3427
3428 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3429 the number of scalar stmts in the root in a few places.
3430 Verify that assumption holds. */
3431 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3432 .length () == group_size);
3433
3434 if (dump_enabled_p ())
3435 {
3436 dump_printf_loc (MSG_NOTE, vect_location,
3437 "Final SLP tree for instance %p:\n",
3438 (void *) new_instance);
3439 vect_print_slp_graph (MSG_NOTE, vect_location,
3440 SLP_INSTANCE_TREE (new_instance));
3441 }
3442
3443 return true;
3444 }
3445 }
3446 else
3447 {
3448 /* Failed to SLP. */
3449 /* Free the allocated memory. */
3450 scalar_stmts.release ();
3451 }
3452
3453 stmt_vec_info stmt_info = stmt_info_;
3454 /* Try to break the group up into pieces. */
3455 if (kind == slp_inst_kind_store)
3456 {
3457 /* ??? We could delay all the actual splitting of store-groups
3458 until after SLP discovery of the original group completed.
3459 Then we can recurse to vect_build_slp_instance directly. */
3460 for (i = 0; i < group_size; i++)
3461 if (!matches[i])
3462 break;
3463
3464 /* For basic block SLP, try to break the group up into multiples of
3465 a vector size. */
3466 if (is_a <bb_vec_info> (vinfo)
3467 && (i > 1 && i < group_size))
3468 {
3469 tree scalar_type
3470 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3471 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3472 1 << floor_log2 (i));
3473 unsigned HOST_WIDE_INT const_nunits;
3474 if (vectype
3475 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3476 {
3477 /* Split into two groups at the first vector boundary. */
3478 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3479 unsigned group1_size = i & ~(const_nunits - 1);
3480
3481 if (dump_enabled_p ())
3482 dump_printf_loc (MSG_NOTE, vect_location,
3483 "Splitting SLP group at stmt %u\n", i);
3484 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3485 group1_size);
3486 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3487 kind, max_tree_size,
3488 limit);
3489 /* Split the rest at the failure point and possibly
3490 re-analyze the remaining matching part if it has
3491 at least two lanes. */
3492 if (group1_size < i
3493 && (i + 1 < group_size
3494 || i - group1_size > 1))
3495 {
3496 stmt_vec_info rest2 = rest;
3497 rest = vect_split_slp_store_group (rest, i - group1_size);
3498 if (i - group1_size > 1)
3499 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3500 kind, max_tree_size,
3501 limit);
3502 }
3503 /* Re-analyze the non-matching tail if it has at least
3504 two lanes. */
3505 if (i + 1 < group_size)
3506 res |= vect_analyze_slp_instance (vinfo, bst_map,
3507 rest, kind, max_tree_size,
3508 limit);
3509 return res;
3510 }
3511 }
3512
3513 /* For loop vectorization split into arbitrary pieces of size > 1. */
3514 if (is_a <loop_vec_info> (vinfo)
3515 && (i > 1 && i < group_size)
3516 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3517 {
3518 unsigned group1_size = i;
3519
3520 if (dump_enabled_p ())
3521 dump_printf_loc (MSG_NOTE, vect_location,
3522 "Splitting SLP group at stmt %u\n", i);
3523
3524 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3525 group1_size);
3526 /* Loop vectorization cannot handle gaps in stores, make sure
3527 the split group appears as strided. */
3528 STMT_VINFO_STRIDED_P (rest) = 1;
3529 DR_GROUP_GAP (rest) = 0;
3530 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3531 DR_GROUP_GAP (stmt_info) = 0;
3532
3533 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3534 kind, max_tree_size, limit);
3535 if (i + 1 < group_size)
3536 res |= vect_analyze_slp_instance (vinfo, bst_map,
3537 rest, kind, max_tree_size, limit);
3538
3539 return res;
3540 }
3541
3542 /* Even though the first vector did not all match, we might be able to SLP
3543 (some) of the remainder. FORNOW ignore this possibility. */
3544 }
3545
3546 /* Failed to SLP. */
3547 if (dump_enabled_p ())
3548 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3549 return false;
3550 }
3551
3552
3553 /* Analyze an SLP instance starting from a group of grouped stores. Call
3554 vect_build_slp_tree to build a tree of packed stmts if possible.
3555 Return FALSE if it's impossible to SLP any stmt in the loop. */
3556
3557 static bool
3558 vect_analyze_slp_instance (vec_info *vinfo,
3559 scalar_stmts_to_slp_tree_map_t *bst_map,
3560 stmt_vec_info stmt_info,
3561 slp_instance_kind kind,
3562 unsigned max_tree_size, unsigned *limit)
3563 {
3564 unsigned int i;
3565 vec<stmt_vec_info> scalar_stmts;
3566
3567 if (is_a <bb_vec_info> (vinfo))
3568 vect_location = stmt_info->stmt;
3569
3570 stmt_vec_info next_info = stmt_info;
3571 if (kind == slp_inst_kind_store)
3572 {
3573 /* Collect the stores and store them in scalar_stmts. */
3574 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3575 while (next_info)
3576 {
3577 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3578 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3579 }
3580 }
3581 else if (kind == slp_inst_kind_reduc_chain)
3582 {
3583 /* Collect the reduction stmts and store them in scalar_stmts. */
3584 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3585 while (next_info)
3586 {
3587 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3588 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3589 }
3590 /* Mark the first element of the reduction chain as reduction to properly
3591 transform the node. In the reduction analysis phase only the last
3592 element of the chain is marked as reduction. */
3593 STMT_VINFO_DEF_TYPE (stmt_info)
3594 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3595 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3596 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3597 }
3598 else if (kind == slp_inst_kind_reduc_group)
3599 {
3600 /* Collect reduction statements. */
3601 const vec<stmt_vec_info> &reductions
3602 = as_a <loop_vec_info> (vinfo)->reductions;
3603 scalar_stmts.create (reductions.length ());
3604 for (i = 0; reductions.iterate (i, &next_info); i++)
3605 if ((STMT_VINFO_RELEVANT_P (next_info)
3606 || STMT_VINFO_LIVE_P (next_info))
3607 /* ??? Make sure we didn't skip a conversion around a reduction
3608 path. In that case we'd have to reverse engineer that conversion
3609 stmt following the chain using reduc_idx and from the PHI
3610 using reduc_def. */
3611 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3612 scalar_stmts.quick_push (next_info);
3613 /* If less than two were relevant/live there's nothing to SLP. */
3614 if (scalar_stmts.length () < 2)
3615 return false;
3616 }
3617 else
3618 gcc_unreachable ();
3619
3620 vec<stmt_vec_info> roots = vNULL;
3621 vec<tree> remain = vNULL;
3622 /* Build the tree for the SLP instance. */
3623 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3624 roots, remain,
3625 max_tree_size, limit, bst_map,
3626 kind == slp_inst_kind_store
3627 ? stmt_info : NULL);
3628
3629 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3630 where we should do store group splitting. */
3631
3632 return res;
3633 }
3634
3635 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
3636 trees of packed scalar stmts if SLP is possible. */
3637
3638 opt_result
3639 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3640 {
3641 unsigned int i;
3642 stmt_vec_info first_element;
3643 slp_instance instance;
3644
3645 DUMP_VECT_SCOPE ("vect_analyze_slp");
3646
3647 unsigned limit = max_tree_size;
3648
3649 scalar_stmts_to_slp_tree_map_t *bst_map
3650 = new scalar_stmts_to_slp_tree_map_t ();
3651
3652 /* Find SLP sequences starting from groups of grouped stores. */
3653 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3654 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3655 slp_inst_kind_store, max_tree_size, &limit);
3656
3657 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3658 {
3659 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3660 {
3661 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3662 /* Apply patterns. */
3663 for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
3664 bb_vinfo->roots[i].stmts[j]
3665 = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
3666 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3667 bb_vinfo->roots[i].stmts,
3668 bb_vinfo->roots[i].roots,
3669 bb_vinfo->roots[i].remain,
3670 max_tree_size, &limit, bst_map, NULL))
3671 {
3672 bb_vinfo->roots[i].stmts = vNULL;
3673 bb_vinfo->roots[i].roots = vNULL;
3674 bb_vinfo->roots[i].remain = vNULL;
3675 }
3676 }
3677 }
3678
3679 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3680 {
3681 /* Find SLP sequences starting from reduction chains. */
3682 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3683 if (! STMT_VINFO_RELEVANT_P (first_element)
3684 && ! STMT_VINFO_LIVE_P (first_element))
3685 ;
3686 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3687 slp_inst_kind_reduc_chain,
3688 max_tree_size, &limit))
3689 {
3690 /* Dissolve reduction chain group. */
3691 stmt_vec_info vinfo = first_element;
3692 stmt_vec_info last = NULL;
3693 while (vinfo)
3694 {
3695 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3696 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3697 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3698 last = vinfo;
3699 vinfo = next;
3700 }
3701 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3702 /* It can be still vectorized as part of an SLP reduction. */
3703 loop_vinfo->reductions.safe_push (last);
3704 }
3705
3706 /* Find SLP sequences starting from groups of reductions. */
3707 if (loop_vinfo->reductions.length () > 1)
3708 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3709 slp_inst_kind_reduc_group, max_tree_size,
3710 &limit);
3711 }
3712
3713 hash_set<slp_tree> visited_patterns;
3714 slp_tree_to_load_perm_map_t perm_cache;
3715 slp_compat_nodes_map_t compat_cache;
3716
3717 /* See if any patterns can be found in the SLP tree. */
3718 bool pattern_found = false;
3719 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3720 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3721 &visited_patterns, &perm_cache,
3722 &compat_cache);
3723
3724 /* If any were found optimize permutations of loads. */
3725 if (pattern_found)
3726 {
3727 hash_map<slp_tree, slp_tree> load_map;
3728 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3729 {
3730 slp_tree root = SLP_INSTANCE_TREE (instance);
3731 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3732 &load_map, root);
3733 }
3734 }
3735
3736
3737
3738 /* The map keeps a reference on SLP nodes built, release that. */
3739 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3740 it != bst_map->end (); ++it)
3741 if ((*it).second)
3742 vect_free_slp_tree ((*it).second);
3743 delete bst_map;
3744
3745 if (pattern_found && dump_enabled_p ())
3746 {
3747 dump_printf_loc (MSG_NOTE, vect_location,
3748 "Pattern matched SLP tree\n");
3749 hash_set<slp_tree> visited;
3750 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3751 vect_print_slp_graph (MSG_NOTE, vect_location,
3752 SLP_INSTANCE_TREE (instance), visited);
3753 }
3754
3755 return opt_result::success ();
3756 }
3757
3758 /* Estimates the cost of inserting layout changes into the SLP graph.
3759 It can also say that the insertion is impossible. */
3760
3761 struct slpg_layout_cost
3762 {
3763 slpg_layout_cost () = default;
3764 slpg_layout_cost (sreal, bool);
3765
3766 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3767 bool is_possible () const { return depth != sreal::max (); }
3768
3769 bool operator== (const slpg_layout_cost &) const;
3770 bool operator!= (const slpg_layout_cost &) const;
3771
3772 bool is_better_than (const slpg_layout_cost &, bool) const;
3773
3774 void add_parallel_cost (const slpg_layout_cost &);
3775 void add_serial_cost (const slpg_layout_cost &);
3776 void split (unsigned int);
3777
3778 /* The longest sequence of layout changes needed during any traversal
3779 of the partition dag, weighted by execution frequency.
3780
3781 This is the most important metric when optimizing for speed, since
3782 it helps to ensure that we keep the number of operations on
3783 critical paths to a minimum. */
3784 sreal depth = 0;
3785
3786 /* An estimate of the total number of operations needed. It is weighted by
3787 execution frequency when optimizing for speed but not when optimizing for
3788 size. In order to avoid double-counting, a node with a fanout of N will
3789 distribute 1/N of its total cost to each successor.
3790
3791 This is the most important metric when optimizing for size, since
3792 it helps to keep the total number of operations to a minimum, */
3793 sreal total = 0;
3794 };
3795
3796 /* Construct costs for a node with weight WEIGHT. A higher weight
3797 indicates more frequent execution. IS_FOR_SIZE is true if we are
3798 optimizing for size rather than speed. */
3799
3800 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3801 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3802 {
3803 }
3804
3805 bool
3806 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3807 {
3808 return depth == other.depth && total == other.total;
3809 }
3810
3811 bool
3812 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3813 {
3814 return !operator== (other);
3815 }
3816
3817 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3818 true if we are optimizing for size rather than speed. */
3819
3820 bool
3821 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3822 bool is_for_size) const
3823 {
3824 if (is_for_size)
3825 {
3826 if (total != other.total)
3827 return total < other.total;
3828 return depth < other.depth;
3829 }
3830 else
3831 {
3832 if (depth != other.depth)
3833 return depth < other.depth;
3834 return total < other.total;
3835 }
3836 }
3837
3838 /* Increase the costs to account for something with cost INPUT_COST
3839 happening in parallel with the current costs. */
3840
3841 void
3842 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3843 {
3844 depth = std::max (depth, input_cost.depth);
3845 total += input_cost.total;
3846 }
3847
3848 /* Increase the costs to account for something with cost INPUT_COST
3849 happening in series with the current costs. */
3850
3851 void
3852 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3853 {
3854 depth += other.depth;
3855 total += other.total;
3856 }
3857
3858 /* Split the total cost among TIMES successors or predecessors. */
3859
3860 void
3861 slpg_layout_cost::split (unsigned int times)
3862 {
3863 if (times > 1)
3864 total /= times;
3865 }
3866
3867 /* Information about one node in the SLP graph, for use during
3868 vect_optimize_slp_pass. */
3869
3870 struct slpg_vertex
3871 {
3872 slpg_vertex (slp_tree node_) : node (node_) {}
3873
3874 /* The node itself. */
3875 slp_tree node;
3876
3877 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3878 partitions are flexible; they can have whichever layout consumers
3879 want them to have. */
3880 int partition = -1;
3881
3882 /* The number of nodes that directly use the result of this one
3883 (i.e. the number of nodes that count this one as a child). */
3884 unsigned int out_degree = 0;
3885
3886 /* The execution frequency of the node. */
3887 sreal weight = 0;
3888
3889 /* The total execution frequency of all nodes that directly use the
3890 result of this one. */
3891 sreal out_weight = 0;
3892 };
3893
3894 /* Information about one partition of the SLP graph, for use during
3895 vect_optimize_slp_pass. */
3896
3897 struct slpg_partition_info
3898 {
3899 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3900 of m_partitioned_nodes. */
3901 unsigned int node_begin = 0;
3902 unsigned int node_end = 0;
3903
3904 /* Which layout we've chosen to use for this partition, or -1 if
3905 we haven't picked one yet. */
3906 int layout = -1;
3907
3908 /* The number of predecessors and successors in the partition dag.
3909 The predecessors always have lower partition numbers and the
3910 successors always have higher partition numbers.
3911
3912 Note that the directions of these edges are not necessarily the
3913 same as in the data flow graph. For example, if an SCC has separate
3914 partitions for an inner loop and an outer loop, the inner loop's
3915 partition will have at least two incoming edges from the outer loop's
3916 partition: one for a live-in value and one for a live-out value.
3917 In data flow terms, one of these edges would also be from the outer loop
3918 to the inner loop, but the other would be in the opposite direction. */
3919 unsigned int in_degree = 0;
3920 unsigned int out_degree = 0;
3921 };
3922
3923 /* Information about the costs of using a particular layout for a
3924 particular partition. It can also say that the combination is
3925 impossible. */
3926
3927 struct slpg_partition_layout_costs
3928 {
3929 bool is_possible () const { return internal_cost.is_possible (); }
3930 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3931
3932 /* The costs inherited from predecessor partitions. */
3933 slpg_layout_cost in_cost;
3934
3935 /* The inherent cost of the layout within the node itself. For example,
3936 this is nonzero for a load if choosing a particular layout would require
3937 the load to permute the loaded elements. It is nonzero for a
3938 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3939 to full-vector moves. */
3940 slpg_layout_cost internal_cost;
3941
3942 /* The costs inherited from successor partitions. */
3943 slpg_layout_cost out_cost;
3944 };
3945
3946 /* This class tries to optimize the layout of vectors in order to avoid
3947 unnecessary shuffling. At the moment, the set of possible layouts are
3948 restricted to bijective permutations.
3949
3950 The goal of the pass depends on whether we're optimizing for size or
3951 for speed. When optimizing for size, the goal is to reduce the overall
3952 number of layout changes (including layout changes implied by things
3953 like load permutations). When optimizing for speed, the goal is to
3954 reduce the maximum latency attributable to layout changes on any
3955 non-cyclical path through the data flow graph.
3956
3957 For example, when optimizing a loop nest for speed, we will prefer
3958 to make layout changes outside of a loop rather than inside of a loop,
3959 and will prefer to make layout changes in parallel rather than serially,
3960 even if that increases the overall number of layout changes.
3961
3962 The high-level procedure is:
3963
3964 (1) Build a graph in which edges go from uses (parents) to definitions
3965 (children).
3966
3967 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3968
3969 (3) When optimizing for speed, partition the nodes in each SCC based
3970 on their containing cfg loop. When optimizing for size, treat
3971 each SCC as a single partition.
3972
3973 This gives us a dag of partitions. The goal is now to assign a
3974 layout to each partition.
3975
3976 (4) Construct a set of vector layouts that are worth considering.
3977 Record which nodes must keep their current layout.
3978
3979 (5) Perform a forward walk over the partition dag (from loads to stores)
3980 accumulating the "forward" cost of using each layout. When visiting
3981 each partition, assign a tentative choice of layout to the partition
3982 and use that choice when calculating the cost of using a different
3983 layout in successor partitions.
3984
3985 (6) Perform a backward walk over the partition dag (from stores to loads),
3986 accumulating the "backward" cost of using each layout. When visiting
3987 each partition, make a final choice of layout for that partition based
3988 on the accumulated forward costs (from (5)) and backward costs
3989 (from (6)).
3990
3991 (7) Apply the chosen layouts to the SLP graph.
3992
3993 For example, consider the SLP statements:
3994
3995 S1: a_1 = load
3996 loop:
3997 S2: a_2 = PHI<a_1, a_3>
3998 S3: b_1 = load
3999 S4: a_3 = a_2 + b_1
4000 exit:
4001 S5: a_4 = PHI<a_3>
4002 S6: store a_4
4003
4004 S2 and S4 form an SCC and are part of the same loop. Every other
4005 statement is in a singleton SCC. In this example there is a one-to-one
4006 mapping between SCCs and partitions and the partition dag looks like this;
4007
4008 S1 S3
4009 \ /
4010 S2+S4
4011 |
4012 S5
4013 |
4014 S6
4015
4016 S2, S3 and S4 will have a higher execution frequency than the other
4017 statements, so when optimizing for speed, the goal is to avoid any
4018 layout changes:
4019
4020 - within S3
4021 - within S2+S4
4022 - on the S3->S2+S4 edge
4023
4024 For example, if S3 was originally a reversing load, the goal of the
4025 pass is to make it an unreversed load and change the layout on the
4026 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
4027 on S1->S2+S4 and S5->S6 would also be acceptable.)
4028
4029 The difference between SCCs and partitions becomes important if we
4030 add an outer loop:
4031
4032 S1: a_1 = ...
4033 loop1:
4034 S2: a_2 = PHI<a_1, a_6>
4035 S3: b_1 = load
4036 S4: a_3 = a_2 + b_1
4037 loop2:
4038 S5: a_4 = PHI<a_3, a_5>
4039 S6: c_1 = load
4040 S7: a_5 = a_4 + c_1
4041 exit2:
4042 S8: a_6 = PHI<a_5>
4043 S9: store a_6
4044 exit1:
4045
4046 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
4047 for speed, we usually do not want restrictions in the outer loop to "infect"
4048 the decision for the inner loop. For example, if an outer-loop node
4049 in the SCC contains a statement with a fixed layout, that should not
4050 prevent the inner loop from using a different layout. Conversely,
4051 the inner loop should not dictate a layout to the outer loop: if the
4052 outer loop does a lot of computation, then it may not be efficient to
4053 do all of that computation in the inner loop's preferred layout.
4054
4055 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
4056 and S5+S7 (inner). We also try to arrange partitions so that:
4057
4058 - the partition for an outer loop comes before the partition for
4059 an inner loop
4060
4061 - if a sibling loop A dominates a sibling loop B, A's partition
4062 comes before B's
4063
4064 This gives the following partition dag for the example above:
4065
4066 S1 S3
4067 \ /
4068 S2+S4+S8 S6
4069 | \\ /
4070 | S5+S7
4071 |
4072 S9
4073
4074 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
4075 one for a reversal of the edge S7->S8.
4076
4077 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
4078 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
4079 preferred layout against the cost of changing the layout on entry to the
4080 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
4081
4082 Although this works well when optimizing for speed, it has the downside
4083 when optimizing for size that the choice of layout for S5+S7 is completely
4084 independent of S9, which lessens the chance of reducing the overall number
4085 of permutations. We therefore do not partition SCCs when optimizing
4086 for size.
4087
4088 To give a concrete example of the difference between optimizing
4089 for size and speed, consider:
4090
4091 a[0] = (b[1] << c[3]) - d[1];
4092 a[1] = (b[0] << c[2]) - d[0];
4093 a[2] = (b[3] << c[1]) - d[3];
4094 a[3] = (b[2] << c[0]) - d[2];
4095
4096 There are three different layouts here: one for a, one for b and d,
4097 and one for c. When optimizing for speed it is better to permute each
4098 of b, c and d into the order required by a, since those permutations
4099 happen in parallel. But when optimizing for size, it is better to:
4100
4101 - permute c into the same order as b
4102 - do the arithmetic
4103 - permute the result into the order required by a
4104
4105 This gives 2 permutations rather than 3. */
4106
4107 class vect_optimize_slp_pass
4108 {
4109 public:
4110 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
4111 void run ();
4112
4113 private:
4114 /* Graph building. */
4115 struct loop *containing_loop (slp_tree);
4116 bool is_cfg_latch_edge (graph_edge *);
4117 void build_vertices (hash_set<slp_tree> &, slp_tree);
4118 void build_vertices ();
4119 void build_graph ();
4120
4121 /* Partitioning. */
4122 void create_partitions ();
4123 template<typename T> void for_each_partition_edge (unsigned int, T);
4124
4125 /* Layout selection. */
4126 bool is_compatible_layout (slp_tree, unsigned int);
4127 int change_layout_cost (slp_tree, unsigned int, unsigned int);
4128 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
4129 unsigned int);
4130 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4131 int, unsigned int);
4132 int internal_node_cost (slp_tree, int, unsigned int);
4133 void start_choosing_layouts ();
4134
4135 /* Cost propagation. */
4136 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4137 unsigned int, unsigned int);
4138 slpg_layout_cost total_in_cost (unsigned int);
4139 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4140 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4141 void forward_pass ();
4142 void backward_pass ();
4143
4144 /* Rematerialization. */
4145 slp_tree get_result_with_layout (slp_tree, unsigned int);
4146 void materialize ();
4147
4148 /* Clean-up. */
4149 void remove_redundant_permutations ();
4150
4151 void dump ();
4152
4153 vec_info *m_vinfo;
4154
4155 /* True if we should optimize the graph for size, false if we should
4156 optimize it for speed. (It wouldn't be easy to make this decision
4157 more locally.) */
4158 bool m_optimize_size;
4159
4160 /* A graph of all SLP nodes, with edges leading from uses to definitions.
4161 In other words, a node's predecessors are its slp_tree parents and
4162 a node's successors are its slp_tree children. */
4163 graph *m_slpg = nullptr;
4164
4165 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
4166 auto_vec<slpg_vertex> m_vertices;
4167
4168 /* The list of all leaves of M_SLPG. such as external definitions, constants,
4169 and loads. */
4170 auto_vec<int> m_leafs;
4171
4172 /* This array has one entry for every vector layout that we're considering.
4173 Element 0 is null and indicates "no change". Other entries describe
4174 permutations that are inherent in the current graph and that we would
4175 like to reverse if possible.
4176
4177 For example, a permutation { 1, 2, 3, 0 } means that something has
4178 effectively been permuted in that way, such as a load group
4179 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4180 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4181 in order to put things "back" in order. */
4182 auto_vec<vec<unsigned> > m_perms;
4183
4184 /* A partitioning of the nodes for which a layout must be chosen.
4185 Each partition represents an <SCC, cfg loop> pair; that is,
4186 nodes in different SCCs belong to different partitions, and nodes
4187 within an SCC can be further partitioned according to a containing
4188 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4189
4190 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4191 from leaves (such as loads) to roots (such as stores).
4192
4193 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4194 auto_vec<slpg_partition_info> m_partitions;
4195
4196 /* The list of all nodes for which a layout must be chosen. Nodes for
4197 partition P come before the nodes for partition P+1. Nodes within a
4198 partition are in reverse postorder. */
4199 auto_vec<unsigned int> m_partitioned_nodes;
4200
4201 /* Index P * num-layouts + L contains the cost of using layout L
4202 for partition P. */
4203 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4204
4205 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4206 original output of node N adjusted to have layout L. */
4207 auto_vec<slp_tree> m_node_layouts;
4208 };
4209
4210 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4211 Also record whether we should optimize anything for speed rather
4212 than size. */
4213
4214 void
4215 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4216 slp_tree node)
4217 {
4218 unsigned i;
4219 slp_tree child;
4220
4221 if (visited.add (node))
4222 return;
4223
4224 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4225 {
4226 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4227 if (optimize_bb_for_speed_p (bb))
4228 m_optimize_size = false;
4229 }
4230
4231 node->vertex = m_vertices.length ();
4232 m_vertices.safe_push (slpg_vertex (node));
4233
4234 bool leaf = true;
4235 bool force_leaf = false;
4236 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4237 if (child)
4238 {
4239 leaf = false;
4240 build_vertices (visited, child);
4241 }
4242 else
4243 force_leaf = true;
4244 /* Since SLP discovery works along use-def edges all cycles have an
4245 entry - but there's the exception of cycles where we do not handle
4246 the entry explicitely (but with a NULL SLP node), like some reductions
4247 and inductions. Force those SLP PHIs to act as leafs to make them
4248 backwards reachable. */
4249 if (leaf || force_leaf)
4250 m_leafs.safe_push (node->vertex);
4251 }
4252
4253 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4254
4255 void
4256 vect_optimize_slp_pass::build_vertices ()
4257 {
4258 hash_set<slp_tree> visited;
4259 unsigned i;
4260 slp_instance instance;
4261 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4262 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4263 }
4264
4265 /* Apply (reverse) bijectite PERM to VEC. */
4266
4267 template <class T>
4268 static void
4269 vect_slp_permute (vec<unsigned> perm,
4270 vec<T> &vec, bool reverse)
4271 {
4272 auto_vec<T, 64> saved;
4273 saved.create (vec.length ());
4274 for (unsigned i = 0; i < vec.length (); ++i)
4275 saved.quick_push (vec[i]);
4276
4277 if (reverse)
4278 {
4279 for (unsigned i = 0; i < vec.length (); ++i)
4280 vec[perm[i]] = saved[i];
4281 for (unsigned i = 0; i < vec.length (); ++i)
4282 gcc_assert (vec[perm[i]] == saved[i]);
4283 }
4284 else
4285 {
4286 for (unsigned i = 0; i < vec.length (); ++i)
4287 vec[i] = saved[perm[i]];
4288 for (unsigned i = 0; i < vec.length (); ++i)
4289 gcc_assert (vec[i] == saved[perm[i]]);
4290 }
4291 }
4292
4293 /* Return the cfg loop that contains NODE. */
4294
4295 struct loop *
4296 vect_optimize_slp_pass::containing_loop (slp_tree node)
4297 {
4298 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4299 if (!rep)
4300 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4301 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4302 }
4303
4304 /* Return true if UD (an edge from a use to a definition) is associated
4305 with a loop latch edge in the cfg. */
4306
4307 bool
4308 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4309 {
4310 slp_tree use = m_vertices[ud->src].node;
4311 slp_tree def = m_vertices[ud->dest].node;
4312 if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
4313 || SLP_TREE_CODE (use) == VEC_PERM_EXPR)
4314 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4315 return false;
4316
4317 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4318 return (is_a<gphi *> (use_rep->stmt)
4319 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4320 && containing_loop (def) == containing_loop (use));
4321 }
4322
4323 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4324 a nonnull data field. */
4325
4326 void
4327 vect_optimize_slp_pass::build_graph ()
4328 {
4329 m_optimize_size = true;
4330 build_vertices ();
4331
4332 m_slpg = new_graph (m_vertices.length ());
4333 for (slpg_vertex &v : m_vertices)
4334 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4335 if (child)
4336 {
4337 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4338 if (is_cfg_latch_edge (ud))
4339 ud->data = this;
4340 }
4341 }
4342
4343 /* Return true if E corresponds to a loop latch edge in the cfg. */
4344
4345 static bool
4346 skip_cfg_latch_edges (graph_edge *e)
4347 {
4348 return e->data;
4349 }
4350
4351 /* Create the node partitions. */
4352
4353 void
4354 vect_optimize_slp_pass::create_partitions ()
4355 {
4356 /* Calculate a postorder of the graph, ignoring edges that correspond
4357 to natural latch edges in the cfg. Reading the vector from the end
4358 to the beginning gives the reverse postorder. */
4359 auto_vec<int> initial_rpo;
4360 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4361 false, NULL, skip_cfg_latch_edges);
4362 gcc_assert (initial_rpo.length () == m_vertices.length ());
4363
4364 /* Calculate the strongly connected components of the graph. */
4365 auto_vec<int> scc_grouping;
4366 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4367
4368 /* Create a new index order in which all nodes from the same SCC are
4369 consecutive. Use scc_pos to record the index of the first node in
4370 each SCC. */
4371 auto_vec<unsigned int> scc_pos (num_sccs);
4372 int last_component = -1;
4373 unsigned int node_count = 0;
4374 for (unsigned int node_i : scc_grouping)
4375 {
4376 if (last_component != m_slpg->vertices[node_i].component)
4377 {
4378 last_component = m_slpg->vertices[node_i].component;
4379 gcc_assert (last_component == int (scc_pos.length ()));
4380 scc_pos.quick_push (node_count);
4381 }
4382 node_count += 1;
4383 }
4384 gcc_assert (node_count == initial_rpo.length ()
4385 && last_component + 1 == int (num_sccs));
4386
4387 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4388 inside each SCC following the RPO we calculated above. The fact that
4389 we ignored natural latch edges when calculating the RPO should ensure
4390 that, for natural loop nests:
4391
4392 - the first node that we encounter in a cfg loop is the loop header phi
4393 - the loop header phis are in dominance order
4394
4395 Arranging for this is an optimization (see below) rather than a
4396 correctness issue. Unnatural loops with a tangled mess of backedges
4397 will still work correctly, but might give poorer results.
4398
4399 Also update scc_pos so that it gives 1 + the index of the last node
4400 in the SCC. */
4401 m_partitioned_nodes.safe_grow (node_count);
4402 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4403 {
4404 unsigned int node_i = initial_rpo[old_i];
4405 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4406 m_partitioned_nodes[new_i] = node_i;
4407 }
4408
4409 /* When optimizing for speed, partition each SCC based on the containing
4410 cfg loop. The order we constructed above should ensure that, for natural
4411 cfg loops, we'll create sub-SCC partitions for outer loops before
4412 the corresponding sub-SCC partitions for inner loops. Similarly,
4413 when one sibling loop A dominates another sibling loop B, we should
4414 create a sub-SCC partition for A before a sub-SCC partition for B.
4415
4416 As above, nothing depends for correctness on whether this achieves
4417 a natural nesting, but we should get better results when it does. */
4418 m_partitions.reserve (m_vertices.length ());
4419 unsigned int next_partition_i = 0;
4420 hash_map<struct loop *, int> loop_partitions;
4421 unsigned int rpo_begin = 0;
4422 unsigned int num_partitioned_nodes = 0;
4423 for (unsigned int rpo_end : scc_pos)
4424 {
4425 loop_partitions.empty ();
4426 unsigned int partition_i = next_partition_i;
4427 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4428 {
4429 /* Handle externals and constants optimistically throughout.
4430 But treat existing vectors as fixed since we do not handle
4431 permuting them. */
4432 unsigned int node_i = m_partitioned_nodes[rpo_i];
4433 auto &vertex = m_vertices[node_i];
4434 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4435 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4436 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4437 vertex.partition = -1;
4438 else
4439 {
4440 bool existed;
4441 if (m_optimize_size)
4442 existed = next_partition_i > partition_i;
4443 else
4444 {
4445 struct loop *loop = containing_loop (vertex.node);
4446 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4447 if (!existed)
4448 entry = next_partition_i;
4449 partition_i = entry;
4450 }
4451 if (!existed)
4452 {
4453 m_partitions.quick_push (slpg_partition_info ());
4454 next_partition_i += 1;
4455 }
4456 vertex.partition = partition_i;
4457 num_partitioned_nodes += 1;
4458 m_partitions[partition_i].node_end += 1;
4459 }
4460 }
4461 rpo_begin = rpo_end;
4462 }
4463
4464 /* Assign ranges of consecutive node indices to each partition,
4465 in partition order. Start with node_end being the same as
4466 node_begin so that the next loop can use it as a counter. */
4467 unsigned int node_begin = 0;
4468 for (auto &partition : m_partitions)
4469 {
4470 partition.node_begin = node_begin;
4471 node_begin += partition.node_end;
4472 partition.node_end = partition.node_begin;
4473 }
4474 gcc_assert (node_begin == num_partitioned_nodes);
4475
4476 /* Finally build the list of nodes in partition order. */
4477 m_partitioned_nodes.truncate (num_partitioned_nodes);
4478 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4479 {
4480 int partition_i = m_vertices[node_i].partition;
4481 if (partition_i >= 0)
4482 {
4483 unsigned int order_i = m_partitions[partition_i].node_end++;
4484 m_partitioned_nodes[order_i] = node_i;
4485 }
4486 }
4487 }
4488
4489 /* Look for edges from earlier partitions into node NODE_I and edges from
4490 node NODE_I into later partitions. Call:
4491
4492 FN (ud, other_node_i)
4493
4494 for each such use-to-def edge ud, where other_node_i is the node at the
4495 other end of the edge. */
4496
4497 template<typename T>
4498 void
4499 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4500 {
4501 int partition_i = m_vertices[node_i].partition;
4502 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4503 pred; pred = pred->pred_next)
4504 {
4505 int src_partition_i = m_vertices[pred->src].partition;
4506 if (src_partition_i >= 0 && src_partition_i != partition_i)
4507 fn (pred, pred->src);
4508 }
4509 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4510 succ; succ = succ->succ_next)
4511 {
4512 int dest_partition_i = m_vertices[succ->dest].partition;
4513 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4514 fn (succ, succ->dest);
4515 }
4516 }
4517
4518 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4519 that NODE would operate on. This test is independent of NODE's actual
4520 operation. */
4521
4522 bool
4523 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4524 unsigned int layout_i)
4525 {
4526 if (layout_i == 0)
4527 return true;
4528
4529 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4530 return false;
4531
4532 return true;
4533 }
4534
4535 /* Return the cost (in arbtirary units) of going from layout FROM_LAYOUT_I
4536 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4537 layouts is incompatible with NODE or if the change is not possible for
4538 some other reason.
4539
4540 The properties taken from NODE include the number of lanes and the
4541 vector type. The actual operation doesn't matter. */
4542
4543 int
4544 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4545 unsigned int from_layout_i,
4546 unsigned int to_layout_i)
4547 {
4548 if (!is_compatible_layout (node, from_layout_i)
4549 || !is_compatible_layout (node, to_layout_i))
4550 return -1;
4551
4552 if (from_layout_i == to_layout_i)
4553 return 0;
4554
4555 auto_vec<slp_tree, 1> children (1);
4556 children.quick_push (node);
4557 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4558 if (from_layout_i > 0)
4559 for (unsigned int i : m_perms[from_layout_i])
4560 perm.quick_push ({ 0, i });
4561 else
4562 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4563 perm.quick_push ({ 0, i });
4564 if (to_layout_i > 0)
4565 vect_slp_permute (m_perms[to_layout_i], perm, true);
4566 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4567 children, false);
4568 if (count >= 0)
4569 return MAX (count, 1);
4570
4571 /* ??? In principle we could try changing via layout 0, giving two
4572 layout changes rather than 1. Doing that would require
4573 corresponding support in get_result_with_layout. */
4574 return -1;
4575 }
4576
4577 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4578
4579 inline slpg_partition_layout_costs &
4580 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4581 unsigned int layout_i)
4582 {
4583 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4584 }
4585
4586 /* Change PERM in one of two ways:
4587
4588 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4589 chosen for child I of NODE.
4590
4591 - if IN_LAYOUT >= 0, accept all inputs operands with that layout.
4592
4593 In both cases, arrange for the output to have layout OUT_LAYOUT_I */
4594
4595 void
4596 vect_optimize_slp_pass::
4597 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4598 int in_layout_i, unsigned int out_layout_i)
4599 {
4600 for (auto &entry : perm)
4601 {
4602 int this_in_layout_i = in_layout_i;
4603 if (this_in_layout_i < 0)
4604 {
4605 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4606 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4607 this_in_layout_i = m_partitions[in_partition_i].layout;
4608 }
4609 if (this_in_layout_i > 0)
4610 entry.second = m_perms[this_in_layout_i][entry.second];
4611 }
4612 if (out_layout_i > 0)
4613 vect_slp_permute (m_perms[out_layout_i], perm, true);
4614 }
4615
4616 /* Check whether the target allows NODE to be rearranged so that the node's
4617 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4618 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4619
4620 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4621 NODE can adapt to the layout changes that have (perhaps provisionally)
4622 been chosen for NODE's children, so that no extra permutations are
4623 needed on either the input or the output of NODE.
4624
4625 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4626 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4627
4628 IN_LAYOUT_I has no meaning for other types of node.
4629
4630 Keeping the node as-is is always valid. If the target doesn't appear
4631 to support the node as-is, but might realistically support other layouts,
4632 then layout 0 instead has the cost of a worst-case permutation. On the
4633 one hand, this ensures that every node has at least one valid layout,
4634 avoiding what would otherwise be an awkward special case. On the other,
4635 it still encourages the pass to change an invalid pre-existing layout
4636 choice into a valid one. */
4637
4638 int
4639 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4640 unsigned int out_layout_i)
4641 {
4642 const int fallback_cost = 1;
4643
4644 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4645 {
4646 auto_lane_permutation_t tmp_perm;
4647 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4648
4649 /* Check that the child nodes support the chosen layout. Checking
4650 the first child is enough, since any second child would have the
4651 same shape. */
4652 auto first_child = SLP_TREE_CHILDREN (node)[0];
4653 if (in_layout_i > 0
4654 && !is_compatible_layout (first_child, in_layout_i))
4655 return -1;
4656
4657 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4658 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4659 node, tmp_perm,
4660 SLP_TREE_CHILDREN (node),
4661 false);
4662 if (count < 0)
4663 {
4664 if (in_layout_i == 0 && out_layout_i == 0)
4665 {
4666 /* Use the fallback cost if the node could in principle support
4667 some nonzero layout for both the inputs and the outputs.
4668 Otherwise assume that the node will be rejected later
4669 and rebuilt from scalars. */
4670 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4671 return fallback_cost;
4672 return 0;
4673 }
4674 return -1;
4675 }
4676
4677 /* We currently have no way of telling whether the new layout is cheaper
4678 or more expensive than the old one. But at least in principle,
4679 it should be worth making zero permutations (whole-vector shuffles)
4680 cheaper than real permutations, in case the pass is able to remove
4681 the latter. */
4682 return count == 0 ? 0 : 1;
4683 }
4684
4685 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4686 if (rep
4687 && STMT_VINFO_DATA_REF (rep)
4688 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4689 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4690 {
4691 auto_load_permutation_t tmp_perm;
4692 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4693 if (out_layout_i > 0)
4694 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4695
4696 poly_uint64 vf = 1;
4697 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4698 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4699 unsigned int n_perms;
4700 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4701 nullptr, vf, true, false, &n_perms))
4702 {
4703 auto rep = SLP_TREE_REPRESENTATIVE (node);
4704 if (out_layout_i == 0)
4705 {
4706 /* Use the fallback cost if the load is an N-to-N permutation.
4707 Otherwise assume that the node will be rejected later
4708 and rebuilt from scalars. */
4709 if (STMT_VINFO_GROUPED_ACCESS (rep)
4710 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4711 == SLP_TREE_LANES (node)))
4712 return fallback_cost;
4713 return 0;
4714 }
4715 return -1;
4716 }
4717
4718 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4719 return n_perms == 0 ? 0 : 1;
4720 }
4721
4722 return 0;
4723 }
4724
4725 /* Decide which element layouts we should consider using. Calculate the
4726 weights associated with inserting layout changes on partition edges.
4727 Also mark partitions that cannot change layout, by setting their
4728 layout to zero. */
4729
4730 void
4731 vect_optimize_slp_pass::start_choosing_layouts ()
4732 {
4733 /* Used to assign unique permutation indices. */
4734 using perm_hash = unbounded_hashmap_traits<
4735 vec_free_hash_base<int_hash_base<unsigned>>,
4736 int_hash<int, -1, -2>
4737 >;
4738 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4739
4740 /* Layout 0 is "no change". */
4741 m_perms.safe_push (vNULL);
4742
4743 /* Create layouts from existing permutations. */
4744 auto_load_permutation_t tmp_perm;
4745 for (unsigned int node_i : m_partitioned_nodes)
4746 {
4747 /* Leafs also double as entries to the reverse graph. Allow the
4748 layout of those to be changed. */
4749 auto &vertex = m_vertices[node_i];
4750 auto &partition = m_partitions[vertex.partition];
4751 if (!m_slpg->vertices[node_i].succ)
4752 partition.layout = 0;
4753
4754 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4755 slp_tree node = vertex.node;
4756 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4757 slp_tree child;
4758 unsigned HOST_WIDE_INT imin, imax = 0;
4759 bool any_permute = false;
4760 tmp_perm.truncate (0);
4761 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4762 {
4763 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4764 unpermuted, record a layout that reverses this permutation.
4765
4766 We would need more work to cope with loads that are internally
4767 permuted and also have inputs (such as masks for
4768 IFN_MASK_LOADs). */
4769 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4770 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4771 {
4772 partition.layout = -1;
4773 continue;
4774 }
4775 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4776 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4777 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4778 }
4779 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4780 && SLP_TREE_CHILDREN (node).length () == 1
4781 && (child = SLP_TREE_CHILDREN (node)[0])
4782 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4783 .is_constant (&imin)))
4784 {
4785 /* If the child has the same vector size as this node,
4786 reversing the permutation can make the permutation a no-op.
4787 In other cases it can change a true permutation into a
4788 full-vector extract. */
4789 tmp_perm.reserve (SLP_TREE_LANES (node));
4790 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4791 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4792 }
4793 else
4794 continue;
4795
4796 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4797 {
4798 unsigned idx = tmp_perm[j];
4799 imin = MIN (imin, idx);
4800 imax = MAX (imax, idx);
4801 if (idx - tmp_perm[0] != j)
4802 any_permute = true;
4803 }
4804 /* If the span doesn't match we'd disrupt VF computation, avoid
4805 that for now. */
4806 if (imax - imin + 1 != SLP_TREE_LANES (node))
4807 continue;
4808 /* If there's no permute no need to split one out. In this case
4809 we can consider turning a load into a permuted load, if that
4810 turns out to be cheaper than alternatives. */
4811 if (!any_permute)
4812 {
4813 partition.layout = -1;
4814 continue;
4815 }
4816
4817 /* For now only handle true permutes, like
4818 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4819 when permuting constants and invariants keeping the permute
4820 bijective. */
4821 auto_sbitmap load_index (SLP_TREE_LANES (node));
4822 bitmap_clear (load_index);
4823 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4824 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4825 unsigned j;
4826 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4827 if (!bitmap_bit_p (load_index, j))
4828 break;
4829 if (j != SLP_TREE_LANES (node))
4830 continue;
4831
4832 vec<unsigned> perm = vNULL;
4833 perm.safe_grow (SLP_TREE_LANES (node), true);
4834 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4835 perm[j] = tmp_perm[j] - imin;
4836
4837 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4838 {
4839 /* Continue to use existing layouts, but don't add any more. */
4840 int *entry = layout_ids.get (perm);
4841 partition.layout = entry ? *entry : 0;
4842 perm.release ();
4843 }
4844 else
4845 {
4846 bool existed;
4847 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4848 if (existed)
4849 perm.release ();
4850 else
4851 {
4852 layout_i = m_perms.length ();
4853 m_perms.safe_push (perm);
4854 }
4855 partition.layout = layout_i;
4856 }
4857 }
4858
4859 /* Initially assume that every layout is possible and has zero cost
4860 in every partition. */
4861 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4862 * m_perms.length ());
4863
4864 /* We have to mark outgoing permutations facing non-associating-reduction
4865 graph entries that are not represented as to be materialized.
4866 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4867 for (slp_instance instance : m_vinfo->slp_instances)
4868 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4869 {
4870 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4871 m_partitions[m_vertices[node_i].partition].layout = 0;
4872 }
4873 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4874 {
4875 stmt_vec_info stmt_info
4876 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4877 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4878 if (needs_fold_left_reduction_p (TREE_TYPE
4879 (gimple_get_lhs (stmt_info->stmt)),
4880 STMT_VINFO_REDUC_CODE (reduc_info)))
4881 {
4882 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4883 m_partitions[m_vertices[node_i].partition].layout = 0;
4884 }
4885 }
4886
4887 /* Check which layouts each node and partition can handle. Calculate the
4888 weights associated with inserting layout changes on edges. */
4889 for (unsigned int node_i : m_partitioned_nodes)
4890 {
4891 auto &vertex = m_vertices[node_i];
4892 auto &partition = m_partitions[vertex.partition];
4893 slp_tree node = vertex.node;
4894
4895 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4896 {
4897 vertex.weight = vect_slp_node_weight (node);
4898
4899 /* We do not handle stores with a permutation, so all
4900 incoming permutations must have been materialized.
4901
4902 We also don't handle masked grouped loads, which lack a
4903 permutation vector. In this case the memory locations
4904 form an implicit second input to the loads, on top of the
4905 explicit mask input, and the memory input's layout cannot
4906 be changed.
4907
4908 On the other hand, we do support permuting gather loads and
4909 masked gather loads, where each scalar load is independent
4910 of the others. This can be useful if the address/index input
4911 benefits from permutation. */
4912 if (STMT_VINFO_DATA_REF (rep)
4913 && STMT_VINFO_GROUPED_ACCESS (rep)
4914 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4915 partition.layout = 0;
4916
4917 /* We cannot change the layout of an operation that is
4918 not independent on lanes. Note this is an explicit
4919 negative list since that's much shorter than the respective
4920 positive one but it's critical to keep maintaining it. */
4921 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4922 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4923 {
4924 case CFN_COMPLEX_ADD_ROT90:
4925 case CFN_COMPLEX_ADD_ROT270:
4926 case CFN_COMPLEX_MUL:
4927 case CFN_COMPLEX_MUL_CONJ:
4928 case CFN_VEC_ADDSUB:
4929 case CFN_VEC_FMADDSUB:
4930 case CFN_VEC_FMSUBADD:
4931 partition.layout = 0;
4932 default:;
4933 }
4934 }
4935
4936 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4937 {
4938 auto &other_vertex = m_vertices[other_node_i];
4939
4940 /* Count the number of edges from earlier partitions and the number
4941 of edges to later partitions. */
4942 if (other_vertex.partition < vertex.partition)
4943 partition.in_degree += 1;
4944 else
4945 partition.out_degree += 1;
4946
4947 /* If the current node uses the result of OTHER_NODE_I, accumulate
4948 the effects of that. */
4949 if (ud->src == int (node_i))
4950 {
4951 other_vertex.out_weight += vertex.weight;
4952 other_vertex.out_degree += 1;
4953 }
4954 };
4955 for_each_partition_edge (node_i, process_edge);
4956 }
4957 }
4958
4959 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4960 its current (provisional) choice of layout. The inputs do not necessarily
4961 have the same layout as each other. */
4962
4963 slpg_layout_cost
4964 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4965 {
4966 auto &vertex = m_vertices[node_i];
4967 slpg_layout_cost cost;
4968 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4969 {
4970 auto &other_vertex = m_vertices[other_node_i];
4971 if (other_vertex.partition < vertex.partition)
4972 {
4973 auto &other_partition = m_partitions[other_vertex.partition];
4974 auto &other_costs = partition_layout_costs (other_vertex.partition,
4975 other_partition.layout);
4976 slpg_layout_cost this_cost = other_costs.in_cost;
4977 this_cost.add_serial_cost (other_costs.internal_cost);
4978 this_cost.split (other_partition.out_degree);
4979 cost.add_parallel_cost (this_cost);
4980 }
4981 };
4982 for_each_partition_edge (node_i, add_cost);
4983 return cost;
4984 }
4985
4986 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4987 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4988 slpg_layout_cost::impossible () if the change isn't possible. */
4989
4990 slpg_layout_cost
4991 vect_optimize_slp_pass::
4992 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4993 unsigned int layout2_i)
4994 {
4995 auto &def_vertex = m_vertices[ud->dest];
4996 auto &use_vertex = m_vertices[ud->src];
4997 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4998 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4999 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
5000 use_layout_i);
5001 if (factor < 0)
5002 return slpg_layout_cost::impossible ();
5003
5004 /* We have a choice of putting the layout change at the site of the
5005 definition or at the site of the use. Prefer the former when
5006 optimizing for size or when the execution frequency of the
5007 definition is no greater than the combined execution frequencies of
5008 the uses. When putting the layout change at the site of the definition,
5009 divvy up the cost among all consumers. */
5010 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
5011 {
5012 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
5013 cost.split (def_vertex.out_degree);
5014 return cost;
5015 }
5016 return { use_vertex.weight * factor, m_optimize_size };
5017 }
5018
5019 /* UD represents a use-def link between FROM_NODE_I and a node in a later
5020 partition; FROM_NODE_I could be the definition node or the use node.
5021 The node at the other end of the link wants to use layout TO_LAYOUT_I.
5022 Return the cost of any necessary fix-ups on edge UD, or return
5023 slpg_layout_cost::impossible () if the change isn't possible.
5024
5025 At this point, FROM_NODE_I's partition has chosen the cheapest
5026 layout based on the information available so far, but this choice
5027 is only provisional. */
5028
5029 slpg_layout_cost
5030 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
5031 unsigned int to_layout_i)
5032 {
5033 auto &from_vertex = m_vertices[from_node_i];
5034 unsigned int from_partition_i = from_vertex.partition;
5035 slpg_partition_info &from_partition = m_partitions[from_partition_i];
5036 gcc_assert (from_partition.layout >= 0);
5037
5038 /* First calculate the cost on the assumption that FROM_PARTITION sticks
5039 with its current layout preference. */
5040 slpg_layout_cost cost = slpg_layout_cost::impossible ();
5041 auto edge_cost = edge_layout_cost (ud, from_node_i,
5042 from_partition.layout, to_layout_i);
5043 if (edge_cost.is_possible ())
5044 {
5045 auto &from_costs = partition_layout_costs (from_partition_i,
5046 from_partition.layout);
5047 cost = from_costs.in_cost;
5048 cost.add_serial_cost (from_costs.internal_cost);
5049 cost.split (from_partition.out_degree);
5050 cost.add_serial_cost (edge_cost);
5051 }
5052 else if (from_partition.layout == 0)
5053 /* We must allow the source partition to have layout 0 as a fallback,
5054 in case all other options turn out to be impossible. */
5055 return cost;
5056
5057 /* Take the minimum of that cost and the cost that applies if
5058 FROM_PARTITION instead switches to TO_LAYOUT_I. */
5059 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
5060 to_layout_i);
5061 if (direct_layout_costs.is_possible ())
5062 {
5063 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
5064 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
5065 direct_cost.split (from_partition.out_degree);
5066 if (!cost.is_possible ()
5067 || direct_cost.is_better_than (cost, m_optimize_size))
5068 cost = direct_cost;
5069 }
5070
5071 return cost;
5072 }
5073
5074 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
5075 partition; TO_NODE_I could be the definition node or the use node.
5076 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
5077 return the cost of any necessary fix-ups on edge UD, or
5078 slpg_layout_cost::impossible () if the choice cannot be made.
5079
5080 At this point, TO_NODE_I's partition has a fixed choice of layout. */
5081
5082 slpg_layout_cost
5083 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
5084 unsigned int from_layout_i)
5085 {
5086 auto &to_vertex = m_vertices[to_node_i];
5087 unsigned int to_partition_i = to_vertex.partition;
5088 slpg_partition_info &to_partition = m_partitions[to_partition_i];
5089 gcc_assert (to_partition.layout >= 0);
5090
5091 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
5092 adjusted for this input having layout FROM_LAYOUT_I. Assume that
5093 any other inputs keep their current choice of layout. */
5094 auto &to_costs = partition_layout_costs (to_partition_i,
5095 to_partition.layout);
5096 if (ud->src == int (to_node_i)
5097 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
5098 {
5099 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
5100 auto old_layout = from_partition.layout;
5101 from_partition.layout = from_layout_i;
5102 int factor = internal_node_cost (to_vertex.node, -1,
5103 to_partition.layout);
5104 from_partition.layout = old_layout;
5105 if (factor >= 0)
5106 {
5107 slpg_layout_cost cost = to_costs.out_cost;
5108 cost.add_serial_cost ({ to_vertex.weight * factor,
5109 m_optimize_size });
5110 cost.split (to_partition.in_degree);
5111 return cost;
5112 }
5113 }
5114
5115 /* Compute the cost if we insert any necessary layout change on edge UD. */
5116 auto edge_cost = edge_layout_cost (ud, to_node_i,
5117 to_partition.layout, from_layout_i);
5118 if (edge_cost.is_possible ())
5119 {
5120 slpg_layout_cost cost = to_costs.out_cost;
5121 cost.add_serial_cost (to_costs.internal_cost);
5122 cost.split (to_partition.in_degree);
5123 cost.add_serial_cost (edge_cost);
5124 return cost;
5125 }
5126
5127 return slpg_layout_cost::impossible ();
5128 }
5129
5130 /* Make a forward pass through the partitions, accumulating input costs.
5131 Make a tentative (provisional) choice of layout for each partition,
5132 ensuring that this choice still allows later partitions to keep
5133 their original layout. */
5134
5135 void
5136 vect_optimize_slp_pass::forward_pass ()
5137 {
5138 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5139 ++partition_i)
5140 {
5141 auto &partition = m_partitions[partition_i];
5142
5143 /* If the partition consists of a single VEC_PERM_EXPR, precompute
5144 the incoming cost that would apply if every predecessor partition
5145 keeps its current layout. This is used within the loop below. */
5146 slpg_layout_cost in_cost;
5147 slp_tree single_node = nullptr;
5148 if (partition.node_end == partition.node_begin + 1)
5149 {
5150 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5151 single_node = m_vertices[node_i].node;
5152 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5153 in_cost = total_in_cost (node_i);
5154 }
5155
5156 /* Go through the possible layouts. Decide which ones are valid
5157 for this partition and record which of the valid layouts has
5158 the lowest cost. */
5159 unsigned int min_layout_i = 0;
5160 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5161 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5162 {
5163 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5164 if (!layout_costs.is_possible ())
5165 continue;
5166
5167 /* If the recorded layout is already 0 then the layout cannot
5168 change. */
5169 if (partition.layout == 0 && layout_i != 0)
5170 {
5171 layout_costs.mark_impossible ();
5172 continue;
5173 }
5174
5175 bool is_possible = true;
5176 for (unsigned int order_i = partition.node_begin;
5177 order_i < partition.node_end; ++order_i)
5178 {
5179 unsigned int node_i = m_partitioned_nodes[order_i];
5180 auto &vertex = m_vertices[node_i];
5181
5182 /* Reject the layout if it is individually incompatible
5183 with any node in the partition. */
5184 if (!is_compatible_layout (vertex.node, layout_i))
5185 {
5186 is_possible = false;
5187 break;
5188 }
5189
5190 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5191 {
5192 auto &other_vertex = m_vertices[other_node_i];
5193 if (other_vertex.partition < vertex.partition)
5194 {
5195 /* Accumulate the incoming costs from earlier
5196 partitions, plus the cost of any layout changes
5197 on UD itself. */
5198 auto cost = forward_cost (ud, other_node_i, layout_i);
5199 if (!cost.is_possible ())
5200 is_possible = false;
5201 else
5202 layout_costs.in_cost.add_parallel_cost (cost);
5203 }
5204 else
5205 /* Reject the layout if it would make layout 0 impossible
5206 for later partitions. This amounts to testing that the
5207 target supports reversing the layout change on edges
5208 to later partitions.
5209
5210 In principle, it might be possible to push a layout
5211 change all the way down a graph, so that it never
5212 needs to be reversed and so that the target doesn't
5213 need to support the reverse operation. But it would
5214 be awkward to bail out if we hit a partition that
5215 does not support the new layout, especially since
5216 we are not dealing with a lattice. */
5217 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5218 layout_i).is_possible ();
5219 };
5220 for_each_partition_edge (node_i, add_cost);
5221
5222 /* Accumulate the cost of using LAYOUT_I within NODE,
5223 both for the inputs and the outputs. */
5224 int factor = internal_node_cost (vertex.node, layout_i,
5225 layout_i);
5226 if (factor < 0)
5227 {
5228 is_possible = false;
5229 break;
5230 }
5231 else if (factor)
5232 layout_costs.internal_cost.add_serial_cost
5233 ({ vertex.weight * factor, m_optimize_size });
5234 }
5235 if (!is_possible)
5236 {
5237 layout_costs.mark_impossible ();
5238 continue;
5239 }
5240
5241 /* Combine the incoming and partition-internal costs. */
5242 slpg_layout_cost combined_cost = layout_costs.in_cost;
5243 combined_cost.add_serial_cost (layout_costs.internal_cost);
5244
5245 /* If this partition consists of a single VEC_PERM_EXPR, see
5246 if the VEC_PERM_EXPR can be changed to support output layout
5247 LAYOUT_I while keeping all the provisional choices of input
5248 layout. */
5249 if (single_node
5250 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5251 {
5252 int factor = internal_node_cost (single_node, -1, layout_i);
5253 if (factor >= 0)
5254 {
5255 auto weight = m_vertices[single_node->vertex].weight;
5256 slpg_layout_cost internal_cost
5257 = { weight * factor, m_optimize_size };
5258
5259 slpg_layout_cost alt_cost = in_cost;
5260 alt_cost.add_serial_cost (internal_cost);
5261 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5262 {
5263 combined_cost = alt_cost;
5264 layout_costs.in_cost = in_cost;
5265 layout_costs.internal_cost = internal_cost;
5266 }
5267 }
5268 }
5269
5270 /* Record the layout with the lowest cost. Prefer layout 0 in
5271 the event of a tie between it and another layout. */
5272 if (!min_layout_cost.is_possible ()
5273 || combined_cost.is_better_than (min_layout_cost,
5274 m_optimize_size))
5275 {
5276 min_layout_i = layout_i;
5277 min_layout_cost = combined_cost;
5278 }
5279 }
5280
5281 /* This loop's handling of earlier partitions should ensure that
5282 choosing the original layout for the current partition is no
5283 less valid than it was in the original graph, even with the
5284 provisional layout choices for those earlier partitions. */
5285 gcc_assert (min_layout_cost.is_possible ());
5286 partition.layout = min_layout_i;
5287 }
5288 }
5289
5290 /* Make a backward pass through the partitions, accumulating output costs.
5291 Make a final choice of layout for each partition. */
5292
5293 void
5294 vect_optimize_slp_pass::backward_pass ()
5295 {
5296 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5297 {
5298 auto &partition = m_partitions[partition_i];
5299
5300 unsigned int min_layout_i = 0;
5301 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5302 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5303 {
5304 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5305 if (!layout_costs.is_possible ())
5306 continue;
5307
5308 /* Accumulate the costs from successor partitions. */
5309 bool is_possible = true;
5310 for (unsigned int order_i = partition.node_begin;
5311 order_i < partition.node_end; ++order_i)
5312 {
5313 unsigned int node_i = m_partitioned_nodes[order_i];
5314 auto &vertex = m_vertices[node_i];
5315 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5316 {
5317 auto &other_vertex = m_vertices[other_node_i];
5318 auto &other_partition = m_partitions[other_vertex.partition];
5319 if (other_vertex.partition > vertex.partition)
5320 {
5321 /* Accumulate the incoming costs from later
5322 partitions, plus the cost of any layout changes
5323 on UD itself. */
5324 auto cost = backward_cost (ud, other_node_i, layout_i);
5325 if (!cost.is_possible ())
5326 is_possible = false;
5327 else
5328 layout_costs.out_cost.add_parallel_cost (cost);
5329 }
5330 else
5331 /* Make sure that earlier partitions can (if necessary
5332 or beneficial) keep the layout that they chose in
5333 the forward pass. This ensures that there is at
5334 least one valid choice of layout. */
5335 is_possible &= edge_layout_cost (ud, other_node_i,
5336 other_partition.layout,
5337 layout_i).is_possible ();
5338 };
5339 for_each_partition_edge (node_i, add_cost);
5340 }
5341 if (!is_possible)
5342 {
5343 layout_costs.mark_impossible ();
5344 continue;
5345 }
5346
5347 /* Locally combine the costs from the forward and backward passes.
5348 (This combined cost is not passed on, since that would lead
5349 to double counting.) */
5350 slpg_layout_cost combined_cost = layout_costs.in_cost;
5351 combined_cost.add_serial_cost (layout_costs.internal_cost);
5352 combined_cost.add_serial_cost (layout_costs.out_cost);
5353
5354 /* Record the layout with the lowest cost. Prefer layout 0 in
5355 the event of a tie between it and another layout. */
5356 if (!min_layout_cost.is_possible ()
5357 || combined_cost.is_better_than (min_layout_cost,
5358 m_optimize_size))
5359 {
5360 min_layout_i = layout_i;
5361 min_layout_cost = combined_cost;
5362 }
5363 }
5364
5365 gcc_assert (min_layout_cost.is_possible ());
5366 partition.layout = min_layout_i;
5367 }
5368 }
5369
5370 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5371 NODE already has the layout that was selected for its partition. */
5372
5373 slp_tree
5374 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5375 unsigned int to_layout_i)
5376 {
5377 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5378 slp_tree result = m_node_layouts[result_i];
5379 if (result)
5380 return result;
5381
5382 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5383 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5384 /* We can't permute vector defs in place. */
5385 && SLP_TREE_VEC_DEFS (node).is_empty ()))
5386 {
5387 /* If the vector is uniform or unchanged, there's nothing to do. */
5388 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5389 result = node;
5390 else
5391 {
5392 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5393 result = vect_create_new_slp_node (scalar_ops);
5394 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5395 }
5396 }
5397 else
5398 {
5399 unsigned int partition_i = m_vertices[node->vertex].partition;
5400 unsigned int from_layout_i = m_partitions[partition_i].layout;
5401 if (from_layout_i == to_layout_i)
5402 return node;
5403
5404 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5405 permutation instead of a serial one. Leave the new permutation
5406 in TMP_PERM on success. */
5407 auto_lane_permutation_t tmp_perm;
5408 unsigned int num_inputs = 1;
5409 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5410 {
5411 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5412 if (from_layout_i != 0)
5413 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5414 if (to_layout_i != 0)
5415 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5416 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5417 tmp_perm,
5418 SLP_TREE_CHILDREN (node),
5419 false) >= 0)
5420 num_inputs = SLP_TREE_CHILDREN (node).length ();
5421 else
5422 tmp_perm.truncate (0);
5423 }
5424
5425 if (dump_enabled_p ())
5426 {
5427 if (tmp_perm.length () > 0)
5428 dump_printf_loc (MSG_NOTE, vect_location,
5429 "duplicating permutation node %p with"
5430 " layout %d\n",
5431 (void *) node, to_layout_i);
5432 else
5433 dump_printf_loc (MSG_NOTE, vect_location,
5434 "inserting permutation node in place of %p\n",
5435 (void *) node);
5436 }
5437
5438 unsigned int num_lanes = SLP_TREE_LANES (node);
5439 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5440 if (SLP_TREE_SCALAR_STMTS (node).length ())
5441 {
5442 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5443 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5444 if (from_layout_i != 0)
5445 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5446 if (to_layout_i != 0)
5447 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5448 }
5449 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5450 SLP_TREE_LANES (result) = num_lanes;
5451 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5452 result->vertex = -1;
5453
5454 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5455 if (tmp_perm.length ())
5456 {
5457 lane_perm.safe_splice (tmp_perm);
5458 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5459 }
5460 else
5461 {
5462 lane_perm.create (num_lanes);
5463 for (unsigned j = 0; j < num_lanes; ++j)
5464 lane_perm.quick_push ({ 0, j });
5465 if (from_layout_i != 0)
5466 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5467 if (to_layout_i != 0)
5468 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5469 SLP_TREE_CHILDREN (result).safe_push (node);
5470 }
5471 for (slp_tree child : SLP_TREE_CHILDREN (result))
5472 child->refcnt++;
5473 }
5474 m_node_layouts[result_i] = result;
5475 return result;
5476 }
5477
5478 /* Apply the chosen vector layouts to the SLP graph. */
5479
5480 void
5481 vect_optimize_slp_pass::materialize ()
5482 {
5483 /* We no longer need the costs, so avoid having two O(N * P) arrays
5484 live at the same time. */
5485 m_partition_layout_costs.release ();
5486 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5487
5488 auto_sbitmap fully_folded (m_vertices.length ());
5489 bitmap_clear (fully_folded);
5490 for (unsigned int node_i : m_partitioned_nodes)
5491 {
5492 auto &vertex = m_vertices[node_i];
5493 slp_tree node = vertex.node;
5494 int layout_i = m_partitions[vertex.partition].layout;
5495 gcc_assert (layout_i >= 0);
5496
5497 /* Rearrange the scalar statements to match the chosen layout. */
5498 if (layout_i > 0)
5499 vect_slp_permute (m_perms[layout_i],
5500 SLP_TREE_SCALAR_STMTS (node), true);
5501
5502 /* Update load and lane permutations. */
5503 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5504 {
5505 /* First try to absorb the input vector layouts. If that fails,
5506 force the inputs to have layout LAYOUT_I too. We checked that
5507 that was possible before deciding to use nonzero output layouts.
5508 (Note that at this stage we don't really have any guarantee that
5509 the target supports the original VEC_PERM_EXPR.) */
5510 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5511 auto_lane_permutation_t tmp_perm;
5512 tmp_perm.safe_splice (perm);
5513 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5514 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5515 tmp_perm,
5516 SLP_TREE_CHILDREN (node),
5517 false) >= 0)
5518 {
5519 if (dump_enabled_p ()
5520 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5521 perm.begin ()))
5522 dump_printf_loc (MSG_NOTE, vect_location,
5523 "absorbing input layouts into %p\n",
5524 (void *) node);
5525 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5526 bitmap_set_bit (fully_folded, node_i);
5527 }
5528 else
5529 {
5530 /* Not MSG_MISSED because it would make no sense to users. */
5531 if (dump_enabled_p ())
5532 dump_printf_loc (MSG_NOTE, vect_location,
5533 "failed to absorb input layouts into %p\n",
5534 (void *) node);
5535 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5536 }
5537 }
5538 else
5539 {
5540 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5541 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5542 if (layout_i > 0)
5543 /* ??? When we handle non-bijective permutes the idea
5544 is that we can force the load-permutation to be
5545 { min, min + 1, min + 2, ... max }. But then the
5546 scalar defs might no longer match the lane content
5547 which means wrong-code with live lane vectorization.
5548 So we possibly have to have NULL entries for those. */
5549 vect_slp_permute (m_perms[layout_i], load_perm, true);
5550 }
5551 }
5552
5553 /* Do this before any nodes disappear, since it involves a walk
5554 over the leaves. */
5555 remove_redundant_permutations ();
5556
5557 /* Replace each child with a correctly laid-out version. */
5558 for (unsigned int node_i : m_partitioned_nodes)
5559 {
5560 /* Skip nodes that have already been handled above. */
5561 if (bitmap_bit_p (fully_folded, node_i))
5562 continue;
5563
5564 auto &vertex = m_vertices[node_i];
5565 int in_layout_i = m_partitions[vertex.partition].layout;
5566 gcc_assert (in_layout_i >= 0);
5567
5568 unsigned j;
5569 slp_tree child;
5570 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5571 {
5572 if (!child)
5573 continue;
5574
5575 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5576 if (new_child != child)
5577 {
5578 vect_free_slp_tree (child);
5579 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5580 new_child->refcnt += 1;
5581 }
5582 }
5583 }
5584 }
5585
5586 /* Elide load permutations that are not necessary. Such permutations might
5587 be pre-existing, rather than created by the layout optimizations. */
5588
5589 void
5590 vect_optimize_slp_pass::remove_redundant_permutations ()
5591 {
5592 for (unsigned int node_i : m_leafs)
5593 {
5594 slp_tree node = m_vertices[node_i].node;
5595 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5596 continue;
5597
5598 /* In basic block vectorization we allow any subchain of an interleaving
5599 chain.
5600 FORNOW: not in loop SLP because of realignment complications. */
5601 if (is_a <bb_vec_info> (m_vinfo))
5602 {
5603 bool subchain_p = true;
5604 stmt_vec_info next_load_info = NULL;
5605 stmt_vec_info load_info;
5606 unsigned j;
5607 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5608 {
5609 if (j != 0
5610 && (next_load_info != load_info
5611 || DR_GROUP_GAP (load_info) != 1))
5612 {
5613 subchain_p = false;
5614 break;
5615 }
5616 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5617 }
5618 if (subchain_p)
5619 {
5620 SLP_TREE_LOAD_PERMUTATION (node).release ();
5621 continue;
5622 }
5623 }
5624 else
5625 {
5626 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5627 stmt_vec_info load_info;
5628 bool this_load_permuted = false;
5629 unsigned j;
5630 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5631 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5632 {
5633 this_load_permuted = true;
5634 break;
5635 }
5636 /* When this isn't a grouped access we know it's single element
5637 and contiguous. */
5638 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5639 {
5640 if (!this_load_permuted
5641 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5642 || SLP_TREE_LANES (node) == 1))
5643 SLP_TREE_LOAD_PERMUTATION (node).release ();
5644 continue;
5645 }
5646 stmt_vec_info first_stmt_info
5647 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5648 if (!this_load_permuted
5649 /* The load requires permutation when unrolling exposes
5650 a gap either because the group is larger than the SLP
5651 group-size or because there is a gap between the groups. */
5652 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5653 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5654 && DR_GROUP_GAP (first_stmt_info) == 0)))
5655 {
5656 SLP_TREE_LOAD_PERMUTATION (node).release ();
5657 continue;
5658 }
5659 }
5660 }
5661 }
5662
5663 /* Print the partition graph and layout information to the dump file. */
5664
5665 void
5666 vect_optimize_slp_pass::dump ()
5667 {
5668 dump_printf_loc (MSG_NOTE, vect_location,
5669 "SLP optimize permutations:\n");
5670 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5671 {
5672 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5673 const char *sep = "";
5674 for (unsigned int idx : m_perms[layout_i])
5675 {
5676 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5677 sep = ", ";
5678 }
5679 dump_printf (MSG_NOTE, " }\n");
5680 }
5681 dump_printf_loc (MSG_NOTE, vect_location,
5682 "SLP optimize partitions:\n");
5683 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5684 ++partition_i)
5685 {
5686 auto &partition = m_partitions[partition_i];
5687 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5688 dump_printf_loc (MSG_NOTE, vect_location,
5689 " partition %d (layout %d):\n",
5690 partition_i, partition.layout);
5691 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5692 for (unsigned int order_i = partition.node_begin;
5693 order_i < partition.node_end; ++order_i)
5694 {
5695 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5696 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5697 (void *) vertex.node);
5698 dump_printf_loc (MSG_NOTE, vect_location,
5699 " weight: %f\n",
5700 vertex.weight.to_double ());
5701 if (vertex.out_degree)
5702 dump_printf_loc (MSG_NOTE, vect_location,
5703 " out weight: %f (degree %d)\n",
5704 vertex.out_weight.to_double (),
5705 vertex.out_degree);
5706 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5707 dump_printf_loc (MSG_NOTE, vect_location,
5708 " op: VEC_PERM_EXPR\n");
5709 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5710 dump_printf_loc (MSG_NOTE, vect_location,
5711 " op template: %G", rep->stmt);
5712 }
5713 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5714 for (unsigned int order_i = partition.node_begin;
5715 order_i < partition.node_end; ++order_i)
5716 {
5717 unsigned int node_i = m_partitioned_nodes[order_i];
5718 auto &vertex = m_vertices[node_i];
5719 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5720 {
5721 auto &other_vertex = m_vertices[other_node_i];
5722 if (other_vertex.partition < vertex.partition)
5723 dump_printf_loc (MSG_NOTE, vect_location,
5724 " - %p [%d] --> %p\n",
5725 (void *) other_vertex.node,
5726 other_vertex.partition,
5727 (void *) vertex.node);
5728 else
5729 dump_printf_loc (MSG_NOTE, vect_location,
5730 " - %p --> [%d] %p\n",
5731 (void *) vertex.node,
5732 other_vertex.partition,
5733 (void *) other_vertex.node);
5734 };
5735 for_each_partition_edge (node_i, print_edge);
5736 }
5737
5738 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5739 {
5740 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5741 if (layout_costs.is_possible ())
5742 {
5743 dump_printf_loc (MSG_NOTE, vect_location,
5744 " layout %d:%s\n", layout_i,
5745 partition.layout == int (layout_i)
5746 ? " (*)" : "");
5747 slpg_layout_cost combined_cost = layout_costs.in_cost;
5748 combined_cost.add_serial_cost (layout_costs.internal_cost);
5749 combined_cost.add_serial_cost (layout_costs.out_cost);
5750 #define TEMPLATE "{depth: %f, total: %f}"
5751 dump_printf_loc (MSG_NOTE, vect_location,
5752 " " TEMPLATE "\n",
5753 layout_costs.in_cost.depth.to_double (),
5754 layout_costs.in_cost.total.to_double ());
5755 dump_printf_loc (MSG_NOTE, vect_location,
5756 " + " TEMPLATE "\n",
5757 layout_costs.internal_cost.depth.to_double (),
5758 layout_costs.internal_cost.total.to_double ());
5759 dump_printf_loc (MSG_NOTE, vect_location,
5760 " + " TEMPLATE "\n",
5761 layout_costs.out_cost.depth.to_double (),
5762 layout_costs.out_cost.total.to_double ());
5763 dump_printf_loc (MSG_NOTE, vect_location,
5764 " = " TEMPLATE "\n",
5765 combined_cost.depth.to_double (),
5766 combined_cost.total.to_double ());
5767 #undef TEMPLATE
5768 }
5769 else
5770 dump_printf_loc (MSG_NOTE, vect_location,
5771 " layout %d: rejected\n", layout_i);
5772 }
5773 }
5774 }
5775
5776 /* Main entry point for the SLP graph optimization pass. */
5777
5778 void
5779 vect_optimize_slp_pass::run ()
5780 {
5781 build_graph ();
5782 create_partitions ();
5783 start_choosing_layouts ();
5784 if (m_perms.length () > 1)
5785 {
5786 forward_pass ();
5787 backward_pass ();
5788 if (dump_enabled_p ())
5789 dump ();
5790 materialize ();
5791 while (!m_perms.is_empty ())
5792 m_perms.pop ().release ();
5793 }
5794 else
5795 remove_redundant_permutations ();
5796 free_graph (m_slpg);
5797 }
5798
5799 /* Optimize the SLP graph of VINFO. */
5800
5801 void
5802 vect_optimize_slp (vec_info *vinfo)
5803 {
5804 if (vinfo->slp_instances.is_empty ())
5805 return;
5806 vect_optimize_slp_pass (vinfo).run ();
5807 }
5808
5809 /* Gather loads reachable from the individual SLP graph entries. */
5810
5811 void
5812 vect_gather_slp_loads (vec_info *vinfo)
5813 {
5814 unsigned i;
5815 slp_instance instance;
5816 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5817 {
5818 hash_set<slp_tree> visited;
5819 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5820 SLP_INSTANCE_TREE (instance), visited);
5821 }
5822 }
5823
5824
5825 /* For each possible SLP instance decide whether to SLP it and calculate overall
5826 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
5827 least one instance. */
5828
5829 bool
5830 vect_make_slp_decision (loop_vec_info loop_vinfo)
5831 {
5832 unsigned int i;
5833 poly_uint64 unrolling_factor = 1;
5834 const vec<slp_instance> &slp_instances
5835 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5836 slp_instance instance;
5837 int decided_to_slp = 0;
5838
5839 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5840
5841 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5842 {
5843 /* FORNOW: SLP if you can. */
5844 /* All unroll factors have the form:
5845
5846 GET_MODE_SIZE (vinfo->vector_mode) * X
5847
5848 for some rational X, so they must have a common multiple. */
5849 unrolling_factor
5850 = force_common_multiple (unrolling_factor,
5851 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5852
5853 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5854 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5855 loop-based vectorization. Such stmts will be marked as HYBRID. */
5856 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5857 decided_to_slp++;
5858 }
5859
5860 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5861
5862 if (decided_to_slp && dump_enabled_p ())
5863 {
5864 dump_printf_loc (MSG_NOTE, vect_location,
5865 "Decided to SLP %d instances. Unrolling factor ",
5866 decided_to_slp);
5867 dump_dec (MSG_NOTE, unrolling_factor);
5868 dump_printf (MSG_NOTE, "\n");
5869 }
5870
5871 return (decided_to_slp > 0);
5872 }
5873
5874 /* Private data for vect_detect_hybrid_slp. */
5875 struct vdhs_data
5876 {
5877 loop_vec_info loop_vinfo;
5878 vec<stmt_vec_info> *worklist;
5879 };
5880
5881 /* Walker for walk_gimple_op. */
5882
5883 static tree
5884 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5885 {
5886 walk_stmt_info *wi = (walk_stmt_info *)data;
5887 vdhs_data *dat = (vdhs_data *)wi->info;
5888
5889 if (wi->is_lhs)
5890 return NULL_TREE;
5891
5892 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5893 if (!def_stmt_info)
5894 return NULL_TREE;
5895 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5896 if (PURE_SLP_STMT (def_stmt_info))
5897 {
5898 if (dump_enabled_p ())
5899 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5900 def_stmt_info->stmt);
5901 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5902 dat->worklist->safe_push (def_stmt_info);
5903 }
5904
5905 return NULL_TREE;
5906 }
5907
5908 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
5909 if so, otherwise pushing it to WORKLIST. */
5910
5911 static void
5912 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5913 vec<stmt_vec_info> &worklist,
5914 stmt_vec_info stmt_info)
5915 {
5916 if (dump_enabled_p ())
5917 dump_printf_loc (MSG_NOTE, vect_location,
5918 "Processing hybrid candidate : %G", stmt_info->stmt);
5919 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5920 imm_use_iterator iter2;
5921 ssa_op_iter iter1;
5922 use_operand_p use_p;
5923 def_operand_p def_p;
5924 bool any_def = false;
5925 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5926 {
5927 any_def = true;
5928 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5929 {
5930 if (is_gimple_debug (USE_STMT (use_p)))
5931 continue;
5932 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5933 /* An out-of loop use means this is a loop_vect sink. */
5934 if (!use_info)
5935 {
5936 if (dump_enabled_p ())
5937 dump_printf_loc (MSG_NOTE, vect_location,
5938 "Found loop_vect sink: %G", stmt_info->stmt);
5939 worklist.safe_push (stmt_info);
5940 return;
5941 }
5942 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5943 {
5944 if (dump_enabled_p ())
5945 dump_printf_loc (MSG_NOTE, vect_location,
5946 "Found loop_vect use: %G", use_info->stmt);
5947 worklist.safe_push (stmt_info);
5948 return;
5949 }
5950 }
5951 }
5952 /* No def means this is a loo_vect sink. */
5953 if (!any_def)
5954 {
5955 if (dump_enabled_p ())
5956 dump_printf_loc (MSG_NOTE, vect_location,
5957 "Found loop_vect sink: %G", stmt_info->stmt);
5958 worklist.safe_push (stmt_info);
5959 return;
5960 }
5961 if (dump_enabled_p ())
5962 dump_printf_loc (MSG_NOTE, vect_location,
5963 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5964 STMT_SLP_TYPE (stmt_info) = pure_slp;
5965 }
5966
5967 /* Find stmts that must be both vectorized and SLPed. */
5968
5969 void
5970 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5971 {
5972 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5973
5974 /* All stmts participating in SLP are marked pure_slp, all other
5975 stmts are loop_vect.
5976 First collect all loop_vect stmts into a worklist.
5977 SLP patterns cause not all original scalar stmts to appear in
5978 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5979 Rectify this here and do a backward walk over the IL only considering
5980 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
5981 mark them as pure_slp. */
5982 auto_vec<stmt_vec_info> worklist;
5983 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5984 {
5985 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5986 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5987 gsi_next (&gsi))
5988 {
5989 gphi *phi = gsi.phi ();
5990 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5991 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5992 maybe_push_to_hybrid_worklist (loop_vinfo,
5993 worklist, stmt_info);
5994 }
5995 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5996 gsi_prev (&gsi))
5997 {
5998 gimple *stmt = gsi_stmt (gsi);
5999 if (is_gimple_debug (stmt))
6000 continue;
6001 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
6002 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
6003 {
6004 for (gimple_stmt_iterator gsi2
6005 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
6006 !gsi_end_p (gsi2); gsi_next (&gsi2))
6007 {
6008 stmt_vec_info patt_info
6009 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
6010 if (!STMT_SLP_TYPE (patt_info)
6011 && STMT_VINFO_RELEVANT (patt_info))
6012 maybe_push_to_hybrid_worklist (loop_vinfo,
6013 worklist, patt_info);
6014 }
6015 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6016 }
6017 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
6018 maybe_push_to_hybrid_worklist (loop_vinfo,
6019 worklist, stmt_info);
6020 }
6021 }
6022
6023 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
6024 mark any SLP vectorized stmt as hybrid.
6025 ??? We're visiting def stmts N times (once for each non-SLP and
6026 once for each hybrid-SLP use). */
6027 walk_stmt_info wi;
6028 vdhs_data dat;
6029 dat.worklist = &worklist;
6030 dat.loop_vinfo = loop_vinfo;
6031 memset (&wi, 0, sizeof (wi));
6032 wi.info = (void *)&dat;
6033 while (!worklist.is_empty ())
6034 {
6035 stmt_vec_info stmt_info = worklist.pop ();
6036 /* Since SSA operands are not set up for pattern stmts we need
6037 to use walk_gimple_op. */
6038 wi.is_lhs = 0;
6039 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
6040 /* For gather/scatter make sure to walk the offset operand, that
6041 can be a scaling and conversion away. */
6042 gather_scatter_info gs_info;
6043 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
6044 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
6045 {
6046 int dummy;
6047 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
6048 }
6049 }
6050 }
6051
6052
6053 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
6054
6055 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
6056 : vec_info (vec_info::bb, shared),
6057 bbs (_bbs),
6058 roots (vNULL)
6059 {
6060 for (unsigned i = 0; i < bbs.length (); ++i)
6061 {
6062 if (i != 0)
6063 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6064 gsi_next (&si))
6065 {
6066 gphi *phi = si.phi ();
6067 gimple_set_uid (phi, 0);
6068 add_stmt (phi);
6069 }
6070 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6071 !gsi_end_p (gsi); gsi_next (&gsi))
6072 {
6073 gimple *stmt = gsi_stmt (gsi);
6074 gimple_set_uid (stmt, 0);
6075 if (is_gimple_debug (stmt))
6076 continue;
6077 add_stmt (stmt);
6078 }
6079 }
6080 }
6081
6082
6083 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
6084 stmts in the basic block. */
6085
6086 _bb_vec_info::~_bb_vec_info ()
6087 {
6088 /* Reset region marker. */
6089 for (unsigned i = 0; i < bbs.length (); ++i)
6090 {
6091 if (i != 0)
6092 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6093 gsi_next (&si))
6094 {
6095 gphi *phi = si.phi ();
6096 gimple_set_uid (phi, -1);
6097 }
6098 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6099 !gsi_end_p (gsi); gsi_next (&gsi))
6100 {
6101 gimple *stmt = gsi_stmt (gsi);
6102 gimple_set_uid (stmt, -1);
6103 }
6104 }
6105
6106 for (unsigned i = 0; i < roots.length (); ++i)
6107 {
6108 roots[i].stmts.release ();
6109 roots[i].roots.release ();
6110 roots[i].remain.release ();
6111 }
6112 roots.release ();
6113 }
6114
6115 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
6116 given then that child nodes have already been processed, and that
6117 their def types currently match their SLP node's def type. */
6118
6119 static bool
6120 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
6121 slp_instance node_instance,
6122 stmt_vector_for_cost *cost_vec)
6123 {
6124 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6125
6126 /* Calculate the number of vector statements to be created for the
6127 scalar stmts in this node. For SLP reductions it is equal to the
6128 number of vector statements in the children (which has already been
6129 calculated by the recursive call). Otherwise it is the number of
6130 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
6131 VF divided by the number of elements in a vector. */
6132 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6133 && !STMT_VINFO_DATA_REF (stmt_info)
6134 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6135 {
6136 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6137 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6138 {
6139 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6140 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6141 break;
6142 }
6143 }
6144 else
6145 {
6146 poly_uint64 vf;
6147 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6148 vf = loop_vinfo->vectorization_factor;
6149 else
6150 vf = 1;
6151 unsigned int group_size = SLP_TREE_LANES (node);
6152 tree vectype = SLP_TREE_VECTYPE (node);
6153 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6154 = vect_get_num_vectors (vf * group_size, vectype);
6155 }
6156
6157 /* Handle purely internal nodes. */
6158 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6159 {
6160 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6161 return false;
6162
6163 stmt_vec_info slp_stmt_info;
6164 unsigned int i;
6165 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6166 {
6167 if (STMT_VINFO_LIVE_P (slp_stmt_info)
6168 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6169 node_instance, i,
6170 false, cost_vec))
6171 return false;
6172 }
6173 return true;
6174 }
6175
6176 bool dummy;
6177 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6178 node, node_instance, cost_vec);
6179 }
6180
6181 /* Try to build NODE from scalars, returning true on success.
6182 NODE_INSTANCE is the SLP instance that contains NODE. */
6183
6184 static bool
6185 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6186 slp_instance node_instance)
6187 {
6188 stmt_vec_info stmt_info;
6189 unsigned int i;
6190
6191 if (!is_a <bb_vec_info> (vinfo)
6192 || node == SLP_INSTANCE_TREE (node_instance)
6193 || !SLP_TREE_SCALAR_STMTS (node).exists ()
6194 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6195 /* Force the mask use to be built from scalars instead. */
6196 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6197 return false;
6198
6199 if (dump_enabled_p ())
6200 dump_printf_loc (MSG_NOTE, vect_location,
6201 "Building vector operands of %p from scalars instead\n",
6202 (void *) node);
6203
6204 /* Don't remove and free the child nodes here, since they could be
6205 referenced by other structures. The analysis and scheduling phases
6206 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6207 unsigned int group_size = SLP_TREE_LANES (node);
6208 SLP_TREE_DEF_TYPE (node) = vect_external_def;
6209 /* Invariants get their vector type from the uses. */
6210 SLP_TREE_VECTYPE (node) = NULL_TREE;
6211 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6212 SLP_TREE_LOAD_PERMUTATION (node).release ();
6213 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6214 {
6215 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6216 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6217 }
6218 return true;
6219 }
6220
6221 /* Return true if all elements of the slice are the same. */
6222 bool
6223 vect_scalar_ops_slice::all_same_p () const
6224 {
6225 for (unsigned int i = 1; i < length; ++i)
6226 if (!operand_equal_p (op (0), op (i)))
6227 return false;
6228 return true;
6229 }
6230
6231 hashval_t
6232 vect_scalar_ops_slice_hash::hash (const value_type &s)
6233 {
6234 hashval_t hash = 0;
6235 for (unsigned i = 0; i < s.length; ++i)
6236 hash = iterative_hash_expr (s.op (i), hash);
6237 return hash;
6238 }
6239
6240 bool
6241 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6242 const compare_type &s2)
6243 {
6244 if (s1.length != s2.length)
6245 return false;
6246 for (unsigned i = 0; i < s1.length; ++i)
6247 if (!operand_equal_p (s1.op (i), s2.op (i)))
6248 return false;
6249 return true;
6250 }
6251
6252 /* Compute the prologue cost for invariant or constant operands represented
6253 by NODE. */
6254
6255 static void
6256 vect_prologue_cost_for_slp (slp_tree node,
6257 stmt_vector_for_cost *cost_vec)
6258 {
6259 /* There's a special case of an existing vector, that costs nothing. */
6260 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6261 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6262 return;
6263 /* Without looking at the actual initializer a vector of
6264 constants can be implemented as load from the constant pool.
6265 When all elements are the same we can use a splat. */
6266 tree vectype = SLP_TREE_VECTYPE (node);
6267 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6268 unsigned HOST_WIDE_INT const_nunits;
6269 unsigned nelt_limit;
6270 auto ops = &SLP_TREE_SCALAR_OPS (node);
6271 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6272 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6273 && ! multiple_p (const_nunits, group_size))
6274 {
6275 nelt_limit = const_nunits;
6276 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6277 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6278 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6279 starts.quick_push (i * const_nunits);
6280 }
6281 else
6282 {
6283 /* If either the vector has variable length or the vectors
6284 are composed of repeated whole groups we only need to
6285 cost construction once. All vectors will be the same. */
6286 nelt_limit = group_size;
6287 starts.quick_push (0);
6288 }
6289 /* ??? We're just tracking whether vectors in a single node are the same.
6290 Ideally we'd do something more global. */
6291 bool passed = false;
6292 for (unsigned int start : starts)
6293 {
6294 vect_cost_for_stmt kind;
6295 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6296 kind = vector_load;
6297 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6298 kind = scalar_to_vec;
6299 else
6300 kind = vec_construct;
6301 /* The target cost hook has no idea which part of the SLP node
6302 we are costing so avoid passing it down more than once. Pass
6303 it to the first vec_construct or scalar_to_vec part since for those
6304 the x86 backend tries to account for GPR to XMM register moves. */
6305 record_stmt_cost (cost_vec, 1, kind,
6306 (kind != vector_load && !passed) ? node : nullptr,
6307 vectype, 0, vect_prologue);
6308 if (kind != vector_load)
6309 passed = true;
6310 }
6311 }
6312
6313 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6314 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6315
6316 Return true if the operations are supported. */
6317
6318 static bool
6319 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6320 slp_instance node_instance,
6321 hash_set<slp_tree> &visited_set,
6322 vec<slp_tree> &visited_vec,
6323 stmt_vector_for_cost *cost_vec)
6324 {
6325 int i, j;
6326 slp_tree child;
6327
6328 /* Assume we can code-generate all invariants. */
6329 if (!node
6330 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6331 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6332 return true;
6333
6334 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6335 {
6336 if (dump_enabled_p ())
6337 dump_printf_loc (MSG_NOTE, vect_location,
6338 "Failed cyclic SLP reference in %p\n", (void *) node);
6339 return false;
6340 }
6341 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6342
6343 /* If we already analyzed the exact same set of scalar stmts we're done.
6344 We share the generated vector stmts for those. */
6345 if (visited_set.add (node))
6346 return true;
6347 visited_vec.safe_push (node);
6348
6349 bool res = true;
6350 unsigned visited_rec_start = visited_vec.length ();
6351 unsigned cost_vec_rec_start = cost_vec->length ();
6352 bool seen_non_constant_child = false;
6353 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6354 {
6355 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6356 visited_set, visited_vec,
6357 cost_vec);
6358 if (!res)
6359 break;
6360 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6361 seen_non_constant_child = true;
6362 }
6363 /* We're having difficulties scheduling nodes with just constant
6364 operands and no scalar stmts since we then cannot compute a stmt
6365 insertion place. */
6366 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6367 {
6368 if (dump_enabled_p ())
6369 dump_printf_loc (MSG_NOTE, vect_location,
6370 "Cannot vectorize all-constant op node %p\n",
6371 (void *) node);
6372 res = false;
6373 }
6374
6375 if (res)
6376 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6377 cost_vec);
6378 /* If analysis failed we have to pop all recursive visited nodes
6379 plus ourselves. */
6380 if (!res)
6381 {
6382 while (visited_vec.length () >= visited_rec_start)
6383 visited_set.remove (visited_vec.pop ());
6384 cost_vec->truncate (cost_vec_rec_start);
6385 }
6386
6387 /* When the node can be vectorized cost invariant nodes it references.
6388 This is not done in DFS order to allow the refering node
6389 vectorizable_* calls to nail down the invariant nodes vector type
6390 and possibly unshare it if it needs a different vector type than
6391 other referrers. */
6392 if (res)
6393 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6394 if (child
6395 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6396 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6397 /* Perform usual caching, note code-generation still
6398 code-gens these nodes multiple times but we expect
6399 to CSE them later. */
6400 && !visited_set.add (child))
6401 {
6402 visited_vec.safe_push (child);
6403 /* ??? After auditing more code paths make a "default"
6404 and push the vector type from NODE to all children
6405 if it is not already set. */
6406 /* Compute the number of vectors to be generated. */
6407 tree vector_type = SLP_TREE_VECTYPE (child);
6408 if (!vector_type)
6409 {
6410 /* For shifts with a scalar argument we don't need
6411 to cost or code-generate anything.
6412 ??? Represent this more explicitely. */
6413 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6414 == shift_vec_info_type)
6415 && j == 1);
6416 continue;
6417 }
6418 unsigned group_size = SLP_TREE_LANES (child);
6419 poly_uint64 vf = 1;
6420 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6421 vf = loop_vinfo->vectorization_factor;
6422 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6423 = vect_get_num_vectors (vf * group_size, vector_type);
6424 /* And cost them. */
6425 vect_prologue_cost_for_slp (child, cost_vec);
6426 }
6427
6428 /* If this node or any of its children can't be vectorized, try pruning
6429 the tree here rather than felling the whole thing. */
6430 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6431 {
6432 /* We'll need to revisit this for invariant costing and number
6433 of vectorized stmt setting. */
6434 res = true;
6435 }
6436
6437 return res;
6438 }
6439
6440 /* Given a definition DEF, analyze if it will have any live scalar use after
6441 performing SLP vectorization whose information is represented by BB_VINFO,
6442 and record result into hash map SCALAR_USE_MAP as cache for later fast
6443 check. If recursion DEPTH exceeds a limit, stop analysis and make a
6444 conservative assumption. Return 0 if no scalar use, 1 if there is, -1
6445 means recursion is limited. */
6446
6447 static int
6448 vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
6449 hash_map<tree, int> &scalar_use_map,
6450 int depth = 0)
6451 {
6452 const int depth_limit = 2;
6453 imm_use_iterator use_iter;
6454 gimple *use_stmt;
6455
6456 if (int *res = scalar_use_map.get (def))
6457 return *res;
6458
6459 int scalar_use = 1;
6460
6461 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
6462 {
6463 if (is_gimple_debug (use_stmt))
6464 continue;
6465
6466 stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6467
6468 if (!use_stmt_info)
6469 break;
6470
6471 if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6472 continue;
6473
6474 /* Do not step forward when encounter PHI statement, since it may
6475 involve cyclic reference and cause infinite recursive invocation. */
6476 if (gimple_code (use_stmt) == GIMPLE_PHI)
6477 break;
6478
6479 /* When pattern recognition is involved, a statement whose definition is
6480 consumed in some pattern, may not be included in the final replacement
6481 pattern statements, so would be skipped when building SLP graph.
6482
6483 * Original
6484 char a_c = *(char *) a;
6485 char b_c = *(char *) b;
6486 unsigned short a_s = (unsigned short) a_c;
6487 int a_i = (int) a_s;
6488 int b_i = (int) b_c;
6489 int r_i = a_i - b_i;
6490
6491 * After pattern replacement
6492 a_s = (unsigned short) a_c;
6493 a_i = (int) a_s;
6494
6495 patt_b_s = (unsigned short) b_c; // b_i = (int) b_c
6496 patt_b_i = (int) patt_b_s; // b_i = (int) b_c
6497
6498 patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i
6499 patt_r_i = (int) patt_r_s; // r_i = a_i - b_i
6500
6501 The definitions of a_i(original statement) and b_i(pattern statement)
6502 are related to, but actually not part of widen_minus pattern.
6503 Vectorizing the pattern does not cause these definition statements to
6504 be marked as PURE_SLP. For this case, we need to recursively check
6505 whether their uses are all absorbed into vectorized code. But there
6506 is an exception that some use may participate in an vectorized
6507 operation via an external SLP node containing that use as an element.
6508 The parameter "scalar_use_map" tags such kind of SSA as having scalar
6509 use in advance. */
6510 tree lhs = gimple_get_lhs (use_stmt);
6511
6512 if (!lhs || TREE_CODE (lhs) != SSA_NAME)
6513 break;
6514
6515 if (depth_limit && depth >= depth_limit)
6516 return -1;
6517
6518 if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
6519 depth + 1)))
6520 break;
6521 }
6522
6523 if (end_imm_use_stmt_p (&use_iter))
6524 scalar_use = 0;
6525
6526 /* If recursion is limited, do not cache result for non-root defs. */
6527 if (!depth || scalar_use >= 0)
6528 {
6529 bool added = scalar_use_map.put (def, scalar_use);
6530 gcc_assert (!added);
6531 }
6532
6533 return scalar_use;
6534 }
6535
6536 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6537 region and that can be vectorized using vectorizable_live_operation
6538 with STMT_VINFO_LIVE_P. Not handled live operations will cause the
6539 scalar code computing it to be retained. */
6540
6541 static void
6542 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6543 slp_instance instance,
6544 stmt_vector_for_cost *cost_vec,
6545 hash_map<tree, int> &scalar_use_map,
6546 hash_set<stmt_vec_info> &svisited,
6547 hash_set<slp_tree> &visited)
6548 {
6549 if (visited.add (node))
6550 return;
6551
6552 unsigned i;
6553 stmt_vec_info stmt_info;
6554 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6555 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6556 {
6557 if (svisited.contains (stmt_info))
6558 continue;
6559 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6560 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6561 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6562 /* Only the pattern root stmt computes the original scalar value. */
6563 continue;
6564 bool mark_visited = true;
6565 gimple *orig_stmt = orig_stmt_info->stmt;
6566 ssa_op_iter op_iter;
6567 def_operand_p def_p;
6568 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6569 {
6570 if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
6571 scalar_use_map))
6572 {
6573 STMT_VINFO_LIVE_P (stmt_info) = true;
6574 if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
6575 instance, i, false, cost_vec))
6576 /* ??? So we know we can vectorize the live stmt from one SLP
6577 node. If we cannot do so from all or none consistently
6578 we'd have to record which SLP node (and lane) we want to
6579 use for the live operation. So make sure we can
6580 code-generate from all nodes. */
6581 mark_visited = false;
6582 else
6583 STMT_VINFO_LIVE_P (stmt_info) = false;
6584 }
6585
6586 /* We have to verify whether we can insert the lane extract
6587 before all uses. The following is a conservative approximation.
6588 We cannot put this into vectorizable_live_operation because
6589 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6590 doesn't work.
6591 Note that while the fact that we emit code for loads at the
6592 first load should make this a non-problem leafs we construct
6593 from scalars are vectorized after the last scalar def.
6594 ??? If we'd actually compute the insert location during
6595 analysis we could use sth less conservative than the last
6596 scalar stmt in the node for the dominance check. */
6597 /* ??? What remains is "live" uses in vector CTORs in the same
6598 SLP graph which is where those uses can end up code-generated
6599 right after their definition instead of close to their original
6600 use. But that would restrict us to code-generate lane-extracts
6601 from the latest stmt in a node. So we compensate for this
6602 during code-generation, simply not replacing uses for those
6603 hopefully rare cases. */
6604 imm_use_iterator use_iter;
6605 gimple *use_stmt;
6606 stmt_vec_info use_stmt_info;
6607
6608 if (STMT_VINFO_LIVE_P (stmt_info))
6609 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6610 if (!is_gimple_debug (use_stmt)
6611 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6612 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6613 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6614 {
6615 if (dump_enabled_p ())
6616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6617 "Cannot determine insertion place for "
6618 "lane extract\n");
6619 STMT_VINFO_LIVE_P (stmt_info) = false;
6620 mark_visited = true;
6621 }
6622 }
6623 if (mark_visited)
6624 svisited.add (stmt_info);
6625 }
6626
6627 slp_tree child;
6628 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6629 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6630 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
6631 scalar_use_map, svisited, visited);
6632 }
6633
6634 /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
6635 are live outside of the basic-block vectorized region and that can be
6636 vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
6637
6638 static void
6639 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
6640 {
6641 if (bb_vinfo->slp_instances.is_empty ())
6642 return;
6643
6644 hash_set<stmt_vec_info> svisited;
6645 hash_set<slp_tree> visited;
6646 hash_map<tree, int> scalar_use_map;
6647 auto_vec<slp_tree> worklist;
6648
6649 for (slp_instance instance : bb_vinfo->slp_instances)
6650 {
6651 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
6652 for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
6653 if (TREE_CODE (op) == SSA_NAME)
6654 scalar_use_map.put (op, 1);
6655 if (!visited.add (SLP_INSTANCE_TREE (instance)))
6656 worklist.safe_push (SLP_INSTANCE_TREE (instance));
6657 }
6658
6659 do
6660 {
6661 slp_tree node = worklist.pop ();
6662
6663 if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
6664 {
6665 for (tree op : SLP_TREE_SCALAR_OPS (node))
6666 if (TREE_CODE (op) == SSA_NAME)
6667 scalar_use_map.put (op, 1);
6668 }
6669 else
6670 {
6671 for (slp_tree child : SLP_TREE_CHILDREN (node))
6672 if (child && !visited.add (child))
6673 worklist.safe_push (child);
6674 }
6675 }
6676 while (!worklist.is_empty ());
6677
6678 visited.empty ();
6679
6680 for (slp_instance instance : bb_vinfo->slp_instances)
6681 {
6682 vect_location = instance->location ();
6683 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6684 instance, &instance->cost_vec,
6685 scalar_use_map, svisited, visited);
6686 }
6687 }
6688
6689 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6690
6691 static bool
6692 vectorizable_bb_reduc_epilogue (slp_instance instance,
6693 stmt_vector_for_cost *cost_vec)
6694 {
6695 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6696 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6697 if (reduc_code == MINUS_EXPR)
6698 reduc_code = PLUS_EXPR;
6699 internal_fn reduc_fn;
6700 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6701 if (!vectype
6702 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6703 || reduc_fn == IFN_LAST
6704 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6705 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6706 TREE_TYPE (vectype)))
6707 {
6708 if (dump_enabled_p ())
6709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6710 "not vectorized: basic block reduction epilogue "
6711 "operation unsupported.\n");
6712 return false;
6713 }
6714
6715 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6716 cost log2 vector operations plus shuffles and one extraction. */
6717 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6718 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6719 vectype, 0, vect_body);
6720 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6721 vectype, 0, vect_body);
6722 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6723 vectype, 0, vect_body);
6724
6725 /* Since we replace all stmts of a possibly longer scalar reduction
6726 chain account for the extra scalar stmts for that. */
6727 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
6728 instance->root_stmts[0], 0, vect_body);
6729 return true;
6730 }
6731
6732 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6733 and recurse to children. */
6734
6735 static void
6736 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6737 hash_set<slp_tree> &visited)
6738 {
6739 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6740 || visited.add (node))
6741 return;
6742
6743 stmt_vec_info stmt;
6744 unsigned i;
6745 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6746 roots.remove (vect_orig_stmt (stmt));
6747
6748 slp_tree child;
6749 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6750 if (child)
6751 vect_slp_prune_covered_roots (child, roots, visited);
6752 }
6753
6754 /* Analyze statements in SLP instances of VINFO. Return true if the
6755 operations are supported. */
6756
6757 bool
6758 vect_slp_analyze_operations (vec_info *vinfo)
6759 {
6760 slp_instance instance;
6761 int i;
6762
6763 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6764
6765 hash_set<slp_tree> visited;
6766 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6767 {
6768 auto_vec<slp_tree> visited_vec;
6769 stmt_vector_for_cost cost_vec;
6770 cost_vec.create (2);
6771 if (is_a <bb_vec_info> (vinfo))
6772 vect_location = instance->location ();
6773 if (!vect_slp_analyze_node_operations (vinfo,
6774 SLP_INSTANCE_TREE (instance),
6775 instance, visited, visited_vec,
6776 &cost_vec)
6777 /* CTOR instances require vectorized defs for the SLP tree root. */
6778 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6779 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6780 != vect_internal_def
6781 /* Make sure we vectorized with the expected type. */
6782 || !useless_type_conversion_p
6783 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6784 (instance->root_stmts[0]->stmt))),
6785 TREE_TYPE (SLP_TREE_VECTYPE
6786 (SLP_INSTANCE_TREE (instance))))))
6787 /* Check we can vectorize the reduction. */
6788 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6789 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6790 {
6791 slp_tree node = SLP_INSTANCE_TREE (instance);
6792 stmt_vec_info stmt_info;
6793 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6794 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6795 else
6796 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6797 if (dump_enabled_p ())
6798 dump_printf_loc (MSG_NOTE, vect_location,
6799 "removing SLP instance operations starting from: %G",
6800 stmt_info->stmt);
6801 vect_free_slp_instance (instance);
6802 vinfo->slp_instances.ordered_remove (i);
6803 cost_vec.release ();
6804 while (!visited_vec.is_empty ())
6805 visited.remove (visited_vec.pop ());
6806 }
6807 else
6808 {
6809 i++;
6810 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6811 {
6812 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6813 cost_vec.release ();
6814 }
6815 else
6816 /* For BB vectorization remember the SLP graph entry
6817 cost for later. */
6818 instance->cost_vec = cost_vec;
6819 }
6820 }
6821
6822 /* Now look for SLP instances with a root that are covered by other
6823 instances and remove them. */
6824 hash_set<stmt_vec_info> roots;
6825 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6826 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6827 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6828 if (!roots.is_empty ())
6829 {
6830 visited.empty ();
6831 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6832 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6833 visited);
6834 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6835 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6836 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6837 {
6838 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6839 if (dump_enabled_p ())
6840 dump_printf_loc (MSG_NOTE, vect_location,
6841 "removing SLP instance operations starting "
6842 "from: %G", root->stmt);
6843 vect_free_slp_instance (instance);
6844 vinfo->slp_instances.ordered_remove (i);
6845 }
6846 else
6847 ++i;
6848 }
6849
6850 /* Compute vectorizable live stmts. */
6851 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6852 vect_bb_slp_mark_live_stmts (bb_vinfo);
6853
6854 return !vinfo->slp_instances.is_empty ();
6855 }
6856
6857 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
6858 closing the eventual chain. */
6859
6860 static slp_instance
6861 get_ultimate_leader (slp_instance instance,
6862 hash_map<slp_instance, slp_instance> &instance_leader)
6863 {
6864 auto_vec<slp_instance *, 8> chain;
6865 slp_instance *tem;
6866 while (*(tem = instance_leader.get (instance)) != instance)
6867 {
6868 chain.safe_push (tem);
6869 instance = *tem;
6870 }
6871 while (!chain.is_empty ())
6872 *chain.pop () = instance;
6873 return instance;
6874 }
6875
6876 namespace {
6877 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6878 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6879 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6880
6881 INSTANCE_LEADER is as for get_ultimate_leader. */
6882
6883 template<typename T>
6884 bool
6885 vect_map_to_instance (slp_instance instance, T key,
6886 hash_map<T, slp_instance> &key_to_instance,
6887 hash_map<slp_instance, slp_instance> &instance_leader)
6888 {
6889 bool existed_p;
6890 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6891 if (!existed_p)
6892 ;
6893 else if (key_instance != instance)
6894 {
6895 /* If we're running into a previously marked key make us the
6896 leader of the current ultimate leader. This keeps the
6897 leader chain acyclic and works even when the current instance
6898 connects two previously independent graph parts. */
6899 slp_instance key_leader
6900 = get_ultimate_leader (key_instance, instance_leader);
6901 if (key_leader != instance)
6902 instance_leader.put (key_leader, instance);
6903 }
6904 key_instance = instance;
6905 return existed_p;
6906 }
6907 }
6908
6909 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6910
6911 static void
6912 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6913 slp_instance instance, slp_tree node,
6914 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6915 hash_map<slp_tree, slp_instance> &node_to_instance,
6916 hash_map<slp_instance, slp_instance> &instance_leader)
6917 {
6918 stmt_vec_info stmt_info;
6919 unsigned i;
6920
6921 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6922 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6923 instance_leader);
6924
6925 if (vect_map_to_instance (instance, node, node_to_instance,
6926 instance_leader))
6927 return;
6928
6929 slp_tree child;
6930 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6931 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6932 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6933 node_to_instance, instance_leader);
6934 }
6935
6936 /* Partition the SLP graph into pieces that can be costed independently. */
6937
6938 static void
6939 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6940 {
6941 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6942
6943 /* First walk the SLP graph assigning each involved scalar stmt a
6944 corresponding SLP graph entry and upon visiting a previously
6945 marked stmt, make the stmts leader the current SLP graph entry. */
6946 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6947 hash_map<slp_tree, slp_instance> node_to_instance;
6948 hash_map<slp_instance, slp_instance> instance_leader;
6949 slp_instance instance;
6950 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6951 {
6952 instance_leader.put (instance, instance);
6953 vect_bb_partition_graph_r (bb_vinfo,
6954 instance, SLP_INSTANCE_TREE (instance),
6955 stmt_to_instance, node_to_instance,
6956 instance_leader);
6957 }
6958
6959 /* Then collect entries to each independent subgraph. */
6960 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6961 {
6962 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6963 leader->subgraph_entries.safe_push (instance);
6964 if (dump_enabled_p ()
6965 && leader != instance)
6966 dump_printf_loc (MSG_NOTE, vect_location,
6967 "instance %p is leader of %p\n",
6968 (void *) leader, (void *) instance);
6969 }
6970 }
6971
6972 /* Compute the set of scalar stmts participating in internal and external
6973 nodes. */
6974
6975 static void
6976 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6977 hash_set<slp_tree> &visited,
6978 hash_set<stmt_vec_info> &vstmts,
6979 hash_set<stmt_vec_info> &estmts)
6980 {
6981 int i;
6982 stmt_vec_info stmt_info;
6983 slp_tree child;
6984
6985 if (visited.add (node))
6986 return;
6987
6988 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6989 {
6990 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6991 vstmts.add (stmt_info);
6992
6993 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6994 if (child)
6995 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6996 vstmts, estmts);
6997 }
6998 else
6999 for (tree def : SLP_TREE_SCALAR_OPS (node))
7000 {
7001 stmt_vec_info def_stmt = vinfo->lookup_def (def);
7002 if (def_stmt)
7003 estmts.add (def_stmt);
7004 }
7005 }
7006
7007
7008 /* Compute the scalar cost of the SLP node NODE and its children
7009 and return it. Do not account defs that are marked in LIFE and
7010 update LIFE according to uses of NODE. */
7011
7012 static void
7013 vect_bb_slp_scalar_cost (vec_info *vinfo,
7014 slp_tree node, vec<bool, va_heap> *life,
7015 stmt_vector_for_cost *cost_vec,
7016 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
7017 hash_set<slp_tree> &visited)
7018 {
7019 unsigned i;
7020 stmt_vec_info stmt_info;
7021 slp_tree child;
7022
7023 if (visited.add (node))
7024 return;
7025
7026 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7027 {
7028 ssa_op_iter op_iter;
7029 def_operand_p def_p;
7030
7031 if ((*life)[i])
7032 continue;
7033
7034 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
7035 gimple *orig_stmt = orig_stmt_info->stmt;
7036
7037 /* If there is a non-vectorized use of the defs then the scalar
7038 stmt is kept live in which case we do not account it or any
7039 required defs in the SLP children in the scalar cost. This
7040 way we make the vectorization more costly when compared to
7041 the scalar cost. */
7042 if (!STMT_VINFO_LIVE_P (stmt_info))
7043 {
7044 auto_vec<gimple *, 8> worklist;
7045 hash_set<gimple *> *worklist_visited = NULL;
7046 worklist.quick_push (orig_stmt);
7047 do
7048 {
7049 gimple *work_stmt = worklist.pop ();
7050 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
7051 {
7052 imm_use_iterator use_iter;
7053 gimple *use_stmt;
7054 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
7055 DEF_FROM_PTR (def_p))
7056 if (!is_gimple_debug (use_stmt))
7057 {
7058 stmt_vec_info use_stmt_info
7059 = vinfo->lookup_stmt (use_stmt);
7060 if (!use_stmt_info
7061 || !vectorized_scalar_stmts.contains (use_stmt_info))
7062 {
7063 if (use_stmt_info
7064 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
7065 {
7066 /* For stmts participating in patterns we have
7067 to check its uses recursively. */
7068 if (!worklist_visited)
7069 worklist_visited = new hash_set<gimple *> ();
7070 if (!worklist_visited->add (use_stmt))
7071 worklist.safe_push (use_stmt);
7072 continue;
7073 }
7074 (*life)[i] = true;
7075 goto next_lane;
7076 }
7077 }
7078 }
7079 }
7080 while (!worklist.is_empty ());
7081 next_lane:
7082 if (worklist_visited)
7083 delete worklist_visited;
7084 if ((*life)[i])
7085 continue;
7086 }
7087
7088 /* Count scalar stmts only once. */
7089 if (gimple_visited_p (orig_stmt))
7090 continue;
7091 gimple_set_visited (orig_stmt, true);
7092
7093 vect_cost_for_stmt kind;
7094 if (STMT_VINFO_DATA_REF (orig_stmt_info))
7095 {
7096 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
7097 kind = scalar_load;
7098 else
7099 kind = scalar_store;
7100 }
7101 else if (vect_nop_conversion_p (orig_stmt_info))
7102 continue;
7103 /* For single-argument PHIs assume coalescing which means zero cost
7104 for the scalar and the vector PHIs. This avoids artificially
7105 favoring the vector path (but may pessimize it in some cases). */
7106 else if (is_a <gphi *> (orig_stmt_info->stmt)
7107 && gimple_phi_num_args
7108 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
7109 continue;
7110 else
7111 kind = scalar_stmt;
7112 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
7113 SLP_TREE_VECTYPE (node), 0, vect_body);
7114 }
7115
7116 auto_vec<bool, 20> subtree_life;
7117 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7118 {
7119 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7120 {
7121 /* Do not directly pass LIFE to the recursive call, copy it to
7122 confine changes in the callee to the current child/subtree. */
7123 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7124 {
7125 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
7126 for (unsigned j = 0;
7127 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
7128 {
7129 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
7130 if (perm.first == i)
7131 subtree_life[perm.second] = (*life)[j];
7132 }
7133 }
7134 else
7135 {
7136 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
7137 subtree_life.safe_splice (*life);
7138 }
7139 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
7140 vectorized_scalar_stmts, visited);
7141 subtree_life.truncate (0);
7142 }
7143 }
7144 }
7145
7146 /* Comparator for the loop-index sorted cost vectors. */
7147
7148 static int
7149 li_cost_vec_cmp (const void *a_, const void *b_)
7150 {
7151 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
7152 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
7153 if (a->first < b->first)
7154 return -1;
7155 else if (a->first == b->first)
7156 return 0;
7157 return 1;
7158 }
7159
7160 /* Check if vectorization of the basic block is profitable for the
7161 subgraph denoted by SLP_INSTANCES. */
7162
7163 static bool
7164 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
7165 vec<slp_instance> slp_instances,
7166 loop_p orig_loop)
7167 {
7168 slp_instance instance;
7169 int i;
7170 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
7171 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
7172
7173 if (dump_enabled_p ())
7174 {
7175 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
7176 hash_set<slp_tree> visited;
7177 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7178 vect_print_slp_graph (MSG_NOTE, vect_location,
7179 SLP_INSTANCE_TREE (instance), visited);
7180 }
7181
7182 /* Compute the set of scalar stmts we know will go away 'locally' when
7183 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
7184 not accurate for nodes promoted extern late or for scalar stmts that
7185 are used both in extern defs and in vectorized defs. */
7186 hash_set<stmt_vec_info> vectorized_scalar_stmts;
7187 hash_set<stmt_vec_info> scalar_stmts_in_externs;
7188 hash_set<slp_tree> visited;
7189 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7190 {
7191 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
7192 SLP_INSTANCE_TREE (instance),
7193 visited,
7194 vectorized_scalar_stmts,
7195 scalar_stmts_in_externs);
7196 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
7197 vectorized_scalar_stmts.add (rstmt);
7198 }
7199 /* Scalar stmts used as defs in external nodes need to be preseved, so
7200 remove them from vectorized_scalar_stmts. */
7201 for (stmt_vec_info stmt : scalar_stmts_in_externs)
7202 vectorized_scalar_stmts.remove (stmt);
7203
7204 /* Calculate scalar cost and sum the cost for the vector stmts
7205 previously collected. */
7206 stmt_vector_for_cost scalar_costs = vNULL;
7207 stmt_vector_for_cost vector_costs = vNULL;
7208 visited.empty ();
7209 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7210 {
7211 auto_vec<bool, 20> life;
7212 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
7213 true);
7214 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7215 record_stmt_cost (&scalar_costs,
7216 SLP_INSTANCE_ROOT_STMTS (instance).length (),
7217 scalar_stmt,
7218 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
7219 vect_bb_slp_scalar_cost (bb_vinfo,
7220 SLP_INSTANCE_TREE (instance),
7221 &life, &scalar_costs, vectorized_scalar_stmts,
7222 visited);
7223 vector_costs.safe_splice (instance->cost_vec);
7224 instance->cost_vec.release ();
7225 }
7226
7227 if (dump_enabled_p ())
7228 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
7229
7230 /* When costing non-loop vectorization we need to consider each covered
7231 loop independently and make sure vectorization is profitable. For
7232 now we assume a loop may be not entered or executed an arbitrary
7233 number of iterations (??? static information can provide more
7234 precise info here) which means we can simply cost each containing
7235 loops stmts separately. */
7236
7237 /* First produce cost vectors sorted by loop index. */
7238 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7239 li_scalar_costs (scalar_costs.length ());
7240 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7241 li_vector_costs (vector_costs.length ());
7242 stmt_info_for_cost *cost;
7243 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7244 {
7245 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7246 li_scalar_costs.quick_push (std::make_pair (l, cost));
7247 }
7248 /* Use a random used loop as fallback in case the first vector_costs
7249 entry does not have a stmt_info associated with it. */
7250 unsigned l = li_scalar_costs[0].first;
7251 FOR_EACH_VEC_ELT (vector_costs, i, cost)
7252 {
7253 /* We inherit from the previous COST, invariants, externals and
7254 extracts immediately follow the cost for the related stmt. */
7255 if (cost->stmt_info)
7256 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7257 li_vector_costs.quick_push (std::make_pair (l, cost));
7258 }
7259 li_scalar_costs.qsort (li_cost_vec_cmp);
7260 li_vector_costs.qsort (li_cost_vec_cmp);
7261
7262 /* Now cost the portions individually. */
7263 unsigned vi = 0;
7264 unsigned si = 0;
7265 bool profitable = true;
7266 while (si < li_scalar_costs.length ()
7267 && vi < li_vector_costs.length ())
7268 {
7269 unsigned sl = li_scalar_costs[si].first;
7270 unsigned vl = li_vector_costs[vi].first;
7271 if (sl != vl)
7272 {
7273 if (dump_enabled_p ())
7274 dump_printf_loc (MSG_NOTE, vect_location,
7275 "Scalar %d and vector %d loop part do not "
7276 "match up, skipping scalar part\n", sl, vl);
7277 /* Skip the scalar part, assuming zero cost on the vector side. */
7278 do
7279 {
7280 si++;
7281 }
7282 while (si < li_scalar_costs.length ()
7283 && li_scalar_costs[si].first == sl);
7284 continue;
7285 }
7286
7287 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7288 do
7289 {
7290 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7291 si++;
7292 }
7293 while (si < li_scalar_costs.length ()
7294 && li_scalar_costs[si].first == sl);
7295 unsigned dummy;
7296 finish_cost (scalar_target_cost_data, nullptr,
7297 &dummy, &scalar_cost, &dummy);
7298
7299 /* Complete the target-specific vector cost calculation. */
7300 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7301 do
7302 {
7303 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7304 vi++;
7305 }
7306 while (vi < li_vector_costs.length ()
7307 && li_vector_costs[vi].first == vl);
7308 finish_cost (vect_target_cost_data, scalar_target_cost_data,
7309 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7310 delete scalar_target_cost_data;
7311 delete vect_target_cost_data;
7312
7313 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7314
7315 if (dump_enabled_p ())
7316 {
7317 dump_printf_loc (MSG_NOTE, vect_location,
7318 "Cost model analysis for part in loop %d:\n", sl);
7319 dump_printf (MSG_NOTE, " Vector cost: %d\n",
7320 vec_inside_cost + vec_outside_cost);
7321 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7322 }
7323
7324 /* Vectorization is profitable if its cost is more than the cost of scalar
7325 version. Note that we err on the vector side for equal cost because
7326 the cost estimate is otherwise quite pessimistic (constant uses are
7327 free on the scalar side but cost a load on the vector side for
7328 example). */
7329 if (vec_outside_cost + vec_inside_cost > scalar_cost)
7330 {
7331 profitable = false;
7332 break;
7333 }
7334 }
7335 if (profitable && vi < li_vector_costs.length ())
7336 {
7337 if (dump_enabled_p ())
7338 dump_printf_loc (MSG_NOTE, vect_location,
7339 "Excess vector cost for part in loop %d:\n",
7340 li_vector_costs[vi].first);
7341 profitable = false;
7342 }
7343
7344 /* Unset visited flag. This is delayed when the subgraph is profitable
7345 and we process the loop for remaining unvectorized if-converted code. */
7346 if (!orig_loop || !profitable)
7347 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7348 gimple_set_visited (cost->stmt_info->stmt, false);
7349
7350 scalar_costs.release ();
7351 vector_costs.release ();
7352
7353 return profitable;
7354 }
7355
7356 /* qsort comparator for lane defs. */
7357
7358 static int
7359 vld_cmp (const void *a_, const void *b_)
7360 {
7361 auto *a = (const std::pair<unsigned, tree> *)a_;
7362 auto *b = (const std::pair<unsigned, tree> *)b_;
7363 return a->first - b->first;
7364 }
7365
7366 /* Return true if USE_STMT is a vector lane insert into VEC and set
7367 *THIS_LANE to the lane number that is set. */
7368
7369 static bool
7370 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7371 {
7372 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7373 if (!use_ass
7374 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7375 || (vec
7376 ? gimple_assign_rhs1 (use_ass) != vec
7377 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7378 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7379 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7380 || !constant_multiple_p
7381 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7382 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7383 this_lane))
7384 return false;
7385 return true;
7386 }
7387
7388 /* Find any vectorizable constructors and add them to the grouped_store
7389 array. */
7390
7391 static void
7392 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7393 {
7394 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7395 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7396 !gsi_end_p (gsi); gsi_next (&gsi))
7397 {
7398 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7399 if (!assign)
7400 continue;
7401
7402 tree rhs = gimple_assign_rhs1 (assign);
7403 enum tree_code code = gimple_assign_rhs_code (assign);
7404 use_operand_p use_p;
7405 gimple *use_stmt;
7406 if (code == CONSTRUCTOR)
7407 {
7408 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7409 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7410 CONSTRUCTOR_NELTS (rhs))
7411 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7412 || uniform_vector_p (rhs))
7413 continue;
7414
7415 unsigned j;
7416 tree val;
7417 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7418 if (TREE_CODE (val) != SSA_NAME
7419 || !bb_vinfo->lookup_def (val))
7420 break;
7421 if (j != CONSTRUCTOR_NELTS (rhs))
7422 continue;
7423
7424 vec<stmt_vec_info> roots = vNULL;
7425 roots.safe_push (bb_vinfo->lookup_stmt (assign));
7426 vec<stmt_vec_info> stmts;
7427 stmts.create (CONSTRUCTOR_NELTS (rhs));
7428 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7429 stmts.quick_push
7430 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7431 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7432 stmts, roots));
7433 }
7434 else if (code == BIT_INSERT_EXPR
7435 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7436 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7437 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7438 && integer_zerop (gimple_assign_rhs3 (assign))
7439 && useless_type_conversion_p
7440 (TREE_TYPE (TREE_TYPE (rhs)),
7441 TREE_TYPE (gimple_assign_rhs2 (assign)))
7442 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7443 {
7444 /* We start to match on insert to lane zero but since the
7445 inserts need not be ordered we'd have to search both
7446 the def and the use chains. */
7447 tree vectype = TREE_TYPE (rhs);
7448 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7449 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7450 auto_sbitmap lanes (nlanes);
7451 bitmap_clear (lanes);
7452 bitmap_set_bit (lanes, 0);
7453 tree def = gimple_assign_lhs (assign);
7454 lane_defs.quick_push
7455 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7456 unsigned lanes_found = 1;
7457 /* Start with the use chains, the last stmt will be the root. */
7458 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7459 vec<stmt_vec_info> roots = vNULL;
7460 roots.safe_push (last);
7461 do
7462 {
7463 use_operand_p use_p;
7464 gimple *use_stmt;
7465 if (!single_imm_use (def, &use_p, &use_stmt))
7466 break;
7467 unsigned this_lane;
7468 if (!bb_vinfo->lookup_stmt (use_stmt)
7469 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7470 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7471 break;
7472 if (bitmap_bit_p (lanes, this_lane))
7473 break;
7474 lanes_found++;
7475 bitmap_set_bit (lanes, this_lane);
7476 gassign *use_ass = as_a <gassign *> (use_stmt);
7477 lane_defs.quick_push (std::make_pair
7478 (this_lane, gimple_assign_rhs2 (use_ass)));
7479 last = bb_vinfo->lookup_stmt (use_ass);
7480 roots.safe_push (last);
7481 def = gimple_assign_lhs (use_ass);
7482 }
7483 while (lanes_found < nlanes);
7484 if (roots.length () > 1)
7485 std::swap(roots[0], roots[roots.length () - 1]);
7486 if (lanes_found < nlanes)
7487 {
7488 /* Now search the def chain. */
7489 def = gimple_assign_rhs1 (assign);
7490 do
7491 {
7492 if (TREE_CODE (def) != SSA_NAME
7493 || !has_single_use (def))
7494 break;
7495 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7496 unsigned this_lane;
7497 if (!bb_vinfo->lookup_stmt (def_stmt)
7498 || !vect_slp_is_lane_insert (def_stmt,
7499 NULL_TREE, &this_lane)
7500 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7501 break;
7502 if (bitmap_bit_p (lanes, this_lane))
7503 break;
7504 lanes_found++;
7505 bitmap_set_bit (lanes, this_lane);
7506 lane_defs.quick_push (std::make_pair
7507 (this_lane,
7508 gimple_assign_rhs2 (def_stmt)));
7509 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7510 def = gimple_assign_rhs1 (def_stmt);
7511 }
7512 while (lanes_found < nlanes);
7513 }
7514 if (lanes_found == nlanes)
7515 {
7516 /* Sort lane_defs after the lane index and register the root. */
7517 lane_defs.qsort (vld_cmp);
7518 vec<stmt_vec_info> stmts;
7519 stmts.create (nlanes);
7520 for (unsigned i = 0; i < nlanes; ++i)
7521 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7522 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7523 stmts, roots));
7524 }
7525 else
7526 roots.release ();
7527 }
7528 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7529 && (associative_tree_code (code) || code == MINUS_EXPR)
7530 /* ??? This pessimizes a two-element reduction. PR54400.
7531 ??? In-order reduction could be handled if we only
7532 traverse one operand chain in vect_slp_linearize_chain. */
7533 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7534 /* Ops with constants at the tail can be stripped here. */
7535 && TREE_CODE (rhs) == SSA_NAME
7536 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7537 /* Should be the chain end. */
7538 && (!single_imm_use (gimple_assign_lhs (assign),
7539 &use_p, &use_stmt)
7540 || !is_gimple_assign (use_stmt)
7541 || (gimple_assign_rhs_code (use_stmt) != code
7542 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7543 || (gimple_assign_rhs_code (use_stmt)
7544 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7545 {
7546 /* We start the match at the end of a possible association
7547 chain. */
7548 auto_vec<chain_op_t> chain;
7549 auto_vec<std::pair<tree_code, gimple *> > worklist;
7550 auto_vec<gimple *> chain_stmts;
7551 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7552 if (code == MINUS_EXPR)
7553 code = PLUS_EXPR;
7554 internal_fn reduc_fn;
7555 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7556 || reduc_fn == IFN_LAST)
7557 continue;
7558 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7559 /* ??? */
7560 code_stmt, alt_code_stmt, &chain_stmts);
7561 if (chain.length () > 1)
7562 {
7563 /* Sort the chain according to def_type and operation. */
7564 chain.sort (dt_sort_cmp, bb_vinfo);
7565 /* ??? Now we'd want to strip externals and constants
7566 but record those to be handled in the epilogue. */
7567 /* ??? For now do not allow mixing ops or externs/constants. */
7568 bool invalid = false;
7569 unsigned remain_cnt = 0;
7570 unsigned last_idx = 0;
7571 for (unsigned i = 0; i < chain.length (); ++i)
7572 {
7573 if (chain[i].code != code)
7574 {
7575 invalid = true;
7576 break;
7577 }
7578 if (chain[i].dt != vect_internal_def
7579 /* Avoid stmts where the def is not the LHS, like
7580 ASMs. */
7581 || (gimple_get_lhs (bb_vinfo->lookup_def
7582 (chain[i].op)->stmt)
7583 != chain[i].op))
7584 remain_cnt++;
7585 else
7586 last_idx = i;
7587 }
7588 /* Make sure to have an even number of lanes as we later do
7589 all-or-nothing discovery, not trying to split further. */
7590 if ((chain.length () - remain_cnt) & 1)
7591 remain_cnt++;
7592 if (!invalid && chain.length () - remain_cnt > 1)
7593 {
7594 vec<stmt_vec_info> stmts;
7595 vec<tree> remain = vNULL;
7596 stmts.create (chain.length ());
7597 if (remain_cnt > 0)
7598 remain.create (remain_cnt);
7599 for (unsigned i = 0; i < chain.length (); ++i)
7600 {
7601 stmt_vec_info stmt_info;
7602 if (chain[i].dt == vect_internal_def
7603 && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
7604 gimple_get_lhs (stmt_info->stmt) == chain[i].op)
7605 && (i != last_idx
7606 || (stmts.length () & 1)))
7607 stmts.quick_push (stmt_info);
7608 else
7609 remain.quick_push (chain[i].op);
7610 }
7611 vec<stmt_vec_info> roots;
7612 roots.create (chain_stmts.length ());
7613 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7614 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7615 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7616 stmts, roots, remain));
7617 }
7618 }
7619 }
7620 }
7621 }
7622
7623 /* Walk the grouped store chains and replace entries with their
7624 pattern variant if any. */
7625
7626 static void
7627 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7628 {
7629 stmt_vec_info first_element;
7630 unsigned i;
7631
7632 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7633 {
7634 /* We also have CTORs in this array. */
7635 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7636 continue;
7637 if (STMT_VINFO_IN_PATTERN_P (first_element))
7638 {
7639 stmt_vec_info orig = first_element;
7640 first_element = STMT_VINFO_RELATED_STMT (first_element);
7641 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7642 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7643 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7644 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7645 vinfo->grouped_stores[i] = first_element;
7646 }
7647 stmt_vec_info prev = first_element;
7648 while (DR_GROUP_NEXT_ELEMENT (prev))
7649 {
7650 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7651 if (STMT_VINFO_IN_PATTERN_P (elt))
7652 {
7653 stmt_vec_info orig = elt;
7654 elt = STMT_VINFO_RELATED_STMT (elt);
7655 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7656 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7657 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7658 }
7659 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7660 prev = elt;
7661 }
7662 }
7663 }
7664
7665 /* Check if the region described by BB_VINFO can be vectorized, returning
7666 true if so. When returning false, set FATAL to true if the same failure
7667 would prevent vectorization at other vector sizes, false if it is still
7668 worth trying other sizes. N_STMTS is the number of statements in the
7669 region. */
7670
7671 static bool
7672 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7673 vec<int> *dataref_groups)
7674 {
7675 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7676
7677 slp_instance instance;
7678 int i;
7679 poly_uint64 min_vf = 2;
7680
7681 /* The first group of checks is independent of the vector size. */
7682 fatal = true;
7683
7684 /* Analyze the data references. */
7685
7686 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7687 {
7688 if (dump_enabled_p ())
7689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7690 "not vectorized: unhandled data-ref in basic "
7691 "block.\n");
7692 return false;
7693 }
7694
7695 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7696 {
7697 if (dump_enabled_p ())
7698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7699 "not vectorized: unhandled data access in "
7700 "basic block.\n");
7701 return false;
7702 }
7703
7704 vect_slp_check_for_roots (bb_vinfo);
7705
7706 /* If there are no grouped stores and no constructors in the region
7707 there is no need to continue with pattern recog as vect_analyze_slp
7708 will fail anyway. */
7709 if (bb_vinfo->grouped_stores.is_empty ()
7710 && bb_vinfo->roots.is_empty ())
7711 {
7712 if (dump_enabled_p ())
7713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7714 "not vectorized: no grouped stores in "
7715 "basic block.\n");
7716 return false;
7717 }
7718
7719 /* While the rest of the analysis below depends on it in some way. */
7720 fatal = false;
7721
7722 vect_pattern_recog (bb_vinfo);
7723
7724 /* Update store groups from pattern processing. */
7725 vect_fixup_store_groups_with_patterns (bb_vinfo);
7726
7727 /* Check the SLP opportunities in the basic block, analyze and build SLP
7728 trees. */
7729 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7730 {
7731 if (dump_enabled_p ())
7732 {
7733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7734 "Failed to SLP the basic block.\n");
7735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7736 "not vectorized: failed to find SLP opportunities "
7737 "in basic block.\n");
7738 }
7739 return false;
7740 }
7741
7742 /* Optimize permutations. */
7743 vect_optimize_slp (bb_vinfo);
7744
7745 /* Gather the loads reachable from the SLP graph entries. */
7746 vect_gather_slp_loads (bb_vinfo);
7747
7748 vect_record_base_alignments (bb_vinfo);
7749
7750 /* Analyze and verify the alignment of data references and the
7751 dependence in the SLP instances. */
7752 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7753 {
7754 vect_location = instance->location ();
7755 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7756 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7757 {
7758 slp_tree node = SLP_INSTANCE_TREE (instance);
7759 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7760 if (dump_enabled_p ())
7761 dump_printf_loc (MSG_NOTE, vect_location,
7762 "removing SLP instance operations starting from: %G",
7763 stmt_info->stmt);
7764 vect_free_slp_instance (instance);
7765 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7766 continue;
7767 }
7768
7769 /* Mark all the statements that we want to vectorize as pure SLP and
7770 relevant. */
7771 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7772 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7773 unsigned j;
7774 stmt_vec_info root;
7775 /* Likewise consider instance root stmts as vectorized. */
7776 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7777 STMT_SLP_TYPE (root) = pure_slp;
7778
7779 i++;
7780 }
7781 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7782 return false;
7783
7784 if (!vect_slp_analyze_operations (bb_vinfo))
7785 {
7786 if (dump_enabled_p ())
7787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7788 "not vectorized: bad operation in basic block.\n");
7789 return false;
7790 }
7791
7792 vect_bb_partition_graph (bb_vinfo);
7793
7794 return true;
7795 }
7796
7797 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
7798 basic blocks in BBS, returning true on success.
7799 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7800
7801 static bool
7802 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7803 vec<int> *dataref_groups, unsigned int n_stmts,
7804 loop_p orig_loop)
7805 {
7806 bb_vec_info bb_vinfo;
7807 auto_vector_modes vector_modes;
7808
7809 /* Autodetect first vector size we try. */
7810 machine_mode next_vector_mode = VOIDmode;
7811 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7812 unsigned int mode_i = 0;
7813
7814 vec_info_shared shared;
7815
7816 machine_mode autodetected_vector_mode = VOIDmode;
7817 while (1)
7818 {
7819 bool vectorized = false;
7820 bool fatal = false;
7821 bb_vinfo = new _bb_vec_info (bbs, &shared);
7822
7823 bool first_time_p = shared.datarefs.is_empty ();
7824 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7825 if (first_time_p)
7826 bb_vinfo->shared->save_datarefs ();
7827 else
7828 bb_vinfo->shared->check_datarefs ();
7829 bb_vinfo->vector_mode = next_vector_mode;
7830
7831 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7832 {
7833 if (dump_enabled_p ())
7834 {
7835 dump_printf_loc (MSG_NOTE, vect_location,
7836 "***** Analysis succeeded with vector mode"
7837 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7838 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7839 }
7840
7841 bb_vinfo->shared->check_datarefs ();
7842
7843 bool force_clear = false;
7844 auto_vec<slp_instance> profitable_subgraphs;
7845 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7846 {
7847 if (instance->subgraph_entries.is_empty ())
7848 continue;
7849
7850 dump_user_location_t saved_vect_location = vect_location;
7851 vect_location = instance->location ();
7852 if (!unlimited_cost_model (NULL)
7853 && !vect_bb_vectorization_profitable_p
7854 (bb_vinfo, instance->subgraph_entries, orig_loop))
7855 {
7856 if (dump_enabled_p ())
7857 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7858 "not vectorized: vectorization is not "
7859 "profitable.\n");
7860 vect_location = saved_vect_location;
7861 continue;
7862 }
7863
7864 vect_location = saved_vect_location;
7865 if (!dbg_cnt (vect_slp))
7866 {
7867 force_clear = true;
7868 continue;
7869 }
7870
7871 profitable_subgraphs.safe_push (instance);
7872 }
7873
7874 /* When we're vectorizing an if-converted loop body make sure
7875 we vectorized all if-converted code. */
7876 if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
7877 {
7878 gcc_assert (bb_vinfo->bbs.length () == 1);
7879 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7880 !gsi_end_p (gsi); gsi_next (&gsi))
7881 {
7882 /* The costing above left us with DCEable vectorized scalar
7883 stmts having the visited flag set on profitable
7884 subgraphs. Do the delayed clearing of the flag here. */
7885 if (gimple_visited_p (gsi_stmt (gsi)))
7886 {
7887 gimple_set_visited (gsi_stmt (gsi), false);
7888 continue;
7889 }
7890 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7891 continue;
7892
7893 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7894 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7895 {
7896 if (!profitable_subgraphs.is_empty ()
7897 && dump_enabled_p ())
7898 dump_printf_loc (MSG_NOTE, vect_location,
7899 "not profitable because of "
7900 "unprofitable if-converted scalar "
7901 "code\n");
7902 profitable_subgraphs.truncate (0);
7903 }
7904 }
7905 }
7906
7907 /* Finally schedule the profitable subgraphs. */
7908 for (slp_instance instance : profitable_subgraphs)
7909 {
7910 if (!vectorized && dump_enabled_p ())
7911 dump_printf_loc (MSG_NOTE, vect_location,
7912 "Basic block will be vectorized "
7913 "using SLP\n");
7914 vectorized = true;
7915
7916 /* Dump before scheduling as store vectorization will remove
7917 the original stores and mess with the instance tree
7918 so querying its location will eventually ICE. */
7919 if (flag_checking)
7920 for (slp_instance sub : instance->subgraph_entries)
7921 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7922 unsigned HOST_WIDE_INT bytes;
7923 if (dump_enabled_p ())
7924 for (slp_instance sub : instance->subgraph_entries)
7925 {
7926 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7927 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7928 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7929 sub->location (),
7930 "basic block part vectorized using %wu "
7931 "byte vectors\n", bytes);
7932 else
7933 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7934 sub->location (),
7935 "basic block part vectorized using "
7936 "variable length vectors\n");
7937 }
7938
7939 dump_user_location_t saved_vect_location = vect_location;
7940 vect_location = instance->location ();
7941
7942 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7943
7944 vect_location = saved_vect_location;
7945 }
7946 }
7947 else
7948 {
7949 if (dump_enabled_p ())
7950 dump_printf_loc (MSG_NOTE, vect_location,
7951 "***** Analysis failed with vector mode %s\n",
7952 GET_MODE_NAME (bb_vinfo->vector_mode));
7953 }
7954
7955 if (mode_i == 0)
7956 autodetected_vector_mode = bb_vinfo->vector_mode;
7957
7958 if (!fatal)
7959 while (mode_i < vector_modes.length ()
7960 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7961 {
7962 if (dump_enabled_p ())
7963 dump_printf_loc (MSG_NOTE, vect_location,
7964 "***** The result for vector mode %s would"
7965 " be the same\n",
7966 GET_MODE_NAME (vector_modes[mode_i]));
7967 mode_i += 1;
7968 }
7969
7970 delete bb_vinfo;
7971
7972 if (mode_i < vector_modes.length ()
7973 && VECTOR_MODE_P (autodetected_vector_mode)
7974 && (related_vector_mode (vector_modes[mode_i],
7975 GET_MODE_INNER (autodetected_vector_mode))
7976 == autodetected_vector_mode)
7977 && (related_vector_mode (autodetected_vector_mode,
7978 GET_MODE_INNER (vector_modes[mode_i]))
7979 == vector_modes[mode_i]))
7980 {
7981 if (dump_enabled_p ())
7982 dump_printf_loc (MSG_NOTE, vect_location,
7983 "***** Skipping vector mode %s, which would"
7984 " repeat the analysis for %s\n",
7985 GET_MODE_NAME (vector_modes[mode_i]),
7986 GET_MODE_NAME (autodetected_vector_mode));
7987 mode_i += 1;
7988 }
7989
7990 if (vectorized
7991 || mode_i == vector_modes.length ()
7992 || autodetected_vector_mode == VOIDmode
7993 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7994 vector sizes will fail do not bother iterating. */
7995 || fatal)
7996 return vectorized;
7997
7998 /* Try the next biggest vector size. */
7999 next_vector_mode = vector_modes[mode_i++];
8000 if (dump_enabled_p ())
8001 dump_printf_loc (MSG_NOTE, vect_location,
8002 "***** Re-trying analysis with vector mode %s\n",
8003 GET_MODE_NAME (next_vector_mode));
8004 }
8005 }
8006
8007
8008 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
8009 true if anything in the basic-block was vectorized. */
8010
8011 static bool
8012 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
8013 {
8014 vec<data_reference_p> datarefs = vNULL;
8015 auto_vec<int> dataref_groups;
8016 int insns = 0;
8017 int current_group = 0;
8018
8019 for (unsigned i = 0; i < bbs.length (); i++)
8020 {
8021 basic_block bb = bbs[i];
8022 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
8023 gsi_next (&gsi))
8024 {
8025 gimple *stmt = gsi_stmt (gsi);
8026 if (is_gimple_debug (stmt))
8027 continue;
8028
8029 insns++;
8030
8031 if (gimple_location (stmt) != UNKNOWN_LOCATION)
8032 vect_location = stmt;
8033
8034 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
8035 &dataref_groups, current_group))
8036 ++current_group;
8037 }
8038 /* New BBs always start a new DR group. */
8039 ++current_group;
8040 }
8041
8042 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
8043 }
8044
8045 /* Special entry for the BB vectorizer. Analyze and transform a single
8046 if-converted BB with ORIG_LOOPs body being the not if-converted
8047 representation. Returns true if anything in the basic-block was
8048 vectorized. */
8049
8050 bool
8051 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
8052 {
8053 auto_vec<basic_block> bbs;
8054 bbs.safe_push (bb);
8055 return vect_slp_bbs (bbs, orig_loop);
8056 }
8057
8058 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
8059 true if anything in the basic-block was vectorized. */
8060
8061 bool
8062 vect_slp_function (function *fun)
8063 {
8064 bool r = false;
8065 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
8066 auto_bitmap exit_bbs;
8067 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
8068 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
8069 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
8070 true, rpo, NULL);
8071
8072 /* For the moment split the function into pieces to avoid making
8073 the iteration on the vector mode moot. Split at points we know
8074 to not handle well which is CFG merges (SLP discovery doesn't
8075 handle non-loop-header PHIs) and loop exits. Since pattern
8076 recog requires reverse iteration to visit uses before defs
8077 simply chop RPO into pieces. */
8078 auto_vec<basic_block> bbs;
8079 for (unsigned i = 0; i < n; i++)
8080 {
8081 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
8082 bool split = false;
8083
8084 /* Split when a BB is not dominated by the first block. */
8085 if (!bbs.is_empty ()
8086 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
8087 {
8088 if (dump_enabled_p ())
8089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8090 "splitting region at dominance boundary bb%d\n",
8091 bb->index);
8092 split = true;
8093 }
8094 /* Split when the loop determined by the first block
8095 is exited. This is because we eventually insert
8096 invariants at region begin. */
8097 else if (!bbs.is_empty ()
8098 && bbs[0]->loop_father != bb->loop_father
8099 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
8100 {
8101 if (dump_enabled_p ())
8102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8103 "splitting region at loop %d exit at bb%d\n",
8104 bbs[0]->loop_father->num, bb->index);
8105 split = true;
8106 }
8107 else if (!bbs.is_empty ()
8108 && bb->loop_father->header == bb
8109 && bb->loop_father->dont_vectorize)
8110 {
8111 if (dump_enabled_p ())
8112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8113 "splitting region at dont-vectorize loop %d "
8114 "entry at bb%d\n",
8115 bb->loop_father->num, bb->index);
8116 split = true;
8117 }
8118
8119 if (split && !bbs.is_empty ())
8120 {
8121 r |= vect_slp_bbs (bbs, NULL);
8122 bbs.truncate (0);
8123 }
8124
8125 if (bbs.is_empty ())
8126 {
8127 /* We need to be able to insert at the head of the region which
8128 we cannot for region starting with a returns-twice call. */
8129 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
8130 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
8131 {
8132 if (dump_enabled_p ())
8133 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8134 "skipping bb%d as start of region as it "
8135 "starts with returns-twice call\n",
8136 bb->index);
8137 continue;
8138 }
8139 /* If the loop this BB belongs to is marked as not to be vectorized
8140 honor that also for BB vectorization. */
8141 if (bb->loop_father->dont_vectorize)
8142 continue;
8143 }
8144
8145 bbs.safe_push (bb);
8146
8147 /* When we have a stmt ending this block and defining a
8148 value we have to insert on edges when inserting after it for
8149 a vector containing its definition. Avoid this for now. */
8150 if (gimple *last = *gsi_last_bb (bb))
8151 if (gimple_get_lhs (last)
8152 && is_ctrl_altering_stmt (last))
8153 {
8154 if (dump_enabled_p ())
8155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8156 "splitting region at control altering "
8157 "definition %G", last);
8158 r |= vect_slp_bbs (bbs, NULL);
8159 bbs.truncate (0);
8160 }
8161 }
8162
8163 if (!bbs.is_empty ())
8164 r |= vect_slp_bbs (bbs, NULL);
8165
8166 free (rpo);
8167
8168 return r;
8169 }
8170
8171 /* Build a variable-length vector in which the elements in ELTS are repeated
8172 to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
8173 RESULTS and add any new instructions to SEQ.
8174
8175 The approach we use is:
8176
8177 (1) Find a vector mode VM with integer elements of mode IM.
8178
8179 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8180 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
8181 from small vectors to IM.
8182
8183 (3) Duplicate each ELTS'[I] into a vector of mode VM.
8184
8185 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
8186 correct byte contents.
8187
8188 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
8189
8190 We try to find the largest IM for which this sequence works, in order
8191 to cut down on the number of interleaves. */
8192
8193 void
8194 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
8195 const vec<tree> &elts, unsigned int nresults,
8196 vec<tree> &results)
8197 {
8198 unsigned int nelts = elts.length ();
8199 tree element_type = TREE_TYPE (vector_type);
8200
8201 /* (1) Find a vector mode VM with integer elements of mode IM. */
8202 unsigned int nvectors = 1;
8203 tree new_vector_type;
8204 tree permutes[2];
8205 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
8206 &nvectors, &new_vector_type,
8207 permutes))
8208 gcc_unreachable ();
8209
8210 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
8211 unsigned int partial_nelts = nelts / nvectors;
8212 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
8213
8214 tree_vector_builder partial_elts;
8215 auto_vec<tree, 32> pieces (nvectors * 2);
8216 pieces.quick_grow_cleared (nvectors * 2);
8217 for (unsigned int i = 0; i < nvectors; ++i)
8218 {
8219 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8220 ELTS' has mode IM. */
8221 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
8222 for (unsigned int j = 0; j < partial_nelts; ++j)
8223 partial_elts.quick_push (elts[i * partial_nelts + j]);
8224 tree t = gimple_build_vector (seq, &partial_elts);
8225 t = gimple_build (seq, VIEW_CONVERT_EXPR,
8226 TREE_TYPE (new_vector_type), t);
8227
8228 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
8229 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
8230 }
8231
8232 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
8233 correct byte contents.
8234
8235 Conceptually, we need to repeat the following operation log2(nvectors)
8236 times, where hi_start = nvectors / 2:
8237
8238 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
8239 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
8240
8241 However, if each input repeats every N elements and the VF is
8242 a multiple of N * 2, the HI result is the same as the LO result.
8243 This will be true for the first N1 iterations of the outer loop,
8244 followed by N2 iterations for which both the LO and HI results
8245 are needed. I.e.:
8246
8247 N1 + N2 = log2(nvectors)
8248
8249 Each "N1 iteration" doubles the number of redundant vectors and the
8250 effect of the process as a whole is to have a sequence of nvectors/2**N1
8251 vectors that repeats 2**N1 times. Rather than generate these redundant
8252 vectors, we halve the number of vectors for each N1 iteration. */
8253 unsigned int in_start = 0;
8254 unsigned int out_start = nvectors;
8255 unsigned int new_nvectors = nvectors;
8256 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
8257 {
8258 unsigned int hi_start = new_nvectors / 2;
8259 unsigned int out_i = 0;
8260 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
8261 {
8262 if ((in_i & 1) != 0
8263 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
8264 2 * in_repeat))
8265 continue;
8266
8267 tree output = make_ssa_name (new_vector_type);
8268 tree input1 = pieces[in_start + (in_i / 2)];
8269 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
8270 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
8271 input1, input2,
8272 permutes[in_i & 1]);
8273 gimple_seq_add_stmt (seq, stmt);
8274 pieces[out_start + out_i] = output;
8275 out_i += 1;
8276 }
8277 std::swap (in_start, out_start);
8278 new_nvectors = out_i;
8279 }
8280
8281 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
8282 results.reserve (nresults);
8283 for (unsigned int i = 0; i < nresults; ++i)
8284 if (i < new_nvectors)
8285 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
8286 pieces[in_start + i]));
8287 else
8288 results.quick_push (results[i - new_nvectors]);
8289 }
8290
8291
8292 /* For constant and loop invariant defs in OP_NODE this function creates
8293 vector defs that will be used in the vectorized stmts and stores them
8294 to SLP_TREE_VEC_DEFS of OP_NODE. */
8295
8296 static void
8297 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8298 {
8299 unsigned HOST_WIDE_INT nunits;
8300 tree vec_cst;
8301 unsigned j, number_of_places_left_in_vector;
8302 tree vector_type;
8303 tree vop;
8304 int group_size = op_node->ops.length ();
8305 unsigned int vec_num, i;
8306 unsigned number_of_copies = 1;
8307 bool constant_p;
8308 gimple_seq ctor_seq = NULL;
8309 auto_vec<tree, 16> permute_results;
8310
8311 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
8312 vector_type = SLP_TREE_VECTYPE (op_node);
8313
8314 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8315 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
8316 auto_vec<tree> voprnds (number_of_vectors);
8317
8318 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8319 created vectors. It is greater than 1 if unrolling is performed.
8320
8321 For example, we have two scalar operands, s1 and s2 (e.g., group of
8322 strided accesses of size two), while NUNITS is four (i.e., four scalars
8323 of this type can be packed in a vector). The output vector will contain
8324 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
8325 will be 2).
8326
8327 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8328 containing the operands.
8329
8330 For example, NUNITS is four as before, and the group size is 8
8331 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
8332 {s5, s6, s7, s8}. */
8333
8334 /* When using duplicate_and_interleave, we just need one element for
8335 each scalar statement. */
8336 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
8337 nunits = group_size;
8338
8339 number_of_copies = nunits * number_of_vectors / group_size;
8340
8341 number_of_places_left_in_vector = nunits;
8342 constant_p = true;
8343 tree uniform_elt = NULL_TREE;
8344 tree_vector_builder elts (vector_type, nunits, 1);
8345 elts.quick_grow (nunits);
8346 stmt_vec_info insert_after = NULL;
8347 for (j = 0; j < number_of_copies; j++)
8348 {
8349 tree op;
8350 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
8351 {
8352 /* Create 'vect_ = {op0,op1,...,opn}'. */
8353 tree orig_op = op;
8354 if (number_of_places_left_in_vector == nunits)
8355 uniform_elt = op;
8356 else if (uniform_elt && operand_equal_p (uniform_elt, op))
8357 op = elts[number_of_places_left_in_vector];
8358 else
8359 uniform_elt = NULL_TREE;
8360 number_of_places_left_in_vector--;
8361 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8362 {
8363 if (CONSTANT_CLASS_P (op))
8364 {
8365 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8366 {
8367 /* Can't use VIEW_CONVERT_EXPR for booleans because
8368 of possibly different sizes of scalar value and
8369 vector element. */
8370 if (integer_zerop (op))
8371 op = build_int_cst (TREE_TYPE (vector_type), 0);
8372 else if (integer_onep (op))
8373 op = build_all_ones_cst (TREE_TYPE (vector_type));
8374 else
8375 gcc_unreachable ();
8376 }
8377 else
8378 op = fold_unary (VIEW_CONVERT_EXPR,
8379 TREE_TYPE (vector_type), op);
8380 gcc_assert (op && CONSTANT_CLASS_P (op));
8381 }
8382 else
8383 {
8384 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8385 gimple *init_stmt;
8386 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8387 {
8388 tree true_val
8389 = build_all_ones_cst (TREE_TYPE (vector_type));
8390 tree false_val
8391 = build_zero_cst (TREE_TYPE (vector_type));
8392 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8393 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8394 op, true_val,
8395 false_val);
8396 }
8397 else
8398 {
8399 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8400 op);
8401 init_stmt
8402 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8403 op);
8404 }
8405 gimple_seq_add_stmt (&ctor_seq, init_stmt);
8406 op = new_temp;
8407 }
8408 }
8409 elts[number_of_places_left_in_vector] = op;
8410 if (!CONSTANT_CLASS_P (op))
8411 constant_p = false;
8412 /* For BB vectorization we have to compute an insert location
8413 when a def is inside the analyzed region since we cannot
8414 simply insert at the BB start in this case. */
8415 stmt_vec_info opdef;
8416 if (TREE_CODE (orig_op) == SSA_NAME
8417 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8418 && is_a <bb_vec_info> (vinfo)
8419 && (opdef = vinfo->lookup_def (orig_op)))
8420 {
8421 if (!insert_after)
8422 insert_after = opdef;
8423 else
8424 insert_after = get_later_stmt (insert_after, opdef);
8425 }
8426
8427 if (number_of_places_left_in_vector == 0)
8428 {
8429 auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
8430 if (uniform_elt)
8431 vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
8432 elts[0]);
8433 else if (constant_p
8434 ? multiple_p (type_nunits, nunits)
8435 : known_eq (type_nunits, nunits))
8436 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8437 else
8438 {
8439 if (permute_results.is_empty ())
8440 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8441 elts, number_of_vectors,
8442 permute_results);
8443 vec_cst = permute_results[number_of_vectors - j - 1];
8444 }
8445 if (!gimple_seq_empty_p (ctor_seq))
8446 {
8447 if (insert_after)
8448 {
8449 gimple_stmt_iterator gsi;
8450 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8451 {
8452 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8453 gsi_insert_seq_before (&gsi, ctor_seq,
8454 GSI_CONTINUE_LINKING);
8455 }
8456 else if (!stmt_ends_bb_p (insert_after->stmt))
8457 {
8458 gsi = gsi_for_stmt (insert_after->stmt);
8459 gsi_insert_seq_after (&gsi, ctor_seq,
8460 GSI_CONTINUE_LINKING);
8461 }
8462 else
8463 {
8464 /* When we want to insert after a def where the
8465 defining stmt throws then insert on the fallthru
8466 edge. */
8467 edge e = find_fallthru_edge
8468 (gimple_bb (insert_after->stmt)->succs);
8469 basic_block new_bb
8470 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8471 gcc_assert (!new_bb);
8472 }
8473 }
8474 else
8475 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8476 ctor_seq = NULL;
8477 }
8478 voprnds.quick_push (vec_cst);
8479 insert_after = NULL;
8480 number_of_places_left_in_vector = nunits;
8481 constant_p = true;
8482 elts.new_vector (vector_type, nunits, 1);
8483 elts.quick_grow (nunits);
8484 }
8485 }
8486 }
8487
8488 /* Since the vectors are created in the reverse order, we should invert
8489 them. */
8490 vec_num = voprnds.length ();
8491 for (j = vec_num; j != 0; j--)
8492 {
8493 vop = voprnds[j - 1];
8494 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8495 }
8496
8497 /* In case that VF is greater than the unrolling factor needed for the SLP
8498 group of stmts, NUMBER_OF_VECTORS to be created is greater than
8499 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8500 to replicate the vectors. */
8501 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8502 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8503 i++)
8504 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8505 }
8506
8507 /* Get the Ith vectorized definition from SLP_NODE. */
8508
8509 tree
8510 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8511 {
8512 return SLP_TREE_VEC_DEFS (slp_node)[i];
8513 }
8514
8515 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8516
8517 void
8518 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8519 {
8520 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8521 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8522 }
8523
8524 /* Get N vectorized definitions for SLP_NODE. */
8525
8526 void
8527 vect_get_slp_defs (vec_info *,
8528 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8529 {
8530 if (n == -1U)
8531 n = SLP_TREE_CHILDREN (slp_node).length ();
8532
8533 for (unsigned i = 0; i < n; ++i)
8534 {
8535 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8536 vec<tree> vec_defs = vNULL;
8537 vect_get_slp_defs (child, &vec_defs);
8538 vec_oprnds->quick_push (vec_defs);
8539 }
8540 }
8541
8542 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8543 - PERM gives the permutation that the caller wants to use for NODE,
8544 which might be different from SLP_LOAD_PERMUTATION.
8545 - DUMP_P controls whether the function dumps information. */
8546
8547 static bool
8548 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8549 load_permutation_t &perm,
8550 const vec<tree> &dr_chain,
8551 gimple_stmt_iterator *gsi, poly_uint64 vf,
8552 bool analyze_only, bool dump_p,
8553 unsigned *n_perms, unsigned int *n_loads,
8554 bool dce_chain)
8555 {
8556 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8557 int vec_index = 0;
8558 tree vectype = SLP_TREE_VECTYPE (node);
8559 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8560 unsigned int mask_element;
8561 unsigned dr_group_size;
8562 machine_mode mode;
8563
8564 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8565 dr_group_size = 1;
8566 else
8567 {
8568 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8569 dr_group_size = DR_GROUP_SIZE (stmt_info);
8570 }
8571
8572 mode = TYPE_MODE (vectype);
8573 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8574 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8575
8576 /* Initialize the vect stmts of NODE to properly insert the generated
8577 stmts later. */
8578 if (! analyze_only)
8579 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8580 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8581
8582 /* Generate permutation masks for every NODE. Number of masks for each NODE
8583 is equal to GROUP_SIZE.
8584 E.g., we have a group of three nodes with three loads from the same
8585 location in each node, and the vector size is 4. I.e., we have a
8586 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8587 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8588 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8589 ...
8590
8591 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8592 The last mask is illegal since we assume two operands for permute
8593 operation, and the mask element values can't be outside that range.
8594 Hence, the last mask must be converted into {2,5,5,5}.
8595 For the first two permutations we need the first and the second input
8596 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8597 we need the second and the third vectors: {b1,c1,a2,b2} and
8598 {c2,a3,b3,c3}. */
8599
8600 int vect_stmts_counter = 0;
8601 unsigned int index = 0;
8602 int first_vec_index = -1;
8603 int second_vec_index = -1;
8604 bool noop_p = true;
8605 *n_perms = 0;
8606
8607 vec_perm_builder mask;
8608 unsigned int nelts_to_build;
8609 unsigned int nvectors_per_build;
8610 unsigned int in_nlanes;
8611 bool repeating_p = (group_size == dr_group_size
8612 && multiple_p (nunits, group_size));
8613 if (repeating_p)
8614 {
8615 /* A single vector contains a whole number of copies of the node, so:
8616 (a) all permutes can use the same mask; and
8617 (b) the permutes only need a single vector input. */
8618 mask.new_vector (nunits, group_size, 3);
8619 nelts_to_build = mask.encoded_nelts ();
8620 /* It's possible to obtain zero nstmts during analyze_only, so make
8621 it at least one to ensure the later computation for n_perms
8622 proceed. */
8623 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8624 in_nlanes = dr_group_size * 3;
8625 }
8626 else
8627 {
8628 /* We need to construct a separate mask for each vector statement. */
8629 unsigned HOST_WIDE_INT const_nunits, const_vf;
8630 if (!nunits.is_constant (&const_nunits)
8631 || !vf.is_constant (&const_vf))
8632 return false;
8633 mask.new_vector (const_nunits, const_nunits, 1);
8634 nelts_to_build = const_vf * group_size;
8635 nvectors_per_build = 1;
8636 in_nlanes = const_vf * dr_group_size;
8637 }
8638 auto_sbitmap used_in_lanes (in_nlanes);
8639 bitmap_clear (used_in_lanes);
8640 auto_bitmap used_defs;
8641
8642 unsigned int count = mask.encoded_nelts ();
8643 mask.quick_grow (count);
8644 vec_perm_indices indices;
8645
8646 for (unsigned int j = 0; j < nelts_to_build; j++)
8647 {
8648 unsigned int iter_num = j / group_size;
8649 unsigned int stmt_num = j % group_size;
8650 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8651 bitmap_set_bit (used_in_lanes, i);
8652 if (repeating_p)
8653 {
8654 first_vec_index = 0;
8655 mask_element = i;
8656 }
8657 else
8658 {
8659 /* Enforced before the loop when !repeating_p. */
8660 unsigned int const_nunits = nunits.to_constant ();
8661 vec_index = i / const_nunits;
8662 mask_element = i % const_nunits;
8663 if (vec_index == first_vec_index
8664 || first_vec_index == -1)
8665 {
8666 first_vec_index = vec_index;
8667 }
8668 else if (vec_index == second_vec_index
8669 || second_vec_index == -1)
8670 {
8671 second_vec_index = vec_index;
8672 mask_element += const_nunits;
8673 }
8674 else
8675 {
8676 if (dump_p)
8677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8678 "permutation requires at "
8679 "least three vectors %G",
8680 stmt_info->stmt);
8681 gcc_assert (analyze_only);
8682 return false;
8683 }
8684
8685 gcc_assert (mask_element < 2 * const_nunits);
8686 }
8687
8688 if (mask_element != index)
8689 noop_p = false;
8690 mask[index++] = mask_element;
8691
8692 if (index == count)
8693 {
8694 if (!noop_p)
8695 {
8696 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8697 if (!can_vec_perm_const_p (mode, mode, indices))
8698 {
8699 if (dump_p)
8700 {
8701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8702 "unsupported vect permute { ");
8703 for (i = 0; i < count; ++i)
8704 {
8705 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8706 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8707 }
8708 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8709 }
8710 gcc_assert (analyze_only);
8711 return false;
8712 }
8713
8714 tree mask_vec = NULL_TREE;
8715 if (!analyze_only)
8716 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8717
8718 if (second_vec_index == -1)
8719 second_vec_index = first_vec_index;
8720
8721 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8722 {
8723 ++*n_perms;
8724 if (analyze_only)
8725 continue;
8726 /* Generate the permute statement if necessary. */
8727 tree first_vec = dr_chain[first_vec_index + ri];
8728 tree second_vec = dr_chain[second_vec_index + ri];
8729 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8730 tree perm_dest
8731 = vect_create_destination_var (gimple_assign_lhs (stmt),
8732 vectype);
8733 perm_dest = make_ssa_name (perm_dest);
8734 gimple *perm_stmt
8735 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8736 second_vec, mask_vec);
8737 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8738 gsi);
8739 if (dce_chain)
8740 {
8741 bitmap_set_bit (used_defs, first_vec_index + ri);
8742 bitmap_set_bit (used_defs, second_vec_index + ri);
8743 }
8744
8745 /* Store the vector statement in NODE. */
8746 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8747 }
8748 }
8749 else if (!analyze_only)
8750 {
8751 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8752 {
8753 tree first_vec = dr_chain[first_vec_index + ri];
8754 /* If mask was NULL_TREE generate the requested
8755 identity transform. */
8756 if (dce_chain)
8757 bitmap_set_bit (used_defs, first_vec_index + ri);
8758
8759 /* Store the vector statement in NODE. */
8760 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8761 }
8762 }
8763
8764 index = 0;
8765 first_vec_index = -1;
8766 second_vec_index = -1;
8767 noop_p = true;
8768 }
8769 }
8770
8771 if (n_loads)
8772 {
8773 if (repeating_p)
8774 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8775 else
8776 {
8777 /* Enforced above when !repeating_p. */
8778 unsigned int const_nunits = nunits.to_constant ();
8779 *n_loads = 0;
8780 bool load_seen = false;
8781 for (unsigned i = 0; i < in_nlanes; ++i)
8782 {
8783 if (i % const_nunits == 0)
8784 {
8785 if (load_seen)
8786 *n_loads += 1;
8787 load_seen = false;
8788 }
8789 if (bitmap_bit_p (used_in_lanes, i))
8790 load_seen = true;
8791 }
8792 if (load_seen)
8793 *n_loads += 1;
8794 }
8795 }
8796
8797 if (dce_chain)
8798 for (unsigned i = 0; i < dr_chain.length (); ++i)
8799 if (!bitmap_bit_p (used_defs, i))
8800 {
8801 tree def = dr_chain[i];
8802 do
8803 {
8804 gimple *stmt = SSA_NAME_DEF_STMT (def);
8805 if (is_gimple_assign (stmt)
8806 && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
8807 || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
8808 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
8809 else
8810 def = NULL;
8811 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8812 gsi_remove (&rgsi, true);
8813 release_defs (stmt);
8814 }
8815 while (def);
8816 }
8817
8818 return true;
8819 }
8820
8821 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8822 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8823 permute statements for the SLP node NODE. Store the number of vector
8824 permute instructions in *N_PERMS and the number of vector load
8825 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8826 that were not needed. */
8827
8828 bool
8829 vect_transform_slp_perm_load (vec_info *vinfo,
8830 slp_tree node, const vec<tree> &dr_chain,
8831 gimple_stmt_iterator *gsi, poly_uint64 vf,
8832 bool analyze_only, unsigned *n_perms,
8833 unsigned int *n_loads, bool dce_chain)
8834 {
8835 return vect_transform_slp_perm_load_1 (vinfo, node,
8836 SLP_TREE_LOAD_PERMUTATION (node),
8837 dr_chain, gsi, vf, analyze_only,
8838 dump_enabled_p (), n_perms, n_loads,
8839 dce_chain);
8840 }
8841
8842 /* Produce the next vector result for SLP permutation NODE by adding a vector
8843 statement at GSI. If MASK_VEC is nonnull, add:
8844
8845 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8846
8847 otherwise add:
8848
8849 <new SSA name> = FIRST_DEF. */
8850
8851 static void
8852 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8853 slp_tree node, tree first_def, tree second_def,
8854 tree mask_vec, poly_uint64 identity_offset)
8855 {
8856 tree vectype = SLP_TREE_VECTYPE (node);
8857
8858 /* ??? We SLP match existing vector element extracts but
8859 allow punning which we need to re-instantiate at uses
8860 but have no good way of explicitly representing. */
8861 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8862 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8863 {
8864 gassign *conv_stmt
8865 = gimple_build_assign (make_ssa_name (vectype),
8866 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8867 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8868 first_def = gimple_assign_lhs (conv_stmt);
8869 }
8870 gassign *perm_stmt;
8871 tree perm_dest = make_ssa_name (vectype);
8872 if (mask_vec)
8873 {
8874 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8875 TYPE_SIZE (vectype))
8876 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8877 {
8878 gassign *conv_stmt
8879 = gimple_build_assign (make_ssa_name (vectype),
8880 build1 (VIEW_CONVERT_EXPR,
8881 vectype, second_def));
8882 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8883 second_def = gimple_assign_lhs (conv_stmt);
8884 }
8885 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8886 first_def, second_def,
8887 mask_vec);
8888 }
8889 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8890 {
8891 /* For identity permutes we still need to handle the case
8892 of offsetted extracts or concats. */
8893 unsigned HOST_WIDE_INT c;
8894 auto first_def_nunits
8895 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8896 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8897 {
8898 unsigned HOST_WIDE_INT elsz
8899 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8900 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8901 TYPE_SIZE (vectype),
8902 bitsize_int (identity_offset * elsz));
8903 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8904 }
8905 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8906 first_def_nunits, &c) && c == 2)
8907 {
8908 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8909 NULL_TREE, second_def);
8910 perm_stmt = gimple_build_assign (perm_dest, ctor);
8911 }
8912 else
8913 gcc_unreachable ();
8914 }
8915 else
8916 {
8917 /* We need a copy here in case the def was external. */
8918 perm_stmt = gimple_build_assign (perm_dest, first_def);
8919 }
8920 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8921 /* Store the vector statement in NODE. */
8922 node->push_vec_def (perm_stmt);
8923 }
8924
8925 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8926 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8927 If GSI is nonnull, emit the permutation there.
8928
8929 When GSI is null, the only purpose of NODE is to give properties
8930 of the result, such as the vector type and number of SLP lanes.
8931 The node does not need to be a VEC_PERM_EXPR.
8932
8933 If the target supports the operation, return the number of individual
8934 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8935 dump file if DUMP_P is true. */
8936
8937 static int
8938 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8939 slp_tree node, lane_permutation_t &perm,
8940 vec<slp_tree> &children, bool dump_p)
8941 {
8942 tree vectype = SLP_TREE_VECTYPE (node);
8943
8944 /* ??? We currently only support all same vector input types
8945 while the SLP IL should really do a concat + select and thus accept
8946 arbitrary mismatches. */
8947 slp_tree child;
8948 unsigned i;
8949 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8950 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8951 tree op_vectype = NULL_TREE;
8952 FOR_EACH_VEC_ELT (children, i, child)
8953 if (SLP_TREE_VECTYPE (child))
8954 {
8955 op_vectype = SLP_TREE_VECTYPE (child);
8956 break;
8957 }
8958 if (!op_vectype)
8959 op_vectype = vectype;
8960 FOR_EACH_VEC_ELT (children, i, child)
8961 {
8962 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8963 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8964 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8965 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8966 {
8967 if (dump_p)
8968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8969 "Unsupported vector types in lane permutation\n");
8970 return -1;
8971 }
8972 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8973 repeating_p = false;
8974 }
8975
8976 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8977 if (dump_p)
8978 {
8979 dump_printf_loc (MSG_NOTE, vect_location,
8980 "vectorizing permutation");
8981 for (unsigned i = 0; i < perm.length (); ++i)
8982 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8983 if (repeating_p)
8984 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8985 dump_printf (MSG_NOTE, "\n");
8986 }
8987
8988 /* REPEATING_P is true if every output vector is guaranteed to use the
8989 same permute vector. We can handle that case for both variable-length
8990 and constant-length vectors, but we only handle other cases for
8991 constant-length vectors.
8992
8993 Set:
8994
8995 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8996 mask vector that we want to build.
8997
8998 - NCOPIES to the number of copies of PERM that we need in order
8999 to build the necessary permute mask vectors.
9000
9001 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
9002 for each permute mask vector. This is only relevant when GSI is
9003 nonnull. */
9004 uint64_t npatterns;
9005 unsigned nelts_per_pattern;
9006 uint64_t ncopies;
9007 unsigned noutputs_per_mask;
9008 if (repeating_p)
9009 {
9010 /* We need a single permute mask vector that has the form:
9011
9012 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
9013
9014 In other words, the original n-element permute in PERM is
9015 "unrolled" to fill a full vector. The stepped vector encoding
9016 that we use for permutes requires 3n elements. */
9017 npatterns = SLP_TREE_LANES (node);
9018 nelts_per_pattern = ncopies = 3;
9019 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9020 }
9021 else
9022 {
9023 /* Calculate every element of every permute mask vector explicitly,
9024 instead of relying on the pattern described above. */
9025 if (!nunits.is_constant (&npatterns)
9026 || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
9027 return -1;
9028 nelts_per_pattern = ncopies = 1;
9029 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
9030 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
9031 return -1;
9032 noutputs_per_mask = 1;
9033 }
9034 unsigned olanes = ncopies * SLP_TREE_LANES (node);
9035 gcc_assert (repeating_p || multiple_p (olanes, nunits));
9036
9037 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
9038 from the { SLP operand, scalar lane } permutation as recorded in the
9039 SLP node as intermediate step. This part should already work
9040 with SLP children with arbitrary number of lanes. */
9041 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
9042 auto_vec<unsigned> active_lane;
9043 vperm.create (olanes);
9044 active_lane.safe_grow_cleared (children.length (), true);
9045 for (unsigned i = 0; i < ncopies; ++i)
9046 {
9047 for (unsigned pi = 0; pi < perm.length (); ++pi)
9048 {
9049 std::pair<unsigned, unsigned> p = perm[pi];
9050 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
9051 if (repeating_p)
9052 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
9053 else
9054 {
9055 /* We checked above that the vectors are constant-length. */
9056 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
9057 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
9058 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
9059 vperm.quick_push ({{p.first, vi}, vl});
9060 }
9061 }
9062 /* Advance to the next group. */
9063 for (unsigned j = 0; j < children.length (); ++j)
9064 active_lane[j] += SLP_TREE_LANES (children[j]);
9065 }
9066
9067 if (dump_p)
9068 {
9069 dump_printf_loc (MSG_NOTE, vect_location,
9070 "vectorizing permutation");
9071 for (unsigned i = 0; i < perm.length (); ++i)
9072 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
9073 if (repeating_p)
9074 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
9075 dump_printf (MSG_NOTE, "\n");
9076 dump_printf_loc (MSG_NOTE, vect_location, "as");
9077 for (unsigned i = 0; i < vperm.length (); ++i)
9078 {
9079 if (i != 0
9080 && (repeating_p
9081 ? multiple_p (i, npatterns)
9082 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
9083 dump_printf (MSG_NOTE, ",");
9084 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
9085 vperm[i].first.first, vperm[i].first.second,
9086 vperm[i].second);
9087 }
9088 dump_printf (MSG_NOTE, "\n");
9089 }
9090
9091 /* We can only handle two-vector permutes, everything else should
9092 be lowered on the SLP level. The following is closely inspired
9093 by vect_transform_slp_perm_load and is supposed to eventually
9094 replace it.
9095 ??? As intermediate step do code-gen in the SLP tree representation
9096 somehow? */
9097 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
9098 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
9099 unsigned int index = 0;
9100 poly_uint64 mask_element;
9101 vec_perm_builder mask;
9102 mask.new_vector (nunits, npatterns, nelts_per_pattern);
9103 unsigned int count = mask.encoded_nelts ();
9104 mask.quick_grow (count);
9105 vec_perm_indices indices;
9106 unsigned nperms = 0;
9107 for (unsigned i = 0; i < vperm.length (); ++i)
9108 {
9109 mask_element = vperm[i].second;
9110 if (first_vec.first == -1U
9111 || first_vec == vperm[i].first)
9112 first_vec = vperm[i].first;
9113 else if (second_vec.first == -1U
9114 || second_vec == vperm[i].first)
9115 {
9116 second_vec = vperm[i].first;
9117 mask_element += nunits;
9118 }
9119 else
9120 {
9121 if (dump_p)
9122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9123 "permutation requires at "
9124 "least three vectors\n");
9125 gcc_assert (!gsi);
9126 return -1;
9127 }
9128
9129 mask[index++] = mask_element;
9130
9131 if (index == count)
9132 {
9133 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
9134 TYPE_VECTOR_SUBPARTS (op_vectype));
9135 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
9136 && constant_multiple_p (mask[0], nunits));
9137 machine_mode vmode = TYPE_MODE (vectype);
9138 machine_mode op_vmode = TYPE_MODE (op_vectype);
9139 unsigned HOST_WIDE_INT c;
9140 if ((!identity_p
9141 && !can_vec_perm_const_p (vmode, op_vmode, indices))
9142 || (identity_p
9143 && !known_le (nunits,
9144 TYPE_VECTOR_SUBPARTS (op_vectype))
9145 && (!constant_multiple_p (nunits,
9146 TYPE_VECTOR_SUBPARTS (op_vectype),
9147 &c) || c != 2)))
9148 {
9149 if (dump_p)
9150 {
9151 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
9152 vect_location,
9153 "unsupported vect permute { ");
9154 for (i = 0; i < count; ++i)
9155 {
9156 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
9157 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
9158 }
9159 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
9160 }
9161 gcc_assert (!gsi);
9162 return -1;
9163 }
9164
9165 if (!identity_p)
9166 nperms++;
9167 if (gsi)
9168 {
9169 if (second_vec.first == -1U)
9170 second_vec = first_vec;
9171
9172 slp_tree
9173 first_node = children[first_vec.first],
9174 second_node = children[second_vec.first];
9175
9176 tree mask_vec = NULL_TREE;
9177 if (!identity_p)
9178 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
9179
9180 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
9181 {
9182 tree first_def
9183 = vect_get_slp_vect_def (first_node,
9184 first_vec.second + vi);
9185 tree second_def
9186 = vect_get_slp_vect_def (second_node,
9187 second_vec.second + vi);
9188 vect_add_slp_permutation (vinfo, gsi, node, first_def,
9189 second_def, mask_vec, mask[0]);
9190 }
9191 }
9192
9193 index = 0;
9194 first_vec = std::make_pair (-1U, -1U);
9195 second_vec = std::make_pair (-1U, -1U);
9196 }
9197 }
9198
9199 return nperms;
9200 }
9201
9202 /* Vectorize the SLP permutations in NODE as specified
9203 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
9204 child number and lane number.
9205 Interleaving of two two-lane two-child SLP subtrees (not supported):
9206 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
9207 A blend of two four-lane two-child SLP subtrees:
9208 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
9209 Highpart of a four-lane one-child SLP subtree (not supported):
9210 [ { 0, 2 }, { 0, 3 } ]
9211 Where currently only a subset is supported by code generating below. */
9212
9213 static bool
9214 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
9215 slp_tree node, stmt_vector_for_cost *cost_vec)
9216 {
9217 tree vectype = SLP_TREE_VECTYPE (node);
9218 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
9219 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
9220 SLP_TREE_CHILDREN (node),
9221 dump_enabled_p ());
9222 if (nperms < 0)
9223 return false;
9224
9225 if (!gsi)
9226 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
9227
9228 return true;
9229 }
9230
9231 /* Vectorize SLP NODE. */
9232
9233 static void
9234 vect_schedule_slp_node (vec_info *vinfo,
9235 slp_tree node, slp_instance instance)
9236 {
9237 gimple_stmt_iterator si;
9238 int i;
9239 slp_tree child;
9240
9241 /* Vectorize externals and constants. */
9242 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
9243 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
9244 {
9245 /* ??? vectorizable_shift can end up using a scalar operand which is
9246 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
9247 node in this case. */
9248 if (!SLP_TREE_VECTYPE (node))
9249 return;
9250
9251 /* There are two reasons vector defs might already exist. The first
9252 is that we are vectorizing an existing vector def. The second is
9253 when performing BB vectorization shared constant/external nodes
9254 are not split apart during partitioning so during the code-gen
9255 DFS walk we can end up visiting them twice. */
9256 if (! SLP_TREE_VEC_DEFS (node).exists ())
9257 vect_create_constant_vectors (vinfo, node);
9258 return;
9259 }
9260
9261 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
9262
9263 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
9264
9265 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
9266 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
9267
9268 if (dump_enabled_p ())
9269 dump_printf_loc (MSG_NOTE, vect_location,
9270 "------>vectorizing SLP node starting from: %G",
9271 stmt_info->stmt);
9272
9273 if (STMT_VINFO_DATA_REF (stmt_info)
9274 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9275 {
9276 /* Vectorized loads go before the first scalar load to make it
9277 ready early, vectorized stores go before the last scalar
9278 stmt which is where all uses are ready. */
9279 stmt_vec_info last_stmt_info = NULL;
9280 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
9281 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
9282 else /* DR_IS_WRITE */
9283 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
9284 si = gsi_for_stmt (last_stmt_info->stmt);
9285 }
9286 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
9287 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
9288 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
9289 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9290 {
9291 /* For PHI node vectorization we do not use the insertion iterator. */
9292 si = gsi_none ();
9293 }
9294 else
9295 {
9296 /* Emit other stmts after the children vectorized defs which is
9297 earliest possible. */
9298 gimple *last_stmt = NULL;
9299 if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
9300 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
9301 || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
9302 {
9303 /* But avoid scheduling internal defs outside of the loop when
9304 we might have only implicitly tracked loop mask/len defs. */
9305 gimple_stmt_iterator si
9306 = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
9307 last_stmt = *si;
9308 }
9309 bool seen_vector_def = false;
9310 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9311 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9312 {
9313 /* For fold-left reductions we are retaining the scalar
9314 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
9315 set so the representation isn't perfect. Resort to the
9316 last scalar def here. */
9317 if (SLP_TREE_VEC_DEFS (child).is_empty ())
9318 {
9319 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
9320 == cycle_phi_info_type);
9321 gphi *phi = as_a <gphi *>
9322 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
9323 if (!last_stmt
9324 || vect_stmt_dominates_stmt_p (last_stmt, phi))
9325 last_stmt = phi;
9326 }
9327 /* We are emitting all vectorized stmts in the same place and
9328 the last one is the last.
9329 ??? Unless we have a load permutation applied and that
9330 figures to re-use an earlier generated load. */
9331 unsigned j;
9332 tree vdef;
9333 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9334 {
9335 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9336 if (!last_stmt
9337 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9338 last_stmt = vstmt;
9339 }
9340 }
9341 else if (!SLP_TREE_VECTYPE (child))
9342 {
9343 /* For externals we use unvectorized at all scalar defs. */
9344 unsigned j;
9345 tree def;
9346 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9347 if (TREE_CODE (def) == SSA_NAME
9348 && !SSA_NAME_IS_DEFAULT_DEF (def))
9349 {
9350 gimple *stmt = SSA_NAME_DEF_STMT (def);
9351 if (!last_stmt
9352 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9353 last_stmt = stmt;
9354 }
9355 }
9356 else
9357 {
9358 /* For externals we have to look at all defs since their
9359 insertion place is decided per vector. But beware
9360 of pre-existing vectors where we need to make sure
9361 we do not insert before the region boundary. */
9362 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9363 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9364 seen_vector_def = true;
9365 else
9366 {
9367 unsigned j;
9368 tree vdef;
9369 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9370 if (TREE_CODE (vdef) == SSA_NAME
9371 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9372 {
9373 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9374 if (!last_stmt
9375 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9376 last_stmt = vstmt;
9377 }
9378 }
9379 }
9380 /* This can happen when all children are pre-existing vectors or
9381 constants. */
9382 if (!last_stmt)
9383 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9384 if (!last_stmt)
9385 {
9386 gcc_assert (seen_vector_def);
9387 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9388 }
9389 else if (is_ctrl_altering_stmt (last_stmt))
9390 {
9391 /* We split regions to vectorize at control altering stmts
9392 with a definition so this must be an external which
9393 we can insert at the start of the region. */
9394 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9395 }
9396 else if (is_a <bb_vec_info> (vinfo)
9397 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9398 && gimple_could_trap_p (stmt_info->stmt))
9399 {
9400 /* We've constrained possibly trapping operations to all come
9401 from the same basic-block, if vectorized defs would allow earlier
9402 scheduling still force vectorized stmts to the original block.
9403 This is only necessary for BB vectorization since for loop vect
9404 all operations are in a single BB and scalar stmt based
9405 placement doesn't play well with epilogue vectorization. */
9406 gcc_assert (dominated_by_p (CDI_DOMINATORS,
9407 gimple_bb (stmt_info->stmt),
9408 gimple_bb (last_stmt)));
9409 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9410 }
9411 else if (is_a <gphi *> (last_stmt))
9412 si = gsi_after_labels (gimple_bb (last_stmt));
9413 else
9414 {
9415 si = gsi_for_stmt (last_stmt);
9416 gsi_next (&si);
9417 }
9418 }
9419
9420 /* Handle purely internal nodes. */
9421 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9422 {
9423 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
9424 be shared with different SLP nodes (but usually it's the same
9425 operation apart from the case the stmt is only there for denoting
9426 the actual scalar lane defs ...). So do not call vect_transform_stmt
9427 but open-code it here (partly). */
9428 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9429 gcc_assert (done);
9430 stmt_vec_info slp_stmt_info;
9431 unsigned int i;
9432 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9433 if (STMT_VINFO_LIVE_P (slp_stmt_info))
9434 {
9435 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9436 instance, i, true, NULL);
9437 gcc_assert (done);
9438 }
9439 }
9440 else
9441 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9442 }
9443
9444 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
9445 For loop vectorization this is done in vectorizable_call, but for SLP
9446 it needs to be deferred until end of vect_schedule_slp, because multiple
9447 SLP instances may refer to the same scalar stmt. */
9448
9449 static void
9450 vect_remove_slp_scalar_calls (vec_info *vinfo,
9451 slp_tree node, hash_set<slp_tree> &visited)
9452 {
9453 gimple *new_stmt;
9454 gimple_stmt_iterator gsi;
9455 int i;
9456 slp_tree child;
9457 tree lhs;
9458 stmt_vec_info stmt_info;
9459
9460 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9461 return;
9462
9463 if (visited.add (node))
9464 return;
9465
9466 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9467 vect_remove_slp_scalar_calls (vinfo, child, visited);
9468
9469 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9470 {
9471 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9472 if (!stmt || gimple_bb (stmt) == NULL)
9473 continue;
9474 if (is_pattern_stmt_p (stmt_info)
9475 || !PURE_SLP_STMT (stmt_info))
9476 continue;
9477 lhs = gimple_call_lhs (stmt);
9478 if (lhs)
9479 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9480 else
9481 {
9482 new_stmt = gimple_build_nop ();
9483 unlink_stmt_vdef (stmt_info->stmt);
9484 }
9485 gsi = gsi_for_stmt (stmt);
9486 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9487 if (lhs)
9488 SSA_NAME_DEF_STMT (lhs) = new_stmt;
9489 }
9490 }
9491
9492 static void
9493 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9494 {
9495 hash_set<slp_tree> visited;
9496 vect_remove_slp_scalar_calls (vinfo, node, visited);
9497 }
9498
9499 /* Vectorize the instance root. */
9500
9501 void
9502 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9503 {
9504 gassign *rstmt = NULL;
9505
9506 if (instance->kind == slp_inst_kind_ctor)
9507 {
9508 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9509 {
9510 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9511 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9512 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9513 TREE_TYPE (vect_lhs)))
9514 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9515 vect_lhs);
9516 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9517 }
9518 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9519 {
9520 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9521 tree child_def;
9522 int j;
9523 vec<constructor_elt, va_gc> *v;
9524 vec_alloc (v, nelts);
9525
9526 /* A CTOR can handle V16HI composition from VNx8HI so we
9527 do not need to convert vector elements if the types
9528 do not match. */
9529 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9530 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9531 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9532 tree rtype
9533 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9534 tree r_constructor = build_constructor (rtype, v);
9535 rstmt = gimple_build_assign (lhs, r_constructor);
9536 }
9537 }
9538 else if (instance->kind == slp_inst_kind_bb_reduc)
9539 {
9540 /* Largely inspired by reduction chain epilogue handling in
9541 vect_create_epilog_for_reduction. */
9542 vec<tree> vec_defs = vNULL;
9543 vect_get_slp_defs (node, &vec_defs);
9544 enum tree_code reduc_code
9545 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9546 /* ??? We actually have to reflect signs somewhere. */
9547 if (reduc_code == MINUS_EXPR)
9548 reduc_code = PLUS_EXPR;
9549 gimple_seq epilogue = NULL;
9550 /* We may end up with more than one vector result, reduce them
9551 to one vector. */
9552 tree vec_def = vec_defs[0];
9553 tree vectype = TREE_TYPE (vec_def);
9554 tree compute_vectype = vectype;
9555 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9556 && TYPE_OVERFLOW_UNDEFINED (vectype)
9557 && operation_can_overflow (reduc_code));
9558 if (pun_for_overflow_p)
9559 {
9560 compute_vectype = unsigned_type_for (vectype);
9561 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9562 compute_vectype, vec_def);
9563 }
9564 for (unsigned i = 1; i < vec_defs.length (); ++i)
9565 {
9566 tree def = vec_defs[i];
9567 if (pun_for_overflow_p)
9568 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9569 compute_vectype, def);
9570 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9571 vec_def, def);
9572 }
9573 vec_defs.release ();
9574 /* ??? Support other schemes than direct internal fn. */
9575 internal_fn reduc_fn;
9576 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9577 || reduc_fn == IFN_LAST)
9578 gcc_unreachable ();
9579 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9580 TREE_TYPE (compute_vectype), vec_def);
9581 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9582 {
9583 tree rem_def = NULL_TREE;
9584 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9585 {
9586 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9587 if (!rem_def)
9588 rem_def = def;
9589 else
9590 rem_def = gimple_build (&epilogue, reduc_code,
9591 TREE_TYPE (scalar_def),
9592 rem_def, def);
9593 }
9594 scalar_def = gimple_build (&epilogue, reduc_code,
9595 TREE_TYPE (scalar_def),
9596 scalar_def, rem_def);
9597 }
9598 scalar_def = gimple_convert (&epilogue,
9599 TREE_TYPE (vectype), scalar_def);
9600 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9601 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9602 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9603 update_stmt (gsi_stmt (rgsi));
9604 return;
9605 }
9606 else
9607 gcc_unreachable ();
9608
9609 gcc_assert (rstmt);
9610
9611 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9612 gsi_replace (&rgsi, rstmt, true);
9613 }
9614
9615 struct slp_scc_info
9616 {
9617 bool on_stack;
9618 int dfs;
9619 int lowlink;
9620 };
9621
9622 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
9623
9624 static void
9625 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9626 hash_map<slp_tree, slp_scc_info> &scc_info,
9627 int &maxdfs, vec<slp_tree> &stack)
9628 {
9629 bool existed_p;
9630 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9631 gcc_assert (!existed_p);
9632 info->dfs = maxdfs;
9633 info->lowlink = maxdfs;
9634 maxdfs++;
9635
9636 /* Leaf. */
9637 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9638 {
9639 info->on_stack = false;
9640 vect_schedule_slp_node (vinfo, node, instance);
9641 return;
9642 }
9643
9644 info->on_stack = true;
9645 stack.safe_push (node);
9646
9647 unsigned i;
9648 slp_tree child;
9649 /* DFS recurse. */
9650 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9651 {
9652 if (!child)
9653 continue;
9654 slp_scc_info *child_info = scc_info.get (child);
9655 if (!child_info)
9656 {
9657 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9658 /* Recursion might have re-allocated the node. */
9659 info = scc_info.get (node);
9660 child_info = scc_info.get (child);
9661 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9662 }
9663 else if (child_info->on_stack)
9664 info->lowlink = MIN (info->lowlink, child_info->dfs);
9665 }
9666 if (info->lowlink != info->dfs)
9667 return;
9668
9669 auto_vec<slp_tree, 4> phis_to_fixup;
9670
9671 /* Singleton. */
9672 if (stack.last () == node)
9673 {
9674 stack.pop ();
9675 info->on_stack = false;
9676 vect_schedule_slp_node (vinfo, node, instance);
9677 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9678 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9679 phis_to_fixup.quick_push (node);
9680 }
9681 else
9682 {
9683 /* SCC. */
9684 int last_idx = stack.length () - 1;
9685 while (stack[last_idx] != node)
9686 last_idx--;
9687 /* We can break the cycle at PHIs who have at least one child
9688 code generated. Then we could re-start the DFS walk until
9689 all nodes in the SCC are covered (we might have new entries
9690 for only back-reachable nodes). But it's simpler to just
9691 iterate and schedule those that are ready. */
9692 unsigned todo = stack.length () - last_idx;
9693 do
9694 {
9695 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9696 {
9697 slp_tree entry = stack[idx];
9698 if (!entry)
9699 continue;
9700 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9701 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9702 bool ready = !phi;
9703 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9704 if (!child)
9705 {
9706 gcc_assert (phi);
9707 ready = true;
9708 break;
9709 }
9710 else if (scc_info.get (child)->on_stack)
9711 {
9712 if (!phi)
9713 {
9714 ready = false;
9715 break;
9716 }
9717 }
9718 else
9719 {
9720 if (phi)
9721 {
9722 ready = true;
9723 break;
9724 }
9725 }
9726 if (ready)
9727 {
9728 vect_schedule_slp_node (vinfo, entry, instance);
9729 scc_info.get (entry)->on_stack = false;
9730 stack[idx] = NULL;
9731 todo--;
9732 if (phi)
9733 phis_to_fixup.safe_push (entry);
9734 }
9735 }
9736 }
9737 while (todo != 0);
9738
9739 /* Pop the SCC. */
9740 stack.truncate (last_idx);
9741 }
9742
9743 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9744 slp_tree phi_node;
9745 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9746 {
9747 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9748 edge_iterator ei;
9749 edge e;
9750 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9751 {
9752 unsigned dest_idx = e->dest_idx;
9753 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9754 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9755 continue;
9756 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9757 /* Simply fill all args. */
9758 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9759 != vect_first_order_recurrence)
9760 for (unsigned i = 0; i < n; ++i)
9761 {
9762 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9763 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9764 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9765 e, gimple_phi_arg_location (phi, dest_idx));
9766 }
9767 else
9768 {
9769 /* Unless it is a first order recurrence which needs
9770 args filled in for both the PHI node and the permutes. */
9771 gimple *perm
9772 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9773 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9774 add_phi_arg (as_a <gphi *> (rphi),
9775 vect_get_slp_vect_def (child, n - 1),
9776 e, gimple_phi_arg_location (phi, dest_idx));
9777 for (unsigned i = 0; i < n; ++i)
9778 {
9779 gimple *perm
9780 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9781 if (i > 0)
9782 gimple_assign_set_rhs1 (perm,
9783 vect_get_slp_vect_def (child, i - 1));
9784 gimple_assign_set_rhs2 (perm,
9785 vect_get_slp_vect_def (child, i));
9786 update_stmt (perm);
9787 }
9788 }
9789 }
9790 }
9791 }
9792
9793 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9794
9795 void
9796 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9797 {
9798 slp_instance instance;
9799 unsigned int i;
9800
9801 hash_map<slp_tree, slp_scc_info> scc_info;
9802 int maxdfs = 0;
9803 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9804 {
9805 slp_tree node = SLP_INSTANCE_TREE (instance);
9806 if (dump_enabled_p ())
9807 {
9808 dump_printf_loc (MSG_NOTE, vect_location,
9809 "Vectorizing SLP tree:\n");
9810 /* ??? Dump all? */
9811 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9812 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9813 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9814 vect_print_slp_graph (MSG_NOTE, vect_location,
9815 SLP_INSTANCE_TREE (instance));
9816 }
9817 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
9818 have a PHI be the node breaking the cycle. */
9819 auto_vec<slp_tree> stack;
9820 if (!scc_info.get (node))
9821 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9822
9823 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9824 vectorize_slp_instance_root_stmt (node, instance);
9825
9826 if (dump_enabled_p ())
9827 dump_printf_loc (MSG_NOTE, vect_location,
9828 "vectorizing stmts using SLP.\n");
9829 }
9830
9831 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9832 {
9833 slp_tree root = SLP_INSTANCE_TREE (instance);
9834 stmt_vec_info store_info;
9835 unsigned int j;
9836
9837 /* Remove scalar call stmts. Do not do this for basic-block
9838 vectorization as not all uses may be vectorized.
9839 ??? Why should this be necessary? DCE should be able to
9840 remove the stmts itself.
9841 ??? For BB vectorization we can as well remove scalar
9842 stmts starting from the SLP tree root if they have no
9843 uses. */
9844 if (is_a <loop_vec_info> (vinfo))
9845 vect_remove_slp_scalar_calls (vinfo, root);
9846
9847 /* Remove vectorized stores original scalar stmts. */
9848 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9849 {
9850 if (!STMT_VINFO_DATA_REF (store_info)
9851 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9852 break;
9853
9854 store_info = vect_orig_stmt (store_info);
9855 /* Free the attached stmt_vec_info and remove the stmt. */
9856 vinfo->remove_stmt (store_info);
9857
9858 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
9859 to not crash in vect_free_slp_tree later. */
9860 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9861 SLP_TREE_REPRESENTATIVE (root) = NULL;
9862 }
9863 }
9864 }