gcc/tree-vect-slp.c
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
52
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 slp_tree, stmt_vector_for_cost *);
55
56 static object_allocator<_slp_tree> *slp_tree_pool;
57 static slp_tree slp_first_node;
58
59 void
60 vect_slp_init (void)
61 {
62 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
63 }
64
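/* Release the pool of SLP nodes.  Any nodes still live at this point are
   deleted through the slp_first_node chain that the _slp_tree constructor
   and destructor below maintain.  */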
65 void
66 vect_slp_fini (void)
67 {
68 while (slp_first_node)
69 delete slp_first_node;
70 delete slp_tree_pool;
71 slp_tree_pool = NULL;
72 }
73
74 void *
75 _slp_tree::operator new (size_t n)
76 {
77 gcc_assert (n == sizeof (_slp_tree));
78 return slp_tree_pool->allocate_raw ();
79 }
80
81 void
82 _slp_tree::operator delete (void *node, size_t n)
83 {
84 gcc_assert (n == sizeof (_slp_tree));
85 slp_tree_pool->remove_raw (node);
86 }
87
88
89 /* Initialize a SLP node. */
90
91 _slp_tree::_slp_tree ()
92 {
93 this->prev_node = NULL;
94 if (slp_first_node)
95 slp_first_node->prev_node = this;
96 this->next_node = slp_first_node;
97 slp_first_node = this;
98 SLP_TREE_SCALAR_STMTS (this) = vNULL;
99 SLP_TREE_SCALAR_OPS (this) = vNULL;
100 SLP_TREE_VEC_STMTS (this) = vNULL;
101 SLP_TREE_VEC_DEFS (this) = vNULL;
102 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
103 SLP_TREE_CHILDREN (this) = vNULL;
104 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
105 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
106 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
107 SLP_TREE_CODE (this) = ERROR_MARK;
108 SLP_TREE_VECTYPE (this) = NULL_TREE;
109 SLP_TREE_REPRESENTATIVE (this) = NULL;
110 SLP_TREE_REF_COUNT (this) = 1;
111 this->max_nunits = 1;
112 this->lanes = 0;
113 }
114
115 /* Tear down a SLP node. */
116
117 _slp_tree::~_slp_tree ()
118 {
119 if (this->prev_node)
120 this->prev_node->next_node = this->next_node;
121 else
122 slp_first_node = this->next_node;
123 if (this->next_node)
124 this->next_node->prev_node = this->prev_node;
125 SLP_TREE_CHILDREN (this).release ();
126 SLP_TREE_SCALAR_STMTS (this).release ();
127 SLP_TREE_SCALAR_OPS (this).release ();
128 SLP_TREE_VEC_STMTS (this).release ();
129 SLP_TREE_VEC_DEFS (this).release ();
130 SLP_TREE_LOAD_PERMUTATION (this).release ();
131 SLP_TREE_LANE_PERMUTATION (this).release ();
132 }
133
134 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
135
136 void
137 vect_free_slp_tree (slp_tree node)
138 {
139 int i;
140 slp_tree child;
141
142 if (--SLP_TREE_REF_COUNT (node) != 0)
143 return;
144
145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
146 if (child)
147 vect_free_slp_tree (child);
148
149 delete node;
150 }
151
152 /* Return a location suitable for dumps related to the SLP instance. */
153
154 dump_user_location_t
155 _slp_instance::location () const
156 {
157 if (root_stmt)
158 return root_stmt->stmt;
159 else
160 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
161 }
162
163
164 /* Free the memory allocated for the SLP instance. */
165
166 void
167 vect_free_slp_instance (slp_instance instance)
168 {
169 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
170 SLP_INSTANCE_LOADS (instance).release ();
171 instance->subgraph_entries.release ();
172 instance->cost_vec.release ();
173 free (instance);
174 }
175
176
177 /* Create a new SLP node with room for NOPS children, using operation code CODE. */
178
179 slp_tree
180 vect_create_new_slp_node (unsigned nops, tree_code code)
181 {
182 slp_tree node = new _slp_tree;
183 SLP_TREE_SCALAR_STMTS (node) = vNULL;
184 SLP_TREE_CHILDREN (node).create (nops);
185 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
186 SLP_TREE_CODE (node) = code;
187 return node;
188 }
189 /* Create an SLP node for SCALAR_STMTS. */
190
191 static slp_tree
192 vect_create_new_slp_node (slp_tree node,
193 vec<stmt_vec_info> scalar_stmts, unsigned nops)
194 {
195 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
196 SLP_TREE_CHILDREN (node).create (nops);
197 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
198 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
199 SLP_TREE_LANES (node) = scalar_stmts.length ();
200 return node;
201 }
202
203 /* Create an SLP node for SCALAR_STMTS. */
204
205 static slp_tree
206 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
207 {
208 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
209 }
210
211 /* Create an SLP node for OPS. */
212
213 static slp_tree
214 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
215 {
216 SLP_TREE_SCALAR_OPS (node) = ops;
217 SLP_TREE_DEF_TYPE (node) = vect_external_def;
218 SLP_TREE_LANES (node) = ops.length ();
219 return node;
220 }
221
222 /* Create an SLP node for OPS. */
223
224 static slp_tree
225 vect_create_new_slp_node (vec<tree> ops)
226 {
227 return vect_create_new_slp_node (new _slp_tree, ops);
228 }
229
230
231 /* This structure is used in creation of an SLP tree. Each instance
232 corresponds to the same operand in a group of scalar stmts in an SLP
233 node. */
234 typedef struct _slp_oprnd_info
235 {
236 /* Def-stmts for the operands. */
237 vec<stmt_vec_info> def_stmts;
238 /* Operands. */
239 vec<tree> ops;
240 /* Information about the first statement: its vector def-type, the
241 type of its first operand, and an indication whether any of the
242 defs comes from a pattern stmt. */
243 tree first_op_type;
244 enum vect_def_type first_dt;
245 bool any_pattern;
246 } *slp_oprnd_info;
247
248
249 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
250 operand. */
251 static vec<slp_oprnd_info>
252 vect_create_oprnd_info (int nops, int group_size)
253 {
254 int i;
255 slp_oprnd_info oprnd_info;
256 vec<slp_oprnd_info> oprnds_info;
257
258 oprnds_info.create (nops);
259 for (i = 0; i < nops; i++)
260 {
261 oprnd_info = XNEW (struct _slp_oprnd_info);
262 oprnd_info->def_stmts.create (group_size);
263 oprnd_info->ops.create (group_size);
264 oprnd_info->first_dt = vect_uninitialized_def;
265 oprnd_info->first_op_type = NULL_TREE;
266 oprnd_info->any_pattern = false;
267 oprnds_info.quick_push (oprnd_info);
268 }
269
270 return oprnds_info;
271 }
272
273
274 /* Free operands info. */
275
276 static void
277 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
278 {
279 int i;
280 slp_oprnd_info oprnd_info;
281
282 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
283 {
284 oprnd_info->def_stmts.release ();
285 oprnd_info->ops.release ();
286 XDELETE (oprnd_info);
287 }
288
289 oprnds_info.release ();
290 }
291
292
293 /* Return true if STMTS contains a pattern statement. */
294
295 static bool
296 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
297 {
298 stmt_vec_info stmt_info;
299 unsigned int i;
300 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
301 if (is_pattern_stmt_p (stmt_info))
302 return true;
303 return false;
304 }
305
306 /* Return true when all lanes in the external or constant NODE have
307 the same value. */
308
309 static bool
310 vect_slp_tree_uniform_p (slp_tree node)
311 {
312 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
313 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
314
315 /* Pre-existing vectors. */
316 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
317 return false;
318
319 unsigned i;
320 tree op, first = NULL_TREE;
321 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
322 if (!first)
323 first = op;
324 else if (!operand_equal_p (first, op, 0))
325 return false;
326
327 return true;
328 }
329
330 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
331 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
332 of the chain. */
333
334 int
335 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
336 stmt_vec_info first_stmt_info)
337 {
338 stmt_vec_info next_stmt_info = first_stmt_info;
339 int result = 0;
340
341 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
342 return -1;
343
344 do
345 {
346 if (next_stmt_info == stmt_info)
347 return result;
348 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
349 if (next_stmt_info)
350 result += DR_GROUP_GAP (next_stmt_info);
351 }
352 while (next_stmt_info);
353
354 return -1;
355 }
356
357 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
358 using the method implemented by duplicate_and_interleave. Return true
359 if so, returning the number of intermediate vectors in *NVECTORS_OUT
360 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
361 (if nonnull). */
362
363 bool
364 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
365 tree elt_type, unsigned int *nvectors_out,
366 tree *vector_type_out,
367 tree *permutes)
368 {
369 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
370 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
371 return false;
372
373 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
374 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
375 unsigned int nvectors = 1;
376 for (;;)
377 {
378 scalar_int_mode int_mode;
379 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
380 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
381 {
382 /* Get the natural vector type for this SLP group size. */
383 tree int_type = build_nonstandard_integer_type
384 (GET_MODE_BITSIZE (int_mode), 1);
385 tree vector_type
386 = get_vectype_for_scalar_type (vinfo, int_type, count);
387 if (vector_type
388 && VECTOR_MODE_P (TYPE_MODE (vector_type))
389 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
390 GET_MODE_SIZE (base_vector_mode)))
391 {
392 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
393 together into elements of type INT_TYPE and using the result
394 to build NVECTORS vectors. */
395 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
396 vec_perm_builder sel1 (nelts, 2, 3);
397 vec_perm_builder sel2 (nelts, 2, 3);
398 poly_int64 half_nelts = exact_div (nelts, 2);
399 for (unsigned int i = 0; i < 3; ++i)
400 {
401 sel1.quick_push (i);
402 sel1.quick_push (i + nelts);
403 sel2.quick_push (half_nelts + i);
404 sel2.quick_push (half_nelts + i + nelts);
405 }
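/* For example, with NELTS == 4 the two encodings above expand to
   sel1 = { 0, 4, 1, 5 } and sel2 = { 2, 6, 3, 7 }, i.e. the usual
   interleave-low and interleave-high permutations of two input vectors.  */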
406 vec_perm_indices indices1 (sel1, 2, nelts);
407 vec_perm_indices indices2 (sel2, 2, nelts);
408 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
409 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
410 {
411 if (nvectors_out)
412 *nvectors_out = nvectors;
413 if (vector_type_out)
414 *vector_type_out = vector_type;
415 if (permutes)
416 {
417 permutes[0] = vect_gen_perm_mask_checked (vector_type,
418 indices1);
419 permutes[1] = vect_gen_perm_mask_checked (vector_type,
420 indices2);
421 }
422 return true;
423 }
424 }
425 }
426 if (!multiple_p (elt_bytes, 2, &elt_bytes))
427 return false;
428 nvectors *= 2;
429 }
430 }
431
432 /* Return true if DTA and DTB match. */
433
434 static bool
435 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
436 {
437 return (dta == dtb
438 || ((dta == vect_external_def || dta == vect_constant_def)
439 && (dtb == vect_external_def || dtb == vect_constant_def)));
440 }
441
442 /* Get the defs for the rhs of STMTS[STMT_NUM] (collect them in
443 OPRNDS_INFO), check that they are of a valid type and that they match
444 the defs of the first stmt of the SLP group (stored in OPRNDS_INFO).
445 This function tries to match stmts by swapping operands of
446 STMTS[STMT_NUM] when possible. Non-zero SWAP indicates that a swap
447 is required for cond_expr stmts. Specifically, SWAP is 1 if STMT is
448 a cond and the operands of the comparison need to be swapped; SWAP
449 is 2 if STMT is a cond and the code of the comparison needs to be
450 inverted.
451 If there was a fatal error return -1; if the mismatch could be corrected
452 by swapping operands of the parent node of this one, return 1; if
453 everything is ok return 0. */
454 static int
455 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
456 bool *skip_args,
457 vec<stmt_vec_info> stmts, unsigned stmt_num,
458 vec<slp_oprnd_info> *oprnds_info)
459 {
460 stmt_vec_info stmt_info = stmts[stmt_num];
461 tree oprnd;
462 unsigned int i, number_of_oprnds;
463 enum vect_def_type dt = vect_uninitialized_def;
464 slp_oprnd_info oprnd_info;
465 int first_op_idx = 1;
466 unsigned int commutative_op = -1U;
467 bool first_op_cond = false;
468 bool first = stmt_num == 0;
469
470 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
471 {
472 number_of_oprnds = gimple_call_num_args (stmt);
473 first_op_idx = 3;
474 if (gimple_call_internal_p (stmt))
475 {
476 internal_fn ifn = gimple_call_internal_fn (stmt);
477 commutative_op = first_commutative_argument (ifn);
478
479 /* Masked load, only look at mask. */
480 if (ifn == IFN_MASK_LOAD)
481 {
482 number_of_oprnds = 1;
483 /* Mask operand index. */
484 first_op_idx = 5;
485 }
486 }
487 }
488 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
489 {
490 enum tree_code code = gimple_assign_rhs_code (stmt);
491 number_of_oprnds = gimple_num_ops (stmt) - 1;
492 /* Swapping can only be done for cond_expr if asked to, otherwise we
493 could end up with a comparison code different from the first stmt. */
494 if (code == COND_EXPR
495 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
496 {
497 first_op_cond = true;
498 number_of_oprnds++;
499 }
500 else
501 commutative_op = commutative_tree_code (code) ? 0U : -1U;
502 }
503 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
504 number_of_oprnds = gimple_phi_num_args (stmt);
505 else
506 return -1;
507
508 bool swapped = (swap != 0);
509 bool backedge = false;
510 gcc_assert (!swapped || first_op_cond);
511 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
512 for (i = 0; i < number_of_oprnds; i++)
513 {
514 if (first_op_cond)
515 {
516 /* Map indicating how operands of cond_expr should be swapped. */
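/* Row SWAP == 0 keeps the original order, SWAP == 1 swaps the two
   operands of the embedded comparison, and SWAP == 2 swaps the
   then/else arms (used together with an inverted comparison code).  */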
517 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
518 int *map = maps[swap];
519
520 if (i < 2)
521 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
522 first_op_idx), map[i]);
523 else
524 oprnd = gimple_op (stmt_info->stmt, map[i]);
525 }
526 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
527 {
528 oprnd = gimple_phi_arg_def (stmt, i);
529 backedge = dominated_by_p (CDI_DOMINATORS,
530 gimple_phi_arg_edge (stmt, i)->src,
531 gimple_bb (stmt_info->stmt));
532 }
533 else
534 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
535 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
536 oprnd = TREE_OPERAND (oprnd, 0);
537
538 oprnd_info = (*oprnds_info)[i];
539
540 stmt_vec_info def_stmt_info;
541 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
542 {
543 if (dump_enabled_p ())
544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
545 "Build SLP failed: can't analyze def for %T\n",
546 oprnd);
547
548 return -1;
549 }
550
551 if (skip_args[i])
552 {
553 oprnd_info->def_stmts.quick_push (NULL);
554 oprnd_info->ops.quick_push (NULL_TREE);
555 oprnd_info->first_dt = vect_uninitialized_def;
556 continue;
557 }
558
559 oprnd_info->def_stmts.quick_push (def_stmt_info);
560 oprnd_info->ops.quick_push (oprnd);
561
562 if (def_stmt_info
563 && is_pattern_stmt_p (def_stmt_info))
564 {
565 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
566 != def_stmt_info)
567 oprnd_info->any_pattern = true;
568 else
569 /* If we promote this to external use the original stmt def. */
570 oprnd_info->ops.last ()
571 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
572 }
573
574 /* If there's an extern def on a backedge make sure we can
575 code-generate at the region start.
576 ??? This is another case that could be fixed by adjusting
577 how we split the function, but at the moment we'd have conflicting
578 goals there. */
579 if (backedge
580 && dts[i] == vect_external_def
581 && is_a <bb_vec_info> (vinfo)
582 && TREE_CODE (oprnd) == SSA_NAME
583 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
584 && !dominated_by_p (CDI_DOMINATORS,
585 as_a <bb_vec_info> (vinfo)->bbs[0],
586 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
587 {
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
590 "Build SLP failed: extern def %T only defined "
591 "on backedge\n", oprnd);
592 return -1;
593 }
594
595 if (first)
596 {
597 tree type = TREE_TYPE (oprnd);
598 dt = dts[i];
599 if ((dt == vect_constant_def
600 || dt == vect_external_def)
601 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
602 && (TREE_CODE (type) == BOOLEAN_TYPE
603 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
604 type)))
605 {
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
608 "Build SLP failed: invalid type of def "
609 "for variable-length SLP %T\n", oprnd);
610 return -1;
611 }
612
613 /* For the swapping logic below force vect_reduction_def
614 for the reduction op in a SLP reduction group. */
615 if (!STMT_VINFO_DATA_REF (stmt_info)
616 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
617 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
618 && def_stmt_info)
619 dts[i] = dt = vect_reduction_def;
620
621 /* Check the types of the definition. */
622 switch (dt)
623 {
624 case vect_external_def:
625 case vect_constant_def:
626 case vect_internal_def:
627 case vect_reduction_def:
628 case vect_induction_def:
629 case vect_nested_cycle:
630 break;
631
632 default:
633 /* FORNOW: Not supported. */
634 if (dump_enabled_p ())
635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
636 "Build SLP failed: illegal type of def %T\n",
637 oprnd);
638 return -1;
639 }
640
641 oprnd_info->first_dt = dt;
642 oprnd_info->first_op_type = type;
643 }
644 }
645 if (first)
646 return 0;
647
648 /* Now match the operand definition types to that of the first stmt. */
649 for (i = 0; i < number_of_oprnds;)
650 {
651 if (skip_args[i])
652 {
653 ++i;
654 continue;
655 }
656
657 oprnd_info = (*oprnds_info)[i];
658 dt = dts[i];
659 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
660 oprnd = oprnd_info->ops[stmt_num];
661 tree type = TREE_TYPE (oprnd);
662
663 if (!types_compatible_p (oprnd_info->first_op_type, type))
664 {
665 if (dump_enabled_p ())
666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
667 "Build SLP failed: different operand types\n");
668 return 1;
669 }
670
671 /* Not the first stmt of the group; check that the def-stmt/s match
672 the def-stmt/s of the first stmt. Allow different definition
673 types for reduction chains: the first stmt must be a
674 vect_reduction_def (a phi node), and the rest must belong to
675 the same reduction chain. */
676 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
677 && !(oprnd_info->first_dt == vect_reduction_def
678 && !STMT_VINFO_DATA_REF (stmt_info)
679 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
680 && def_stmt_info
681 && !STMT_VINFO_DATA_REF (def_stmt_info)
682 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
683 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
684 || (!STMT_VINFO_DATA_REF (stmt_info)
685 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
686 && ((!def_stmt_info
687 || STMT_VINFO_DATA_REF (def_stmt_info)
688 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
689 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
690 != (oprnd_info->first_dt != vect_reduction_def))))
691 {
692 /* Try swapping operands if we got a mismatch. For BB
693 vectorization do so only when it will clearly improve things. */
694 if (i == commutative_op && !swapped
695 && (!is_a <bb_vec_info> (vinfo)
696 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
697 dts[i+1])
698 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
699 || vect_def_types_match
700 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
701 {
702 if (dump_enabled_p ())
703 dump_printf_loc (MSG_NOTE, vect_location,
704 "trying swapped operands\n");
705 std::swap (dts[i], dts[i+1]);
706 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
707 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
708 std::swap ((*oprnds_info)[i]->ops[stmt_num],
709 (*oprnds_info)[i+1]->ops[stmt_num]);
710 swapped = true;
711 continue;
712 }
713
714 if (is_a <bb_vec_info> (vinfo)
715 && !oprnd_info->any_pattern)
716 {
717 /* Now for commutative ops we should see whether we can
718 make the other operand match. */
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "treating operand as external\n");
722 oprnd_info->first_dt = dt = vect_external_def;
723 }
724 else
725 {
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
728 "Build SLP failed: different types\n");
729 return 1;
730 }
731 }
732
733 /* Make sure to demote the overall operand to external. */
734 if (dt == vect_external_def)
735 oprnd_info->first_dt = vect_external_def;
736 /* For an SLP reduction chain we want to duplicate the reduction to
737 each of the chain members. That gets us a sane SLP graph (though
738 the stmts are not 100% correct wrt the initial values). */
739 else if ((dt == vect_internal_def
740 || dt == vect_reduction_def)
741 && oprnd_info->first_dt == vect_reduction_def
742 && !STMT_VINFO_DATA_REF (stmt_info)
743 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
744 && !STMT_VINFO_DATA_REF (def_stmt_info)
745 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
746 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
747 {
748 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
749 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
750 }
751
752 ++i;
753 }
754
755 /* Swap operands. */
756 if (swapped)
757 {
758 if (dump_enabled_p ())
759 dump_printf_loc (MSG_NOTE, vect_location,
760 "swapped operands to match def types in %G",
761 stmt_info->stmt);
762 }
763
764 return 0;
765 }
766
767 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
768 Return true if we can, meaning that this choice doesn't conflict with
769 existing SLP nodes that use STMT_INFO. */
770
771 bool
772 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
773 {
774 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
775 if (old_vectype)
776 return useless_type_conversion_p (vectype, old_vectype);
777
778 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
779 {
780 /* We maintain the invariant that if any statement in the group is
781 used, all other members of the group have the same vector type. */
782 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
783 stmt_vec_info member_info = first_info;
784 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
785 if (is_pattern_stmt_p (member_info)
786 && !useless_type_conversion_p (vectype,
787 STMT_VINFO_VECTYPE (member_info)))
788 break;
789
790 if (!member_info)
791 {
792 for (member_info = first_info; member_info;
793 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
794 STMT_VINFO_VECTYPE (member_info) = vectype;
795 return true;
796 }
797 }
798 else if (!is_pattern_stmt_p (stmt_info))
799 {
800 STMT_VINFO_VECTYPE (stmt_info) = vectype;
801 return true;
802 }
803
804 if (dump_enabled_p ())
805 {
806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
807 "Build SLP failed: incompatible vector"
808 " types for: %G", stmt_info->stmt);
809 dump_printf_loc (MSG_NOTE, vect_location,
810 " old vector type: %T\n", old_vectype);
811 dump_printf_loc (MSG_NOTE, vect_location,
812 " new vector type: %T\n", vectype);
813 }
814 return false;
815 }
816
817 /* Return true if call statements CALL1 and CALL2 are similar enough
818 to be combined into the same SLP group. */
819
820 static bool
821 compatible_calls_p (gcall *call1, gcall *call2)
822 {
823 unsigned int nargs = gimple_call_num_args (call1);
824 if (nargs != gimple_call_num_args (call2))
825 return false;
826
827 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
828 return false;
829
830 if (gimple_call_internal_p (call1))
831 {
832 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
833 TREE_TYPE (gimple_call_lhs (call2))))
834 return false;
835 for (unsigned int i = 0; i < nargs; ++i)
836 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
837 TREE_TYPE (gimple_call_arg (call2, i))))
838 return false;
839 }
840 else
841 {
842 if (!operand_equal_p (gimple_call_fn (call1),
843 gimple_call_fn (call2), 0))
844 return false;
845
846 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
847 return false;
848 }
849 return true;
850 }
851
852 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
853 caller's attempt to find the vector type in STMT_INFO with the narrowest
854 element type. Return true if VECTYPE is nonnull and if it is valid
855 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
856 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
857 vect_build_slp_tree. */
858
859 static bool
860 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
861 unsigned int group_size,
862 tree vectype, poly_uint64 *max_nunits)
863 {
864 if (!vectype)
865 {
866 if (dump_enabled_p ())
867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
868 "Build SLP failed: unsupported data-type in %G\n",
869 stmt_info->stmt);
870 /* Fatal mismatch. */
871 return false;
872 }
873
874 /* If populating the vector type requires unrolling then fail
875 before adjusting *max_nunits for basic-block vectorization. */
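/* For example, with a V4SI vectype a group of six stmts would need one
   and a half vectors; since six is not a multiple of four, basic-block
   SLP has to give up here.  */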
876 if (is_a <bb_vec_info> (vinfo)
877 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
878 {
879 if (dump_enabled_p ())
880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
881 "Build SLP failed: unrolling required "
882 "in basic block SLP\n");
883 /* Fatal mismatch. */
884 return false;
885 }
886
887 /* In case of multiple types we need to detect the smallest type. */
888 vect_update_max_nunits (max_nunits, vectype);
889 return true;
890 }
891
892 /* Verify if the scalar stmts STMTS are isomorphic, do not require a
893 data permutation and are of supported types of operation. Return
894 true if they are, otherwise return false and indicate in *MATCHES
895 which stmts are not isomorphic to the first one. If MATCHES[0]
896 is false then this indicates the comparison could not be
897 carried out or the stmts will never be vectorized by SLP.
898
899 Note a COND_EXPR is possibly isomorphic to another one after swapping
900 its operands. Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic
901 to the first stmt by swapping the two operands of its comparison; set
902 SWAP[i] to 2 if stmt I is isomorphic to the first stmt by inverting
903 the code of its comparison. Take A1 >= B1 ? X1 : Y1 as an example: it
904 can be swapped to (B1 <= A1 ? X1 : Y1) or inverted to (A1 < B1) ? Y1 : X1. */
905
906 static bool
907 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
908 vec<stmt_vec_info> stmts, unsigned int group_size,
909 poly_uint64 *max_nunits, bool *matches,
910 bool *two_operators, tree *node_vectype)
911 {
912 unsigned int i;
913 stmt_vec_info first_stmt_info = stmts[0];
914 enum tree_code first_stmt_code = ERROR_MARK;
915 enum tree_code alt_stmt_code = ERROR_MARK;
916 enum tree_code rhs_code = ERROR_MARK;
917 enum tree_code first_cond_code = ERROR_MARK;
918 tree lhs;
919 bool need_same_oprnds = false;
920 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
921 optab optab;
922 int icode;
923 machine_mode optab_op2_mode;
924 machine_mode vec_mode;
925 stmt_vec_info first_load = NULL, prev_first_load = NULL;
926 bool first_stmt_load_p = false, load_p = false;
927 bool first_stmt_phi_p = false, phi_p = false;
928 bool maybe_soft_fail = false;
929 tree soft_fail_nunits_vectype = NULL_TREE;
930
931 /* For every stmt in NODE find its def stmt/s. */
932 stmt_vec_info stmt_info;
933 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
934 {
935 gimple *stmt = stmt_info->stmt;
936 swap[i] = 0;
937 matches[i] = false;
938
939 if (dump_enabled_p ())
940 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
941
942 /* Fail to vectorize statements marked as unvectorizable, that can
943 throw internally, or that have volatile operands. */
944 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
945 || stmt_can_throw_internal (cfun, stmt)
946 || gimple_has_volatile_ops (stmt))
947 {
948 if (dump_enabled_p ())
949 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
950 "Build SLP failed: unvectorizable statement %G",
951 stmt);
952 /* ??? For BB vectorization we want to commute operands in a way
953 that shuffles all unvectorizable defs into one operand and keeps
954 the other still vectorizable. The following doesn't reliably
955 work for this, but it's the easiest we can do here. */
956 if (is_a <bb_vec_info> (vinfo) && i != 0)
957 continue;
958 /* Fatal mismatch. */
959 matches[0] = false;
960 return false;
961 }
962
963 lhs = gimple_get_lhs (stmt);
964 if (lhs == NULL_TREE)
965 {
966 if (dump_enabled_p ())
967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
968 "Build SLP failed: not GIMPLE_ASSIGN nor "
969 "GIMPLE_CALL %G", stmt);
970 if (is_a <bb_vec_info> (vinfo) && i != 0)
971 continue;
972 /* Fatal mismatch. */
973 matches[0] = false;
974 return false;
975 }
976
977 tree nunits_vectype;
978 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
979 &nunits_vectype, group_size))
980 {
981 if (is_a <bb_vec_info> (vinfo) && i != 0)
982 continue;
983 /* Fatal mismatch. */
984 matches[0] = false;
985 return false;
986 }
987 /* Record nunits required but continue analysis, producing matches[]
988 as if nunits was not an issue. This allows splitting of groups
989 to happen. */
990 if (nunits_vectype
991 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
992 nunits_vectype, max_nunits))
993 {
994 gcc_assert (is_a <bb_vec_info> (vinfo));
995 maybe_soft_fail = true;
996 soft_fail_nunits_vectype = nunits_vectype;
997 }
998
999 gcc_assert (vectype);
1000
1001 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1002 if (call_stmt)
1003 {
1004 rhs_code = CALL_EXPR;
1005
1006 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
1007 load_p = true;
1008 else if ((gimple_call_internal_p (call_stmt)
1009 && (!vectorizable_internal_fn_p
1010 (gimple_call_internal_fn (call_stmt))))
1011 || gimple_call_tail_p (call_stmt)
1012 || gimple_call_noreturn_p (call_stmt)
1013 || !gimple_call_nothrow_p (call_stmt)
1014 || gimple_call_chain (call_stmt))
1015 {
1016 if (dump_enabled_p ())
1017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1018 "Build SLP failed: unsupported call type %G",
1019 call_stmt);
1020 if (is_a <bb_vec_info> (vinfo) && i != 0)
1021 continue;
1022 /* Fatal mismatch. */
1023 matches[0] = false;
1024 return false;
1025 }
1026 }
1027 else if (gimple_code (stmt) == GIMPLE_PHI)
1028 {
1029 rhs_code = ERROR_MARK;
1030 phi_p = true;
1031 }
1032 else
1033 {
1034 rhs_code = gimple_assign_rhs_code (stmt);
1035 load_p = gimple_vuse (stmt);
1036 }
1037
1038 /* Check the operation. */
1039 if (i == 0)
1040 {
1041 *node_vectype = vectype;
1042 first_stmt_code = rhs_code;
1043 first_stmt_load_p = load_p;
1044 first_stmt_phi_p = phi_p;
1045
1046 /* Shift arguments should be equal in all the packed stmts for a
1047 vector shift with scalar shift operand. */
1048 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1049 || rhs_code == LROTATE_EXPR
1050 || rhs_code == RROTATE_EXPR)
1051 {
1052 vec_mode = TYPE_MODE (vectype);
1053
1054 /* First see if we have a vector/vector shift. */
1055 optab = optab_for_tree_code (rhs_code, vectype,
1056 optab_vector);
1057
1058 if (!optab
1059 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1060 {
1061 /* No vector/vector shift, try for a vector/scalar shift. */
1062 optab = optab_for_tree_code (rhs_code, vectype,
1063 optab_scalar);
1064
1065 if (!optab)
1066 {
1067 if (dump_enabled_p ())
1068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1069 "Build SLP failed: no optab.\n");
1070 if (is_a <bb_vec_info> (vinfo) && i != 0)
1071 continue;
1072 /* Fatal mismatch. */
1073 matches[0] = false;
1074 return false;
1075 }
1076 icode = (int) optab_handler (optab, vec_mode);
1077 if (icode == CODE_FOR_nothing)
1078 {
1079 if (dump_enabled_p ())
1080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1081 "Build SLP failed: "
1082 "op not supported by target.\n");
1083 if (is_a <bb_vec_info> (vinfo) && i != 0)
1084 continue;
1085 /* Fatal mismatch. */
1086 matches[0] = false;
1087 return false;
1088 }
1089 optab_op2_mode = insn_data[icode].operand[2].mode;
1090 if (!VECTOR_MODE_P (optab_op2_mode))
1091 {
1092 need_same_oprnds = true;
1093 first_op1 = gimple_assign_rhs2 (stmt);
1094 }
1095 }
1096 }
1097 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1098 {
1099 need_same_oprnds = true;
1100 first_op1 = gimple_assign_rhs2 (stmt);
1101 }
1102 else if (!load_p
1103 && rhs_code == BIT_FIELD_REF)
1104 {
1105 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1106 if (!is_a <bb_vec_info> (vinfo)
1107 || TREE_CODE (vec) != SSA_NAME
1108 || !operand_equal_p (TYPE_SIZE (vectype),
1109 TYPE_SIZE (TREE_TYPE (vec))))
1110 {
1111 if (dump_enabled_p ())
1112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113 "Build SLP failed: "
1114 "BIT_FIELD_REF not supported\n");
1115 /* Fatal mismatch. */
1116 matches[0] = false;
1117 return false;
1118 }
1119 }
1120 else if (call_stmt
1121 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1122 {
1123 need_same_oprnds = true;
1124 first_op1 = gimple_call_arg (call_stmt, 1);
1125 }
1126 }
1127 else
1128 {
1129 if (first_stmt_code != rhs_code
1130 && alt_stmt_code == ERROR_MARK)
1131 alt_stmt_code = rhs_code;
1132 if ((first_stmt_code != rhs_code
1133 && (first_stmt_code != IMAGPART_EXPR
1134 || rhs_code != REALPART_EXPR)
1135 && (first_stmt_code != REALPART_EXPR
1136 || rhs_code != IMAGPART_EXPR)
1137 /* Handle mismatches in plus/minus by computing both
1138 and merging the results. */
1139 && !((first_stmt_code == PLUS_EXPR
1140 || first_stmt_code == MINUS_EXPR)
1141 && (alt_stmt_code == PLUS_EXPR
1142 || alt_stmt_code == MINUS_EXPR)
1143 && rhs_code == alt_stmt_code)
1144 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1145 && (first_stmt_code == ARRAY_REF
1146 || first_stmt_code == BIT_FIELD_REF
1147 || first_stmt_code == INDIRECT_REF
1148 || first_stmt_code == COMPONENT_REF
1149 || first_stmt_code == MEM_REF)))
1150 || first_stmt_load_p != load_p
1151 || first_stmt_phi_p != phi_p)
1152 {
1153 if (dump_enabled_p ())
1154 {
1155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1156 "Build SLP failed: different operation "
1157 "in stmt %G", stmt);
1158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159 "original stmt %G", first_stmt_info->stmt);
1160 }
1161 /* Mismatch. */
1162 continue;
1163 }
1164
1165 if (need_same_oprnds)
1166 {
1167 tree other_op1 = (call_stmt
1168 ? gimple_call_arg (call_stmt, 1)
1169 : gimple_assign_rhs2 (stmt));
1170 if (!operand_equal_p (first_op1, other_op1, 0))
1171 {
1172 if (dump_enabled_p ())
1173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1174 "Build SLP failed: different shift "
1175 "arguments in %G", stmt);
1176 /* Mismatch. */
1177 continue;
1178 }
1179 }
1180 if (!load_p
1181 && first_stmt_code == BIT_FIELD_REF
1182 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1183 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1184 {
1185 if (dump_enabled_p ())
1186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1187 "Build SLP failed: different BIT_FIELD_REF "
1188 "arguments in %G", stmt);
1189 /* Mismatch. */
1190 continue;
1191 }
1192
1193 if (!load_p && rhs_code == CALL_EXPR)
1194 {
1195 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1196 as_a <gcall *> (stmt)))
1197 {
1198 if (dump_enabled_p ())
1199 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1200 "Build SLP failed: different calls in %G",
1201 stmt);
1202 /* Mismatch. */
1203 continue;
1204 }
1205 }
1206
1207 if (phi_p
1208 && (gimple_bb (first_stmt_info->stmt)
1209 != gimple_bb (stmt_info->stmt)))
1210 {
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1213 "Build SLP failed: different BB for PHI "
1214 "in %G", stmt);
1215 /* Mismatch. */
1216 continue;
1217 }
1218
1219 if (!types_compatible_p (vectype, *node_vectype))
1220 {
1221 if (dump_enabled_p ())
1222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1223 "Build SLP failed: different vector type "
1224 "in %G", stmt);
1225 /* Mismatch. */
1226 continue;
1227 }
1228 }
1229
1230 /* Grouped store or load. */
1231 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1232 {
1233 if (REFERENCE_CLASS_P (lhs))
1234 {
1235 /* Store. */
1236 ;
1237 }
1238 else
1239 {
1240 /* Load. */
1241 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1242 if (prev_first_load)
1243 {
1244 /* Check that there are no loads from different interleaving
1245 chains in the same node. */
1246 if (prev_first_load != first_load)
1247 {
1248 if (dump_enabled_p ())
1249 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1250 vect_location,
1251 "Build SLP failed: different "
1252 "interleaving chains in one node %G",
1253 stmt);
1254 /* Mismatch. */
1255 continue;
1256 }
1257 }
1258 else
1259 prev_first_load = first_load;
1260 }
1261 } /* Grouped access. */
1262 else
1263 {
1264 if (load_p)
1265 {
1266 /* Non-grouped load. */
1267 if (dump_enabled_p ())
1268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1269 "Build SLP failed: not grouped load %G", stmt);
1270
1271 /* FORNOW: Non-grouped loads are not supported. */
1272 if (is_a <bb_vec_info> (vinfo) && i != 0)
1273 continue;
1274 /* Fatal mismatch. */
1275 matches[0] = false;
1276 return false;
1277 }
1278
1279 /* Not a memory operation. */
1280 if (!phi_p
1281 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1282 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1283 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1284 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1285 && rhs_code != VIEW_CONVERT_EXPR
1286 && rhs_code != CALL_EXPR
1287 && rhs_code != BIT_FIELD_REF)
1288 {
1289 if (dump_enabled_p ())
1290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291 "Build SLP failed: operation unsupported %G",
1292 stmt);
1293 if (is_a <bb_vec_info> (vinfo) && i != 0)
1294 continue;
1295 /* Fatal mismatch. */
1296 matches[0] = false;
1297 return false;
1298 }
1299
1300 if (rhs_code == COND_EXPR)
1301 {
1302 tree cond_expr = gimple_assign_rhs1 (stmt);
1303 enum tree_code cond_code = TREE_CODE (cond_expr);
1304 enum tree_code swap_code = ERROR_MARK;
1305 enum tree_code invert_code = ERROR_MARK;
1306
1307 if (i == 0)
1308 first_cond_code = TREE_CODE (cond_expr);
1309 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1310 {
1311 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1312 swap_code = swap_tree_comparison (cond_code);
1313 invert_code = invert_tree_comparison (cond_code, honor_nans);
1314 }
1315
1316 if (first_cond_code == cond_code)
1317 ;
1318 /* Isomorphism can be achieved by swapping. */
1319 else if (first_cond_code == swap_code)
1320 swap[i] = 1;
1321 /* Isomorphism can be achieved by inverting. */
1322 else if (first_cond_code == invert_code)
1323 swap[i] = 2;
1324 else
1325 {
1326 if (dump_enabled_p ())
1327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1328 "Build SLP failed: different"
1329 " operation %G", stmt);
1330 /* Mismatch. */
1331 continue;
1332 }
1333 }
1334 }
1335
1336 matches[i] = true;
1337 }
1338
1339 for (i = 0; i < group_size; ++i)
1340 if (!matches[i])
1341 return false;
1342
1343 /* If we allowed a two-operation SLP node verify the target can cope
1344 with the permute we are going to use. */
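/* For example, a group { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } gets here
   with first_stmt_code PLUS_EXPR and alt_stmt_code MINUS_EXPR;
   *two_operators records that both operations have to be generated and
   the appropriate lanes selected from each result later.  */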
1345 if (alt_stmt_code != ERROR_MARK
1346 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1347 {
1348 *two_operators = true;
1349 }
1350
1351 if (maybe_soft_fail)
1352 {
1353 unsigned HOST_WIDE_INT const_nunits;
1354 if (!TYPE_VECTOR_SUBPARTS
1355 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1356 || const_nunits > group_size)
1357 matches[0] = false;
1358 else
1359 {
1360 /* With a constant number of vector elements, simulate a mismatch
1361 at the point where we need to split. */
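/* For example, a group of six stmts with const_nunits == 4 clears
   matches[4] and matches[5], so the caller can split the group into
   a vectorizable part of four stmts and a remainder of two.  */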
1362 unsigned tail = group_size & (const_nunits - 1);
1363 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1364 }
1365 return false;
1366 }
1367
1368 return true;
1369 }
1370
1371 /* Traits for the hash_map used to cache SLP tree builds for a stmt set.
1372 Note we never remove entries apart from at destruction time, so we
1373 do not need a special value for deleted that differs from empty. */
1374 struct bst_traits
1375 {
1376 typedef vec <stmt_vec_info> value_type;
1377 typedef vec <stmt_vec_info> compare_type;
1378 static inline hashval_t hash (value_type);
1379 static inline bool equal (value_type existing, value_type candidate);
1380 static inline bool is_empty (value_type x) { return !x.exists (); }
1381 static inline bool is_deleted (value_type x) { return !x.exists (); }
1382 static const bool empty_zero_p = true;
1383 static inline void mark_empty (value_type &x) { x.release (); }
1384 static inline void mark_deleted (value_type &x) { x.release (); }
1385 static inline void remove (value_type &x) { x.release (); }
1386 };
1387 inline hashval_t
1388 bst_traits::hash (value_type x)
1389 {
1390 inchash::hash h;
1391 for (unsigned i = 0; i < x.length (); ++i)
1392 h.add_int (gimple_uid (x[i]->stmt));
1393 return h.end ();
1394 }
1395 inline bool
1396 bst_traits::equal (value_type existing, value_type candidate)
1397 {
1398 if (existing.length () != candidate.length ())
1399 return false;
1400 for (unsigned i = 0; i < existing.length (); ++i)
1401 if (existing[i] != candidate[i])
1402 return false;
1403 return true;
1404 }
1405
1406 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1407 simple_hashmap_traits <bst_traits, slp_tree> >
1408 scalar_stmts_to_slp_tree_map_t;
1409
1410 static slp_tree
1411 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1412 vec<stmt_vec_info> stmts, unsigned int group_size,
1413 poly_uint64 *max_nunits,
1414 bool *matches, unsigned *limit, unsigned *tree_size,
1415 scalar_stmts_to_slp_tree_map_t *bst_map);
1416
1417 static slp_tree
1418 vect_build_slp_tree (vec_info *vinfo,
1419 vec<stmt_vec_info> stmts, unsigned int group_size,
1420 poly_uint64 *max_nunits,
1421 bool *matches, unsigned *limit, unsigned *tree_size,
1422 scalar_stmts_to_slp_tree_map_t *bst_map)
1423 {
1424 if (slp_tree *leader = bst_map->get (stmts))
1425 {
1426 if (dump_enabled_p ())
1427 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1428 *leader ? "" : "failed ", *leader);
1429 if (*leader)
1430 {
1431 SLP_TREE_REF_COUNT (*leader)++;
1432 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1433 stmts.release ();
1434 }
1435 return *leader;
1436 }
1437
1438 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1439 so we can pick up backedge destinations during discovery. */
1440 slp_tree res = new _slp_tree;
1441 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1442 SLP_TREE_SCALAR_STMTS (res) = stmts;
1443 bst_map->put (stmts.copy (), res);
1444
1445 if (*limit == 0)
1446 {
1447 if (dump_enabled_p ())
1448 dump_printf_loc (MSG_NOTE, vect_location,
1449 "SLP discovery limit exceeded\n");
1450 bool existed_p = bst_map->put (stmts, NULL);
1451 gcc_assert (existed_p);
1452 /* Mark the node invalid so we can detect those when still in use
1453 as backedge destinations. */
1454 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1455 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1456 vect_free_slp_tree (res);
1457 memset (matches, 0, sizeof (bool) * group_size);
1458 return NULL;
1459 }
1460 --*limit;
1461
1462 poly_uint64 this_max_nunits = 1;
1463 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1464 &this_max_nunits,
1465 matches, limit, tree_size, bst_map);
1466 if (!res_)
1467 {
1468 bool existed_p = bst_map->put (stmts, NULL);
1469 gcc_assert (existed_p);
1470 /* Mark the node invalid so we can detect those when still in use
1471 as backedge destinations. */
1472 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1473 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1474 vect_free_slp_tree (res);
1475 }
1476 else
1477 {
1478 gcc_assert (res_ == res);
1479 res->max_nunits = this_max_nunits;
1480 vect_update_max_nunits (max_nunits, this_max_nunits);
1481 /* Keep a reference for the bst_map use. */
1482 SLP_TREE_REF_COUNT (res)++;
1483 }
1484 return res_;
1485 }
1486
1487 /* Recursively build an SLP tree for the group of stmts STMTS, filling
1488 in NODE. Return NULL (and indicate in MATCHES which stmts failed to
1489 match the first one) if the def-stmts are not isomorphic, require a
1490 data permutation or are of unsupported operation types. On success
1491 return NODE with its children built recursively from the operands of
1492 the stmts. */
1493
1494 static slp_tree
1495 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1496 vec<stmt_vec_info> stmts, unsigned int group_size,
1497 poly_uint64 *max_nunits,
1498 bool *matches, unsigned *limit, unsigned *tree_size,
1499 scalar_stmts_to_slp_tree_map_t *bst_map)
1500 {
1501 unsigned nops, i, this_tree_size = 0;
1502 poly_uint64 this_max_nunits = *max_nunits;
1503
1504 matches[0] = false;
1505
1506 stmt_vec_info stmt_info = stmts[0];
1507 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1508 nops = gimple_call_num_args (stmt);
1509 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1510 {
1511 nops = gimple_num_ops (stmt) - 1;
1512 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1513 nops++;
1514 }
1515 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1516 nops = gimple_phi_num_args (phi);
1517 else
1518 return NULL;
1519
1520 /* If the SLP node is a PHI (induction or reduction), terminate
1521 the recursion. */
1522 bool *skip_args = XALLOCAVEC (bool, nops);
1523 memset (skip_args, 0, sizeof (bool) * nops);
1524 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1525 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1526 {
1527 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1528 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1529 group_size);
1530 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1531 max_nunits))
1532 return NULL;
1533
1534 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1535 if (def_type == vect_induction_def)
1536 {
1537 /* Induction PHIs are not treated as cycles; instead we walk the
1538 initial value. Only for inner loops though; for outer loops
1539 we need to pick up the value from the actual PHIs
1540 to more easily support peeling and epilogue vectorization. */
1541 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1542 if (!nested_in_vect_loop_p (loop, stmt_info))
1543 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1544 else
1545 loop = loop->inner;
1546 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1547 }
1548 else if (def_type == vect_reduction_def
1549 || def_type == vect_double_reduction_def
1550 || def_type == vect_nested_cycle)
1551 {
1552 /* Else def types have to match. */
1553 stmt_vec_info other_info;
1554 bool all_same = true;
1555 FOR_EACH_VEC_ELT (stmts, i, other_info)
1556 {
1557 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1558 return NULL;
1559 if (other_info != stmt_info)
1560 all_same = false;
1561 }
1562 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1563 /* Reduction initial values are not explicitly represented. */
1564 if (!nested_in_vect_loop_p (loop, stmt_info))
1565 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1566 /* Reduction chain backedge defs are filled manually.
1567 ??? Need a better way to identify a SLP reduction chain PHI.
1568 Or a better overall way to SLP match those. */
1569 if (all_same && def_type == vect_reduction_def)
1570 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1571 }
1572 else if (def_type != vect_internal_def)
1573 return NULL;
1574 }
1575
1576
1577 bool two_operators = false;
1578 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1579 tree vectype = NULL_TREE;
1580 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1581 &this_max_nunits, matches, &two_operators,
1582 &vectype))
1583 return NULL;
1584
1585 /* If the SLP node is a load, terminate the recursion unless masked. */
1586 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1587 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1588 {
1589 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1590 {
1591 /* Masked load. */
1592 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1593 nops = 1;
1594 }
1595 else
1596 {
1597 *max_nunits = this_max_nunits;
1598 (*tree_size)++;
1599 node = vect_create_new_slp_node (node, stmts, 0);
1600 SLP_TREE_VECTYPE (node) = vectype;
1601 /* And compute the load permutation. Whether it is actually
1602 a permutation depends on the unrolling factor which is
1603 decided later. */
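/* For example, loads of a[1], a[0], a[3], a[2] from a group starting
   at a[0] yield the load permutation { 1, 0, 3, 2 }.  */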
1604 vec<unsigned> load_permutation;
1605 int j;
1606 stmt_vec_info load_info;
1607 load_permutation.create (group_size);
1608 stmt_vec_info first_stmt_info
1609 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1610 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1611 {
1612 int load_place = vect_get_place_in_interleaving_chain
1613 (load_info, first_stmt_info);
1614 gcc_assert (load_place != -1);
1615 load_permutation.safe_push (load_place);
1616 }
1617 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1618 return node;
1619 }
1620 }
1621 else if (gimple_assign_single_p (stmt_info->stmt)
1622 && !gimple_vuse (stmt_info->stmt)
1623 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1624 {
1625 /* vect_build_slp_tree_1 determined that all BIT_FIELD_REFs reference
1626 the same SSA name vector, of a type compatible with vectype. */
1627 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1628 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1629 stmt_vec_info estmt_info;
1630 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1631 {
1632 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1633 tree bfref = gimple_assign_rhs1 (estmt);
1634 HOST_WIDE_INT lane;
1635 if (!known_eq (bit_field_size (bfref),
1636 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1637 || !constant_multiple_p (bit_field_offset (bfref),
1638 bit_field_size (bfref), &lane))
1639 {
1640 lperm.release ();
1641 return NULL;
1642 }
1643 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1644 }
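/* For example, a BIT_FIELD_REF <vec_1, 32, 64> extracting a 32-bit lane
   at bit offset 64 selects lane 2 and is recorded as the pair (0, 2),
   where 0 refers to the single vector child pushed below.  */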
1645 slp_tree vnode = vect_create_new_slp_node (vNULL);
1646 /* ??? We record vectype here but hide any punning that may
1647 eventually be necessary, instead relying on code generation to
1648 materialize VIEW_CONVERT_EXPRs as needed. We should instead make
1649 this explicit somehow. */
1650 SLP_TREE_VECTYPE (vnode) = vectype;
1651 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1652 /* We always build a permutation node, even if it is an identity
1653 permute, to shield the rest of the vectorizer from the odd node
1654 representing an actual vector without any scalar ops.
1655 ??? We could hide it completely by making the permute node
1656 external? */
1657 node = vect_create_new_slp_node (node, stmts, 1);
1658 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1659 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1660 SLP_TREE_VECTYPE (node) = vectype;
1661 SLP_TREE_CHILDREN (node).quick_push (vnode);
1662 return node;
1663 }
1664
1665 /* Get at the operands, verifying they are compatible. */
1666 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1667 slp_oprnd_info oprnd_info;
1668 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1669 {
1670 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
1671 stmts, i, &oprnds_info);
1672 if (res != 0)
1673 matches[(res == -1) ? 0 : i] = false;
1674 if (!matches[0])
1675 break;
1676 }
1677 for (i = 0; i < group_size; ++i)
1678 if (!matches[i])
1679 {
1680 vect_free_oprnd_info (oprnds_info);
1681 return NULL;
1682 }
1683 swap = NULL;
1684
1685 auto_vec<slp_tree, 4> children;
1686
1687 stmt_info = stmts[0];
1688
1689 /* Create SLP_TREE nodes for the definition node/s. */
1690 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1691 {
1692 slp_tree child;
1693 unsigned int j;
1694
1695 /* We're skipping certain operands from processing, for example
1696 outer loop reduction initial defs. */
1697 if (skip_args[i])
1698 {
1699 children.safe_push (NULL);
1700 continue;
1701 }
1702
1703 if (oprnd_info->first_dt == vect_uninitialized_def)
1704 {
1705 /* COND_EXPRs end up with one operand slot too many when the
1706 condition is an SSA name. */
1707 gcc_assert (i == 3 && nops == 4);
1708 continue;
1709 }
1710
1711 if (is_a <bb_vec_info> (vinfo)
1712 && oprnd_info->first_dt == vect_internal_def
1713 && !oprnd_info->any_pattern)
1714 {
1715 /* For BB vectorization, if all defs are the same do not
1716 bother to continue the build along the single-lane
1717 graph but use a splat of the scalar value. */
1718 stmt_vec_info first_def = oprnd_info->def_stmts[0];
1719 for (j = 1; j < group_size; ++j)
1720 if (oprnd_info->def_stmts[j] != first_def)
1721 break;
1722 if (j == group_size
1723 /* But avoid doing this for loads where we may be
1724 able to CSE things, unless the stmt is not
1725 vectorizable. */
1726 && (!STMT_VINFO_VECTORIZABLE (first_def)
1727 || !gimple_vuse (first_def->stmt)))
1728 {
1729 if (dump_enabled_p ())
1730 dump_printf_loc (MSG_NOTE, vect_location,
1731 "Using a splat of the uniform operand\n");
1732 oprnd_info->first_dt = vect_external_def;
1733 }
1734 }
1735
1736 if (oprnd_info->first_dt == vect_external_def
1737 || oprnd_info->first_dt == vect_constant_def)
1738 {
1739 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1740 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1741 oprnd_info->ops = vNULL;
1742 children.safe_push (invnode);
1743 continue;
1744 }
1745
1746 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1747 group_size, &this_max_nunits,
1748 matches, limit,
1749 &this_tree_size, bst_map)) != NULL)
1750 {
1751 oprnd_info->def_stmts = vNULL;
1752 children.safe_push (child);
1753 continue;
1754 }
1755
1756 /* If the SLP build for operand zero failed and operands zero
1757 and one can be commuted, try that for the scalar stmts
1758 that failed the match. */
1759 if (i == 0
1760 /* A first scalar stmt mismatch signals a fatal mismatch. */
1761 && matches[0]
1762 /* ??? For COND_EXPRs we can swap the comparison operands
1763 as well as the arms under some constraints. */
1764 && nops == 2
1765 && oprnds_info[1]->first_dt == vect_internal_def
1766 && is_gimple_assign (stmt_info->stmt)
1767 /* Swapping operands for reductions breaks assumptions later on. */
1768 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1769 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1770 {
1771 /* See whether we can swap the matching or the non-matching
1772 stmt operands. */
1773 bool swap_not_matching = true;
1774 do
1775 {
1776 for (j = 0; j < group_size; ++j)
1777 {
1778 if (matches[j] != !swap_not_matching)
1779 continue;
1780 stmt_vec_info stmt_info = stmts[j];
1781 /* Verify if we can swap operands of this stmt. */
1782 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1783 if (!stmt
1784 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1785 {
1786 if (!swap_not_matching)
1787 goto fail;
1788 swap_not_matching = false;
1789 break;
1790 }
1791 }
1792 }
1793 while (j != group_size);
1794
1795 /* Swap mismatched definition stmts. */
1796 if (dump_enabled_p ())
1797 dump_printf_loc (MSG_NOTE, vect_location,
1798 "Re-trying with swapped operands of stmts ");
1799 for (j = 0; j < group_size; ++j)
1800 if (matches[j] == !swap_not_matching)
1801 {
1802 std::swap (oprnds_info[0]->def_stmts[j],
1803 oprnds_info[1]->def_stmts[j]);
1804 std::swap (oprnds_info[0]->ops[j],
1805 oprnds_info[1]->ops[j]);
1806 if (dump_enabled_p ())
1807 dump_printf (MSG_NOTE, "%d ", j);
1808 }
1809 if (dump_enabled_p ())
1810 dump_printf (MSG_NOTE, "\n");
1811 /* And try again with scratch 'matches' ... */
1812 bool *tem = XALLOCAVEC (bool, group_size);
1813 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1814 group_size, &this_max_nunits,
1815 tem, limit,
1816 &this_tree_size, bst_map)) != NULL)
1817 {
1818 oprnd_info->def_stmts = vNULL;
1819 children.safe_push (child);
1820 continue;
1821 }
1822 }
1823 fail:
1824
1825 /* If the SLP build failed and we are analyzing a basic block,
1826 simply treat nodes we fail to build as externally defined
1827 (and thus build vectors from the scalar defs).
1828 The cost model will reject outright expensive cases.
1829 ??? This doesn't treat cases where permutation ultimately
1830 fails (or we don't try permutation below). Ideally we'd
1831 even compute a permutation that will end up with the maximum
1832 SLP tree size... */
1833 if (is_a <bb_vec_info> (vinfo)
1834 /* ??? Rejecting patterns this way doesn't work. We'd have to
1835 do extra work to cancel the pattern so the uses see the
1836 scalar version. */
1837 && !is_pattern_stmt_p (stmt_info)
1838 && !oprnd_info->any_pattern)
1839 {
1840 /* But if there's a leading vector-sized set of matching stmts
1841 fail here so we can split the group. This matches the condition
1842 vect_analyze_slp_instance uses. */
1843 /* ??? We might want to split here and combine the results to support
1844 multiple vector sizes better. */
1845 for (j = 0; j < group_size; ++j)
1846 if (!matches[j])
1847 break;
1848 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
1849 {
1850 if (dump_enabled_p ())
1851 dump_printf_loc (MSG_NOTE, vect_location,
1852 "Building vector operands from scalars\n");
1853 this_tree_size++;
1854 child = vect_create_new_slp_node (oprnd_info->ops);
1855 children.safe_push (child);
1856 oprnd_info->ops = vNULL;
1857 continue;
1858 }
1859 }
1860
1861 gcc_assert (child == NULL);
1862 FOR_EACH_VEC_ELT (children, j, child)
1863 if (child)
1864 vect_free_slp_tree (child);
1865 vect_free_oprnd_info (oprnds_info);
1866 return NULL;
1867 }
1868
1869 vect_free_oprnd_info (oprnds_info);
1870
1871 /* If all children of a node are built up from uniform scalars, or the
1872 node requires more than one possibly expensive vector construction,
1873 just throw it away so it is built up from scalars instead.
1874 The exception is the SLP node for the vector store. */
1875 if (is_a <bb_vec_info> (vinfo)
1876 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1877 /* ??? Rejecting patterns this way doesn't work. We'd have to
1878 do extra work to cancel the pattern so the uses see the
1879 scalar version. */
1880 && !is_pattern_stmt_p (stmt_info))
1881 {
1882 slp_tree child;
1883 unsigned j;
1884 bool all_uniform_p = true;
1885 unsigned n_vector_builds = 0;
1886 FOR_EACH_VEC_ELT (children, j, child)
1887 {
1888 if (!child)
1889 ;
1890 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1891 all_uniform_p = false;
1892 else if (!vect_slp_tree_uniform_p (child))
1893 {
1894 all_uniform_p = false;
1895 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
1896 n_vector_builds++;
1897 }
1898 }
1899 if (all_uniform_p
1900 || n_vector_builds > 1
1901 || (n_vector_builds == children.length ()
1902 && is_a <gphi *> (stmt_info->stmt)))
1903 {
1904 /* Roll back. */
1905 matches[0] = false;
1906 FOR_EACH_VEC_ELT (children, j, child)
1907 if (child)
1908 vect_free_slp_tree (child);
1909
1910 if (dump_enabled_p ())
1911 dump_printf_loc (MSG_NOTE, vect_location,
1912 "Building parent vector operands from "
1913 "scalars instead\n");
1914 return NULL;
1915 }
1916 }
1917
1918 *tree_size += this_tree_size + 1;
1919 *max_nunits = this_max_nunits;
1920
1921 if (two_operators)
1922 {
1923 /* ??? We'd likely want to either cache in bst_map something like
1924 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1925 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1926 explicit stmts to put in so the keying on 'stmts' doesn't
1927 work (but we have the same issue with nodes that use 'ops'). */
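/* As an illustration (names made up): for a two-operator group like
     a[0] = b[0] + c[0];
     a[1] = b[1] - c[1];
   we build one PLUS_EXPR node and one MINUS_EXPR node over the same
   children and blend their lanes with a VEC_PERM_EXPR node.  */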
1928 slp_tree one = new _slp_tree;
1929 slp_tree two = new _slp_tree;
1930 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1931 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1932 SLP_TREE_VECTYPE (one) = vectype;
1933 SLP_TREE_VECTYPE (two) = vectype;
1934 SLP_TREE_CHILDREN (one).safe_splice (children);
1935 SLP_TREE_CHILDREN (two).safe_splice (children);
1936 slp_tree child;
1937 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1938 SLP_TREE_REF_COUNT (child)++;
1939
1940 /* Here we record the original defs since this
1941 node represents the final lane configuration. */
1942 node = vect_create_new_slp_node (node, stmts, 2);
1943 SLP_TREE_VECTYPE (node) = vectype;
1944 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1945 SLP_TREE_CHILDREN (node).quick_push (one);
1946 SLP_TREE_CHILDREN (node).quick_push (two);
1947 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1948 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1949 enum tree_code ocode = ERROR_MARK;
1950 stmt_vec_info ostmt_info;
1951 unsigned j = 0;
1952 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1953 {
1954 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1955 if (gimple_assign_rhs_code (ostmt) != code0)
1956 {
1957 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1958 ocode = gimple_assign_rhs_code (ostmt);
1959 j = i;
1960 }
1961 else
1962 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1963 }
1964 SLP_TREE_CODE (one) = code0;
1965 SLP_TREE_CODE (two) = ocode;
1966 SLP_TREE_LANES (one) = stmts.length ();
1967 SLP_TREE_LANES (two) = stmts.length ();
1968 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1969 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1970 return node;
1971 }
1972
1973 node = vect_create_new_slp_node (node, stmts, nops);
1974 SLP_TREE_VECTYPE (node) = vectype;
1975 SLP_TREE_CHILDREN (node).splice (children);
1976 return node;
1977 }
1978
1979 /* Dump a single SLP tree NODE. */
1980
1981 static void
1982 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1983 slp_tree node)
1984 {
1985 unsigned i, j;
1986 slp_tree child;
1987 stmt_vec_info stmt_info;
1988 tree op;
1989
1990 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1991 dump_user_location_t user_loc = loc.get_user_location ();
1992 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1993 SLP_TREE_DEF_TYPE (node) == vect_external_def
1994 ? " (external)"
1995 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1996 ? " (constant)"
1997 : ""), node,
1998 estimated_poly_value (node->max_nunits),
1999 SLP_TREE_REF_COUNT (node));
2000 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2001 {
2002 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2003 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2004 else
2005 dump_printf_loc (metadata, user_loc, "op template: %G",
2006 SLP_TREE_REPRESENTATIVE (node)->stmt);
2007 }
2008 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2009 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2010 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2011 else
2012 {
2013 dump_printf_loc (metadata, user_loc, "\t{ ");
2014 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2015 dump_printf (metadata, "%T%s ", op,
2016 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2017 dump_printf (metadata, "}\n");
2018 }
2019 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2020 {
2021 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2022 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2023 dump_printf (dump_kind, " %u", j);
2024 dump_printf (dump_kind, " }\n");
2025 }
2026 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2027 {
2028 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2029 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2030 dump_printf (dump_kind, " %u[%u]",
2031 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2032 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2033 dump_printf (dump_kind, " }\n");
2034 }
2035 if (SLP_TREE_CHILDREN (node).is_empty ())
2036 return;
2037 dump_printf_loc (metadata, user_loc, "\tchildren");
2038 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2039 dump_printf (dump_kind, " %p", (void *)child);
2040 dump_printf (dump_kind, "\n");
2041 }
2042
2043 DEBUG_FUNCTION void
2044 debug (slp_tree node)
2045 {
2046 debug_dump_context ctx;
2047 vect_print_slp_tree (MSG_NOTE,
2048 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2049 node);
2050 }
2051
2052 /* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND. */
2053
2054 static void
2055 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2056 slp_tree node, hash_set<slp_tree> &visited)
2057 {
2058 unsigned i;
2059 slp_tree child;
2060
2061 if (visited.add (node))
2062 return;
2063
2064 vect_print_slp_tree (dump_kind, loc, node);
2065
2066 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2067 if (child)
2068 vect_print_slp_graph (dump_kind, loc, child, visited);
2069 }
2070
2071 static void
2072 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2073 slp_tree entry)
2074 {
2075 hash_set<slp_tree> visited;
2076 vect_print_slp_graph (dump_kind, loc, entry, visited);
2077 }
2078
2079 /* Mark the tree rooted at NODE with PURE_SLP. */
2080
2081 static void
2082 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2083 {
2084 int i;
2085 stmt_vec_info stmt_info;
2086 slp_tree child;
2087
2088 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2089 return;
2090
2091 if (visited.add (node))
2092 return;
2093
2094 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2095 STMT_SLP_TYPE (stmt_info) = pure_slp;
2096
2097 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2098 if (child)
2099 vect_mark_slp_stmts (child, visited);
2100 }
2101
2102 static void
2103 vect_mark_slp_stmts (slp_tree node)
2104 {
2105 hash_set<slp_tree> visited;
2106 vect_mark_slp_stmts (node, visited);
2107 }
2108
2109 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2110
2111 static void
2112 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2113 {
2114 int i;
2115 stmt_vec_info stmt_info;
2116 slp_tree child;
2117
2118 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2119 return;
2120
2121 if (visited.add (node))
2122 return;
2123
2124 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2125 {
2126 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2127 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2128 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2129 }
2130
2131 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2132 if (child)
2133 vect_mark_slp_stmts_relevant (child, visited);
2134 }
2135
2136 static void
2137 vect_mark_slp_stmts_relevant (slp_tree node)
2138 {
2139 hash_set<slp_tree> visited;
2140 vect_mark_slp_stmts_relevant (node, visited);
2141 }
2142
2143
2144 /* Gather loads in the SLP graph rooted at NODE and populate the LOADS array. */
2145
2146 static void
2147 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2148 hash_set<slp_tree> &visited)
2149 {
2150 if (!node || visited.add (node))
2151 return;
2152
2153 if (SLP_TREE_CHILDREN (node).length () == 0)
2154 {
2155 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2156 return;
2157 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2158 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2159 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2160 loads.safe_push (node);
2161 }
2162 else
2163 {
2164 unsigned i;
2165 slp_tree child;
2166 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2167 vect_gather_slp_loads (loads, child, visited);
2168 }
2169 }
2170
2171
2172 /* Find the last scalar stmt in NODE. */
2173
2174 stmt_vec_info
2175 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2176 {
2177 stmt_vec_info last = NULL;
2178 stmt_vec_info stmt_vinfo;
2179
2180 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2181 {
2182 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2183 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2184 }
2185
2186 return last;
2187 }
2188
2189 /* Find the first stmt in NODE. */
2190
2191 stmt_vec_info
2192 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2193 {
2194 stmt_vec_info first = NULL;
2195 stmt_vec_info stmt_vinfo;
2196
2197 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2198 {
2199 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2200 if (!first
2201 || get_later_stmt (stmt_vinfo, first) == first)
2202 first = stmt_vinfo;
2203 }
2204
2205 return first;
2206 }
2207
2208 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2209 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2210 (also containing the first GROUP1_SIZE stmts, since stores are
2211 consecutive), the second containing the remainder.
2212 Return the first stmt in the second group. */
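/* For example (illustrative numbers): splitting a store group of size 6
   with gap 0 at GROUP1_SIZE 4 leaves the first group with size 4 and
   gap 2 and the second group with size 2 and gap 4, so iterating either
   group skips over the other group's elements.  */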
2213
2214 static stmt_vec_info
2215 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2216 {
2217 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2218 gcc_assert (group1_size > 0);
2219 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2220 gcc_assert (group2_size > 0);
2221 DR_GROUP_SIZE (first_vinfo) = group1_size;
2222
2223 stmt_vec_info stmt_info = first_vinfo;
2224 for (unsigned i = group1_size; i > 1; i--)
2225 {
2226 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2227 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2228 }
2229 /* STMT is now the last element of the first group. */
2230 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2231 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2232
2233 DR_GROUP_SIZE (group2) = group2_size;
2234 for (stmt_info = group2; stmt_info;
2235 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2236 {
2237 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2238 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2239 }
2240
2241 /* For the second group, the DR_GROUP_GAP is that before the original group,
2242 plus skipping over the first vector. */
2243 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2244
2245 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2246 DR_GROUP_GAP (first_vinfo) += group2_size;
2247
2248 if (dump_enabled_p ())
2249 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2250 group1_size, group2_size);
2251
2252 return group2;
2253 }
2254
2255 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2256 statements and a vector of NUNITS elements. */
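/* For example, with NUNITS 4 and GROUP_SIZE 6 the least common multiple
   is 12, so the unrolling factor is 12 / 6 = 2.  */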
2257
2258 static poly_uint64
2259 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2260 {
2261 return exact_div (common_multiple (nunits, group_size), group_size);
2262 }
2263
2264 /* Helper that checks to see if a node is a load node. */
2265
2266 static inline bool
2267 vect_is_slp_load_node (slp_tree root)
2268 {
2269 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2270 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2271 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2272 }
2273
2274
2275 /* Helper function of optimize_load_redistribution that performs the operation
2276 recursively. */
2277
2278 static slp_tree
2279 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2280 vec_info *vinfo, unsigned int group_size,
2281 hash_map<slp_tree, slp_tree> *load_map,
2282 slp_tree root)
2283 {
2284 if (slp_tree *leader = load_map->get (root))
2285 return *leader;
2286
2287 load_map->put (root, NULL);
2288
2289 slp_tree node;
2290 unsigned i;
2291
2292 /* For now, we don't know anything about externals so do not do anything. */
2293 if (SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2294 return NULL;
2295 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2296 {
2297 /* First convert this node into a load node and add it to the leaves
2298 list, flattening the permute from a lane permute into a load permute.
2299 If it's unneeded it will be elided later. */
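/* For example (illustrative), a VEC_PERM_EXPR whose lanes select a[1]
   from one load node and a[0] from another load of the same DR is
   rebuilt here as a single load node with scalar stmts { a[1], a[0] },
   i.e. a permuted load.  */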
2300 vec<stmt_vec_info> stmts;
2301 stmts.create (SLP_TREE_LANES (root));
2302 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2303 for (unsigned j = 0; j < lane_perm.length (); j++)
2304 {
2305 std::pair<unsigned, unsigned> perm = lane_perm[j];
2306 node = SLP_TREE_CHILDREN (root)[perm.first];
2307
2308 if (!vect_is_slp_load_node (node)
2309 || SLP_TREE_CHILDREN (node).exists ())
2310 {
2311 stmts.release ();
2312 goto next;
2313 }
2314
2315 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2316 }
2317
2318 if (dump_enabled_p ())
2319 dump_printf_loc (MSG_NOTE, vect_location,
2320 "converting stmts on permute node %p\n", root);
2321
2322 bool *matches = XALLOCAVEC (bool, group_size);
2323 poly_uint64 max_nunits = 1;
2324 unsigned tree_size = 0, limit = 1;
2325 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2326 matches, &limit, &tree_size, bst_map);
2327 if (!node)
2328 stmts.release ();
2329
2330 load_map->put (root, node);
2331 return node;
2332 }
2333
2334 next:
2335 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2336 {
2337 slp_tree value
2338 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2339 node);
2340 if (value)
2341 {
2342 SLP_TREE_REF_COUNT (value)++;
2343 SLP_TREE_CHILDREN (root)[i] = value;
2344 vect_free_slp_tree (node);
2345 }
2346 }
2347
2348 return NULL;
2349 }
2350
2351 /* Temporary workaround for loads not being CSEd during SLP build. This
2352 function will traverse the SLP tree rooted in ROOT and find
2353 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2354 same DR such that the final operation is equal to a permuted load. Such
2355 NODES are then directly converted into LOADS themselves. The nodes are
2356 CSEd using BST_MAP. */
2357
2358 static void
2359 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2360 vec_info *vinfo, unsigned int group_size,
2361 hash_map<slp_tree, slp_tree> *load_map,
2362 slp_tree root)
2363 {
2364 slp_tree node;
2365 unsigned i;
2366
2367 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2368 {
2369 slp_tree value
2370 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2371 node);
2372 if (value)
2373 {
2374 SLP_TREE_REF_COUNT (value)++;
2375 SLP_TREE_CHILDREN (root)[i] = value;
2376 vect_free_slp_tree (node);
2377 }
2378 }
2379 }
2380
2381 /* Helper function of vect_match_slp_patterns.
2382
2383 Attempts to match patterns against the slp tree rooted in REF_NODE using
2384 VINFO. Patterns are matched in post-order traversal.
2385
2386 If any pattern matches, the node referenced by REF_NODE is replaced in
2387 place and true is returned, otherwise false is returned. */
2388
2389 static bool
2390 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2391 slp_tree_to_load_perm_map_t *perm_cache,
2392 hash_set<slp_tree> *visited)
2393 {
2394 unsigned i;
2395 slp_tree node = *ref_node;
2396 bool found_p = false;
2397 if (!node || visited->add (node))
2398 return false;
2399
2400 slp_tree child;
2401 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2402 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2403 vinfo, perm_cache, visited);
2404
2405 for (unsigned x = 0; x < num__slp_patterns; x++)
2406 {
2407 vect_pattern *pattern = slp_patterns[x] (perm_cache, ref_node);
2408 if (pattern)
2409 {
2410 pattern->build (vinfo);
2411 delete pattern;
2412 found_p = true;
2413 }
2414 }
2415
2416 return found_p;
2417 }
2418
2419 /* Applies pattern matching to the SLP tree of INSTANCE using vec_info
2420 VINFO.
2421
2422 The tree is modified in place and true is returned if any pattern
2423 matched. Patterns are tried in order and multiple patterns may match. */
2424
2425 static bool
2426 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2427 hash_set<slp_tree> *visited,
2428 slp_tree_to_load_perm_map_t *perm_cache)
2429 {
2430 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2431 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2432
2433 if (dump_enabled_p ())
2434 dump_printf_loc (MSG_NOTE, vect_location,
2435 "Analyzing SLP tree %p for patterns\n",
2436 SLP_INSTANCE_TREE (instance));
2437
2438 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
2439 }
2440
2441 /* Analyze an SLP instance starting from a group of grouped stores. Call
2442 vect_build_slp_tree to build a tree of packed stmts if possible.
2443 Return FALSE if it's impossible to SLP any stmt in the loop. */
2444
2445 static bool
2446 vect_analyze_slp_instance (vec_info *vinfo,
2447 scalar_stmts_to_slp_tree_map_t *bst_map,
2448 stmt_vec_info stmt_info, slp_instance_kind kind,
2449 unsigned max_tree_size, unsigned *limit);
2450
2451 /* Analyze an SLP instance starting from SCALAR_STMTS, which form a group
2452 of kind KIND. Return true if successful. */
2453
2454 static bool
2455 vect_build_slp_instance (vec_info *vinfo,
2456 slp_instance_kind kind,
2457 vec<stmt_vec_info> &scalar_stmts,
2458 stmt_vec_info root_stmt_info,
2459 unsigned max_tree_size, unsigned *limit,
2460 scalar_stmts_to_slp_tree_map_t *bst_map,
2461 /* ??? We need stmt_info for group splitting. */
2462 stmt_vec_info stmt_info_)
2463 {
2464 if (dump_enabled_p ())
2465 {
2466 dump_printf_loc (MSG_NOTE, vect_location,
2467 "Starting SLP discovery for\n");
2468 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
2469 dump_printf_loc (MSG_NOTE, vect_location,
2470 " %G", scalar_stmts[i]->stmt);
2471 }
2472
2473 /* Build the tree for the SLP instance. */
2474 unsigned int group_size = scalar_stmts.length ();
2475 bool *matches = XALLOCAVEC (bool, group_size);
2476 poly_uint64 max_nunits = 1;
2477 unsigned tree_size = 0;
2478 unsigned i;
2479 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2480 &max_nunits, matches, limit,
2481 &tree_size, bst_map);
2482 if (node != NULL)
2483 {
2484 /* Calculate the unrolling factor based on the smallest type. */
2485 poly_uint64 unrolling_factor
2486 = calculate_unrolling_factor (max_nunits, group_size);
2487
2488 if (maybe_ne (unrolling_factor, 1U)
2489 && is_a <bb_vec_info> (vinfo))
2490 {
2491 unsigned HOST_WIDE_INT const_max_nunits;
2492 if (!max_nunits.is_constant (&const_max_nunits)
2493 || const_max_nunits > group_size)
2494 {
2495 if (dump_enabled_p ())
2496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2497 "Build SLP failed: store group "
2498 "size not a multiple of the vector size "
2499 "in basic block SLP\n");
2500 vect_free_slp_tree (node);
2501 return false;
2502 }
2503 /* Mark a fatal mismatch at the last vector-size boundary so the caller splits the group there. */
2504 if (dump_enabled_p ())
2505 dump_printf_loc (MSG_NOTE, vect_location,
2506 "SLP discovery succeeded but node needs "
2507 "splitting\n");
2508 memset (matches, true, group_size);
2509 matches[group_size / const_max_nunits * const_max_nunits] = false;
2510 vect_free_slp_tree (node);
2511 }
2512 else
2513 {
2514 /* Create a new SLP instance. */
2515 slp_instance new_instance = XNEW (class _slp_instance);
2516 SLP_INSTANCE_TREE (new_instance) = node;
2517 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2518 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2519 SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
2520 SLP_INSTANCE_KIND (new_instance) = kind;
2521 new_instance->reduc_phis = NULL;
2522 new_instance->cost_vec = vNULL;
2523 new_instance->subgraph_entries = vNULL;
2524
2525 if (dump_enabled_p ())
2526 dump_printf_loc (MSG_NOTE, vect_location,
2527 "SLP size %u vs. limit %u.\n",
2528 tree_size, max_tree_size);
2529
2530 /* Fixup SLP reduction chains. */
2531 if (kind == slp_inst_kind_reduc_chain)
2532 {
2533 /* If this is a reduction chain with a conversion in front
2534 amend the SLP tree with a node for that. */
2535 gimple *scalar_def
2536 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2537 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
2538 {
2539 /* Get at the conversion stmt - we know it's the single use
2540 of the last stmt of the reduction chain. */
2541 use_operand_p use_p;
2542 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
2543 &use_p, &scalar_def);
2544 gcc_assert (r);
2545 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
2546 next_info = vect_stmt_to_vectorize (next_info);
2547 scalar_stmts = vNULL;
2548 scalar_stmts.create (group_size);
2549 for (unsigned i = 0; i < group_size; ++i)
2550 scalar_stmts.quick_push (next_info);
2551 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2552 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2553 SLP_TREE_CHILDREN (conv).quick_push (node);
2554 SLP_INSTANCE_TREE (new_instance) = conv;
2555 /* We also have to fake this conversion stmt as SLP reduction
2556 group so we don't have to mess with too much code
2557 elsewhere. */
2558 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2559 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2560 }
2561 /* Fill the backedge child of the PHI SLP node. The
2562 general matching code cannot find it because the
2563 scalar code does not reflect how we vectorize the
2564 reduction. */
2565 use_operand_p use_p;
2566 imm_use_iterator imm_iter;
2567 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
2568 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
2569 gimple_get_lhs (scalar_def))
2570 /* There are exactly two non-debug uses, the reduction
2571 PHI and the loop-closed PHI node. */
2572 if (!is_gimple_debug (USE_STMT (use_p))
2573 && gimple_bb (USE_STMT (use_p)) == loop->header)
2574 {
2575 auto_vec<stmt_vec_info, 64> phis (group_size);
2576 stmt_vec_info phi_info
2577 = vinfo->lookup_stmt (USE_STMT (use_p));
2578 for (unsigned i = 0; i < group_size; ++i)
2579 phis.quick_push (phi_info);
2580 slp_tree *phi_node = bst_map->get (phis);
2581 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
2582 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
2583 = SLP_INSTANCE_TREE (new_instance);
2584 SLP_INSTANCE_TREE (new_instance)->refcnt++;
2585 }
2586 }
2587
2588 vinfo->slp_instances.safe_push (new_instance);
2589
2590 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2591 the number of scalar stmts in the root in a few places.
2592 Verify that assumption holds. */
2593 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2594 .length () == group_size);
2595
2596 if (dump_enabled_p ())
2597 {
2598 dump_printf_loc (MSG_NOTE, vect_location,
2599 "Final SLP tree for instance %p:\n", new_instance);
2600 vect_print_slp_graph (MSG_NOTE, vect_location,
2601 SLP_INSTANCE_TREE (new_instance));
2602 }
2603
2604 return true;
2605 }
2606 }
2607 else
2608 {
2609 /* Failed to SLP. */
2610 /* Free the allocated memory. */
2611 scalar_stmts.release ();
2612 }
2613
2614 stmt_vec_info stmt_info = stmt_info_;
2615 /* Try to break the group up into pieces. */
2616 if (kind == slp_inst_kind_store)
2617 {
2618 /* ??? We could delay all the actual splitting of store-groups
2619 until after SLP discovery of the original group completed.
2620 Then we can recurse to vect_build_slp_instance directly. */
2621 for (i = 0; i < group_size; i++)
2622 if (!matches[i])
2623 break;
2624
2625 /* For basic block SLP, try to break the group up into multiples of
2626 a vector size. */
2627 if (is_a <bb_vec_info> (vinfo)
2628 && (i > 1 && i < group_size))
2629 {
2630 tree scalar_type
2631 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2632 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2633 1 << floor_log2 (i));
2634 unsigned HOST_WIDE_INT const_nunits;
2635 if (vectype
2636 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2637 {
2638 /* Split into two groups at the first vector boundary. */
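/* E.g. (illustrative) with CONST_NUNITS 4 and the first mismatch at
   stmt I == 6, GROUP1_SIZE below becomes 4.  */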
2639 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2640 unsigned group1_size = i & ~(const_nunits - 1);
2641
2642 if (dump_enabled_p ())
2643 dump_printf_loc (MSG_NOTE, vect_location,
2644 "Splitting SLP group at stmt %u\n", i);
2645 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2646 group1_size);
2647 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2648 kind, max_tree_size,
2649 limit);
2650 /* Split the rest at the failure point and possibly
2651 re-analyze the remaining matching part if it has
2652 at least two lanes. */
2653 if (group1_size < i
2654 && (i + 1 < group_size
2655 || i - group1_size > 1))
2656 {
2657 stmt_vec_info rest2 = rest;
2658 rest = vect_split_slp_store_group (rest, i - group1_size);
2659 if (i - group1_size > 1)
2660 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
2661 kind, max_tree_size,
2662 limit);
2663 }
2664 /* Re-analyze the non-matching tail if it has at least
2665 two lanes. */
2666 if (i + 1 < group_size)
2667 res |= vect_analyze_slp_instance (vinfo, bst_map,
2668 rest, kind, max_tree_size,
2669 limit);
2670 return res;
2671 }
2672 }
2673
2674 /* For loop vectorization split into arbitrary pieces of size > 1. */
2675 if (is_a <loop_vec_info> (vinfo)
2676 && (i > 1 && i < group_size))
2677 {
2678 unsigned group1_size = i;
2679
2680 if (dump_enabled_p ())
2681 dump_printf_loc (MSG_NOTE, vect_location,
2682 "Splitting SLP group at stmt %u\n", i);
2683
2684 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2685 group1_size);
2686 /* Loop vectorization cannot handle gaps in stores; make sure
2687 the split group appears as strided. */
2688 STMT_VINFO_STRIDED_P (rest) = 1;
2689 DR_GROUP_GAP (rest) = 0;
2690 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2691 DR_GROUP_GAP (stmt_info) = 0;
2692
2693 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2694 kind, max_tree_size, limit);
2695 if (i + 1 < group_size)
2696 res |= vect_analyze_slp_instance (vinfo, bst_map,
2697 rest, kind, max_tree_size, limit);
2698
2699 return res;
2700 }
2701
2702 /* Even though the first vector did not all match, we might be able to SLP
2703 (some) of the remainder. FORNOW ignore this possibility. */
2704 }
2705
2706 /* Failed to SLP. */
2707 if (dump_enabled_p ())
2708 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2709 return false;
2710 }
2711
2712
2713 /* Analyze an SLP instance starting from a group of grouped stores. Call
2714 vect_build_slp_tree to build a tree of packed stmts if possible.
2715 Return FALSE if it's impossible to SLP any stmt in the loop. */
2716
2717 static bool
2718 vect_analyze_slp_instance (vec_info *vinfo,
2719 scalar_stmts_to_slp_tree_map_t *bst_map,
2720 stmt_vec_info stmt_info,
2721 slp_instance_kind kind,
2722 unsigned max_tree_size, unsigned *limit)
2723 {
2724 unsigned int i;
2725 vec<stmt_vec_info> scalar_stmts;
2726
2727 if (is_a <bb_vec_info> (vinfo))
2728 vect_location = stmt_info->stmt;
2729
2730 stmt_vec_info next_info = stmt_info;
2731 if (kind == slp_inst_kind_store)
2732 {
2733 /* Collect the stores and store them in scalar_stmts. */
2734 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
2735 while (next_info)
2736 {
2737 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2738 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2739 }
2740 }
2741 else if (kind == slp_inst_kind_reduc_chain)
2742 {
2743 /* Collect the reduction stmts and store them in scalar_stmts. */
2744 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
2745 while (next_info)
2746 {
2747 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2748 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2749 }
2750 /* Mark the first element of the reduction chain as reduction to properly
2751 transform the node. In the reduction analysis phase only the last
2752 element of the chain is marked as reduction. */
2753 STMT_VINFO_DEF_TYPE (stmt_info)
2754 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2755 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2756 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2757 }
2758 else if (kind == slp_inst_kind_ctor)
2759 {
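/* The group is formed from the definition stmts of the CONSTRUCTOR
   elements, e.g. (illustratively) for v = { a0, a1, a2, a3 } the four
   stmts defining a0 ... a3.  */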
2760 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2761 tree val;
2762 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
2763 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2764 {
2765 stmt_vec_info def_info = vinfo->lookup_def (val);
2766 def_info = vect_stmt_to_vectorize (def_info);
2767 scalar_stmts.quick_push (def_info);
2768 }
2769 if (dump_enabled_p ())
2770 dump_printf_loc (MSG_NOTE, vect_location,
2771 "Analyzing vectorizable constructor: %G\n",
2772 stmt_info->stmt);
2773 }
2774 else if (kind == slp_inst_kind_reduc_group)
2775 {
2776 /* Collect reduction statements. */
2777 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2778 scalar_stmts.create (reductions.length ());
2779 for (i = 0; reductions.iterate (i, &next_info); i++)
2780 if (STMT_VINFO_RELEVANT_P (next_info)
2781 || STMT_VINFO_LIVE_P (next_info))
2782 scalar_stmts.quick_push (next_info);
2783 /* If fewer than two were relevant/live there's nothing to SLP. */
2784 if (scalar_stmts.length () < 2)
2785 return false;
2786 }
2787 else
2788 gcc_unreachable ();
2789
2790 /* Build the tree for the SLP instance. */
2791 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
2792 kind == slp_inst_kind_ctor
2793 ? stmt_info : NULL,
2794 max_tree_size, limit, bst_map,
2795 kind == slp_inst_kind_store
2796 ? stmt_info : NULL);
2797
2798 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2799 where we should do store group splitting. */
2800
2801 return res;
2802 }
2803
2804 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2805 trees of packed scalar stmts if SLP is possible. */
2806
2807 opt_result
2808 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2809 {
2810 unsigned int i;
2811 stmt_vec_info first_element;
2812 slp_instance instance;
2813
2814 DUMP_VECT_SCOPE ("vect_analyze_slp");
2815
2816 unsigned limit = max_tree_size;
2817
2818 scalar_stmts_to_slp_tree_map_t *bst_map
2819 = new scalar_stmts_to_slp_tree_map_t ();
2820
2821 /* Find SLP sequences starting from groups of grouped stores. */
2822 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2823 vect_analyze_slp_instance (vinfo, bst_map, first_element,
2824 STMT_VINFO_GROUPED_ACCESS (first_element)
2825 ? slp_inst_kind_store : slp_inst_kind_ctor,
2826 max_tree_size, &limit);
2827
2828 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
2829 {
2830 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
2831 {
2832 vect_location = bb_vinfo->roots[i].root->stmt;
2833 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
2834 bb_vinfo->roots[i].stmts,
2835 bb_vinfo->roots[i].root,
2836 max_tree_size, &limit, bst_map, NULL))
2837 bb_vinfo->roots[i].stmts = vNULL;
2838 }
2839 }
2840
2841 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2842 {
2843 /* Find SLP sequences starting from reduction chains. */
2844 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2845 if (! STMT_VINFO_RELEVANT_P (first_element)
2846 && ! STMT_VINFO_LIVE_P (first_element))
2847 ;
2848 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2849 slp_inst_kind_reduc_chain,
2850 max_tree_size, &limit))
2851 {
2852 /* Dissolve reduction chain group. */
2853 stmt_vec_info vinfo = first_element;
2854 stmt_vec_info last = NULL;
2855 while (vinfo)
2856 {
2857 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2858 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2859 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2860 last = vinfo;
2861 vinfo = next;
2862 }
2863 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2864 /* It can be still vectorized as part of an SLP reduction. */
2865 loop_vinfo->reductions.safe_push (last);
2866 }
2867
2868 /* Find SLP sequences starting from groups of reductions. */
2869 if (loop_vinfo->reductions.length () > 1)
2870 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2871 slp_inst_kind_reduc_group, max_tree_size,
2872 &limit);
2873 }
2874
2875 hash_set<slp_tree> visited_patterns;
2876 slp_tree_to_load_perm_map_t perm_cache;
2877 hash_map<slp_tree, slp_tree> load_map;
2878
2879 /* See if any patterns can be found in the SLP tree. */
2880 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
2881 if (vect_match_slp_patterns (instance, vinfo, &visited_patterns,
2882 &perm_cache))
2883 {
2884 slp_tree root = SLP_INSTANCE_TREE (instance);
2885 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
2886 &load_map, root);
2887 if (dump_enabled_p ())
2888 {
2889 dump_printf_loc (MSG_NOTE, vect_location,
2890 "Pattern matched SLP tree\n");
2891 vect_print_slp_graph (MSG_NOTE, vect_location, root);
2892 }
2893 }
2894
2895
2896
2897 /* The map keeps a reference on SLP nodes built, release that. */
2898 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2899 it != bst_map->end (); ++it)
2900 if ((*it).second)
2901 vect_free_slp_tree ((*it).second);
2902 delete bst_map;
2903
2904 return opt_result::success ();
2905 }
2906
2907 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2908
2909 static void
2910 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2911 vec<slp_tree> &vertices, vec<int> &leafs)
2912 {
2913 unsigned i;
2914 slp_tree child;
2915
2916 if (visited.add (node))
2917 return;
2918
2919 node->vertex = vertices.length ();
2920 vertices.safe_push (node);
2921
2922 bool leaf = true;
2923 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2924 if (child)
2925 {
2926 leaf = false;
2927 vect_slp_build_vertices (visited, child, vertices, leafs);
2928 }
2929 if (leaf)
2930 leafs.safe_push (node->vertex);
2931 }
2932
2933 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2934
2935 static void
2936 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2937 vec<int> &leafs)
2938 {
2939 hash_set<slp_tree> visited;
2940 unsigned i;
2941 slp_instance instance;
2942 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2943 {
2944 unsigned n_v = vertices.length ();
2945 unsigned n_l = leafs.length ();
2946 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2947 leafs);
2948 /* If we added vertices but no entries to the reverse graph we've
2949 added a cycle that is not backwards-reachable. Push the entry
2950 so it mimics a leaf then. */
2951 if (vertices.length () > n_v
2952 && leafs.length () == n_l)
2953 leafs.safe_push (SLP_INSTANCE_TREE (instance)->vertex);
2954 }
2955 }
2956
2957 /* Apply (reverse) bijective PERM to VEC. */
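/* For example PERM = { 2, 0, 1 } applied forward yields
   vec[i] = old[perm[i]], i.e. { v2, v0, v1 }; with REVERSE it scatters
   instead, vec[perm[i]] = old[i], which is the inverse permutation.  */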
2958
2959 template <class T>
2960 static void
2961 vect_slp_permute (vec<unsigned> perm,
2962 vec<T> &vec, bool reverse)
2963 {
2964 auto_vec<T, 64> saved;
2965 saved.create (vec.length ());
2966 for (unsigned i = 0; i < vec.length (); ++i)
2967 saved.quick_push (vec[i]);
2968
2969 if (reverse)
2970 {
2971 for (unsigned i = 0; i < vec.length (); ++i)
2972 vec[perm[i]] = saved[i];
2973 for (unsigned i = 0; i < vec.length (); ++i)
2974 gcc_assert (vec[perm[i]] == saved[i]);
2975 }
2976 else
2977 {
2978 for (unsigned i = 0; i < vec.length (); ++i)
2979 vec[i] = saved[perm[i]];
2980 for (unsigned i = 0; i < vec.length (); ++i)
2981 gcc_assert (vec[i] == saved[perm[i]]);
2982 }
2983 }
2984
2985 /* Return whether permutations PERM_A and PERM_B as recorded in the
2986 PERMS vector are equal. */
2987
2988 static bool
2989 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
2990 int perm_a, int perm_b)
2991 {
2992 return (perm_a == perm_b
2993 || (perms[perm_a].length () == perms[perm_b].length ()
2994 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
2995 sizeof (unsigned) * perms[perm_a].length ()) == 0));
2996 }
2997
2998 /* Optimize the SLP graph of VINFO. */
2999
3000 void
3001 vect_optimize_slp (vec_info *vinfo)
3002 {
3003 if (vinfo->slp_instances.is_empty ())
3004 return;
3005
3006 slp_tree node;
3007 unsigned i;
3008 auto_vec<slp_tree> vertices;
3009 auto_vec<int> leafs;
3010 vect_slp_build_vertices (vinfo, vertices, leafs);
3011
3012 struct graph *slpg = new_graph (vertices.length ());
3013 FOR_EACH_VEC_ELT (vertices, i, node)
3014 {
3015 unsigned j;
3016 slp_tree child;
3017 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3018 if (child)
3019 add_edge (slpg, i, child->vertex);
3020 }
3021
3022 /* Compute (reverse) postorder on the inverted graph. */
3023 auto_vec<int> ipo;
3024 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3025
3026 auto_sbitmap n_visited (vertices.length ());
3027 auto_sbitmap n_materialize (vertices.length ());
3028 auto_vec<int> n_perm (vertices.length ());
3029 auto_vec<vec<unsigned> > perms;
3030
3031 bitmap_clear (n_visited);
3032 bitmap_clear (n_materialize);
3033 n_perm.quick_grow_cleared (vertices.length ());
3034 perms.safe_push (vNULL); /* zero is no permute */
3035
3036 /* Produce initial permutations. */
3037 for (i = 0; i < leafs.length (); ++i)
3038 {
3039 int idx = leafs[i];
3040 slp_tree node = vertices[idx];
3041
3042 /* Handle externals and constants optimistically throughout the
3043 iteration. */
3044 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3045 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3046 continue;
3047
3048 /* Leafs do not change across iterations. Note leafs also double
3049 as entries to the reverse graph. */
3050 if (!slpg->vertices[idx].succ)
3051 bitmap_set_bit (n_visited, idx);
3052 /* Loads are the only thing generating permutes. */
3053 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3054 continue;
3055
3056 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3057 node unpermuted, record this permute. */
3058 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3059 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3060 continue;
3061 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3062 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3063 bool any_permute = false;
3064 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3065 {
3066 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3067 imin = MIN (imin, idx);
3068 imax = MAX (imax, idx);
3069 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3070 any_permute = true;
3071 }
3072 /* If there's no permute no need to split one out. */
3073 if (!any_permute)
3074 continue;
3075 /* If the span doesn't match we'd disrupt VF computation, avoid
3076 that for now. */
3077 if (imax - imin + 1 != SLP_TREE_LANES (node))
3078 continue;
3079
3080 /* For now only handle true permutes, like
3081 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3082 when permuting constants and invariants keeping the permute
3083 bijective. */
3084 auto_sbitmap load_index (SLP_TREE_LANES (node));
3085 bitmap_clear (load_index);
3086 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3087 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3088 unsigned j;
3089 for (j = 0; j < SLP_TREE_LANES (node); ++j)
3090 if (!bitmap_bit_p (load_index, j))
3091 break;
3092 if (j != SLP_TREE_LANES (node))
3093 continue;
3094
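/* Record the permutation relative to the group start, e.g. a load
   permutation { 5, 4, 7, 6 } with IMIN 4 is recorded as { 1, 0, 3, 2 }.  */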
3095 vec<unsigned> perm = vNULL;
3096 perm.safe_grow (SLP_TREE_LANES (node), true);
3097 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3098 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
3099 perms.safe_push (perm);
3100 n_perm[idx] = perms.length () - 1;
3101 }
3102
3103 /* Propagate permutes along the graph and compute materialization points. */
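/* This is a simple fixed-point iteration: a node's permutation becomes
   the common permutation of its children (or none if they disagree),
   and a permutation is materialized at a node whenever one of its users
   expects a different permutation.  */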
3104 bool changed;
3105 unsigned iteration = 0;
3106 do
3107 {
3108 changed = false;
3109 ++iteration;
3110
3111 for (i = vertices.length (); i > 0 ; --i)
3112 {
3113 int idx = ipo[i-1];
3114 slp_tree node = vertices[idx];
3115 /* For leafs there's nothing to do - we've seeded permutes
3116 on those above. */
3117 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3118 continue;
3119
3120 bitmap_set_bit (n_visited, idx);
3121
3122 /* We cannot move a permute across a store. */
3123 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
3124 && DR_IS_WRITE
3125 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
3126 continue;
3127
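/* Compute the meet of the children's permutations: -1 means not yet
   constrained, 0 means no common permutation.  */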
3128 int perm = -1;
3129 for (graph_edge *succ = slpg->vertices[idx].succ;
3130 succ; succ = succ->succ_next)
3131 {
3132 int succ_idx = succ->dest;
3133 /* Handle unvisited nodes optimistically. */
3134 /* ??? But for constants once we want to handle non-bijective
3135 permutes we have to verify the permute, when unifying lanes,
3136 will not unify different constants. For example see
3137 gcc.dg/vect/bb-slp-14.c for a case that would break. */
3138 if (!bitmap_bit_p (n_visited, succ_idx))
3139 continue;
3140 int succ_perm = n_perm[succ_idx];
3141 /* Once we materialize the successor's permutation its output lanes
3142 appear unpermuted to us. */
3143 if (bitmap_bit_p (n_materialize, succ_idx))
3144 succ_perm = 0;
3145 if (perm == -1)
3146 perm = succ_perm;
3147 else if (succ_perm == 0)
3148 {
3149 perm = 0;
3150 break;
3151 }
3152 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
3153 {
3154 perm = 0;
3155 break;
3156 }
3157 }
3158
3159 if (perm == -1)
3160 /* Pick up pre-computed leaf values. */
3161 perm = n_perm[idx];
3162 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
3163 {
3164 if (iteration > 1)
3165 /* Make sure we eventually converge. */
3166 gcc_checking_assert (perm == 0);
3167 n_perm[idx] = perm;
3168 if (perm == 0)
3169 bitmap_clear_bit (n_materialize, idx);
3170 changed = true;
3171 }
3172
3173 if (perm == 0)
3174 continue;
3175
3176 /* Elide pruning at materialization points in the first
3177 iteration so that every node is visited at least once. */
3178 if (iteration == 1)
3179 continue;
3180
3181 /* Decide on permute materialization. Look whether there's
3182 a use (pred) edge that is permuted differently than us.
3183 In that case mark ourselves so the permutation is applied.
3184 For VEC_PERM_EXPRs the permutation doesn't carry along
3185 from children to parents so force materialization at the
3186 point of the VEC_PERM_EXPR. In principle VEC_PERM_EXPRs
3187 are a source of an arbitrary permutation again, similar
3188 to constants/externals - that's something we do not yet
3189 optimally handle. */
3190 bool all_preds_permuted = (SLP_TREE_CODE (node) != VEC_PERM_EXPR
3191 && slpg->vertices[idx].pred != NULL);
3192 if (all_preds_permuted)
3193 for (graph_edge *pred = slpg->vertices[idx].pred;
3194 pred; pred = pred->pred_next)
3195 {
3196 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
3197 int pred_perm = n_perm[pred->src];
3198 if (!vect_slp_perms_eq (perms, perm, pred_perm))
3199 {
3200 all_preds_permuted = false;
3201 break;
3202 }
3203 }
3204 if (!all_preds_permuted)
3205 {
3206 if (!bitmap_bit_p (n_materialize, idx))
3207 changed = true;
3208 bitmap_set_bit (n_materialize, idx);
3209 }
3210 }
3211 }
3212 while (changed || iteration == 1);
3213
3214 /* Materialize. */
3215 for (i = 0; i < vertices.length (); ++i)
3216 {
3217 int perm = n_perm[i];
3218 if (perm <= 0)
3219 continue;
3220
3221 slp_tree node = vertices[i];
3222
3223 /* First permute invariant/external original successors. */
3224 unsigned j;
3225 slp_tree child;
3226 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3227 {
3228 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3229 continue;
3230
3231 /* If the vector is uniform there's nothing to do. */
3232 if (vect_slp_tree_uniform_p (child))
3233 continue;
3234
3235 /* We can end up sharing some externals via two_operator
3236 handling. Be prepared to unshare those. */
3237 if (child->refcnt != 1)
3238 {
3239 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3240 SLP_TREE_CHILDREN (node)[j] = child
3241 = vect_create_new_slp_node
3242 (SLP_TREE_SCALAR_OPS (child).copy ());
3243 }
3244 vect_slp_permute (perms[perm],
3245 SLP_TREE_SCALAR_OPS (child), true);
3246 }
3247
3248 if (bitmap_bit_p (n_materialize, i))
3249 {
3250 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3251 /* For loads simply drop the permutation, the load permutation
3252 already performs the desired permutation. */
3253 ;
3254 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3255 {
3256 /* If the node is already a permute node we can apply
3257 the permutation to the lane selection, effectively
3258 materializing it on the incoming vectors. */
3259 if (dump_enabled_p ())
3260 dump_printf_loc (MSG_NOTE, vect_location,
3261 "simplifying permute node %p\n",
3262 node);
3263
3264 for (unsigned k = 0;
3265 k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3266 SLP_TREE_LANE_PERMUTATION (node)[k].second
3267 = perms[perm][SLP_TREE_LANE_PERMUTATION (node)[k].second];
3268 }
3269 else
3270 {
3271 if (dump_enabled_p ())
3272 dump_printf_loc (MSG_NOTE, vect_location,
3273 "inserting permute node in place of %p\n",
3274 node);
3275
3276 /* Make a copy of NODE and in-place change it to a
3277 VEC_PERM node to permute the lanes of the copy. */
3278 slp_tree copy = new _slp_tree;
3279 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3280 SLP_TREE_CHILDREN (node) = vNULL;
3281 SLP_TREE_SCALAR_STMTS (copy)
3282 = SLP_TREE_SCALAR_STMTS (node).copy ();
3283 vect_slp_permute (perms[perm],
3284 SLP_TREE_SCALAR_STMTS (copy), true);
3285 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3286 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3287 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3288 SLP_TREE_LANE_PERMUTATION (copy)
3289 = SLP_TREE_LANE_PERMUTATION (node);
3290 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3291 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3292 copy->refcnt = 1;
3293 copy->max_nunits = node->max_nunits;
3294 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3295 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3296 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3297
3298 /* Now turn NODE into a VEC_PERM. */
3299 SLP_TREE_CHILDREN (node).safe_push (copy);
3300 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3301 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3302 SLP_TREE_LANE_PERMUTATION (node)
3303 .quick_push (std::make_pair (0, perms[perm][j]));
3304 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3305 }
3306 }
3307 else
3308 {
3309 /* Apply the reverse permutation to our stmts. */
3310 vect_slp_permute (perms[perm],
3311 SLP_TREE_SCALAR_STMTS (node), true);
3312 /* And to the load permutation, which we can simply
3313 make regular by design. */
3314 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3315 {
3316 /* ??? When we handle non-bijective permutes the idea
3317 is that we can force the load-permutation to be
3318 { min, min + 1, min + 2, ... max }. But then the
3319 scalar defs might no longer match the lane content
3320 which means wrong-code with live lane vectorization.
3321 So we possibly have to have NULL entries for those. */
3322 vect_slp_permute (perms[perm],
3323 SLP_TREE_LOAD_PERMUTATION (node), true);
3324 }
3325 }
3326 }
3327
3328 /* Free the perms vector used for propagation. */
3329 while (!perms.is_empty ())
3330 perms.pop ().release ();
3331 free_graph (slpg);
3332
3333
3334 /* Now elide load permutations that are not necessary. */
3335 for (i = 0; i < leafs.length (); ++i)
3336 {
3337 node = vertices[leafs[i]];
3338 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3339 continue;
3340
3341 /* In basic block vectorization we allow any subchain of an interleaving
3342 chain.
3343 FORNOW: not in loop SLP because of realignment complications. */
3344 if (is_a <bb_vec_info> (vinfo))
3345 {
3346 bool subchain_p = true;
3347 stmt_vec_info next_load_info = NULL;
3348 stmt_vec_info load_info;
3349 unsigned j;
3350 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3351 {
3352 if (j != 0
3353 && (next_load_info != load_info
3354 || DR_GROUP_GAP (load_info) != 1))
3355 {
3356 subchain_p = false;
3357 break;
3358 }
3359 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3360 }
3361 if (subchain_p)
3362 {
3363 SLP_TREE_LOAD_PERMUTATION (node).release ();
3364 continue;
3365 }
3366 }
3367 else
3368 {
3369 stmt_vec_info load_info;
3370 bool this_load_permuted = false;
3371 unsigned j;
3372 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3373 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3374 {
3375 this_load_permuted = true;
3376 break;
3377 }
3378 stmt_vec_info first_stmt_info
3379 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3380 if (!this_load_permuted
3381 /* The load requires permutation when unrolling exposes
3382 a gap either because the group is larger than the SLP
3383 group-size or because there is a gap between the groups. */
3384 && (known_eq (LOOP_VINFO_VECT_FACTOR
3385 (as_a <loop_vec_info> (vinfo)), 1U)
3386 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3387 && DR_GROUP_GAP (first_stmt_info) == 0)))
3388 {
3389 SLP_TREE_LOAD_PERMUTATION (node).release ();
3390 continue;
3391 }
3392 }
3393 }
3394 }
3395
3396 /* Gather loads reachable from the individual SLP graph entries. */
3397
3398 void
3399 vect_gather_slp_loads (vec_info *vinfo)
3400 {
3401 unsigned i;
3402 slp_instance instance;
3403 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
3404 {
3405 hash_set<slp_tree> visited;
3406 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
3407 SLP_INSTANCE_TREE (instance), visited);
3408 }
3409 }
3410
3411
3412 /* For each possible SLP instance decide whether to SLP it and calculate overall
3413 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3414 least one instance. */
3415
3416 bool
3417 vect_make_slp_decision (loop_vec_info loop_vinfo)
3418 {
3419 unsigned int i;
3420 poly_uint64 unrolling_factor = 1;
3421 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3422 slp_instance instance;
3423 int decided_to_slp = 0;
3424
3425 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3426
3427 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3428 {
3429 /* FORNOW: SLP if you can. */
3430 /* All unroll factors have the form:
3431
3432 GET_MODE_SIZE (vinfo->vector_mode) * X
3433
3434 for some rational X, so they must have a common multiple. */
3435 unrolling_factor
3436 = force_common_multiple (unrolling_factor,
3437 SLP_INSTANCE_UNROLLING_FACTOR (instance));
3438
3439 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3440 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3441 loop-based vectorization. Such stmts will be marked as HYBRID. */
3442 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3443 decided_to_slp++;
3444 }
3445
3446 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
3447
3448 if (decided_to_slp && dump_enabled_p ())
3449 {
3450 dump_printf_loc (MSG_NOTE, vect_location,
3451 "Decided to SLP %d instances. Unrolling factor ",
3452 decided_to_slp);
3453 dump_dec (MSG_NOTE, unrolling_factor);
3454 dump_printf (MSG_NOTE, "\n");
3455 }
3456
3457 return (decided_to_slp > 0);
3458 }
3459
3460 /* Private data for vect_detect_hybrid_slp. */
3461 struct vdhs_data
3462 {
3463 loop_vec_info loop_vinfo;
3464 vec<stmt_vec_info> *worklist;
3465 };
3466
3467 /* Walker for walk_gimple_op. */
3468
3469 static tree
3470 vect_detect_hybrid_slp (tree *tp, int *, void *data)
3471 {
3472 walk_stmt_info *wi = (walk_stmt_info *)data;
3473 vdhs_data *dat = (vdhs_data *)wi->info;
3474
3475 if (wi->is_lhs)
3476 return NULL_TREE;
3477
3478 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
3479 if (!def_stmt_info)
3480 return NULL_TREE;
3481 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3482 if (PURE_SLP_STMT (def_stmt_info))
3483 {
3484 if (dump_enabled_p ())
3485 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3486 def_stmt_info->stmt);
3487 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3488 dat->worklist->safe_push (def_stmt_info);
3489 }
3490
3491 return NULL_TREE;
3492 }
3493
3494 /* Check whether STMT_INFO is (indirectly) consumed only by SLP and mark it
3495 pure_slp if so, otherwise push it to WORKLIST. */
3496
3497 static void
3498 maybe_push_to_hybrid_worklist (vec_info *vinfo,
3499 vec<stmt_vec_info> &worklist,
3500 stmt_vec_info stmt_info)
3501 {
3502 if (dump_enabled_p ())
3503 dump_printf_loc (MSG_NOTE, vect_location,
3504 "Processing hybrid candidate : %G", stmt_info->stmt);
3505 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
3506 imm_use_iterator iter2;
3507 ssa_op_iter iter1;
3508 use_operand_p use_p;
3509 def_operand_p def_p;
3510 bool any_def = false;
3511 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
3512 {
3513 any_def = true;
3514 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
3515 {
3516 if (is_gimple_debug (USE_STMT (use_p)))
3517 continue;
3518 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
3519 /* An out-of-loop use means this is a loop_vect sink. */
3520 if (!use_info)
3521 {
3522 if (dump_enabled_p ())
3523 dump_printf_loc (MSG_NOTE, vect_location,
3524 "Found loop_vect sink: %G", stmt_info->stmt);
3525 worklist.safe_push (stmt_info);
3526 return;
3527 }
3528 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
3529 {
3530 if (dump_enabled_p ())
3531 dump_printf_loc (MSG_NOTE, vect_location,
3532 "Found loop_vect use: %G", use_info->stmt);
3533 worklist.safe_push (stmt_info);
3534 return;
3535 }
3536 }
3537 }
3538   /* No def means this is a loop_vect sink. */
3539 if (!any_def)
3540 {
3541 if (dump_enabled_p ())
3542 dump_printf_loc (MSG_NOTE, vect_location,
3543 "Found loop_vect sink: %G", stmt_info->stmt);
3544 worklist.safe_push (stmt_info);
3545 return;
3546 }
3547 if (dump_enabled_p ())
3548 dump_printf_loc (MSG_NOTE, vect_location,
3549 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
3550 STMT_SLP_TYPE (stmt_info) = pure_slp;
3551 }
3552
3553 /* Find stmts that must be both vectorized and SLPed. */
3554
3555 void
3556 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3557 {
3558 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3559
3560   /* All stmts participating in SLP are marked pure_slp, all other
3561      stmts are loop_vect.
3562      First collect all loop_vect stmts into a worklist.
3563      SLP patterns cause some original scalar stmts to not appear in
3564      SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
3565      Rectify this here by doing a backward walk over the IL, only considering
3566      stmts loop_vect when they are used by a loop_vect stmt and otherwise
3567      marking them pure_slp. */
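  /* A hypothetical example: if a pure_slp stmt x_1 = a[i] + b[i] feeds an SLP
     store group but its result is also used by a loop_vect reduction
     sum_2 = sum_1 + x_1, the walk below marks x_1's def as hybrid so it is
     code-generated both for the SLP and the loop-based vectorization.  */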
3568 auto_vec<stmt_vec_info> worklist;
3569 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
3570 {
3571 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3572 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3573 gsi_next (&gsi))
3574 {
3575 gphi *phi = gsi.phi ();
3576 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3577 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3578 maybe_push_to_hybrid_worklist (loop_vinfo,
3579 worklist, stmt_info);
3580 }
3581 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
3582 gsi_prev (&gsi))
3583 {
3584 gimple *stmt = gsi_stmt (gsi);
3585 if (is_gimple_debug (stmt))
3586 continue;
3587 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3588 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3589 {
3590 for (gimple_stmt_iterator gsi2
3591 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3592 !gsi_end_p (gsi2); gsi_next (&gsi2))
3593 {
3594 stmt_vec_info patt_info
3595 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3596 if (!STMT_SLP_TYPE (patt_info)
3597 && STMT_VINFO_RELEVANT (patt_info))
3598 maybe_push_to_hybrid_worklist (loop_vinfo,
3599 worklist, patt_info);
3600 }
3601 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3602 }
3603 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3604 maybe_push_to_hybrid_worklist (loop_vinfo,
3605 worklist, stmt_info);
3606 }
3607 }
3608
3609 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3610 mark any SLP vectorized stmt as hybrid.
3611 ??? We're visiting def stmts N times (once for each non-SLP and
3612 once for each hybrid-SLP use). */
3613 walk_stmt_info wi;
3614 vdhs_data dat;
3615 dat.worklist = &worklist;
3616 dat.loop_vinfo = loop_vinfo;
3617 memset (&wi, 0, sizeof (wi));
3618 wi.info = (void *)&dat;
3619 while (!worklist.is_empty ())
3620 {
3621 stmt_vec_info stmt_info = worklist.pop ();
3622 /* Since SSA operands are not set up for pattern stmts we need
3623 to use walk_gimple_op. */
3624 wi.is_lhs = 0;
3625 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3626 }
3627 }
3628
3629
3630 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3631
3632 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3633 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs), roots (vNULL)
3634 {
3635 for (unsigned i = 0; i < bbs.length (); ++i)
3636 {
3637 if (i != 0)
3638 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3639 gsi_next (&si))
3640 {
3641 gphi *phi = si.phi ();
3642 gimple_set_uid (phi, 0);
3643 add_stmt (phi);
3644 }
3645 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3646 !gsi_end_p (gsi); gsi_next (&gsi))
3647 {
3648 gimple *stmt = gsi_stmt (gsi);
3649 gimple_set_uid (stmt, 0);
3650 if (is_gimple_debug (stmt))
3651 continue;
3652 add_stmt (stmt);
3653 }
3654 }
3655 }
3656
3657
3658 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3659    stmts in the region's basic blocks. */
3660
3661 _bb_vec_info::~_bb_vec_info ()
3662 {
3663 /* Reset region marker. */
3664 for (unsigned i = 0; i < bbs.length (); ++i)
3665 {
3666 if (i != 0)
3667 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3668 gsi_next (&si))
3669 {
3670 gphi *phi = si.phi ();
3671 gimple_set_uid (phi, -1);
3672 }
3673 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3674 !gsi_end_p (gsi); gsi_next (&gsi))
3675 {
3676 gimple *stmt = gsi_stmt (gsi);
3677 gimple_set_uid (stmt, -1);
3678 }
3679 }
3680
3681 for (unsigned i = 0; i < roots.length (); ++i)
3682 roots[i].stmts.release ();
3683 roots.release ();
3684 }
3685
3686 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3687    given that its child nodes have already been processed, and that
3688 their def types currently match their SLP node's def type. */
3689
3690 static bool
3691 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3692 slp_instance node_instance,
3693 stmt_vector_for_cost *cost_vec)
3694 {
3695 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3696 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3697
3698 /* Calculate the number of vector statements to be created for the
3699 scalar stmts in this node. For SLP reductions it is equal to the
3700 number of vector statements in the children (which has already been
3701 calculated by the recursive call). Otherwise it is the number of
3702 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3703 VF divided by the number of elements in a vector. */
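  /* For example (hypothetical numbers): with a group size of 4, a VF of 8 and
     a 4-element vector type the non-reduction case below yields
     4 * 8 / 4 = 8 vector statements.  */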
3704 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3705 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3706 {
3707 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3708 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3709 {
3710 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3711 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3712 break;
3713 }
3714 }
3715 else
3716 {
3717 poly_uint64 vf;
3718 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3719 vf = loop_vinfo->vectorization_factor;
3720 else
3721 vf = 1;
3722 unsigned int group_size = SLP_TREE_LANES (node);
3723 tree vectype = SLP_TREE_VECTYPE (node);
3724 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3725 = vect_get_num_vectors (vf * group_size, vectype);
3726 }
3727
3728 /* Handle purely internal nodes. */
3729 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3730 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3731
3732 if (is_a <bb_vec_info> (vinfo)
3733 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3734 {
3735 if (dump_enabled_p ())
3736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3737 "desired vector type conflicts with earlier one "
3738 "for %G", stmt_info->stmt);
3739 return false;
3740 }
3741
3742 bool dummy;
3743 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3744 node, node_instance, cost_vec);
3745 }
3746
3747 /* Try to build NODE from scalars, returning true on success.
3748 NODE_INSTANCE is the SLP instance that contains NODE. */
3749
3750 static bool
3751 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3752 slp_instance node_instance)
3753 {
3754 stmt_vec_info stmt_info;
3755 unsigned int i;
3756
3757 if (!is_a <bb_vec_info> (vinfo)
3758 || node == SLP_INSTANCE_TREE (node_instance)
3759 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3760 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3761 return false;
3762
3763 if (dump_enabled_p ())
3764 dump_printf_loc (MSG_NOTE, vect_location,
3765 "Building vector operands of %p from scalars instead\n", node);
3766
3767 /* Don't remove and free the child nodes here, since they could be
3768 referenced by other structures. The analysis and scheduling phases
3769 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3770 unsigned int group_size = SLP_TREE_LANES (node);
3771 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3772 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3773 SLP_TREE_LOAD_PERMUTATION (node).release ();
3774 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3775 {
3776 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3777 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3778 }
3779 return true;
3780 }
3781
3782 /* Compute the prologue cost for invariant or constant operands represented
3783 by NODE. */
3784
3785 static void
3786 vect_prologue_cost_for_slp (slp_tree node,
3787 stmt_vector_for_cost *cost_vec)
3788 {
3789   /* There's a special case of an existing vector, which costs nothing. */
3790 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3791 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3792 return;
3793 /* Without looking at the actual initializer a vector of
3794      constants can be implemented as a load from the constant pool.
3795 When all elements are the same we can use a splat. */
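  /* For instance (hypothetical operands): a constant node {1, 2, 3, 4} is
     costed as a vector_load, an external node {x_1, x_1, x_1, x_1} as a
     scalar_to_vec splat, and an external node {x_1, y_2, x_1, y_2} as a
     vec_construct.  */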
3796 tree vectype = SLP_TREE_VECTYPE (node);
3797 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3798 unsigned num_vects_to_check;
3799 unsigned HOST_WIDE_INT const_nunits;
3800 unsigned nelt_limit;
3801 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3802 && ! multiple_p (const_nunits, group_size))
3803 {
3804 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3805 nelt_limit = const_nunits;
3806 }
3807 else
3808 {
3809 /* If either the vector has variable length or the vectors
3810 are composed of repeated whole groups we only need to
3811 cost construction once. All vectors will be the same. */
3812 num_vects_to_check = 1;
3813 nelt_limit = group_size;
3814 }
3815 tree elt = NULL_TREE;
3816 unsigned nelt = 0;
3817 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3818 {
3819 unsigned si = j % group_size;
3820 if (nelt == 0)
3821 elt = SLP_TREE_SCALAR_OPS (node)[si];
3822       /* ??? We're just tracking whether all operands of a single
3823 	 vector initializer are the same; ideally we'd check whether
3824 	 we emitted the same one already. */
3825 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3826 elt = NULL_TREE;
3827 nelt++;
3828 if (nelt == nelt_limit)
3829 {
3830 record_stmt_cost (cost_vec, 1,
3831 SLP_TREE_DEF_TYPE (node) == vect_external_def
3832 ? (elt ? scalar_to_vec : vec_construct)
3833 : vector_load,
3834 NULL, vectype, 0, vect_prologue);
3835 nelt = 0;
3836 }
3837 }
3838 }
3839
3840 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3841    the subtree.  NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
3842
3843 Return true if the operations are supported. */
3844
3845 static bool
3846 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3847 slp_instance node_instance,
3848 hash_set<slp_tree> &visited_set,
3849 vec<slp_tree> &visited_vec,
3850 stmt_vector_for_cost *cost_vec)
3851 {
3852 int i, j;
3853 slp_tree child;
3854
3855 /* Assume we can code-generate all invariants. */
3856 if (!node
3857 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
3858 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
3859 return true;
3860
3861 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
3862 {
3863 if (dump_enabled_p ())
3864 dump_printf_loc (MSG_NOTE, vect_location,
3865 "Failed cyclic SLP reference in %p", node);
3866 return false;
3867 }
3868 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
3869
3870 /* If we already analyzed the exact same set of scalar stmts we're done.
3871 We share the generated vector stmts for those. */
3872 if (visited_set.add (node))
3873 return true;
3874 visited_vec.safe_push (node);
3875
3876 bool res = true;
3877 unsigned visited_rec_start = visited_vec.length ();
3878 unsigned cost_vec_rec_start = cost_vec->length ();
3879 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3880 {
3881 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3882 visited_set, visited_vec,
3883 cost_vec);
3884 if (!res)
3885 break;
3886 }
3887
3888 if (res)
3889 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3890 cost_vec);
3891   /* If analysis failed we have to pop all recursively visited nodes
3892 plus ourselves. */
3893 if (!res)
3894 {
3895 while (visited_vec.length () >= visited_rec_start)
3896 visited_set.remove (visited_vec.pop ());
3897 cost_vec->truncate (cost_vec_rec_start);
3898 }
3899
3900   /* When the node can be vectorized, cost the invariant nodes it references.
3901      This is not done in DFS order to allow the referring node's
3902      vectorizable_* calls to nail down the invariant node's vector type
3903      and possibly unshare it if it needs a different vector type than
3904      other referrers. */
3905 if (res)
3906 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3907 if (child
3908 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
3909 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3910 	    /* Perform usual caching; note that code-generation still
3911 	       code-gens these nodes multiple times, but we expect
3912 	       to CSE them later. */
3913 && !visited_set.add (child))
3914 {
3915 visited_vec.safe_push (child);
3916 /* ??? After auditing more code paths make a "default"
3917 and push the vector type from NODE to all children
3918 if it is not already set. */
3919 /* Compute the number of vectors to be generated. */
3920 tree vector_type = SLP_TREE_VECTYPE (child);
3921 if (!vector_type)
3922 {
3923 /* For shifts with a scalar argument we don't need
3924 to cost or code-generate anything.
3925 	       ??? Represent this more explicitly. */
3926 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3927 == shift_vec_info_type)
3928 && j == 1);
3929 continue;
3930 }
3931 unsigned group_size = SLP_TREE_LANES (child);
3932 poly_uint64 vf = 1;
3933 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3934 vf = loop_vinfo->vectorization_factor;
3935 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3936 = vect_get_num_vectors (vf * group_size, vector_type);
3937 /* And cost them. */
3938 vect_prologue_cost_for_slp (child, cost_vec);
3939 }
3940
3941 /* If this node or any of its children can't be vectorized, try pruning
3942 the tree here rather than felling the whole thing. */
3943 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3944 {
3945 /* We'll need to revisit this for invariant costing and number
3946 of vectorized stmt setting. */
3947 res = true;
3948 }
3949
3950 return res;
3951 }
3952
3953
3954 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3955 region and that can be vectorized using vectorizable_live_operation
3956    with STMT_VINFO_LIVE_P.  Live operations that are not handled will cause
3957    the scalar code computing them to be retained. */
3958
3959 static void
3960 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3961 slp_instance instance,
3962 stmt_vector_for_cost *cost_vec,
3963 hash_set<stmt_vec_info> &svisited,
3964 hash_set<slp_tree> &visited)
3965 {
3966 if (visited.add (node))
3967 return;
3968
3969 unsigned i;
3970 stmt_vec_info stmt_info;
3971 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
3972 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3973 {
3974 if (svisited.contains (stmt_info))
3975 continue;
3976 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3977 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
3978 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
3979 /* Only the pattern root stmt computes the original scalar value. */
3980 continue;
3981 bool mark_visited = true;
3982 gimple *orig_stmt = orig_stmt_info->stmt;
3983 ssa_op_iter op_iter;
3984 def_operand_p def_p;
3985 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3986 {
3987 imm_use_iterator use_iter;
3988 gimple *use_stmt;
3989 stmt_vec_info use_stmt_info;
3990 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3991 if (!is_gimple_debug (use_stmt))
3992 {
3993 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
3994 if (!use_stmt_info
3995 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3996 {
3997 STMT_VINFO_LIVE_P (stmt_info) = true;
3998 if (vectorizable_live_operation (bb_vinfo, stmt_info,
3999 NULL, node, instance, i,
4000 false, cost_vec))
4001 /* ??? So we know we can vectorize the live stmt
4002 from one SLP node. If we cannot do so from all
4003 or none consistently we'd have to record which
4004 SLP node (and lane) we want to use for the live
4005 operation. So make sure we can code-generate
4006 from all nodes. */
4007 mark_visited = false;
4008 else
4009 STMT_VINFO_LIVE_P (stmt_info) = false;
4010 break;
4011 }
4012 }
4013 /* We have to verify whether we can insert the lane extract
4014 before all uses. The following is a conservative approximation.
4015 We cannot put this into vectorizable_live_operation because
4016 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4017 doesn't work.
4018 	     Note that while the fact that we emit code for loads at the
4019 	     first load should make this a non-problem, leafs we construct
4020 	     from scalars are vectorized after the last scalar def.
4021 ??? If we'd actually compute the insert location during
4022 analysis we could use sth less conservative than the last
4023 scalar stmt in the node for the dominance check. */
4024 /* ??? What remains is "live" uses in vector CTORs in the same
4025 SLP graph which is where those uses can end up code-generated
4026 right after their definition instead of close to their original
4027 use. But that would restrict us to code-generate lane-extracts
4028 from the latest stmt in a node. So we compensate for this
4029 during code-generation, simply not replacing uses for those
4030 hopefully rare cases. */
4031 if (STMT_VINFO_LIVE_P (stmt_info))
4032 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4033 if (!is_gimple_debug (use_stmt)
4034 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
4035 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4036 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
4037 {
4038 if (dump_enabled_p ())
4039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4040 "Cannot determine insertion place for "
4041 "lane extract\n");
4042 STMT_VINFO_LIVE_P (stmt_info) = false;
4043 mark_visited = true;
4044 }
4045 }
4046 if (mark_visited)
4047 svisited.add (stmt_info);
4048 }
4049
4050 slp_tree child;
4051 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4052 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4053 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
4054 cost_vec, svisited, visited);
4055 }
4056
4057 /* Analyze statements in SLP instances of VINFO. Return true if the
4058 operations are supported. */
4059
4060 bool
4061 vect_slp_analyze_operations (vec_info *vinfo)
4062 {
4063 slp_instance instance;
4064 int i;
4065
4066 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4067
4068 hash_set<slp_tree> visited;
4069 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4070 {
4071 auto_vec<slp_tree> visited_vec;
4072 stmt_vector_for_cost cost_vec;
4073 cost_vec.create (2);
4074 if (is_a <bb_vec_info> (vinfo))
4075 vect_location = instance->location ();
4076 if (!vect_slp_analyze_node_operations (vinfo,
4077 SLP_INSTANCE_TREE (instance),
4078 instance, visited, visited_vec,
4079 &cost_vec)
4080 /* Instances with a root stmt require vectorized defs for the
4081 SLP tree root. */
4082 || (SLP_INSTANCE_ROOT_STMT (instance)
4083 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
4084 != vect_internal_def)))
4085 {
4086 slp_tree node = SLP_INSTANCE_TREE (instance);
4087 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4088 if (dump_enabled_p ())
4089 dump_printf_loc (MSG_NOTE, vect_location,
4090 "removing SLP instance operations starting from: %G",
4091 stmt_info->stmt);
4092 vect_free_slp_instance (instance);
4093 vinfo->slp_instances.ordered_remove (i);
4094 cost_vec.release ();
4095 while (!visited_vec.is_empty ())
4096 visited.remove (visited_vec.pop ());
4097 }
4098 else
4099 {
4100 i++;
4101
4102 /* For BB vectorization remember the SLP graph entry
4103 cost for later. */
4104 if (is_a <bb_vec_info> (vinfo))
4105 instance->cost_vec = cost_vec;
4106 else
4107 {
4108 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
4109 cost_vec.release ();
4110 }
4111 }
4112 }
4113
4114 /* Compute vectorizable live stmts. */
4115 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
4116 {
4117 hash_set<stmt_vec_info> svisited;
4118 hash_set<slp_tree> visited;
4119 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4120 {
4121 vect_location = instance->location ();
4122 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
4123 instance, &instance->cost_vec, svisited,
4124 visited);
4125 }
4126 }
4127
4128 return !vinfo->slp_instances.is_empty ();
4129 }
4130
4131 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
4132    transitively compressing any chain of leaders on the way. */
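/* This is essentially the "find" operation of a union-find structure with
   path compression: SLP instances that share scalar stmts end up pointing at
   a common ultimate leader, which later identifies their costing subgraph.  */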
4133
4134 static slp_instance
4135 get_ultimate_leader (slp_instance instance,
4136 hash_map<slp_instance, slp_instance> &instance_leader)
4137 {
4138 auto_vec<slp_instance *, 8> chain;
4139 slp_instance *tem;
4140 while (*(tem = instance_leader.get (instance)) != instance)
4141 {
4142 chain.safe_push (tem);
4143 instance = *tem;
4144 }
4145 while (!chain.is_empty ())
4146 *chain.pop () = instance;
4147 return instance;
4148 }
4149
4150 /* Worker of vect_bb_partition_graph, recurse on NODE. */
4151
4152 static void
4153 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
4154 slp_instance instance, slp_tree node,
4155 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
4156 hash_map<slp_instance, slp_instance> &instance_leader,
4157 hash_set<slp_tree> &visited)
4158 {
4159 stmt_vec_info stmt_info;
4160 unsigned i;
4161
4162 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4163 {
4164 bool existed_p;
4165 slp_instance &stmt_instance
4166 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
4167 if (!existed_p)
4168 ;
4169 else if (stmt_instance != instance)
4170 {
4171 	  /* If we're running into a previously marked stmt make us the
4172 	     leader of its current ultimate leader.  This keeps the
4173 leader chain acyclic and works even when the current instance
4174 connects two previously independent graph parts. */
4175 slp_instance stmt_leader
4176 = get_ultimate_leader (stmt_instance, instance_leader);
4177 if (stmt_leader != instance)
4178 instance_leader.put (stmt_leader, instance);
4179 }
4180 stmt_instance = instance;
4181 }
4182
4183 if (visited.add (node))
4184 return;
4185
4186 slp_tree child;
4187 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4188 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4189 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
4190 instance_leader, visited);
4191 }
4192
4193 /* Partition the SLP graph into pieces that can be costed independently. */
4194
4195 static void
4196 vect_bb_partition_graph (bb_vec_info bb_vinfo)
4197 {
4198 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
4199
4200 /* First walk the SLP graph assigning each involved scalar stmt a
4201 corresponding SLP graph entry and upon visiting a previously
4202      marked stmt, make the stmt's leader the current SLP graph entry. */
4203 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
4204 hash_map<slp_instance, slp_instance> instance_leader;
4205 hash_set<slp_tree> visited;
4206 slp_instance instance;
4207 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4208 {
4209 instance_leader.put (instance, instance);
4210 vect_bb_partition_graph_r (bb_vinfo,
4211 instance, SLP_INSTANCE_TREE (instance),
4212 stmt_to_instance, instance_leader,
4213 visited);
4214 }
4215
4216 /* Then collect entries to each independent subgraph. */
4217 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4218 {
4219 slp_instance leader = get_ultimate_leader (instance, instance_leader);
4220 leader->subgraph_entries.safe_push (instance);
4221 if (dump_enabled_p ()
4222 && leader != instance)
4223 dump_printf_loc (MSG_NOTE, vect_location,
4224 "instance %p is leader of %p\n",
4225 leader, instance);
4226 }
4227 }
4228
4229 /* Compute the scalar cost of the SLP node NODE and its children,
4230    recording it in COST_VEC.  Do not account defs that are marked in LIFE
4231    and update LIFE according to uses of NODE. */
4232
4233 static void
4234 vect_bb_slp_scalar_cost (vec_info *vinfo,
4235 slp_tree node, vec<bool, va_heap> *life,
4236 stmt_vector_for_cost *cost_vec,
4237 hash_set<slp_tree> &visited)
4238 {
4239 unsigned i;
4240 stmt_vec_info stmt_info;
4241 slp_tree child;
4242
4243 if (visited.add (node))
4244 return;
4245
4246 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4247 {
4248 ssa_op_iter op_iter;
4249 def_operand_p def_p;
4250
4251 if ((*life)[i])
4252 continue;
4253
4254 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4255 gimple *orig_stmt = orig_stmt_info->stmt;
4256
4257 /* If there is a non-vectorized use of the defs then the scalar
4258 stmt is kept live in which case we do not account it or any
4259 required defs in the SLP children in the scalar cost. This
4260 way we make the vectorization more costly when compared to
4261 the scalar cost. */
4262 if (!STMT_VINFO_LIVE_P (stmt_info))
4263 {
4264 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4265 {
4266 imm_use_iterator use_iter;
4267 gimple *use_stmt;
4268 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4269 if (!is_gimple_debug (use_stmt))
4270 {
4271 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
4272 if (!use_stmt_info
4273 || !PURE_SLP_STMT
4274 (vect_stmt_to_vectorize (use_stmt_info)))
4275 {
4276 (*life)[i] = true;
4277 break;
4278 }
4279 }
4280 }
4281 if ((*life)[i])
4282 continue;
4283 }
4284
4285 /* Count scalar stmts only once. */
4286 if (gimple_visited_p (orig_stmt))
4287 continue;
4288 gimple_set_visited (orig_stmt, true);
4289
4290 vect_cost_for_stmt kind;
4291 if (STMT_VINFO_DATA_REF (orig_stmt_info))
4292 {
4293 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
4294 kind = scalar_load;
4295 else
4296 kind = scalar_store;
4297 }
4298 else if (vect_nop_conversion_p (orig_stmt_info))
4299 continue;
4300 /* For single-argument PHIs assume coalescing which means zero cost
4301 for the scalar and the vector PHIs. This avoids artificially
4302 favoring the vector path (but may pessimize it in some cases). */
4303 else if (is_a <gphi *> (orig_stmt_info->stmt)
4304 && gimple_phi_num_args
4305 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
4306 continue;
4307 else
4308 kind = scalar_stmt;
4309 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
4310 SLP_TREE_VECTYPE (node), 0, vect_body);
4311 }
4312
4313 auto_vec<bool, 20> subtree_life;
4314 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4315 {
4316 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4317 {
4318 /* Do not directly pass LIFE to the recursive call, copy it to
4319 confine changes in the callee to the current child/subtree. */
4320 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4321 {
4322 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
4323 for (unsigned j = 0;
4324 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
4325 {
4326 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
4327 if (perm.first == i)
4328 subtree_life[perm.second] = (*life)[j];
4329 }
4330 }
4331 else
4332 {
4333 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
4334 subtree_life.safe_splice (*life);
4335 }
4336 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
4337 visited);
4338 subtree_life.truncate (0);
4339 }
4340 }
4341 }
4342
4343 /* Comparator for the loop-index sorted cost vectors. */
4344
4345 static int
4346 li_cost_vec_cmp (const void *a_, const void *b_)
4347 {
4348 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
4349 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
4350 if (a->first < b->first)
4351 return -1;
4352 else if (a->first == b->first)
4353 return 0;
4354 return 1;
4355 }
4356
4357 /* Check if vectorization of the basic block is profitable for the
4358 subgraph denoted by SLP_INSTANCES. */
4359
4360 static bool
4361 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
4362 vec<slp_instance> slp_instances)
4363 {
4364 slp_instance instance;
4365 int i;
4366 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
4367 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
4368
4369 if (dump_enabled_p ())
4370 {
4371 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
4372 hash_set<slp_tree> visited;
4373 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4374 vect_print_slp_graph (MSG_NOTE, vect_location,
4375 SLP_INSTANCE_TREE (instance), visited);
4376 }
4377
4378 /* Calculate scalar cost and sum the cost for the vector stmts
4379 previously collected. */
4380 stmt_vector_for_cost scalar_costs = vNULL;
4381 stmt_vector_for_cost vector_costs = vNULL;
4382 hash_set<slp_tree> visited;
4383 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4384 {
4385 auto_vec<bool, 20> life;
4386 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
4387 true);
4388 if (SLP_INSTANCE_ROOT_STMT (instance))
4389 record_stmt_cost (&scalar_costs, 1, scalar_stmt,
4390 SLP_INSTANCE_ROOT_STMT (instance), 0, vect_body);
4391 vect_bb_slp_scalar_cost (bb_vinfo,
4392 SLP_INSTANCE_TREE (instance),
4393 &life, &scalar_costs, visited);
4394 vector_costs.safe_splice (instance->cost_vec);
4395 instance->cost_vec.release ();
4396 }
4397 /* Unset visited flag. */
4398 stmt_info_for_cost *cost;
4399 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
4400 gimple_set_visited (cost->stmt_info->stmt, false);
4401
4402 if (dump_enabled_p ())
4403 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4404
4405   /* When costing non-loop vectorization we need to consider each covered
4406      loop independently and make sure vectorization is profitable.  For
4407      now we assume a loop may not be entered or may execute an arbitrary
4408      number of iterations (??? static information can provide more
4409      precise info here) which means we can simply cost each containing
4410      loop's stmts separately. */
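  /* For example (hypothetical region), if the region spans an outer loop
     containing an inner loop, the scalar and vector costs of stmts in the
     inner loop are compared on their own; a profitable part in the outer
     loop cannot subsidize an unprofitable part in the inner loop.  */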
4411
4412 /* First produce cost vectors sorted by loop index. */
4413 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
4414 li_scalar_costs (scalar_costs.length ());
4415 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
4416 li_vector_costs (vector_costs.length ());
4417 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
4418 {
4419 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
4420 li_scalar_costs.quick_push (std::make_pair (l, cost));
4421 }
4422   /* Use the loop of the first scalar cost entry as a fallback in case the
4423      first vector_costs entries do not have a stmt_info associated with them. */
4424 unsigned l = li_scalar_costs[0].first;
4425 FOR_EACH_VEC_ELT (vector_costs, i, cost)
4426 {
4427       /* We inherit the loop from the previous COST; invariants, externals and
4428 	 extracts immediately follow the cost for the related stmt. */
4429 if (cost->stmt_info)
4430 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
4431 li_vector_costs.quick_push (std::make_pair (l, cost));
4432 }
4433 li_scalar_costs.qsort (li_cost_vec_cmp);
4434 li_vector_costs.qsort (li_cost_vec_cmp);
4435
4436 /* Now cost the portions individually. */
4437 unsigned vi = 0;
4438 unsigned si = 0;
4439 while (si < li_scalar_costs.length ()
4440 && vi < li_vector_costs.length ())
4441 {
4442 unsigned sl = li_scalar_costs[si].first;
4443 unsigned vl = li_vector_costs[vi].first;
4444 if (sl != vl)
4445 {
4446 if (dump_enabled_p ())
4447 dump_printf_loc (MSG_NOTE, vect_location,
4448 "Scalar %d and vector %d loop part do not "
4449 "match up, skipping scalar part\n", sl, vl);
4450 /* Skip the scalar part, assuming zero cost on the vector side. */
4451 do
4452 {
4453 si++;
4454 }
4455 while (si < li_scalar_costs.length ()
4456 && li_scalar_costs[si].first == sl);
4457 continue;
4458 }
4459
4460 void *scalar_target_cost_data = init_cost (NULL);
4461 do
4462 {
4463 add_stmt_cost (bb_vinfo, scalar_target_cost_data,
4464 li_scalar_costs[si].second);
4465 si++;
4466 }
4467 while (si < li_scalar_costs.length ()
4468 && li_scalar_costs[si].first == sl);
4469 unsigned dummy;
4470 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
4471 destroy_cost_data (scalar_target_cost_data);
4472
4473 /* Complete the target-specific vector cost calculation. */
4474 void *vect_target_cost_data = init_cost (NULL);
4475 do
4476 {
4477 add_stmt_cost (bb_vinfo, vect_target_cost_data,
4478 li_vector_costs[vi].second);
4479 vi++;
4480 }
4481 while (vi < li_vector_costs.length ()
4482 && li_vector_costs[vi].first == vl);
4483 finish_cost (vect_target_cost_data, &vec_prologue_cost,
4484 &vec_inside_cost, &vec_epilogue_cost);
4485 destroy_cost_data (vect_target_cost_data);
4486
4487 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
4488
4489 if (dump_enabled_p ())
4490 {
4491 dump_printf_loc (MSG_NOTE, vect_location,
4492 "Cost model analysis for part in loop %d:\n", sl);
4493 dump_printf (MSG_NOTE, " Vector cost: %d\n",
4494 vec_inside_cost + vec_outside_cost);
4495 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
4496 }
4497
4498       /* Vectorization is profitable if its cost is less than the cost of the
4499 	 scalar version.  Note that we err on the vector side for equal cost
4500 	 because the cost estimate is otherwise quite pessimistic (constant
4501 	 uses are free on the scalar side but cost a load on the vector side
4502 	 for example). */
4503 if (vec_outside_cost + vec_inside_cost > scalar_cost)
4504 {
4505 scalar_costs.release ();
4506 vector_costs.release ();
4507 return false;
4508 }
4509 }
4510 if (vi < li_vector_costs.length ())
4511 {
4512 if (dump_enabled_p ())
4513 dump_printf_loc (MSG_NOTE, vect_location,
4514 "Excess vector cost for part in loop %d:\n",
4515 li_vector_costs[vi].first);
4516 scalar_costs.release ();
4517 vector_costs.release ();
4518 return false;
4519 }
4520
4521 scalar_costs.release ();
4522 vector_costs.release ();
4523 return true;
4524 }
4525
4526 /* qsort comparator for lane defs. */
4527
4528 static int
4529 vld_cmp (const void *a_, const void *b_)
4530 {
4531 auto *a = (const std::pair<unsigned, tree> *)a_;
4532 auto *b = (const std::pair<unsigned, tree> *)b_;
4533 return a->first - b->first;
4534 }
4535
4536 /* Return true if USE_STMT is a vector lane insert into VEC and set
4537 *THIS_LANE to the lane number that is set. */
4538
4539 static bool
4540 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
4541 {
4542 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
4543 if (!use_ass
4544 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
4545 || (vec
4546 ? gimple_assign_rhs1 (use_ass) != vec
4547 : ((vec = gimple_assign_rhs1 (use_ass)), false))
4548 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
4549 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
4550 || !constant_multiple_p
4551 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
4552 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
4553 this_lane))
4554 return false;
4555 return true;
4556 }
4557
4558 /* Find any vectorizable constructors and add them to the grouped_stores
4559    array. */
4560
4561 static void
4562 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
4563 {
4564 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
4565 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
4566 !gsi_end_p (gsi); gsi_next (&gsi))
4567 {
4568 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
4569 if (!assign)
4570 continue;
4571
4572 tree rhs = gimple_assign_rhs1 (assign);
4573 if (gimple_assign_rhs_code (assign) == CONSTRUCTOR)
4574 {
4575 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
4576 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
4577 CONSTRUCTOR_NELTS (rhs))
4578 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
4579 || uniform_vector_p (rhs))
4580 continue;
4581
4582 unsigned j;
4583 tree val;
4584 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
4585 if (TREE_CODE (val) != SSA_NAME
4586 || !bb_vinfo->lookup_def (val))
4587 break;
4588 if (j != CONSTRUCTOR_NELTS (rhs))
4589 continue;
4590
4591 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
4592 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
4593 }
4594 else if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR
4595 && VECTOR_TYPE_P (TREE_TYPE (rhs))
4596 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
4597 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
4598 && integer_zerop (gimple_assign_rhs3 (assign))
4599 && useless_type_conversion_p
4600 (TREE_TYPE (TREE_TYPE (rhs)),
4601 TREE_TYPE (gimple_assign_rhs2 (assign)))
4602 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
4603 {
4604 	  /* We start matching on an insert into lane zero, but since
4605 	     the inserts need not be ordered we have to search both
4606 	     the def and the use chains. */
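	  /* A hypothetical GIMPLE chain this matches, for 32-bit lanes:
	       vec_1 = BIT_INSERT_EXPR <vec_0, val_0, 0>;
	       vec_2 = BIT_INSERT_EXPR <vec_1, val_1, 32>;
	       vec_3 = BIT_INSERT_EXPR <vec_2, val_2, 64>;
	     where each insert sets one lane of the vector.  */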
4607 tree vectype = TREE_TYPE (rhs);
4608 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4609 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
4610 auto_sbitmap lanes (nlanes);
4611 bitmap_clear (lanes);
4612 bitmap_set_bit (lanes, 0);
4613 tree def = gimple_assign_lhs (assign);
4614 lane_defs.quick_push
4615 (std::make_pair (0, gimple_assign_rhs2 (assign)));
4616 unsigned lanes_found = 1;
4617 /* Start with the use chains, the last stmt will be the root. */
4618 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
4619 do
4620 {
4621 use_operand_p use_p;
4622 gimple *use_stmt;
4623 if (!single_imm_use (def, &use_p, &use_stmt))
4624 break;
4625 unsigned this_lane;
4626 if (!bb_vinfo->lookup_stmt (use_stmt)
4627 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
4628 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
4629 break;
4630 if (bitmap_bit_p (lanes, this_lane))
4631 break;
4632 lanes_found++;
4633 bitmap_set_bit (lanes, this_lane);
4634 gassign *use_ass = as_a <gassign *> (use_stmt);
4635 lane_defs.quick_push (std::make_pair
4636 (this_lane, gimple_assign_rhs2 (use_ass)));
4637 last = bb_vinfo->lookup_stmt (use_ass);
4638 def = gimple_assign_lhs (use_ass);
4639 }
4640 while (lanes_found < nlanes);
4641 if (lanes_found < nlanes)
4642 {
4643 /* Now search the def chain. */
4644 def = gimple_assign_rhs1 (assign);
4645 do
4646 {
4647 if (TREE_CODE (def) != SSA_NAME
4648 || !has_single_use (def))
4649 break;
4650 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
4651 unsigned this_lane;
4652 if (!bb_vinfo->lookup_stmt (def_stmt)
4653 || !vect_slp_is_lane_insert (def_stmt,
4654 NULL_TREE, &this_lane)
4655 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
4656 break;
4657 if (bitmap_bit_p (lanes, this_lane))
4658 break;
4659 lanes_found++;
4660 bitmap_set_bit (lanes, this_lane);
4661 lane_defs.quick_push (std::make_pair
4662 (this_lane,
4663 gimple_assign_rhs2 (def_stmt)));
4664 def = gimple_assign_rhs1 (def_stmt);
4665 }
4666 while (lanes_found < nlanes);
4667 }
4668 if (lanes_found == nlanes)
4669 {
4670 	      /* Sort lane_defs by lane index and register the root. */
4671 lane_defs.qsort (vld_cmp);
4672 vec<stmt_vec_info> stmts;
4673 stmts.create (nlanes);
4674 for (unsigned i = 0; i < nlanes; ++i)
4675 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
4676 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
4677 stmts, last));
4678 }
4679 }
4680 }
4681 }
4682
4683 /* Walk the grouped store chains and replace entries with their
4684 pattern variant if any. */
4685
4686 static void
4687 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
4688 {
4689 stmt_vec_info first_element;
4690 unsigned i;
4691
4692 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4693 {
4694 /* We also have CTORs in this array. */
4695 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
4696 continue;
4697 if (STMT_VINFO_IN_PATTERN_P (first_element))
4698 {
4699 stmt_vec_info orig = first_element;
4700 first_element = STMT_VINFO_RELATED_STMT (first_element);
4701 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
4702 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
4703 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
4704 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
4705 vinfo->grouped_stores[i] = first_element;
4706 }
4707 stmt_vec_info prev = first_element;
4708 while (DR_GROUP_NEXT_ELEMENT (prev))
4709 {
4710 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
4711 if (STMT_VINFO_IN_PATTERN_P (elt))
4712 {
4713 stmt_vec_info orig = elt;
4714 elt = STMT_VINFO_RELATED_STMT (elt);
4715 DR_GROUP_NEXT_ELEMENT (prev) = elt;
4716 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
4717 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
4718 }
4719 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
4720 prev = elt;
4721 }
4722 }
4723 }
4724
4725 /* Check if the region described by BB_VINFO can be vectorized, returning
4726 true if so. When returning false, set FATAL to true if the same failure
4727 would prevent vectorization at other vector sizes, false if it is still
4728 worth trying other sizes. N_STMTS is the number of statements in the
4729 region. */
4730
4731 static bool
4732 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
4733 vec<int> *dataref_groups)
4734 {
4735 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4736
4737 slp_instance instance;
4738 int i;
4739 poly_uint64 min_vf = 2;
4740
4741 /* The first group of checks is independent of the vector size. */
4742 fatal = true;
4743
4744 /* Analyze the data references. */
4745
4746 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
4747 {
4748 if (dump_enabled_p ())
4749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4750 "not vectorized: unhandled data-ref in basic "
4751 "block.\n");
4752 return false;
4753 }
4754
4755 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
4756 {
4757 if (dump_enabled_p ())
4758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4759 "not vectorized: unhandled data access in "
4760 "basic block.\n");
4761 return false;
4762 }
4763
4764 vect_slp_check_for_constructors (bb_vinfo);
4765
4766 /* If there are no grouped stores and no constructors in the region
4767 there is no need to continue with pattern recog as vect_analyze_slp
4768 will fail anyway. */
4769 if (bb_vinfo->grouped_stores.is_empty ()
4770 && bb_vinfo->roots.is_empty ())
4771 {
4772 if (dump_enabled_p ())
4773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4774 "not vectorized: no grouped stores in "
4775 "basic block.\n");
4776 return false;
4777 }
4778
4779   /* The rest of the analysis below depends on the vector size in some way,
	  so failures from here on are not necessarily fatal. */
4780 fatal = false;
4781
4782 vect_pattern_recog (bb_vinfo);
4783
4784 /* Update store groups from pattern processing. */
4785 vect_fixup_store_groups_with_patterns (bb_vinfo);
4786
4787 /* Check the SLP opportunities in the basic block, analyze and build SLP
4788 trees. */
4789 if (!vect_analyze_slp (bb_vinfo, n_stmts))
4790 {
4791 if (dump_enabled_p ())
4792 {
4793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4794 "Failed to SLP the basic block.\n");
4795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4796 "not vectorized: failed to find SLP opportunities "
4797 "in basic block.\n");
4798 }
4799 return false;
4800 }
4801
4802 /* Optimize permutations. */
4803 vect_optimize_slp (bb_vinfo);
4804
4805 /* Gather the loads reachable from the SLP graph entries. */
4806 vect_gather_slp_loads (bb_vinfo);
4807
4808 vect_record_base_alignments (bb_vinfo);
4809
4810 /* Analyze and verify the alignment of data references and the
4811 dependence in the SLP instances. */
4812 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
4813 {
4814 vect_location = instance->location ();
4815 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
4816 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
4817 {
4818 slp_tree node = SLP_INSTANCE_TREE (instance);
4819 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4820 if (dump_enabled_p ())
4821 dump_printf_loc (MSG_NOTE, vect_location,
4822 "removing SLP instance operations starting from: %G",
4823 stmt_info->stmt);
4824 vect_free_slp_instance (instance);
4825 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
4826 continue;
4827 }
4828
4829 /* Mark all the statements that we want to vectorize as pure SLP and
4830 relevant. */
4831 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4832 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
4833 if (stmt_vec_info root = SLP_INSTANCE_ROOT_STMT (instance))
4834 {
4835 STMT_SLP_TYPE (root) = pure_slp;
4836 if (is_gimple_assign (root->stmt)
4837 && gimple_assign_rhs_code (root->stmt) == BIT_INSERT_EXPR)
4838 {
4839 /* ??? We should probably record the whole vector of
4840 root stmts so we do not have to back-track here... */
4841 for (unsigned n = SLP_TREE_LANES (SLP_INSTANCE_TREE (instance));
4842 n != 1; --n)
4843 {
4844 root = bb_vinfo->lookup_def (gimple_assign_rhs1 (root->stmt));
4845 STMT_SLP_TYPE (root) = pure_slp;
4846 }
4847 }
4848 }
4849
4850 i++;
4851 }
4852 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
4853 return false;
4854
4855 if (!vect_slp_analyze_operations (bb_vinfo))
4856 {
4857 if (dump_enabled_p ())
4858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4859 "not vectorized: bad operation in basic block.\n");
4860 return false;
4861 }
4862
4863 vect_bb_partition_graph (bb_vinfo);
4864
4865 return true;
4866 }
4867
4868 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4869 basic blocks in BBS, returning true on success.
4870 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4871
4872 static bool
4873 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
4874 vec<int> *dataref_groups, unsigned int n_stmts)
4875 {
4876 bb_vec_info bb_vinfo;
4877 auto_vector_modes vector_modes;
4878
4879 /* Autodetect first vector size we try. */
4880 machine_mode next_vector_mode = VOIDmode;
4881 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
4882 unsigned int mode_i = 0;
4883
4884 vec_info_shared shared;
4885
4886 machine_mode autodetected_vector_mode = VOIDmode;
4887 while (1)
4888 {
4889 bool vectorized = false;
4890 bool fatal = false;
4891 bb_vinfo = new _bb_vec_info (bbs, &shared);
4892
4893 bool first_time_p = shared.datarefs.is_empty ();
4894 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
4895 if (first_time_p)
4896 bb_vinfo->shared->save_datarefs ();
4897 else
4898 bb_vinfo->shared->check_datarefs ();
4899 bb_vinfo->vector_mode = next_vector_mode;
4900
4901 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
4902 {
4903 if (dump_enabled_p ())
4904 {
4905 dump_printf_loc (MSG_NOTE, vect_location,
4906 "***** Analysis succeeded with vector mode"
4907 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4908 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4909 }
4910
4911 bb_vinfo->shared->check_datarefs ();
4912
4913 unsigned i;
4914 slp_instance instance;
4915 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4916 {
4917 if (instance->subgraph_entries.is_empty ())
4918 continue;
4919
4920 vect_location = instance->location ();
4921 if (!unlimited_cost_model (NULL)
4922 && !vect_bb_vectorization_profitable_p
4923 (bb_vinfo, instance->subgraph_entries))
4924 {
4925 if (dump_enabled_p ())
4926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4927 "not vectorized: vectorization is not "
4928 "profitable.\n");
4929 continue;
4930 }
4931
4932 if (!dbg_cnt (vect_slp))
4933 continue;
4934
4935 if (!vectorized && dump_enabled_p ())
4936 dump_printf_loc (MSG_NOTE, vect_location,
4937 "Basic block will be vectorized "
4938 "using SLP\n");
4939 vectorized = true;
4940
4941 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4942
4943 unsigned HOST_WIDE_INT bytes;
4944 if (dump_enabled_p ())
4945 {
4946 if (GET_MODE_SIZE
4947 (bb_vinfo->vector_mode).is_constant (&bytes))
4948 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4949 "basic block part vectorized using %wu "
4950 "byte vectors\n", bytes);
4951 else
4952 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4953 "basic block part vectorized using "
4954 "variable length vectors\n");
4955 }
4956 }
4957 }
4958 else
4959 {
4960 if (dump_enabled_p ())
4961 dump_printf_loc (MSG_NOTE, vect_location,
4962 "***** Analysis failed with vector mode %s\n",
4963 GET_MODE_NAME (bb_vinfo->vector_mode));
4964 }
4965
4966 if (mode_i == 0)
4967 autodetected_vector_mode = bb_vinfo->vector_mode;
4968
4969 if (!fatal)
4970 while (mode_i < vector_modes.length ()
4971 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
4972 {
4973 if (dump_enabled_p ())
4974 dump_printf_loc (MSG_NOTE, vect_location,
4975 "***** The result for vector mode %s would"
4976 " be the same\n",
4977 GET_MODE_NAME (vector_modes[mode_i]));
4978 mode_i += 1;
4979 }
4980
4981 delete bb_vinfo;
4982
4983 if (mode_i < vector_modes.length ()
4984 && VECTOR_MODE_P (autodetected_vector_mode)
4985 && (related_vector_mode (vector_modes[mode_i],
4986 GET_MODE_INNER (autodetected_vector_mode))
4987 == autodetected_vector_mode)
4988 && (related_vector_mode (autodetected_vector_mode,
4989 GET_MODE_INNER (vector_modes[mode_i]))
4990 == vector_modes[mode_i]))
4991 {
4992 if (dump_enabled_p ())
4993 dump_printf_loc (MSG_NOTE, vect_location,
4994 "***** Skipping vector mode %s, which would"
4995 " repeat the analysis for %s\n",
4996 GET_MODE_NAME (vector_modes[mode_i]),
4997 GET_MODE_NAME (autodetected_vector_mode));
4998 mode_i += 1;
4999 }
5000
5001 if (vectorized
5002 || mode_i == vector_modes.length ()
5003 || autodetected_vector_mode == VOIDmode
5004 /* If vect_slp_analyze_bb_1 signaled that analysis for all
5005 vector sizes will fail do not bother iterating. */
5006 || fatal)
5007 return vectorized;
5008
5009 /* Try the next biggest vector size. */
5010 next_vector_mode = vector_modes[mode_i++];
5011 if (dump_enabled_p ())
5012 dump_printf_loc (MSG_NOTE, vect_location,
5013 "***** Re-trying analysis with vector mode %s\n",
5014 GET_MODE_NAME (next_vector_mode));
5015 }
5016 }
5017
5018
5019 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
5020 true if anything in the basic-block was vectorized. */
5021
5022 static bool
5023 vect_slp_bbs (vec<basic_block> bbs)
5024 {
5025 vec<data_reference_p> datarefs = vNULL;
5026 auto_vec<int> dataref_groups;
5027 int insns = 0;
5028 int current_group = 0;
5029
5030 for (unsigned i = 0; i < bbs.length (); i++)
5031 {
5032 basic_block bb = bbs[i];
5033 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
5034 gsi_next (&gsi))
5035 {
5036 gimple *stmt = gsi_stmt (gsi);
5037 if (is_gimple_debug (stmt))
5038 continue;
5039
5040 insns++;
5041
5042 if (gimple_location (stmt) != UNKNOWN_LOCATION)
5043 vect_location = stmt;
5044
5045 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
5046 &dataref_groups, current_group))
5047 ++current_group;
5048 }
5049 }
5050
5051 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
5052 }
5053
5054 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5055 true if anything in the basic-block was vectorized. */
5056
5057 bool
5058 vect_slp_bb (basic_block bb)
5059 {
5060 auto_vec<basic_block> bbs;
5061 bbs.safe_push (bb);
5062 return vect_slp_bbs (bbs);
5063 }
5064
5065 /* Main entry for the BB vectorizer applied to a function.  Analyze and
5066    transform all basic blocks in FUN, returning true if anything was vectorized. */
5067
5068 bool
5069 vect_slp_function (function *fun)
5070 {
5071 bool r = false;
5072 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
5073 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
5074
5075   /* For the moment split the function into pieces to avoid making
5076      the iteration on the vector mode moot.  Split at points we know
5077      we do not handle well, which are CFG merges (SLP discovery doesn't
5078      handle non-loop-header PHIs) and loop exits.  Since pattern
5079      recog requires reverse iteration to visit uses before defs,
5080      simply chop RPO into pieces. */
5081 auto_vec<basic_block> bbs;
5082 for (unsigned i = 0; i < n; i++)
5083 {
5084 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
5085 bool split = false;
5086
5087 /* Split when a BB is not dominated by the first block. */
5088 if (!bbs.is_empty ()
5089 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
5090 {
5091 if (dump_enabled_p ())
5092 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5093 "splitting region at dominance boundary bb%d\n",
5094 bb->index);
5095 split = true;
5096 }
5097 /* Split when the loop determined by the first block
5098 is exited. This is because we eventually insert
5099 	 invariants at the region's beginning. */
5100 else if (!bbs.is_empty ()
5101 && bbs[0]->loop_father != bb->loop_father
5102 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
5103 {
5104 if (dump_enabled_p ())
5105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5106 "splitting region at loop %d exit at bb%d\n",
5107 bbs[0]->loop_father->num, bb->index);
5108 split = true;
5109 }
5110
5111 if (split && !bbs.is_empty ())
5112 {
5113 r |= vect_slp_bbs (bbs);
5114 bbs.truncate (0);
5115 bbs.quick_push (bb);
5116 }
5117 else
5118 bbs.safe_push (bb);
5119
5120       /* When a stmt ends this block and defines a value, inserting a
5121 	 vector containing its definition after it would require inserting
5122 	 on edges.  Avoid this for now. */
5123 if (gimple *last = last_stmt (bb))
5124 if (gimple_get_lhs (last)
5125 && is_ctrl_altering_stmt (last))
5126 {
5127 if (dump_enabled_p ())
5128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5129 "splitting region at control altering "
5130 "definition %G", last);
5131 r |= vect_slp_bbs (bbs);
5132 bbs.truncate (0);
5133 }
5134 }
5135
5136 if (!bbs.is_empty ())
5137 r |= vect_slp_bbs (bbs);
5138
5139 free (rpo);
5140
5141 return r;
5142 }
5143
5144 /* Build a variable-length vector in which the elements in ELTS are repeated
5145    to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
5146 RESULTS and add any new instructions to SEQ.
5147
5148 The approach we use is:
5149
5150 (1) Find a vector mode VM with integer elements of mode IM.
5151
5152 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5153 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
5154 from small vectors to IM.
5155
5156 (3) Duplicate each ELTS'[I] into a vector of mode VM.
5157
5158 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
5159 correct byte contents.
5160
5161 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
5162
5163 We try to find the largest IM for which this sequence works, in order
5164 to cut down on the number of interleaves. */
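/* A small hypothetical example: with ELTS = { a, b, c, d } and NVECTORS = 2,
   step (2) forms two partial integer-mode elements from { a, b } and { c, d },
   step (3) duplicates each across a full VM vector, and step (4) interleaves
   them to produce the repeating sequence { a, b, c, d, a, b, c, d, ... }.  */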
5165
5166 void
5167 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
5168 vec<tree> elts, unsigned int nresults,
5169 vec<tree> &results)
5170 {
5171 unsigned int nelts = elts.length ();
5172 tree element_type = TREE_TYPE (vector_type);
5173
5174 /* (1) Find a vector mode VM with integer elements of mode IM. */
5175 unsigned int nvectors = 1;
5176 tree new_vector_type;
5177 tree permutes[2];
5178 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
5179 &nvectors, &new_vector_type,
5180 permutes))
5181 gcc_unreachable ();
5182
5183 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
5184 unsigned int partial_nelts = nelts / nvectors;
5185 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
5186
5187 tree_vector_builder partial_elts;
5188 auto_vec<tree, 32> pieces (nvectors * 2);
5189 pieces.quick_grow_cleared (nvectors * 2);
5190 for (unsigned int i = 0; i < nvectors; ++i)
5191 {
5192 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5193 ELTS' has mode IM. */
5194 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
5195 for (unsigned int j = 0; j < partial_nelts; ++j)
5196 partial_elts.quick_push (elts[i * partial_nelts + j]);
5197 tree t = gimple_build_vector (seq, &partial_elts);
5198 t = gimple_build (seq, VIEW_CONVERT_EXPR,
5199 TREE_TYPE (new_vector_type), t);
5200
5201 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
5202 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
5203 }
5204
5205 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
5206 correct byte contents.
5207
5208 Conceptually, we need to repeat the following operation log2(nvectors)
5209 times, where hi_start = nvectors / 2:
5210
5211 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
5212 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
5213
5214 However, if each input repeats every N elements and the VF is
5215 a multiple of N * 2, the HI result is the same as the LO result.
5216 This will be true for the first N1 iterations of the outer loop,
5217 followed by N2 iterations for which both the LO and HI results
5218 are needed. I.e.:
5219
5220 N1 + N2 = log2(nvectors)
5221
5222 Each "N1 iteration" doubles the number of redundant vectors and the
5223 effect of the process as a whole is to have a sequence of nvectors/2**N1
5224 vectors that repeats 2**N1 times. Rather than generate these redundant
5225 vectors, we halve the number of vectors for each N1 iteration. */
5226 unsigned int in_start = 0;
5227 unsigned int out_start = nvectors;
5228 unsigned int new_nvectors = nvectors;
5229 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
5230 {
5231 unsigned int hi_start = new_nvectors / 2;
5232 unsigned int out_i = 0;
5233 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
5234 {
5235 if ((in_i & 1) != 0
5236 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
5237 2 * in_repeat))
5238 continue;
5239
5240 tree output = make_ssa_name (new_vector_type);
5241 tree input1 = pieces[in_start + (in_i / 2)];
5242 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
5243 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
5244 input1, input2,
5245 permutes[in_i & 1]);
5246 gimple_seq_add_stmt (seq, stmt);
5247 pieces[out_start + out_i] = output;
5248 out_i += 1;
5249 }
5250 std::swap (in_start, out_start);
5251 new_nvectors = out_i;
5252 }
5253
5254 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
5255 results.reserve (nresults);
5256 for (unsigned int i = 0; i < nresults; ++i)
5257 if (i < new_nvectors)
5258 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
5259 pieces[in_start + i]));
5260 else
5261 results.quick_push (results[i - new_nvectors]);
5262 }
5263
5264
5265 /* For constant and loop invariant defs in OP_NODE this function creates
5266 vector defs that will be used in the vectorized stmts and stores them
5267 to SLP_TREE_VEC_DEFS of OP_NODE. */
5268
5269 static void
5270 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
5271 {
5272 unsigned HOST_WIDE_INT nunits;
5273 tree vec_cst;
5274 unsigned j, number_of_places_left_in_vector;
5275 tree vector_type;
5276 tree vop;
5277 int group_size = op_node->ops.length ();
5278 unsigned int vec_num, i;
5279 unsigned number_of_copies = 1;
5280 bool constant_p;
5281 gimple_seq ctor_seq = NULL;
5282 auto_vec<tree, 16> permute_results;
5283
5284 /* We always want SLP_TREE_VECTYPE (op_node) to be correctly set here. */
5285 vector_type = SLP_TREE_VECTYPE (op_node);
5286
5287 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
5288 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
5289 auto_vec<tree> voprnds (number_of_vectors);
5290
5291 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5292 created vectors. It is greater than 1 if unrolling is performed.
5293
5294 For example, we have two scalar operands, s1 and s2 (e.g., group of
5295 strided accesses of size two), while NUNITS is four (i.e., four scalars
5296 of this type can be packed in a vector). The output vector will contain
5297 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5298 will be 2).
5299
5300 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
5301 containing the operands.
5302
5303 For example, NUNITS is four as before, and the group size is 8
5304 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5305 {s5, s6, s7, s8}. */
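/* In terms of the computation below, and assuming NUMBER_OF_VECTORS is 1
   in the first example and 2 in the second, NUMBER_OF_COPIES
   = NUNITS * NUMBER_OF_VECTORS / GROUP_SIZE yields 4 * 1 / 2 = 2
   resp. 4 * 2 / 8 = 1.  */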
5306
5307 /* When using duplicate_and_interleave, we just need one element for
5308 each scalar statement. */
5309 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5310 nunits = group_size;
5311
5312 number_of_copies = nunits * number_of_vectors / group_size;
5313
5314 number_of_places_left_in_vector = nunits;
5315 constant_p = true;
5316 tree_vector_builder elts (vector_type, nunits, 1);
5317 elts.quick_grow (nunits);
5318 stmt_vec_info insert_after = NULL;
5319 for (j = 0; j < number_of_copies; j++)
5320 {
5321 tree op;
5322 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
5323 {
5324 /* Create 'vect_ = {op0,op1,...,opn}'. */
5325 number_of_places_left_in_vector--;
5326 tree orig_op = op;
5327 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
5328 {
5329 if (CONSTANT_CLASS_P (op))
5330 {
5331 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
5332 {
5333 /* Can't use VIEW_CONVERT_EXPR for booleans because
5334 of possibly different sizes of scalar value and
5335 vector element. */
5336 if (integer_zerop (op))
5337 op = build_int_cst (TREE_TYPE (vector_type), 0);
5338 else if (integer_onep (op))
5339 op = build_all_ones_cst (TREE_TYPE (vector_type));
5340 else
5341 gcc_unreachable ();
5342 }
5343 else
5344 op = fold_unary (VIEW_CONVERT_EXPR,
5345 TREE_TYPE (vector_type), op);
5346 gcc_assert (op && CONSTANT_CLASS_P (op));
5347 }
5348 else
5349 {
5350 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
5351 gimple *init_stmt;
5352 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
5353 {
5354 tree true_val
5355 = build_all_ones_cst (TREE_TYPE (vector_type));
5356 tree false_val
5357 = build_zero_cst (TREE_TYPE (vector_type));
5358 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
5359 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
5360 op, true_val,
5361 false_val);
5362 }
5363 else
5364 {
5365 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
5366 op);
5367 init_stmt
5368 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
5369 op);
5370 }
5371 gimple_seq_add_stmt (&ctor_seq, init_stmt);
5372 op = new_temp;
5373 }
5374 }
5375 elts[number_of_places_left_in_vector] = op;
5376 if (!CONSTANT_CLASS_P (op))
5377 constant_p = false;
5378 /* For BB vectorization we have to compute an insert location
5379 when a def is inside the analyzed region since we cannot
5380 simply insert at the BB start in this case. */
5381 stmt_vec_info opdef;
5382 if (TREE_CODE (orig_op) == SSA_NAME
5383 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
5384 && is_a <bb_vec_info> (vinfo)
5385 && (opdef = vinfo->lookup_def (orig_op)))
5386 {
5387 if (!insert_after)
5388 insert_after = opdef;
5389 else
5390 insert_after = get_later_stmt (insert_after, opdef);
5391 }
5392
5393 if (number_of_places_left_in_vector == 0)
5394 {
5395 if (constant_p
5396 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5397 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5398 vec_cst = gimple_build_vector (&ctor_seq, &elts);
5399 else
5400 {
5401 if (permute_results.is_empty ())
5402 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
5403 elts, number_of_vectors,
5404 permute_results);
5405 vec_cst = permute_results[number_of_vectors - j - 1];
5406 }
5407 if (!gimple_seq_empty_p (ctor_seq))
5408 {
5409 if (insert_after)
5410 {
5411 gimple_stmt_iterator gsi;
5412 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
5413 {
5414 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
5415 gsi_insert_seq_before (&gsi, ctor_seq,
5416 GSI_CONTINUE_LINKING);
5417 }
5418 else if (!stmt_ends_bb_p (insert_after->stmt))
5419 {
5420 gsi = gsi_for_stmt (insert_after->stmt);
5421 gsi_insert_seq_after (&gsi, ctor_seq,
5422 GSI_CONTINUE_LINKING);
5423 }
5424 else
5425 {
5426 /* When we want to insert after a def whose
5427 defining stmt throws, insert on the fallthru
5428 edge. */
5429 edge e = find_fallthru_edge
5430 (gimple_bb (insert_after->stmt)->succs);
5431 basic_block new_bb
5432 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
5433 gcc_assert (!new_bb);
5434 }
5435 }
5436 else
5437 vinfo->insert_seq_on_entry (NULL, ctor_seq);
5438 ctor_seq = NULL;
5439 }
5440 voprnds.quick_push (vec_cst);
5441 insert_after = NULL;
5442 number_of_places_left_in_vector = nunits;
5443 constant_p = true;
5444 elts.new_vector (vector_type, nunits, 1);
5445 elts.quick_grow (nunits);
5446 }
5447 }
5448 }
5449
5450 /* Since the vectors are created in the reverse order, we should invert
5451 them. */
5452 vec_num = voprnds.length ();
5453 for (j = vec_num; j != 0; j--)
5454 {
5455 vop = voprnds[j - 1];
5456 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5457 }
5458
5459 /* In case the VF is greater than the unrolling factor needed for the SLP
5460 group of stmts, NUMBER_OF_VECTORS to be created is greater than
5461 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
5462 to replicate the vectors. */
5463 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
5464 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
5465 i++)
5466 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5467 }
5468
5469 /* Get the Ith vectorized definition from SLP_NODE. */
5470
5471 tree
5472 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
5473 {
5474 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
5475 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
5476 else
5477 return SLP_TREE_VEC_DEFS (slp_node)[i];
5478 }
5479
5480 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
5481
5482 void
5483 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
5484 {
5485 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
5486 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
5487 {
5488 unsigned j;
5489 gimple *vec_def_stmt;
5490 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
5491 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
5492 }
5493 else
5494 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
5495 }
5496
5497 /* Get N vectorized definitions for SLP_NODE. */
5498
5499 void
5500 vect_get_slp_defs (vec_info *,
5501 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
5502 {
5503 if (n == -1U)
5504 n = SLP_TREE_CHILDREN (slp_node).length ();
5505
5506 for (unsigned i = 0; i < n; ++i)
5507 {
5508 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
5509 vec<tree> vec_defs = vNULL;
5510 vect_get_slp_defs (child, &vec_defs);
5511 vec_oprnds->quick_push (vec_defs);
5512 }
5513 }
5514
5515 /* Generate vector permute statements from a list of loads in DR_CHAIN.
5516 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
5517 permute statements for the SLP node NODE. Store the number of vector
5518 permute instructions in *N_PERMS and the number of vector load
5519 instructions in *N_LOADS. */
5520
5521 bool
5522 vect_transform_slp_perm_load (vec_info *vinfo,
5523 slp_tree node, vec<tree> dr_chain,
5524 gimple_stmt_iterator *gsi, poly_uint64 vf,
5525 bool analyze_only, unsigned *n_perms,
5526 unsigned int *n_loads)
5527 {
5528 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5529 int vec_index = 0;
5530 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5531 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
5532 unsigned int mask_element;
5533 machine_mode mode;
5534
5535 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
5536 return false;
5537
5538 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
5539
5540 mode = TYPE_MODE (vectype);
5541 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5542
5543 /* Initialize the vect stmts of NODE to properly insert the generated
5544 stmts later. */
5545 if (! analyze_only)
5546 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
5547 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
5548 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
5549
5550 /* Generate permutation masks for every NODE. Number of masks for each NODE
5551 is equal to GROUP_SIZE.
5552 E.g., we have a group of three nodes with three loads from the same
5553 location in each node, and the vector size is 4. I.e., we have an
5554 a0b0c0a1b1c1... sequence and we need to create the following vectors:
5555 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
5556 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
5557 ...
5558
5559 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
5560 The last mask is illegal since we assume two operands for the permute
5561 operation, and the mask element values can't be outside that range.
5562 Hence, the last mask must be converted into {2,5,5,5}.
5563 For the first two permutations we need the first and the second input
5564 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
5565 we need the second and the third vectors: {b1,c1,a2,b2} and
5566 {c2,a3,b3,c3}. */
5567
5568 int vect_stmts_counter = 0;
5569 unsigned int index = 0;
5570 int first_vec_index = -1;
5571 int second_vec_index = -1;
5572 bool noop_p = true;
5573 *n_perms = 0;
5574
5575 vec_perm_builder mask;
5576 unsigned int nelts_to_build;
5577 unsigned int nvectors_per_build;
5578 unsigned int in_nlanes;
5579 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
5580 && multiple_p (nunits, group_size));
5581 if (repeating_p)
5582 {
5583 /* A single vector contains a whole number of copies of the node, so:
5584 (a) all permutes can use the same mask; and
5585 (b) the permutes only need a single vector input. */
5586 mask.new_vector (nunits, group_size, 3);
5587 nelts_to_build = mask.encoded_nelts ();
5588 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
5589 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
5590 }
5591 else
5592 {
5593 /* We need to construct a separate mask for each vector statement. */
5594 unsigned HOST_WIDE_INT const_nunits, const_vf;
5595 if (!nunits.is_constant (&const_nunits)
5596 || !vf.is_constant (&const_vf))
5597 return false;
5598 mask.new_vector (const_nunits, const_nunits, 1);
5599 nelts_to_build = const_vf * group_size;
5600 nvectors_per_build = 1;
5601 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
5602 }
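/* USED_IN_LANES records which lanes of the load group are actually
   referenced by the permutation; in the !repeating_p case it is used
   at the end to count the vector loads required for *N_LOADS.  */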
5603 auto_sbitmap used_in_lanes (in_nlanes);
5604 bitmap_clear (used_in_lanes);
5605
5606 unsigned int count = mask.encoded_nelts ();
5607 mask.quick_grow (count);
5608 vec_perm_indices indices;
5609
5610 for (unsigned int j = 0; j < nelts_to_build; j++)
5611 {
5612 unsigned int iter_num = j / group_size;
5613 unsigned int stmt_num = j % group_size;
5614 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
5615 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
5616 bitmap_set_bit (used_in_lanes, i);
5617 if (repeating_p)
5618 {
5619 first_vec_index = 0;
5620 mask_element = i;
5621 }
5622 else
5623 {
5624 /* Enforced before the loop when !repeating_p. */
5625 unsigned int const_nunits = nunits.to_constant ();
5626 vec_index = i / const_nunits;
5627 mask_element = i % const_nunits;
5628 if (vec_index == first_vec_index
5629 || first_vec_index == -1)
5630 {
5631 first_vec_index = vec_index;
5632 }
5633 else if (vec_index == second_vec_index
5634 || second_vec_index == -1)
5635 {
5636 second_vec_index = vec_index;
5637 mask_element += const_nunits;
5638 }
5639 else
5640 {
5641 if (dump_enabled_p ())
5642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5643 "permutation requires at "
5644 "least three vectors %G",
5645 stmt_info->stmt);
5646 gcc_assert (analyze_only);
5647 return false;
5648 }
5649
5650 gcc_assert (mask_element < 2 * const_nunits);
5651 }
5652
5653 if (mask_element != index)
5654 noop_p = false;
5655 mask[index++] = mask_element;
5656
5657 if (index == count && !noop_p)
5658 {
5659 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
5660 if (!can_vec_perm_const_p (mode, indices))
5661 {
5662 if (dump_enabled_p ())
5663 {
5664 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5665 vect_location,
5666 "unsupported vect permute { ");
5667 for (i = 0; i < count; ++i)
5668 {
5669 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5670 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5671 }
5672 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5673 }
5674 gcc_assert (analyze_only);
5675 return false;
5676 }
5677
5678 ++*n_perms;
5679 }
5680
5681 if (index == count)
5682 {
5683 if (!analyze_only)
5684 {
5685 tree mask_vec = NULL_TREE;
5686
5687 if (! noop_p)
5688 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5689
5690 if (second_vec_index == -1)
5691 second_vec_index = first_vec_index;
5692
5693 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
5694 {
5695 /* Generate the permute statement if necessary. */
5696 tree first_vec = dr_chain[first_vec_index + ri];
5697 tree second_vec = dr_chain[second_vec_index + ri];
5698 gimple *perm_stmt;
5699 if (! noop_p)
5700 {
5701 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5702 tree perm_dest
5703 = vect_create_destination_var (gimple_assign_lhs (stmt),
5704 vectype);
5705 perm_dest = make_ssa_name (perm_dest);
5706 perm_stmt
5707 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5708 first_vec, second_vec,
5709 mask_vec);
5710 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
5711 gsi);
5712 }
5713 else
5714 /* If mask was NULL_TREE generate the requested
5715 identity transform. */
5716 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
5717
5718 /* Store the vector statement in NODE. */
5719 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
5720 }
5721 }
5722
5723 index = 0;
5724 first_vec_index = -1;
5725 second_vec_index = -1;
5726 noop_p = true;
5727 }
5728 }
5729
5730 if (n_loads)
5731 {
5732 if (repeating_p)
5733 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5734 else
5735 {
5736 /* Enforced above when !repeating_p. */
5737 unsigned int const_nunits = nunits.to_constant ();
5738 *n_loads = 0;
5739 bool load_seen = false;
5740 for (unsigned i = 0; i < in_nlanes; ++i)
5741 {
5742 if (i % const_nunits == 0)
5743 {
5744 if (load_seen)
5745 *n_loads += 1;
5746 load_seen = false;
5747 }
5748 if (bitmap_bit_p (used_in_lanes, i))
5749 load_seen = true;
5750 }
5751 if (load_seen)
5752 *n_loads += 1;
5753 }
5754 }
5755
5756 return true;
5757 }
5758
5759
5760 /* Vectorize the SLP permutations in NODE as specified
5761 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5762 child number and lane number.
5763 Interleaving of two two-lane two-child SLP subtrees (not supported):
5764 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5765 A blend of two four-lane two-child SLP subtrees:
5766 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5767 Highpart of a four-lane one-child SLP subtree (not supported):
5768 [ { 0, 2 }, { 0, 3 } ]
5769 Currently only a subset of these is supported by the code generation below. */
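/* As an illustrative example of the code generation below (hypothetical
   types): for the supported blend
   [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   with two V4SI children and VF 1, each { operand, lane } pair maps to
   vector 0 of the respective child and the permute collapses to a single
   two-input VEC_PERM_EXPR with mask { 0, 5, 2, 7 }, lanes of the second
   child being offset by the number of vector elements.  */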
5770
5771 static bool
5772 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5773 slp_tree node, stmt_vector_for_cost *cost_vec)
5774 {
5775 tree vectype = SLP_TREE_VECTYPE (node);
5776
5777 /* ??? We currently only support the case where all vector input and output
5778 types are the same, while the SLP IL should really do a concat + select
5779 and thus accept arbitrary mismatches. */
5780 slp_tree child;
5781 unsigned i;
5782 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5783 if (!vect_maybe_update_slp_op_vectype (child, vectype)
5784 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
5785 {
5786 if (dump_enabled_p ())
5787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5788 "Unsupported lane permutation\n");
5789 return false;
5790 }
5791
5792 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
5793 gcc_assert (perm.length () == SLP_TREE_LANES (node));
5794 if (dump_enabled_p ())
5795 {
5796 dump_printf_loc (MSG_NOTE, vect_location,
5797 "vectorizing permutation");
5798 for (unsigned i = 0; i < perm.length (); ++i)
5799 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
5800 dump_printf (MSG_NOTE, "\n");
5801 }
5802
5803 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5804 if (!nunits.is_constant ())
5805 return false;
5806 unsigned HOST_WIDE_INT vf = 1;
5807 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
5808 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
5809 return false;
5810 unsigned olanes = vf * SLP_TREE_LANES (node);
5811 gcc_assert (multiple_p (olanes, nunits));
5812
5813 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
5814 from the { SLP operand, scalar lane } permutation as recorded in the
5815 SLP node as an intermediate step. This part should already work
5816 with SLP children with an arbitrary number of lanes. */
5817 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
5818 auto_vec<unsigned> active_lane;
5819 vperm.create (olanes);
5820 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
5821 for (unsigned i = 0; i < vf; ++i)
5822 {
5823 for (unsigned pi = 0; pi < perm.length (); ++pi)
5824 {
5825 std::pair<unsigned, unsigned> p = perm[pi];
5826 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
5827 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
5828 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
5829 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
5830 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
5831 }
5832 /* Advance to the next group. */
5833 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
5834 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
5835 }
5836
5837 if (dump_enabled_p ())
5838 {
5839 dump_printf_loc (MSG_NOTE, vect_location, "as");
5840 for (unsigned i = 0; i < vperm.length (); ++i)
5841 {
5842 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
5843 dump_printf (MSG_NOTE, ",");
5844 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
5845 vperm[i].first.first, vperm[i].first.second,
5846 vperm[i].second);
5847 }
5848 dump_printf (MSG_NOTE, "\n");
5849 }
5850
5851 /* We can only handle two-vector permutes; everything else should
5852 be lowered on the SLP level. The following is closely inspired
5853 by vect_transform_slp_perm_load and is supposed to eventually
5854 replace it.
5855 ??? As an intermediate step, do code-gen in the SLP tree representation
5856 somehow? */
5857 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
5858 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
5859 unsigned int const_nunits = nunits.to_constant ();
5860 unsigned int index = 0;
5861 unsigned int mask_element;
5862 vec_perm_builder mask;
5863 mask.new_vector (const_nunits, const_nunits, 1);
5864 unsigned int count = mask.encoded_nelts ();
5865 mask.quick_grow (count);
5866 vec_perm_indices indices;
5867 unsigned nperms = 0;
5868 for (unsigned i = 0; i < vperm.length (); ++i)
5869 {
5870 mask_element = vperm[i].second;
5871 if (first_vec.first == -1U
5872 || first_vec == vperm[i].first)
5873 first_vec = vperm[i].first;
5874 else if (second_vec.first == -1U
5875 || second_vec == vperm[i].first)
5876 {
5877 second_vec = vperm[i].first;
5878 mask_element += const_nunits;
5879 }
5880 else
5881 {
5882 if (dump_enabled_p ())
5883 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5884 "permutation requires at "
5885 "least three vectors\n");
5886 gcc_assert (!gsi);
5887 return false;
5888 }
5889
5890 mask[index++] = mask_element;
5891
5892 if (index == count)
5893 {
5894 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
5895 const_nunits);
5896 bool identity_p = indices.series_p (0, 1, 0, 1);
5897 if (!identity_p
5898 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5899 {
5900 if (dump_enabled_p ())
5901 {
5902 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5903 vect_location,
5904 "unsupported vect permute { ");
5905 for (i = 0; i < count; ++i)
5906 {
5907 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5908 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5909 }
5910 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5911 }
5912 gcc_assert (!gsi);
5913 return false;
5914 }
5915
5916 if (!identity_p)
5917 nperms++;
5918 if (gsi)
5919 {
5920 if (second_vec.first == -1U)
5921 second_vec = first_vec;
5922
5923 /* Generate the permute statement if necessary. */
5924 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
5925 tree first_def
5926 = vect_get_slp_vect_def (first_node, first_vec.second);
5927 /* ??? We SLP match existing vector element extracts but
5928 allow punning which we need to re-instantiate at uses
5929 but have no good way of explicitly representing. */
5930 if (!types_compatible_p (TREE_TYPE (first_def), vectype))
5931 {
5932 gassign *conv_stmt;
5933 conv_stmt = gimple_build_assign (make_ssa_name (vectype),
5934 build1 (VIEW_CONVERT_EXPR,
5935 vectype, first_def));
5936 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
5937 first_def = gimple_assign_lhs (conv_stmt);
5938 }
5939 gassign *perm_stmt;
5940 tree perm_dest = make_ssa_name (vectype);
5941 if (!identity_p)
5942 {
5943 slp_tree second_node
5944 = SLP_TREE_CHILDREN (node)[second_vec.first];
5945 tree second_def
5946 = vect_get_slp_vect_def (second_node, second_vec.second);
5947 if (!types_compatible_p (TREE_TYPE (second_def), vectype))
5948 {
5949 gassign *conv_stmt;
5950 conv_stmt = gimple_build_assign (make_ssa_name (vectype),
5951 build1
5952 (VIEW_CONVERT_EXPR,
5953 vectype, second_def));
5954 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
5955 second_def = gimple_assign_lhs (conv_stmt);
5956 }
5957 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5958 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5959 first_def, second_def,
5960 mask_vec);
5961 }
5962 else
5963 /* We need a copy here in case the def was external. */
5964 perm_stmt = gimple_build_assign (perm_dest, first_def);
5965 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
5966 /* Store the vector statement in NODE. */
5967 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
5968 }
5969
5970 index = 0;
5971 first_vec = std::make_pair (-1U, -1U);
5972 second_vec = std::make_pair (-1U, -1U);
5973 }
5974 }
5975
5976 if (!gsi)
5977 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
5978
5979 return true;
5980 }
5981
5982 /* Vectorize SLP NODE. */
5983
5984 static void
5985 vect_schedule_slp_node (vec_info *vinfo,
5986 slp_tree node, slp_instance instance)
5987 {
5988 gimple_stmt_iterator si;
5989 int i;
5990 slp_tree child;
5991
5992 /* For existing vectors there's nothing to do. */
5993 if (SLP_TREE_VEC_DEFS (node).exists ())
5994 return;
5995
5996 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
5997
5998 /* Vectorize externals and constants. */
5999 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
6000 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6001 {
6002 /* ??? vectorizable_shift can end up using a scalar operand which is
6003 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
6004 node in this case. */
6005 if (!SLP_TREE_VECTYPE (node))
6006 return;
6007
6008 vect_create_constant_vectors (vinfo, node);
6009 return;
6010 }
6011
6012 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6013
6014 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
6015 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6016
6017 if (dump_enabled_p ())
6018 dump_printf_loc (MSG_NOTE, vect_location,
6019 "------>vectorizing SLP node starting from: %G",
6020 stmt_info->stmt);
6021
6022 if (STMT_VINFO_DATA_REF (stmt_info)
6023 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
6024 {
6025 /* Vectorized loads go before the first scalar load to make it
6026 ready early; vectorized stores go before the last scalar
6027 stmt, which is where all uses are ready. */
6028 stmt_vec_info last_stmt_info = NULL;
6029 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
6030 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
6031 else /* DR_IS_WRITE */
6032 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
6033 si = gsi_for_stmt (last_stmt_info->stmt);
6034 }
6035 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
6036 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
6037 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
6038 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
6039 {
6040 /* For PHI node vectorization we do not use the insertion iterator. */
6041 si = gsi_none ();
6042 }
6043 else
6044 {
6045 /* Emit other stmts after the children's vectorized defs, which is
6046 the earliest possible place. */
6047 gimple *last_stmt = NULL;
6048 bool seen_vector_def = false;
6049 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6050 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6051 {
6052 /* For fold-left reductions we are retaining the scalar
6053 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
6054 set so the representation isn't perfect. Resort to the
6055 last scalar def here. */
6056 if (SLP_TREE_VEC_STMTS (child).is_empty ())
6057 {
6058 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
6059 == cycle_phi_info_type);
6060 gphi *phi = as_a <gphi *>
6061 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
6062 if (!last_stmt
6063 || vect_stmt_dominates_stmt_p (last_stmt, phi))
6064 last_stmt = phi;
6065 }
6066 /* We are emitting all vectorized stmts in the same place and
6067 the last one is the last.
6068 ??? Unless we have a load permutation applied and that
6069 figures to re-use an earlier generated load. */
6070 unsigned j;
6071 gimple *vstmt;
6072 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
6073 if (!last_stmt
6074 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
6075 last_stmt = vstmt;
6076 }
6077 else if (!SLP_TREE_VECTYPE (child))
6078 {
6079 /* For externals the scalar defs are used unvectorized, so look at all of them. */
6080 unsigned j;
6081 tree def;
6082 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
6083 if (TREE_CODE (def) == SSA_NAME
6084 && !SSA_NAME_IS_DEFAULT_DEF (def))
6085 {
6086 gimple *stmt = SSA_NAME_DEF_STMT (def);
6087 if (!last_stmt
6088 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
6089 last_stmt = stmt;
6090 }
6091 }
6092 else
6093 {
6094 /* For externals we have to look at all defs since their
6095 insertion place is decided per vector. But beware
6096 of pre-existing vectors where we need to make sure
6097 we do not insert before the region boundary. */
6098 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
6099 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
6100 seen_vector_def = true;
6101 else
6102 {
6103 unsigned j;
6104 tree vdef;
6105 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
6106 if (TREE_CODE (vdef) == SSA_NAME
6107 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
6108 {
6109 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
6110 if (!last_stmt
6111 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
6112 last_stmt = vstmt;
6113 }
6114 }
6115 }
6116 /* This can happen when all children are pre-existing vectors or
6117 constants. */
6118 if (!last_stmt)
6119 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
6120 if (!last_stmt)
6121 {
6122 gcc_assert (seen_vector_def);
6123 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
6124 }
6125 else if (is_a <gphi *> (last_stmt))
6126 si = gsi_after_labels (gimple_bb (last_stmt));
6127 else
6128 {
6129 si = gsi_for_stmt (last_stmt);
6130 gsi_next (&si);
6131 }
6132 }
6133
6134 bool done_p = false;
6135
6136 /* Handle purely internal nodes. */
6137 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6138 {
6139 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
6140 be shared with different SLP nodes (but usually it's the same
6141 operation apart from the case where the stmt is only there for
6142 denoting the actual scalar lane defs ...). So do not call
6143 vect_transform_stmt but open-code it here (partly). */
6144 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
6145 gcc_assert (done);
6146 done_p = true;
6147 }
6148 if (!done_p)
6149 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
6150 }
6151
6152 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
6153 For loop vectorization this is done in vectorizable_call, but for SLP
6154 it needs to be deferred until end of vect_schedule_slp, because multiple
6155 SLP instances may refer to the same scalar stmt. */
6156
6157 static void
6158 vect_remove_slp_scalar_calls (vec_info *vinfo,
6159 slp_tree node, hash_set<slp_tree> &visited)
6160 {
6161 gimple *new_stmt;
6162 gimple_stmt_iterator gsi;
6163 int i;
6164 slp_tree child;
6165 tree lhs;
6166 stmt_vec_info stmt_info;
6167
6168 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
6169 return;
6170
6171 if (visited.add (node))
6172 return;
6173
6174 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6175 vect_remove_slp_scalar_calls (vinfo, child, visited);
6176
6177 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6178 {
6179 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
6180 if (!stmt || gimple_bb (stmt) == NULL)
6181 continue;
6182 if (is_pattern_stmt_p (stmt_info)
6183 || !PURE_SLP_STMT (stmt_info))
6184 continue;
6185 lhs = gimple_call_lhs (stmt);
6186 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
6187 gsi = gsi_for_stmt (stmt);
6188 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
6189 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
6190 }
6191 }
6192
6193 static void
6194 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
6195 {
6196 hash_set<slp_tree> visited;
6197 vect_remove_slp_scalar_calls (vinfo, node, visited);
6198 }
6199
6200 /* Vectorize the instance root. */
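/* As an illustrative example (hypothetical SSA names): for a BB SLP
   instance rooted at a vector constructor like
     v_1 = {a_2, a_3, a_4, a_5};
   the single vectorized def of the SLP node simply replaces the
   constructor (through a VIEW_CONVERT_EXPR if the types differ), while
   with multiple vector stmts a constructor of their defs is built.  */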
6201
6202 void
6203 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
6204 {
6205 gassign *rstmt = NULL;
6206
6207 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
6208 {
6209 gimple *child_stmt;
6210 int j;
6211
6212 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
6213 {
6214 tree vect_lhs = gimple_get_lhs (child_stmt);
6215 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
6216 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
6217 TREE_TYPE (vect_lhs)))
6218 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
6219 vect_lhs);
6220 rstmt = gimple_build_assign (root_lhs, vect_lhs);
6221 break;
6222 }
6223 }
6224 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
6225 {
6226 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6227 gimple *child_stmt;
6228 int j;
6229 vec<constructor_elt, va_gc> *v;
6230 vec_alloc (v, nelts);
6231
6232 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
6233 {
6234 CONSTRUCTOR_APPEND_ELT (v,
6235 NULL_TREE,
6236 gimple_get_lhs (child_stmt));
6237 }
6238 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
6239 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
6240 tree r_constructor = build_constructor (rtype, v);
6241 rstmt = gimple_build_assign (lhs, r_constructor);
6242 }
6243
6244 gcc_assert (rstmt);
6245
6246 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
6247 gsi_replace (&rgsi, rstmt, true);
6248 }
6249
6250 struct slp_scc_info
6251 {
6252 bool on_stack;
6253 int dfs;
6254 int lowlink;
6255 };
6256
6257 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
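/* This is in essence Tarjan's SCC algorithm: DFS and LOWLINK in
   SLP_SCC_INFO play their usual roles and a node is an SCC root when
   lowlink == dfs; the only twist is that nodes are code generated as
   soon as they become ready, breaking cycles at PHIs.  */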
6258
6259 static void
6260 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
6261 hash_map<slp_tree, slp_scc_info> &scc_info,
6262 int &maxdfs, vec<slp_tree> &stack)
6263 {
6264 bool existed_p;
6265 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
6266 gcc_assert (!existed_p);
6267 info->dfs = maxdfs;
6268 info->lowlink = maxdfs;
6269 maxdfs++;
6270
6271 /* Leaf. */
6272 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
6273 {
6274 info->on_stack = false;
6275 vect_schedule_slp_node (vinfo, node, instance);
6276 return;
6277 }
6278
6279 info->on_stack = true;
6280 stack.safe_push (node);
6281
6282 unsigned i;
6283 slp_tree child;
6284 /* DFS recurse. */
6285 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6286 {
6287 if (!child)
6288 continue;
6289 slp_scc_info *child_info = scc_info.get (child);
6290 if (!child_info)
6291 {
6292 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
6293 /* Recursion might have re-allocated the node. */
6294 info = scc_info.get (node);
6295 child_info = scc_info.get (child);
6296 info->lowlink = MIN (info->lowlink, child_info->lowlink);
6297 }
6298 else if (child_info->on_stack)
6299 info->lowlink = MIN (info->lowlink, child_info->dfs);
6300 }
6301 if (info->lowlink != info->dfs)
6302 return;
6303
6304 auto_vec<slp_tree, 4> phis_to_fixup;
6305
6306 /* Singleton. */
6307 if (stack.last () == node)
6308 {
6309 stack.pop ();
6310 info->on_stack = false;
6311 vect_schedule_slp_node (vinfo, node, instance);
6312 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6313 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
6314 phis_to_fixup.quick_push (node);
6315 }
6316 else
6317 {
6318 /* SCC. */
6319 int last_idx = stack.length () - 1;
6320 while (stack[last_idx] != node)
6321 last_idx--;
6322 /* We can break the cycle at PHIs that have at least one child
6323 code generated. Then we could re-start the DFS walk until
6324 all nodes in the SCC are covered (we might have new entries
6325 for only back-reachable nodes). But it's simpler to just
6326 iterate and schedule those that are ready. */
6327 unsigned todo = stack.length () - last_idx;
6328 do
6329 {
6330 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
6331 {
6332 slp_tree entry = stack[idx];
6333 if (!entry)
6334 continue;
6335 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
6336 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
6337 bool ready = !phi;
6338 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
6339 if (!child)
6340 {
6341 gcc_assert (phi);
6342 ready = true;
6343 break;
6344 }
6345 else if (scc_info.get (child)->on_stack)
6346 {
6347 if (!phi)
6348 {
6349 ready = false;
6350 break;
6351 }
6352 }
6353 else
6354 {
6355 if (phi)
6356 {
6357 ready = true;
6358 break;
6359 }
6360 }
6361 if (ready)
6362 {
6363 vect_schedule_slp_node (vinfo, entry, instance);
6364 scc_info.get (entry)->on_stack = false;
6365 stack[idx] = NULL;
6366 todo--;
6367 if (phi)
6368 phis_to_fixup.safe_push (entry);
6369 }
6370 }
6371 }
6372 while (todo != 0);
6373
6374 /* Pop the SCC. */
6375 stack.truncate (last_idx);
6376 }
6377
6378 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
6379 slp_tree phi_node;
6380 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
6381 {
6382 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
6383 edge_iterator ei;
6384 edge e;
6385 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
6386 {
6387 unsigned dest_idx = e->dest_idx;
6388 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
6389 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
6390 continue;
6391 /* Simply fill all args. */
6392 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
6393 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
6394 vect_get_slp_vect_def (child, i),
6395 e, gimple_phi_arg_location (phi, dest_idx));
6396 }
6397 }
6398 }
6399
6400 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
6401
6402 void
6403 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
6404 {
6405 slp_instance instance;
6406 unsigned int i;
6407
6408 hash_map<slp_tree, slp_scc_info> scc_info;
6409 int maxdfs = 0;
6410 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6411 {
6412 slp_tree node = SLP_INSTANCE_TREE (instance);
6413 if (dump_enabled_p ())
6414 {
6415 dump_printf_loc (MSG_NOTE, vect_location,
6416 "Vectorizing SLP tree:\n");
6417 if (SLP_INSTANCE_ROOT_STMT (instance))
6418 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
6419 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
6420 vect_print_slp_graph (MSG_NOTE, vect_location,
6421 SLP_INSTANCE_TREE (instance));
6422 }
6423 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
6424 have a PHI be the node breaking the cycle. */
6425 auto_vec<slp_tree> stack;
6426 if (!scc_info.get (node))
6427 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
6428
6429 if (SLP_INSTANCE_ROOT_STMT (instance))
6430 vectorize_slp_instance_root_stmt (node, instance);
6431
6432 if (dump_enabled_p ())
6433 dump_printf_loc (MSG_NOTE, vect_location,
6434 "vectorizing stmts using SLP.\n");
6435 }
6436
6437 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6438 {
6439 slp_tree root = SLP_INSTANCE_TREE (instance);
6440 stmt_vec_info store_info;
6441 unsigned int j;
6442
6443 /* Remove scalar call stmts. Do not do this for basic-block
6444 vectorization as not all uses may be vectorized.
6445 ??? Why should this be necessary? DCE should be able to
6446 remove the stmts itself.
6447 ??? For BB vectorization we can as well remove scalar
6448 stmts starting from the SLP tree root if they have no
6449 uses. */
6450 if (is_a <loop_vec_info> (vinfo))
6451 vect_remove_slp_scalar_calls (vinfo, root);
6452
6453 /* Remove the vectorized stores' original scalar stmts. */
6454 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
6455 {
6456 if (!STMT_VINFO_DATA_REF (store_info)
6457 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
6458 break;
6459
6460 store_info = vect_orig_stmt (store_info);
6461 /* Free the attached stmt_vec_info and remove the stmt. */
6462 vinfo->remove_stmt (store_info);
6463 }
6464 }
6465 }