1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
52
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 slp_tree, stmt_vector_for_cost *);
55
56 static object_allocator<_slp_tree> *slp_tree_pool;
57 static slp_tree slp_first_node;
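/* All SLP nodes live in SLP_TREE_POOL and are additionally chained
   through SLP_FIRST_NODE (see the _slp_tree constructor/destructor
   below), so vect_slp_fini can reclaim any nodes still live at the
   end of vectorization.  */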
58
59 void
60 vect_slp_init (void)
61 {
62 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
63 }
64
65 void
66 vect_slp_fini (void)
67 {
68 while (slp_first_node)
69 delete slp_first_node;
70 delete slp_tree_pool;
71 slp_tree_pool = NULL;
72 }
73
74 void *
75 _slp_tree::operator new (size_t n)
76 {
77 gcc_assert (n == sizeof (_slp_tree));
78 return slp_tree_pool->allocate_raw ();
79 }
80
81 void
82 _slp_tree::operator delete (void *node, size_t n)
83 {
84 gcc_assert (n == sizeof (_slp_tree));
85 slp_tree_pool->remove_raw (node);
86 }
87
88
89 /* Initialize a SLP node. */
90
91 _slp_tree::_slp_tree ()
92 {
93 this->prev_node = NULL;
94 if (slp_first_node)
95 slp_first_node->prev_node = this;
96 this->next_node = slp_first_node;
97 slp_first_node = this;
98 SLP_TREE_SCALAR_STMTS (this) = vNULL;
99 SLP_TREE_SCALAR_OPS (this) = vNULL;
100 SLP_TREE_VEC_STMTS (this) = vNULL;
101 SLP_TREE_VEC_DEFS (this) = vNULL;
102 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
103 SLP_TREE_CHILDREN (this) = vNULL;
104 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
105 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
106 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
107 SLP_TREE_CODE (this) = ERROR_MARK;
108 SLP_TREE_VECTYPE (this) = NULL_TREE;
109 SLP_TREE_REPRESENTATIVE (this) = NULL;
110 SLP_TREE_REF_COUNT (this) = 1;
111 this->max_nunits = 1;
112 this->lanes = 0;
113 }
114
115 /* Tear down a SLP node. */
116
117 _slp_tree::~_slp_tree ()
118 {
119 if (this->prev_node)
120 this->prev_node->next_node = this->next_node;
121 else
122 slp_first_node = this->next_node;
123 if (this->next_node)
124 this->next_node->prev_node = this->prev_node;
125 SLP_TREE_CHILDREN (this).release ();
126 SLP_TREE_SCALAR_STMTS (this).release ();
127 SLP_TREE_SCALAR_OPS (this).release ();
128 SLP_TREE_VEC_STMTS (this).release ();
129 SLP_TREE_VEC_DEFS (this).release ();
130 SLP_TREE_LOAD_PERMUTATION (this).release ();
131 SLP_TREE_LANE_PERMUTATION (this).release ();
132 }
133
134 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
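/* Nodes are reference counted because they can be shared between SLP
   instances and are referenced from the bst_map used during SLP
   discovery; a node is only released once its last reference is
   dropped.  */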
135
136 void
137 vect_free_slp_tree (slp_tree node)
138 {
139 int i;
140 slp_tree child;
141
142 if (--SLP_TREE_REF_COUNT (node) != 0)
143 return;
144
145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
146 if (child)
147 vect_free_slp_tree (child);
148
149 delete node;
150 }
151
152 /* Return a location suitable for dumps related to the SLP instance. */
153
154 dump_user_location_t
155 _slp_instance::location () const
156 {
157 if (root_stmt)
158 return root_stmt->stmt;
159 else
160 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
161 }
162
163
164 /* Free the memory allocated for the SLP instance. */
165
166 void
167 vect_free_slp_instance (slp_instance instance)
168 {
169 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
170 SLP_INSTANCE_LOADS (instance).release ();
171 instance->subgraph_entries.release ();
172 instance->cost_vec.release ();
173 free (instance);
174 }
175
176
177 /* Create an SLP node with NOPS children and operation CODE but no scalar statements. */
178
179 slp_tree
180 vect_create_new_slp_node (unsigned nops, tree_code code)
181 {
182 slp_tree node = new _slp_tree;
183 SLP_TREE_SCALAR_STMTS (node) = vNULL;
184 SLP_TREE_CHILDREN (node).create (nops);
185 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
186 SLP_TREE_CODE (node) = code;
187 return node;
188 }
189 /* Create an SLP node for SCALAR_STMTS. */
190
191 static slp_tree
192 vect_create_new_slp_node (slp_tree node,
193 vec<stmt_vec_info> scalar_stmts, unsigned nops)
194 {
195 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
196 SLP_TREE_CHILDREN (node).create (nops);
197 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
198 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
199 SLP_TREE_LANES (node) = scalar_stmts.length ();
200 return node;
201 }
202
203 /* Create an SLP node for SCALAR_STMTS. */
204
205 static slp_tree
206 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
207 {
208 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
209 }
210
211 /* Create an SLP node for OPS. */
212
213 static slp_tree
214 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
215 {
216 SLP_TREE_SCALAR_OPS (node) = ops;
217 SLP_TREE_DEF_TYPE (node) = vect_external_def;
218 SLP_TREE_LANES (node) = ops.length ();
219 return node;
220 }
221
222 /* Create an SLP node for OPS. */
223
224 static slp_tree
225 vect_create_new_slp_node (vec<tree> ops)
226 {
227 return vect_create_new_slp_node (new _slp_tree, ops);
228 }
229
230
231 /* This structure is used in creation of an SLP tree. Each instance
232 corresponds to the same operand in a group of scalar stmts in an SLP
233 node. */
234 typedef struct _slp_oprnd_info
235 {
236 /* Def-stmts for the operands. */
237 vec<stmt_vec_info> def_stmts;
238 /* Operands. */
239 vec<tree> ops;
240 /* Information about the first statement, its vector def-type, type, the
241 operand itself in case it's constant, and an indication if it's a pattern
242 stmt. */
243 tree first_op_type;
244 enum vect_def_type first_dt;
245 bool any_pattern;
246 } *slp_oprnd_info;
247
248
249 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
250 operand. */
251 static vec<slp_oprnd_info>
252 vect_create_oprnd_info (int nops, int group_size)
253 {
254 int i;
255 slp_oprnd_info oprnd_info;
256 vec<slp_oprnd_info> oprnds_info;
257
258 oprnds_info.create (nops);
259 for (i = 0; i < nops; i++)
260 {
261 oprnd_info = XNEW (struct _slp_oprnd_info);
262 oprnd_info->def_stmts.create (group_size);
263 oprnd_info->ops.create (group_size);
264 oprnd_info->first_dt = vect_uninitialized_def;
265 oprnd_info->first_op_type = NULL_TREE;
266 oprnd_info->any_pattern = false;
267 oprnds_info.quick_push (oprnd_info);
268 }
269
270 return oprnds_info;
271 }
272
273
274 /* Free operands info. */
275
276 static void
277 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
278 {
279 int i;
280 slp_oprnd_info oprnd_info;
281
282 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
283 {
284 oprnd_info->def_stmts.release ();
285 oprnd_info->ops.release ();
286 XDELETE (oprnd_info);
287 }
288
289 oprnds_info.release ();
290 }
291
292
293 /* Return true if STMTS contains a pattern statement. */
294
295 static bool
296 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
297 {
298 stmt_vec_info stmt_info;
299 unsigned int i;
300 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
301 if (is_pattern_stmt_p (stmt_info))
302 return true;
303 return false;
304 }
305
306 /* Return true when all lanes in the external or constant NODE have
307 the same value. */
308
309 static bool
310 vect_slp_tree_uniform_p (slp_tree node)
311 {
312 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
313 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
314
315 /* Pre-existing vectors. */
316 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
317 return false;
318
319 unsigned i;
320 tree op, first = NULL_TREE;
321 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
322 if (!first)
323 first = op;
324 else if (!operand_equal_p (first, op, 0))
325 return false;
326
327 return true;
328 }
329
330 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
331 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
332 of the chain. */
333
334 int
335 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
336 stmt_vec_info first_stmt_info)
337 {
338 stmt_vec_info next_stmt_info = first_stmt_info;
339 int result = 0;
340
341 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
342 return -1;
343
344 do
345 {
346 if (next_stmt_info == stmt_info)
347 return result;
348 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
349 if (next_stmt_info)
350 result += DR_GROUP_GAP (next_stmt_info);
351 }
352 while (next_stmt_info);
353
354 return -1;
355 }
356
357 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
358 using the method implemented by duplicate_and_interleave. Return true
359 if so, returning the number of intermediate vectors in *NVECTORS_OUT
360 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
361 (if nonnull). */
362
363 bool
364 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
365 tree elt_type, unsigned int *nvectors_out,
366 tree *vector_type_out,
367 tree *permutes)
368 {
369 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
370 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
371 return false;
372
373 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
374 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
375 unsigned int nvectors = 1;
376 for (;;)
377 {
378 scalar_int_mode int_mode;
379 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
380 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
381 {
382 /* Get the natural vector type for this SLP group size. */
383 tree int_type = build_nonstandard_integer_type
384 (GET_MODE_BITSIZE (int_mode), 1);
385 tree vector_type
386 = get_vectype_for_scalar_type (vinfo, int_type, count);
387 if (vector_type
388 && VECTOR_MODE_P (TYPE_MODE (vector_type))
389 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
390 GET_MODE_SIZE (base_vector_mode)))
391 {
392 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
393 together into elements of type INT_TYPE and using the result
394 to build NVECTORS vectors. */
395 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
396 vec_perm_builder sel1 (nelts, 2, 3);
397 vec_perm_builder sel2 (nelts, 2, 3);
398 poly_int64 half_nelts = exact_div (nelts, 2);
399 for (unsigned int i = 0; i < 3; ++i)
400 {
401 sel1.quick_push (i);
402 sel1.quick_push (i + nelts);
403 sel2.quick_push (half_nelts + i);
404 sel2.quick_push (half_nelts + i + nelts);
405 }
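/* For example, with NELTS == 4 this builds the interleave-low
   selector { 0, 4, 1, 5 } and the interleave-high selector
   { 2, 6, 3, 7 } on the concatenation of the two input vectors.  */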
406 vec_perm_indices indices1 (sel1, 2, nelts);
407 vec_perm_indices indices2 (sel2, 2, nelts);
408 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
409 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
410 {
411 if (nvectors_out)
412 *nvectors_out = nvectors;
413 if (vector_type_out)
414 *vector_type_out = vector_type;
415 if (permutes)
416 {
417 permutes[0] = vect_gen_perm_mask_checked (vector_type,
418 indices1);
419 permutes[1] = vect_gen_perm_mask_checked (vector_type,
420 indices2);
421 }
422 return true;
423 }
424 }
425 }
426 if (!multiple_p (elt_bytes, 2, &elt_bytes))
427 return false;
428 nvectors *= 2;
429 }
430 }
431
432 /* Return true if DTA and DTB match. */
433
434 static bool
435 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
436 {
437 return (dta == dtb
438 || ((dta == vect_external_def || dta == vect_constant_def)
439 && (dtb == vect_external_def || dtb == vect_constant_def)));
440 }
441
442 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
443 they are of a valid type and that they match the defs of the first stmt of
444 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
445 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
446 indicates swap is required for cond_expr stmts. Specifically, *SWAP
447 is 1 if STMT is cond and operands of comparison need to be swapped;
448 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
449 If there is any operand swap in this function, *SWAP is set to a
450 non-zero value.
451 If there was a fatal error return -1; if the error could be corrected by
452 swapping operands of father node of this one, return 1; if everything is
453 ok return 0. */
454 static int
455 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
456 bool *skip_args,
457 vec<stmt_vec_info> stmts, unsigned stmt_num,
458 vec<slp_oprnd_info> *oprnds_info)
459 {
460 stmt_vec_info stmt_info = stmts[stmt_num];
461 tree oprnd;
462 unsigned int i, number_of_oprnds;
463 enum vect_def_type dt = vect_uninitialized_def;
464 slp_oprnd_info oprnd_info;
465 int first_op_idx = 1;
466 unsigned int commutative_op = -1U;
467 bool first_op_cond = false;
468 bool first = stmt_num == 0;
469
470 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
471 {
472 number_of_oprnds = gimple_call_num_args (stmt);
473 first_op_idx = 3;
474 if (gimple_call_internal_p (stmt))
475 {
476 internal_fn ifn = gimple_call_internal_fn (stmt);
477 commutative_op = first_commutative_argument (ifn);
478
479 /* Masked load, only look at mask. */
480 if (ifn == IFN_MASK_LOAD)
481 {
482 number_of_oprnds = 1;
483 /* Mask operand index. */
484 first_op_idx = 5;
485 }
486 }
487 }
488 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
489 {
490 enum tree_code code = gimple_assign_rhs_code (stmt);
491 number_of_oprnds = gimple_num_ops (stmt) - 1;
492 /* Swap can only be done for cond_expr if asked to, otherwise we
493 could end up with a different comparison code from the first stmt. */
494 if (code == COND_EXPR
495 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
496 {
497 first_op_cond = true;
498 number_of_oprnds++;
499 }
500 else
501 commutative_op = commutative_tree_code (code) ? 0U : -1U;
502 }
503 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
504 number_of_oprnds = gimple_phi_num_args (stmt);
505 else
506 return -1;
507
508 bool swapped = (swap != 0);
509 bool backedge = false;
510 gcc_assert (!swapped || first_op_cond);
511 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
512 for (i = 0; i < number_of_oprnds; i++)
513 {
514 if (first_op_cond)
515 {
516 /* Map indicating how operands of cond_expr should be swapped. */
517 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
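/* maps[0] is the identity, maps[1] swaps the two comparison
   operands (*SWAP == 1) and maps[2] swaps the two arms of the
   COND_EXPR (*SWAP == 2, paired with inverting the comparison
   code).  */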
518 int *map = maps[swap];
519
520 if (i < 2)
521 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
522 first_op_idx), map[i]);
523 else
524 oprnd = gimple_op (stmt_info->stmt, map[i]);
525 }
526 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
527 {
528 oprnd = gimple_phi_arg_def (stmt, i);
529 backedge = dominated_by_p (CDI_DOMINATORS,
530 gimple_phi_arg_edge (stmt, i)->src,
531 gimple_bb (stmt_info->stmt));
532 }
533 else
534 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
535 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
536 oprnd = TREE_OPERAND (oprnd, 0);
537
538 oprnd_info = (*oprnds_info)[i];
539
540 stmt_vec_info def_stmt_info;
541 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
542 {
543 if (dump_enabled_p ())
544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
545 "Build SLP failed: can't analyze def for %T\n",
546 oprnd);
547
548 return -1;
549 }
550
551 if (skip_args[i])
552 {
553 oprnd_info->def_stmts.quick_push (NULL);
554 oprnd_info->ops.quick_push (NULL_TREE);
555 oprnd_info->first_dt = vect_uninitialized_def;
556 continue;
557 }
558
559 oprnd_info->def_stmts.quick_push (def_stmt_info);
560 oprnd_info->ops.quick_push (oprnd);
561
562 if (def_stmt_info
563 && is_pattern_stmt_p (def_stmt_info))
564 {
565 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
566 != def_stmt_info)
567 oprnd_info->any_pattern = true;
568 else
569 /* If we promote this to external use the original stmt def. */
570 oprnd_info->ops.last ()
571 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
572 }
573
574 /* If there's an extern def on a backedge make sure we can
575 code-generate at the region start.
576 ??? This is another case that could be fixed by adjusting
577 how we split the function but at the moment we'd have conflicting
578 goals there. */
579 if (backedge
580 && dts[i] == vect_external_def
581 && is_a <bb_vec_info> (vinfo)
582 && TREE_CODE (oprnd) == SSA_NAME
583 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
584 && !dominated_by_p (CDI_DOMINATORS,
585 as_a <bb_vec_info> (vinfo)->bbs[0],
586 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
587 {
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
590 "Build SLP failed: extern def %T only defined "
591 "on backedge\n", oprnd);
592 return -1;
593 }
594
595 if (first)
596 {
597 tree type = TREE_TYPE (oprnd);
598 dt = dts[i];
599 if ((dt == vect_constant_def
600 || dt == vect_external_def)
601 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
602 && (TREE_CODE (type) == BOOLEAN_TYPE
603 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
604 type)))
605 {
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
608 "Build SLP failed: invalid type of def "
609 "for variable-length SLP %T\n", oprnd);
610 return -1;
611 }
612
613 /* For the swapping logic below force vect_reduction_def
614 for the reduction op in a SLP reduction group. */
615 if (!STMT_VINFO_DATA_REF (stmt_info)
616 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
617 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
618 && def_stmt_info)
619 dts[i] = dt = vect_reduction_def;
620
621 /* Check the types of the definition. */
622 switch (dt)
623 {
624 case vect_external_def:
625 case vect_constant_def:
626 case vect_internal_def:
627 case vect_reduction_def:
628 case vect_induction_def:
629 case vect_nested_cycle:
630 break;
631
632 default:
633 /* FORNOW: Not supported. */
634 if (dump_enabled_p ())
635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
636 "Build SLP failed: illegal type of def %T\n",
637 oprnd);
638 return -1;
639 }
640
641 oprnd_info->first_dt = dt;
642 oprnd_info->first_op_type = type;
643 }
644 }
645 if (first)
646 return 0;
647
648 /* Now match the operand definition types to that of the first stmt. */
649 for (i = 0; i < number_of_oprnds;)
650 {
651 if (skip_args[i])
652 {
653 ++i;
654 continue;
655 }
656
657 oprnd_info = (*oprnds_info)[i];
658 dt = dts[i];
659 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
660 oprnd = oprnd_info->ops[stmt_num];
661 tree type = TREE_TYPE (oprnd);
662
663 if (!types_compatible_p (oprnd_info->first_op_type, type))
664 {
665 if (dump_enabled_p ())
666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
667 "Build SLP failed: different operand types\n");
668 return 1;
669 }
670
671 /* Not the first stmt of the group; check that the def-stmt/s match
672 the def-stmt/s of the first stmt. Allow different definition
673 types for reduction chains: the first stmt must be a
674 vect_reduction_def (a phi node), and the rest
675 must end in the reduction chain. */
676 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
677 && !(oprnd_info->first_dt == vect_reduction_def
678 && !STMT_VINFO_DATA_REF (stmt_info)
679 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
680 && def_stmt_info
681 && !STMT_VINFO_DATA_REF (def_stmt_info)
682 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
683 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
684 || (!STMT_VINFO_DATA_REF (stmt_info)
685 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
686 && ((!def_stmt_info
687 || STMT_VINFO_DATA_REF (def_stmt_info)
688 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
689 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
690 != (oprnd_info->first_dt != vect_reduction_def))))
691 {
692 /* Try swapping operands if we got a mismatch. For BB
693 vectorization only in case it will clearly improve things. */
694 if (i == commutative_op && !swapped
695 && (!is_a <bb_vec_info> (vinfo)
696 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
697 dts[i+1])
698 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
699 || vect_def_types_match
700 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
701 {
702 if (dump_enabled_p ())
703 dump_printf_loc (MSG_NOTE, vect_location,
704 "trying swapped operands\n");
705 std::swap (dts[i], dts[i+1]);
706 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
707 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
708 std::swap ((*oprnds_info)[i]->ops[stmt_num],
709 (*oprnds_info)[i+1]->ops[stmt_num]);
710 swapped = true;
711 continue;
712 }
713
714 if (is_a <bb_vec_info> (vinfo)
715 && !oprnd_info->any_pattern)
716 {
717 /* Now for commutative ops we should see whether we can
718 make the other operand match. */
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "treating operand as external\n");
722 oprnd_info->first_dt = dt = vect_external_def;
723 }
724 else
725 {
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
728 "Build SLP failed: different types\n");
729 return 1;
730 }
731 }
732
733 /* Make sure to demote the overall operand to external. */
734 if (dt == vect_external_def)
735 oprnd_info->first_dt = vect_external_def;
736 /* For a SLP reduction chain we want to duplicate the reduction to
737 each of the chain members. That gets us a sane SLP graph (still
738 the stmts are not 100% correct wrt the initial values). */
739 else if ((dt == vect_internal_def
740 || dt == vect_reduction_def)
741 && oprnd_info->first_dt == vect_reduction_def
742 && !STMT_VINFO_DATA_REF (stmt_info)
743 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
744 && !STMT_VINFO_DATA_REF (def_stmt_info)
745 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
746 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
747 {
748 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
749 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
750 }
751
752 ++i;
753 }
754
755 /* Swap operands. */
756 if (swapped)
757 {
758 if (dump_enabled_p ())
759 dump_printf_loc (MSG_NOTE, vect_location,
760 "swapped operands to match def types in %G",
761 stmt_info->stmt);
762 }
763
764 return 0;
765 }
766
767 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
768 Return true if we can, meaning that this choice doesn't conflict with
769 existing SLP nodes that use STMT_INFO. */
770
771 bool
772 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
773 {
774 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
775 if (old_vectype)
776 return useless_type_conversion_p (vectype, old_vectype);
777
778 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
779 {
780 /* We maintain the invariant that if any statement in the group is
781 used, all other members of the group have the same vector type. */
782 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
783 stmt_vec_info member_info = first_info;
784 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
785 if (is_pattern_stmt_p (member_info)
786 && !useless_type_conversion_p (vectype,
787 STMT_VINFO_VECTYPE (member_info)))
788 break;
789
790 if (!member_info)
791 {
792 for (member_info = first_info; member_info;
793 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
794 STMT_VINFO_VECTYPE (member_info) = vectype;
795 return true;
796 }
797 }
798 else if (!is_pattern_stmt_p (stmt_info))
799 {
800 STMT_VINFO_VECTYPE (stmt_info) = vectype;
801 return true;
802 }
803
804 if (dump_enabled_p ())
805 {
806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
807 "Build SLP failed: incompatible vector"
808 " types for: %G", stmt_info->stmt);
809 dump_printf_loc (MSG_NOTE, vect_location,
810 " old vector type: %T\n", old_vectype);
811 dump_printf_loc (MSG_NOTE, vect_location,
812 " new vector type: %T\n", vectype);
813 }
814 return false;
815 }
816
817 /* Return true if call statements CALL1 and CALL2 are similar enough
818 to be combined into the same SLP group. */
819
820 static bool
821 compatible_calls_p (gcall *call1, gcall *call2)
822 {
823 unsigned int nargs = gimple_call_num_args (call1);
824 if (nargs != gimple_call_num_args (call2))
825 return false;
826
827 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
828 return false;
829
830 if (gimple_call_internal_p (call1))
831 {
832 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
833 TREE_TYPE (gimple_call_lhs (call2))))
834 return false;
835 for (unsigned int i = 0; i < nargs; ++i)
836 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
837 TREE_TYPE (gimple_call_arg (call2, i))))
838 return false;
839 }
840 else
841 {
842 if (!operand_equal_p (gimple_call_fn (call1),
843 gimple_call_fn (call2), 0))
844 return false;
845
846 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
847 return false;
848 }
849 return true;
850 }
851
852 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
853 caller's attempt to find the vector type in STMT_INFO with the narrowest
854 element type. Return true if VECTYPE is nonnull and if it is valid
855 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
856 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
857 vect_build_slp_tree. */
858
859 static bool
860 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
861 unsigned int group_size,
862 tree vectype, poly_uint64 *max_nunits)
863 {
864 if (!vectype)
865 {
866 if (dump_enabled_p ())
867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
868 "Build SLP failed: unsupported data-type in %G\n",
869 stmt_info->stmt);
870 /* Fatal mismatch. */
871 return false;
872 }
873
874 /* If populating the vector type requires unrolling then fail
875 before adjusting *max_nunits for basic-block vectorization. */
876 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
877 unsigned HOST_WIDE_INT const_nunits;
878 if (is_a <bb_vec_info> (vinfo)
879 && (!nunits.is_constant (&const_nunits)
880 || const_nunits > group_size))
881 {
882 if (dump_enabled_p ())
883 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
884 "Build SLP failed: unrolling required "
885 "in basic block SLP\n");
886 /* Fatal mismatch. */
887 return false;
888 }
889
890 /* In case of multiple types we need to detect the smallest type. */
891 vect_update_max_nunits (max_nunits, vectype);
892 return true;
893 }
894
895 /* Verify that the scalar stmts STMTS are isomorphic, do not require a
896 data permutation and are not of unsupported types of operation.
897 Return true if they are, otherwise return false and indicate in
898 *MATCHES which stmts are not isomorphic to the first one. If MATCHES[0]
899 is false then this indicates the comparison could not be
900 carried out or the stmts will never be vectorized by SLP.
901
902 Note COND_EXPR is possibly isomorphic to another one after swapping its
903 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
904 the first stmt by swapping the two operands of comparison; set SWAP[i]
905 to 2 if stmt I is isomorphic to the first stmt by inverting the code
906 of comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
907 to (B1 <= A1 ? X1 : Y1) or be inverted to (A1 < B1) ? Y1 : X1. */
908
909 static bool
910 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
911 vec<stmt_vec_info> stmts, unsigned int group_size,
912 poly_uint64 *max_nunits, bool *matches,
913 bool *two_operators, tree *node_vectype)
914 {
915 unsigned int i;
916 stmt_vec_info first_stmt_info = stmts[0];
917 enum tree_code first_stmt_code = ERROR_MARK;
918 enum tree_code alt_stmt_code = ERROR_MARK;
919 enum tree_code rhs_code = ERROR_MARK;
920 enum tree_code first_cond_code = ERROR_MARK;
921 tree lhs;
922 bool need_same_oprnds = false;
923 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
924 optab optab;
925 int icode;
926 machine_mode optab_op2_mode;
927 machine_mode vec_mode;
928 stmt_vec_info first_load = NULL, prev_first_load = NULL;
929 bool first_stmt_load_p = false, load_p = false;
930 bool first_stmt_phi_p = false, phi_p = false;
931
932 /* For every stmt in NODE find its def stmt/s. */
933 stmt_vec_info stmt_info;
934 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
935 {
936 gimple *stmt = stmt_info->stmt;
937 swap[i] = 0;
938 matches[i] = false;
939
940 if (dump_enabled_p ())
941 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
942
943 /* Fail to vectorize statements marked as unvectorizable, that can
944 throw internally or that have volatile operands. */
945 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
946 || stmt_can_throw_internal (cfun, stmt)
947 || gimple_has_volatile_ops (stmt))
948 {
949 if (dump_enabled_p ())
950 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
951 "Build SLP failed: unvectorizable statement %G",
952 stmt);
953 /* ??? For BB vectorization we want to commute operands in a way
954 that shuffles all unvectorizable defs into one operand and keeps
955 the other vectorizable. The following doesn't reliably
956 achieve that, but it's the easiest we can do here. */
957 if (is_a <bb_vec_info> (vinfo) && i != 0)
958 continue;
959 /* Fatal mismatch. */
960 matches[0] = false;
961 return false;
962 }
963
964 lhs = gimple_get_lhs (stmt);
965 if (lhs == NULL_TREE)
966 {
967 if (dump_enabled_p ())
968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
969 "Build SLP failed: not GIMPLE_ASSIGN nor "
970 "GIMPLE_CALL %G", stmt);
971 if (is_a <bb_vec_info> (vinfo) && i != 0)
972 continue;
973 /* Fatal mismatch. */
974 matches[0] = false;
975 return false;
976 }
977
978 tree nunits_vectype;
979 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
980 &nunits_vectype, group_size)
981 || (nunits_vectype
982 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
983 nunits_vectype, max_nunits)))
984 {
985 if (is_a <bb_vec_info> (vinfo) && i != 0)
986 continue;
987 /* Fatal mismatch. */
988 matches[0] = false;
989 return false;
990 }
991
992 gcc_assert (vectype);
993
994 gcall *call_stmt = dyn_cast <gcall *> (stmt);
995 if (call_stmt)
996 {
997 rhs_code = CALL_EXPR;
998
999 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
1000 load_p = true;
1001 else if ((gimple_call_internal_p (call_stmt)
1002 && (!vectorizable_internal_fn_p
1003 (gimple_call_internal_fn (call_stmt))))
1004 || gimple_call_tail_p (call_stmt)
1005 || gimple_call_noreturn_p (call_stmt)
1006 || !gimple_call_nothrow_p (call_stmt)
1007 || gimple_call_chain (call_stmt))
1008 {
1009 if (dump_enabled_p ())
1010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1011 "Build SLP failed: unsupported call type %G",
1012 call_stmt);
1013 if (is_a <bb_vec_info> (vinfo) && i != 0)
1014 continue;
1015 /* Fatal mismatch. */
1016 matches[0] = false;
1017 return false;
1018 }
1019 }
1020 else if (gimple_code (stmt) == GIMPLE_PHI)
1021 {
1022 rhs_code = ERROR_MARK;
1023 phi_p = true;
1024 }
1025 else
1026 {
1027 rhs_code = gimple_assign_rhs_code (stmt);
1028 load_p = gimple_vuse (stmt);
1029 }
1030
1031 /* Check the operation. */
1032 if (i == 0)
1033 {
1034 *node_vectype = vectype;
1035 first_stmt_code = rhs_code;
1036 first_stmt_load_p = load_p;
1037 first_stmt_phi_p = phi_p;
1038
1039 /* Shift arguments should be equal in all the packed stmts for a
1040 vector shift with scalar shift operand. */
1041 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1042 || rhs_code == LROTATE_EXPR
1043 || rhs_code == RROTATE_EXPR)
1044 {
1045 vec_mode = TYPE_MODE (vectype);
1046
1047 /* First see if we have a vector/vector shift. */
1048 optab = optab_for_tree_code (rhs_code, vectype,
1049 optab_vector);
1050
1051 if (!optab
1052 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1053 {
1054 /* No vector/vector shift, try for a vector/scalar shift. */
1055 optab = optab_for_tree_code (rhs_code, vectype,
1056 optab_scalar);
1057
1058 if (!optab)
1059 {
1060 if (dump_enabled_p ())
1061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1062 "Build SLP failed: no optab.\n");
1063 if (is_a <bb_vec_info> (vinfo) && i != 0)
1064 continue;
1065 /* Fatal mismatch. */
1066 matches[0] = false;
1067 return false;
1068 }
1069 icode = (int) optab_handler (optab, vec_mode);
1070 if (icode == CODE_FOR_nothing)
1071 {
1072 if (dump_enabled_p ())
1073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1074 "Build SLP failed: "
1075 "op not supported by target.\n");
1076 if (is_a <bb_vec_info> (vinfo) && i != 0)
1077 continue;
1078 /* Fatal mismatch. */
1079 matches[0] = false;
1080 return false;
1081 }
1082 optab_op2_mode = insn_data[icode].operand[2].mode;
1083 if (!VECTOR_MODE_P (optab_op2_mode))
1084 {
1085 need_same_oprnds = true;
1086 first_op1 = gimple_assign_rhs2 (stmt);
1087 }
1088 }
1089 }
1090 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1091 {
1092 need_same_oprnds = true;
1093 first_op1 = gimple_assign_rhs2 (stmt);
1094 }
1095 else if (!load_p
1096 && rhs_code == BIT_FIELD_REF)
1097 {
1098 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1099 if (!is_a <bb_vec_info> (vinfo)
1100 || TREE_CODE (vec) != SSA_NAME
1101 || !types_compatible_p (vectype, TREE_TYPE (vec)))
1102 {
1103 if (dump_enabled_p ())
1104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1105 "Build SLP failed: "
1106 "BIT_FIELD_REF not supported\n");
1107 /* Fatal mismatch. */
1108 matches[0] = false;
1109 return false;
1110 }
1111 }
1112 else if (call_stmt
1113 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1114 {
1115 need_same_oprnds = true;
1116 first_op1 = gimple_call_arg (call_stmt, 1);
1117 }
1118 }
1119 else
1120 {
1121 if (first_stmt_code != rhs_code
1122 && alt_stmt_code == ERROR_MARK)
1123 alt_stmt_code = rhs_code;
1124 if ((first_stmt_code != rhs_code
1125 && (first_stmt_code != IMAGPART_EXPR
1126 || rhs_code != REALPART_EXPR)
1127 && (first_stmt_code != REALPART_EXPR
1128 || rhs_code != IMAGPART_EXPR)
1129 /* Handle mismatches in plus/minus by computing both
1130 and merging the results. */
1131 && !((first_stmt_code == PLUS_EXPR
1132 || first_stmt_code == MINUS_EXPR)
1133 && (alt_stmt_code == PLUS_EXPR
1134 || alt_stmt_code == MINUS_EXPR)
1135 && rhs_code == alt_stmt_code)
1136 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1137 && (first_stmt_code == ARRAY_REF
1138 || first_stmt_code == BIT_FIELD_REF
1139 || first_stmt_code == INDIRECT_REF
1140 || first_stmt_code == COMPONENT_REF
1141 || first_stmt_code == MEM_REF)))
1142 || first_stmt_load_p != load_p
1143 || first_stmt_phi_p != phi_p)
1144 {
1145 if (dump_enabled_p ())
1146 {
1147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1148 "Build SLP failed: different operation "
1149 "in stmt %G", stmt);
1150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1151 "original stmt %G", first_stmt_info->stmt);
1152 }
1153 /* Mismatch. */
1154 continue;
1155 }
1156
1157 if (need_same_oprnds)
1158 {
1159 tree other_op1 = (call_stmt
1160 ? gimple_call_arg (call_stmt, 1)
1161 : gimple_assign_rhs2 (stmt));
1162 if (!operand_equal_p (first_op1, other_op1, 0))
1163 {
1164 if (dump_enabled_p ())
1165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1166 "Build SLP failed: different shift "
1167 "arguments in %G", stmt);
1168 /* Mismatch. */
1169 continue;
1170 }
1171 }
1172 if (!load_p
1173 && first_stmt_code == BIT_FIELD_REF
1174 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1175 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1176 {
1177 if (dump_enabled_p ())
1178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1179 "Build SLP failed: different BIT_FIELD_REF "
1180 "arguments in %G", stmt);
1181 /* Mismatch. */
1182 continue;
1183 }
1184
1185 if (!load_p && rhs_code == CALL_EXPR)
1186 {
1187 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1188 as_a <gcall *> (stmt)))
1189 {
1190 if (dump_enabled_p ())
1191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1192 "Build SLP failed: different calls in %G",
1193 stmt);
1194 /* Mismatch. */
1195 continue;
1196 }
1197 }
1198
1199 if (phi_p
1200 && (gimple_bb (first_stmt_info->stmt)
1201 != gimple_bb (stmt_info->stmt)))
1202 {
1203 if (dump_enabled_p ())
1204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1205 "Build SLP failed: different BB for PHI "
1206 "in %G", stmt);
1207 /* Mismatch. */
1208 continue;
1209 }
1210
1211 if (!types_compatible_p (vectype, *node_vectype))
1212 {
1213 if (dump_enabled_p ())
1214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1215 "Build SLP failed: different vector type "
1216 "in %G", stmt);
1217 /* Mismatch. */
1218 continue;
1219 }
1220 }
1221
1222 /* Grouped store or load. */
1223 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1224 {
1225 if (REFERENCE_CLASS_P (lhs))
1226 {
1227 /* Store. */
1228 ;
1229 }
1230 else
1231 {
1232 /* Load. */
1233 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1234 if (prev_first_load)
1235 {
1236 /* Check that there are no loads from different interleaving
1237 chains in the same node. */
1238 if (prev_first_load != first_load)
1239 {
1240 if (dump_enabled_p ())
1241 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1242 vect_location,
1243 "Build SLP failed: different "
1244 "interleaving chains in one node %G",
1245 stmt);
1246 /* Mismatch. */
1247 continue;
1248 }
1249 }
1250 else
1251 prev_first_load = first_load;
1252 }
1253 } /* Grouped access. */
1254 else
1255 {
1256 if (load_p)
1257 {
1258 /* Not grouped load. */
1259 if (dump_enabled_p ())
1260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1261 "Build SLP failed: not grouped load %G", stmt);
1262
1263 /* FORNOW: Not grouped loads are not supported. */
1264 if (is_a <bb_vec_info> (vinfo) && i != 0)
1265 continue;
1266 /* Fatal mismatch. */
1267 matches[0] = false;
1268 return false;
1269 }
1270
1271 /* Not memory operation. */
1272 if (!phi_p
1273 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1274 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1275 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1276 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1277 && rhs_code != VIEW_CONVERT_EXPR
1278 && rhs_code != CALL_EXPR
1279 && rhs_code != BIT_FIELD_REF)
1280 {
1281 if (dump_enabled_p ())
1282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1283 "Build SLP failed: operation unsupported %G",
1284 stmt);
1285 if (is_a <bb_vec_info> (vinfo) && i != 0)
1286 continue;
1287 /* Fatal mismatch. */
1288 matches[0] = false;
1289 return false;
1290 }
1291
1292 if (rhs_code == COND_EXPR)
1293 {
1294 tree cond_expr = gimple_assign_rhs1 (stmt);
1295 enum tree_code cond_code = TREE_CODE (cond_expr);
1296 enum tree_code swap_code = ERROR_MARK;
1297 enum tree_code invert_code = ERROR_MARK;
1298
1299 if (i == 0)
1300 first_cond_code = TREE_CODE (cond_expr);
1301 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1302 {
1303 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1304 swap_code = swap_tree_comparison (cond_code);
1305 invert_code = invert_tree_comparison (cond_code, honor_nans);
1306 }
1307
1308 if (first_cond_code == cond_code)
1309 ;
1310 /* Isomorphic can be achieved by swapping. */
1311 else if (first_cond_code == swap_code)
1312 swap[i] = 1;
1313 /* Isomorphic can be achieved by inverting. */
1314 else if (first_cond_code == invert_code)
1315 swap[i] = 2;
1316 else
1317 {
1318 if (dump_enabled_p ())
1319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1320 "Build SLP failed: different"
1321 " operation %G", stmt);
1322 /* Mismatch. */
1323 continue;
1324 }
1325 }
1326 }
1327
1328 matches[i] = true;
1329 }
1330
1331 for (i = 0; i < group_size; ++i)
1332 if (!matches[i])
1333 return false;
1334
1335 /* If we allowed a two-operation SLP node verify the target can cope
1336 with the permute we are going to use. */
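/* For example a group mixing PLUS_EXPR and MINUS_EXPR lanes is
   handled by emitting both operations on all lanes and blending
   the two results with a lane permutation (see the two_operators
   handling in vect_build_slp_tree_2).  */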
1337 if (alt_stmt_code != ERROR_MARK
1338 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1339 {
1340 *two_operators = true;
1341 }
1342
1343 return true;
1344 }
1345
1346 /* Traits for the hash_map used to record SLP builds (successful or failed)
1347 for a stmt set. Note we never remove apart from at destruction time so
1348 we do not need a special value for deleted that differs from empty. */
1349 struct bst_traits
1350 {
1351 typedef vec <stmt_vec_info> value_type;
1352 typedef vec <stmt_vec_info> compare_type;
1353 static inline hashval_t hash (value_type);
1354 static inline bool equal (value_type existing, value_type candidate);
1355 static inline bool is_empty (value_type x) { return !x.exists (); }
1356 static inline bool is_deleted (value_type x) { return !x.exists (); }
1357 static const bool empty_zero_p = true;
1358 static inline void mark_empty (value_type &x) { x.release (); }
1359 static inline void mark_deleted (value_type &x) { x.release (); }
1360 static inline void remove (value_type &x) { x.release (); }
1361 };
1362 inline hashval_t
1363 bst_traits::hash (value_type x)
1364 {
1365 inchash::hash h;
1366 for (unsigned i = 0; i < x.length (); ++i)
1367 h.add_int (gimple_uid (x[i]->stmt));
1368 return h.end ();
1369 }
1370 inline bool
1371 bst_traits::equal (value_type existing, value_type candidate)
1372 {
1373 if (existing.length () != candidate.length ())
1374 return false;
1375 for (unsigned i = 0; i < existing.length (); ++i)
1376 if (existing[i] != candidate[i])
1377 return false;
1378 return true;
1379 }
1380
1381 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1382 simple_hashmap_traits <bst_traits, slp_tree> >
1383 scalar_stmts_to_slp_tree_map_t;
1384
1385 static slp_tree
1386 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1387 vec<stmt_vec_info> stmts, unsigned int group_size,
1388 poly_uint64 *max_nunits,
1389 bool *matches, unsigned *limit, unsigned *tree_size,
1390 scalar_stmts_to_slp_tree_map_t *bst_map);
1391
1392 static slp_tree
1393 vect_build_slp_tree (vec_info *vinfo,
1394 vec<stmt_vec_info> stmts, unsigned int group_size,
1395 poly_uint64 *max_nunits,
1396 bool *matches, unsigned *limit, unsigned *tree_size,
1397 scalar_stmts_to_slp_tree_map_t *bst_map)
1398 {
1399 if (slp_tree *leader = bst_map->get (stmts))
1400 {
1401 if (dump_enabled_p ())
1402 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1403 *leader ? "" : "failed ", *leader);
1404 if (*leader)
1405 {
1406 SLP_TREE_REF_COUNT (*leader)++;
1407 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1408 stmts.release ();
1409 }
1410 return *leader;
1411 }
1412
1413 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1414 so we can pick up backedge destinations during discovery. */
1415 slp_tree res = new _slp_tree;
1416 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1417 SLP_TREE_SCALAR_STMTS (res) = stmts;
1418 bst_map->put (stmts.copy (), res);
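/* Note the map key is a copy of STMTS; RES keeps referring to the
   original vector.  */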
1419
1420 if (*limit == 0)
1421 {
1422 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_NOTE, vect_location,
1424 "SLP discovery limit exceeded\n");
1425 bool existed_p = bst_map->put (stmts, NULL);
1426 gcc_assert (existed_p);
1427 /* Mark the node invalid so we can detect those when still in use
1428 as backedge destinations. */
1429 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1430 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1431 vect_free_slp_tree (res);
1432 memset (matches, 0, sizeof (bool) * group_size);
1433 return NULL;
1434 }
1435 --*limit;
1436
1437 poly_uint64 this_max_nunits = 1;
1438 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1439 &this_max_nunits,
1440 matches, limit, tree_size, bst_map);
1441 if (!res_)
1442 {
1443 bool existed_p = bst_map->put (stmts, NULL);
1444 gcc_assert (existed_p);
1445 /* Mark the node invalid so we can detect those when still in use
1446 as backedge destinations. */
1447 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1448 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1449 vect_free_slp_tree (res);
1450 }
1451 else
1452 {
1453 gcc_assert (res_ == res);
1454 res->max_nunits = this_max_nunits;
1455 vect_update_max_nunits (max_nunits, this_max_nunits);
1456 /* Keep a reference for the bst_map use. */
1457 SLP_TREE_REF_COUNT (res)++;
1458 }
1459 return res_;
1460 }
1461
1462 /* Recursively build an SLP tree starting from NODE.
1463 Fail (and return NULL) if the def-stmts are not
1464 isomorphic, require data permutation or are of unsupported types of
1465 operation.
1466 Otherwise return NODE, filled in as the root of the built
1467 SLP subtree. */
1468
1469 static slp_tree
1470 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1471 vec<stmt_vec_info> stmts, unsigned int group_size,
1472 poly_uint64 *max_nunits,
1473 bool *matches, unsigned *limit, unsigned *tree_size,
1474 scalar_stmts_to_slp_tree_map_t *bst_map)
1475 {
1476 unsigned nops, i, this_tree_size = 0;
1477 poly_uint64 this_max_nunits = *max_nunits;
1478
1479 matches[0] = false;
1480
1481 stmt_vec_info stmt_info = stmts[0];
1482 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1483 nops = gimple_call_num_args (stmt);
1484 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1485 {
1486 nops = gimple_num_ops (stmt) - 1;
1487 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1488 nops++;
1489 }
1490 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1491 nops = gimple_phi_num_args (phi);
1492 else
1493 return NULL;
1494
1495 /* If the SLP node is a PHI (induction or reduction), terminate
1496 the recursion. */
1497 bool *skip_args = XALLOCAVEC (bool, nops);
1498 memset (skip_args, 0, sizeof (bool) * nops);
1499 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1500 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1501 {
1502 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1503 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1504 group_size);
1505 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1506 max_nunits))
1507 return NULL;
1508
1509 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1510 if (def_type == vect_induction_def)
1511 {
1512 /* Induction PHIs are not cycles but walk the initial
1513 value. Only for inner loops though, for outer loops
1514 we need to pick up the value from the actual PHIs
1515 to more easily support peeling and epilogue vectorization. */
1516 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1517 if (!nested_in_vect_loop_p (loop, stmt_info))
1518 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1519 else
1520 loop = loop->inner;
1521 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1522 }
1523 else if (def_type == vect_reduction_def
1524 || def_type == vect_double_reduction_def
1525 || def_type == vect_nested_cycle)
1526 {
1527 /* Else def types have to match. */
1528 stmt_vec_info other_info;
1529 bool all_same = true;
1530 FOR_EACH_VEC_ELT (stmts, i, other_info)
1531 {
1532 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1533 return NULL;
1534 if (other_info != stmt_info)
1535 all_same = false;
1536 }
1537 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1538 /* Reduction initial values are not explicitly represented. */
1539 if (!nested_in_vect_loop_p (loop, stmt_info))
1540 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1541 /* Reduction chain backedge defs are filled manually.
1542 ??? Need a better way to identify a SLP reduction chain PHI.
1543 Or a better overall way to SLP match those. */
1544 if (all_same && def_type == vect_reduction_def)
1545 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1546 }
1547 else if (def_type != vect_internal_def)
1548 return NULL;
1549 }
1550
1551
1552 bool two_operators = false;
1553 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1554 tree vectype = NULL_TREE;
1555 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1556 &this_max_nunits, matches, &two_operators,
1557 &vectype))
1558 return NULL;
1559
1560 /* If the SLP node is a load, terminate the recursion unless masked. */
1561 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1562 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1563 {
1564 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1565 {
1566 /* Masked load. */
1567 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1568 nops = 1;
1569 }
1570 else
1571 {
1572 *max_nunits = this_max_nunits;
1573 (*tree_size)++;
1574 node = vect_create_new_slp_node (node, stmts, 0);
1575 SLP_TREE_VECTYPE (node) = vectype;
1576 /* And compute the load permutation. Whether it is actually
1577 a permutation depends on the unrolling factor which is
1578 decided later. */
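/* For example for a group load of { a[1], a[0], a[3], a[2] } the
   recorded load permutation is { 1, 0, 3, 2 }, each entry being the
   lane's position in the interleaving chain.  */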
1579 vec<unsigned> load_permutation;
1580 int j;
1581 stmt_vec_info load_info;
1582 load_permutation.create (group_size);
1583 stmt_vec_info first_stmt_info
1584 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1585 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1586 {
1587 int load_place = vect_get_place_in_interleaving_chain
1588 (load_info, first_stmt_info);
1589 gcc_assert (load_place != -1);
1590 load_permutation.safe_push (load_place);
1591 }
1592 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1593 return node;
1594 }
1595 }
1596 else if (gimple_assign_single_p (stmt_info->stmt)
1597 && !gimple_vuse (stmt_info->stmt)
1598 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1599 {
1600 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1601 the same SSA name vector of a type compatible with vectype. */
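/* Record one lane-permutation entry (0, lane) per extract, e.g.
   { v[2], v[3] } yields { (0, 2), (0, 3) }, with operand index 0
   referring to the single vector child VNODE created below.  */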
1602 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1603 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1604 stmt_vec_info estmt_info;
1605 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1606 {
1607 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1608 tree bfref = gimple_assign_rhs1 (estmt);
1609 HOST_WIDE_INT lane;
1610 if (!known_eq (bit_field_size (bfref),
1611 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1612 || !constant_multiple_p (bit_field_offset (bfref),
1613 bit_field_size (bfref), &lane))
1614 {
1615 lperm.release ();
1616 return NULL;
1617 }
1618 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1619 }
1620 slp_tree vnode = vect_create_new_slp_node (vNULL);
1621 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1622 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1623 /* We are always building a permutation node even if it is an identity
1624 permute to shield the rest of the vectorizer from the odd node
1625 representing an actual vector without any scalar ops.
1626 ??? We could hide it completely by making the permute node
1627 external? */
1628 node = vect_create_new_slp_node (node, stmts, 1);
1629 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1630 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1631 SLP_TREE_VECTYPE (node) = vectype;
1632 SLP_TREE_CHILDREN (node).quick_push (vnode);
1633 return node;
1634 }
1635
1636 /* Get at the operands, verifying they are compatible. */
1637 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1638 slp_oprnd_info oprnd_info;
1639 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1640 {
1641 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
1642 stmts, i, &oprnds_info);
1643 if (res != 0)
1644 matches[(res == -1) ? 0 : i] = false;
1645 if (!matches[0])
1646 break;
1647 }
1648 for (i = 0; i < group_size; ++i)
1649 if (!matches[i])
1650 {
1651 vect_free_oprnd_info (oprnds_info);
1652 return NULL;
1653 }
1654 swap = NULL;
1655
1656 auto_vec<slp_tree, 4> children;
1657
1658 stmt_info = stmts[0];
1659
1660 /* Create SLP_TREE nodes for the definition node/s. */
1661 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1662 {
1663 slp_tree child;
1664 unsigned int j;
1665
1666 /* We're skipping certain operands from processing, for example
1667 outer loop reduction initial defs. */
1668 if (skip_args[i])
1669 {
1670 children.safe_push (NULL);
1671 continue;
1672 }
1673
1674 if (oprnd_info->first_dt == vect_uninitialized_def)
1675 {
1676 /* COND_EXPRs end up with one operand slot too many here if the
1677 condition is an SSA name. */
1678 gcc_assert (i == 3 && nops == 4);
1679 continue;
1680 }
1681
1682 if (is_a <bb_vec_info> (vinfo)
1683 && oprnd_info->first_dt == vect_internal_def
1684 && !oprnd_info->any_pattern)
1685 {
1686 /* For BB vectorization, if all defs are the same do not
1687 bother to continue the build along the single-lane
1688 graph but use a splat of the scalar value. */
1689 stmt_vec_info first_def = oprnd_info->def_stmts[0];
1690 for (j = 1; j < group_size; ++j)
1691 if (oprnd_info->def_stmts[j] != first_def)
1692 break;
1693 if (j == group_size
1694 /* But avoid doing this for loads where we may be
1695 able to CSE things, unless the stmt is not
1696 vectorizable. */
1697 && (!STMT_VINFO_VECTORIZABLE (first_def)
1698 || !gimple_vuse (first_def->stmt)))
1699 {
1700 if (dump_enabled_p ())
1701 dump_printf_loc (MSG_NOTE, vect_location,
1702 "Using a splat of the uniform operand\n");
1703 oprnd_info->first_dt = vect_external_def;
1704 }
1705 }
1706
1707 if (oprnd_info->first_dt == vect_external_def
1708 || oprnd_info->first_dt == vect_constant_def)
1709 {
1710 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1711 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1712 oprnd_info->ops = vNULL;
1713 children.safe_push (invnode);
1714 continue;
1715 }
1716
1717 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1718 group_size, &this_max_nunits,
1719 matches, limit,
1720 &this_tree_size, bst_map)) != NULL)
1721 {
1722 oprnd_info->def_stmts = vNULL;
1723 children.safe_push (child);
1724 continue;
1725 }
1726
1727 /* If the SLP build for operand zero failed and operands zero
1728 and one can be commuted, try that for the scalar stmts
1729 that failed the match. */
1730 if (i == 0
1731 /* A first scalar stmt mismatch signals a fatal mismatch. */
1732 && matches[0]
1733 /* ??? For COND_EXPRs we can swap the comparison operands
1734 as well as the arms under some constraints. */
1735 && nops == 2
1736 && oprnds_info[1]->first_dt == vect_internal_def
1737 && is_gimple_assign (stmt_info->stmt)
1738 /* Swapping operands for reductions breaks assumptions later on. */
1739 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1740 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1741 {
1742 /* See whether we can swap the matching or the non-matching
1743 stmt operands. */
1744 bool swap_not_matching = true;
1745 do
1746 {
1747 for (j = 0; j < group_size; ++j)
1748 {
1749 if (matches[j] != !swap_not_matching)
1750 continue;
1751 stmt_vec_info stmt_info = stmts[j];
1752 /* Verify if we can swap operands of this stmt. */
1753 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1754 if (!stmt
1755 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1756 {
1757 if (!swap_not_matching)
1758 goto fail;
1759 swap_not_matching = false;
1760 break;
1761 }
1762 }
1763 }
1764 while (j != group_size);
1765
1766 /* Swap mismatched definition stmts. */
1767 if (dump_enabled_p ())
1768 dump_printf_loc (MSG_NOTE, vect_location,
1769 "Re-trying with swapped operands of stmts ");
1770 for (j = 0; j < group_size; ++j)
1771 if (matches[j] == !swap_not_matching)
1772 {
1773 std::swap (oprnds_info[0]->def_stmts[j],
1774 oprnds_info[1]->def_stmts[j]);
1775 std::swap (oprnds_info[0]->ops[j],
1776 oprnds_info[1]->ops[j]);
1777 if (dump_enabled_p ())
1778 dump_printf (MSG_NOTE, "%d ", j);
1779 }
1780 if (dump_enabled_p ())
1781 dump_printf (MSG_NOTE, "\n");
1782 /* And try again with scratch 'matches' ... */
1783 bool *tem = XALLOCAVEC (bool, group_size);
1784 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1785 group_size, &this_max_nunits,
1786 tem, limit,
1787 &this_tree_size, bst_map)) != NULL)
1788 {
1789 oprnd_info->def_stmts = vNULL;
1790 children.safe_push (child);
1791 continue;
1792 }
1793 }
1794 fail:
1795
1796 /* If the SLP build failed and we analyze a basic-block
1797 simply treat nodes we fail to build as externally defined
1798 (and thus build vectors from the scalar defs).
1799 The cost model will reject outright expensive cases.
1800 ??? This doesn't treat cases where permutation ultimately
1801 fails (or we don't try permutation below). Ideally we'd
1802 even compute a permutation that will end up with the maximum
1803 SLP tree size... */
1804 if (is_a <bb_vec_info> (vinfo)
1805 /* ??? Rejecting patterns this way doesn't work. We'd have to
1806 do extra work to cancel the pattern so the uses see the
1807 scalar version. */
1808 && !is_pattern_stmt_p (stmt_info)
1809 && !oprnd_info->any_pattern)
1810 {
1811 /* But if there's a leading vector-sized set of matching stmts,
1812 fail here so we can split the group. This matches the condition
1813 vect_analyze_slp_instance uses. */
1814 /* ??? We might want to split here and combine the results to support
1815 multiple vector sizes better. */
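/* For example, with a V4SI vectype and matches == { true, true, true, true,
   false, ... } the first mismatch sits on a vector boundary, so we fail
   here and let the caller split off the first four lanes. */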
1816 for (j = 0; j < group_size; ++j)
1817 if (!matches[j])
1818 break;
1819 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
1820 {
1821 if (dump_enabled_p ())
1822 dump_printf_loc (MSG_NOTE, vect_location,
1823 "Building vector operands from scalars\n");
1824 this_tree_size++;
1825 child = vect_create_new_slp_node (oprnd_info->ops);
1826 children.safe_push (child);
1827 oprnd_info->ops = vNULL;
1828 continue;
1829 }
1830 }
1831
1832 gcc_assert (child == NULL);
1833 FOR_EACH_VEC_ELT (children, j, child)
1834 if (child)
1835 vect_free_slp_tree (child);
1836 vect_free_oprnd_info (oprnds_info);
1837 return NULL;
1838 }
1839
1840 vect_free_oprnd_info (oprnds_info);
1841
1842 /* If all children of this node are built up from uniform scalars,
1843 or building them requires more than one possibly expensive vector
1844 construction, just throw the node away, causing it to be built up
1845 from scalars. The exception is the SLP node for the vector store. */
1846 if (is_a <bb_vec_info> (vinfo)
1847 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1848 /* ??? Rejecting patterns this way doesn't work. We'd have to
1849 do extra work to cancel the pattern so the uses see the
1850 scalar version. */
1851 && !is_pattern_stmt_p (stmt_info))
1852 {
1853 slp_tree child;
1854 unsigned j;
1855 bool all_uniform_p = true;
1856 unsigned n_vector_builds = 0;
1857 FOR_EACH_VEC_ELT (children, j, child)
1858 {
1859 if (!child)
1860 ;
1861 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1862 all_uniform_p = false;
1863 else if (!vect_slp_tree_uniform_p (child))
1864 {
1865 all_uniform_p = false;
1866 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
1867 n_vector_builds++;
1868 }
1869 }
1870 if (all_uniform_p || n_vector_builds > 1)
1871 {
1872 /* Roll back. */
1873 matches[0] = false;
1874 FOR_EACH_VEC_ELT (children, j, child)
1875 if (child)
1876 vect_free_slp_tree (child);
1877
1878 if (dump_enabled_p ())
1879 dump_printf_loc (MSG_NOTE, vect_location,
1880 "Building parent vector operands from "
1881 "scalars instead\n");
1882 return NULL;
1883 }
1884 }
1885
1886 *tree_size += this_tree_size + 1;
1887 *max_nunits = this_max_nunits;
1888
1889 if (two_operators)
1890 {
1891 /* ??? We'd likely want to either cache in bst_map sth like
1892 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1893 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1894 explicit stmts to put in so the keying on 'stmts' doesn't
1895 work (but we have the same issue with nodes that use 'ops'). */
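/* For example, for the group { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } node
   'one' computes all the additions and node 'two' all the subtractions;
   the VEC_PERM node built below selects lane i from 'one' or 'two'
   depending on the code of the original stmt in that lane. */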
1896 slp_tree one = new _slp_tree;
1897 slp_tree two = new _slp_tree;
1898 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1899 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1900 SLP_TREE_VECTYPE (one) = vectype;
1901 SLP_TREE_VECTYPE (two) = vectype;
1902 SLP_TREE_CHILDREN (one).safe_splice (children);
1903 SLP_TREE_CHILDREN (two).safe_splice (children);
1904 slp_tree child;
1905 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1906 SLP_TREE_REF_COUNT (child)++;
1907
1908 /* Here we record the original defs since this
1909 node represents the final lane configuration. */
1910 node = vect_create_new_slp_node (node, stmts, 2);
1911 SLP_TREE_VECTYPE (node) = vectype;
1912 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1913 SLP_TREE_CHILDREN (node).quick_push (one);
1914 SLP_TREE_CHILDREN (node).quick_push (two);
1915 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1916 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1917 enum tree_code ocode = ERROR_MARK;
1918 stmt_vec_info ostmt_info;
1919 unsigned j = 0;
1920 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1921 {
1922 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1923 if (gimple_assign_rhs_code (ostmt) != code0)
1924 {
1925 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1926 ocode = gimple_assign_rhs_code (ostmt);
1927 j = i;
1928 }
1929 else
1930 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1931 }
1932 SLP_TREE_CODE (one) = code0;
1933 SLP_TREE_CODE (two) = ocode;
1934 SLP_TREE_LANES (one) = stmts.length ();
1935 SLP_TREE_LANES (two) = stmts.length ();
1936 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1937 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1938 return node;
1939 }
1940
1941 node = vect_create_new_slp_node (node, stmts, nops);
1942 SLP_TREE_VECTYPE (node) = vectype;
1943 SLP_TREE_CHILDREN (node).splice (children);
1944 return node;
1945 }
1946
1947 /* Dump a single SLP tree NODE. */
1948
1949 static void
1950 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1951 slp_tree node)
1952 {
1953 unsigned i, j;
1954 slp_tree child;
1955 stmt_vec_info stmt_info;
1956 tree op;
1957
1958 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1959 dump_user_location_t user_loc = loc.get_user_location ();
1960 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1961 SLP_TREE_DEF_TYPE (node) == vect_external_def
1962 ? " (external)"
1963 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1964 ? " (constant)"
1965 : ""), node,
1966 estimated_poly_value (node->max_nunits),
1967 SLP_TREE_REF_COUNT (node));
1968 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
1969 {
1970 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
1971 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
1972 else
1973 dump_printf_loc (metadata, user_loc, "op template: %G",
1974 SLP_TREE_REPRESENTATIVE (node)->stmt);
1975 }
1976 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1977 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1978 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
1979 else
1980 {
1981 dump_printf_loc (metadata, user_loc, "\t{ ");
1982 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1983 dump_printf (metadata, "%T%s ", op,
1984 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
1985 dump_printf (metadata, "}\n");
1986 }
1987 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1988 {
1989 dump_printf_loc (metadata, user_loc, "\tload permutation {");
1990 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
1991 dump_printf (dump_kind, " %u", j);
1992 dump_printf (dump_kind, " }\n");
1993 }
1994 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1995 {
1996 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
1997 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
1998 dump_printf (dump_kind, " %u[%u]",
1999 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2000 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2001 dump_printf (dump_kind, " }\n");
2002 }
2003 if (SLP_TREE_CHILDREN (node).is_empty ())
2004 return;
2005 dump_printf_loc (metadata, user_loc, "\tchildren");
2006 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2007 dump_printf (dump_kind, " %p", (void *)child);
2008 dump_printf (dump_kind, "\n");
2009 }
2010
2011 DEBUG_FUNCTION void
2012 debug (slp_tree node)
2013 {
2014 debug_dump_context ctx;
2015 vect_print_slp_tree (MSG_NOTE,
2016 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2017 node);
2018 }
2019
2020 /* Dump an SLP tree NODE using flags specified in DUMP_KIND. */
2021
2022 static void
2023 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2024 slp_tree node, hash_set<slp_tree> &visited)
2025 {
2026 unsigned i;
2027 slp_tree child;
2028
2029 if (visited.add (node))
2030 return;
2031
2032 vect_print_slp_tree (dump_kind, loc, node);
2033
2034 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2035 if (child)
2036 vect_print_slp_graph (dump_kind, loc, child, visited);
2037 }
2038
2039 static void
2040 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2041 slp_tree entry)
2042 {
2043 hash_set<slp_tree> visited;
2044 vect_print_slp_graph (dump_kind, loc, entry, visited);
2045 }
2046
2047 /* Mark the tree rooted at NODE with PURE_SLP. */
2048
2049 static void
2050 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2051 {
2052 int i;
2053 stmt_vec_info stmt_info;
2054 slp_tree child;
2055
2056 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2057 return;
2058
2059 if (visited.add (node))
2060 return;
2061
2062 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2063 STMT_SLP_TYPE (stmt_info) = pure_slp;
2064
2065 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2066 if (child)
2067 vect_mark_slp_stmts (child, visited);
2068 }
2069
2070 static void
2071 vect_mark_slp_stmts (slp_tree node)
2072 {
2073 hash_set<slp_tree> visited;
2074 vect_mark_slp_stmts (node, visited);
2075 }
2076
2077 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2078
2079 static void
2080 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2081 {
2082 int i;
2083 stmt_vec_info stmt_info;
2084 slp_tree child;
2085
2086 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2087 return;
2088
2089 if (visited.add (node))
2090 return;
2091
2092 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2093 {
2094 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2095 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2096 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2097 }
2098
2099 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2100 if (child)
2101 vect_mark_slp_stmts_relevant (child, visited);
2102 }
2103
2104 static void
2105 vect_mark_slp_stmts_relevant (slp_tree node)
2106 {
2107 hash_set<slp_tree> visited;
2108 vect_mark_slp_stmts_relevant (node, visited);
2109 }
2110
2111
2112 /* Gather loads in the SLP graph rooted at NODE and populate the LOADS array. */
2113
2114 static void
2115 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2116 hash_set<slp_tree> &visited)
2117 {
2118 if (!node || visited.add (node))
2119 return;
2120
2121 if (SLP_TREE_CHILDREN (node).length () == 0)
2122 {
2123 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2124 return;
2125 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2126 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2127 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2128 loads.safe_push (node);
2129 }
2130 else
2131 {
2132 unsigned i;
2133 slp_tree child;
2134 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2135 vect_gather_slp_loads (loads, child, visited);
2136 }
2137 }
2138
2139
2140 /* Find the last scalar stmt in NODE. */
2141
2142 stmt_vec_info
2143 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2144 {
2145 stmt_vec_info last = NULL;
2146 stmt_vec_info stmt_vinfo;
2147
2148 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2149 {
2150 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2151 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2152 }
2153
2154 return last;
2155 }
2156
2157 /* Find the first stmt in NODE. */
2158
2159 stmt_vec_info
2160 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2161 {
2162 stmt_vec_info first = NULL;
2163 stmt_vec_info stmt_vinfo;
2164
2165 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2166 {
2167 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2168 if (!first
2169 || get_later_stmt (stmt_vinfo, first) == first)
2170 first = stmt_vinfo;
2171 }
2172
2173 return first;
2174 }
2175
2176 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2177 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2178 (also containing the first GROUP1_SIZE stmts, since stores are
2179 consecutive), the second containing the remainder.
2180 Return the first stmt in the second group. */
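/* For example, splitting a group of 6 consecutive stores with
   GROUP1_SIZE == 4 leaves a group of 4 stmts still headed by FIRST_VINFO
   and returns the head of the remaining group of 2 stmts. */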
2181
2182 static stmt_vec_info
2183 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2184 {
2185 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2186 gcc_assert (group1_size > 0);
2187 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2188 gcc_assert (group2_size > 0);
2189 DR_GROUP_SIZE (first_vinfo) = group1_size;
2190
2191 stmt_vec_info stmt_info = first_vinfo;
2192 for (unsigned i = group1_size; i > 1; i--)
2193 {
2194 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2195 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2196 }
2197 /* STMT is now the last element of the first group. */
2198 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2199 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2200
2201 DR_GROUP_SIZE (group2) = group2_size;
2202 for (stmt_info = group2; stmt_info;
2203 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2204 {
2205 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2206 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2207 }
2208
2209 /* For the second group, the DR_GROUP_GAP is that before the original group,
2210 plus skipping over the first group. */
2211 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2212
2213 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2214 DR_GROUP_GAP (first_vinfo) += group2_size;
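/* E.g. splitting an original group of 6 stores with gap 0 into 4 + 2
   gives DR_GROUP_GAP of 0 + 4 == 4 for the second group and 0 + 2 == 2
   for the first. */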
2215
2216 if (dump_enabled_p ())
2217 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2218 group1_size, group2_size);
2219
2220 return group2;
2221 }
2222
2223 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2224 statements and a vector of NUNITS elements. */
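/* For example, with NUNITS == 4 and GROUP_SIZE == 6 the common multiple
   is 12 and the unrolling factor is 12 / 6 == 2. */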
2225
2226 static poly_uint64
2227 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2228 {
2229 return exact_div (common_multiple (nunits, group_size), group_size);
2230 }
2231
2232 /* Helper function of vect_match_slp_patterns.
2233
2234 Attempts to match patterns against the slp tree rooted in REF_NODE using
2235 VINFO. Patterns are matched in post-order traversal.
2236
2237 If matching is successful the value in REF_NODE is updated in place and
2238 true is returned; otherwise false is returned. */
2239
2240 static bool
2241 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2242 slp_tree_to_load_perm_map_t *perm_cache,
2243 hash_set<slp_tree> *visited)
2244 {
2245 unsigned i;
2246 slp_tree node = *ref_node;
2247 bool found_p = false;
2248 if (!node || visited->add (node))
2249 return false;
2250
2251 slp_tree child;
2252 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2253 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2254 vinfo, perm_cache, visited);
2255
2256 for (unsigned x = 0; x < num__slp_patterns; x++)
2257 {
2258 vect_pattern *pattern = slp_patterns[x] (perm_cache, ref_node);
2259 if (pattern)
2260 {
2261 pattern->build (vinfo);
2262 delete pattern;
2263 found_p = true;
2264 }
2265 }
2266
2267 return found_p;
2268 }
2269
2270 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
2271 vec_info VINFO.
2272
2273 The tree is modified in place and true is returned if any pattern matched.
2274 Patterns are tried in order and multiple patterns may match. */
2275
2276 static bool
2277 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2278 hash_set<slp_tree> *visited,
2279 slp_tree_to_load_perm_map_t *perm_cache,
2280 scalar_stmts_to_slp_tree_map_t * /* bst_map */)
2281 {
2282 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2283 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2284
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_NOTE, vect_location,
2287 "Analyzing SLP tree %p for patterns\n",
2288 SLP_INSTANCE_TREE (instance));
2289
2290 bool found_p
2291 = vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
2292
2293 if (found_p)
2294 {
2295 if (dump_enabled_p ())
2296 {
2297 dump_printf_loc (MSG_NOTE, vect_location,
2298 "Pattern matched SLP tree\n");
2299 vect_print_slp_graph (MSG_NOTE, vect_location, *ref_node);
2300 }
2301 }
2302
2303 return found_p;
2304 }
2305
2306 /* Analyze an SLP instance starting from a group of grouped stores. Call
2307 vect_build_slp_tree to build a tree of packed stmts if possible.
2308 Return FALSE if it's impossible to SLP any stmt in the loop. */
2309
2310 static bool
2311 vect_analyze_slp_instance (vec_info *vinfo,
2312 scalar_stmts_to_slp_tree_map_t *bst_map,
2313 stmt_vec_info stmt_info, slp_instance_kind kind,
2314 unsigned max_tree_size, unsigned *limit);
2315
2316 /* Analyze an SLP instance starting from SCALAR_STMTS which form a group
2317 of kind KIND. Return true if successful. */
2318
2319 static bool
2320 vect_build_slp_instance (vec_info *vinfo,
2321 slp_instance_kind kind,
2322 vec<stmt_vec_info> &scalar_stmts,
2323 stmt_vec_info root_stmt_info,
2324 unsigned max_tree_size, unsigned *limit,
2325 scalar_stmts_to_slp_tree_map_t *bst_map,
2326 /* ??? We need stmt_info for group splitting. */
2327 stmt_vec_info stmt_info_)
2328 {
2329 if (dump_enabled_p ())
2330 {
2331 dump_printf_loc (MSG_NOTE, vect_location,
2332 "Starting SLP discovery for\n");
2333 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
2334 dump_printf_loc (MSG_NOTE, vect_location,
2335 " %G", scalar_stmts[i]->stmt);
2336 }
2337
2338 /* Build the tree for the SLP instance. */
2339 unsigned int group_size = scalar_stmts.length ();
2340 bool *matches = XALLOCAVEC (bool, group_size);
2341 poly_uint64 max_nunits = 1;
2342 unsigned tree_size = 0;
2343 unsigned i;
2344 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2345 &max_nunits, matches, limit,
2346 &tree_size, bst_map);
2347 if (node != NULL)
2348 {
2349 /* Calculate the unrolling factor based on the smallest type. */
2350 poly_uint64 unrolling_factor
2351 = calculate_unrolling_factor (max_nunits, group_size);
2352
2353 if (maybe_ne (unrolling_factor, 1U)
2354 && is_a <bb_vec_info> (vinfo))
2355 {
2356 unsigned HOST_WIDE_INT const_max_nunits;
2357 if (!max_nunits.is_constant (&const_max_nunits)
2358 || const_max_nunits > group_size)
2359 {
2360 if (dump_enabled_p ())
2361 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2362 "Build SLP failed: store group "
2363 "size not a multiple of the vector size "
2364 "in basic block SLP\n");
2365 vect_free_slp_tree (node);
2366 return false;
2367 }
2368 /* Fatal mismatch. */
2369 if (dump_enabled_p ())
2370 dump_printf_loc (MSG_NOTE, vect_location,
2371 "SLP discovery succeeded but node needs "
2372 "splitting\n");
2373 memset (matches, true, group_size);
2374 matches[group_size / const_max_nunits * const_max_nunits] = false;
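/* E.g. with group_size == 7 and const_max_nunits == 4 this marks lane 4
   as mismatched so the splitting code below can split the group at that
   point (into 4 + 3). */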
2375 vect_free_slp_tree (node);
2376 }
2377 else
2378 {
2379 /* Create a new SLP instance. */
2380 slp_instance new_instance = XNEW (class _slp_instance);
2381 SLP_INSTANCE_TREE (new_instance) = node;
2382 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2383 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2384 SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
2385 SLP_INSTANCE_KIND (new_instance) = kind;
2386 new_instance->reduc_phis = NULL;
2387 new_instance->cost_vec = vNULL;
2388 new_instance->subgraph_entries = vNULL;
2389
2390 if (dump_enabled_p ())
2391 dump_printf_loc (MSG_NOTE, vect_location,
2392 "SLP size %u vs. limit %u.\n",
2393 tree_size, max_tree_size);
2394
2395 /* Fixup SLP reduction chains. */
2396 if (kind == slp_inst_kind_reduc_chain)
2397 {
2398 /* If this is a reduction chain with a conversion in front
2399 amend the SLP tree with a node for that. */
2400 gimple *scalar_def
2401 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2402 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
2403 {
2404 /* Get at the conversion stmt - we know it's the single use
2405 of the last stmt of the reduction chain. */
2406 use_operand_p use_p;
2407 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
2408 &use_p, &scalar_def);
2409 gcc_assert (r);
2410 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
2411 next_info = vect_stmt_to_vectorize (next_info);
2412 scalar_stmts = vNULL;
2413 scalar_stmts.create (group_size);
2414 for (unsigned i = 0; i < group_size; ++i)
2415 scalar_stmts.quick_push (next_info);
2416 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2417 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2418 SLP_TREE_CHILDREN (conv).quick_push (node);
2419 SLP_INSTANCE_TREE (new_instance) = conv;
2420 /* We also have to fake this conversion stmt as SLP reduction
2421 group so we don't have to mess with too much code
2422 elsewhere. */
2423 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2424 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2425 }
2426 /* Fill the backedge child of the PHI SLP node. The
2427 general matching code cannot find it because the
2428 scalar code does not reflect how we vectorize the
2429 reduction. */
2430 use_operand_p use_p;
2431 imm_use_iterator imm_iter;
2432 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
2433 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
2434 gimple_get_lhs (scalar_def))
2435 /* There are exactly two non-debug uses, the reduction
2436 PHI and the loop-closed PHI node. */
2437 if (!is_gimple_debug (USE_STMT (use_p))
2438 && gimple_bb (USE_STMT (use_p)) == loop->header)
2439 {
2440 auto_vec<stmt_vec_info, 64> phis (group_size);
2441 stmt_vec_info phi_info
2442 = vinfo->lookup_stmt (USE_STMT (use_p));
2443 for (unsigned i = 0; i < group_size; ++i)
2444 phis.quick_push (phi_info);
2445 slp_tree *phi_node = bst_map->get (phis);
2446 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
2447 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
2448 = SLP_INSTANCE_TREE (new_instance);
2449 SLP_INSTANCE_TREE (new_instance)->refcnt++;
2450 }
2451 }
2452
2453 vinfo->slp_instances.safe_push (new_instance);
2454
2455 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2456 the number of scalar stmts in the root in a few places.
2457 Verify that assumption holds. */
2458 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2459 .length () == group_size);
2460
2461 if (dump_enabled_p ())
2462 {
2463 dump_printf_loc (MSG_NOTE, vect_location,
2464 "Final SLP tree for instance %p:\n", new_instance);
2465 vect_print_slp_graph (MSG_NOTE, vect_location,
2466 SLP_INSTANCE_TREE (new_instance));
2467 }
2468
2469 return true;
2470 }
2471 }
2472 else
2473 {
2474 /* Failed to SLP. */
2475 /* Free the allocated memory. */
2476 scalar_stmts.release ();
2477 }
2478
2479 stmt_vec_info stmt_info = stmt_info_;
2480 /* Try to break the group up into pieces. */
2481 if (kind == slp_inst_kind_store)
2482 {
2483 /* ??? We could delay all the actual splitting of store-groups
2484 until after SLP discovery of the original group completed.
2485 Then we can recurse to vect_build_slp_instance directly. */
2486 for (i = 0; i < group_size; i++)
2487 if (!matches[i])
2488 break;
2489
2490 /* For basic block SLP, try to break the group up into multiples of
2491 a vector size. */
2492 if (is_a <bb_vec_info> (vinfo)
2493 && (i > 1 && i < group_size))
2494 {
2495 tree scalar_type
2496 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2497 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2498 1 << floor_log2 (i));
2499 unsigned HOST_WIDE_INT const_nunits;
2500 if (vectype
2501 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2502 {
2503 /* Split into two groups at the first vector boundary. */
2504 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2505 unsigned group1_size = i & ~(const_nunits - 1);
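/* E.g. with i == 5 matching stmts and const_nunits == 4 the first group
   gets the leading 4 stmts. */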
2506
2507 if (dump_enabled_p ())
2508 dump_printf_loc (MSG_NOTE, vect_location,
2509 "Splitting SLP group at stmt %u\n", i);
2510 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2511 group1_size);
2512 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2513 kind, max_tree_size,
2514 limit);
2515 /* Split the rest at the failure point and possibly
2516 re-analyze the remaining matching part if it has
2517 at least two lanes. */
2518 if (group1_size < i
2519 && (i + 1 < group_size
2520 || i - group1_size > 1))
2521 {
2522 stmt_vec_info rest2 = rest;
2523 rest = vect_split_slp_store_group (rest, i - group1_size);
2524 if (i - group1_size > 1)
2525 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
2526 kind, max_tree_size,
2527 limit);
2528 }
2529 /* Re-analyze the non-matching tail if it has at least
2530 two lanes. */
2531 if (i + 1 < group_size)
2532 res |= vect_analyze_slp_instance (vinfo, bst_map,
2533 rest, kind, max_tree_size,
2534 limit);
2535 return res;
2536 }
2537 }
2538
2539 /* For loop vectorization split into arbitrary pieces of size > 1. */
2540 if (is_a <loop_vec_info> (vinfo)
2541 && (i > 1 && i < group_size))
2542 {
2543 unsigned group1_size = i;
2544
2545 if (dump_enabled_p ())
2546 dump_printf_loc (MSG_NOTE, vect_location,
2547 "Splitting SLP group at stmt %u\n", i);
2548
2549 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2550 group1_size);
2551 /* Loop vectorization cannot handle gaps in stores, make sure
2552 the split group appears as strided. */
2553 STMT_VINFO_STRIDED_P (rest) = 1;
2554 DR_GROUP_GAP (rest) = 0;
2555 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2556 DR_GROUP_GAP (stmt_info) = 0;
2557
2558 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2559 kind, max_tree_size, limit);
2560 if (i + 1 < group_size)
2561 res |= vect_analyze_slp_instance (vinfo, bst_map,
2562 rest, kind, max_tree_size, limit);
2563
2564 return res;
2565 }
2566
2567 /* Even though the first vector did not all match, we might be able to SLP
2568 (some) of the remainder. FORNOW ignore this possibility. */
2569 }
2570
2571 /* Failed to SLP. */
2572 if (dump_enabled_p ())
2573 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2574 return false;
2575 }
2576
2577
2578 /* Analyze an SLP instance starting from a group of grouped stores. Call
2579 vect_build_slp_tree to build a tree of packed stmts if possible.
2580 Return FALSE if it's impossible to SLP any stmt in the loop. */
2581
2582 static bool
2583 vect_analyze_slp_instance (vec_info *vinfo,
2584 scalar_stmts_to_slp_tree_map_t *bst_map,
2585 stmt_vec_info stmt_info,
2586 slp_instance_kind kind,
2587 unsigned max_tree_size, unsigned *limit)
2588 {
2589 unsigned int i;
2590 vec<stmt_vec_info> scalar_stmts;
2591
2592 if (is_a <bb_vec_info> (vinfo))
2593 vect_location = stmt_info->stmt;
2594
2595 stmt_vec_info next_info = stmt_info;
2596 if (kind == slp_inst_kind_store)
2597 {
2598 /* Collect the stores and store them in scalar_stmts. */
2599 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
2600 while (next_info)
2601 {
2602 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2603 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2604 }
2605 }
2606 else if (kind == slp_inst_kind_reduc_chain)
2607 {
2608 /* Collect the reduction stmts and store them in scalar_stmts. */
2609 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
2610 while (next_info)
2611 {
2612 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2613 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2614 }
2615 /* Mark the first element of the reduction chain as reduction to properly
2616 transform the node. In the reduction analysis phase only the last
2617 element of the chain is marked as reduction. */
2618 STMT_VINFO_DEF_TYPE (stmt_info)
2619 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2620 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2621 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2622 }
2623 else if (kind == slp_inst_kind_ctor)
2624 {
2625 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2626 tree val;
2627 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
2628 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2629 {
2630 stmt_vec_info def_info = vinfo->lookup_def (val);
2631 def_info = vect_stmt_to_vectorize (def_info);
2632 scalar_stmts.quick_push (def_info);
2633 }
2634 if (dump_enabled_p ())
2635 dump_printf_loc (MSG_NOTE, vect_location,
2636 "Analyzing vectorizable constructor: %G\n",
2637 stmt_info->stmt);
2638 }
2639 else if (kind == slp_inst_kind_reduc_group)
2640 {
2641 /* Collect reduction statements. */
2642 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2643 scalar_stmts.create (reductions.length ());
2644 for (i = 0; reductions.iterate (i, &next_info); i++)
2645 if (STMT_VINFO_RELEVANT_P (next_info)
2646 || STMT_VINFO_LIVE_P (next_info))
2647 scalar_stmts.quick_push (next_info);
2648 /* If fewer than two were relevant/live there's nothing to SLP. */
2649 if (scalar_stmts.length () < 2)
2650 return false;
2651 }
2652 else
2653 gcc_unreachable ();
2654
2655 /* Build the tree for the SLP instance. */
2656 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
2657 kind == slp_inst_kind_ctor
2658 ? stmt_info : NULL,
2659 max_tree_size, limit, bst_map,
2660 kind == slp_inst_kind_store
2661 ? stmt_info : NULL);
2662
2663 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2664 where we should do store group splitting. */
2665
2666 return res;
2667 }
2668
2669 /* Check if there are stmts in the loop that can be vectorized using SLP.
2670 Build SLP trees of packed scalar stmts if SLP is possible. */
2671
2672 opt_result
2673 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2674 {
2675 unsigned int i;
2676 stmt_vec_info first_element;
2677 slp_instance instance;
2678
2679 DUMP_VECT_SCOPE ("vect_analyze_slp");
2680
2681 unsigned limit = max_tree_size;
2682
2683 scalar_stmts_to_slp_tree_map_t *bst_map
2684 = new scalar_stmts_to_slp_tree_map_t ();
2685
2686 /* Find SLP sequences starting from groups of grouped stores. */
2687 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2688 vect_analyze_slp_instance (vinfo, bst_map, first_element,
2689 STMT_VINFO_GROUPED_ACCESS (first_element)
2690 ? slp_inst_kind_store : slp_inst_kind_ctor,
2691 max_tree_size, &limit);
2692
2693 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
2694 {
2695 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
2696 {
2697 vect_location = bb_vinfo->roots[i].root->stmt;
2698 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
2699 bb_vinfo->roots[i].stmts,
2700 bb_vinfo->roots[i].root,
2701 max_tree_size, &limit, bst_map, NULL))
2702 bb_vinfo->roots[i].stmts = vNULL;
2703 }
2704 }
2705
2706 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2707 {
2708 /* Find SLP sequences starting from reduction chains. */
2709 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2710 if (! STMT_VINFO_RELEVANT_P (first_element)
2711 && ! STMT_VINFO_LIVE_P (first_element))
2712 ;
2713 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2714 slp_inst_kind_reduc_chain,
2715 max_tree_size, &limit))
2716 {
2717 /* Dissolve reduction chain group. */
2718 stmt_vec_info vinfo = first_element;
2719 stmt_vec_info last = NULL;
2720 while (vinfo)
2721 {
2722 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2723 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2724 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2725 last = vinfo;
2726 vinfo = next;
2727 }
2728 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2729 /* It can still be vectorized as part of an SLP reduction. */
2730 loop_vinfo->reductions.safe_push (last);
2731 }
2732
2733 /* Find SLP sequences starting from groups of reductions. */
2734 if (loop_vinfo->reductions.length () > 1)
2735 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2736 slp_inst_kind_reduc_group, max_tree_size,
2737 &limit);
2738 }
2739
2740 hash_set<slp_tree> visited_patterns;
2741 slp_tree_to_load_perm_map_t perm_cache;
2742 /* See if any patterns can be found in the SLP tree. */
2743 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
2744 vect_match_slp_patterns (instance, vinfo, &visited_patterns, &perm_cache,
2745 bst_map);
2746
2747 /* The map keeps a reference on SLP nodes built, release that. */
2748 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2749 it != bst_map->end (); ++it)
2750 if ((*it).second)
2751 vect_free_slp_tree ((*it).second);
2752 delete bst_map;
2753
2754 return opt_result::success ();
2755 }
2756
2757 /* Fill the VERTICES and LEAFS vectors with all nodes in the SLP graph. */
2758
2759 static void
2760 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2761 vec<slp_tree> &vertices, vec<int> &leafs)
2762 {
2763 unsigned i;
2764 slp_tree child;
2765
2766 if (visited.add (node))
2767 return;
2768
2769 node->vertex = vertices.length ();
2770 vertices.safe_push (node);
2771
2772 bool leaf = true;
2773 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2774 if (child)
2775 {
2776 leaf = false;
2777 vect_slp_build_vertices (visited, child, vertices, leafs);
2778 }
2779 if (leaf)
2780 leafs.safe_push (node->vertex);
2781 }
2782
2783 /* Fill the VERTICES and LEAFS vectors with all nodes in the SLP graph. */
2784
2785 static void
2786 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2787 vec<int> &leafs)
2788 {
2789 hash_set<slp_tree> visited;
2790 unsigned i;
2791 slp_instance instance;
2792 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2793 {
2794 unsigned n_v = vertices.length ();
2795 unsigned n_l = leafs.length ();
2796 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2797 leafs);
2798 /* If we added vertices but no entries to the reverse graph we've
2799 added a cycle that is not backwards-reachable. Push the entry
2800 to mimic a leaf then. */
2801 if (vertices.length () > n_v
2802 && leafs.length () == n_l)
2803 leafs.safe_push (SLP_INSTANCE_TREE (instance)->vertex);
2804 }
2805 }
2806
2807 /* Apply (reverse) bijective PERM to VEC. */
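/* For example, PERM == { 2, 0, 1 } applied forward turns { x, y, z } into
   { z, x, y } (element I becomes the old element PERM[I]); applied in
   reverse it moves the old element I to position PERM[I], giving
   { y, z, x }. */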
2808
2809 template <class T>
2810 static void
2811 vect_slp_permute (vec<unsigned> perm,
2812 vec<T> &vec, bool reverse)
2813 {
2814 auto_vec<T, 64> saved;
2815 saved.create (vec.length ());
2816 for (unsigned i = 0; i < vec.length (); ++i)
2817 saved.quick_push (vec[i]);
2818
2819 if (reverse)
2820 {
2821 for (unsigned i = 0; i < vec.length (); ++i)
2822 vec[perm[i]] = saved[i];
2823 for (unsigned i = 0; i < vec.length (); ++i)
2824 gcc_assert (vec[perm[i]] == saved[i]);
2825 }
2826 else
2827 {
2828 for (unsigned i = 0; i < vec.length (); ++i)
2829 vec[i] = saved[perm[i]];
2830 for (unsigned i = 0; i < vec.length (); ++i)
2831 gcc_assert (vec[i] == saved[perm[i]]);
2832 }
2833 }
2834
2835 /* Return whether permutations PERM_A and PERM_B as recorded in the
2836 PERMS vector are equal. */
2837
2838 static bool
2839 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
2840 int perm_a, int perm_b)
2841 {
2842 return (perm_a == perm_b
2843 || (perms[perm_a].length () == perms[perm_b].length ()
2844 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
2845 sizeof (unsigned) * perms[perm_a].length ()) == 0));
2846 }
2847
2848 /* Optimize the SLP graph of VINFO. */
2849
2850 void
2851 vect_optimize_slp (vec_info *vinfo)
2852 {
2853 if (vinfo->slp_instances.is_empty ())
2854 return;
2855
2856 slp_tree node;
2857 unsigned i;
2858 auto_vec<slp_tree> vertices;
2859 auto_vec<int> leafs;
2860 vect_slp_build_vertices (vinfo, vertices, leafs);
2861
2862 struct graph *slpg = new_graph (vertices.length ());
2863 FOR_EACH_VEC_ELT (vertices, i, node)
2864 {
2865 unsigned j;
2866 slp_tree child;
2867 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2868 if (child)
2869 add_edge (slpg, i, child->vertex);
2870 }
2871
2872 /* Compute (reverse) postorder on the inverted graph. */
2873 auto_vec<int> ipo;
2874 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
2875
2876 auto_sbitmap n_visited (vertices.length ());
2877 auto_sbitmap n_materialize (vertices.length ());
2878 auto_vec<int> n_perm (vertices.length ());
2879 auto_vec<vec<unsigned> > perms;
2880
2881 bitmap_clear (n_visited);
2882 bitmap_clear (n_materialize);
2883 n_perm.quick_grow_cleared (vertices.length ());
2884 perms.safe_push (vNULL); /* zero is no permute */
2885
2886 /* Produce initial permutations. */
2887 for (i = 0; i < leafs.length (); ++i)
2888 {
2889 int idx = leafs[i];
2890 slp_tree node = vertices[idx];
2891
2892 /* Handle externals and constants optimistically throughout the
2893 iteration. */
2894 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
2895 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
2896 continue;
2897
2898 /* Leafs do not change across iterations. Note leafs also double
2899 as entries to the reverse graph. */
2900 if (!slpg->vertices[idx].succ)
2901 bitmap_set_bit (n_visited, idx);
2902 /* Loads are the only thing generating permutes. */
2903 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2904 continue;
2905
2906 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2907 node unpermuted, record this permute. */
2908 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
2909 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
2910 continue;
2911 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
2912 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
2913 bool any_permute = false;
2914 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2915 {
2916 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
2917 imin = MIN (imin, idx);
2918 imax = MAX (imax, idx);
2919 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
2920 any_permute = true;
2921 }
2922 /* If there's no permute no need to split one out. */
2923 if (!any_permute)
2924 continue;
2925 /* If the span doesn't match we'd disrupt VF computation, avoid
2926 that for now. */
2927 if (imax - imin + 1 != SLP_TREE_LANES (node))
2928 continue;
2929
2930 /* For now only handle true permutes, like
2931 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2932 when permuting constants and invariants keeping the permute
2933 bijective. */
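/* For example, a load permutation { 5, 4, 7, 6 } has imin == 4, covers
   the lanes 4..7 exactly once and is recorded as the permute
   { 1, 0, 3, 2 }. */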
2934 auto_sbitmap load_index (SLP_TREE_LANES (node));
2935 bitmap_clear (load_index);
2936 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2937 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
2938 unsigned j;
2939 for (j = 0; j < SLP_TREE_LANES (node); ++j)
2940 if (!bitmap_bit_p (load_index, j))
2941 break;
2942 if (j != SLP_TREE_LANES (node))
2943 continue;
2944
2945 vec<unsigned> perm = vNULL;
2946 perm.safe_grow (SLP_TREE_LANES (node), true);
2947 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2948 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
2949 perms.safe_push (perm);
2950 n_perm[idx] = perms.length () - 1;
2951 }
2952
2953 /* Propagate permutes along the graph and compute materialization points. */
2954 bool changed;
2955 unsigned iteration = 0;
2956 do
2957 {
2958 changed = false;
2959 ++iteration;
2960
2961 for (i = vertices.length (); i > 0 ; --i)
2962 {
2963 int idx = ipo[i-1];
2964 slp_tree node = vertices[idx];
2965 /* For leafs there's nothing to do - we've seeded permutes
2966 on those above. */
2967 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2968 continue;
2969
2970 bitmap_set_bit (n_visited, idx);
2971
2972 /* We cannot move a permute across a store. */
2973 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2974 && DR_IS_WRITE
2975 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
2976 continue;
2977
2978 int perm = -1;
2979 for (graph_edge *succ = slpg->vertices[idx].succ;
2980 succ; succ = succ->succ_next)
2981 {
2982 int succ_idx = succ->dest;
2983 /* Handle unvisited nodes optimistically. */
2984 /* ??? But for constants once we want to handle non-bijective
2985 permutes we have to verify the permute, when unifying lanes,
2986 will not unify different constants. For example see
2987 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2988 if (!bitmap_bit_p (n_visited, succ_idx))
2989 continue;
2990 int succ_perm = n_perm[succ_idx];
2991 /* Once we materialize the successor's permutation its output lanes
2992 appear unpermuted to us. */
2993 if (bitmap_bit_p (n_materialize, succ_idx))
2994 succ_perm = 0;
2995 if (perm == -1)
2996 perm = succ_perm;
2997 else if (succ_perm == 0)
2998 {
2999 perm = 0;
3000 break;
3001 }
3002 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
3003 {
3004 perm = 0;
3005 break;
3006 }
3007 }
3008
3009 if (perm == -1)
3010 /* Pick up pre-computed leaf values. */
3011 perm = n_perm[idx];
3012 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
3013 {
3014 if (iteration > 1)
3015 /* Make sure we eventually converge. */
3016 gcc_checking_assert (perm == 0);
3017 n_perm[idx] = perm;
3018 if (perm == 0)
3019 bitmap_clear_bit (n_materialize, idx);
3020 changed = true;
3021 }
3022
3023 if (perm == 0)
3024 continue;
3025
3026 /* Elide pruning at materialization points in the first
3027 iteration so every node is visited at least once. */
3028 if (iteration == 1)
3029 continue;
3030
3031 /* Decide on permute materialization. Look whether there's
3032 a use (pred) edge that is permuted differently than us.
3033 In that case mark ourselves so the permutation is applied.
3034 For VEC_PERM_EXPRs the permutation doesn't carry along
3035 from children to parents so force materialization at the
3036 point of the VEC_PERM_EXPR. In principle VEC_PERM_EXPRs
3037 are a source of an arbitrary permutation again, similar
3038 to constants/externals - that's something we do not yet
3039 optimally handle. */
3040 bool all_preds_permuted = (SLP_TREE_CODE (node) != VEC_PERM_EXPR
3041 && slpg->vertices[idx].pred != NULL);
3042 if (all_preds_permuted)
3043 for (graph_edge *pred = slpg->vertices[idx].pred;
3044 pred; pred = pred->pred_next)
3045 {
3046 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
3047 int pred_perm = n_perm[pred->src];
3048 if (!vect_slp_perms_eq (perms, perm, pred_perm))
3049 {
3050 all_preds_permuted = false;
3051 break;
3052 }
3053 }
3054 if (!all_preds_permuted)
3055 {
3056 if (!bitmap_bit_p (n_materialize, idx))
3057 changed = true;
3058 bitmap_set_bit (n_materialize, idx);
3059 }
3060 }
3061 }
3062 while (changed || iteration == 1);
3063
3064 /* Materialize. */
3065 for (i = 0; i < vertices.length (); ++i)
3066 {
3067 int perm = n_perm[i];
3068 if (perm <= 0)
3069 continue;
3070
3071 slp_tree node = vertices[i];
3072
3073 /* First permute invariant/external original successors. */
3074 unsigned j;
3075 slp_tree child;
3076 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3077 {
3078 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3079 continue;
3080
3081 /* If the vector is uniform there's nothing to do. */
3082 if (vect_slp_tree_uniform_p (child))
3083 continue;
3084
3085 /* We can end up sharing some externals via two_operator
3086 handling. Be prepared to unshare those. */
3087 if (child->refcnt != 1)
3088 {
3089 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3090 SLP_TREE_CHILDREN (node)[j] = child
3091 = vect_create_new_slp_node
3092 (SLP_TREE_SCALAR_OPS (child).copy ());
3093 }
3094 vect_slp_permute (perms[perm],
3095 SLP_TREE_SCALAR_OPS (child), true);
3096 }
3097
3098 if (bitmap_bit_p (n_materialize, i))
3099 {
3100 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3101 /* For loads simply drop the permutation, the load permutation
3102 already performs the desired permutation. */
3103 ;
3104 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3105 {
3106 /* If the node is already a permute node we can apply
3107 the permutation to the lane selection, effectively
3108 materializing it on the incoming vectors. */
3109 if (dump_enabled_p ())
3110 dump_printf_loc (MSG_NOTE, vect_location,
3111 "simplifying permute node %p\n",
3112 node);
3113
3114 for (unsigned k = 0;
3115 k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3116 SLP_TREE_LANE_PERMUTATION (node)[k].second
3117 = perms[perm][SLP_TREE_LANE_PERMUTATION (node)[k].second];
3118 }
3119 else
3120 {
3121 if (dump_enabled_p ())
3122 dump_printf_loc (MSG_NOTE, vect_location,
3123 "inserting permute node in place of %p\n",
3124 node);
3125
3126 /* Make a copy of NODE and in-place change it to a
3127 VEC_PERM node to permute the lanes of the copy. */
3128 slp_tree copy = new _slp_tree;
3129 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3130 SLP_TREE_CHILDREN (node) = vNULL;
3131 SLP_TREE_SCALAR_STMTS (copy)
3132 = SLP_TREE_SCALAR_STMTS (node).copy ();
3133 vect_slp_permute (perms[perm],
3134 SLP_TREE_SCALAR_STMTS (copy), true);
3135 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3136 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3137 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3138 SLP_TREE_LANE_PERMUTATION (copy)
3139 = SLP_TREE_LANE_PERMUTATION (node);
3140 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3141 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3142 copy->refcnt = 1;
3143 copy->max_nunits = node->max_nunits;
3144 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3145 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3146 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3147
3148 /* Now turn NODE into a VEC_PERM. */
3149 SLP_TREE_CHILDREN (node).safe_push (copy);
3150 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3151 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3152 SLP_TREE_LANE_PERMUTATION (node)
3153 .quick_push (std::make_pair (0, perms[perm][j]));
3154 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3155 }
3156 }
3157 else
3158 {
3159 /* Apply the reverse permutation to our stmts. */
3160 vect_slp_permute (perms[perm],
3161 SLP_TREE_SCALAR_STMTS (node), true);
3162 /* And to the load permutation, which we can simply
3163 make regular by design. */
3164 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3165 {
3166 /* ??? When we handle non-bijective permutes the idea
3167 is that we can force the load-permutation to be
3168 { min, min + 1, min + 2, ... max }. But then the
3169 scalar defs might no longer match the lane content
3170 which means wrong-code with live lane vectorization.
3171 So we possibly have to have NULL entries for those. */
3172 vect_slp_permute (perms[perm],
3173 SLP_TREE_LOAD_PERMUTATION (node), true);
3174 }
3175 }
3176 }
3177
3178 /* Free the perms vector used for propagation. */
3179 while (!perms.is_empty ())
3180 perms.pop ().release ();
3181 free_graph (slpg);
3182
3183
3184 /* Now elide load permutations that are not necessary. */
3185 for (i = 0; i < leafs.length (); ++i)
3186 {
3187 node = vertices[leafs[i]];
3188 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3189 continue;
3190
3191 /* In basic block vectorization we allow any subchain of an interleaving
3192 chain.
3193 FORNOW: not in loop SLP because of realignment complications. */
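/* E.g. the loads of a[2], a[3], a[4], a[5] out of an interleaving group
   starting at a[0] form such a subchain, and the load permutation can
   then simply be dropped. */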
3194 if (is_a <bb_vec_info> (vinfo))
3195 {
3196 bool subchain_p = true;
3197 stmt_vec_info next_load_info = NULL;
3198 stmt_vec_info load_info;
3199 unsigned j;
3200 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3201 {
3202 if (j != 0
3203 && (next_load_info != load_info
3204 || DR_GROUP_GAP (load_info) != 1))
3205 {
3206 subchain_p = false;
3207 break;
3208 }
3209 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3210 }
3211 if (subchain_p)
3212 {
3213 SLP_TREE_LOAD_PERMUTATION (node).release ();
3214 continue;
3215 }
3216 }
3217 else
3218 {
3219 stmt_vec_info load_info;
3220 bool this_load_permuted = false;
3221 unsigned j;
3222 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3223 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3224 {
3225 this_load_permuted = true;
3226 break;
3227 }
3228 stmt_vec_info first_stmt_info
3229 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3230 if (!this_load_permuted
3231 /* The load requires permutation when unrolling exposes
3232 a gap either because the group is larger than the SLP
3233 group-size or because there is a gap between the groups. */
3234 && (known_eq (LOOP_VINFO_VECT_FACTOR
3235 (as_a <loop_vec_info> (vinfo)), 1U)
3236 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3237 && DR_GROUP_GAP (first_stmt_info) == 0)))
3238 {
3239 SLP_TREE_LOAD_PERMUTATION (node).release ();
3240 continue;
3241 }
3242 }
3243 }
3244 }
3245
3246 /* Gather loads reachable from the individual SLP graph entries. */
3247
3248 void
3249 vect_gather_slp_loads (vec_info *vinfo)
3250 {
3251 unsigned i;
3252 slp_instance instance;
3253 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
3254 {
3255 hash_set<slp_tree> visited;
3256 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
3257 SLP_INSTANCE_TREE (instance), visited);
3258 }
3259 }
3260
3261
3262 /* For each possible SLP instance decide whether to SLP it and calculate the
3263 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
3264 to SLP at least one instance. */
3265
3266 bool
3267 vect_make_slp_decision (loop_vec_info loop_vinfo)
3268 {
3269 unsigned int i;
3270 poly_uint64 unrolling_factor = 1;
3271 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3272 slp_instance instance;
3273 int decided_to_slp = 0;
3274
3275 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3276
3277 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3278 {
3279 /* FORNOW: SLP if you can. */
3280 /* All unroll factors have the form:
3281
3282 GET_MODE_SIZE (vinfo->vector_mode) * X
3283
3284 for some rational X, so they must have a common multiple. */
3285 unrolling_factor
3286 = force_common_multiple (unrolling_factor,
3287 SLP_INSTANCE_UNROLLING_FACTOR (instance));
3288
3289 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3290 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3291 loop-based vectorization. Such stmts will be marked as HYBRID. */
3292 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3293 decided_to_slp++;
3294 }
3295
3296 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
3297
3298 if (decided_to_slp && dump_enabled_p ())
3299 {
3300 dump_printf_loc (MSG_NOTE, vect_location,
3301 "Decided to SLP %d instances. Unrolling factor ",
3302 decided_to_slp);
3303 dump_dec (MSG_NOTE, unrolling_factor);
3304 dump_printf (MSG_NOTE, "\n");
3305 }
3306
3307 return (decided_to_slp > 0);
3308 }
3309
3310 /* Private data for vect_detect_hybrid_slp. */
3311 struct vdhs_data
3312 {
3313 loop_vec_info loop_vinfo;
3314 vec<stmt_vec_info> *worklist;
3315 };
3316
3317 /* Walker for walk_gimple_op. */
3318
3319 static tree
3320 vect_detect_hybrid_slp (tree *tp, int *, void *data)
3321 {
3322 walk_stmt_info *wi = (walk_stmt_info *)data;
3323 vdhs_data *dat = (vdhs_data *)wi->info;
3324
3325 if (wi->is_lhs)
3326 return NULL_TREE;
3327
3328 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
3329 if (!def_stmt_info)
3330 return NULL_TREE;
3331 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3332 if (PURE_SLP_STMT (def_stmt_info))
3333 {
3334 if (dump_enabled_p ())
3335 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3336 def_stmt_info->stmt);
3337 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3338 dat->worklist->safe_push (def_stmt_info);
3339 }
3340
3341 return NULL_TREE;
3342 }
3343
3344 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
3345 if so, otherwise push it to WORKLIST. */
3346
3347 static void
3348 maybe_push_to_hybrid_worklist (vec_info *vinfo,
3349 vec<stmt_vec_info> &worklist,
3350 stmt_vec_info stmt_info)
3351 {
3352 if (dump_enabled_p ())
3353 dump_printf_loc (MSG_NOTE, vect_location,
3354 "Processing hybrid candidate : %G", stmt_info->stmt);
3355 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
3356 imm_use_iterator iter2;
3357 ssa_op_iter iter1;
3358 use_operand_p use_p;
3359 def_operand_p def_p;
3360 bool any_def = false;
3361 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
3362 {
3363 any_def = true;
3364 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
3365 {
3366 if (is_gimple_debug (USE_STMT (use_p)))
3367 continue;
3368 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
3369 /* An out-of-loop use means this is a loop_vect sink. */
3370 if (!use_info)
3371 {
3372 if (dump_enabled_p ())
3373 dump_printf_loc (MSG_NOTE, vect_location,
3374 "Found loop_vect sink: %G", stmt_info->stmt);
3375 worklist.safe_push (stmt_info);
3376 return;
3377 }
3378 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
3379 {
3380 if (dump_enabled_p ())
3381 dump_printf_loc (MSG_NOTE, vect_location,
3382 "Found loop_vect use: %G", use_info->stmt);
3383 worklist.safe_push (stmt_info);
3384 return;
3385 }
3386 }
3387 }
3388 /* No def means this is a loop_vect sink. */
3389 if (!any_def)
3390 {
3391 if (dump_enabled_p ())
3392 dump_printf_loc (MSG_NOTE, vect_location,
3393 "Found loop_vect sink: %G", stmt_info->stmt);
3394 worklist.safe_push (stmt_info);
3395 return;
3396 }
3397 if (dump_enabled_p ())
3398 dump_printf_loc (MSG_NOTE, vect_location,
3399 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
3400 STMT_SLP_TYPE (stmt_info) = pure_slp;
3401 }
3402
3403 /* Find stmts that must be both vectorized and SLPed. */
3404
3405 void
3406 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3407 {
3408 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3409
3410 /* All stmts participating in SLP are marked pure_slp, all other
3411 stmts are loop_vect.
3412 First collect all loop_vect stmts into a worklist.
3413 SLP patterns cause not all original scalar stmts to appear in
3414 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
3415 Rectify this here by doing a backward walk over the IL, considering
3416 stmts as loop_vect only when they are used by a loop_vect stmt, and
3417 otherwise marking them as pure_slp. */
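/* E.g. a pure_slp stmt whose result is also used by a stmt that is only
   loop-vectorized will be marked hybrid by the use->def walk below. */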
3418 auto_vec<stmt_vec_info> worklist;
3419 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
3420 {
3421 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3422 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3423 gsi_next (&gsi))
3424 {
3425 gphi *phi = gsi.phi ();
3426 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3427 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3428 maybe_push_to_hybrid_worklist (loop_vinfo,
3429 worklist, stmt_info);
3430 }
3431 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
3432 gsi_prev (&gsi))
3433 {
3434 gimple *stmt = gsi_stmt (gsi);
3435 if (is_gimple_debug (stmt))
3436 continue;
3437 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3438 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3439 {
3440 for (gimple_stmt_iterator gsi2
3441 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3442 !gsi_end_p (gsi2); gsi_next (&gsi2))
3443 {
3444 stmt_vec_info patt_info
3445 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3446 if (!STMT_SLP_TYPE (patt_info)
3447 && STMT_VINFO_RELEVANT (patt_info))
3448 maybe_push_to_hybrid_worklist (loop_vinfo,
3449 worklist, patt_info);
3450 }
3451 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3452 }
3453 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3454 maybe_push_to_hybrid_worklist (loop_vinfo,
3455 worklist, stmt_info);
3456 }
3457 }
3458
3459 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3460 mark any SLP vectorized stmt as hybrid.
3461 ??? We're visiting def stmts N times (once for each non-SLP and
3462 once for each hybrid-SLP use). */
3463 walk_stmt_info wi;
3464 vdhs_data dat;
3465 dat.worklist = &worklist;
3466 dat.loop_vinfo = loop_vinfo;
3467 memset (&wi, 0, sizeof (wi));
3468 wi.info = (void *)&dat;
3469 while (!worklist.is_empty ())
3470 {
3471 stmt_vec_info stmt_info = worklist.pop ();
3472 /* Since SSA operands are not set up for pattern stmts we need
3473 to use walk_gimple_op. */
3474 wi.is_lhs = 0;
3475 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3476 }
3477 }
3478
3479
3480 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3481
3482 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3483 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs), roots (vNULL)
3484 {
3485 for (unsigned i = 0; i < bbs.length (); ++i)
3486 {
3487 if (i != 0)
3488 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3489 gsi_next (&si))
3490 {
3491 gphi *phi = si.phi ();
3492 gimple_set_uid (phi, 0);
3493 add_stmt (phi);
3494 }
3495 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3496 !gsi_end_p (gsi); gsi_next (&gsi))
3497 {
3498 gimple *stmt = gsi_stmt (gsi);
3499 gimple_set_uid (stmt, 0);
3500 if (is_gimple_debug (stmt))
3501 continue;
3502 add_stmt (stmt);
3503 }
3504 }
3505 }
3506
3507
3508 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3509 stmts in the basic block. */
3510
3511 _bb_vec_info::~_bb_vec_info ()
3512 {
3513 /* Reset region marker. */
3514 for (unsigned i = 0; i < bbs.length (); ++i)
3515 {
3516 if (i != 0)
3517 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3518 gsi_next (&si))
3519 {
3520 gphi *phi = si.phi ();
3521 gimple_set_uid (phi, -1);
3522 }
3523 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3524 !gsi_end_p (gsi); gsi_next (&gsi))
3525 {
3526 gimple *stmt = gsi_stmt (gsi);
3527 gimple_set_uid (stmt, -1);
3528 }
3529 }
3530
3531 for (unsigned i = 0; i < roots.length (); ++i)
3532 roots[i].stmts.release ();
3533 roots.release ();
3534 }
3535
3536 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3537 given that child nodes have already been processed, and that
3538 their def types currently match their SLP node's def type. */
3539
3540 static bool
3541 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3542 slp_instance node_instance,
3543 stmt_vector_for_cost *cost_vec)
3544 {
3545 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3546 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3547
3548 /* Calculate the number of vector statements to be created for the
3549 scalar stmts in this node. For SLP reductions it is equal to the
3550 number of vector statements in the children (which has already been
3551 calculated by the recursive call). Otherwise it is the number of
3552 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3553 VF divided by the number of elements in a vector. */
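/* For illustration (hypothetical numbers): with 4 lanes, a VF of 4 and a
   V8SI vector type this amounts to vect_get_num_vectors (4 * 4, V8SI),
   i.e. two vector statements.  */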
3554 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3555 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3556 {
3557 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3558 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3559 {
3560 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3561 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3562 break;
3563 }
3564 }
3565 else
3566 {
3567 poly_uint64 vf;
3568 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3569 vf = loop_vinfo->vectorization_factor;
3570 else
3571 vf = 1;
3572 unsigned int group_size = SLP_TREE_LANES (node);
3573 tree vectype = SLP_TREE_VECTYPE (node);
3574 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3575 = vect_get_num_vectors (vf * group_size, vectype);
3576 }
3577
3578 /* Handle purely internal nodes. */
3579 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3580 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3581
3582 if (is_a <bb_vec_info> (vinfo)
3583 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3584 {
3585 if (dump_enabled_p ())
3586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3587 "desired vector type conflicts with earlier one "
3588 "for %G", stmt_info->stmt);
3589 return false;
3590 }
3591
3592 bool dummy;
3593 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3594 node, node_instance, cost_vec);
3595 }
3596
3597 /* Try to build NODE from scalars, returning true on success.
3598 NODE_INSTANCE is the SLP instance that contains NODE. */
3599
3600 static bool
3601 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3602 slp_instance node_instance)
3603 {
3604 stmt_vec_info stmt_info;
3605 unsigned int i;
3606
3607 if (!is_a <bb_vec_info> (vinfo)
3608 || node == SLP_INSTANCE_TREE (node_instance)
3609 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3610 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3611 return false;
3612
3613 if (dump_enabled_p ())
3614 dump_printf_loc (MSG_NOTE, vect_location,
3615 "Building vector operands of %p from scalars instead\n", node);
3616
3617 /* Don't remove and free the child nodes here, since they could be
3618 referenced by other structures. The analysis and scheduling phases
3619 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3620 unsigned int group_size = SLP_TREE_LANES (node);
3621 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3622 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3623 SLP_TREE_LOAD_PERMUTATION (node).release ();
3624 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3625 {
3626 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3627 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3628 }
3629 return true;
3630 }
3631
3632 /* Compute the prologue cost for invariant or constant operands represented
3633 by NODE. */
3634
3635 static void
3636 vect_prologue_cost_for_slp (slp_tree node,
3637 stmt_vector_for_cost *cost_vec)
3638 {
3639 /* There's a special case of an existing vector, which costs nothing. */
3640 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3641 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3642 return;
3643 /* Without looking at the actual initializer a vector of
3644 constants can be implemented as load from the constant pool.
3645 When all elements are the same we can use a splat. */
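/* For example (illustrative values only): a constant node {1, 2, 3, 4} is
   costed as a single vector_load from the constant pool, an external node
   {a_1, a_1, a_1, a_1} as one scalar_to_vec splat and an external node
   {a_1, b_2, c_3, d_4} as a vec_construct.  */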
3646 tree vectype = SLP_TREE_VECTYPE (node);
3647 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3648 unsigned num_vects_to_check;
3649 unsigned HOST_WIDE_INT const_nunits;
3650 unsigned nelt_limit;
3651 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3652 && ! multiple_p (const_nunits, group_size))
3653 {
3654 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3655 nelt_limit = const_nunits;
3656 }
3657 else
3658 {
3659 /* If either the vector has variable length or the vectors
3660 are composed of repeated whole groups we only need to
3661 cost construction once. All vectors will be the same. */
3662 num_vects_to_check = 1;
3663 nelt_limit = group_size;
3664 }
3665 tree elt = NULL_TREE;
3666 unsigned nelt = 0;
3667 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3668 {
3669 unsigned si = j % group_size;
3670 if (nelt == 0)
3671 elt = SLP_TREE_SCALAR_OPS (node)[si];
3672 /* ??? We're just tracking whether all operands of a single
3673 vector initializer are the same, ideally we'd check if
3674 we emitted the same one already. */
3675 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3676 elt = NULL_TREE;
3677 nelt++;
3678 if (nelt == nelt_limit)
3679 {
3680 record_stmt_cost (cost_vec, 1,
3681 SLP_TREE_DEF_TYPE (node) == vect_external_def
3682 ? (elt ? scalar_to_vec : vec_construct)
3683 : vector_load,
3684 NULL, vectype, 0, vect_prologue);
3685 nelt = 0;
3686 }
3687 }
3688 }
3689
3690 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3691 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
3692
3693 Return true if the operations are supported. */
3694
3695 static bool
3696 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3697 slp_instance node_instance,
3698 hash_set<slp_tree> &visited_set,
3699 vec<slp_tree> &visited_vec,
3700 stmt_vector_for_cost *cost_vec)
3701 {
3702 int i, j;
3703 slp_tree child;
3704
3705 /* Assume we can code-generate all invariants. */
3706 if (!node
3707 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
3708 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
3709 return true;
3710
3711 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
3712 {
3713 if (dump_enabled_p ())
3714 dump_printf_loc (MSG_NOTE, vect_location,
3715 "Failed cyclic SLP reference in %p", node);
3716 return false;
3717 }
3718 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
3719
3720 /* If we already analyzed the exact same set of scalar stmts we're done.
3721 We share the generated vector stmts for those. */
3722 if (visited_set.add (node))
3723 return true;
3724 visited_vec.safe_push (node);
3725
3726 bool res = true;
3727 unsigned visited_rec_start = visited_vec.length ();
3728 unsigned cost_vec_rec_start = cost_vec->length ();
3729 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3730 {
3731 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3732 visited_set, visited_vec,
3733 cost_vec);
3734 if (!res)
3735 break;
3736 }
3737
3738 if (res)
3739 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3740 cost_vec);
3741 /* If analysis failed we have to pop all recursive visited nodes
3742 plus ourselves. */
3743 if (!res)
3744 {
3745 while (visited_vec.length () >= visited_rec_start)
3746 visited_set.remove (visited_vec.pop ());
3747 cost_vec->truncate (cost_vec_rec_start);
3748 }
3749
3750 /* When the node can be vectorized, cost the invariant nodes it references.
3751 This is not done in DFS order to allow the referring node's
3752 vectorizable_* calls to nail down the invariant nodes' vector type
3753 and possibly unshare it if it needs a different vector type than
3754 other referrers. */
3755 if (res)
3756 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3757 if (child
3758 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
3759 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3760 /* Perform usual caching, note code-generation still
3761 code-gens these nodes multiple times but we expect
3762 to CSE them later. */
3763 && !visited_set.add (child))
3764 {
3765 visited_vec.safe_push (child);
3766 /* ??? After auditing more code paths make a "default"
3767 and push the vector type from NODE to all children
3768 if it is not already set. */
3769 /* Compute the number of vectors to be generated. */
3770 tree vector_type = SLP_TREE_VECTYPE (child);
3771 if (!vector_type)
3772 {
3773 /* For shifts with a scalar argument we don't need
3774 to cost or code-generate anything.
3775 ??? Represent this more explicitly. */
3776 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3777 == shift_vec_info_type)
3778 && j == 1);
3779 continue;
3780 }
3781 unsigned group_size = SLP_TREE_LANES (child);
3782 poly_uint64 vf = 1;
3783 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3784 vf = loop_vinfo->vectorization_factor;
3785 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3786 = vect_get_num_vectors (vf * group_size, vector_type);
3787 /* And cost them. */
3788 vect_prologue_cost_for_slp (child, cost_vec);
3789 }
3790
3791 /* If this node or any of its children can't be vectorized, try pruning
3792 the tree here rather than felling the whole thing. */
3793 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3794 {
3795 /* We'll need to revisit this for invariant costing and number
3796 of vectorized stmt setting. */
3797 res = true;
3798 }
3799
3800 return res;
3801 }
3802
3803
3804 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3805 region and that can be vectorized using vectorizable_live_operation
3806 with STMT_VINFO_LIVE_P. Live operations that cannot be handled cause the
3807 scalar code computing them to be retained. */
3808
3809 static void
3810 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3811 slp_instance instance,
3812 stmt_vector_for_cost *cost_vec,
3813 hash_set<stmt_vec_info> &svisited,
3814 hash_set<slp_tree> &visited)
3815 {
3816 if (visited.add (node))
3817 return;
3818
3819 unsigned i;
3820 stmt_vec_info stmt_info;
3821 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
3822 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3823 {
3824 if (svisited.contains (stmt_info))
3825 continue;
3826 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3827 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
3828 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
3829 /* Only the pattern root stmt computes the original scalar value. */
3830 continue;
3831 bool mark_visited = true;
3832 gimple *orig_stmt = orig_stmt_info->stmt;
3833 ssa_op_iter op_iter;
3834 def_operand_p def_p;
3835 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3836 {
3837 imm_use_iterator use_iter;
3838 gimple *use_stmt;
3839 stmt_vec_info use_stmt_info;
3840 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3841 if (!is_gimple_debug (use_stmt))
3842 {
3843 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
3844 if (!use_stmt_info
3845 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3846 {
3847 STMT_VINFO_LIVE_P (stmt_info) = true;
3848 if (vectorizable_live_operation (bb_vinfo, stmt_info,
3849 NULL, node, instance, i,
3850 false, cost_vec))
3851 /* ??? So we know we can vectorize the live stmt
3852 from one SLP node. If we cannot do so from all
3853 or none consistently we'd have to record which
3854 SLP node (and lane) we want to use for the live
3855 operation. So make sure we can code-generate
3856 from all nodes. */
3857 mark_visited = false;
3858 else
3859 STMT_VINFO_LIVE_P (stmt_info) = false;
3860 BREAK_FROM_IMM_USE_STMT (use_iter);
3861 }
3862 }
3863 /* We have to verify whether we can insert the lane extract
3864 before all uses. The following is a conservative approximation.
3865 We cannot put this into vectorizable_live_operation because
3866 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3867 doesn't work.
3868 Note that while the fact that we emit code for loads at the
3869 first load should make this a non-problem, leafs we construct
3870 from scalars are vectorized after the last scalar def.
3871 ??? If we'd actually compute the insert location during
3872 analysis we could use something less conservative than the last
3873 scalar stmt in the node for the dominance check. */
3874 /* ??? What remains is "live" uses in vector CTORs in the same
3875 SLP graph which is where those uses can end up code-generated
3876 right after their definition instead of close to their original
3877 use. But that would restrict us to code-generate lane-extracts
3878 from the latest stmt in a node. So we compensate for this
3879 during code-generation, simply not replacing uses for those
3880 hopefully rare cases. */
3881 if (STMT_VINFO_LIVE_P (stmt_info))
3882 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3883 if (!is_gimple_debug (use_stmt)
3884 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
3885 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3886 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
3887 {
3888 if (dump_enabled_p ())
3889 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3890 "Cannot determine insertion place for "
3891 "lane extract\n");
3892 STMT_VINFO_LIVE_P (stmt_info) = false;
3893 mark_visited = true;
3894 }
3895 }
3896 if (mark_visited)
3897 svisited.add (stmt_info);
3898 }
3899
3900 slp_tree child;
3901 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3902 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3903 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
3904 cost_vec, svisited, visited);
3905 }
3906
3907 /* Analyze statements in SLP instances of VINFO. Return true if the
3908 operations are supported. */
3909
3910 bool
3911 vect_slp_analyze_operations (vec_info *vinfo)
3912 {
3913 slp_instance instance;
3914 int i;
3915
3916 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3917
3918 hash_set<slp_tree> visited;
3919 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
3920 {
3921 auto_vec<slp_tree> visited_vec;
3922 stmt_vector_for_cost cost_vec;
3923 cost_vec.create (2);
3924 if (is_a <bb_vec_info> (vinfo))
3925 vect_location = instance->location ();
3926 if (!vect_slp_analyze_node_operations (vinfo,
3927 SLP_INSTANCE_TREE (instance),
3928 instance, visited, visited_vec,
3929 &cost_vec)
3930 /* Instances with a root stmt require vectorized defs for the
3931 SLP tree root. */
3932 || (SLP_INSTANCE_ROOT_STMT (instance)
3933 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
3934 != vect_internal_def)))
3935 {
3936 slp_tree node = SLP_INSTANCE_TREE (instance);
3937 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3938 if (dump_enabled_p ())
3939 dump_printf_loc (MSG_NOTE, vect_location,
3940 "removing SLP instance operations starting from: %G",
3941 stmt_info->stmt);
3942 vect_free_slp_instance (instance);
3943 vinfo->slp_instances.ordered_remove (i);
3944 cost_vec.release ();
3945 while (!visited_vec.is_empty ())
3946 visited.remove (visited_vec.pop ());
3947 }
3948 else
3949 {
3950 i++;
3951
3952 /* For BB vectorization remember the SLP graph entry
3953 cost for later. */
3954 if (is_a <bb_vec_info> (vinfo))
3955 instance->cost_vec = cost_vec;
3956 else
3957 {
3958 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
3959 cost_vec.release ();
3960 }
3961 }
3962 }
3963
3964 /* Compute vectorizable live stmts. */
3965 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3966 {
3967 hash_set<stmt_vec_info> svisited;
3968 hash_set<slp_tree> visited;
3969 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
3970 {
3971 vect_location = instance->location ();
3972 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
3973 instance, &instance->cost_vec, svisited,
3974 visited);
3975 }
3976 }
3977
3978 return !vinfo->slp_instances.is_empty ();
3979 }
3980
3981 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
3982 transitively compressing the leader chain along the way. */
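/* For example, given a recorded chain I1 -> I2 -> I3 where I3 is its own
   leader, calling this on I1 returns I3 and rewrites the entries for the
   hypothetical instances I1 and I2 to point at I3 directly.  */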
3983
3984 static slp_instance
3985 get_ultimate_leader (slp_instance instance,
3986 hash_map<slp_instance, slp_instance> &instance_leader)
3987 {
3988 auto_vec<slp_instance *, 8> chain;
3989 slp_instance *tem;
3990 while (*(tem = instance_leader.get (instance)) != instance)
3991 {
3992 chain.safe_push (tem);
3993 instance = *tem;
3994 }
3995 while (!chain.is_empty ())
3996 *chain.pop () = instance;
3997 return instance;
3998 }
3999
4000 /* Worker of vect_bb_partition_graph, recurse on NODE. */
4001
4002 static void
4003 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
4004 slp_instance instance, slp_tree node,
4005 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
4006 hash_map<slp_instance, slp_instance> &instance_leader,
4007 hash_set<slp_tree> &visited)
4008 {
4009 stmt_vec_info stmt_info;
4010 unsigned i;
4011
4012 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4013 {
4014 bool existed_p;
4015 slp_instance &stmt_instance
4016 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
4017 if (!existed_p)
4018 ;
4019 else if (stmt_instance != instance)
4020 {
4021 /* If we run into a previously marked stmt, make us the
4022 leader of that stmt's ultimate leader. This keeps the
4023 leader chain acyclic and works even when the current instance
4024 connects two previously independent graph parts. */
4025 slp_instance stmt_leader
4026 = get_ultimate_leader (stmt_instance, instance_leader);
4027 if (stmt_leader != instance)
4028 instance_leader.put (stmt_leader, instance);
4029 }
4030 stmt_instance = instance;
4031 }
4032
4033 if (visited.add (node))
4034 return;
4035
4036 slp_tree child;
4037 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4038 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4039 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
4040 instance_leader, visited);
4041 }
4042
4043 /* Partition the SLP graph into pieces that can be costed independently. */
4044
4045 static void
4046 vect_bb_partition_graph (bb_vec_info bb_vinfo)
4047 {
4048 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
4049
4050 /* First walk the SLP graph assigning each involved scalar stmt a
4051 corresponding SLP graph entry and upon visiting a previously
4052 marked stmt, make the stmt's leader the current SLP graph entry. */
4053 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
4054 hash_map<slp_instance, slp_instance> instance_leader;
4055 hash_set<slp_tree> visited;
4056 slp_instance instance;
4057 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4058 {
4059 instance_leader.put (instance, instance);
4060 vect_bb_partition_graph_r (bb_vinfo,
4061 instance, SLP_INSTANCE_TREE (instance),
4062 stmt_to_instance, instance_leader,
4063 visited);
4064 }
4065
4066 /* Then collect the entries of each independent subgraph. */
4067 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4068 {
4069 slp_instance leader = get_ultimate_leader (instance, instance_leader);
4070 leader->subgraph_entries.safe_push (instance);
4071 if (dump_enabled_p ()
4072 && leader != instance)
4073 dump_printf_loc (MSG_NOTE, vect_location,
4074 "instance %p is leader of %p\n",
4075 leader, instance);
4076 }
4077 }
4078
4079 /* Compute the scalar cost of the SLP node NODE and its children
4080 and record it in COST_VEC. Do not account defs that are marked in
4081 LIFE and update LIFE according to uses of NODE. */
4082
4083 static void
4084 vect_bb_slp_scalar_cost (vec_info *vinfo,
4085 slp_tree node, vec<bool, va_heap> *life,
4086 stmt_vector_for_cost *cost_vec,
4087 hash_set<slp_tree> &visited)
4088 {
4089 unsigned i;
4090 stmt_vec_info stmt_info;
4091 slp_tree child;
4092
4093 if (visited.add (node))
4094 return;
4095
4096 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4097 {
4098 ssa_op_iter op_iter;
4099 def_operand_p def_p;
4100
4101 if ((*life)[i])
4102 continue;
4103
4104 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4105 gimple *orig_stmt = orig_stmt_info->stmt;
4106
4107 /* If there is a non-vectorized use of the defs then the scalar
4108 stmt is kept live in which case we do not account it or any
4109 required defs in the SLP children in the scalar cost. This
4110 way we make the vectorization more costly when compared to
4111 the scalar cost. */
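/* For example, if one lane's def is also consumed by a scalar stmt
   outside the SLP graph, that lane is marked in LIFE and its scalar
   cost is not counted, making vectorization comparatively more costly.  */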
4112 if (!STMT_VINFO_LIVE_P (stmt_info))
4113 {
4114 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4115 {
4116 imm_use_iterator use_iter;
4117 gimple *use_stmt;
4118 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4119 if (!is_gimple_debug (use_stmt))
4120 {
4121 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
4122 if (!use_stmt_info
4123 || !PURE_SLP_STMT
4124 (vect_stmt_to_vectorize (use_stmt_info)))
4125 {
4126 (*life)[i] = true;
4127 BREAK_FROM_IMM_USE_STMT (use_iter);
4128 }
4129 }
4130 }
4131 if ((*life)[i])
4132 continue;
4133 }
4134
4135 /* Count scalar stmts only once. */
4136 if (gimple_visited_p (orig_stmt))
4137 continue;
4138 gimple_set_visited (orig_stmt, true);
4139
4140 vect_cost_for_stmt kind;
4141 if (STMT_VINFO_DATA_REF (orig_stmt_info))
4142 {
4143 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
4144 kind = scalar_load;
4145 else
4146 kind = scalar_store;
4147 }
4148 else if (vect_nop_conversion_p (orig_stmt_info))
4149 continue;
4150 else
4151 kind = scalar_stmt;
4152 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
4153 SLP_TREE_VECTYPE (node), 0, vect_body);
4154 }
4155
4156 auto_vec<bool, 20> subtree_life;
4157 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4158 {
4159 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4160 {
4161 /* Do not directly pass LIFE to the recursive call, copy it to
4162 confine changes in the callee to the current child/subtree. */
4163 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4164 {
4165 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
4166 for (unsigned j = 0;
4167 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
4168 {
4169 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
4170 if (perm.first == i)
4171 subtree_life[perm.second] = (*life)[j];
4172 }
4173 }
4174 else
4175 {
4176 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
4177 subtree_life.safe_splice (*life);
4178 }
4179 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
4180 visited);
4181 subtree_life.truncate (0);
4182 }
4183 }
4184 }
4185
4186 /* Check if vectorization of the basic block is profitable for the
4187 subgraph denoted by SLP_INSTANCES. */
4188
4189 static bool
4190 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
4191 vec<slp_instance> slp_instances)
4192 {
4193 slp_instance instance;
4194 int i;
4195 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
4196 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
4197
4198 void *vect_target_cost_data = init_cost (NULL);
4199
4200 /* Calculate scalar cost and sum the cost for the vector stmts
4201 previously collected. */
4202 stmt_vector_for_cost scalar_costs;
4203 scalar_costs.create (0);
4204 hash_set<slp_tree> visited;
4205 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4206 {
4207 auto_vec<bool, 20> life;
4208 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
4209 true);
4210 vect_bb_slp_scalar_cost (bb_vinfo,
4211 SLP_INSTANCE_TREE (instance),
4212 &life, &scalar_costs, visited);
4213 add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
4214 instance->cost_vec.release ();
4215 }
4216 /* Unset visited flag. */
4217 stmt_info_for_cost *si;
4218 FOR_EACH_VEC_ELT (scalar_costs, i, si)
4219 gimple_set_visited (si->stmt_info->stmt, false);
4220
4221 void *scalar_target_cost_data = init_cost (NULL);
4222 add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
4223 scalar_costs.release ();
4224 unsigned dummy;
4225 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
4226 destroy_cost_data (scalar_target_cost_data);
4227
4228 /* Complete the target-specific vector cost calculation. */
4229 finish_cost (vect_target_cost_data, &vec_prologue_cost,
4230 &vec_inside_cost, &vec_epilogue_cost);
4231 destroy_cost_data (vect_target_cost_data);
4232
4233 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
4234
4235 if (dump_enabled_p ())
4236 {
4237 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4238 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
4239 vec_inside_cost);
4240 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
4241 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
4242 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
4243 }
4244
4245 /* Vectorization is profitable if its cost is not more than the cost of
4246 the scalar version. Note that we err on the vector side for equal cost because
4247 the cost estimate is otherwise quite pessimistic (constant uses are
4248 free on the scalar side but cost a load on the vector side for
4249 example). */
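/* E.g. (hypothetical numbers): vec_inside_cost 10, vec_outside_cost 2 and
   scalar_cost 12 is still considered profitable, while scalar_cost 11 is
   not.  */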
4250 if (vec_outside_cost + vec_inside_cost > scalar_cost)
4251 return false;
4252
4253 return true;
4254 }
4255
4256 /* qsort comparator for lane defs. */
4257
4258 static int
4259 vld_cmp (const void *a_, const void *b_)
4260 {
4261 auto *a = (const std::pair<unsigned, tree> *)a_;
4262 auto *b = (const std::pair<unsigned, tree> *)b_;
4263 return a->first - b->first;
4264 }
4265
4266 /* Return true if USE_STMT is a vector lane insert into VEC and set
4267 *THIS_LANE to the lane number that is set. */
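/* As an illustration, with hypothetical SSA names a lane insert looks like
     vec_2 = BIT_INSERT_EXPR <vec_1, val_3, 64>;
   which for 32-bit vector elements sets *THIS_LANE to 2.  */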
4268
4269 static bool
4270 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
4271 {
4272 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
4273 if (!use_ass
4274 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
4275 || (vec
4276 ? gimple_assign_rhs1 (use_ass) != vec
4277 : ((vec = gimple_assign_rhs1 (use_ass)), false))
4278 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
4279 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
4280 || !constant_multiple_p
4281 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
4282 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
4283 this_lane))
4284 return false;
4285 return true;
4286 }
4287
4288 /* Find any vectorizable constructors and add them to the grouped_store
4289 array. */
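/* Two forms are matched below: a vector CONSTRUCTOR such as the
   hypothetical
     v_5 = {a_1, b_2, c_3, d_4};
   whose elements are all SSA defs from the region, and a chain of
   BIT_INSERT_EXPRs that together set every lane of a vector.  */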
4290
4291 static void
4292 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
4293 {
4294 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
4295 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
4296 !gsi_end_p (gsi); gsi_next (&gsi))
4297 {
4298 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
4299 if (!assign)
4300 continue;
4301
4302 tree rhs = gimple_assign_rhs1 (assign);
4303 if (gimple_assign_rhs_code (assign) == CONSTRUCTOR)
4304 {
4305 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
4306 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
4307 CONSTRUCTOR_NELTS (rhs))
4308 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
4309 || uniform_vector_p (rhs))
4310 continue;
4311
4312 unsigned j;
4313 tree val;
4314 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
4315 if (TREE_CODE (val) != SSA_NAME
4316 || !bb_vinfo->lookup_def (val))
4317 break;
4318 if (j != CONSTRUCTOR_NELTS (rhs))
4319 continue;
4320
4321 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
4322 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
4323 }
4324 else if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR
4325 && VECTOR_TYPE_P (TREE_TYPE (rhs))
4326 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
4327 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
4328 && integer_zerop (gimple_assign_rhs3 (assign))
4329 && useless_type_conversion_p
4330 (TREE_TYPE (TREE_TYPE (rhs)),
4331 TREE_TYPE (gimple_assign_rhs2 (assign)))
4332 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
4333 {
4334 /* We start to match on insert to lane zero but since the
4335 inserts need not be ordered we'd have to search both
4336 the def and the use chains. */
4337 tree vectype = TREE_TYPE (rhs);
4338 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4339 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
4340 auto_sbitmap lanes (nlanes);
4341 bitmap_clear (lanes);
4342 bitmap_set_bit (lanes, 0);
4343 tree def = gimple_assign_lhs (assign);
4344 lane_defs.quick_push
4345 (std::make_pair (0, gimple_assign_rhs2 (assign)));
4346 unsigned lanes_found = 1;
4347 /* Start with the use chains; the last stmt will be the root. */
4348 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
4349 do
4350 {
4351 use_operand_p use_p;
4352 gimple *use_stmt;
4353 if (!single_imm_use (def, &use_p, &use_stmt))
4354 break;
4355 unsigned this_lane;
4356 if (!bb_vinfo->lookup_stmt (use_stmt)
4357 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
4358 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
4359 break;
4360 if (bitmap_bit_p (lanes, this_lane))
4361 break;
4362 lanes_found++;
4363 bitmap_set_bit (lanes, this_lane);
4364 gassign *use_ass = as_a <gassign *> (use_stmt);
4365 lane_defs.quick_push (std::make_pair
4366 (this_lane, gimple_assign_rhs2 (use_ass)));
4367 last = bb_vinfo->lookup_stmt (use_ass);
4368 def = gimple_assign_lhs (use_ass);
4369 }
4370 while (lanes_found < nlanes);
4371 if (lanes_found < nlanes)
4372 {
4373 /* Now search the def chain. */
4374 def = gimple_assign_rhs1 (assign);
4375 do
4376 {
4377 if (TREE_CODE (def) != SSA_NAME
4378 || !has_single_use (def))
4379 break;
4380 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
4381 unsigned this_lane;
4382 if (!bb_vinfo->lookup_stmt (def_stmt)
4383 || !vect_slp_is_lane_insert (def_stmt,
4384 NULL_TREE, &this_lane)
4385 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
4386 break;
4387 if (bitmap_bit_p (lanes, this_lane))
4388 break;
4389 lanes_found++;
4390 bitmap_set_bit (lanes, this_lane);
4391 lane_defs.quick_push (std::make_pair
4392 (this_lane,
4393 gimple_assign_rhs2 (def_stmt)));
4394 def = gimple_assign_rhs1 (def_stmt);
4395 }
4396 while (lanes_found < nlanes);
4397 }
4398 if (lanes_found == nlanes)
4399 {
4400 /* Sort lane_defs by the lane index and register the root. */
4401 lane_defs.qsort (vld_cmp);
4402 vec<stmt_vec_info> stmts;
4403 stmts.create (nlanes);
4404 for (unsigned i = 0; i < nlanes; ++i)
4405 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
4406 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
4407 stmts, last));
4408 }
4409 }
4410 }
4411 }
4412
4413 /* Walk the grouped store chains and replace entries with their
4414 pattern variant if any. */
4415
4416 static void
4417 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
4418 {
4419 stmt_vec_info first_element;
4420 unsigned i;
4421
4422 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4423 {
4424 /* We also have CTORs in this array. */
4425 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
4426 continue;
4427 if (STMT_VINFO_IN_PATTERN_P (first_element))
4428 {
4429 stmt_vec_info orig = first_element;
4430 first_element = STMT_VINFO_RELATED_STMT (first_element);
4431 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
4432 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
4433 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
4434 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
4435 vinfo->grouped_stores[i] = first_element;
4436 }
4437 stmt_vec_info prev = first_element;
4438 while (DR_GROUP_NEXT_ELEMENT (prev))
4439 {
4440 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
4441 if (STMT_VINFO_IN_PATTERN_P (elt))
4442 {
4443 stmt_vec_info orig = elt;
4444 elt = STMT_VINFO_RELATED_STMT (elt);
4445 DR_GROUP_NEXT_ELEMENT (prev) = elt;
4446 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
4447 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
4448 }
4449 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
4450 prev = elt;
4451 }
4452 }
4453 }
4454
4455 /* Check if the region described by BB_VINFO can be vectorized, returning
4456 true if so. When returning false, set FATAL to true if the same failure
4457 would prevent vectorization at other vector sizes, false if it is still
4458 worth trying other sizes. N_STMTS is the number of statements in the
4459 region. */
4460
4461 static bool
4462 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
4463 vec<int> *dataref_groups)
4464 {
4465 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4466
4467 slp_instance instance;
4468 int i;
4469 poly_uint64 min_vf = 2;
4470
4471 /* The first group of checks is independent of the vector size. */
4472 fatal = true;
4473
4474 /* Analyze the data references. */
4475
4476 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
4477 {
4478 if (dump_enabled_p ())
4479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4480 "not vectorized: unhandled data-ref in basic "
4481 "block.\n");
4482 return false;
4483 }
4484
4485 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
4486 {
4487 if (dump_enabled_p ())
4488 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4489 "not vectorized: unhandled data access in "
4490 "basic block.\n");
4491 return false;
4492 }
4493
4494 vect_slp_check_for_constructors (bb_vinfo);
4495
4496 /* If there are no grouped stores and no constructors in the region
4497 there is no need to continue with pattern recog as vect_analyze_slp
4498 will fail anyway. */
4499 if (bb_vinfo->grouped_stores.is_empty ()
4500 && bb_vinfo->roots.is_empty ())
4501 {
4502 if (dump_enabled_p ())
4503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4504 "not vectorized: no grouped stores in "
4505 "basic block.\n");
4506 return false;
4507 }
4508
4509 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are not fatal for other vector sizes. */
4510 fatal = false;
4511
4512 vect_pattern_recog (bb_vinfo);
4513
4514 /* Update store groups from pattern processing. */
4515 vect_fixup_store_groups_with_patterns (bb_vinfo);
4516
4517 /* Check the SLP opportunities in the basic block, analyze and build SLP
4518 trees. */
4519 if (!vect_analyze_slp (bb_vinfo, n_stmts))
4520 {
4521 if (dump_enabled_p ())
4522 {
4523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4524 "Failed to SLP the basic block.\n");
4525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4526 "not vectorized: failed to find SLP opportunities "
4527 "in basic block.\n");
4528 }
4529 return false;
4530 }
4531
4532 /* Optimize permutations. */
4533 vect_optimize_slp (bb_vinfo);
4534
4535 /* Gather the loads reachable from the SLP graph entries. */
4536 vect_gather_slp_loads (bb_vinfo);
4537
4538 vect_record_base_alignments (bb_vinfo);
4539
4540 /* Analyze and verify the alignment of data references and the
4541 dependence in the SLP instances. */
4542 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
4543 {
4544 vect_location = instance->location ();
4545 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
4546 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
4547 {
4548 slp_tree node = SLP_INSTANCE_TREE (instance);
4549 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4550 if (dump_enabled_p ())
4551 dump_printf_loc (MSG_NOTE, vect_location,
4552 "removing SLP instance operations starting from: %G",
4553 stmt_info->stmt);
4554 vect_free_slp_instance (instance);
4555 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
4556 continue;
4557 }
4558
4559 /* Mark all the statements that we want to vectorize as pure SLP and
4560 relevant. */
4561 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4562 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
4563 if (stmt_vec_info root = SLP_INSTANCE_ROOT_STMT (instance))
4564 {
4565 STMT_SLP_TYPE (root) = pure_slp;
4566 if (is_gimple_assign (root->stmt)
4567 && gimple_assign_rhs_code (root->stmt) == BIT_INSERT_EXPR)
4568 {
4569 /* ??? We should probably record the whole vector of
4570 root stmts so we do not have to back-track here... */
4571 for (unsigned n = SLP_TREE_LANES (SLP_INSTANCE_TREE (instance));
4572 n != 1; --n)
4573 {
4574 root = bb_vinfo->lookup_def (gimple_assign_rhs1 (root->stmt));
4575 STMT_SLP_TYPE (root) = pure_slp;
4576 }
4577 }
4578 }
4579
4580 i++;
4581 }
4582 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
4583 return false;
4584
4585 if (!vect_slp_analyze_operations (bb_vinfo))
4586 {
4587 if (dump_enabled_p ())
4588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4589 "not vectorized: bad operation in basic block.\n");
4590 return false;
4591 }
4592
4593 vect_bb_partition_graph (bb_vinfo);
4594
4595 return true;
4596 }
4597
4598 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4599 basic blocks in BBS, returning true on success.
4600 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4601
4602 static bool
4603 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
4604 vec<int> *dataref_groups, unsigned int n_stmts)
4605 {
4606 bb_vec_info bb_vinfo;
4607 auto_vector_modes vector_modes;
4608
4609 /* Autodetect first vector size we try. */
4610 machine_mode next_vector_mode = VOIDmode;
4611 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
4612 unsigned int mode_i = 0;
4613
4614 vec_info_shared shared;
4615
4616 machine_mode autodetected_vector_mode = VOIDmode;
4617 while (1)
4618 {
4619 bool vectorized = false;
4620 bool fatal = false;
4621 bb_vinfo = new _bb_vec_info (bbs, &shared);
4622
4623 bool first_time_p = shared.datarefs.is_empty ();
4624 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
4625 if (first_time_p)
4626 bb_vinfo->shared->save_datarefs ();
4627 else
4628 bb_vinfo->shared->check_datarefs ();
4629 bb_vinfo->vector_mode = next_vector_mode;
4630
4631 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
4632 {
4633 if (dump_enabled_p ())
4634 {
4635 dump_printf_loc (MSG_NOTE, vect_location,
4636 "***** Analysis succeeded with vector mode"
4637 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4638 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4639 }
4640
4641 bb_vinfo->shared->check_datarefs ();
4642
4643 unsigned i;
4644 slp_instance instance;
4645 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4646 {
4647 if (instance->subgraph_entries.is_empty ())
4648 continue;
4649
4650 vect_location = instance->location ();
4651 if (!unlimited_cost_model (NULL)
4652 && !vect_bb_vectorization_profitable_p
4653 (bb_vinfo, instance->subgraph_entries))
4654 {
4655 if (dump_enabled_p ())
4656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4657 "not vectorized: vectorization is not "
4658 "profitable.\n");
4659 continue;
4660 }
4661
4662 if (!dbg_cnt (vect_slp))
4663 continue;
4664
4665 if (!vectorized && dump_enabled_p ())
4666 dump_printf_loc (MSG_NOTE, vect_location,
4667 "Basic block will be vectorized "
4668 "using SLP\n");
4669 vectorized = true;
4670
4671 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4672
4673 unsigned HOST_WIDE_INT bytes;
4674 if (dump_enabled_p ())
4675 {
4676 if (GET_MODE_SIZE
4677 (bb_vinfo->vector_mode).is_constant (&bytes))
4678 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4679 "basic block part vectorized using %wu "
4680 "byte vectors\n", bytes);
4681 else
4682 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4683 "basic block part vectorized using "
4684 "variable length vectors\n");
4685 }
4686 }
4687 }
4688 else
4689 {
4690 if (dump_enabled_p ())
4691 dump_printf_loc (MSG_NOTE, vect_location,
4692 "***** Analysis failed with vector mode %s\n",
4693 GET_MODE_NAME (bb_vinfo->vector_mode));
4694 }
4695
4696 if (mode_i == 0)
4697 autodetected_vector_mode = bb_vinfo->vector_mode;
4698
4699 if (!fatal)
4700 while (mode_i < vector_modes.length ()
4701 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
4702 {
4703 if (dump_enabled_p ())
4704 dump_printf_loc (MSG_NOTE, vect_location,
4705 "***** The result for vector mode %s would"
4706 " be the same\n",
4707 GET_MODE_NAME (vector_modes[mode_i]));
4708 mode_i += 1;
4709 }
4710
4711 delete bb_vinfo;
4712
4713 if (mode_i < vector_modes.length ()
4714 && VECTOR_MODE_P (autodetected_vector_mode)
4715 && (related_vector_mode (vector_modes[mode_i],
4716 GET_MODE_INNER (autodetected_vector_mode))
4717 == autodetected_vector_mode)
4718 && (related_vector_mode (autodetected_vector_mode,
4719 GET_MODE_INNER (vector_modes[mode_i]))
4720 == vector_modes[mode_i]))
4721 {
4722 if (dump_enabled_p ())
4723 dump_printf_loc (MSG_NOTE, vect_location,
4724 "***** Skipping vector mode %s, which would"
4725 " repeat the analysis for %s\n",
4726 GET_MODE_NAME (vector_modes[mode_i]),
4727 GET_MODE_NAME (autodetected_vector_mode));
4728 mode_i += 1;
4729 }
4730
4731 if (vectorized
4732 || mode_i == vector_modes.length ()
4733 || autodetected_vector_mode == VOIDmode
4734 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4735 vector sizes will fail do not bother iterating. */
4736 || fatal)
4737 return vectorized;
4738
4739 /* Try the next biggest vector size. */
4740 next_vector_mode = vector_modes[mode_i++];
4741 if (dump_enabled_p ())
4742 dump_printf_loc (MSG_NOTE, vect_location,
4743 "***** Re-trying analysis with vector mode %s\n",
4744 GET_MODE_NAME (next_vector_mode));
4745 }
4746 }
4747
4748
4749 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
4750 true if anything in the basic-block was vectorized. */
4751
4752 static bool
4753 vect_slp_bbs (vec<basic_block> bbs)
4754 {
4755 vec<data_reference_p> datarefs = vNULL;
4756 auto_vec<int> dataref_groups;
4757 int insns = 0;
4758 int current_group = 0;
4759
4760 for (unsigned i = 0; i < bbs.length (); i++)
4761 {
4762 basic_block bb = bbs[i];
4763 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
4764 gsi_next (&gsi))
4765 {
4766 gimple *stmt = gsi_stmt (gsi);
4767 if (is_gimple_debug (stmt))
4768 continue;
4769
4770 insns++;
4771
4772 if (gimple_location (stmt) != UNKNOWN_LOCATION)
4773 vect_location = stmt;
4774
4775 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
4776 &dataref_groups, current_group))
4777 ++current_group;
4778 }
4779 }
4780
4781 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
4782 }
4783
4784 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4785 true if anything in the basic-block was vectorized. */
4786
4787 bool
4788 vect_slp_bb (basic_block bb)
4789 {
4790 auto_vec<basic_block> bbs;
4791 bbs.safe_push (bb);
4792 return vect_slp_bbs (bbs);
4793 }
4794
4795 /* Main entry for the BB vectorizer when run on a whole function. Analyze
4796 and transform FUN's basic blocks, returns true if anything was vectorized. */
4797
4798 bool
4799 vect_slp_function (function *fun)
4800 {
4801 bool r = false;
4802 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
4803 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
4804
4805 /* For the moment split the function into pieces to avoid making
4806 the iteration on the vector mode moot. Split at points we know
4807 to not handle well, which are CFG merges (SLP discovery doesn't
4808 handle non-loop-header PHIs) and loop exits. Since pattern
4809 recog requires reverse iteration to visit uses before defs
4810 simply chop RPO into pieces. */
4811 auto_vec<basic_block> bbs;
4812 for (unsigned i = 0; i < n; i++)
4813 {
4814 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
4815 bool split = false;
4816
4817 /* Split when a BB is not dominated by the first block. */
4818 if (!bbs.is_empty ()
4819 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
4820 {
4821 if (dump_enabled_p ())
4822 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4823 "splitting region at dominance boundary bb%d\n",
4824 bb->index);
4825 split = true;
4826 }
4827 /* Split when the loop determined by the first block
4828 is exited. This is because we eventually insert
4829 invariants at region begin. */
4830 else if (!bbs.is_empty ()
4831 && bbs[0]->loop_father != bb->loop_father
4832 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
4833 {
4834 if (dump_enabled_p ())
4835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4836 "splitting region at loop %d exit at bb%d\n",
4837 bbs[0]->loop_father->num, bb->index);
4838 split = true;
4839 }
4840
4841 if (split && !bbs.is_empty ())
4842 {
4843 r |= vect_slp_bbs (bbs);
4844 bbs.truncate (0);
4845 bbs.quick_push (bb);
4846 }
4847 else
4848 bbs.safe_push (bb);
4849
4850 /* When we have a stmt that ends this block and defines a
4851 value, inserting after it for a vector containing its definition
4852 would require inserting on edges. Avoid this for now. */
4853 if (gimple *last = last_stmt (bb))
4854 if (gimple_get_lhs (last)
4855 && is_ctrl_altering_stmt (last))
4856 {
4857 if (dump_enabled_p ())
4858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4859 "splitting region at control altering "
4860 "definition %G", last);
4861 r |= vect_slp_bbs (bbs);
4862 bbs.truncate (0);
4863 }
4864 }
4865
4866 if (!bbs.is_empty ())
4867 r |= vect_slp_bbs (bbs);
4868
4869 free (rpo);
4870
4871 return r;
4872 }
4873
4874 /* Build a variable-length vector in which the elements in ELTS are repeated
4875 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4876 RESULTS and add any new instructions to SEQ.
4877
4878 The approach we use is:
4879
4880 (1) Find a vector mode VM with integer elements of mode IM.
4881
4882 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4883 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4884 from small vectors to IM.
4885
4886 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4887
4888 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4889 correct byte contents.
4890
4891 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4892
4893 We try to find the largest IM for which this sequence works, in order
4894 to cut down on the number of interleaves. */
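/* For illustration (hypothetical element/mode sizes): with
   ELTS = {a, b, c, d} and an IM wide enough to hold two elements,
   ELTS' becomes {ab, cd}; splatting each gives {ab, ab, ...} and
   {cd, cd, ...}, and one interleaving VEC_PERM_EXPR of the two splats
   yields {ab, cd, ab, cd, ...}, i.e. the four original elements
   repeated to fill a variable-length vector.  */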
4895
4896 void
4897 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
4898 vec<tree> elts, unsigned int nresults,
4899 vec<tree> &results)
4900 {
4901 unsigned int nelts = elts.length ();
4902 tree element_type = TREE_TYPE (vector_type);
4903
4904 /* (1) Find a vector mode VM with integer elements of mode IM. */
4905 unsigned int nvectors = 1;
4906 tree new_vector_type;
4907 tree permutes[2];
4908 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
4909 &nvectors, &new_vector_type,
4910 permutes))
4911 gcc_unreachable ();
4912
4913 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4914 unsigned int partial_nelts = nelts / nvectors;
4915 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
4916
4917 tree_vector_builder partial_elts;
4918 auto_vec<tree, 32> pieces (nvectors * 2);
4919 pieces.quick_grow (nvectors * 2);
4920 for (unsigned int i = 0; i < nvectors; ++i)
4921 {
4922 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4923 ELTS' has mode IM. */
4924 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
4925 for (unsigned int j = 0; j < partial_nelts; ++j)
4926 partial_elts.quick_push (elts[i * partial_nelts + j]);
4927 tree t = gimple_build_vector (seq, &partial_elts);
4928 t = gimple_build (seq, VIEW_CONVERT_EXPR,
4929 TREE_TYPE (new_vector_type), t);
4930
4931 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4932 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
4933 }
4934
4935 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4936 correct byte contents.
4937
4938 We need to repeat the following operation log2(nvectors) times:
4939
4940 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4941 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4942
4943 However, if each input repeats every N elements and the VF is
4944 a multiple of N * 2, the HI result is the same as the LO. */
4945 unsigned int in_start = 0;
4946 unsigned int out_start = nvectors;
4947 unsigned int hi_start = nvectors / 2;
4948 /* A bound on the number of outputs needed to produce NRESULTS results
4949 in the final iteration. */
4950 unsigned int noutputs_bound = nvectors * nresults;
4951 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
4952 {
4953 noutputs_bound /= 2;
4954 unsigned int limit = MIN (noutputs_bound, nvectors);
4955 for (unsigned int i = 0; i < limit; ++i)
4956 {
4957 if ((i & 1) != 0
4958 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
4959 2 * in_repeat))
4960 {
4961 pieces[out_start + i] = pieces[out_start + i - 1];
4962 continue;
4963 }
4964
4965 tree output = make_ssa_name (new_vector_type);
4966 tree input1 = pieces[in_start + (i / 2)];
4967 tree input2 = pieces[in_start + (i / 2) + hi_start];
4968 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
4969 input1, input2,
4970 permutes[i & 1]);
4971 gimple_seq_add_stmt (seq, stmt);
4972 pieces[out_start + i] = output;
4973 }
4974 std::swap (in_start, out_start);
4975 }
4976
4977 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4978 results.reserve (nresults);
4979 for (unsigned int i = 0; i < nresults; ++i)
4980 if (i < nvectors)
4981 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
4982 pieces[in_start + i]));
4983 else
4984 results.quick_push (results[i - nvectors]);
4985 }
4986
4987
4988 /* For constant and loop invariant defs in OP_NODE this function creates
4989 vector defs that will be used in the vectorized stmts and stores them
4990 to SLP_TREE_VEC_DEFS of OP_NODE. */
4991
4992 static void
4993 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
4994 {
4995 unsigned HOST_WIDE_INT nunits;
4996 tree vec_cst;
4997 unsigned j, number_of_places_left_in_vector;
4998 tree vector_type;
4999 tree vop;
5000 int group_size = op_node->ops.length ();
5001 unsigned int vec_num, i;
5002 unsigned number_of_copies = 1;
5003 bool constant_p;
5004 gimple_seq ctor_seq = NULL;
5005 auto_vec<tree, 16> permute_results;
5006
5007 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
5008 vector_type = SLP_TREE_VECTYPE (op_node);
5009
5010 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
5011 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
5012 auto_vec<tree> voprnds (number_of_vectors);
5013
5014 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5015 created vectors. It is greater than 1 if unrolling is performed.
5016
5017 For example, we have two scalar operands, s1 and s2 (e.g., group of
5018 strided accesses of size two), while NUNITS is four (i.e., four scalars
5019 of this type can be packed in a vector). The output vector will contain
5020 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5021 will be 2).
5022
5023 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
5024 containing the operands.
5025
5026 For example, NUNITS is four as before, and the group size is 8
5027 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5028 {s5, s6, s7, s8}. */
5029
5030 /* When using duplicate_and_interleave, we just need one element for
5031 each scalar statement. */
5032 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5033 nunits = group_size;
5034
5035 number_of_copies = nunits * number_of_vectors / group_size;
5036
5037 number_of_places_left_in_vector = nunits;
5038 constant_p = true;
5039 tree_vector_builder elts (vector_type, nunits, 1);
5040 elts.quick_grow (nunits);
5041 stmt_vec_info insert_after = NULL;
5042 for (j = 0; j < number_of_copies; j++)
5043 {
5044 tree op;
5045 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
5046 {
5047 /* Create 'vect_ = {op0,op1,...,opn}'. */
5048 number_of_places_left_in_vector--;
5049 tree orig_op = op;
5050 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
5051 {
5052 if (CONSTANT_CLASS_P (op))
5053 {
5054 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
5055 {
5056 /* Can't use VIEW_CONVERT_EXPR for booleans because
5057 of possibly different sizes of scalar value and
5058 vector element. */
5059 if (integer_zerop (op))
5060 op = build_int_cst (TREE_TYPE (vector_type), 0);
5061 else if (integer_onep (op))
5062 op = build_all_ones_cst (TREE_TYPE (vector_type));
5063 else
5064 gcc_unreachable ();
5065 }
5066 else
5067 op = fold_unary (VIEW_CONVERT_EXPR,
5068 TREE_TYPE (vector_type), op);
5069 gcc_assert (op && CONSTANT_CLASS_P (op));
5070 }
5071 else
5072 {
5073 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
5074 gimple *init_stmt;
5075 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
5076 {
5077 tree true_val
5078 = build_all_ones_cst (TREE_TYPE (vector_type));
5079 tree false_val
5080 = build_zero_cst (TREE_TYPE (vector_type));
5081 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
5082 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
5083 op, true_val,
5084 false_val);
5085 }
5086 else
5087 {
5088 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
5089 op);
5090 init_stmt
5091 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
5092 op);
5093 }
5094 gimple_seq_add_stmt (&ctor_seq, init_stmt);
5095 op = new_temp;
5096 }
5097 }
5098 elts[number_of_places_left_in_vector] = op;
5099 if (!CONSTANT_CLASS_P (op))
5100 constant_p = false;
5101 /* For BB vectorization we have to compute an insert location
5102 when a def is inside the analyzed region since we cannot
5103 simply insert at the BB start in this case. */
5104 stmt_vec_info opdef;
5105 if (TREE_CODE (orig_op) == SSA_NAME
5106 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
5107 && is_a <bb_vec_info> (vinfo)
5108 && (opdef = vinfo->lookup_def (orig_op)))
5109 {
5110 if (!insert_after)
5111 insert_after = opdef;
5112 else
5113 insert_after = get_later_stmt (insert_after, opdef);
5114 }
5115
5116 if (number_of_places_left_in_vector == 0)
5117 {
5118 if (constant_p
5119 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5120 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5121 vec_cst = gimple_build_vector (&ctor_seq, &elts);
5122 else
5123 {
5124 if (permute_results.is_empty ())
5125 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
5126 elts, number_of_vectors,
5127 permute_results);
5128 vec_cst = permute_results[number_of_vectors - j - 1];
5129 }
5130 if (!gimple_seq_empty_p (ctor_seq))
5131 {
5132 if (insert_after)
5133 {
5134 gimple_stmt_iterator gsi;
5135 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
5136 {
5137 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
5138 gsi_insert_seq_before (&gsi, ctor_seq,
5139 GSI_CONTINUE_LINKING);
5140 }
5141 else if (!stmt_ends_bb_p (insert_after->stmt))
5142 {
5143 gsi = gsi_for_stmt (insert_after->stmt);
5144 gsi_insert_seq_after (&gsi, ctor_seq,
5145 GSI_CONTINUE_LINKING);
5146 }
5147 else
5148 {
5149 /* When we want to insert after a def whose
5150 defining stmt throws, insert on the fallthru
5151 edge. */
5152 edge e = find_fallthru_edge
5153 (gimple_bb (insert_after->stmt)->succs);
5154 basic_block new_bb
5155 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
5156 gcc_assert (!new_bb);
5157 }
5158 }
5159 else
5160 vinfo->insert_seq_on_entry (NULL, ctor_seq);
5161 ctor_seq = NULL;
5162 }
5163 voprnds.quick_push (vec_cst);
5164 insert_after = NULL;
5165 number_of_places_left_in_vector = nunits;
5166 constant_p = true;
5167 elts.new_vector (vector_type, nunits, 1);
5168 elts.quick_grow (nunits);
5169 }
5170 }
5171 }
5172
5173 /* Since the vectors are created in the reverse order, we should invert
5174 them. */
5175 vec_num = voprnds.length ();
5176 for (j = vec_num; j != 0; j--)
5177 {
5178 vop = voprnds[j - 1];
5179 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5180 }
5181
5182 /* In case that VF is greater than the unrolling factor needed for the SLP
5183 group of stmts, NUMBER_OF_VECTORS to be created is greater than
5184 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
5185 to replicate the vectors. */
5186 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
5187 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
5188 i++)
5189 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5190 }
5191
5192 /* Get the Ith vectorized definition from SLP_NODE. */
5193
5194 tree
5195 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
5196 {
5197 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
5198 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
5199 else
5200 return SLP_TREE_VEC_DEFS (slp_node)[i];
5201 }
5202
5203 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
5204
5205 void
5206 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
5207 {
5208 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
5209 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
5210 {
5211 unsigned j;
5212 gimple *vec_def_stmt;
5213 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
5214 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
5215 }
5216 else
5217 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
5218 }
5219
5220 /* Get N vectorized definitions for SLP_NODE. */
5221
5222 void
5223 vect_get_slp_defs (vec_info *,
5224 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
5225 {
5226 if (n == -1U)
5227 n = SLP_TREE_CHILDREN (slp_node).length ();
5228
5229 for (unsigned i = 0; i < n; ++i)
5230 {
5231 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
5232 vec<tree> vec_defs = vNULL;
5233 vect_get_slp_defs (child, &vec_defs);
5234 vec_oprnds->quick_push (vec_defs);
5235 }
5236 }
5237
5238 /* Generate vector permute statements from a list of loads in DR_CHAIN.
5239 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
5240 permute statements for the SLP node NODE. Store the number of vector
5241 permute instructions in *N_PERMS and the number of vector load
5242 instructions in *N_LOADS. */
5243
5244 bool
5245 vect_transform_slp_perm_load (vec_info *vinfo,
5246 slp_tree node, vec<tree> dr_chain,
5247 gimple_stmt_iterator *gsi, poly_uint64 vf,
5248 bool analyze_only, unsigned *n_perms,
5249 unsigned int *n_loads)
5250 {
5251 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5252 int vec_index = 0;
5253 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5254 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
5255 unsigned int mask_element;
5256 machine_mode mode;
5257
5258 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
5259 return false;
5260
5261 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
5262
5263 mode = TYPE_MODE (vectype);
5264 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5265
5266 /* Initialize the vect stmts of NODE to properly insert the generated
5267 stmts later. */
5268 if (! analyze_only)
5269 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
5270 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
5271 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
5272
5273 /* Generate permutation masks for every NODE. Number of masks for each NODE
5274 is equal to GROUP_SIZE.
5275 E.g., we have a group of three nodes with three loads from the same
5276 location in each node, and the vector size is 4. I.e., we have an
5277 a0b0c0a1b1c1... sequence and we need to create the following vectors:
5278 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
5279 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
5280 ...
5281
5282 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
5283 The last mask is illegal since we assume two operands for a permute
5284 operation, and the mask element values cannot exceed that range.
5285 Hence, the last mask must be converted into {2,5,5,5}.
5286 For the first two permutations we need the first and the second input
5287 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
5288 we need the second and the third vectors: {b1,c1,a2,b2} and
5289 {c2,a3,b3,c3}. */
5290
5291 int vect_stmts_counter = 0;
5292 unsigned int index = 0;
5293 int first_vec_index = -1;
5294 int second_vec_index = -1;
5295 bool noop_p = true;
5296 *n_perms = 0;
5297
5298 vec_perm_builder mask;
5299 unsigned int nelts_to_build;
5300 unsigned int nvectors_per_build;
5301 unsigned int in_nlanes;
5302 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
5303 && multiple_p (nunits, group_size));
5304 if (repeating_p)
5305 {
5306 /* A single vector contains a whole number of copies of the node, so:
5307 (a) all permutes can use the same mask; and
5308 (b) the permutes only need a single vector input. */
5309 mask.new_vector (nunits, group_size, 3);
5310 nelts_to_build = mask.encoded_nelts ();
5311 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
5312 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
5313 }
5314 else
5315 {
5316 /* We need to construct a separate mask for each vector statement. */
5317 unsigned HOST_WIDE_INT const_nunits, const_vf;
5318 if (!nunits.is_constant (&const_nunits)
5319 || !vf.is_constant (&const_vf))
5320 return false;
5321 mask.new_vector (const_nunits, const_nunits, 1);
5322 nelts_to_build = const_vf * group_size;
5323 nvectors_per_build = 1;
5324 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
5325 }
5326 auto_sbitmap used_in_lanes (in_nlanes);
5327 bitmap_clear (used_in_lanes);
5328
5329 unsigned int count = mask.encoded_nelts ();
5330 mask.quick_grow (count);
5331 vec_perm_indices indices;
5332
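/* Fill the permutation mask element by element; each time a full mask
   has been assembled verify it is supported and, unless only analyzing,
   emit the corresponding VEC_PERM_EXPRs.  */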
5333 for (unsigned int j = 0; j < nelts_to_build; j++)
5334 {
5335 unsigned int iter_num = j / group_size;
5336 unsigned int stmt_num = j % group_size;
5337 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
5338 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
5339 bitmap_set_bit (used_in_lanes, i);
5340 if (repeating_p)
5341 {
5342 first_vec_index = 0;
5343 mask_element = i;
5344 }
5345 else
5346 {
5347 /* Enforced before the loop when !repeating_p. */
5348 unsigned int const_nunits = nunits.to_constant ();
5349 vec_index = i / const_nunits;
5350 mask_element = i % const_nunits;
5351 if (vec_index == first_vec_index
5352 || first_vec_index == -1)
5353 {
5354 first_vec_index = vec_index;
5355 }
5356 else if (vec_index == second_vec_index
5357 || second_vec_index == -1)
5358 {
5359 second_vec_index = vec_index;
5360 mask_element += const_nunits;
5361 }
5362 else
5363 {
5364 if (dump_enabled_p ())
5365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5366 "permutation requires at "
5367 "least three vectors %G",
5368 stmt_info->stmt);
5369 gcc_assert (analyze_only);
5370 return false;
5371 }
5372
5373 gcc_assert (mask_element < 2 * const_nunits);
5374 }
5375
5376 if (mask_element != index)
5377 noop_p = false;
5378 mask[index++] = mask_element;
5379
5380 if (index == count && !noop_p)
5381 {
5382 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
5383 if (!can_vec_perm_const_p (mode, indices))
5384 {
5385 if (dump_enabled_p ())
5386 {
5387 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5388 vect_location,
5389 "unsupported vect permute { ");
5390 for (i = 0; i < count; ++i)
5391 {
5392 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5393 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5394 }
5395 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5396 }
5397 gcc_assert (analyze_only);
5398 return false;
5399 }
5400
5401 ++*n_perms;
5402 }
5403
5404 if (index == count)
5405 {
5406 if (!analyze_only)
5407 {
5408 tree mask_vec = NULL_TREE;
5409
5410 if (! noop_p)
5411 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5412
5413 if (second_vec_index == -1)
5414 second_vec_index = first_vec_index;
5415
5416 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
5417 {
5418 /* Generate the permute statement if necessary. */
5419 tree first_vec = dr_chain[first_vec_index + ri];
5420 tree second_vec = dr_chain[second_vec_index + ri];
5421 gimple *perm_stmt;
5422 if (! noop_p)
5423 {
5424 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5425 tree perm_dest
5426 = vect_create_destination_var (gimple_assign_lhs (stmt),
5427 vectype);
5428 perm_dest = make_ssa_name (perm_dest);
5429 perm_stmt
5430 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5431 first_vec, second_vec,
5432 mask_vec);
5433 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
5434 gsi);
5435 }
5436 else
5437 /* If mask was NULL_TREE generate the requested
5438 identity transform. */
5439 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
5440
5441 /* Store the vector statement in NODE. */
5442 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
5443 }
5444 }
5445
5446 index = 0;
5447 first_vec_index = -1;
5448 second_vec_index = -1;
5449 noop_p = true;
5450 }
5451 }
5452
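/* Compute the number of vector loads required: for the repeating case
   one per vector stmt, otherwise the number of input vectors containing
   at least one used lane.  */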
5453 if (n_loads)
5454 {
5455 if (repeating_p)
5456 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5457 else
5458 {
5459 /* Enforced above when !repeating_p. */
5460 unsigned int const_nunits = nunits.to_constant ();
5461 *n_loads = 0;
5462 bool load_seen = false;
5463 for (unsigned i = 0; i < in_nlanes; ++i)
5464 {
5465 if (i % const_nunits == 0)
5466 {
5467 if (load_seen)
5468 *n_loads += 1;
5469 load_seen = false;
5470 }
5471 if (bitmap_bit_p (used_in_lanes, i))
5472 load_seen = true;
5473 }
5474 if (load_seen)
5475 *n_loads += 1;
5476 }
5477 }
5478
5479 return true;
5480 }
5481
5482
5483 /* Vectorize the SLP permutations in NODE as specified
5484 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5485 child number and lane number.
5486 Interleaving of two two-lane two-child SLP subtrees (not supported):
5487 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5488 A blend of two four-lane two-child SLP subtrees:
5489 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5490 Highpart of a four-lane one-child SLP subtree (not supported):
5491 [ { 0, 2 }, { 0, 3 } ]
5492 Currently only a subset of this is supported by the code generation below. */
5493
5494 static bool
5495 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5496 slp_tree node, stmt_vector_for_cost *cost_vec)
5497 {
5498 tree vectype = SLP_TREE_VECTYPE (node);
5499
5500 /* ??? We currently only support identical vector input and output types
5501 while the SLP IL should really do a concat + select and thus accept
5502 arbitrary mismatches. */
5503 slp_tree child;
5504 unsigned i;
5505 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5506 if (!vect_maybe_update_slp_op_vectype (child, vectype)
5507 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
5508 {
5509 if (dump_enabled_p ())
5510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5511 "Unsupported lane permutation\n");
5512 return false;
5513 }
5514
5515 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
5516 gcc_assert (perm.length () == SLP_TREE_LANES (node));
5517 if (dump_enabled_p ())
5518 {
5519 dump_printf_loc (MSG_NOTE, vect_location,
5520 "vectorizing permutation");
5521 for (unsigned i = 0; i < perm.length (); ++i)
5522 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
5523 dump_printf (MSG_NOTE, "\n");
5524 }
5525
5526 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5527 if (!nunits.is_constant ())
5528 return false;
5529 unsigned HOST_WIDE_INT vf = 1;
5530 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
5531 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
5532 return false;
5533 unsigned olanes = vf * SLP_TREE_LANES (node);
5534 gcc_assert (multiple_p (olanes, nunits));
5535
5536 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
5537 from the { SLP operand, scalar lane } permutation as recorded in the
5538 SLP node as an intermediate step. This part should already work
5539 with SLP children with an arbitrary number of lanes. */
5540 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
5541 auto_vec<unsigned> active_lane;
5542 vperm.create (olanes);
5543 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
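/* active_lane[i] tracks the scalar lane offset into child I for the
   current unroll copy.  */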
5544 for (unsigned i = 0; i < vf; ++i)
5545 {
5546 for (unsigned pi = 0; pi < perm.length (); ++pi)
5547 {
5548 std::pair<unsigned, unsigned> p = perm[pi];
5549 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
5550 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
5551 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
5552 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
5553 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
5554 }
5555 /* Advance to the next group. */
5556 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
5557 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
5558 }
5559
5560 if (dump_enabled_p ())
5561 {
5562 dump_printf_loc (MSG_NOTE, vect_location, "as");
5563 for (unsigned i = 0; i < vperm.length (); ++i)
5564 {
5565 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
5566 dump_printf (MSG_NOTE, ",");
5567 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
5568 vperm[i].first.first, vperm[i].first.second,
5569 vperm[i].second);
5570 }
5571 dump_printf (MSG_NOTE, "\n");
5572 }
5573
5574 /* We can only handle two-vector permutes, everything else should
5575 be lowered on the SLP level. The following is closely inspired
5576 by vect_transform_slp_perm_load and is supposed to eventually
5577 replace it.
5578 ??? As an intermediate step do code-gen in the SLP tree representation
5579 somehow? */
5580 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
5581 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
5582 unsigned int const_nunits = nunits.to_constant ();
5583 unsigned int index = 0;
5584 unsigned int mask_element;
5585 vec_perm_builder mask;
5586 mask.new_vector (const_nunits, const_nunits, 1);
5587 unsigned int count = mask.encoded_nelts ();
5588 mask.quick_grow (count);
5589 vec_perm_indices indices;
5590 unsigned nperms = 0;
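/* Walk the { { operand, vector index }, lane } sequence, assembling masks
   of CONST_NUNITS elements and emitting one permute (or copy) per output
   vector.  */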
5591 for (unsigned i = 0; i < vperm.length (); ++i)
5592 {
5593 mask_element = vperm[i].second;
5594 if (first_vec.first == -1U
5595 || first_vec == vperm[i].first)
5596 first_vec = vperm[i].first;
5597 else if (second_vec.first == -1U
5598 || second_vec == vperm[i].first)
5599 {
5600 second_vec = vperm[i].first;
5601 mask_element += const_nunits;
5602 }
5603 else
5604 {
5605 if (dump_enabled_p ())
5606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5607 "permutation requires at "
5608 "least three vectors\n");
5609 gcc_assert (!gsi);
5610 return false;
5611 }
5612
5613 mask[index++] = mask_element;
5614
5615 if (index == count)
5616 {
5617 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
5618 const_nunits);
5619 bool identity_p = indices.series_p (0, 1, 0, 1);
5620 if (!identity_p
5621 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5622 {
5623 if (dump_enabled_p ())
5624 {
5625 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5626 vect_location,
5627 "unsupported vect permute { ");
5628 for (i = 0; i < count; ++i)
5629 {
5630 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5631 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5632 }
5633 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5634 }
5635 gcc_assert (!gsi);
5636 return false;
5637 }
5638
5639 if (!identity_p)
5640 nperms++;
5641 if (gsi)
5642 {
5643 if (second_vec.first == -1U)
5644 second_vec = first_vec;
5645
5646 /* Generate the permute statement if necessary. */
5647 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
5648 tree first_def
5649 = vect_get_slp_vect_def (first_node, first_vec.second);
5650 gassign *perm_stmt;
5651 tree perm_dest = make_ssa_name (vectype);
5652 if (!identity_p)
5653 {
5654 slp_tree second_node
5655 = SLP_TREE_CHILDREN (node)[second_vec.first];
5656 tree second_def
5657 = vect_get_slp_vect_def (second_node, second_vec.second);
5658 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5659 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5660 first_def, second_def,
5661 mask_vec);
5662 }
5663 else
5664 /* We need a copy here in case the def was external. */
5665 perm_stmt = gimple_build_assign (perm_dest, first_def);
5666 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
5667 /* Store the vector statement in NODE. */
5668 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
5669 }
5670
5671 index = 0;
5672 first_vec = std::make_pair (-1U, -1U);
5673 second_vec = std::make_pair (-1U, -1U);
5674 }
5675 }
5676
5677 if (!gsi)
5678 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
5679
5680 return true;
5681 }
5682
5683 /* Vectorize SLP NODE. */
5684
5685 static void
5686 vect_schedule_slp_node (vec_info *vinfo,
5687 slp_tree node, slp_instance instance)
5688 {
5689 gimple_stmt_iterator si;
5690 int i;
5691 slp_tree child;
5692
5693 /* For existing vectors there's nothing to do. */
5694 if (SLP_TREE_VEC_DEFS (node).exists ())
5695 return;
5696
5697 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
5698
5699 /* Vectorize externals and constants. */
5700 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5701 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5702 {
5703 /* ??? vectorizable_shift can end up using a scalar operand which is
5704 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5705 node in this case. */
5706 if (!SLP_TREE_VECTYPE (node))
5707 return;
5708
5709 vect_create_constant_vectors (vinfo, node);
5710 return;
5711 }
5712
5713 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5714
5715 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
5716 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
5717
5718 if (dump_enabled_p ())
5719 dump_printf_loc (MSG_NOTE, vect_location,
5720 "------>vectorizing SLP node starting from: %G",
5721 stmt_info->stmt);
5722
5723 if (STMT_VINFO_DATA_REF (stmt_info)
5724 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5725 {
5726 /* Vectorized loads go before the first scalar load to make it
5727 ready early, vectorized stores go before the last scalar
5728 stmt which is where all uses are ready. */
5729 stmt_vec_info last_stmt_info = NULL;
5730 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
5731 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
5732 else /* DR_IS_WRITE */
5733 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
5734 si = gsi_for_stmt (last_stmt_info->stmt);
5735 }
5736 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
5737 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
5738 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
5739 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5740 {
5741 /* For PHI node vectorization we do not use the insertion iterator. */
5742 si = gsi_none ();
5743 }
5744 else
5745 {
5746 /* Emit other stmts after the children's vectorized defs, which is
5747 the earliest possible place. */
5748 gimple *last_stmt = NULL;
5749 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5750 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5751 {
5752 /* For fold-left reductions we are retaining the scalar
5753 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
5754 set so the representation isn't perfect. Resort to the
5755 last scalar def here. */
5756 if (SLP_TREE_VEC_STMTS (child).is_empty ())
5757 {
5758 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
5759 == cycle_phi_info_type);
5760 gphi *phi = as_a <gphi *>
5761 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
5762 if (!last_stmt
5763 || vect_stmt_dominates_stmt_p (last_stmt, phi))
5764 last_stmt = phi;
5765 }
5766 /* We are emitting all vectorized stmts in the same place, so
5767 the last one emitted is the latest def.
5768 ??? Unless we have a load permutation applied and that
5769 happens to re-use an earlier generated load. */
5770 unsigned j;
5771 gimple *vstmt;
5772 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
5773 if (!last_stmt
5774 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5775 last_stmt = vstmt;
5776 }
5777 else if (!SLP_TREE_VECTYPE (child))
5778 {
5779 /* For externals used unvectorized we have to look at all their scalar defs. */
5780 unsigned j;
5781 tree def;
5782 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
5783 if (TREE_CODE (def) == SSA_NAME
5784 && !SSA_NAME_IS_DEFAULT_DEF (def))
5785 {
5786 gimple *stmt = SSA_NAME_DEF_STMT (def);
5787 if (!last_stmt
5788 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
5789 last_stmt = stmt;
5790 }
5791 }
5792 else
5793 {
5794 /* For externals we have to look at all defs since their
5795 insertion place is decided per vector. But beware
5796 of pre-existing vectors where we need to make sure
5797 we do not insert before the region boundary. */
5798 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
5799 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
5800 last_stmt = gsi_stmt (gsi_after_labels
5801 (as_a <bb_vec_info> (vinfo)->bbs[0]));
5802 else
5803 {
5804 unsigned j;
5805 tree vdef;
5806 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
5807 if (TREE_CODE (vdef) == SSA_NAME
5808 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
5809 {
5810 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
5811 if (!last_stmt
5812 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5813 last_stmt = vstmt;
5814 }
5815 }
5816 }
5817 /* This can happen when all children are pre-existing vectors or
5818 constants. */
5819 if (!last_stmt)
5820 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
5821 if (is_a <gphi *> (last_stmt))
5822 si = gsi_after_labels (gimple_bb (last_stmt));
5823 else
5824 {
5825 si = gsi_for_stmt (last_stmt);
5826 gsi_next (&si);
5827 }
5828 }
5829
5830 bool done_p = false;
5831
5832 /* Handle purely internal nodes. */
5833 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5834 {
5835 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
5836 be shared with different SLP nodes (but usually it's the same
5837 operation, apart from the case where the stmt is only there to denote
5838 the actual scalar lane defs ...). So do not call vect_transform_stmt
5839 but open-code it here (partly). */
5840 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
5841 gcc_assert (done);
5842 done_p = true;
5843 }
5844 if (!done_p)
5845 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
5846 }
5847
5848 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
5849 For loop vectorization this is done in vectorizable_call, but for SLP
5850 it needs to be deferred until the end of vect_schedule_slp, because multiple
5851 SLP instances may refer to the same scalar stmt. */
5852
5853 static void
5854 vect_remove_slp_scalar_calls (vec_info *vinfo,
5855 slp_tree node, hash_set<slp_tree> &visited)
5856 {
5857 gimple *new_stmt;
5858 gimple_stmt_iterator gsi;
5859 int i;
5860 slp_tree child;
5861 tree lhs;
5862 stmt_vec_info stmt_info;
5863
5864 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5865 return;
5866
5867 if (visited.add (node))
5868 return;
5869
5870 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5871 vect_remove_slp_scalar_calls (vinfo, child, visited);
5872
5873 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5874 {
5875 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
5876 if (!stmt || gimple_bb (stmt) == NULL)
5877 continue;
5878 if (is_pattern_stmt_p (stmt_info)
5879 || !PURE_SLP_STMT (stmt_info))
5880 continue;
5881 lhs = gimple_call_lhs (stmt);
5882 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
5883 gsi = gsi_for_stmt (stmt);
5884 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
5885 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
5886 }
5887 }
5888
5889 static void
5890 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
5891 {
5892 hash_set<slp_tree> visited;
5893 vect_remove_slp_scalar_calls (vinfo, node, visited);
5894 }
5895
5896 /* Vectorize the instance root. */
5897
5898 void
5899 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
5900 {
5901 gassign *rstmt = NULL;
5902
5903 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
5904 {
5905 gimple *child_stmt;
5906 int j;
5907
5908 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5909 {
5910 tree vect_lhs = gimple_get_lhs (child_stmt);
5911 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
5912 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
5913 TREE_TYPE (vect_lhs)))
5914 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
5915 vect_lhs);
5916 rstmt = gimple_build_assign (root_lhs, vect_lhs);
5917 break;
5918 }
5919 }
5920 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
5921 {
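/* With multiple vector stmts rebuild the root rhs as a CONSTRUCTOR of
   the vectorized defs.  */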
5922 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5923 gimple *child_stmt;
5924 int j;
5925 vec<constructor_elt, va_gc> *v;
5926 vec_alloc (v, nelts);
5927
5928 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5929 {
5930 CONSTRUCTOR_APPEND_ELT (v,
5931 NULL_TREE,
5932 gimple_get_lhs (child_stmt));
5933 }
5934 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
5935 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
5936 tree r_constructor = build_constructor (rtype, v);
5937 rstmt = gimple_build_assign (lhs, r_constructor);
5938 }
5939
5940 gcc_assert (rstmt);
5941
5942 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
5943 gsi_replace (&rgsi, rstmt, true);
5944 }
5945
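/* Per-node state for the Tarjan SCC walk used when scheduling SLP nodes.  */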
5946 struct slp_scc_info
5947 {
5948 bool on_stack;
5949 int dfs;
5950 int lowlink;
5951 };
5952
5953 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
5954
5955 static void
5956 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
5957 hash_map<slp_tree, slp_scc_info> &scc_info,
5958 int &maxdfs, vec<slp_tree> &stack)
5959 {
5960 bool existed_p;
5961 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
5962 gcc_assert (!existed_p);
5963 info->dfs = maxdfs;
5964 info->lowlink = maxdfs;
5965 maxdfs++;
5966
5967 /* Leaf. */
5968 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5969 {
5970 info->on_stack = false;
5971 vect_schedule_slp_node (vinfo, node, instance);
5972 return;
5973 }
5974
5975 info->on_stack = true;
5976 stack.safe_push (node);
5977
5978 unsigned i;
5979 slp_tree child;
5980 /* DFS recurse. */
5981 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5982 {
5983 if (!child)
5984 continue;
5985 slp_scc_info *child_info = scc_info.get (child);
5986 if (!child_info)
5987 {
5988 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
5989 /* Recursion might have re-allocated the hash map, so re-fetch the entries. */
5990 info = scc_info.get (node);
5991 child_info = scc_info.get (child);
5992 info->lowlink = MIN (info->lowlink, child_info->lowlink);
5993 }
5994 else if (child_info->on_stack)
5995 info->lowlink = MIN (info->lowlink, child_info->dfs);
5996 }
5997 if (info->lowlink != info->dfs)
5998 return;
5999
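/* NODE is the root of an SCC; schedule its members and collect the
   vectorized PHIs whose backedge arguments need to be filled in
   afterwards.  */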
6000 auto_vec<slp_tree, 4> phis_to_fixup;
6001
6002 /* Singleton. */
6003 if (stack.last () == node)
6004 {
6005 stack.pop ();
6006 info->on_stack = false;
6007 vect_schedule_slp_node (vinfo, node, instance);
6008 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6009 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
6010 phis_to_fixup.quick_push (node);
6011 }
6012 else
6013 {
6014 /* SCC. */
6015 int last_idx = stack.length () - 1;
6016 while (stack[last_idx] != node)
6017 last_idx--;
6018 /* We can break the cycle at PHIs which have at least one child
6019 code generated. Then we could re-start the DFS walk until
6020 all nodes in the SCC are covered (we might have new entries
6021 for only back-reachable nodes). But it's simpler to just
6022 iterate and schedule those that are ready. */
6023 unsigned todo = stack.length () - last_idx;
6024 do
6025 {
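/* A non-PHI node is ready once none of its children are still on the
   stack; a PHI is ready as soon as at least one child has been
   scheduled, its remaining backedge args are filled in later.  */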
6026 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
6027 {
6028 slp_tree entry = stack[idx];
6029 if (!entry)
6030 continue;
6031 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
6032 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
6033 bool ready = !phi;
6034 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
6035 if (!child)
6036 {
6037 gcc_assert (phi);
6038 ready = true;
6039 break;
6040 }
6041 else if (scc_info.get (child)->on_stack)
6042 {
6043 if (!phi)
6044 {
6045 ready = false;
6046 break;
6047 }
6048 }
6049 else
6050 {
6051 if (phi)
6052 {
6053 ready = true;
6054 break;
6055 }
6056 }
6057 if (ready)
6058 {
6059 vect_schedule_slp_node (vinfo, entry, instance);
6060 scc_info.get (entry)->on_stack = false;
6061 stack[idx] = NULL;
6062 todo--;
6063 if (phi)
6064 phis_to_fixup.safe_push (entry);
6065 }
6066 }
6067 }
6068 while (todo != 0);
6069
6070 /* Pop the SCC. */
6071 stack.truncate (last_idx);
6072 }
6073
6074 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
6075 slp_tree phi_node;
6076 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
6077 {
6078 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
6079 edge_iterator ei;
6080 edge e;
6081 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
6082 {
6083 unsigned dest_idx = e->dest_idx;
6084 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
6085 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
6086 continue;
6087 /* Simply fill all args. */
6088 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
6089 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
6090 vect_get_slp_vect_def (child, i),
6091 e, gimple_phi_arg_location (phi, dest_idx));
6092 }
6093 }
6094 }
6095
6096 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
6097
6098 void
6099 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
6100 {
6101 slp_instance instance;
6102 unsigned int i;
6103
6104 hash_map<slp_tree, slp_scc_info> scc_info;
6105 int maxdfs = 0;
6106 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6107 {
6108 slp_tree node = SLP_INSTANCE_TREE (instance);
6109 if (dump_enabled_p ())
6110 {
6111 dump_printf_loc (MSG_NOTE, vect_location,
6112 "Vectorizing SLP tree:\n");
6113 if (SLP_INSTANCE_ROOT_STMT (instance))
6114 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
6115 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
6116 vect_print_slp_graph (MSG_NOTE, vect_location,
6117 SLP_INSTANCE_TREE (instance));
6118 }
6119 /* Schedule the tree of INSTANCE, scheduling SCCs so that
6120 a PHI is the node breaking the cycle. */
6121 auto_vec<slp_tree> stack;
6122 if (!scc_info.get (node))
6123 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
6124
6125 if (SLP_INSTANCE_ROOT_STMT (instance))
6126 vectorize_slp_instance_root_stmt (node, instance);
6127
6128 if (dump_enabled_p ())
6129 dump_printf_loc (MSG_NOTE, vect_location,
6130 "vectorizing stmts using SLP.\n");
6131 }
6132
6133 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6134 {
6135 slp_tree root = SLP_INSTANCE_TREE (instance);
6136 stmt_vec_info store_info;
6137 unsigned int j;
6138
6139 /* Remove scalar call stmts. Do not do this for basic-block
6140 vectorization as not all uses may be vectorized.
6141 ??? Why should this be necessary? DCE should be able to
6142 remove the stmts itself.
6143 ??? For BB vectorization we can as well remove scalar
6144 stmts starting from the SLP tree root if they have no
6145 uses. */
6146 if (is_a <loop_vec_info> (vinfo))
6147 vect_remove_slp_scalar_calls (vinfo, root);
6148
6149 /* Remove the original scalar stmts of vectorized stores. */
6150 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
6151 {
6152 if (!STMT_VINFO_DATA_REF (store_info)
6153 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
6154 break;
6155
6156 store_info = vect_orig_stmt (store_info);
6157 /* Free the attached stmt_vec_info and remove the stmt. */
6158 vinfo->remove_stmt (store_info);
6159 }
6160 }
6161 }