1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
52
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 slp_tree, stmt_vector_for_cost *);
55
56 static object_allocator<_slp_tree> *slp_tree_pool;
57 static slp_tree slp_first_node;
58
59 void
60 vect_slp_init (void)
61 {
62 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
63 }
64
65 void
66 vect_slp_fini (void)
67 {
68 while (slp_first_node)
69 delete slp_first_node;
70 delete slp_tree_pool;
71 slp_tree_pool = NULL;
72 }
73
74 void *
75 _slp_tree::operator new (size_t n)
76 {
77 gcc_assert (n == sizeof (_slp_tree));
78 return slp_tree_pool->allocate_raw ();
79 }
80
81 void
82 _slp_tree::operator delete (void *node, size_t n)
83 {
84 gcc_assert (n == sizeof (_slp_tree));
85 slp_tree_pool->remove_raw (node);
86 }
87
88
89 /* Initialize a SLP node. */
90
91 _slp_tree::_slp_tree ()
92 {
93 this->prev_node = NULL;
94 if (slp_first_node)
95 slp_first_node->prev_node = this;
96 this->next_node = slp_first_node;
97 slp_first_node = this;
98 SLP_TREE_SCALAR_STMTS (this) = vNULL;
99 SLP_TREE_SCALAR_OPS (this) = vNULL;
100 SLP_TREE_VEC_STMTS (this) = vNULL;
101 SLP_TREE_VEC_DEFS (this) = vNULL;
102 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
103 SLP_TREE_CHILDREN (this) = vNULL;
104 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
105 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
106 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
107 SLP_TREE_CODE (this) = ERROR_MARK;
108 SLP_TREE_VECTYPE (this) = NULL_TREE;
109 SLP_TREE_REPRESENTATIVE (this) = NULL;
110 SLP_TREE_REF_COUNT (this) = 1;
111 this->max_nunits = 1;
112 this->lanes = 0;
113 }
114
115 /* Tear down a SLP node. */
116
117 _slp_tree::~_slp_tree ()
118 {
119 if (this->prev_node)
120 this->prev_node->next_node = this->next_node;
121 else
122 slp_first_node = this->next_node;
123 if (this->next_node)
124 this->next_node->prev_node = this->prev_node;
125 SLP_TREE_CHILDREN (this).release ();
126 SLP_TREE_SCALAR_STMTS (this).release ();
127 SLP_TREE_SCALAR_OPS (this).release ();
128 SLP_TREE_VEC_STMTS (this).release ();
129 SLP_TREE_VEC_DEFS (this).release ();
130 SLP_TREE_LOAD_PERMUTATION (this).release ();
131 SLP_TREE_LANE_PERMUTATION (this).release ();
132 }
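/* Illustration: a minimal, self-contained C++ sketch (not part of GCC) of the
   two ideas used above -- a class-specific operator new/delete that recycles
   node memory through a pool, and an intrusive doubly-linked list of all live
   nodes so a final cleanup pass can reclaim leftovers the way vect_slp_fini
   does.  All names below (example_node, live_head, free_list, example_fini)
   are made up for the example; the real code uses object_allocator instead
   of the toy free list.

     #include <cassert>
     #include <cstddef>
     #include <vector>

     struct example_node
     {
       example_node *prev = nullptr, *next = nullptr;  // intrusive live list
       static inline example_node *live_head = nullptr;
       static inline std::vector<void *> free_list;    // stand-in for the pool

       example_node ()
       {
         // Link onto the list of live nodes, as _slp_tree::_slp_tree does.
         next = live_head;
         if (live_head)
           live_head->prev = this;
         live_head = this;
       }
       ~example_node ()
       {
         // Unlink, as _slp_tree::~_slp_tree does.
         if (prev) prev->next = next; else live_head = next;
         if (next) next->prev = prev;
       }
       void *operator new (size_t n)
       {
         assert (n == sizeof (example_node));
         if (free_list.empty ())
           return ::operator new (n);
         void *p = free_list.back ();
         free_list.pop_back ();
         return p;
       }
       void operator delete (void *p, size_t) { free_list.push_back (p); }
     };

     // Analogue of vect_slp_fini: destroy whatever is still live.
     static void example_fini ()
     {
       while (example_node::live_head)
         delete example_node::live_head;
     }
*/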
133
134 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
135
136 static void
137 vect_free_slp_tree (slp_tree node)
138 {
139 int i;
140 slp_tree child;
141
142 if (--SLP_TREE_REF_COUNT (node) != 0)
143 return;
144
145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
146 if (child)
147 vect_free_slp_tree (child);
148
149 delete node;
150 }
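/* Illustration: the release discipline used by vect_free_slp_tree, in
   isolation.  A node can be shared by several parents, so freeing only
   recurses into the children once the last reference is dropped.  A
   standalone sketch with made-up types (rc_node, release):

     #include <vector>

     struct rc_node
     {
       unsigned refcnt = 1;
       std::vector<rc_node *> children;   // entries may be null
     };

     static void release (rc_node *node)
     {
       if (--node->refcnt != 0)           // still referenced elsewhere
         return;
       for (rc_node *child : node->children)
         if (child)
           release (child);
       delete node;
     }
*/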
151
152 /* Return a location suitable for dumps related to the SLP instance. */
153
154 dump_user_location_t
155 _slp_instance::location () const
156 {
157 if (root_stmt)
158 return root_stmt->stmt;
159 else
160 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
161 }
162
163
164 /* Free the memory allocated for the SLP instance. */
165
166 void
167 vect_free_slp_instance (slp_instance instance)
168 {
169 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
170 SLP_INSTANCE_LOADS (instance).release ();
171 instance->subgraph_entries.release ();
172 instance->cost_vec.release ();
173 free (instance);
174 }
175
176
177 /* Create an SLP node for SCALAR_STMTS. */
178
179 static slp_tree
180 vect_create_new_slp_node (slp_tree node,
181 vec<stmt_vec_info> scalar_stmts, unsigned nops)
182 {
183 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
184 SLP_TREE_CHILDREN (node).create (nops);
185 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
186 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
187 SLP_TREE_LANES (node) = scalar_stmts.length ();
188 return node;
189 }
190
191 /* Create an SLP node for SCALAR_STMTS. */
192
193 static slp_tree
194 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
195 {
196 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
197 }
198
199 /* Create an SLP node for OPS. */
200
201 static slp_tree
202 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
203 {
204 SLP_TREE_SCALAR_OPS (node) = ops;
205 SLP_TREE_DEF_TYPE (node) = vect_external_def;
206 SLP_TREE_LANES (node) = ops.length ();
207 return node;
208 }
209
210 /* Create an SLP node for OPS. */
211
212 static slp_tree
213 vect_create_new_slp_node (vec<tree> ops)
214 {
215 return vect_create_new_slp_node (new _slp_tree, ops);
216 }
217
218
219 /* This structure is used in creation of an SLP tree. Each instance
220 corresponds to the same operand in a group of scalar stmts in an SLP
221 node. */
222 typedef struct _slp_oprnd_info
223 {
224 /* Def-stmts for the operands. */
225 vec<stmt_vec_info> def_stmts;
226 /* Operands. */
227 vec<tree> ops;
228 /* Information about the first statement, its vector def-type, type, the
229 operand itself in case it's constant, and an indication if it's a pattern
230 stmt. */
231 tree first_op_type;
232 enum vect_def_type first_dt;
233 bool any_pattern;
234 } *slp_oprnd_info;
235
236
237 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
238 operand. */
239 static vec<slp_oprnd_info>
240 vect_create_oprnd_info (int nops, int group_size)
241 {
242 int i;
243 slp_oprnd_info oprnd_info;
244 vec<slp_oprnd_info> oprnds_info;
245
246 oprnds_info.create (nops);
247 for (i = 0; i < nops; i++)
248 {
249 oprnd_info = XNEW (struct _slp_oprnd_info);
250 oprnd_info->def_stmts.create (group_size);
251 oprnd_info->ops.create (group_size);
252 oprnd_info->first_dt = vect_uninitialized_def;
253 oprnd_info->first_op_type = NULL_TREE;
254 oprnd_info->any_pattern = false;
255 oprnds_info.quick_push (oprnd_info);
256 }
257
258 return oprnds_info;
259 }
260
261
262 /* Free operands info. */
263
264 static void
265 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
266 {
267 int i;
268 slp_oprnd_info oprnd_info;
269
270 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
271 {
272 oprnd_info->def_stmts.release ();
273 oprnd_info->ops.release ();
274 XDELETE (oprnd_info);
275 }
276
277 oprnds_info.release ();
278 }
279
280
281 /* Return true if STMTS contains a pattern statement. */
282
283 static bool
284 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
285 {
286 stmt_vec_info stmt_info;
287 unsigned int i;
288 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
289 if (is_pattern_stmt_p (stmt_info))
290 return true;
291 return false;
292 }
293
294 /* Return true when all lanes in the external or constant NODE have
295 the same value. */
296
297 static bool
298 vect_slp_tree_uniform_p (slp_tree node)
299 {
300 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
301 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
302
303 /* Pre-existing vectors. */
304 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
305 return false;
306
307 unsigned i;
308 tree op, first = NULL_TREE;
309 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
310 if (!first)
311 first = op;
312 else if (!operand_equal_p (first, op, 0))
313 return false;
314
315 return true;
316 }
317
318 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
319 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
320 of the chain. */
321
322 int
323 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
324 stmt_vec_info first_stmt_info)
325 {
326 stmt_vec_info next_stmt_info = first_stmt_info;
327 int result = 0;
328
329 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
330 return -1;
331
332 do
333 {
334 if (next_stmt_info == stmt_info)
335 return result;
336 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
337 if (next_stmt_info)
338 result += DR_GROUP_GAP (next_stmt_info);
339 }
340 while (next_stmt_info);
341
342 return -1;
343 }
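/* Illustration: the walk above in plain form.  A group is a singly linked
   chain of accesses and each member records the gap (in scalar elements)
   to its predecessor, so the lane of a member is the sum of the gaps seen
   while walking from the group leader.  Standalone sketch with made-up
   types (group_elt, place_in_chain):

     struct group_elt
     {
       group_elt *next;   // DR_GROUP_NEXT_ELEMENT
       int gap;           // DR_GROUP_GAP: distance from the previous member
     };

     // Return the lane of ELT within the chain headed by FIRST, or -1.
     static int place_in_chain (group_elt *first, group_elt *elt)
     {
       int place = 0;
       for (group_elt *p = first; p;)
         {
           if (p == elt)
             return place;
           p = p->next;
           if (p)
             place += p->gap;
         }
       return -1;
     }
*/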
344
345 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
346 using the method implemented by duplicate_and_interleave. Return true
347 if so, returning the number of intermediate vectors in *NVECTORS_OUT
348 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
349 (if nonnull). */
350
351 bool
352 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
353 tree elt_type, unsigned int *nvectors_out,
354 tree *vector_type_out,
355 tree *permutes)
356 {
357 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
358 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
359 return false;
360
361 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
362 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
363 unsigned int nvectors = 1;
364 for (;;)
365 {
366 scalar_int_mode int_mode;
367 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
368 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
369 {
370 /* Get the natural vector type for this SLP group size. */
371 tree int_type = build_nonstandard_integer_type
372 (GET_MODE_BITSIZE (int_mode), 1);
373 tree vector_type
374 = get_vectype_for_scalar_type (vinfo, int_type, count);
375 if (vector_type
376 && VECTOR_MODE_P (TYPE_MODE (vector_type))
377 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
378 GET_MODE_SIZE (base_vector_mode)))
379 {
380 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
381 together into elements of type INT_TYPE and using the result
382 to build NVECTORS vectors. */
383 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
384 vec_perm_builder sel1 (nelts, 2, 3);
385 vec_perm_builder sel2 (nelts, 2, 3);
386 poly_int64 half_nelts = exact_div (nelts, 2);
387 for (unsigned int i = 0; i < 3; ++i)
388 {
389 sel1.quick_push (i);
390 sel1.quick_push (i + nelts);
391 sel2.quick_push (half_nelts + i);
392 sel2.quick_push (half_nelts + i + nelts);
393 }
394 vec_perm_indices indices1 (sel1, 2, nelts);
395 vec_perm_indices indices2 (sel2, 2, nelts);
396 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
397 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
398 {
399 if (nvectors_out)
400 *nvectors_out = nvectors;
401 if (vector_type_out)
402 *vector_type_out = vector_type;
403 if (permutes)
404 {
405 permutes[0] = vect_gen_perm_mask_checked (vector_type,
406 indices1);
407 permutes[1] = vect_gen_perm_mask_checked (vector_type,
408 indices2);
409 }
410 return true;
411 }
412 }
413 }
414 if (!multiple_p (elt_bytes, 2, &elt_bytes))
415 return false;
416 nvectors *= 2;
417 }
418 }
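/* Illustration: for a vector mode with nelts == 8 the two permutations built
   above are the usual low/high interleaves of two input vectors (numbering
   the second input's elements 8..15):

       indices1 = { 0, 8, 1, 9, 2, 10, 3, 11 }     // low halves zipped
       indices2 = { 4, 12, 5, 13, 6, 14, 7, 15 }   // high halves zipped

   The builders store only two patterns of three elements each (the
   (nelts, 2, 3) encoding); the full low-interleave series can be expanded
   as in this standalone sketch, where expand_lo_interleave is a made-up
   helper:

     #include <vector>

     static std::vector<unsigned> expand_lo_interleave (unsigned nelts)
     {
       std::vector<unsigned> sel;
       for (unsigned i = 0; i < nelts / 2; ++i)
         {
           sel.push_back (i);           // element i of the first vector
           sel.push_back (i + nelts);   // element i of the second vector
         }
       return sel;
     }
*/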
419
420 /* Return true if DTA and DTB match. */
421
422 static bool
423 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
424 {
425 return (dta == dtb
426 || ((dta == vect_external_def || dta == vect_constant_def)
427 && (dtb == vect_external_def || dtb == vect_constant_def)));
428 }
429
430 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
431 they are of a valid type and that they match the defs of the first stmt of
432 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
433 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
434 indicates swap is required for cond_expr stmts. Specifically, *SWAP
435 is 1 if STMT is cond and operands of comparison need to be swapped;
436 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
437 If there is any operand swap in this function, *SWAP is set to non-zero
438 value.
439 If there was a fatal error return -1; if the error could be corrected by
440 swapping operands of father node of this one, return 1; if everything is
441 ok return 0. */
442 static int
443 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
444 bool *skip_args,
445 vec<stmt_vec_info> stmts, unsigned stmt_num,
446 vec<slp_oprnd_info> *oprnds_info)
447 {
448 stmt_vec_info stmt_info = stmts[stmt_num];
449 tree oprnd;
450 unsigned int i, number_of_oprnds;
451 enum vect_def_type dt = vect_uninitialized_def;
452 slp_oprnd_info oprnd_info;
453 int first_op_idx = 1;
454 unsigned int commutative_op = -1U;
455 bool first_op_cond = false;
456 bool first = stmt_num == 0;
457
458 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
459 {
460 number_of_oprnds = gimple_call_num_args (stmt);
461 first_op_idx = 3;
462 if (gimple_call_internal_p (stmt))
463 {
464 internal_fn ifn = gimple_call_internal_fn (stmt);
465 commutative_op = first_commutative_argument (ifn);
466
467 /* Masked load, only look at mask. */
468 if (ifn == IFN_MASK_LOAD)
469 {
470 number_of_oprnds = 1;
471 /* Mask operand index. */
472 first_op_idx = 5;
473 }
474 }
475 }
476 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
477 {
478 enum tree_code code = gimple_assign_rhs_code (stmt);
479 number_of_oprnds = gimple_num_ops (stmt) - 1;
480 /* Swap can only be done for cond_expr if asked to, otherwise we
481 could result in different comparison code to the first stmt. */
482 if (code == COND_EXPR
483 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
484 {
485 first_op_cond = true;
486 number_of_oprnds++;
487 }
488 else
489 commutative_op = commutative_tree_code (code) ? 0U : -1U;
490 }
491 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
492 number_of_oprnds = gimple_phi_num_args (stmt);
493 else
494 return -1;
495
496 bool swapped = (swap != 0);
497 bool backedge = false;
498 gcc_assert (!swapped || first_op_cond);
499 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
500 for (i = 0; i < number_of_oprnds; i++)
501 {
502 if (first_op_cond)
503 {
504 /* Map indicating how operands of cond_expr should be swapped. */
505 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
506 int *map = maps[swap];
507
508 if (i < 2)
509 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
510 first_op_idx), map[i]);
511 else
512 oprnd = gimple_op (stmt_info->stmt, map[i]);
513 }
514 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
515 {
516 oprnd = gimple_phi_arg_def (stmt, i);
517 backedge = dominated_by_p (CDI_DOMINATORS,
518 gimple_phi_arg_edge (stmt, i)->src,
519 gimple_bb (stmt_info->stmt));
520 }
521 else
522 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
523 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
524 oprnd = TREE_OPERAND (oprnd, 0);
525
526 oprnd_info = (*oprnds_info)[i];
527
528 stmt_vec_info def_stmt_info;
529 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
530 {
531 if (dump_enabled_p ())
532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
533 "Build SLP failed: can't analyze def for %T\n",
534 oprnd);
535
536 return -1;
537 }
538
539 if (skip_args[i])
540 {
541 oprnd_info->def_stmts.quick_push (NULL);
542 oprnd_info->ops.quick_push (NULL_TREE);
543 oprnd_info->first_dt = vect_uninitialized_def;
544 continue;
545 }
546
547 if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
548 oprnd_info->any_pattern = true;
549
550 oprnd_info->def_stmts.quick_push (def_stmt_info);
551 oprnd_info->ops.quick_push (oprnd);
552
553 /* If there's an extern def on a backedge make sure we can
554 code-generate at the region start.
555 ??? This is another case that could be fixed by adjusting
556 how we split the function but at the moment we'd have conflicting
557 goals there. */
558 if (backedge
559 && dts[i] == vect_external_def
560 && is_a <bb_vec_info> (vinfo)
561 && TREE_CODE (oprnd) == SSA_NAME
562 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
563 && !dominated_by_p (CDI_DOMINATORS,
564 as_a <bb_vec_info> (vinfo)->bbs[0],
565 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
566 {
567 if (dump_enabled_p ())
568 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
569 "Build SLP failed: extern def %T only defined "
570 "on backedge\n", oprnd);
571 return -1;
572 }
573
574 if (first)
575 {
576 tree type = TREE_TYPE (oprnd);
577 dt = dts[i];
578 if ((dt == vect_constant_def
579 || dt == vect_external_def)
580 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
581 && (TREE_CODE (type) == BOOLEAN_TYPE
582 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
583 type)))
584 {
585 if (dump_enabled_p ())
586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
587 "Build SLP failed: invalid type of def "
588 "for variable-length SLP %T\n", oprnd);
589 return -1;
590 }
591
592 /* For the swapping logic below force vect_reduction_def
593 for the reduction op in a SLP reduction group. */
594 if (!STMT_VINFO_DATA_REF (stmt_info)
595 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
596 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
597 && def_stmt_info)
598 dts[i] = dt = vect_reduction_def;
599
600 /* Check the types of the definition. */
601 switch (dt)
602 {
603 case vect_external_def:
604 case vect_constant_def:
605 case vect_internal_def:
606 case vect_reduction_def:
607 case vect_induction_def:
608 case vect_nested_cycle:
609 break;
610
611 default:
612 /* FORNOW: Not supported. */
613 if (dump_enabled_p ())
614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
615 "Build SLP failed: illegal type of def %T\n",
616 oprnd);
617 return -1;
618 }
619
620 oprnd_info->first_dt = dt;
621 oprnd_info->first_op_type = type;
622 }
623 }
624 if (first)
625 return 0;
626
627 /* Now match the operand definition types to that of the first stmt. */
628 for (i = 0; i < number_of_oprnds;)
629 {
630 if (skip_args[i])
631 {
632 ++i;
633 continue;
634 }
635
636 oprnd_info = (*oprnds_info)[i];
637 dt = dts[i];
638 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
639 oprnd = oprnd_info->ops[stmt_num];
640 tree type = TREE_TYPE (oprnd);
641
642 if (!types_compatible_p (oprnd_info->first_op_type, type))
643 {
644 if (dump_enabled_p ())
645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
646 "Build SLP failed: different operand types\n");
647 return 1;
648 }
649
650 /* Not first stmt of the group, check that the def-stmt/s match
651 the def-stmt/s of the first stmt. Allow different definition
652 types for reduction chains: the first stmt must be a
653 vect_reduction_def (a phi node), and the rest
654 end in the reduction chain. */
655 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
656 && !(oprnd_info->first_dt == vect_reduction_def
657 && !STMT_VINFO_DATA_REF (stmt_info)
658 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
659 && def_stmt_info
660 && !STMT_VINFO_DATA_REF (def_stmt_info)
661 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
662 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
663 || (!STMT_VINFO_DATA_REF (stmt_info)
664 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
665 && ((!def_stmt_info
666 || STMT_VINFO_DATA_REF (def_stmt_info)
667 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
668 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
669 != (oprnd_info->first_dt != vect_reduction_def))))
670 {
671 /* Try swapping operands if we got a mismatch. For BB
672 vectorization only in case it will clearly improve things. */
673 if (i == commutative_op && !swapped
674 && (!is_a <bb_vec_info> (vinfo)
675 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
676 dts[i+1])
677 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
678 || vect_def_types_match
679 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
680 {
681 if (dump_enabled_p ())
682 dump_printf_loc (MSG_NOTE, vect_location,
683 "trying swapped operands\n");
684 std::swap (dts[i], dts[i+1]);
685 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
686 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
687 std::swap ((*oprnds_info)[i]->ops[stmt_num],
688 (*oprnds_info)[i+1]->ops[stmt_num]);
689 swapped = true;
690 continue;
691 }
692
693 if (is_a <bb_vec_info> (vinfo)
694 && !oprnd_info->any_pattern)
695 {
696 /* Now for commutative ops we should see whether we can
697 make the other operand match. */
698 if (dump_enabled_p ())
699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
700 "treating operand as external\n");
701 oprnd_info->first_dt = dt = vect_external_def;
702 }
703 else
704 {
705 if (dump_enabled_p ())
706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
707 "Build SLP failed: different types\n");
708 return 1;
709 }
710 }
711
712 /* Make sure to demote the overall operand to external. */
713 if (dt == vect_external_def)
714 oprnd_info->first_dt = vect_external_def;
715 /* For a SLP reduction chain we want to duplicate the reduction to
716 each of the chain members. That gets us a sane SLP graph (still
717 the stmts are not 100% correct wrt the initial values). */
718 else if ((dt == vect_internal_def
719 || dt == vect_reduction_def)
720 && oprnd_info->first_dt == vect_reduction_def
721 && !STMT_VINFO_DATA_REF (stmt_info)
722 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
723 && !STMT_VINFO_DATA_REF (def_stmt_info)
724 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
725 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
726 {
727 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
728 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
729 }
730
731 ++i;
732 }
733
734 /* Swap operands. */
735 if (swapped)
736 {
737 if (dump_enabled_p ())
738 dump_printf_loc (MSG_NOTE, vect_location,
739 "swapped operands to match def types in %G",
740 stmt_info->stmt);
741 }
742
743 return 0;
744 }
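/* Illustration: the per-lane operand swap performed above, in isolation.
   Operand info is stored transposed -- one vector of defs per operand
   position, indexed by lane -- so commuting one lane of the group means
   swapping that lane's entry in two operand vectors.  Standalone sketch
   with made-up names (operand_column, swap_lane_if_it_helps) and def types
   reduced to plain ints:

     #include <utility>
     #include <vector>

     struct operand_column
     {
       std::vector<int> def_kind;   // per-lane def type
     };

     // Try to make lane LANE of a commutative stmt match lane 0, possibly
     // by swapping its two operands.
     static bool swap_lane_if_it_helps (operand_column &op0,
                                        operand_column &op1, unsigned lane)
     {
       bool matches = op0.def_kind[lane] == op0.def_kind[0]
                      && op1.def_kind[lane] == op1.def_kind[0];
       bool matches_swapped = op1.def_kind[lane] == op0.def_kind[0]
                              && op0.def_kind[lane] == op1.def_kind[0];
       if (!matches && matches_swapped)
         {
           std::swap (op0.def_kind[lane], op1.def_kind[lane]);
           return true;
         }
       return matches;
     }
*/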
745
746 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
747 Return true if we can, meaning that this choice doesn't conflict with
748 existing SLP nodes that use STMT_INFO. */
749
750 bool
751 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
752 {
753 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
754 if (old_vectype)
755 return useless_type_conversion_p (vectype, old_vectype);
756
757 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
758 {
759 /* We maintain the invariant that if any statement in the group is
760 used, all other members of the group have the same vector type. */
761 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
762 stmt_vec_info member_info = first_info;
763 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
764 if (is_pattern_stmt_p (member_info)
765 && !useless_type_conversion_p (vectype,
766 STMT_VINFO_VECTYPE (member_info)))
767 break;
768
769 if (!member_info)
770 {
771 for (member_info = first_info; member_info;
772 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
773 STMT_VINFO_VECTYPE (member_info) = vectype;
774 return true;
775 }
776 }
777 else if (!is_pattern_stmt_p (stmt_info))
778 {
779 STMT_VINFO_VECTYPE (stmt_info) = vectype;
780 return true;
781 }
782
783 if (dump_enabled_p ())
784 {
785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
786 "Build SLP failed: incompatible vector"
787 " types for: %G", stmt_info->stmt);
788 dump_printf_loc (MSG_NOTE, vect_location,
789 " old vector type: %T\n", old_vectype);
790 dump_printf_loc (MSG_NOTE, vect_location,
791 " new vector type: %T\n", vectype);
792 }
793 return false;
794 }
795
796 /* Return true if call statements CALL1 and CALL2 are similar enough
797 to be combined into the same SLP group. */
798
799 static bool
800 compatible_calls_p (gcall *call1, gcall *call2)
801 {
802 unsigned int nargs = gimple_call_num_args (call1);
803 if (nargs != gimple_call_num_args (call2))
804 return false;
805
806 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
807 return false;
808
809 if (gimple_call_internal_p (call1))
810 {
811 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
812 TREE_TYPE (gimple_call_lhs (call2))))
813 return false;
814 for (unsigned int i = 0; i < nargs; ++i)
815 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
816 TREE_TYPE (gimple_call_arg (call2, i))))
817 return false;
818 }
819 else
820 {
821 if (!operand_equal_p (gimple_call_fn (call1),
822 gimple_call_fn (call2), 0))
823 return false;
824
825 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
826 return false;
827 }
828 return true;
829 }
830
831 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
832 caller's attempt to find the vector type in STMT_INFO with the narrowest
833 element type. Return true if VECTYPE is nonnull and if it is valid
834 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
835 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
836 vect_build_slp_tree. */
837
838 static bool
839 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
840 unsigned int group_size,
841 tree vectype, poly_uint64 *max_nunits)
842 {
843 if (!vectype)
844 {
845 if (dump_enabled_p ())
846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
847 "Build SLP failed: unsupported data-type in %G\n",
848 stmt_info->stmt);
849 /* Fatal mismatch. */
850 return false;
851 }
852
853 /* If populating the vector type requires unrolling then fail
854 before adjusting *max_nunits for basic-block vectorization. */
855 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
856 unsigned HOST_WIDE_INT const_nunits;
857 if (is_a <bb_vec_info> (vinfo)
858 && (!nunits.is_constant (&const_nunits)
859 || const_nunits > group_size))
860 {
861 if (dump_enabled_p ())
862 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
863 "Build SLP failed: unrolling required "
864 "in basic block SLP\n");
865 /* Fatal mismatch. */
866 return false;
867 }
868
869 /* In case of multiple types we need to detect the smallest type. */
870 vect_update_max_nunits (max_nunits, vectype);
871 return true;
872 }
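/* Illustration: for basic-block SLP the check above rejects, for example,
   a group of four int stores when the chosen vector type is V8SI -- filling
   the vector would need unrolling, which only loop vectorization can do.
   In a loop the same situation merely raises *MAX_NUNITS, which later feeds
   into the unrolling (vectorization) factor.  */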
873
874 /* Verify whether the scalar stmts STMTS are isomorphic, do not require
875 data permutation and are of supported types of operation. Return
876 true if so; otherwise return false and indicate in *MATCHES
877 which stmts are not isomorphic to the first one. If MATCHES[0]
878 is false then this indicates the comparison could not be
879 carried out or the stmts will never be vectorized by SLP.
880
881 Note COND_EXPR is possibly isomorphic to another one after swapping its
882 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
883 the first stmt by swapping the two operands of comparison; set SWAP[i]
884 to 2 if stmt I is isomorphic to the first stmt by inverting the code
885 of comparison. Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
886 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
887
888 static bool
889 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
890 vec<stmt_vec_info> stmts, unsigned int group_size,
891 poly_uint64 *max_nunits, bool *matches,
892 bool *two_operators, tree *node_vectype)
893 {
894 unsigned int i;
895 stmt_vec_info first_stmt_info = stmts[0];
896 enum tree_code first_stmt_code = ERROR_MARK;
897 enum tree_code alt_stmt_code = ERROR_MARK;
898 enum tree_code rhs_code = ERROR_MARK;
899 enum tree_code first_cond_code = ERROR_MARK;
900 tree lhs;
901 bool need_same_oprnds = false;
902 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
903 optab optab;
904 int icode;
905 machine_mode optab_op2_mode;
906 machine_mode vec_mode;
907 stmt_vec_info first_load = NULL, prev_first_load = NULL;
908 bool first_stmt_load_p = false, load_p = false;
909 bool first_stmt_phi_p = false, phi_p = false;
910
911 /* For every stmt in NODE find its def stmt/s. */
912 stmt_vec_info stmt_info;
913 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
914 {
915 gimple *stmt = stmt_info->stmt;
916 swap[i] = 0;
917 matches[i] = false;
918
919 if (dump_enabled_p ())
920 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
921
922 /* Fail to vectorize statements marked as unvectorizable, statements
923 that can throw, or statements with volatile operands. */
924 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
925 || stmt_can_throw_internal (cfun, stmt)
926 || gimple_has_volatile_ops (stmt))
927 {
928 if (dump_enabled_p ())
929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
930 "Build SLP failed: unvectorizable statement %G",
931 stmt);
932 /* ??? For BB vectorization we want to commutate operands in a way
933 to shuffle all unvectorizable defs into one operand and have
934 the other still vectorized. The following doesn't reliably
935 achieve that, but it's the easiest we can do here. */
936 if (is_a <bb_vec_info> (vinfo) && i != 0)
937 continue;
938 /* Fatal mismatch. */
939 matches[0] = false;
940 return false;
941 }
942
943 lhs = gimple_get_lhs (stmt);
944 if (lhs == NULL_TREE)
945 {
946 if (dump_enabled_p ())
947 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
948 "Build SLP failed: not GIMPLE_ASSIGN nor "
949 "GIMPLE_CALL %G", stmt);
950 if (is_a <bb_vec_info> (vinfo) && i != 0)
951 continue;
952 /* Fatal mismatch. */
953 matches[0] = false;
954 return false;
955 }
956
957 tree nunits_vectype;
958 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
959 &nunits_vectype, group_size)
960 || (nunits_vectype
961 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
962 nunits_vectype, max_nunits)))
963 {
964 if (is_a <bb_vec_info> (vinfo) && i != 0)
965 continue;
966 /* Fatal mismatch. */
967 matches[0] = false;
968 return false;
969 }
970
971 gcc_assert (vectype);
972
973 gcall *call_stmt = dyn_cast <gcall *> (stmt);
974 if (call_stmt)
975 {
976 rhs_code = CALL_EXPR;
977
978 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
979 load_p = true;
980 else if ((gimple_call_internal_p (call_stmt)
981 && (!vectorizable_internal_fn_p
982 (gimple_call_internal_fn (call_stmt))))
983 || gimple_call_tail_p (call_stmt)
984 || gimple_call_noreturn_p (call_stmt)
985 || !gimple_call_nothrow_p (call_stmt)
986 || gimple_call_chain (call_stmt))
987 {
988 if (dump_enabled_p ())
989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
990 "Build SLP failed: unsupported call type %G",
991 call_stmt);
992 if (is_a <bb_vec_info> (vinfo) && i != 0)
993 continue;
994 /* Fatal mismatch. */
995 matches[0] = false;
996 return false;
997 }
998 }
999 else if (gimple_code (stmt) == GIMPLE_PHI)
1000 {
1001 rhs_code = ERROR_MARK;
1002 phi_p = true;
1003 }
1004 else
1005 {
1006 rhs_code = gimple_assign_rhs_code (stmt);
1007 load_p = gimple_vuse (stmt);
1008 }
1009
1010 /* Check the operation. */
1011 if (i == 0)
1012 {
1013 *node_vectype = vectype;
1014 first_stmt_code = rhs_code;
1015 first_stmt_load_p = load_p;
1016 first_stmt_phi_p = phi_p;
1017
1018 /* Shift arguments should be equal in all the packed stmts for a
1019 vector shift with scalar shift operand. */
1020 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1021 || rhs_code == LROTATE_EXPR
1022 || rhs_code == RROTATE_EXPR)
1023 {
1024 vec_mode = TYPE_MODE (vectype);
1025
1026 /* First see if we have a vector/vector shift. */
1027 optab = optab_for_tree_code (rhs_code, vectype,
1028 optab_vector);
1029
1030 if (!optab
1031 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1032 {
1033 /* No vector/vector shift, try for a vector/scalar shift. */
1034 optab = optab_for_tree_code (rhs_code, vectype,
1035 optab_scalar);
1036
1037 if (!optab)
1038 {
1039 if (dump_enabled_p ())
1040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1041 "Build SLP failed: no optab.\n");
1042 if (is_a <bb_vec_info> (vinfo) && i != 0)
1043 continue;
1044 /* Fatal mismatch. */
1045 matches[0] = false;
1046 return false;
1047 }
1048 icode = (int) optab_handler (optab, vec_mode);
1049 if (icode == CODE_FOR_nothing)
1050 {
1051 if (dump_enabled_p ())
1052 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1053 "Build SLP failed: "
1054 "op not supported by target.\n");
1055 if (is_a <bb_vec_info> (vinfo) && i != 0)
1056 continue;
1057 /* Fatal mismatch. */
1058 matches[0] = false;
1059 return false;
1060 }
1061 optab_op2_mode = insn_data[icode].operand[2].mode;
1062 if (!VECTOR_MODE_P (optab_op2_mode))
1063 {
1064 need_same_oprnds = true;
1065 first_op1 = gimple_assign_rhs2 (stmt);
1066 }
1067 }
1068 }
1069 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1070 {
1071 need_same_oprnds = true;
1072 first_op1 = gimple_assign_rhs2 (stmt);
1073 }
1074 else if (!load_p
1075 && rhs_code == BIT_FIELD_REF)
1076 {
1077 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1078 if (TREE_CODE (vec) != SSA_NAME
1079 || !types_compatible_p (vectype, TREE_TYPE (vec)))
1080 {
1081 if (is_a <bb_vec_info> (vinfo) && i != 0)
1082 continue;
1083 if (dump_enabled_p ())
1084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1085 "Build SLP failed: "
1086 "BIT_FIELD_REF not supported\n");
1087 /* Fatal mismatch. */
1088 matches[0] = false;
1089 return false;
1090 }
1091 }
1092 else if (call_stmt
1093 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1094 {
1095 need_same_oprnds = true;
1096 first_op1 = gimple_call_arg (call_stmt, 1);
1097 }
1098 }
1099 else
1100 {
1101 if (first_stmt_code != rhs_code
1102 && alt_stmt_code == ERROR_MARK)
1103 alt_stmt_code = rhs_code;
1104 if ((first_stmt_code != rhs_code
1105 && (first_stmt_code != IMAGPART_EXPR
1106 || rhs_code != REALPART_EXPR)
1107 && (first_stmt_code != REALPART_EXPR
1108 || rhs_code != IMAGPART_EXPR)
1109 /* Handle mismatches in plus/minus by computing both
1110 and merging the results. */
1111 && !((first_stmt_code == PLUS_EXPR
1112 || first_stmt_code == MINUS_EXPR)
1113 && (alt_stmt_code == PLUS_EXPR
1114 || alt_stmt_code == MINUS_EXPR)
1115 && rhs_code == alt_stmt_code)
1116 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1117 && (first_stmt_code == ARRAY_REF
1118 || first_stmt_code == BIT_FIELD_REF
1119 || first_stmt_code == INDIRECT_REF
1120 || first_stmt_code == COMPONENT_REF
1121 || first_stmt_code == MEM_REF)))
1122 || first_stmt_load_p != load_p
1123 || first_stmt_phi_p != phi_p)
1124 {
1125 if (dump_enabled_p ())
1126 {
1127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1128 "Build SLP failed: different operation "
1129 "in stmt %G", stmt);
1130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1131 "original stmt %G", first_stmt_info->stmt);
1132 }
1133 /* Mismatch. */
1134 continue;
1135 }
1136
1137 if (need_same_oprnds)
1138 {
1139 tree other_op1 = (call_stmt
1140 ? gimple_call_arg (call_stmt, 1)
1141 : gimple_assign_rhs2 (stmt));
1142 if (!operand_equal_p (first_op1, other_op1, 0))
1143 {
1144 if (dump_enabled_p ())
1145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1146 "Build SLP failed: different shift "
1147 "arguments in %G", stmt);
1148 /* Mismatch. */
1149 continue;
1150 }
1151 }
1152 if (!load_p
1153 && first_stmt_code == BIT_FIELD_REF
1154 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1155 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1156 {
1157 if (dump_enabled_p ())
1158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159 "Build SLP failed: different BIT_FIELD_REF "
1160 "arguments in %G", stmt);
1161 /* Mismatch. */
1162 continue;
1163 }
1164
1165 if (!load_p && rhs_code == CALL_EXPR)
1166 {
1167 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1168 as_a <gcall *> (stmt)))
1169 {
1170 if (dump_enabled_p ())
1171 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1172 "Build SLP failed: different calls in %G",
1173 stmt);
1174 /* Mismatch. */
1175 continue;
1176 }
1177 }
1178
1179 if (phi_p
1180 && (gimple_bb (first_stmt_info->stmt)
1181 != gimple_bb (stmt_info->stmt)))
1182 {
1183 if (dump_enabled_p ())
1184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1185 "Build SLP failed: different BB for PHI "
1186 "in %G", stmt);
1187 /* Mismatch. */
1188 continue;
1189 }
1190
1191 if (!types_compatible_p (vectype, *node_vectype))
1192 {
1193 if (dump_enabled_p ())
1194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1195 "Build SLP failed: different vector type "
1196 "in %G", stmt);
1197 /* Mismatch. */
1198 continue;
1199 }
1200 }
1201
1202 /* Grouped store or load. */
1203 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1204 {
1205 if (REFERENCE_CLASS_P (lhs))
1206 {
1207 /* Store. */
1208 ;
1209 }
1210 else
1211 {
1212 /* Load. */
1213 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1214 if (prev_first_load)
1215 {
1216 /* Check that there are no loads from different interleaving
1217 chains in the same node. */
1218 if (prev_first_load != first_load)
1219 {
1220 if (dump_enabled_p ())
1221 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1222 vect_location,
1223 "Build SLP failed: different "
1224 "interleaving chains in one node %G",
1225 stmt);
1226 /* Mismatch. */
1227 continue;
1228 }
1229 }
1230 else
1231 prev_first_load = first_load;
1232 }
1233 } /* Grouped access. */
1234 else
1235 {
1236 if (load_p)
1237 {
1238 /* Not grouped load. */
1239 if (dump_enabled_p ())
1240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1241 "Build SLP failed: not grouped load %G", stmt);
1242
1243 /* FORNOW: Not grouped loads are not supported. */
1244 if (is_a <bb_vec_info> (vinfo) && i != 0)
1245 continue;
1246 /* Fatal mismatch. */
1247 matches[0] = false;
1248 return false;
1249 }
1250
1251 /* Not memory operation. */
1252 if (!phi_p
1253 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1254 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1255 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1256 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1257 && rhs_code != VIEW_CONVERT_EXPR
1258 && rhs_code != CALL_EXPR
1259 && rhs_code != BIT_FIELD_REF)
1260 {
1261 if (dump_enabled_p ())
1262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1263 "Build SLP failed: operation unsupported %G",
1264 stmt);
1265 if (is_a <bb_vec_info> (vinfo) && i != 0)
1266 continue;
1267 /* Fatal mismatch. */
1268 matches[0] = false;
1269 return false;
1270 }
1271
1272 if (rhs_code == COND_EXPR)
1273 {
1274 tree cond_expr = gimple_assign_rhs1 (stmt);
1275 enum tree_code cond_code = TREE_CODE (cond_expr);
1276 enum tree_code swap_code = ERROR_MARK;
1277 enum tree_code invert_code = ERROR_MARK;
1278
1279 if (i == 0)
1280 first_cond_code = TREE_CODE (cond_expr);
1281 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1282 {
1283 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1284 swap_code = swap_tree_comparison (cond_code);
1285 invert_code = invert_tree_comparison (cond_code, honor_nans);
1286 }
1287
1288 if (first_cond_code == cond_code)
1289 ;
1290 /* Isomorphic can be achieved by swapping. */
1291 else if (first_cond_code == swap_code)
1292 swap[i] = 1;
1293 /* Isomorphic can be achieved by inverting. */
1294 else if (first_cond_code == invert_code)
1295 swap[i] = 2;
1296 else
1297 {
1298 if (dump_enabled_p ())
1299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1300 "Build SLP failed: different"
1301 " operation %G", stmt);
1302 /* Mismatch. */
1303 continue;
1304 }
1305 }
1306 }
1307
1308 matches[i] = true;
1309 }
1310
1311 for (i = 0; i < group_size; ++i)
1312 if (!matches[i])
1313 return false;
1314
1315 /* If we allowed a two-operation SLP node verify the target can cope
1316 with the permute we are going to use. */
1317 if (alt_stmt_code != ERROR_MARK
1318 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1319 {
1320 *two_operators = true;
1321 }
1322
1323 return true;
1324 }
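/* Illustration: the two-operator case tolerated above is the classic addsub
   pattern, e.g. for a two-lane group

       c[0] = a[0] + b[0];
       c[1] = a[1] - b[1];

   discovery records PLUS_EXPR as the first code and MINUS_EXPR as the
   alternate code.  vect_build_slp_tree_2 then builds one child computing
   the addition in every lane and one computing the subtraction, and a
   VEC_PERM_EXPR node on top that takes lane 0 from the add child and
   lane 1 from the subtract child.  */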
1325
1326 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1327 Note we never remove apart from at destruction time so we do not
1328 need a special value for deleted that differs from empty. */
1329 struct bst_traits
1330 {
1331 typedef vec <stmt_vec_info> value_type;
1332 typedef vec <stmt_vec_info> compare_type;
1333 static inline hashval_t hash (value_type);
1334 static inline bool equal (value_type existing, value_type candidate);
1335 static inline bool is_empty (value_type x) { return !x.exists (); }
1336 static inline bool is_deleted (value_type x) { return !x.exists (); }
1337 static const bool empty_zero_p = true;
1338 static inline void mark_empty (value_type &x) { x.release (); }
1339 static inline void mark_deleted (value_type &x) { x.release (); }
1340 static inline void remove (value_type &x) { x.release (); }
1341 };
1342 inline hashval_t
1343 bst_traits::hash (value_type x)
1344 {
1345 inchash::hash h;
1346 for (unsigned i = 0; i < x.length (); ++i)
1347 h.add_int (gimple_uid (x[i]->stmt));
1348 return h.end ();
1349 }
1350 inline bool
1351 bst_traits::equal (value_type existing, value_type candidate)
1352 {
1353 if (existing.length () != candidate.length ())
1354 return false;
1355 for (unsigned i = 0; i < existing.length (); ++i)
1356 if (existing[i] != candidate[i])
1357 return false;
1358 return true;
1359 }
1360
1361 typedef hash_map <vec <gimple *>, slp_tree,
1362 simple_hashmap_traits <bst_traits, slp_tree> >
1363 scalar_stmts_to_slp_tree_map_t;
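/* Illustration: the memoization key above is the whole vector of scalar
   stmts; hashing folds the per-stmt UIDs together and equality compares
   element-wise.  A standalone sketch using the standard library, with
   stmts reduced to integer uids and made-up names (stmt_seq_hash,
   build_cache):

     #include <cstddef>
     #include <unordered_map>
     #include <vector>

     struct stmt_seq_hash
     {
       size_t operator() (const std::vector<int> &uids) const
       {
         size_t h = 0;
         for (int uid : uids)
           h = h * 1000003u + (size_t) uid;   // simple polynomial combine
         return h;
       }
     };

     // Maps a stmt sequence to the node discovered for it (or nullptr to
     // remember a failed attempt), mirroring scalar_stmts_to_slp_tree_map_t.
     using build_cache
       = std::unordered_map<std::vector<int>, void *, stmt_seq_hash>;
*/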
1364
1365 static slp_tree
1366 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1367 vec<stmt_vec_info> stmts, unsigned int group_size,
1368 poly_uint64 *max_nunits,
1369 bool *matches, unsigned *npermutes, unsigned *tree_size,
1370 scalar_stmts_to_slp_tree_map_t *bst_map);
1371
1372 static slp_tree
1373 vect_build_slp_tree (vec_info *vinfo,
1374 vec<stmt_vec_info> stmts, unsigned int group_size,
1375 poly_uint64 *max_nunits,
1376 bool *matches, unsigned *npermutes, unsigned *tree_size,
1377 scalar_stmts_to_slp_tree_map_t *bst_map)
1378 {
1379 if (slp_tree *leader = bst_map->get (stmts))
1380 {
1381 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1383 *leader ? "" : "failed ", *leader);
1384 if (*leader)
1385 {
1386 SLP_TREE_REF_COUNT (*leader)++;
1387 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1388 }
1389 return *leader;
1390 }
1391
1392 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1393 so we can pick up backedge destinations during discovery. */
1394 slp_tree res = new _slp_tree;
1395 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1396 SLP_TREE_SCALAR_STMTS (res) = stmts;
1397 bst_map->put (stmts.copy (), res);
1398
1399 poly_uint64 this_max_nunits = 1;
1400 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1401 &this_max_nunits,
1402 matches, npermutes, tree_size, bst_map);
1403 if (!res_)
1404 {
1405 bool existed_p = bst_map->put (stmts, NULL);
1406 gcc_assert (existed_p);
1407 /* Mark the node invalid so we can detect those when still in use
1408 as backedge destinations. */
1409 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1410 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1411 vect_free_slp_tree (res);
1412 }
1413 else
1414 {
1415 gcc_assert (res_ == res);
1416 res->max_nunits = this_max_nunits;
1417 vect_update_max_nunits (max_nunits, this_max_nunits);
1418 /* Keep a reference for the bst_map use. */
1419 SLP_TREE_REF_COUNT (res)++;
1420 }
1421 return res_;
1422 }
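/* Illustration: the caching discipline of the wrapper above, reduced to its
   shape.  A stub entry is installed before recursing so that a cycle (for
   example a PHI backedge) re-uses the node currently being built instead of
   recursing forever, and a failed build leaves a null entry behind so the
   same stmt set is not retried.  Standalone sketch with made-up names
   (g_node, build_cached); build_lanes stands in for vect_build_slp_tree_2
   and is only declared:

     #include <map>
     #include <vector>

     struct g_node { int refcnt = 1; };

     static std::map<std::vector<int>, g_node *> cache;

     static bool build_lanes (g_node *stub, const std::vector<int> &key);

     static g_node *build_cached (const std::vector<int> &key)
     {
       auto it = cache.find (key);
       if (it != cache.end ())
         {
           if (it->second)
             it->second->refcnt++;   // share the existing node
           return it->second;        // may be null: a remembered failure
         }
       g_node *stub = new g_node;    // visible to recursive lookups
       cache[key] = stub;
       if (!build_lanes (stub, key))
         {
           cache[key] = nullptr;     // remember the failure
           delete stub;
           return nullptr;
         }
       stub->refcnt++;               // the cache keeps one reference
       return stub;
     }
*/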
1423
1424 /* Recursively build an SLP tree starting from NODE.
1425 Fail (and return NULL) if def-stmts are not isomorphic, require
1426 data permutation or are of unsupported types of operation.
1427 Otherwise return the SLP node built for these stmts; MATCHES
1428 records for each lane whether it matched the first lane, so the
1429 caller can tell where the mismatch occurred. */
1430
1431 static slp_tree
1432 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1433 vec<stmt_vec_info> stmts, unsigned int group_size,
1434 poly_uint64 *max_nunits,
1435 bool *matches, unsigned *npermutes, unsigned *tree_size,
1436 scalar_stmts_to_slp_tree_map_t *bst_map)
1437 {
1438 unsigned nops, i, this_tree_size = 0;
1439 poly_uint64 this_max_nunits = *max_nunits;
1440
1441 matches[0] = false;
1442
1443 stmt_vec_info stmt_info = stmts[0];
1444 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1445 nops = gimple_call_num_args (stmt);
1446 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1447 {
1448 nops = gimple_num_ops (stmt) - 1;
1449 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1450 nops++;
1451 }
1452 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1453 nops = gimple_phi_num_args (phi);
1454 else
1455 return NULL;
1456
1457 /* If the SLP node is a PHI (induction or reduction), terminate
1458 the recursion. */
1459 bool *skip_args = XALLOCAVEC (bool, nops);
1460 memset (skip_args, 0, sizeof (bool) * nops);
1461 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1462 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1463 {
1464 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1465 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1466 group_size);
1467 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1468 max_nunits))
1469 return NULL;
1470
1471 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1472 if (def_type == vect_induction_def)
1473 {
1474 /* Induction PHIs are not cycles but walk the initial
1475 value. Only for inner loops though, for outer loops
1476 we need to pick up the value from the actual PHIs
1477 to more easily support peeling and epilogue vectorization. */
1478 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1479 if (!nested_in_vect_loop_p (loop, stmt_info))
1480 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1481 else
1482 loop = loop->inner;
1483 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1484 }
1485 else if (def_type == vect_reduction_def
1486 || def_type == vect_double_reduction_def
1487 || def_type == vect_nested_cycle)
1488 {
1489 /* Else def types have to match. */
1490 stmt_vec_info other_info;
1491 bool all_same = true;
1492 FOR_EACH_VEC_ELT (stmts, i, other_info)
1493 {
1494 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1495 return NULL;
1496 if (other_info != stmt_info)
1497 all_same = false;
1498 }
1499 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1500 /* Reduction initial values are not explicitly represented. */
1501 if (!nested_in_vect_loop_p (loop, stmt_info))
1502 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1503 /* Reduction chain backedge defs are filled manually.
1504 ??? Need a better way to identify a SLP reduction chain PHI.
1505 Or a better overall way to SLP match those. */
1506 if (all_same && def_type == vect_reduction_def)
1507 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1508 }
1509 else if (def_type != vect_internal_def)
1510 return NULL;
1511 }
1512
1513
1514 bool two_operators = false;
1515 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1516 tree vectype = NULL_TREE;
1517 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1518 &this_max_nunits, matches, &two_operators,
1519 &vectype))
1520 return NULL;
1521
1522 /* If the SLP node is a load, terminate the recursion unless masked. */
1523 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1524 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1525 {
1526 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1527 {
1528 /* Masked load. */
1529 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1530 nops = 1;
1531 }
1532 else
1533 {
1534 *max_nunits = this_max_nunits;
1535 (*tree_size)++;
1536 node = vect_create_new_slp_node (node, stmts, 0);
1537 SLP_TREE_VECTYPE (node) = vectype;
1538 /* And compute the load permutation. Whether it is actually
1539 a permutation depends on the unrolling factor which is
1540 decided later. */
1541 vec<unsigned> load_permutation;
1542 int j;
1543 stmt_vec_info load_info;
1544 load_permutation.create (group_size);
1545 stmt_vec_info first_stmt_info
1546 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1547 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1548 {
1549 int load_place = vect_get_place_in_interleaving_chain
1550 (load_info, first_stmt_info);
1551 gcc_assert (load_place != -1);
1552 load_permutation.safe_push (load_place);
1553 }
1554 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1555 return node;
1556 }
1557 }
1558 else if (gimple_assign_single_p (stmt_info->stmt)
1559 && !gimple_vuse (stmt_info->stmt)
1560 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1561 {
1562 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1563 the same SSA name vector of a type compatible with vectype. */
1564 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1565 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1566 stmt_vec_info estmt_info;
1567 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1568 {
1569 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1570 tree bfref = gimple_assign_rhs1 (estmt);
1571 HOST_WIDE_INT lane;
1572 if (!known_eq (bit_field_size (bfref),
1573 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1574 || !constant_multiple_p (bit_field_offset (bfref),
1575 bit_field_size (bfref), &lane))
1576 {
1577 lperm.release ();
1578 return NULL;
1579 }
1580 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1581 }
1582 slp_tree vnode = vect_create_new_slp_node (vNULL);
1583 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1584 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1585 /* We are always building a permutation node even if it is an identity
1586 permute to shield the rest of the vectorizer from the odd node
1587 representing an actual vector without any scalar ops.
1588 ??? We could hide it completely by making the permute node
1589 external? */
1590 node = vect_create_new_slp_node (node, stmts, 1);
1591 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1592 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1593 SLP_TREE_VECTYPE (node) = vectype;
1594 SLP_TREE_CHILDREN (node).quick_push (vnode);
1595 return node;
1596 }
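  /* Illustration: for a group of element extracts from one vector, e.g.

         x0 = BIT_FIELD_REF <v, 32, 32>;   // lane 1 of a V4SI v
         x1 = BIT_FIELD_REF <v, 32, 0>;    // lane 0

     the code above wraps the pre-existing vector V in a child node carrying
     only vector defs and builds a VEC_PERM_EXPR node with the lane
     permutation { (0, 1), (0, 0) }, i.e. take lane 1, then lane 0, of
     child 0.  */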
1597
1598 /* Get at the operands, verifying they are compatible. */
1599 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1600 slp_oprnd_info oprnd_info;
1601 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1602 {
1603 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
1604 stmts, i, &oprnds_info);
1605 if (res != 0)
1606 matches[(res == -1) ? 0 : i] = false;
1607 if (!matches[0])
1608 break;
1609 }
1610 for (i = 0; i < group_size; ++i)
1611 if (!matches[i])
1612 {
1613 vect_free_oprnd_info (oprnds_info);
1614 return NULL;
1615 }
1616 swap = NULL;
1617
1618 auto_vec<slp_tree, 4> children;
1619
1620 stmt_info = stmts[0];
1621
1622 /* Create SLP_TREE nodes for the definition node/s. */
1623 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1624 {
1625 slp_tree child;
1626 unsigned int j;
1627
1628 /* We're skipping certain operands from processing, for example
1629 outer loop reduction initial defs. */
1630 if (skip_args[i])
1631 {
1632 children.safe_push (NULL);
1633 continue;
1634 }
1635
1636 if (oprnd_info->first_dt == vect_uninitialized_def)
1637 {
1638 /* COND_EXPR ends up with one operand too many here if the
1639 condition is an SSA name rather than a comparison. */
1640 gcc_assert (i == 3 && nops == 4);
1641 continue;
1642 }
1643
1644 if (is_a <bb_vec_info> (vinfo)
1645 && oprnd_info->first_dt == vect_internal_def
1646 && !oprnd_info->any_pattern)
1647 {
1648 /* For BB vectorization, if all defs are the same do not
1649 bother to continue the build along the single-lane
1650 graph but use a splat of the scalar value. */
1651 stmt_vec_info first_def = oprnd_info->def_stmts[0];
1652 for (j = 1; j < group_size; ++j)
1653 if (oprnd_info->def_stmts[j] != first_def)
1654 break;
1655 if (j == group_size
1656 /* But avoid doing this for loads where we may be
1657 able to CSE things, unless the stmt is not
1658 vectorizable. */
1659 && (!STMT_VINFO_VECTORIZABLE (first_def)
1660 || !gimple_vuse (first_def->stmt)))
1661 {
1662 if (dump_enabled_p ())
1663 dump_printf_loc (MSG_NOTE, vect_location,
1664 "Using a splat of the uniform operand\n");
1665 oprnd_info->first_dt = vect_external_def;
1666 }
1667 }
1668
1669 if (oprnd_info->first_dt == vect_external_def
1670 || oprnd_info->first_dt == vect_constant_def)
1671 {
1672 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1673 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1674 oprnd_info->ops = vNULL;
1675 children.safe_push (invnode);
1676 continue;
1677 }
1678
1679 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1680 group_size, &this_max_nunits,
1681 matches, npermutes,
1682 &this_tree_size, bst_map)) != NULL)
1683 {
1684 oprnd_info->def_stmts = vNULL;
1685 children.safe_push (child);
1686 continue;
1687 }
1688
1689 /* If the SLP build for operand zero failed and operand zero
1690 and one can be commutated try that for the scalar stmts
1691 that failed the match. */
1692 if (i == 0
1693 /* A first scalar stmt mismatch signals a fatal mismatch. */
1694 && matches[0]
1695 /* ??? For COND_EXPRs we can swap the comparison operands
1696 as well as the arms under some constraints. */
1697 && nops == 2
1698 && oprnds_info[1]->first_dt == vect_internal_def
1699 && is_gimple_assign (stmt_info->stmt)
1700 /* Swapping operands for reductions breaks assumptions later on. */
1701 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1702 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
1703 /* Do so only if the number of unsuccessful permutes was not more
1704 than a cut-off, as re-trying the recursive match on
1705 possibly each level of the tree would expose exponential
1706 behavior. */
1707 && *npermutes < 4)
1708 {
1709 /* See whether we can swap the matching or the non-matching
1710 stmt operands. */
1711 bool swap_not_matching = true;
1712 do
1713 {
1714 for (j = 0; j < group_size; ++j)
1715 {
1716 if (matches[j] != !swap_not_matching)
1717 continue;
1718 stmt_vec_info stmt_info = stmts[j];
1719 /* Verify if we can swap operands of this stmt. */
1720 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1721 if (!stmt
1722 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1723 {
1724 if (!swap_not_matching)
1725 goto fail;
1726 swap_not_matching = false;
1727 break;
1728 }
1729 }
1730 }
1731 while (j != group_size);
1732
1733 /* Swap mismatched definition stmts. */
1734 if (dump_enabled_p ())
1735 dump_printf_loc (MSG_NOTE, vect_location,
1736 "Re-trying with swapped operands of stmts ");
1737 for (j = 0; j < group_size; ++j)
1738 if (matches[j] == !swap_not_matching)
1739 {
1740 std::swap (oprnds_info[0]->def_stmts[j],
1741 oprnds_info[1]->def_stmts[j]);
1742 std::swap (oprnds_info[0]->ops[j],
1743 oprnds_info[1]->ops[j]);
1744 if (dump_enabled_p ())
1745 dump_printf (MSG_NOTE, "%d ", j);
1746 }
1747 if (dump_enabled_p ())
1748 dump_printf (MSG_NOTE, "\n");
1749 /* And try again with scratch 'matches' ... */
1750 bool *tem = XALLOCAVEC (bool, group_size);
1751 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1752 group_size, &this_max_nunits,
1753 tem, npermutes,
1754 &this_tree_size, bst_map)) != NULL)
1755 {
1756 oprnd_info->def_stmts = vNULL;
1757 children.safe_push (child);
1758 continue;
1759 }
1760 /* We do not undo the swapping here since it might still be
1761 the better order for the second operand in case we build
1762 the first one from scalars below. */
1763 ++*npermutes;
1764 }
1765 fail:
1766
1767 /* If the SLP build failed and we analyze a basic-block
1768 simply treat nodes we fail to build as externally defined
1769 (and thus build vectors from the scalar defs).
1770 The cost model will reject outright expensive cases.
1771 ??? This doesn't treat cases where permutation ultimately
1772 fails (or we don't try permutation below). Ideally we'd
1773 even compute a permutation that will end up with the maximum
1774 SLP tree size... */
1775 if (is_a <bb_vec_info> (vinfo)
1776 /* ??? Rejecting patterns this way doesn't work. We'd have to
1777 do extra work to cancel the pattern so the uses see the
1778 scalar version. */
1779 && !is_pattern_stmt_p (stmt_info)
1780 && !oprnd_info->any_pattern)
1781 {
1782 /* But if there's a leading vector sized set of matching stmts
1783 fail here so we can split the group. This matches the condition
1784 vect_analyze_slp_instance uses. */
1785 /* ??? We might want to split here and combine the results to support
1786 multiple vector sizes better. */
1787 for (j = 0; j < group_size; ++j)
1788 if (!matches[j])
1789 break;
1790 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
1791 {
1792 if (dump_enabled_p ())
1793 dump_printf_loc (MSG_NOTE, vect_location,
1794 "Building vector operands from scalars\n");
1795 this_tree_size++;
1796 child = vect_create_new_slp_node (oprnd_info->ops);
1797 children.safe_push (child);
1798 oprnd_info->ops = vNULL;
1799 continue;
1800 }
1801 }
1802
1803 gcc_assert (child == NULL);
1804 FOR_EACH_VEC_ELT (children, j, child)
1805 if (child)
1806 vect_free_slp_tree (child);
1807 vect_free_oprnd_info (oprnds_info);
1808 return NULL;
1809 }
1810
1811 vect_free_oprnd_info (oprnds_info);
1812
1813 /* If all children of this node were built up from uniform scalars,
1814 or if building it requires more than one possibly expensive vector
1815 construction, just throw that away, causing it to be built up from
1816 scalars. The exception is the SLP node for the vector store. */
1817 if (is_a <bb_vec_info> (vinfo)
1818 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1819 /* ??? Rejecting patterns this way doesn't work. We'd have to
1820 do extra work to cancel the pattern so the uses see the
1821 scalar version. */
1822 && !is_pattern_stmt_p (stmt_info))
1823 {
1824 slp_tree child;
1825 unsigned j;
1826 bool all_uniform_p = true;
1827 unsigned n_vector_builds = 0;
1828 FOR_EACH_VEC_ELT (children, j, child)
1829 {
1830 if (!child)
1831 ;
1832 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1833 all_uniform_p = false;
1834 else if (!vect_slp_tree_uniform_p (child))
1835 {
1836 all_uniform_p = false;
1837 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
1838 n_vector_builds++;
1839 }
1840 }
1841 if (all_uniform_p || n_vector_builds > 1)
1842 {
1843 /* Roll back. */
1844 matches[0] = false;
1845 FOR_EACH_VEC_ELT (children, j, child)
1846 if (child)
1847 vect_free_slp_tree (child);
1848
1849 if (dump_enabled_p ())
1850 dump_printf_loc (MSG_NOTE, vect_location,
1851 "Building parent vector operands from "
1852 "scalars instead\n");
1853 return NULL;
1854 }
1855 }
1856
1857 *tree_size += this_tree_size + 1;
1858 *max_nunits = this_max_nunits;
1859
1860 if (two_operators)
1861 {
1862 /* ??? We'd likely want to either cache in bst_map sth like
1863 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1864 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1865 explicit stmts to put in so the keying on 'stmts' doesn't
1866 work (but we have the same issue with nodes that use 'ops'). */
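      /* As an illustration, for the group { a0+b0, a1-b1, a2+b2, a3-b3 }
	 the code below builds two children ONE and TWO sharing NODE's
	 original children, assigns them PLUS_EXPR and MINUS_EXPR
	 respectively, and turns NODE into a VEC_PERM_EXPR whose lane
	 permutation { (0,0), (1,1), (0,2), (1,3) } selects each lane
	 from the matching child.  */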
1867 slp_tree one = new _slp_tree;
1868 slp_tree two = new _slp_tree;
1869 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1870 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1871 SLP_TREE_VECTYPE (one) = vectype;
1872 SLP_TREE_VECTYPE (two) = vectype;
1873 SLP_TREE_CHILDREN (one).safe_splice (children);
1874 SLP_TREE_CHILDREN (two).safe_splice (children);
1875 slp_tree child;
1876 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1877 SLP_TREE_REF_COUNT (child)++;
1878
1879 /* Here we record the original defs since this
1880 node represents the final lane configuration. */
1881 node = vect_create_new_slp_node (node, stmts, 2);
1882 SLP_TREE_VECTYPE (node) = vectype;
1883 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1884 SLP_TREE_CHILDREN (node).quick_push (one);
1885 SLP_TREE_CHILDREN (node).quick_push (two);
1886 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1887 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1888 enum tree_code ocode = ERROR_MARK;
1889 stmt_vec_info ostmt_info;
1890 unsigned j = 0;
1891 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1892 {
1893 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1894 if (gimple_assign_rhs_code (ostmt) != code0)
1895 {
1896 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1897 ocode = gimple_assign_rhs_code (ostmt);
1898 j = i;
1899 }
1900 else
1901 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1902 }
1903 SLP_TREE_CODE (one) = code0;
1904 SLP_TREE_CODE (two) = ocode;
1905 SLP_TREE_LANES (one) = stmts.length ();
1906 SLP_TREE_LANES (two) = stmts.length ();
1907 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1908 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1909 return node;
1910 }
1911
1912 node = vect_create_new_slp_node (node, stmts, nops);
1913 SLP_TREE_VECTYPE (node) = vectype;
1914 SLP_TREE_CHILDREN (node).splice (children);
1915 return node;
1916 }
1917
1918 /* Dump a single SLP tree NODE. */
1919
1920 static void
1921 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1922 slp_tree node)
1923 {
1924 unsigned i, j;
1925 slp_tree child;
1926 stmt_vec_info stmt_info;
1927 tree op;
1928
1929 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1930 dump_user_location_t user_loc = loc.get_user_location ();
1931 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1932 SLP_TREE_DEF_TYPE (node) == vect_external_def
1933 ? " (external)"
1934 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1935 ? " (constant)"
1936 : ""), node,
1937 estimated_poly_value (node->max_nunits),
1938 SLP_TREE_REF_COUNT (node));
1939 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
1940 {
1941 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
1942 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
1943 else
1944 dump_printf_loc (metadata, user_loc, "op template: %G",
1945 SLP_TREE_REPRESENTATIVE (node)->stmt);
1946 }
1947 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1948 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1949 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
1950 else
1951 {
1952 dump_printf_loc (metadata, user_loc, "\t{ ");
1953 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1954 dump_printf (metadata, "%T%s ", op,
1955 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
1956 dump_printf (metadata, "}\n");
1957 }
1958 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1959 {
1960 dump_printf_loc (metadata, user_loc, "\tload permutation {");
1961 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
1962 dump_printf (dump_kind, " %u", j);
1963 dump_printf (dump_kind, " }\n");
1964 }
1965 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1966 {
1967 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
1968 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
1969 dump_printf (dump_kind, " %u[%u]",
1970 SLP_TREE_LANE_PERMUTATION (node)[i].first,
1971 SLP_TREE_LANE_PERMUTATION (node)[i].second);
1972 dump_printf (dump_kind, " }\n");
1973 }
1974 if (SLP_TREE_CHILDREN (node).is_empty ())
1975 return;
1976 dump_printf_loc (metadata, user_loc, "\tchildren");
1977 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1978 dump_printf (dump_kind, " %p", (void *)child);
1979 dump_printf (dump_kind, "\n");
1980 }
1981
1982 DEBUG_FUNCTION void
1983 debug (slp_tree node)
1984 {
1985 debug_dump_context ctx;
1986 vect_print_slp_tree (MSG_NOTE,
1987 dump_location_t::from_location_t (UNKNOWN_LOCATION),
1988 node);
1989 }
1990
1991 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1992
1993 static void
1994 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1995 slp_tree node, hash_set<slp_tree> &visited)
1996 {
1997 unsigned i;
1998 slp_tree child;
1999
2000 if (visited.add (node))
2001 return;
2002
2003 vect_print_slp_tree (dump_kind, loc, node);
2004
2005 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2006 if (child)
2007 vect_print_slp_graph (dump_kind, loc, child, visited);
2008 }
2009
2010 static void
2011 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2012 slp_tree entry)
2013 {
2014 hash_set<slp_tree> visited;
2015 vect_print_slp_graph (dump_kind, loc, entry, visited);
2016 }
2017
2018 /* Mark the tree rooted at NODE with PURE_SLP. */
2019
2020 static void
2021 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2022 {
2023 int i;
2024 stmt_vec_info stmt_info;
2025 slp_tree child;
2026
2027 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2028 return;
2029
2030 if (visited.add (node))
2031 return;
2032
2033 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2034 STMT_SLP_TYPE (stmt_info) = pure_slp;
2035
2036 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2037 if (child)
2038 vect_mark_slp_stmts (child, visited);
2039 }
2040
2041 static void
2042 vect_mark_slp_stmts (slp_tree node)
2043 {
2044 hash_set<slp_tree> visited;
2045 vect_mark_slp_stmts (node, visited);
2046 }
2047
2048 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2049
2050 static void
2051 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2052 {
2053 int i;
2054 stmt_vec_info stmt_info;
2055 slp_tree child;
2056
2057 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2058 return;
2059
2060 if (visited.add (node))
2061 return;
2062
2063 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2064 {
2065 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2066 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2067 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2068 }
2069
2070 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2071 if (child)
2072 vect_mark_slp_stmts_relevant (child, visited);
2073 }
2074
2075 static void
2076 vect_mark_slp_stmts_relevant (slp_tree node)
2077 {
2078 hash_set<slp_tree> visited;
2079 vect_mark_slp_stmts_relevant (node, visited);
2080 }
2081
2082
2083 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2084
2085 static void
2086 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2087 hash_set<slp_tree> &visited)
2088 {
2089 if (!node || visited.add (node))
2090 return;
2091
2092 if (SLP_TREE_CHILDREN (node).length () == 0)
2093 {
2094 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2095 return;
2096 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2097 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2098 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2099 loads.safe_push (node);
2100 }
2101 else
2102 {
2103 unsigned i;
2104 slp_tree child;
2105 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2106 vect_gather_slp_loads (loads, child, visited);
2107 }
2108 }
2109
2110
2111 /* Find the last scalar stmt in NODE. */
2112
2113 stmt_vec_info
2114 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2115 {
2116 stmt_vec_info last = NULL;
2117 stmt_vec_info stmt_vinfo;
2118
2119 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2120 {
2121 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2122 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2123 }
2124
2125 return last;
2126 }
2127
2128 /* Find the first stmt in NODE. */
2129
2130 stmt_vec_info
2131 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2132 {
2133 stmt_vec_info first = NULL;
2134 stmt_vec_info stmt_vinfo;
2135
2136 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2137 {
2138 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2139 if (!first
2140 || get_later_stmt (stmt_vinfo, first) == first)
2141 first = stmt_vinfo;
2142 }
2143
2144 return first;
2145 }
2146
2147 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2148 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2149 (also containing the first GROUP1_SIZE stmts, since stores are
2150 consecutive), the second containing the remainder.
2151 Return the first stmt in the second group. */
2152
2153 static stmt_vec_info
2154 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2155 {
2156 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2157 gcc_assert (group1_size > 0);
2158 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2159 gcc_assert (group2_size > 0);
2160 DR_GROUP_SIZE (first_vinfo) = group1_size;
2161
2162 stmt_vec_info stmt_info = first_vinfo;
2163 for (unsigned i = group1_size; i > 1; i--)
2164 {
2165 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2166 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2167 }
2168 /* STMT is now the last element of the first group. */
2169 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2170 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2171
2172 DR_GROUP_SIZE (group2) = group2_size;
2173 for (stmt_info = group2; stmt_info;
2174 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2175 {
2176 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2177 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2178 }
2179
2180 /* For the second group, the DR_GROUP_GAP is that before the original group,
2181 plus skipping over the first group. */
2182 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2183
2184 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2185 DR_GROUP_GAP (first_vinfo) += group2_size;
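  /* For example, splitting a group of six stores with an original
     DR_GROUP_GAP of zero at GROUP1_SIZE == 4 leaves the first group
     with size 4 and gap 2 (skipping the new second group) and the
     second group with size 2 and gap 4 (skipping the first).  */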
2186
2187 if (dump_enabled_p ())
2188 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2189 group1_size, group2_size);
2190
2191 return group2;
2192 }
2193
2194 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2195 statements and a vector of NUNITS elements. */
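/* For example, with NUNITS == 4 and GROUP_SIZE == 6 the common multiple is
   12 and the unrolling factor is 2; when GROUP_SIZE evenly divides NUNITS
   (say GROUP_SIZE == 2 and NUNITS == 8, giving 4) the factor is simply
   NUNITS / GROUP_SIZE.  */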
2196
2197 static poly_uint64
2198 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2199 {
2200 return exact_div (common_multiple (nunits, group_size), group_size);
2201 }
2202
2203 static bool
2204 vect_analyze_slp_instance (vec_info *vinfo,
2205 scalar_stmts_to_slp_tree_map_t *bst_map,
2206 stmt_vec_info stmt_info, slp_instance_kind kind,
2207 unsigned max_tree_size);
2208
2209 /* Analyze an SLP instance starting from SCALAR_STMTS, which form a group
2210 of kind KIND. Return true if successful. */
2211
2212 static bool
2213 vect_build_slp_instance (vec_info *vinfo,
2214 slp_instance_kind kind,
2215 vec<stmt_vec_info> scalar_stmts,
2216 stmt_vec_info root_stmt_info,
2217 unsigned max_tree_size,
2218 scalar_stmts_to_slp_tree_map_t *bst_map,
2219 /* ??? We need stmt_info for group splitting. */
2220 stmt_vec_info stmt_info_)
2221 {
2222 if (dump_enabled_p ())
2223 {
2224 dump_printf_loc (MSG_NOTE, vect_location,
2225 "Starting SLP discovery for\n");
2226 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
2227 dump_printf_loc (MSG_NOTE, vect_location,
2228 " %G", scalar_stmts[i]->stmt);
2229 }
2230
2231 /* Build the tree for the SLP instance. */
2232 unsigned int group_size = scalar_stmts.length ();
2233 bool *matches = XALLOCAVEC (bool, group_size);
2234 unsigned npermutes = 0;
2235 poly_uint64 max_nunits = 1;
2236 unsigned tree_size = 0;
2237 unsigned i;
2238 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2239 &max_nunits, matches, &npermutes,
2240 &tree_size, bst_map);
2241 if (node != NULL)
2242 {
2243 /* Calculate the unrolling factor based on the smallest type. */
2244 poly_uint64 unrolling_factor
2245 = calculate_unrolling_factor (max_nunits, group_size);
2246
2247 if (maybe_ne (unrolling_factor, 1U)
2248 && is_a <bb_vec_info> (vinfo))
2249 {
2250 unsigned HOST_WIDE_INT const_max_nunits;
2251 if (!max_nunits.is_constant (&const_max_nunits)
2252 || const_max_nunits > group_size)
2253 {
2254 if (dump_enabled_p ())
2255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2256 "Build SLP failed: store group "
2257 "size not a multiple of the vector size "
2258 "in basic block SLP\n");
2259 vect_free_slp_tree (node);
2260 return false;
2261 }
2262 /* Fatal mismatch. */
2263 if (dump_enabled_p ())
2264 dump_printf_loc (MSG_NOTE, vect_location,
2265 "SLP discovery succeeded but node needs "
2266 "splitting\n");
2267 memset (matches, true, group_size);
2268 matches[group_size / const_max_nunits * const_max_nunits] = false;
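	      /* E.g. for a group of 7 stmts with const_max_nunits == 4 this
		 clears matches[4], so the store-group splitting code below
		 splits off the leading vector-sized part.  */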
2269 vect_free_slp_tree (node);
2270 }
2271 else
2272 {
2273 /* Create a new SLP instance. */
2274 slp_instance new_instance = XNEW (class _slp_instance);
2275 SLP_INSTANCE_TREE (new_instance) = node;
2276 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2277 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2278 SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
2279 SLP_INSTANCE_KIND (new_instance) = kind;
2280 new_instance->reduc_phis = NULL;
2281 new_instance->cost_vec = vNULL;
2282 new_instance->subgraph_entries = vNULL;
2283
2284 if (dump_enabled_p ())
2285 dump_printf_loc (MSG_NOTE, vect_location,
2286 "SLP size %u vs. limit %u.\n",
2287 tree_size, max_tree_size);
2288
2289 /* Fixup SLP reduction chains. */
2290 if (kind == slp_inst_kind_reduc_chain)
2291 {
2292 /* If this is a reduction chain with a conversion in front
2293 amend the SLP tree with a node for that. */
2294 gimple *scalar_def
2295 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2296 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
2297 {
2298 /* Get at the conversion stmt - we know it's the single use
2299 of the last stmt of the reduction chain. */
2300 use_operand_p use_p;
2301 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
2302 &use_p, &scalar_def);
2303 gcc_assert (r);
2304 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
2305 next_info = vect_stmt_to_vectorize (next_info);
2306 scalar_stmts = vNULL;
2307 scalar_stmts.create (group_size);
2308 for (unsigned i = 0; i < group_size; ++i)
2309 scalar_stmts.quick_push (next_info);
2310 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2311 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2312 SLP_TREE_CHILDREN (conv).quick_push (node);
2313 SLP_INSTANCE_TREE (new_instance) = conv;
2314 /* We also have to fake this conversion stmt as SLP reduction
2315 group so we don't have to mess with too much code
2316 elsewhere. */
2317 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2318 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2319 }
2320 /* Fill the backedge child of the PHI SLP node. The
2321 general matching code cannot find it because the
2322 scalar code does not reflect how we vectorize the
2323 reduction. */
2324 use_operand_p use_p;
2325 imm_use_iterator imm_iter;
2326 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
2327 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
2328 gimple_get_lhs (scalar_def))
2329 /* There are exactly two non-debug uses, the reduction
2330 PHI and the loop-closed PHI node. */
2331 if (!is_gimple_debug (USE_STMT (use_p))
2332 && gimple_bb (USE_STMT (use_p)) == loop->header)
2333 {
2334 auto_vec<stmt_vec_info, 64> phis (group_size);
2335 stmt_vec_info phi_info
2336 = vinfo->lookup_stmt (USE_STMT (use_p));
2337 for (unsigned i = 0; i < group_size; ++i)
2338 phis.quick_push (phi_info);
2339 slp_tree *phi_node = bst_map->get (phis);
2340 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
2341 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
2342 = SLP_INSTANCE_TREE (new_instance);
2343 SLP_INSTANCE_TREE (new_instance)->refcnt++;
2344 }
2345 }
2346
2347 vinfo->slp_instances.safe_push (new_instance);
2348
2349 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2350 the number of scalar stmts in the root in a few places.
2351 Verify that assumption holds. */
2352 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2353 .length () == group_size);
2354
2355 if (dump_enabled_p ())
2356 {
2357 dump_printf_loc (MSG_NOTE, vect_location,
2358 "Final SLP tree for instance %p:\n", new_instance);
2359 vect_print_slp_graph (MSG_NOTE, vect_location,
2360 SLP_INSTANCE_TREE (new_instance));
2361 }
2362
2363 return true;
2364 }
2365 }
2366 else
2367 {
2368 /* Failed to SLP. */
2369 /* Free the allocated memory. */
2370 scalar_stmts.release ();
2371 }
2372
2373 stmt_vec_info stmt_info = stmt_info_;
2374 /* Try to break the group up into pieces. */
2375 if (kind == slp_inst_kind_store)
2376 {
2377 /* ??? We could delay all the actual splitting of store-groups
2378 until after SLP discovery of the original group completed.
2379 Then we can recurse to vect_build_slp_instance directly. */
2380 for (i = 0; i < group_size; i++)
2381 if (!matches[i])
2382 break;
2383
2384 /* For basic block SLP, try to break the group up into multiples of
2385 a vector size. */
2386 if (is_a <bb_vec_info> (vinfo)
2387 && (i > 1 && i < group_size))
2388 {
2389 tree scalar_type
2390 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2391 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2392 1 << floor_log2 (i));
2393 unsigned HOST_WIDE_INT const_nunits;
2394 if (vectype
2395 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2396 {
2397 /* Split into two groups at the first vector boundary. */
2398 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2399 unsigned group1_size = i & ~(const_nunits - 1);
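	      /* E.g. with const_nunits == 4 and the first mismatch at i == 6
		 this gives group1_size == 4; the remaining two matching
		 stmts and the non-matching tail are then handled
		 separately below.  */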
2400
2401 if (dump_enabled_p ())
2402 dump_printf_loc (MSG_NOTE, vect_location,
2403 "Splitting SLP group at stmt %u\n", i);
2404 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2405 group1_size);
2406 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2407 kind, max_tree_size);
2408 /* Split the rest at the failure point and possibly
2409 re-analyze the remaining matching part if it has
2410 at least two lanes. */
2411 if (group1_size < i
2412 && (i + 1 < group_size
2413 || i - group1_size > 1))
2414 {
2415 stmt_vec_info rest2 = rest;
2416 rest = vect_split_slp_store_group (rest, i - group1_size);
2417 if (i - group1_size > 1)
2418 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
2419 kind, max_tree_size);
2420 }
2421 /* Re-analyze the non-matching tail if it has at least
2422 two lanes. */
2423 if (i + 1 < group_size)
2424 res |= vect_analyze_slp_instance (vinfo, bst_map,
2425 rest, kind, max_tree_size);
2426 return res;
2427 }
2428 }
2429
2430 /* For loop vectorization split into arbitrary pieces of size > 1. */
2431 if (is_a <loop_vec_info> (vinfo)
2432 && (i > 1 && i < group_size))
2433 {
2434 unsigned group1_size = i;
2435
2436 if (dump_enabled_p ())
2437 dump_printf_loc (MSG_NOTE, vect_location,
2438 "Splitting SLP group at stmt %u\n", i);
2439
2440 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2441 group1_size);
2442 /* Loop vectorization cannot handle gaps in stores, make sure
2443 the split group appears as strided. */
2444 STMT_VINFO_STRIDED_P (rest) = 1;
2445 DR_GROUP_GAP (rest) = 0;
2446 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2447 DR_GROUP_GAP (stmt_info) = 0;
2448
2449 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2450 kind, max_tree_size);
2451 if (i + 1 < group_size)
2452 res |= vect_analyze_slp_instance (vinfo, bst_map,
2453 rest, kind, max_tree_size);
2454
2455 return res;
2456 }
2457
2458 /* Even though the first vector did not all match, we might be able to SLP
2459 (some) of the remainder. FORNOW ignore this possibility. */
2460 }
2461
2462 /* Failed to SLP. */
2463 if (dump_enabled_p ())
2464 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2465 return false;
2466 }
2467
2468
2469 /* Analyze an SLP instance starting from a group of grouped stores. Call
2470 vect_build_slp_tree to build a tree of packed stmts if possible.
2471 Return FALSE if it's impossible to SLP any stmt in the loop. */
2472
2473 static bool
2474 vect_analyze_slp_instance (vec_info *vinfo,
2475 scalar_stmts_to_slp_tree_map_t *bst_map,
2476 stmt_vec_info stmt_info,
2477 slp_instance_kind kind,
2478 unsigned max_tree_size)
2479 {
2480 unsigned int i;
2481 vec<stmt_vec_info> scalar_stmts;
2482
2483 if (is_a <bb_vec_info> (vinfo))
2484 vect_location = stmt_info->stmt;
2485
2486 stmt_vec_info next_info = stmt_info;
2487 if (kind == slp_inst_kind_store)
2488 {
2489 /* Collect the stores and store them in scalar_stmts. */
2490 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
2491 while (next_info)
2492 {
2493 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2494 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2495 }
2496 }
2497 else if (kind == slp_inst_kind_reduc_chain)
2498 {
2499 /* Collect the reduction stmts and store them in scalar_stmts. */
2500 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
2501 while (next_info)
2502 {
2503 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2504 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2505 }
2506 /* Mark the first element of the reduction chain as reduction to properly
2507 transform the node. In the reduction analysis phase only the last
2508 element of the chain is marked as reduction. */
2509 STMT_VINFO_DEF_TYPE (stmt_info)
2510 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2511 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2512 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2513 }
2514 else if (kind == slp_inst_kind_ctor)
2515 {
2516 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2517 tree val;
2518 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
2519 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2520 {
2521 stmt_vec_info def_info = vinfo->lookup_def (val);
2522 def_info = vect_stmt_to_vectorize (def_info);
2523 scalar_stmts.quick_push (def_info);
2524 }
2525 if (dump_enabled_p ())
2526 dump_printf_loc (MSG_NOTE, vect_location,
2527 "Analyzing vectorizable constructor: %G\n",
2528 stmt_info->stmt);
2529 }
2530 else if (kind == slp_inst_kind_reduc_group)
2531 {
2532 /* Collect reduction statements. */
2533 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2534 scalar_stmts.create (reductions.length ());
2535 for (i = 0; reductions.iterate (i, &next_info); i++)
2536 if (STMT_VINFO_RELEVANT_P (next_info)
2537 || STMT_VINFO_LIVE_P (next_info))
2538 scalar_stmts.quick_push (next_info);
2539 /* If less than two were relevant/live there's nothing to SLP. */
2540 if (scalar_stmts.length () < 2)
2541 return false;
2542 }
2543 else
2544 gcc_unreachable ();
2545
2546 /* Build the tree for the SLP instance. */
2547 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
2548 kind == slp_inst_kind_ctor
2549 ? stmt_info : NULL,
2550 max_tree_size, bst_map,
2551 kind == slp_inst_kind_store
2552 ? stmt_info : NULL);
2553
2554 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2555 where we should do store group splitting. */
2556
2557 return res;
2558 }
2559
2560 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2561 trees of packed scalar stmts if SLP is possible. */
2562
2563 opt_result
2564 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2565 {
2566 unsigned int i;
2567 stmt_vec_info first_element;
2568
2569 DUMP_VECT_SCOPE ("vect_analyze_slp");
2570
2571 scalar_stmts_to_slp_tree_map_t *bst_map
2572 = new scalar_stmts_to_slp_tree_map_t ();
2573
2574 /* Find SLP sequences starting from groups of grouped stores. */
2575 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2576 vect_analyze_slp_instance (vinfo, bst_map, first_element,
2577 STMT_VINFO_GROUPED_ACCESS (first_element)
2578 ? slp_inst_kind_store : slp_inst_kind_ctor,
2579 max_tree_size);
2580
2581 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
2582 {
2583 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
2584 {
2585 vect_location = bb_vinfo->roots[i].root->stmt;
2586 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
2587 bb_vinfo->roots[i].stmts,
2588 bb_vinfo->roots[i].root,
2589 max_tree_size, bst_map, NULL))
2590 bb_vinfo->roots[i].stmts = vNULL;
2591 }
2592 }
2593
2594 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2595 {
2596 /* Find SLP sequences starting from reduction chains. */
2597 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2598 if (! STMT_VINFO_RELEVANT_P (first_element)
2599 && ! STMT_VINFO_LIVE_P (first_element))
2600 ;
2601 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2602 slp_inst_kind_reduc_chain,
2603 max_tree_size))
2604 {
2605 /* Dissolve reduction chain group. */
2606 stmt_vec_info vinfo = first_element;
2607 stmt_vec_info last = NULL;
2608 while (vinfo)
2609 {
2610 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2611 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2612 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2613 last = vinfo;
2614 vinfo = next;
2615 }
2616 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2617 /* It can still be vectorized as part of an SLP reduction. */
2618 loop_vinfo->reductions.safe_push (last);
2619 }
2620
2621 /* Find SLP sequences starting from groups of reductions. */
2622 if (loop_vinfo->reductions.length () > 1)
2623 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2624 slp_inst_kind_reduc_group, max_tree_size);
2625 }
2626
2627 /* The map keeps a reference on SLP nodes built, release that. */
2628 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2629 it != bst_map->end (); ++it)
2630 if ((*it).second)
2631 vect_free_slp_tree ((*it).second);
2632 delete bst_map;
2633
2634 return opt_result::success ();
2635 }
2636
2637 /* Fill the vertices and leafs vectors with all nodes in the SLP graph. */
2638
2639 static void
2640 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2641 vec<slp_tree> &vertices, vec<int> &leafs)
2642 {
2643 unsigned i;
2644 slp_tree child;
2645
2646 if (visited.add (node))
2647 return;
2648
2649 node->vertex = vertices.length ();
2650 vertices.safe_push (node);
2651
2652 bool leaf = true;
2653 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2654 if (child)
2655 {
2656 leaf = false;
2657 vect_slp_build_vertices (visited, child, vertices, leafs);
2658 }
2659 if (leaf)
2660 leafs.safe_push (node->vertex);
2661 }
2662
2663 /* Fill the vertices and leafs vectors with all nodes in the SLP graph. */
2664
2665 static void
2666 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2667 vec<int> &leafs)
2668 {
2669 hash_set<slp_tree> visited;
2670 unsigned i;
2671 slp_instance instance;
2672 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2673 {
2674 unsigned n_v = vertices.length ();
2675 unsigned n_l = leafs.length ();
2676 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2677 leafs);
2678 /* If we added vertices but no entries to the reverse graph we've
2679 added a cycle that is not backwards-reachable. Push the entry
2680 so it acts as a leaf then. */
2681 if (vertices.length () > n_v
2682 && leafs.length () == n_l)
2683 leafs.safe_push (SLP_INSTANCE_TREE (instance)->vertex);
2684 }
2685 }
2686
2687 /* Apply the (reverse) bijective permutation PERM to VEC. */
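/* For example, with PERM == { 2, 0, 1 } and VEC == { a, b, c } the forward
   application (vec[i] = saved[perm[i]]) yields { c, a, b } while the
   reverse application (vec[perm[i]] = saved[i]) yields { b, c, a }.  */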
2688
2689 template <class T>
2690 static void
2691 vect_slp_permute (vec<unsigned> perm,
2692 vec<T> &vec, bool reverse)
2693 {
2694 auto_vec<T, 64> saved;
2695 saved.create (vec.length ());
2696 for (unsigned i = 0; i < vec.length (); ++i)
2697 saved.quick_push (vec[i]);
2698
2699 if (reverse)
2700 {
2701 for (unsigned i = 0; i < vec.length (); ++i)
2702 vec[perm[i]] = saved[i];
2703 for (unsigned i = 0; i < vec.length (); ++i)
2704 gcc_assert (vec[perm[i]] == saved[i]);
2705 }
2706 else
2707 {
2708 for (unsigned i = 0; i < vec.length (); ++i)
2709 vec[i] = saved[perm[i]];
2710 for (unsigned i = 0; i < vec.length (); ++i)
2711 gcc_assert (vec[i] == saved[perm[i]]);
2712 }
2713 }
2714
2715 /* Return whether permutations PERM_A and PERM_B as recorded in the
2716 PERMS vector are equal. */
2717
2718 static bool
2719 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
2720 int perm_a, int perm_b)
2721 {
2722 return (perm_a == perm_b
2723 || (perms[perm_a].length () == perms[perm_b].length ()
2724 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
2725 sizeof (unsigned) * perms[perm_a].length ()) == 0));
2726 }
2727
2728 /* Optimize the SLP graph of VINFO. */
2729
2730 void
2731 vect_optimize_slp (vec_info *vinfo)
2732 {
2733 if (vinfo->slp_instances.is_empty ())
2734 return;
2735
2736 slp_tree node;
2737 unsigned i;
2738 auto_vec<slp_tree> vertices;
2739 auto_vec<int> leafs;
2740 vect_slp_build_vertices (vinfo, vertices, leafs);
2741
2742 struct graph *slpg = new_graph (vertices.length ());
2743 FOR_EACH_VEC_ELT (vertices, i, node)
2744 {
2745 unsigned j;
2746 slp_tree child;
2747 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2748 if (child)
2749 add_edge (slpg, i, child->vertex);
2750 }
2751
2752 /* Compute (reverse) postorder on the inverted graph. */
2753 auto_vec<int> ipo;
2754 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
2755
2756 auto_sbitmap n_visited (vertices.length ());
2757 auto_sbitmap n_materialize (vertices.length ());
2758 auto_vec<int> n_perm (vertices.length ());
2759 auto_vec<vec<unsigned> > perms;
2760
2761 bitmap_clear (n_visited);
2762 bitmap_clear (n_materialize);
2763 n_perm.quick_grow_cleared (vertices.length ());
2764 perms.safe_push (vNULL); /* zero is no permute */
2765
2766 /* Produce initial permutations. */
2767 for (i = 0; i < leafs.length (); ++i)
2768 {
2769 int idx = leafs[i];
2770 slp_tree node = vertices[idx];
2771
2772 /* Handle externals and constants optimistically throughout the
2773 iteration. */
2774 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
2775 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
2776 continue;
2777
2778 /* Leafs do not change across iterations. Note leafs also double
2779 as entries to the reverse graph. */
2780 if (!slpg->vertices[idx].succ)
2781 bitmap_set_bit (n_visited, idx);
2782 /* Loads are the only thing generating permutes. */
2783 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2784 continue;
2785
2786 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2787 node unpermuted, record this permute. */
2788 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
2789 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
2790 continue;
2791 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
2792 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
2793 bool any_permute = false;
2794 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2795 {
2796 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
2797 imin = MIN (imin, idx);
2798 imax = MAX (imax, idx);
2799 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
2800 any_permute = true;
2801 }
2802 /* If there's no permute no need to split one out. */
2803 if (!any_permute)
2804 continue;
2805 /* If the span doesn't match we'd disrupt VF computation, avoid
2806 that for now. */
2807 if (imax - imin + 1 != SLP_TREE_LANES (node))
2808 continue;
2809
2810 /* For now only handle true permutes, like
2811 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2812 when permuting constants and invariants keeping the permute
2813 bijective. */
2814 auto_sbitmap load_index (SLP_TREE_LANES (node));
2815 bitmap_clear (load_index);
2816 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2817 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
2818 unsigned j;
2819 for (j = 0; j < SLP_TREE_LANES (node); ++j)
2820 if (!bitmap_bit_p (load_index, j))
2821 break;
2822 if (j != SLP_TREE_LANES (node))
2823 continue;
2824
2825 vec<unsigned> perm = vNULL;
2826 perm.safe_grow (SLP_TREE_LANES (node), true);
2827 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2828 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
2829 perms.safe_push (perm);
2830 n_perm[idx] = perms.length () - 1;
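      /* E.g. a load permutation { 5, 4, 7, 6 } with imin == 4 is recorded
	 as the bijective lane permute { 1, 0, 3, 2 } to be propagated
	 through the graph.  */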
2831 }
2832
2833 /* Propagate permutes along the graph and compute materialization points. */
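/* The permute value propagated for a node is an optimistic lattice value:
   -1 means not yet constrained, 0 means no (or conflicting) permute and a
   positive value indexes the PERMS vector.  A node is marked for
   materialization when it has no users in the graph or when one of its
   users settled on a different permute.  */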
2834 bool changed;
2835 unsigned iteration = 0;
2836 do
2837 {
2838 changed = false;
2839 ++iteration;
2840
2841 for (i = vertices.length (); i > 0 ; --i)
2842 {
2843 int idx = ipo[i-1];
2844 slp_tree node = vertices[idx];
2845 /* For leafs there's nothing to do - we've seeded permutes
2846 on those above. */
2847 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2848 continue;
2849
2850 bitmap_set_bit (n_visited, idx);
2851
2852 /* We cannot move a permute across a store. */
2853 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2854 && DR_IS_WRITE
2855 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
2856 continue;
2857
2858 int perm = -1;
2859 for (graph_edge *succ = slpg->vertices[idx].succ;
2860 succ; succ = succ->succ_next)
2861 {
2862 int succ_idx = succ->dest;
2863 /* Handle unvisited nodes optimistically. */
2864 /* ??? But for constants once we want to handle non-bijective
2865 permutes we have to verify the permute, when unifying lanes,
2866 will not unify different constants. For example see
2867 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2868 if (!bitmap_bit_p (n_visited, succ_idx))
2869 continue;
2870 int succ_perm = n_perm[succ_idx];
2871 /* Once we materialize the successor's permutation its output
2872 lanes appear unpermuted to us. */
2873 if (bitmap_bit_p (n_materialize, succ_idx))
2874 succ_perm = 0;
2875 if (perm == -1)
2876 perm = succ_perm;
2877 else if (succ_perm == 0)
2878 {
2879 perm = 0;
2880 break;
2881 }
2882 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
2883 {
2884 perm = 0;
2885 break;
2886 }
2887 }
2888
2889 if (perm == -1)
2890 /* Pick up pre-computed leaf values. */
2891 perm = n_perm[idx];
2892 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
2893 {
2894 if (iteration > 1)
2895 /* Make sure we eventually converge. */
2896 gcc_checking_assert (perm == 0);
2897 n_perm[idx] = perm;
2898 if (perm == 0)
2899 bitmap_clear_bit (n_materialize, idx);
2900 changed = true;
2901 }
2902
2903 if (perm == 0)
2904 continue;
2905
2906 /* Elide pruning at materialization points in the first
2907 iteration so that every node is visited at least once. */
2908 if (iteration == 1)
2909 continue;
2910
2911 /* Decide on permute materialization. Look whether there's
2912 a use (pred) edge that is permuted differently than us.
2913 In that case mark ourselves so the permutation is applied. */
2914 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
2915 for (graph_edge *pred = slpg->vertices[idx].pred;
2916 pred; pred = pred->pred_next)
2917 {
2918 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
2919 int pred_perm = n_perm[pred->src];
2920 if (!vect_slp_perms_eq (perms, perm, pred_perm))
2921 {
2922 all_preds_permuted = false;
2923 break;
2924 }
2925 }
2926 if (!all_preds_permuted)
2927 {
2928 if (!bitmap_bit_p (n_materialize, idx))
2929 changed = true;
2930 bitmap_set_bit (n_materialize, idx);
2931 }
2932 }
2933 }
2934 while (changed || iteration == 1);
2935
2936 /* Materialize. */
2937 for (i = 0; i < vertices.length (); ++i)
2938 {
2939 int perm = n_perm[i];
2940 if (perm <= 0)
2941 continue;
2942
2943 slp_tree node = vertices[i];
2944
2945 /* First permute invariant/external original successors. */
2946 unsigned j;
2947 slp_tree child;
2948 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2949 {
2950 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2951 continue;
2952
2953 /* If the vector is uniform there's nothing to do. */
2954 if (vect_slp_tree_uniform_p (child))
2955 continue;
2956
2957 /* We can end up sharing some externals via two_operator
2958 handling. Be prepared to unshare those. */
2959 if (child->refcnt != 1)
2960 {
2961 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
2962 SLP_TREE_CHILDREN (node)[j] = child
2963 = vect_create_new_slp_node
2964 (SLP_TREE_SCALAR_OPS (child).copy ());
2965 }
2966 vect_slp_permute (perms[perm],
2967 SLP_TREE_SCALAR_OPS (child), true);
2968 }
2969
2970 if (bitmap_bit_p (n_materialize, i))
2971 {
2972 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2973 /* For loads simply drop the permutation, the load permutation
2974 already performs the desired permutation. */
2975 ;
2976 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2977 {
2978 /* If the node is already a permute node we just need to apply
2979 the permutation to the permute node itself. */
2980 if (dump_enabled_p ())
2981 dump_printf_loc (MSG_NOTE, vect_location,
2982 "simplifying permute node %p\n",
2983 node);
2984
2985 vect_slp_permute (perms[perm], SLP_TREE_LANE_PERMUTATION (node),
2986 true);
2987 }
2988 else
2989 {
2990 if (dump_enabled_p ())
2991 dump_printf_loc (MSG_NOTE, vect_location,
2992 "inserting permute node in place of %p\n",
2993 node);
2994
2995 /* Make a copy of NODE and in-place change it to a
2996 VEC_PERM node to permute the lanes of the copy. */
2997 slp_tree copy = new _slp_tree;
2998 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
2999 SLP_TREE_CHILDREN (node) = vNULL;
3000 SLP_TREE_SCALAR_STMTS (copy)
3001 = SLP_TREE_SCALAR_STMTS (node).copy ();
3002 vect_slp_permute (perms[perm],
3003 SLP_TREE_SCALAR_STMTS (copy), true);
3004 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3005 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3006 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3007 SLP_TREE_LANE_PERMUTATION (copy)
3008 = SLP_TREE_LANE_PERMUTATION (node);
3009 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3010 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3011 copy->refcnt = 1;
3012 copy->max_nunits = node->max_nunits;
3013 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3014 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3015 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3016
3017 /* Now turn NODE into a VEC_PERM. */
3018 SLP_TREE_CHILDREN (node).safe_push (copy);
3019 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3020 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3021 SLP_TREE_LANE_PERMUTATION (node)
3022 .quick_push (std::make_pair (0, perms[perm][j]));
3023 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3024 }
3025 }
3026 else
3027 {
3028 /* Apply the reverse permutation to our stmts. */
3029 vect_slp_permute (perms[perm],
3030 SLP_TREE_SCALAR_STMTS (node), true);
3031 /* And to the load permutation, which we can simply
3032 make regular by design. */
3033 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3034 {
3035 /* ??? When we handle non-bijective permutes the idea
3036 is that we can force the load-permutation to be
3037 { min, min + 1, min + 2, ... max }. But then the
3038 scalar defs might no longer match the lane content
3039 which means wrong-code with live lane vectorization.
3040 So we possibly have to have NULL entries for those. */
3041 vect_slp_permute (perms[perm],
3042 SLP_TREE_LOAD_PERMUTATION (node), true);
3043 }
3044 }
3045 }
3046
3047 /* Free the perms vector used for propagation. */
3048 while (!perms.is_empty ())
3049 perms.pop ().release ();
3050 free_graph (slpg);
3051
3052
3053 /* Now elide load permutations that are not necessary. */
3054 for (i = 0; i < leafs.length (); ++i)
3055 {
3056 node = vertices[leafs[i]];
3057 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3058 continue;
3059
3060 /* In basic block vectorization we allow any subchain of an interleaving
3061 chain.
3062 FORNOW: not in loop SLP because of realignment complications. */
3063 if (is_a <bb_vec_info> (vinfo))
3064 {
3065 bool subchain_p = true;
3066 stmt_vec_info next_load_info = NULL;
3067 stmt_vec_info load_info;
3068 unsigned j;
3069 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3070 {
3071 if (j != 0
3072 && (next_load_info != load_info
3073 || DR_GROUP_GAP (load_info) != 1))
3074 {
3075 subchain_p = false;
3076 break;
3077 }
3078 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3079 }
3080 if (subchain_p)
3081 {
3082 SLP_TREE_LOAD_PERMUTATION (node).release ();
3083 continue;
3084 }
3085 }
3086 else
3087 {
3088 stmt_vec_info load_info;
3089 bool this_load_permuted = false;
3090 unsigned j;
3091 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3092 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3093 {
3094 this_load_permuted = true;
3095 break;
3096 }
3097 stmt_vec_info first_stmt_info
3098 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3099 if (!this_load_permuted
3100 /* The load requires permutation when unrolling exposes
3101 a gap either because the group is larger than the SLP
3102 group-size or because there is a gap between the groups. */
3103 && (known_eq (LOOP_VINFO_VECT_FACTOR
3104 (as_a <loop_vec_info> (vinfo)), 1U)
3105 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3106 && DR_GROUP_GAP (first_stmt_info) == 0)))
3107 {
3108 SLP_TREE_LOAD_PERMUTATION (node).release ();
3109 continue;
3110 }
3111 }
3112 }
3113 }
3114
3115 /* Gather loads reachable from the individual SLP graph entries. */
3116
3117 void
3118 vect_gather_slp_loads (vec_info *vinfo)
3119 {
3120 unsigned i;
3121 slp_instance instance;
3122 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
3123 {
3124 hash_set<slp_tree> visited;
3125 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
3126 SLP_INSTANCE_TREE (instance), visited);
3127 }
3128 }
3129
3130
3131 /* For each possible SLP instance decide whether to SLP it and calculate overall
3132 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3133 least one instance. */
3134
3135 bool
3136 vect_make_slp_decision (loop_vec_info loop_vinfo)
3137 {
3138 unsigned int i;
3139 poly_uint64 unrolling_factor = 1;
3140 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3141 slp_instance instance;
3142 int decided_to_slp = 0;
3143
3144 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3145
3146 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3147 {
3148 /* FORNOW: SLP if you can. */
3149 /* All unroll factors have the form:
3150
3151 GET_MODE_SIZE (vinfo->vector_mode) * X
3152
3153 for some rational X, so they must have a common multiple. */
3154 unrolling_factor
3155 = force_common_multiple (unrolling_factor,
3156 SLP_INSTANCE_UNROLLING_FACTOR (instance));
3157
3158 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3159 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3160 loop-based vectorization. Such stmts will be marked as HYBRID. */
3161 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3162 decided_to_slp++;
3163 }
3164
3165 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
3166
3167 if (decided_to_slp && dump_enabled_p ())
3168 {
3169 dump_printf_loc (MSG_NOTE, vect_location,
3170 "Decided to SLP %d instances. Unrolling factor ",
3171 decided_to_slp);
3172 dump_dec (MSG_NOTE, unrolling_factor);
3173 dump_printf (MSG_NOTE, "\n");
3174 }
3175
3176 return (decided_to_slp > 0);
3177 }
3178
3179 /* Private data for vect_detect_hybrid_slp. */
3180 struct vdhs_data
3181 {
3182 loop_vec_info loop_vinfo;
3183 vec<stmt_vec_info> *worklist;
3184 };
3185
3186 /* Walker for walk_gimple_op. */
3187
3188 static tree
3189 vect_detect_hybrid_slp (tree *tp, int *, void *data)
3190 {
3191 walk_stmt_info *wi = (walk_stmt_info *)data;
3192 vdhs_data *dat = (vdhs_data *)wi->info;
3193
3194 if (wi->is_lhs)
3195 return NULL_TREE;
3196
3197 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
3198 if (!def_stmt_info)
3199 return NULL_TREE;
3200 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3201 if (PURE_SLP_STMT (def_stmt_info))
3202 {
3203 if (dump_enabled_p ())
3204 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3205 def_stmt_info->stmt);
3206 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3207 dat->worklist->safe_push (def_stmt_info);
3208 }
3209
3210 return NULL_TREE;
3211 }
3212
3213 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
3214 if so, otherwise push it to WORKLIST. */
3215
3216 static void
3217 maybe_push_to_hybrid_worklist (vec_info *vinfo,
3218 vec<stmt_vec_info> &worklist,
3219 stmt_vec_info stmt_info)
3220 {
3221 if (dump_enabled_p ())
3222 dump_printf_loc (MSG_NOTE, vect_location,
3223 "Processing hybrid candidate : %G", stmt_info->stmt);
3224 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
3225 imm_use_iterator iter2;
3226 ssa_op_iter iter1;
3227 use_operand_p use_p;
3228 def_operand_p def_p;
3229 bool any_def = false;
3230 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
3231 {
3232 any_def = true;
3233 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
3234 {
3235 if (is_gimple_debug (USE_STMT (use_p)))
3236 continue;
3237 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
3238 /* An out-of-loop use means this is a loop_vect sink. */
3239 if (!use_info)
3240 {
3241 if (dump_enabled_p ())
3242 dump_printf_loc (MSG_NOTE, vect_location,
3243 "Found loop_vect sink: %G", stmt_info->stmt);
3244 worklist.safe_push (stmt_info);
3245 return;
3246 }
3247 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
3248 {
3249 if (dump_enabled_p ())
3250 dump_printf_loc (MSG_NOTE, vect_location,
3251 "Found loop_vect use: %G", use_info->stmt);
3252 worklist.safe_push (stmt_info);
3253 return;
3254 }
3255 }
3256 }
3257 /* No def means this is a loop_vect sink. */
3258 if (!any_def)
3259 {
3260 if (dump_enabled_p ())
3261 dump_printf_loc (MSG_NOTE, vect_location,
3262 "Found loop_vect sink: %G", stmt_info->stmt);
3263 worklist.safe_push (stmt_info);
3264 return;
3265 }
3266 if (dump_enabled_p ())
3267 dump_printf_loc (MSG_NOTE, vect_location,
3268 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
3269 STMT_SLP_TYPE (stmt_info) = pure_slp;
3270 }
3271
3272 /* Find stmts that must be both vectorized and SLPed. */
3273
3274 void
3275 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3276 {
3277 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3278
3279 /* All stmts participating in SLP are marked pure_slp, all other
3280 stmts are loop_vect.
3281 First collect all loop_vect stmts into a worklist.
3282 SLP patterns cause not all original scalar stmts to appear in
3283 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
3284 Rectify this here by doing a backward walk over the IL, treating a
3285 stmt as loop_vect only when it is used by a loop_vect stmt and
3286 otherwise marking it as pure_slp. */
3287 auto_vec<stmt_vec_info> worklist;
3288 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
3289 {
3290 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3291 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3292 gsi_next (&gsi))
3293 {
3294 gphi *phi = gsi.phi ();
3295 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3296 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3297 maybe_push_to_hybrid_worklist (loop_vinfo,
3298 worklist, stmt_info);
3299 }
3300 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
3301 gsi_prev (&gsi))
3302 {
3303 gimple *stmt = gsi_stmt (gsi);
3304 if (is_gimple_debug (stmt))
3305 continue;
3306 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3307 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3308 {
3309 for (gimple_stmt_iterator gsi2
3310 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3311 !gsi_end_p (gsi2); gsi_next (&gsi2))
3312 {
3313 stmt_vec_info patt_info
3314 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3315 if (!STMT_SLP_TYPE (patt_info)
3316 && STMT_VINFO_RELEVANT (patt_info))
3317 maybe_push_to_hybrid_worklist (loop_vinfo,
3318 worklist, patt_info);
3319 }
3320 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3321 }
3322 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3323 maybe_push_to_hybrid_worklist (loop_vinfo,
3324 worklist, stmt_info);
3325 }
3326 }
3327
3328 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3329 mark any SLP vectorized stmt as hybrid.
3330 ??? We're visiting def stmts N times (once for each non-SLP and
3331 once for each hybrid-SLP use). */
3332 walk_stmt_info wi;
3333 vdhs_data dat;
3334 dat.worklist = &worklist;
3335 dat.loop_vinfo = loop_vinfo;
3336 memset (&wi, 0, sizeof (wi));
3337 wi.info = (void *)&dat;
3338 while (!worklist.is_empty ())
3339 {
3340 stmt_vec_info stmt_info = worklist.pop ();
3341 /* Since SSA operands are not set up for pattern stmts we need
3342 to use walk_gimple_op. */
3343 wi.is_lhs = 0;
3344 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3345 }
3346 }
3347
3348
3349 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3350
3351 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3352 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs), roots (vNULL)
3353 {
3354 for (unsigned i = 0; i < bbs.length (); ++i)
3355 {
3356 if (i != 0)
3357 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3358 gsi_next (&si))
3359 {
3360 gphi *phi = si.phi ();
3361 gimple_set_uid (phi, 0);
3362 add_stmt (phi);
3363 }
3364 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3365 !gsi_end_p (gsi); gsi_next (&gsi))
3366 {
3367 gimple *stmt = gsi_stmt (gsi);
3368 gimple_set_uid (stmt, 0);
3369 if (is_gimple_debug (stmt))
3370 continue;
3371 add_stmt (stmt);
3372 }
3373 }
3374 }
3375
3376
3377 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3378 stmts in the basic block. */
3379
3380 _bb_vec_info::~_bb_vec_info ()
3381 {
3382 /* Reset region marker. */
3383 for (unsigned i = 0; i < bbs.length (); ++i)
3384 {
3385 if (i != 0)
3386 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3387 gsi_next (&si))
3388 {
3389 gphi *phi = si.phi ();
3390 gimple_set_uid (phi, -1);
3391 }
3392 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3393 !gsi_end_p (gsi); gsi_next (&gsi))
3394 {
3395 gimple *stmt = gsi_stmt (gsi);
3396 gimple_set_uid (stmt, -1);
3397 }
3398 }
3399
3400 for (unsigned i = 0; i < roots.length (); ++i)
3401 roots[i].stmts.release ();
3402 roots.release ();
3403 }
3404
3405 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3406 given that child nodes have already been processed, and that
3407 their def types currently match their SLP node's def type. */
3408
3409 static bool
3410 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3411 slp_instance node_instance,
3412 stmt_vector_for_cost *cost_vec)
3413 {
3414 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3415 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3416
3417 /* Calculate the number of vector statements to be created for the
3418 scalar stmts in this node. For SLP reductions it is equal to the
3419 number of vector statements in the children (which has already been
3420 calculated by the recursive call). Otherwise it is the number of
3421 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3422 VF divided by the number of elements in a vector. */
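  /* For example, with a vectorization factor of 4, two lanes in the node
     and a 4-element vector type the latter case yields 4 * 2 / 4 == 2
     vector stmts.  */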
3423 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3424 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3425 {
3426 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3427 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3428 {
3429 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3430 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3431 break;
3432 }
3433 }
3434 else
3435 {
3436 poly_uint64 vf;
3437 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3438 vf = loop_vinfo->vectorization_factor;
3439 else
3440 vf = 1;
3441 unsigned int group_size = SLP_TREE_LANES (node);
3442 tree vectype = SLP_TREE_VECTYPE (node);
3443 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3444 = vect_get_num_vectors (vf * group_size, vectype);
3445 }
3446
3447 /* Handle purely internal nodes. */
3448 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3449 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3450
3451 if (is_a <bb_vec_info> (vinfo)
3452 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3453 {
3454 if (dump_enabled_p ())
3455 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3456 "desired vector type conflicts with earlier one "
3457 "for %G", stmt_info->stmt);
3458 return false;
3459 }
3460
3461 bool dummy;
3462 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3463 node, node_instance, cost_vec);
3464 }
3465
3466 /* Try to build NODE from scalars, returning true on success.
3467 NODE_INSTANCE is the SLP instance that contains NODE. */
3468
3469 static bool
3470 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3471 slp_instance node_instance)
3472 {
3473 stmt_vec_info stmt_info;
3474 unsigned int i;
3475
3476 if (!is_a <bb_vec_info> (vinfo)
3477 || node == SLP_INSTANCE_TREE (node_instance)
3478 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3479 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3480 return false;
3481
3482 if (dump_enabled_p ())
3483 dump_printf_loc (MSG_NOTE, vect_location,
3484 "Building vector operands of %p from scalars instead\n", node);
3485
3486 /* Don't remove and free the child nodes here, since they could be
3487 referenced by other structures. The analysis and scheduling phases
3488 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3489 unsigned int group_size = SLP_TREE_LANES (node);
3490 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3491 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3492 SLP_TREE_LOAD_PERMUTATION (node).release ();
3493 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3494 {
3495 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3496 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3497 }
3498 return true;
3499 }
3500
3501 /* Compute the prologue cost for invariant or constant operands represented
3502 by NODE. */
3503
3504 static void
3505 vect_prologue_cost_for_slp (slp_tree node,
3506 stmt_vector_for_cost *cost_vec)
3507 {
3508 /* There's a special case of an existing vector, which costs nothing. */
3509 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3510 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3511 return;
3512 /* Without looking at the actual initializer a vector of
3513 constants can be implemented as a load from the constant pool.
3514 When all elements are the same we can use a splat. */
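/* As an illustration of the costing below (not tied to any particular
   target costs): an external operand node {x_1, x_1, x_1, x_1} is costed
   as one scalar_to_vec (a splat), {x_1, y_2, x_1, y_2} as one
   vec_construct, and a constant node such as {1, 2, 3, 4} as one
   vector_load from the constant pool.  */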
3515 tree vectype = SLP_TREE_VECTYPE (node);
3516 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3517 unsigned num_vects_to_check;
3518 unsigned HOST_WIDE_INT const_nunits;
3519 unsigned nelt_limit;
3520 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3521 && ! multiple_p (const_nunits, group_size))
3522 {
3523 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3524 nelt_limit = const_nunits;
3525 }
3526 else
3527 {
3528 /* If either the vector has variable length or the vectors
3529 are composed of repeated whole groups we only need to
3530 cost construction once. All vectors will be the same. */
3531 num_vects_to_check = 1;
3532 nelt_limit = group_size;
3533 }
3534 tree elt = NULL_TREE;
3535 unsigned nelt = 0;
3536 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3537 {
3538 unsigned si = j % group_size;
3539 if (nelt == 0)
3540 elt = SLP_TREE_SCALAR_OPS (node)[si];
3541 /* ??? We're just tracking whether all operands of a single
3542 vector initializer are the same, ideally we'd check if
3543 we emitted the same one already. */
3544 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3545 elt = NULL_TREE;
3546 nelt++;
3547 if (nelt == nelt_limit)
3548 {
3549 record_stmt_cost (cost_vec, 1,
3550 SLP_TREE_DEF_TYPE (node) == vect_external_def
3551 ? (elt ? scalar_to_vec : vec_construct)
3552 : vector_load,
3553 NULL, vectype, 0, vect_prologue);
3554 nelt = 0;
3555 }
3556 }
3557 }
3558
3559 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3560 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
3561
3562 Return true if the operations are supported. */
3563
3564 static bool
3565 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3566 slp_instance node_instance,
3567 hash_set<slp_tree> &visited_set,
3568 vec<slp_tree> &visited_vec,
3569 stmt_vector_for_cost *cost_vec)
3570 {
3571 int i, j;
3572 slp_tree child;
3573
3574 /* Assume we can code-generate all invariants. */
3575 if (!node
3576 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
3577 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
3578 return true;
3579
3580 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
3581 {
3582 if (dump_enabled_p ())
3583 dump_printf_loc (MSG_NOTE, vect_location,
3584 "Failed cyclic SLP reference in %p", node);
3585 return false;
3586 }
3587 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
3588
3589 /* If we already analyzed the exact same set of scalar stmts we're done.
3590 We share the generated vector stmts for those. */
3591 if (visited_set.add (node))
3592 return true;
3593 visited_vec.safe_push (node);
3594
3595 bool res = true;
3596 unsigned visited_rec_start = visited_vec.length ();
3597 unsigned cost_vec_rec_start = cost_vec->length ();
3598 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3599 {
3600 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3601 visited_set, visited_vec,
3602 cost_vec);
3603 if (!res)
3604 break;
3605 }
3606
3607 if (res)
3608 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3609 cost_vec);
3610 /* If analysis failed we have to pop all recursive visited nodes
3611 plus ourselves. */
3612 if (!res)
3613 {
3614 while (visited_vec.length () >= visited_rec_start)
3615 visited_set.remove (visited_vec.pop ());
3616 cost_vec->truncate (cost_vec_rec_start);
3617 }
3618
3619 /* When the node can be vectorized, cost the invariant nodes it references.
3620 This is not done in DFS order to allow the referring node's
3621 vectorizable_* calls to nail down the invariant nodes' vector type
3622 and possibly unshare it if it needs a different vector type than
3623 other referrers. */
3624 if (res)
3625 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3626 if (child
3627 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
3628 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3629 /* Perform usual caching, note code-generation still
3630 code-gens these nodes multiple times but we expect
3631 to CSE them later. */
3632 && !visited_set.add (child))
3633 {
3634 visited_vec.safe_push (child);
3635 /* ??? After auditing more code paths make a "default"
3636 and push the vector type from NODE to all children
3637 if it is not already set. */
3638 /* Compute the number of vectors to be generated. */
3639 tree vector_type = SLP_TREE_VECTYPE (child);
3640 if (!vector_type)
3641 {
3642 /* For shifts with a scalar argument we don't need
3643 to cost or code-generate anything.
3644 ??? Represent this more explicitly. */
3645 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3646 == shift_vec_info_type)
3647 && j == 1);
3648 continue;
3649 }
3650 unsigned group_size = SLP_TREE_LANES (child);
3651 poly_uint64 vf = 1;
3652 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3653 vf = loop_vinfo->vectorization_factor;
3654 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3655 = vect_get_num_vectors (vf * group_size, vector_type);
3656 /* And cost them. */
3657 vect_prologue_cost_for_slp (child, cost_vec);
3658 }
3659
3660 /* If this node or any of its children can't be vectorized, try pruning
3661 the tree here rather than felling the whole thing. */
3662 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3663 {
3664 /* We'll need to revisit this for invariant costing and number
3665 of vectorized stmt setting. */
3666 res = true;
3667 }
3668
3669 return res;
3670 }
3671
3672
3673 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3674 region and that can be vectorized using vectorizable_live_operation
3675 with STMT_VINFO_LIVE_P. Live operations we cannot handle cause the
3676 scalar code computing them to be retained. */
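/* For example (hypothetical): if lane 1 of a node computes a_2 and a_2 is
   also used by a stmt outside the vectorized region, a_2's stmt is marked
   STMT_VINFO_LIVE_P so that a lane extract (a BIT_FIELD_REF from the
   vector def) can be emitted for that use instead of retaining the whole
   scalar computation.  */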
3677
3678 static void
3679 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3680 slp_instance instance,
3681 stmt_vector_for_cost *cost_vec,
3682 hash_set<stmt_vec_info> &svisited,
3683 hash_set<slp_tree> &visited)
3684 {
3685 if (visited.add (node))
3686 return;
3687
3688 unsigned i;
3689 stmt_vec_info stmt_info;
3690 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
3691 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3692 {
3693 if (svisited.contains (stmt_info))
3694 continue;
3695 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3696 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
3697 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
3698 /* Only the pattern root stmt computes the original scalar value. */
3699 continue;
3700 bool mark_visited = true;
3701 gimple *orig_stmt = orig_stmt_info->stmt;
3702 ssa_op_iter op_iter;
3703 def_operand_p def_p;
3704 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3705 {
3706 imm_use_iterator use_iter;
3707 gimple *use_stmt;
3708 stmt_vec_info use_stmt_info;
3709 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3710 if (!is_gimple_debug (use_stmt))
3711 {
3712 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
3713 if (!use_stmt_info
3714 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3715 {
3716 STMT_VINFO_LIVE_P (stmt_info) = true;
3717 if (vectorizable_live_operation (bb_vinfo, stmt_info,
3718 NULL, node, instance, i,
3719 false, cost_vec))
3720 /* ??? So we know we can vectorize the live stmt
3721 from one SLP node. If we cannot do so from all
3722 or none consistently we'd have to record which
3723 SLP node (and lane) we want to use for the live
3724 operation. So make sure we can code-generate
3725 from all nodes. */
3726 mark_visited = false;
3727 else
3728 STMT_VINFO_LIVE_P (stmt_info) = false;
3729 BREAK_FROM_IMM_USE_STMT (use_iter);
3730 }
3731 }
3732 /* We have to verify whether we can insert the lane extract
3733 before all uses. The following is a conservative approximation.
3734 We cannot put this into vectorizable_live_operation because
3735 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3736 doesn't work.
3737 Note that while the fact that we emit code for loads at the
3738 first load should make this a non-problem, leafs we construct
3739 from scalars are vectorized after the last scalar def.
3740 ??? If we'd actually compute the insert location during
3741 analysis we could use something less conservative than the last
3742 scalar stmt in the node for the dominance check. */
3743 /* ??? What remains is "live" uses in vector CTORs in the same
3744 SLP graph which is where those uses can end up code-generated
3745 right after their definition instead of close to their original
3746 use. But that would restrict us to code-generate lane-extracts
3747 from the latest stmt in a node. So we compensate for this
3748 during code-generation, simply not replacing uses for those
3749 hopefully rare cases. */
3750 if (STMT_VINFO_LIVE_P (stmt_info))
3751 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3752 if (!is_gimple_debug (use_stmt)
3753 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
3754 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3755 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
3756 {
3757 if (dump_enabled_p ())
3758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3759 "Cannot determine insertion place for "
3760 "lane extract\n");
3761 STMT_VINFO_LIVE_P (stmt_info) = false;
3762 mark_visited = true;
3763 }
3764 }
3765 if (mark_visited)
3766 svisited.add (stmt_info);
3767 }
3768
3769 slp_tree child;
3770 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3771 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3772 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
3773 cost_vec, svisited, visited);
3774 }
3775
3776 /* Analyze statements in SLP instances of VINFO. Return true if the
3777 operations are supported. */
3778
3779 bool
3780 vect_slp_analyze_operations (vec_info *vinfo)
3781 {
3782 slp_instance instance;
3783 int i;
3784
3785 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3786
3787 hash_set<slp_tree> visited;
3788 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
3789 {
3790 auto_vec<slp_tree> visited_vec;
3791 stmt_vector_for_cost cost_vec;
3792 cost_vec.create (2);
3793 if (is_a <bb_vec_info> (vinfo))
3794 vect_location = instance->location ();
3795 if (!vect_slp_analyze_node_operations (vinfo,
3796 SLP_INSTANCE_TREE (instance),
3797 instance, visited, visited_vec,
3798 &cost_vec)
3799 /* Instances with a root stmt require vectorized defs for the
3800 SLP tree root. */
3801 || (SLP_INSTANCE_ROOT_STMT (instance)
3802 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
3803 != vect_internal_def)))
3804 {
3805 slp_tree node = SLP_INSTANCE_TREE (instance);
3806 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3807 if (dump_enabled_p ())
3808 dump_printf_loc (MSG_NOTE, vect_location,
3809 "removing SLP instance operations starting from: %G",
3810 stmt_info->stmt);
3811 vect_free_slp_instance (instance);
3812 vinfo->slp_instances.ordered_remove (i);
3813 cost_vec.release ();
3814 while (!visited_vec.is_empty ())
3815 visited.remove (visited_vec.pop ());
3816 }
3817 else
3818 {
3819 i++;
3820
3821 /* For BB vectorization remember the SLP graph entry
3822 cost for later. */
3823 if (is_a <bb_vec_info> (vinfo))
3824 instance->cost_vec = cost_vec;
3825 else
3826 {
3827 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
3828 cost_vec.release ();
3829 }
3830 }
3831 }
3832
3833 /* Compute vectorizable live stmts. */
3834 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3835 {
3836 hash_set<stmt_vec_info> svisited;
3837 hash_set<slp_tree> visited;
3838 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
3839 {
3840 vect_location = instance->location ();
3841 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
3842 instance, &instance->cost_vec, svisited,
3843 visited);
3844 }
3845 }
3846
3847 return !vinfo->slp_instances.is_empty ();
3848 }
3849
3850 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
3851 transitively compressing any chain of intermediate leaders on the way. */
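/* A small worked example: with INSTANCE_LEADER containing A -> B, B -> C
   and C -> C, get_ultimate_leader (A) walks the chain to C and rewrites
   the visited entries to A -> C and B -> C before returning C.  */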
3852
3853 static slp_instance
3854 get_ultimate_leader (slp_instance instance,
3855 hash_map<slp_instance, slp_instance> &instance_leader)
3856 {
3857 auto_vec<slp_instance *, 8> chain;
3858 slp_instance *tem;
3859 while (*(tem = instance_leader.get (instance)) != instance)
3860 {
3861 chain.safe_push (tem);
3862 instance = *tem;
3863 }
3864 while (!chain.is_empty ())
3865 *chain.pop () = instance;
3866 return instance;
3867 }
3868
3869 /* Worker of vect_bb_partition_graph, recurse on NODE. */
3870
3871 static void
3872 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
3873 slp_instance instance, slp_tree node,
3874 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
3875 hash_map<slp_instance, slp_instance> &instance_leader,
3876 hash_set<slp_tree> &visited)
3877 {
3878 stmt_vec_info stmt_info;
3879 unsigned i;
3880
3881 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3882 {
3883 bool existed_p;
3884 slp_instance &stmt_instance
3885 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
3886 if (!existed_p)
3887 ;
3888 else if (stmt_instance != instance)
3889 {
3890 /* If we're running into a previously marked stmt make us the
3891 leader of the current ultimate leader. This keeps the
3892 leader chain acyclic and works even when the current instance
3893 connects two previously independent graph parts. */
3894 slp_instance stmt_leader
3895 = get_ultimate_leader (stmt_instance, instance_leader);
3896 if (stmt_leader != instance)
3897 instance_leader.put (stmt_leader, instance);
3898 }
3899 stmt_instance = instance;
3900 }
3901
3902 if (visited.add (node))
3903 return;
3904
3905 slp_tree child;
3906 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3907 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3908 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
3909 instance_leader, visited);
3910 }
3911
3912 /* Partition the SLP graph into pieces that can be costed independently. */
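/* For example (made up): if SLP instances I1 and I2 both cover the scalar
   stmt s_5, the walk below makes one the leader of the other, so both end
   up in the same subgraph and are later costed together by
   vect_bb_vectorization_profitable_p.  */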
3913
3914 static void
3915 vect_bb_partition_graph (bb_vec_info bb_vinfo)
3916 {
3917 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3918
3919 /* First walk the SLP graph assigning each involved scalar stmt a
3920 corresponding SLP graph entry and upon visiting a previously
3921 marked stmt, make the stmt's leader the current SLP graph entry. */
3922 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
3923 hash_map<slp_instance, slp_instance> instance_leader;
3924 hash_set<slp_tree> visited;
3925 slp_instance instance;
3926 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3927 {
3928 instance_leader.put (instance, instance);
3929 vect_bb_partition_graph_r (bb_vinfo,
3930 instance, SLP_INSTANCE_TREE (instance),
3931 stmt_to_instance, instance_leader,
3932 visited);
3933 }
3934
3935 /* Then collect entries to each independent subgraph. */
3936 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3937 {
3938 slp_instance leader = get_ultimate_leader (instance, instance_leader);
3939 leader->subgraph_entries.safe_push (instance);
3940 if (dump_enabled_p ()
3941 && leader != instance)
3942 dump_printf_loc (MSG_NOTE, vect_location,
3943 "instance %p is leader of %p\n",
3944 leader, instance);
3945 }
3946 }
3947
3948 /* Compute the scalar cost of the SLP node NODE and its children
3949 and record it in COST_VEC. Do not account defs that are marked in LIFE
3950 and update LIFE according to uses of NODE. */
3951
3952 static void
3953 vect_bb_slp_scalar_cost (vec_info *vinfo,
3954 slp_tree node, vec<bool, va_heap> *life,
3955 stmt_vector_for_cost *cost_vec,
3956 hash_set<slp_tree> &visited)
3957 {
3958 unsigned i;
3959 stmt_vec_info stmt_info;
3960 slp_tree child;
3961
3962 if (visited.add (node))
3963 return;
3964
3965 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3966 {
3967 ssa_op_iter op_iter;
3968 def_operand_p def_p;
3969
3970 if ((*life)[i])
3971 continue;
3972
3973 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3974 gimple *orig_stmt = orig_stmt_info->stmt;
3975
3976 /* If there is a non-vectorized use of the defs then the scalar
3977 stmt is kept live in which case we do not account it or any
3978 required defs in the SLP children in the scalar cost. This
3979 way we make the vectorization more costly when compared to
3980 the scalar cost. */
3981 if (!STMT_VINFO_LIVE_P (stmt_info))
3982 {
3983 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3984 {
3985 imm_use_iterator use_iter;
3986 gimple *use_stmt;
3987 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3988 if (!is_gimple_debug (use_stmt))
3989 {
3990 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3991 if (!use_stmt_info
3992 || !PURE_SLP_STMT
3993 (vect_stmt_to_vectorize (use_stmt_info)))
3994 {
3995 (*life)[i] = true;
3996 BREAK_FROM_IMM_USE_STMT (use_iter);
3997 }
3998 }
3999 }
4000 if ((*life)[i])
4001 continue;
4002 }
4003
4004 /* Count scalar stmts only once. */
4005 if (gimple_visited_p (orig_stmt))
4006 continue;
4007 gimple_set_visited (orig_stmt, true);
4008
4009 vect_cost_for_stmt kind;
4010 if (STMT_VINFO_DATA_REF (orig_stmt_info))
4011 {
4012 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
4013 kind = scalar_load;
4014 else
4015 kind = scalar_store;
4016 }
4017 else if (vect_nop_conversion_p (orig_stmt_info))
4018 continue;
4019 else
4020 kind = scalar_stmt;
4021 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
4022 SLP_TREE_VECTYPE (node), 0, vect_body);
4023 }
4024
4025 auto_vec<bool, 20> subtree_life;
4026 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4027 {
4028 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4029 {
4030 /* Do not directly pass LIFE to the recursive call, copy it to
4031 confine changes in the callee to the current child/subtree. */
4032 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4033 {
4034 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
4035 for (unsigned j = 0;
4036 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
4037 {
4038 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
4039 if (perm.first == i)
4040 subtree_life[perm.second] = (*life)[j];
4041 }
4042 }
4043 else
4044 {
4045 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
4046 subtree_life.safe_splice (*life);
4047 }
4048 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
4049 visited);
4050 subtree_life.truncate (0);
4051 }
4052 }
4053 }
4054
4055 /* Check if vectorization of the basic block is profitable for the
4056 subgraph denoted by SLP_INSTANCES. */
4057
4058 static bool
4059 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
4060 vec<slp_instance> slp_instances)
4061 {
4062 slp_instance instance;
4063 int i;
4064 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
4065 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
4066
4067 void *vect_target_cost_data = init_cost (NULL);
4068
4069 /* Calculate scalar cost and sum the cost for the vector stmts
4070 previously collected. */
4071 stmt_vector_for_cost scalar_costs;
4072 scalar_costs.create (0);
4073 hash_set<slp_tree> visited;
4074 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4075 {
4076 auto_vec<bool, 20> life;
4077 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
4078 true);
4079 vect_bb_slp_scalar_cost (bb_vinfo,
4080 SLP_INSTANCE_TREE (instance),
4081 &life, &scalar_costs, visited);
4082 add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
4083 instance->cost_vec.release ();
4084 }
4085 /* Unset visited flag. */
4086 stmt_info_for_cost *si;
4087 FOR_EACH_VEC_ELT (scalar_costs, i, si)
4088 gimple_set_visited (si->stmt_info->stmt, false);
4089
4090 void *scalar_target_cost_data = init_cost (NULL);
4091 add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
4092 scalar_costs.release ();
4093 unsigned dummy;
4094 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
4095 destroy_cost_data (scalar_target_cost_data);
4096
4097 /* Complete the target-specific vector cost calculation. */
4098 finish_cost (vect_target_cost_data, &vec_prologue_cost,
4099 &vec_inside_cost, &vec_epilogue_cost);
4100 destroy_cost_data (vect_target_cost_data);
4101
4102 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
4103
4104 if (dump_enabled_p ())
4105 {
4106 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4107 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
4108 vec_inside_cost);
4109 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
4110 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
4111 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
4112 }
4113
4114 /* Vectorization is profitable if its cost is no more than the cost of the
4115 scalar version. Note that we err on the vector side for equal cost because
4116 the cost estimate is otherwise quite pessimistic (constant uses are
4117 free on the scalar side but cost a load on the vector side for
4118 example). */
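/* A made-up numeric example: with vec_inside_cost == 8, vec_prologue_cost
   == 2, vec_epilogue_cost == 2 and scalar_cost == 12 we get 8 + 4 == 12,
   which is not greater than 12, so the subgraph is considered profitable;
   a total vector cost of 13 would not be.  */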
4119 if (vec_outside_cost + vec_inside_cost > scalar_cost)
4120 return false;
4121
4122 return true;
4123 }
4124
4125 /* qsort comparator for lane defs. */
4126
4127 static int
4128 vld_cmp (const void *a_, const void *b_)
4129 {
4130 auto *a = (const std::pair<unsigned, tree> *)a_;
4131 auto *b = (const std::pair<unsigned, tree> *)b_;
4132 return a->first - b->first;
4133 }
4134
4135 /* Return true if USE_STMT is a vector lane insert into VEC and set
4136 *THIS_LANE to the lane number that is set. */
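/* For example (illustrative SSA names, assuming 32-bit vector elements):
     v_2 = BIT_INSERT_EXPR <v_1, s_3, 64>;
   inserts s_3 at bit position 64, i.e. into lane 64 / 32 == 2, so
   *THIS_LANE is set to 2.  */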
4137
4138 static bool
4139 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
4140 {
4141 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
4142 if (!use_ass
4143 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
4144 || (vec
4145 ? gimple_assign_rhs1 (use_ass) != vec
4146 : ((vec = gimple_assign_rhs1 (use_ass)), false))
4147 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
4148 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
4149 || !constant_multiple_p
4150 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
4151 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
4152 this_lane))
4153 return false;
4154 return true;
4155 }
4156
4157 /* Find any vectorizable constructors and add them to the grouped_store
4158 array; also record chains of BIT_INSERT_EXPRs as SLP roots. */
4159
4160 static void
4161 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
4162 {
4163 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
4164 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
4165 !gsi_end_p (gsi); gsi_next (&gsi))
4166 {
4167 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
4168 if (!assign)
4169 continue;
4170
4171 tree rhs = gimple_assign_rhs1 (assign);
4172 if (gimple_assign_rhs_code (assign) == CONSTRUCTOR)
4173 {
4174 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
4175 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
4176 CONSTRUCTOR_NELTS (rhs))
4177 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
4178 || uniform_vector_p (rhs))
4179 continue;
4180
4181 unsigned j;
4182 tree val;
4183 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
4184 if (TREE_CODE (val) != SSA_NAME
4185 || !bb_vinfo->lookup_def (val))
4186 break;
4187 if (j != CONSTRUCTOR_NELTS (rhs))
4188 continue;
4189
4190 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
4191 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
4192 }
4193 else if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR
4194 && VECTOR_TYPE_P (TREE_TYPE (rhs))
4195 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
4196 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
4197 && integer_zerop (gimple_assign_rhs3 (assign))
4198 && useless_type_conversion_p
4199 (TREE_TYPE (TREE_TYPE (rhs)),
4200 TREE_TYPE (gimple_assign_rhs2 (assign)))
4201 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
4202 {
4203 /* We start to match on insert to lane zero but since the
4204 inserts need not be ordered we'd have to search both
4205 the def and the use chains. */
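/* A sketch of the kind of sequence matched here (SSA names made up,
   assuming 32-bit lanes); the inserts need not be in lane order:
     v_1 = BIT_INSERT_EXPR <v_0, a_0, 0>;
     v_2 = BIT_INSERT_EXPR <v_1, a_2, 64>;
     v_3 = BIT_INSERT_EXPR <v_2, a_1, 32>;
     v_4 = BIT_INSERT_EXPR <v_3, a_3, 96>;
   Once all lanes are covered the defining stmts of a_0 ... a_3, sorted
   by lane, are recorded as an SLP root with the last insert as the root
   stmt.  */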
4206 tree vectype = TREE_TYPE (rhs);
4207 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4208 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
4209 auto_sbitmap lanes (nlanes);
4210 bitmap_clear (lanes);
4211 bitmap_set_bit (lanes, 0);
4212 tree def = gimple_assign_lhs (assign);
4213 lane_defs.quick_push
4214 (std::make_pair (0, gimple_assign_rhs2 (assign)));
4215 unsigned lanes_found = 1;
4216 /* Start with the use chains, the last stmt will be the root. */
4217 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
4218 do
4219 {
4220 use_operand_p use_p;
4221 gimple *use_stmt;
4222 if (!single_imm_use (def, &use_p, &use_stmt))
4223 break;
4224 unsigned this_lane;
4225 if (!bb_vinfo->lookup_stmt (use_stmt)
4226 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
4227 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
4228 break;
4229 if (bitmap_bit_p (lanes, this_lane))
4230 break;
4231 lanes_found++;
4232 bitmap_set_bit (lanes, this_lane);
4233 gassign *use_ass = as_a <gassign *> (use_stmt);
4234 lane_defs.quick_push (std::make_pair
4235 (this_lane, gimple_assign_rhs2 (use_ass)));
4236 last = bb_vinfo->lookup_stmt (use_ass);
4237 def = gimple_assign_lhs (use_ass);
4238 }
4239 while (lanes_found < nlanes);
4240 if (lanes_found < nlanes)
4241 {
4242 /* Now search the def chain. */
4243 def = gimple_assign_rhs1 (assign);
4244 do
4245 {
4246 if (TREE_CODE (def) != SSA_NAME
4247 || !has_single_use (def))
4248 break;
4249 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
4250 unsigned this_lane;
4251 if (!bb_vinfo->lookup_stmt (def_stmt)
4252 || !vect_slp_is_lane_insert (def_stmt,
4253 NULL_TREE, &this_lane)
4254 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
4255 break;
4256 if (bitmap_bit_p (lanes, this_lane))
4257 break;
4258 lanes_found++;
4259 bitmap_set_bit (lanes, this_lane);
4260 lane_defs.quick_push (std::make_pair
4261 (this_lane,
4262 gimple_assign_rhs2 (def_stmt)));
4263 def = gimple_assign_rhs1 (def_stmt);
4264 }
4265 while (lanes_found < nlanes);
4266 }
4267 if (lanes_found == nlanes)
4268 {
4269 /* Sort lane_defs by the lane index and register the root. */
4270 lane_defs.qsort (vld_cmp);
4271 vec<stmt_vec_info> stmts;
4272 stmts.create (nlanes);
4273 for (unsigned i = 0; i < nlanes; ++i)
4274 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
4275 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
4276 stmts, last));
4277 }
4278 }
4279 }
4280 }
4281
4282 /* Walk the grouped store chains and replace entries with their
4283 pattern variant if any. */
4284
4285 static void
4286 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
4287 {
4288 stmt_vec_info first_element;
4289 unsigned i;
4290
4291 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4292 {
4293 /* We also have CTORs in this array. */
4294 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
4295 continue;
4296 if (STMT_VINFO_IN_PATTERN_P (first_element))
4297 {
4298 stmt_vec_info orig = first_element;
4299 first_element = STMT_VINFO_RELATED_STMT (first_element);
4300 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
4301 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
4302 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
4303 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
4304 vinfo->grouped_stores[i] = first_element;
4305 }
4306 stmt_vec_info prev = first_element;
4307 while (DR_GROUP_NEXT_ELEMENT (prev))
4308 {
4309 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
4310 if (STMT_VINFO_IN_PATTERN_P (elt))
4311 {
4312 stmt_vec_info orig = elt;
4313 elt = STMT_VINFO_RELATED_STMT (elt);
4314 DR_GROUP_NEXT_ELEMENT (prev) = elt;
4315 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
4316 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
4317 }
4318 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
4319 prev = elt;
4320 }
4321 }
4322 }
4323
4324 /* Check if the region described by BB_VINFO can be vectorized, returning
4325 true if so. When returning false, set FATAL to true if the same failure
4326 would prevent vectorization at other vector sizes, false if it is still
4327 worth trying other sizes. N_STMTS is the number of statements in the
4328 region. */
4329
4330 static bool
4331 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
4332 vec<int> *dataref_groups)
4333 {
4334 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4335
4336 slp_instance instance;
4337 int i;
4338 poly_uint64 min_vf = 2;
4339
4340 /* The first group of checks is independent of the vector size. */
4341 fatal = true;
4342
4343 /* Analyze the data references. */
4344
4345 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
4346 {
4347 if (dump_enabled_p ())
4348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4349 "not vectorized: unhandled data-ref in basic "
4350 "block.\n");
4351 return false;
4352 }
4353
4354 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
4355 {
4356 if (dump_enabled_p ())
4357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4358 "not vectorized: unhandled data access in "
4359 "basic block.\n");
4360 return false;
4361 }
4362
4363 vect_slp_check_for_constructors (bb_vinfo);
4364
4365 /* If there are no grouped stores and no constructors in the region
4366 there is no need to continue with pattern recog as vect_analyze_slp
4367 will fail anyway. */
4368 if (bb_vinfo->grouped_stores.is_empty ()
4369 && bb_vinfo->roots.is_empty ())
4370 {
4371 if (dump_enabled_p ())
4372 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4373 "not vectorized: no grouped stores in "
4374 "basic block.\n");
4375 return false;
4376 }
4377
4378 /* The rest of the analysis below depends on the vector size in some way. */
4379 fatal = false;
4380
4381 vect_pattern_recog (bb_vinfo);
4382
4383 /* Update store groups from pattern processing. */
4384 vect_fixup_store_groups_with_patterns (bb_vinfo);
4385
4386 /* Check the SLP opportunities in the basic block, analyze and build SLP
4387 trees. */
4388 if (!vect_analyze_slp (bb_vinfo, n_stmts))
4389 {
4390 if (dump_enabled_p ())
4391 {
4392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4393 "Failed to SLP the basic block.\n");
4394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4395 "not vectorized: failed to find SLP opportunities "
4396 "in basic block.\n");
4397 }
4398 return false;
4399 }
4400
4401 /* Optimize permutations. */
4402 vect_optimize_slp (bb_vinfo);
4403
4404 /* Gather the loads reachable from the SLP graph entries. */
4405 vect_gather_slp_loads (bb_vinfo);
4406
4407 vect_record_base_alignments (bb_vinfo);
4408
4409 /* Analyze and verify the alignment of data references and the
4410 dependence in the SLP instances. */
4411 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
4412 {
4413 vect_location = instance->location ();
4414 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
4415 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
4416 {
4417 slp_tree node = SLP_INSTANCE_TREE (instance);
4418 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4419 if (dump_enabled_p ())
4420 dump_printf_loc (MSG_NOTE, vect_location,
4421 "removing SLP instance operations starting from: %G",
4422 stmt_info->stmt);
4423 vect_free_slp_instance (instance);
4424 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
4425 continue;
4426 }
4427
4428 /* Mark all the statements that we want to vectorize as pure SLP and
4429 relevant. */
4430 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4431 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
4432 if (stmt_vec_info root = SLP_INSTANCE_ROOT_STMT (instance))
4433 {
4434 STMT_SLP_TYPE (root) = pure_slp;
4435 if (is_gimple_assign (root->stmt)
4436 && gimple_assign_rhs_code (root->stmt) == BIT_INSERT_EXPR)
4437 {
4438 /* ??? We should probably record the whole vector of
4439 root stmts so we do not have to back-track here... */
4440 for (unsigned n = SLP_TREE_LANES (SLP_INSTANCE_TREE (instance));
4441 n != 1; --n)
4442 {
4443 root = bb_vinfo->lookup_def (gimple_assign_rhs1 (root->stmt));
4444 STMT_SLP_TYPE (root) = pure_slp;
4445 }
4446 }
4447 }
4448
4449 i++;
4450 }
4451 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
4452 return false;
4453
4454 if (!vect_slp_analyze_operations (bb_vinfo))
4455 {
4456 if (dump_enabled_p ())
4457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4458 "not vectorized: bad operation in basic block.\n");
4459 return false;
4460 }
4461
4462 vect_bb_partition_graph (bb_vinfo);
4463
4464 return true;
4465 }
4466
4467 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4468 basic blocks in BBS, returning true on success.
4469 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4470
4471 static bool
4472 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
4473 vec<int> *dataref_groups, unsigned int n_stmts)
4474 {
4475 bb_vec_info bb_vinfo;
4476 auto_vector_modes vector_modes;
4477
4478 /* Autodetect first vector size we try. */
4479 machine_mode next_vector_mode = VOIDmode;
4480 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
4481 unsigned int mode_i = 0;
4482
4483 vec_info_shared shared;
4484
4485 machine_mode autodetected_vector_mode = VOIDmode;
4486 while (1)
4487 {
4488 bool vectorized = false;
4489 bool fatal = false;
4490 bb_vinfo = new _bb_vec_info (bbs, &shared);
4491
4492 bool first_time_p = shared.datarefs.is_empty ();
4493 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
4494 if (first_time_p)
4495 bb_vinfo->shared->save_datarefs ();
4496 else
4497 bb_vinfo->shared->check_datarefs ();
4498 bb_vinfo->vector_mode = next_vector_mode;
4499
4500 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
4501 && dbg_cnt (vect_slp))
4502 {
4503 if (dump_enabled_p ())
4504 {
4505 dump_printf_loc (MSG_NOTE, vect_location,
4506 "***** Analysis succeeded with vector mode"
4507 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4508 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4509 }
4510
4511 bb_vinfo->shared->check_datarefs ();
4512
4513 unsigned i;
4514 slp_instance instance;
4515 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4516 {
4517 if (instance->subgraph_entries.is_empty ())
4518 continue;
4519
4520 vect_location = instance->location ();
4521 if (!unlimited_cost_model (NULL)
4522 && !vect_bb_vectorization_profitable_p
4523 (bb_vinfo, instance->subgraph_entries))
4524 {
4525 if (dump_enabled_p ())
4526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4527 "not vectorized: vectorization is not "
4528 "profitable.\n");
4529 continue;
4530 }
4531
4532 if (!vectorized && dump_enabled_p ())
4533 dump_printf_loc (MSG_NOTE, vect_location,
4534 "Basic block will be vectorized "
4535 "using SLP\n");
4536 vectorized = true;
4537
4538 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4539
4540 unsigned HOST_WIDE_INT bytes;
4541 if (dump_enabled_p ())
4542 {
4543 if (GET_MODE_SIZE
4544 (bb_vinfo->vector_mode).is_constant (&bytes))
4545 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4546 "basic block part vectorized using %wu "
4547 "byte vectors\n", bytes);
4548 else
4549 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4550 "basic block part vectorized using "
4551 "variable length vectors\n");
4552 }
4553 }
4554 }
4555 else
4556 {
4557 if (dump_enabled_p ())
4558 dump_printf_loc (MSG_NOTE, vect_location,
4559 "***** Analysis failed with vector mode %s\n",
4560 GET_MODE_NAME (bb_vinfo->vector_mode));
4561 }
4562
4563 if (mode_i == 0)
4564 autodetected_vector_mode = bb_vinfo->vector_mode;
4565
4566 if (!fatal)
4567 while (mode_i < vector_modes.length ()
4568 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
4569 {
4570 if (dump_enabled_p ())
4571 dump_printf_loc (MSG_NOTE, vect_location,
4572 "***** The result for vector mode %s would"
4573 " be the same\n",
4574 GET_MODE_NAME (vector_modes[mode_i]));
4575 mode_i += 1;
4576 }
4577
4578 delete bb_vinfo;
4579
4580 if (mode_i < vector_modes.length ()
4581 && VECTOR_MODE_P (autodetected_vector_mode)
4582 && (related_vector_mode (vector_modes[mode_i],
4583 GET_MODE_INNER (autodetected_vector_mode))
4584 == autodetected_vector_mode)
4585 && (related_vector_mode (autodetected_vector_mode,
4586 GET_MODE_INNER (vector_modes[mode_i]))
4587 == vector_modes[mode_i]))
4588 {
4589 if (dump_enabled_p ())
4590 dump_printf_loc (MSG_NOTE, vect_location,
4591 "***** Skipping vector mode %s, which would"
4592 " repeat the analysis for %s\n",
4593 GET_MODE_NAME (vector_modes[mode_i]),
4594 GET_MODE_NAME (autodetected_vector_mode));
4595 mode_i += 1;
4596 }
4597
4598 if (vectorized
4599 || mode_i == vector_modes.length ()
4600 || autodetected_vector_mode == VOIDmode
4601 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4602 vector sizes will fail do not bother iterating. */
4603 || fatal)
4604 return vectorized;
4605
4606 /* Try the next biggest vector size. */
4607 next_vector_mode = vector_modes[mode_i++];
4608 if (dump_enabled_p ())
4609 dump_printf_loc (MSG_NOTE, vect_location,
4610 "***** Re-trying analysis with vector mode %s\n",
4611 GET_MODE_NAME (next_vector_mode));
4612 }
4613 }
4614
4615
4616 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
4617 true if anything in the basic-block was vectorized. */
4618
4619 static bool
4620 vect_slp_bbs (vec<basic_block> bbs)
4621 {
4622 vec<data_reference_p> datarefs = vNULL;
4623 auto_vec<int> dataref_groups;
4624 int insns = 0;
4625 int current_group = 0;
4626
4627 for (unsigned i = 0; i < bbs.length (); i++)
4628 {
4629 basic_block bb = bbs[i];
4630 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
4631 gsi_next (&gsi))
4632 {
4633 gimple *stmt = gsi_stmt (gsi);
4634 if (is_gimple_debug (stmt))
4635 continue;
4636
4637 insns++;
4638
4639 if (gimple_location (stmt) != UNKNOWN_LOCATION)
4640 vect_location = stmt;
4641
4642 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
4643 &dataref_groups, current_group))
4644 ++current_group;
4645 }
4646 }
4647
4648 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
4649 }
4650
4651 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4652 true if anything in the basic-block was vectorized. */
4653
4654 bool
4655 vect_slp_bb (basic_block bb)
4656 {
4657 auto_vec<basic_block> bbs;
4658 bbs.safe_push (bb);
4659 return vect_slp_bbs (bbs);
4660 }
4661
4662 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4663 true if anything in the basic-block was vectorized. */
4664
4665 bool
4666 vect_slp_function (function *fun)
4667 {
4668 bool r = false;
4669 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
4670 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
4671
4672 /* For the moment split the function into pieces to avoid making
4673 the iteration on the vector mode moot. Split at points we know
4674 to not handle well, which are CFG merges (SLP discovery doesn't
4675 handle non-loop-header PHIs) and loop exits. Since pattern
4676 recog requires reverse iteration to visit uses before defs
4677 simply chop RPO into pieces. */
4678 auto_vec<basic_block> bbs;
4679 for (unsigned i = 0; i < n; i++)
4680 {
4681 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
4682 bool split = false;
4683
4684 /* Split when a BB is not dominated by the first block. */
4685 if (!bbs.is_empty ()
4686 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
4687 {
4688 if (dump_enabled_p ())
4689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4690 "splitting region at dominance boundary bb%d\n",
4691 bb->index);
4692 split = true;
4693 }
4694 /* Split when the loop determined by the first block
4695 is exited. This is because we eventually insert
4696 invariants at region begin. */
4697 else if (!bbs.is_empty ()
4698 && bbs[0]->loop_father != bb->loop_father
4699 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
4700 {
4701 if (dump_enabled_p ())
4702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4703 "splitting region at loop %d exit at bb%d\n",
4704 bbs[0]->loop_father->num, bb->index);
4705 split = true;
4706 }
4707
4708 if (split && !bbs.is_empty ())
4709 {
4710 r |= vect_slp_bbs (bbs);
4711 bbs.truncate (0);
4712 bbs.quick_push (bb);
4713 }
4714 else
4715 bbs.safe_push (bb);
4716
4717 /* When a stmt ending this block defines a value, inserting a vector
4718 containing its definition after that stmt would require inserting
4719 on edges. Avoid this for now. */
4720 if (gimple *last = last_stmt (bb))
4721 if (gimple_get_lhs (last)
4722 && is_ctrl_altering_stmt (last))
4723 {
4724 if (dump_enabled_p ())
4725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4726 "splitting region at control altering "
4727 "definition %G", last);
4728 r |= vect_slp_bbs (bbs);
4729 bbs.truncate (0);
4730 }
4731 }
4732
4733 if (!bbs.is_empty ())
4734 r |= vect_slp_bbs (bbs);
4735
4736 free (rpo);
4737
4738 return r;
4739 }
4740
4741 /* Build a variable-length vector in which the elements in ELTS are repeated
4742 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4743 RESULTS and add any new instructions to SEQ.
4744
4745 The approach we use is:
4746
4747 (1) Find a vector mode VM with integer elements of mode IM.
4748
4749 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4750 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4751 from small vectors to IM.
4752
4753 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4754
4755 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4756 correct byte contents.
4757
4758 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4759
4760 We try to find the largest IM for which this sequence works, in order
4761 to cut down on the number of interleaves. */
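/* A hedged illustration of the steps above (the actual modes depend on
   what can_duplicate_and_interleave_p picks): with ELTS == {a, b, c, d}
   and NVECTORS == 2, step (2) forms the IM-mode integers
   A = VIEW_CONVERT <{a, b}> and B = VIEW_CONVERT <{c, d}>, step (3)
   builds the VM-mode vectors {A, A, ...} and {B, B, ...}, and step (4)
   interleaves them into {A, B, A, B, ...}, which viewed as VECTOR_TYPE
   is the repeating sequence {a, b, c, d, a, b, c, d, ...}.  */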
4762
4763 void
4764 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
4765 vec<tree> elts, unsigned int nresults,
4766 vec<tree> &results)
4767 {
4768 unsigned int nelts = elts.length ();
4769 tree element_type = TREE_TYPE (vector_type);
4770
4771 /* (1) Find a vector mode VM with integer elements of mode IM. */
4772 unsigned int nvectors = 1;
4773 tree new_vector_type;
4774 tree permutes[2];
4775 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
4776 &nvectors, &new_vector_type,
4777 permutes))
4778 gcc_unreachable ();
4779
4780 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4781 unsigned int partial_nelts = nelts / nvectors;
4782 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
4783
4784 tree_vector_builder partial_elts;
4785 auto_vec<tree, 32> pieces (nvectors * 2);
4786 pieces.quick_grow (nvectors * 2);
4787 for (unsigned int i = 0; i < nvectors; ++i)
4788 {
4789 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4790 ELTS' has mode IM. */
4791 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
4792 for (unsigned int j = 0; j < partial_nelts; ++j)
4793 partial_elts.quick_push (elts[i * partial_nelts + j]);
4794 tree t = gimple_build_vector (seq, &partial_elts);
4795 t = gimple_build (seq, VIEW_CONVERT_EXPR,
4796 TREE_TYPE (new_vector_type), t);
4797
4798 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4799 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
4800 }
4801
4802 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4803 correct byte contents.
4804
4805 We need to repeat the following operation log2(nvectors) times:
4806
4807 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4808 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4809
4810 However, if each input repeats every N elements and the VF is
4811 a multiple of N * 2, the HI result is the same as the LO. */
4812 unsigned int in_start = 0;
4813 unsigned int out_start = nvectors;
4814 unsigned int hi_start = nvectors / 2;
4815 /* A bound on the number of outputs needed to produce NRESULTS results
4816 in the final iteration. */
4817 unsigned int noutputs_bound = nvectors * nresults;
4818 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
4819 {
4820 noutputs_bound /= 2;
4821 unsigned int limit = MIN (noutputs_bound, nvectors);
4822 for (unsigned int i = 0; i < limit; ++i)
4823 {
4824 if ((i & 1) != 0
4825 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
4826 2 * in_repeat))
4827 {
4828 pieces[out_start + i] = pieces[out_start + i - 1];
4829 continue;
4830 }
4831
4832 tree output = make_ssa_name (new_vector_type);
4833 tree input1 = pieces[in_start + (i / 2)];
4834 tree input2 = pieces[in_start + (i / 2) + hi_start];
4835 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
4836 input1, input2,
4837 permutes[i & 1]);
4838 gimple_seq_add_stmt (seq, stmt);
4839 pieces[out_start + i] = output;
4840 }
4841 std::swap (in_start, out_start);
4842 }
4843
4844 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4845 results.reserve (nresults);
4846 for (unsigned int i = 0; i < nresults; ++i)
4847 if (i < nvectors)
4848 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
4849 pieces[in_start + i]));
4850 else
4851 results.quick_push (results[i - nvectors]);
4852 }
4853
4854
4855 /* For constant and loop invariant defs in OP_NODE this function creates
4856 vector defs that will be used in the vectorized stmts and stores them
4857 to SLP_TREE_VEC_DEFS of OP_NODE. */
4858
4859 static void
4860 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
4861 {
4862 unsigned HOST_WIDE_INT nunits;
4863 tree vec_cst;
4864 unsigned j, number_of_places_left_in_vector;
4865 tree vector_type;
4866 tree vop;
4867 int group_size = op_node->ops.length ();
4868 unsigned int vec_num, i;
4869 unsigned number_of_copies = 1;
4870 bool constant_p;
4871 gimple_seq ctor_seq = NULL;
4872 auto_vec<tree, 16> permute_results;
4873
4874 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
4875 vector_type = SLP_TREE_VECTYPE (op_node);
4876
4877 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
4878 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
4879 auto_vec<tree> voprnds (number_of_vectors);
4880
4881 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4882 created vectors. It is greater than 1 if unrolling is performed.
4883
4884 For example, we have two scalar operands, s1 and s2 (e.g., group of
4885 strided accesses of size two), while NUNITS is four (i.e., four scalars
4886 of this type can be packed in a vector). The output vector will contain
4887 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4888 will be 2).
4889
4890 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4891 containing the operands.
4892
4893 For example, NUNITS is four as before, and the group size is 8
4894 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4895 {s5, s6, s7, s8}. */
4896
4897 /* When using duplicate_and_interleave, we just need one element for
4898 each scalar statement. */
4899 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4900 nunits = group_size;
4901
4902 number_of_copies = nunits * number_of_vectors / group_size;
4903
4904 number_of_places_left_in_vector = nunits;
4905 constant_p = true;
4906 tree_vector_builder elts (vector_type, nunits, 1);
4907 elts.quick_grow (nunits);
4908 stmt_vec_info insert_after = NULL;
4909 for (j = 0; j < number_of_copies; j++)
4910 {
4911 tree op;
4912 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
4913 {
4914 /* Create 'vect_ = {op0,op1,...,opn}'. */
4915 number_of_places_left_in_vector--;
4916 tree orig_op = op;
4917 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
4918 {
4919 if (CONSTANT_CLASS_P (op))
4920 {
4921 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4922 {
4923 /* Can't use VIEW_CONVERT_EXPR for booleans because
4924 of possibly different sizes of scalar value and
4925 vector element. */
4926 if (integer_zerop (op))
4927 op = build_int_cst (TREE_TYPE (vector_type), 0);
4928 else if (integer_onep (op))
4929 op = build_all_ones_cst (TREE_TYPE (vector_type));
4930 else
4931 gcc_unreachable ();
4932 }
4933 else
4934 op = fold_unary (VIEW_CONVERT_EXPR,
4935 TREE_TYPE (vector_type), op);
4936 gcc_assert (op && CONSTANT_CLASS_P (op));
4937 }
4938 else
4939 {
4940 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
4941 gimple *init_stmt;
4942 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4943 {
4944 tree true_val
4945 = build_all_ones_cst (TREE_TYPE (vector_type));
4946 tree false_val
4947 = build_zero_cst (TREE_TYPE (vector_type));
4948 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
4949 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
4950 op, true_val,
4951 false_val);
4952 }
4953 else
4954 {
4955 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
4956 op);
4957 init_stmt
4958 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
4959 op);
4960 }
4961 gimple_seq_add_stmt (&ctor_seq, init_stmt);
4962 op = new_temp;
4963 }
4964 }
4965 elts[number_of_places_left_in_vector] = op;
4966 if (!CONSTANT_CLASS_P (op))
4967 constant_p = false;
4968 /* For BB vectorization we have to compute an insert location
4969 when a def is inside the analyzed region since we cannot
4970 simply insert at the BB start in this case. */
4971 stmt_vec_info opdef;
4972 if (TREE_CODE (orig_op) == SSA_NAME
4973 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
4974 && is_a <bb_vec_info> (vinfo)
4975 && (opdef = vinfo->lookup_def (orig_op)))
4976 {
4977 if (!insert_after)
4978 insert_after = opdef;
4979 else
4980 insert_after = get_later_stmt (insert_after, opdef);
4981 }
4982
4983 if (number_of_places_left_in_vector == 0)
4984 {
4985 if (constant_p
4986 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4987 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4988 vec_cst = gimple_build_vector (&ctor_seq, &elts);
4989 else
4990 {
4991 if (permute_results.is_empty ())
4992 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
4993 elts, number_of_vectors,
4994 permute_results);
4995 vec_cst = permute_results[number_of_vectors - j - 1];
4996 }
4997 if (!gimple_seq_empty_p (ctor_seq))
4998 {
4999 if (insert_after)
5000 {
5001 gimple_stmt_iterator gsi;
5002 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
5003 {
5004 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
5005 gsi_insert_seq_before (&gsi, ctor_seq,
5006 GSI_CONTINUE_LINKING);
5007 }
5008 else if (!stmt_ends_bb_p (insert_after->stmt))
5009 {
5010 gsi = gsi_for_stmt (insert_after->stmt);
5011 gsi_insert_seq_after (&gsi, ctor_seq,
5012 GSI_CONTINUE_LINKING);
5013 }
5014 else
5015 {
5016 /* When we want to insert after a def whose
5017 defining stmt throws, insert on the fallthru
5018 edge. */
5019 edge e = find_fallthru_edge
5020 (gimple_bb (insert_after->stmt)->succs);
5021 basic_block new_bb
5022 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
5023 gcc_assert (!new_bb);
5024 }
5025 }
5026 else
5027 vinfo->insert_seq_on_entry (NULL, ctor_seq);
5028 ctor_seq = NULL;
5029 }
5030 voprnds.quick_push (vec_cst);
5031 insert_after = NULL;
5032 number_of_places_left_in_vector = nunits;
5033 constant_p = true;
5034 elts.new_vector (vector_type, nunits, 1);
5035 elts.quick_grow (nunits);
5036 }
5037 }
5038 }
5039
5040 /* Since the vectors are created in the reverse order, we reverse them
5041 here. */
5042 vec_num = voprnds.length ();
5043 for (j = vec_num; j != 0; j--)
5044 {
5045 vop = voprnds[j - 1];
5046 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5047 }
5048
5049 /* In case VF is greater than the unrolling factor needed for the SLP
5050 group of stmts, NUMBER_OF_VECTORS to be created is greater than
5051 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
5052 to replicate the vectors. */
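/* E.g. (made-up numbers): if NUMBER_OF_VECTORS is 4 but only the two defs
   v_0 and v_1 were created above, the loop below extends
   SLP_TREE_VEC_DEFS to {v_0, v_1, v_0, v_1}.  */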
5053 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
5054 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
5055 i++)
5056 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5057 }
5058
5059 /* Get the Ith vectorized definition from SLP_NODE. */
5060
5061 tree
5062 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
5063 {
5064 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
5065 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
5066 else
5067 return SLP_TREE_VEC_DEFS (slp_node)[i];
5068 }
5069
5070 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
5071
5072 void
5073 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
5074 {
5075 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
5076 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
5077 {
5078 unsigned j;
5079 gimple *vec_def_stmt;
5080 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
5081 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
5082 }
5083 else
5084 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
5085 }
5086
5087 /* Get N vectorized definitions for SLP_NODE. */
5088
5089 void
5090 vect_get_slp_defs (vec_info *,
5091 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
5092 {
5093 if (n == -1U)
5094 n = SLP_TREE_CHILDREN (slp_node).length ();
5095
5096 for (unsigned i = 0; i < n; ++i)
5097 {
5098 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
5099 vec<tree> vec_defs = vNULL;
5100 vect_get_slp_defs (child, &vec_defs);
5101 vec_oprnds->quick_push (vec_defs);
5102 }
5103 }
5104
5105 /* Generate vector permute statements from a list of loads in DR_CHAIN.
5106 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
5107 permute statements for the SLP node NODE. Store the number of vector
5108 permute instructions in *N_PERMS and the number of vector load
5109 instructions in *N_LOADS. */
5110
5111 bool
5112 vect_transform_slp_perm_load (vec_info *vinfo,
5113 slp_tree node, vec<tree> dr_chain,
5114 gimple_stmt_iterator *gsi, poly_uint64 vf,
5115 bool analyze_only, unsigned *n_perms,
5116 unsigned int *n_loads)
5117 {
5118 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5119 int vec_index = 0;
5120 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5121 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
5122 unsigned int mask_element;
5123 machine_mode mode;
5124
5125 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
5126 return false;
5127
5128 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
5129
5130 mode = TYPE_MODE (vectype);
5131 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5132
5133 /* Initialize the vect stmts of NODE to properly insert the generated
5134 stmts later. */
5135 if (! analyze_only)
5136 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
5137 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
5138 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
5139
5140 /* Generate permutation masks for every NODE. Number of masks for each NODE
5141 is equal to GROUP_SIZE.
5142 E.g., we have a group of three nodes with three loads from the same
5143 location in each node, and the vector size is 4. I.e., we have an
5144 a0b0c0a1b1c1... sequence and we need to create the following vectors:
5145 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
5146 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
5147 ...
5148
5149 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
5150 The last mask is illegal since we assume two operands for permute
5151 operation, and the mask element values can't be outside that range.
5152 Hence, the last mask must be converted into {2,5,5,5}.
5153 For the first two permutations we need the first and the second input
5154 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
5155 we need the second and the third vectors: {b1,c1,a2,b2} and
5156 {c2,a3,b3,c3}. */
5157
5158 int vect_stmts_counter = 0;
5159 unsigned int index = 0;
5160 int first_vec_index = -1;
5161 int second_vec_index = -1;
5162 bool noop_p = true;
5163 *n_perms = 0;
5164
5165 vec_perm_builder mask;
5166 unsigned int nelts_to_build;
5167 unsigned int nvectors_per_build;
5168 unsigned int in_nlanes;
5169 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
5170 && multiple_p (nunits, group_size));
5171 if (repeating_p)
5172 {
5173 /* A single vector contains a whole number of copies of the node, so:
5174 (a) all permutes can use the same mask; and
5175 (b) the permutes only need a single vector input. */
5176 mask.new_vector (nunits, group_size, 3);
5177 nelts_to_build = mask.encoded_nelts ();
5178 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
5179 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
5180 }
5181 else
5182 {
5183 /* We need to construct a separate mask for each vector statement. */
5184 unsigned HOST_WIDE_INT const_nunits, const_vf;
5185 if (!nunits.is_constant (&const_nunits)
5186 || !vf.is_constant (&const_vf))
5187 return false;
5188 mask.new_vector (const_nunits, const_nunits, 1);
5189 nelts_to_build = const_vf * group_size;
5190 nvectors_per_build = 1;
5191 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
5192 }
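  /* Track which input lanes are actually referenced so the number of
     required vector loads can be computed for N_LOADS below.  */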
5193 auto_sbitmap used_in_lanes (in_nlanes);
5194 bitmap_clear (used_in_lanes);
5195
5196 unsigned int count = mask.encoded_nelts ();
5197 mask.quick_grow (count);
5198 vec_perm_indices indices;
5199
5200 for (unsigned int j = 0; j < nelts_to_build; j++)
5201 {
5202 unsigned int iter_num = j / group_size;
5203 unsigned int stmt_num = j % group_size;
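      /* I is the lane in the linearized scalar access group: the copy
	 number times the group stride plus the lane selected by the
	 load permutation.  */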
5204 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
5205 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
5206 bitmap_set_bit (used_in_lanes, i);
5207 if (repeating_p)
5208 {
5209 first_vec_index = 0;
5210 mask_element = i;
5211 }
5212 else
5213 {
5214 /* Enforced before the loop when !repeating_p. */
5215 unsigned int const_nunits = nunits.to_constant ();
5216 vec_index = i / const_nunits;
5217 mask_element = i % const_nunits;
5218 if (vec_index == first_vec_index
5219 || first_vec_index == -1)
5220 {
5221 first_vec_index = vec_index;
5222 }
5223 else if (vec_index == second_vec_index
5224 || second_vec_index == -1)
5225 {
5226 second_vec_index = vec_index;
5227 mask_element += const_nunits;
5228 }
5229 else
5230 {
5231 if (dump_enabled_p ())
5232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5233 "permutation requires at "
5234 "least three vectors %G",
5235 stmt_info->stmt);
5236 gcc_assert (analyze_only);
5237 return false;
5238 }
5239
5240 gcc_assert (mask_element < 2 * const_nunits);
5241 }
5242
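      /* Record the mask element; the permute degenerates to a no-op if
	 every element equals its position.  */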
5243 if (mask_element != index)
5244 noop_p = false;
5245 mask[index++] = mask_element;
5246
5247 if (index == count && !noop_p)
5248 {
5249 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
5250 if (!can_vec_perm_const_p (mode, indices))
5251 {
5252 if (dump_enabled_p ())
5253 {
5254 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5255 vect_location,
5256 "unsupported vect permute { ");
5257 for (i = 0; i < count; ++i)
5258 {
5259 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5260 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5261 }
5262 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5263 }
5264 gcc_assert (analyze_only);
5265 return false;
5266 }
5267
5268 ++*n_perms;
5269 }
5270
5271 if (index == count)
5272 {
5273 if (!analyze_only)
5274 {
5275 tree mask_vec = NULL_TREE;
5276
5277 if (! noop_p)
5278 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5279
5280 if (second_vec_index == -1)
5281 second_vec_index = first_vec_index;
5282
5283 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
5284 {
5285 /* Generate the permute statement if necessary. */
5286 tree first_vec = dr_chain[first_vec_index + ri];
5287 tree second_vec = dr_chain[second_vec_index + ri];
5288 gimple *perm_stmt;
5289 if (! noop_p)
5290 {
5291 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5292 tree perm_dest
5293 = vect_create_destination_var (gimple_assign_lhs (stmt),
5294 vectype);
5295 perm_dest = make_ssa_name (perm_dest);
5296 perm_stmt
5297 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5298 first_vec, second_vec,
5299 mask_vec);
5300 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
5301 gsi);
5302 }
5303 else
5304 /* If mask was NULL_TREE generate the requested
5305 identity transform. */
5306 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
5307
5308 /* Store the vector statement in NODE. */
5309 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
5310 }
5311 }
5312
5313 index = 0;
5314 first_vec_index = -1;
5315 second_vec_index = -1;
5316 noop_p = true;
5317 }
5318 }
5319
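  /* Compute the number of vector loads needed: with a repeating access
     each vector stmt consumes one load, otherwise count the input vectors
     that have at least one of their lanes used.  */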
5320 if (n_loads)
5321 {
5322 if (repeating_p)
5323 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5324 else
5325 {
5326 /* Enforced above when !repeating_p. */
5327 unsigned int const_nunits = nunits.to_constant ();
5328 *n_loads = 0;
5329 bool load_seen = false;
5330 for (unsigned i = 0; i < in_nlanes; ++i)
5331 {
5332 if (i % const_nunits == 0)
5333 {
5334 if (load_seen)
5335 *n_loads += 1;
5336 load_seen = false;
5337 }
5338 if (bitmap_bit_p (used_in_lanes, i))
5339 load_seen = true;
5340 }
5341 if (load_seen)
5342 *n_loads += 1;
5343 }
5344 }
5345
5346 return true;
5347 }
5348
5349
5350 /* Vectorize the SLP permutations in NODE as specified
5351 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5352 child number and lane number.
5353 Interleaving of two two-lane two-child SLP subtrees (not supported):
5354 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5355 A blend of two four-lane two-child SLP subtrees:
5356 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5357 Highpart of a four-lane one-child SLP subtree (not supported):
5358 [ { 0, 2 }, { 0, 3 } ]
5359    Currently only a subset of these is supported by the code generation below.  */
5360
5361 static bool
5362 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5363 slp_tree node, stmt_vector_for_cost *cost_vec)
5364 {
5365 tree vectype = SLP_TREE_VECTYPE (node);
5366
5367   /* ??? We currently only support identical vector input and output types
5368 while the SLP IL should really do a concat + select and thus accept
5369 arbitrary mismatches. */
5370 slp_tree child;
5371 unsigned i;
5372 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5373 if (!vect_maybe_update_slp_op_vectype (child, vectype)
5374 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
5375 {
5376 if (dump_enabled_p ())
5377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5378 "Unsupported lane permutation\n");
5379 return false;
5380 }
5381
5382 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
5383 gcc_assert (perm.length () == SLP_TREE_LANES (node));
5384 if (dump_enabled_p ())
5385 {
5386 dump_printf_loc (MSG_NOTE, vect_location,
5387 "vectorizing permutation");
5388 for (unsigned i = 0; i < perm.length (); ++i)
5389 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
5390 dump_printf (MSG_NOTE, "\n");
5391 }
5392
5393 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5394 if (!nunits.is_constant ())
5395 return false;
5396 unsigned HOST_WIDE_INT vf = 1;
5397 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
5398 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
5399 return false;
5400 unsigned olanes = vf * SLP_TREE_LANES (node);
5401 gcc_assert (multiple_p (olanes, nunits));
5402
5403 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
5404 from the { SLP operand, scalar lane } permutation as recorded in the
5405      SLP node as an intermediate step.  This part should already work
5406      with SLP children that have an arbitrary number of lanes.  */
5407 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
5408 auto_vec<unsigned> active_lane;
5409 vperm.create (olanes);
5410 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
5411 for (unsigned i = 0; i < vf; ++i)
5412 {
5413 for (unsigned pi = 0; pi < perm.length (); ++pi)
5414 {
5415 std::pair<unsigned, unsigned> p = perm[pi];
5416 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
5417 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
5418 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
5419 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
5420 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
5421 }
5422 /* Advance to the next group. */
5423 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
5424 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
5425 }
5426
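  /* A worked example (assuming four-lane children, a four-element vector
     type and VF 1): the blend [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
     from above becomes
       vperm = { { {0,0}, 0 }, { {1,0}, 1 }, { {0,0}, 2 }, { {1,0}, 3 } },
     i.e. every lane is taken from vector 0 of the respective child.  */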
5427 if (dump_enabled_p ())
5428 {
5429 dump_printf_loc (MSG_NOTE, vect_location, "as");
5430 for (unsigned i = 0; i < vperm.length (); ++i)
5431 {
5432 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
5433 dump_printf (MSG_NOTE, ",");
5434 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
5435 vperm[i].first.first, vperm[i].first.second,
5436 		       vperm[i].second);
5437 }
5438 dump_printf (MSG_NOTE, "\n");
5439 }
5440
5441   /* We can only handle two-vector permutes; everything else should
5442 be lowered on the SLP level. The following is closely inspired
5443 by vect_transform_slp_perm_load and is supposed to eventually
5444 replace it.
5445      ??? As an intermediate step, do code-gen in the SLP tree representation
5446 somehow? */
5447 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
5448 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
5449 unsigned int const_nunits = nunits.to_constant ();
5450 unsigned int index = 0;
5451 unsigned int mask_element;
5452 vec_perm_builder mask;
5453 mask.new_vector (const_nunits, const_nunits, 1);
5454 unsigned int count = mask.encoded_nelts ();
5455 mask.quick_grow (count);
5456 vec_perm_indices indices;
5457 unsigned nperms = 0;
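  /* Process VPERM in output-vector-sized chunks; each chunk becomes one
     VEC_PERM_EXPR, or a plain copy when it is the identity.  */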
5458 for (unsigned i = 0; i < vperm.length (); ++i)
5459 {
5460 mask_element = vperm[i].second;
5461 if (first_vec.first == -1U
5462 || first_vec == vperm[i].first)
5463 first_vec = vperm[i].first;
5464 else if (second_vec.first == -1U
5465 || second_vec == vperm[i].first)
5466 {
5467 second_vec = vperm[i].first;
5468 mask_element += const_nunits;
5469 }
5470 else
5471 {
5472 if (dump_enabled_p ())
5473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5474 "permutation requires at "
5475 "least three vectors\n");
5476 gcc_assert (!gsi);
5477 return false;
5478 }
5479
5480 mask[index++] = mask_element;
5481
5482 if (index == count)
5483 {
5484 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
5485 const_nunits);
5486 bool identity_p = indices.series_p (0, 1, 0, 1);
5487 if (!identity_p
5488 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5489 {
5490 if (dump_enabled_p ())
5491 {
5492 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5493 vect_location,
5494 "unsupported vect permute { ");
5495 for (i = 0; i < count; ++i)
5496 {
5497 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5498 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5499 }
5500 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5501 }
5502 gcc_assert (!gsi);
5503 return false;
5504 }
5505
5506 if (!identity_p)
5507 nperms++;
5508 if (gsi)
5509 {
5510 if (second_vec.first == -1U)
5511 second_vec = first_vec;
5512
5513 /* Generate the permute statement if necessary. */
5514 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
5515 tree first_def
5516 = vect_get_slp_vect_def (first_node, first_vec.second);
5517 gassign *perm_stmt;
5518 tree perm_dest = make_ssa_name (vectype);
5519 if (!identity_p)
5520 {
5521 slp_tree second_node
5522 = SLP_TREE_CHILDREN (node)[second_vec.first];
5523 tree second_def
5524 = vect_get_slp_vect_def (second_node, second_vec.second);
5525 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5526 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5527 first_def, second_def,
5528 mask_vec);
5529 }
5530 else
5531 /* We need a copy here in case the def was external. */
5532 perm_stmt = gimple_build_assign (perm_dest, first_def);
5533 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
5534 /* Store the vector statement in NODE. */
5535 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
5536 }
5537
5538 index = 0;
5539 first_vec = std::make_pair (-1U, -1U);
5540 second_vec = std::make_pair (-1U, -1U);
5541 }
5542 }
5543
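  /* When only analyzing (no GSI), account for the generated permutes in
     the cost vector.  */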
5544 if (!gsi)
5545 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
5546
5547 return true;
5548 }
5549
5550 /* Vectorize SLP NODE. */
5551
5552 static void
5553 vect_schedule_slp_node (vec_info *vinfo,
5554 slp_tree node, slp_instance instance)
5555 {
5556 gimple_stmt_iterator si;
5557 int i;
5558 slp_tree child;
5559
5560 /* For existing vectors there's nothing to do. */
5561 if (SLP_TREE_VEC_DEFS (node).exists ())
5562 return;
5563
5564 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
5565
5566 /* Vectorize externals and constants. */
5567 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5568 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5569 {
5570 /* ??? vectorizable_shift can end up using a scalar operand which is
5571 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5572 node in this case. */
5573 if (!SLP_TREE_VECTYPE (node))
5574 return;
5575
5576 vect_create_constant_vectors (vinfo, node);
5577 return;
5578 }
5579
5580 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5581
5582 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
5583 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
5584
5585 if (dump_enabled_p ())
5586 dump_printf_loc (MSG_NOTE, vect_location,
5587 "------>vectorizing SLP node starting from: %G",
5588 stmt_info->stmt);
5589
5590 if (STMT_VINFO_DATA_REF (stmt_info)
5591 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5592 {
5593 /* Vectorized loads go before the first scalar load to make it
5594 	 ready early; vectorized stores go before the last scalar
5595 	 stmt, which is where all uses are ready.  */
5596 stmt_vec_info last_stmt_info = NULL;
5597 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
5598 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
5599 else /* DR_IS_WRITE */
5600 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
5601 si = gsi_for_stmt (last_stmt_info->stmt);
5602 }
5603 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
5604 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
5605 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
5606 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5607 {
5608 /* For PHI node vectorization we do not use the insertion iterator. */
5609 si = gsi_none ();
5610 }
5611 else
5612 {
5613       /* Emit other stmts after the vectorized defs of the children,
5614 	 which is the earliest possible insertion point.  */
5615 gimple *last_stmt = NULL;
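      /* Find the latest def among all children; the vectorized stmts of
	 NODE are then inserted right after it.  */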
5616 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5617 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5618 {
5619 /* For fold-left reductions we are retaining the scalar
5620 	       reduction PHI but we still have SLP_TREE_NUMBER_OF_VEC_STMTS
5621 set so the representation isn't perfect. Resort to the
5622 last scalar def here. */
5623 if (SLP_TREE_VEC_STMTS (child).is_empty ())
5624 {
5625 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
5626 == cycle_phi_info_type);
5627 gphi *phi = as_a <gphi *>
5628 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
5629 if (!last_stmt
5630 || vect_stmt_dominates_stmt_p (last_stmt, phi))
5631 last_stmt = phi;
5632 }
5633 	    /* We are emitting all vectorized stmts in the same place, so
5634 	       the last one pushed is the latest in the IL.
5635 ??? Unless we have a load permutation applied and that
5636 figures to re-use an earlier generated load. */
5637 unsigned j;
5638 gimple *vstmt;
5639 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
5640 if (!last_stmt
5641 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5642 last_stmt = vstmt;
5643 }
5644 else if (!SLP_TREE_VECTYPE (child))
5645 {
5646 	  /* For externals left unvectorized, look at all their scalar defs.  */
5647 unsigned j;
5648 tree def;
5649 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
5650 if (TREE_CODE (def) == SSA_NAME
5651 && !SSA_NAME_IS_DEFAULT_DEF (def))
5652 {
5653 gimple *stmt = SSA_NAME_DEF_STMT (def);
5654 if (!last_stmt
5655 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
5656 last_stmt = stmt;
5657 }
5658 }
5659 else
5660 {
5661 /* For externals we have to look at all defs since their
5662 insertion place is decided per vector. But beware
5663 of pre-existing vectors where we need to make sure
5664 we do not insert before the region boundary. */
5665 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
5666 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
5667 last_stmt = gsi_stmt (gsi_after_labels
5668 (as_a <bb_vec_info> (vinfo)->bbs[0]));
5669 else
5670 {
5671 unsigned j;
5672 tree vdef;
5673 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
5674 if (TREE_CODE (vdef) == SSA_NAME
5675 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
5676 {
5677 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
5678 if (!last_stmt
5679 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5680 last_stmt = vstmt;
5681 }
5682 }
5683 }
5684 /* This can happen when all children are pre-existing vectors or
5685 constants. */
5686 if (!last_stmt)
5687 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
5688 if (is_a <gphi *> (last_stmt))
5689 si = gsi_after_labels (gimple_bb (last_stmt));
5690 else
5691 {
5692 si = gsi_for_stmt (last_stmt);
5693 gsi_next (&si);
5694 }
5695 }
5696
5697 bool done_p = false;
5698
5699 /* Handle purely internal nodes. */
5700 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5701 {
5702       /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
5703 	 be shared with different SLP nodes (but usually it's the same
5704 	 operation apart from the case where the stmt is only there to denote
5705 the actual scalar lane defs ...). So do not call vect_transform_stmt
5706 but open-code it here (partly). */
5707 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
5708 gcc_assert (done);
5709 done_p = true;
5710 }
5711 if (!done_p)
5712 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
5713 }
5714
5715 /* Replace scalar calls from SLP node NODE by setting their lhs to zero.
5716    For loop vectorization this is done in vectorizable_call, but for SLP
5717    it needs to be deferred until the end of vect_schedule_slp, because multiple
5718 SLP instances may refer to the same scalar stmt. */
5719
5720 static void
5721 vect_remove_slp_scalar_calls (vec_info *vinfo,
5722 slp_tree node, hash_set<slp_tree> &visited)
5723 {
5724 gimple *new_stmt;
5725 gimple_stmt_iterator gsi;
5726 int i;
5727 slp_tree child;
5728 tree lhs;
5729 stmt_vec_info stmt_info;
5730
5731 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5732 return;
5733
5734 if (visited.add (node))
5735 return;
5736
5737 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5738 vect_remove_slp_scalar_calls (vinfo, child, visited);
5739
5740 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5741 {
5742 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
5743 if (!stmt || gimple_bb (stmt) == NULL)
5744 continue;
5745 if (is_pattern_stmt_p (stmt_info)
5746 || !PURE_SLP_STMT (stmt_info))
5747 continue;
5748 lhs = gimple_call_lhs (stmt);
5749 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
5750 gsi = gsi_for_stmt (stmt);
5751 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
5752 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
5753 }
5754 }
5755
5756 static void
5757 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
5758 {
5759 hash_set<slp_tree> visited;
5760 vect_remove_slp_scalar_calls (vinfo, node, visited);
5761 }
5762
5763 /* Vectorize the instance root. */
5764
5765 void
5766 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
5767 {
5768 gassign *rstmt = NULL;
5769
5770 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
5771 {
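      /* A single vectorized stmt: assign its lhs (view-converted if the
	 types differ) to the lhs of the original root stmt.  */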
5772 gimple *child_stmt;
5773 int j;
5774
5775 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5776 {
5777 tree vect_lhs = gimple_get_lhs (child_stmt);
5778 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
5779 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
5780 TREE_TYPE (vect_lhs)))
5781 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
5782 vect_lhs);
5783 rstmt = gimple_build_assign (root_lhs, vect_lhs);
5784 break;
5785 }
5786 }
5787 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
5788 {
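      /* Multiple vectorized stmts: combine their lhs into a CONSTRUCTOR
	 with the type of the root stmt's original RHS.  */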
5789 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5790 gimple *child_stmt;
5791 int j;
5792 vec<constructor_elt, va_gc> *v;
5793 vec_alloc (v, nelts);
5794
5795 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5796 {
5797 CONSTRUCTOR_APPEND_ELT (v,
5798 NULL_TREE,
5799 gimple_get_lhs (child_stmt));
5800 }
5801 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
5802 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
5803 tree r_constructor = build_constructor (rtype, v);
5804 rstmt = gimple_build_assign (lhs, r_constructor);
5805 }
5806
5807 gcc_assert (rstmt);
5808
5809 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
5810 gsi_replace (&rgsi, rstmt, true);
5811 }
5812
5813 struct slp_scc_info
5814 {
5815 bool on_stack;
5816 int dfs;
5817 int lowlink;
5818 };
5819
5820 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
5821
5822 static void
5823 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
5824 hash_map<slp_tree, slp_scc_info> &scc_info,
5825 int &maxdfs, vec<slp_tree> &stack)
5826 {
5827 bool existed_p;
5828 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
5829 gcc_assert (!existed_p);
5830 info->dfs = maxdfs;
5831 info->lowlink = maxdfs;
5832 maxdfs++;
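  /* This is essentially Tarjan's SCC algorithm: DFS number and lowlink
     identify the strongly connected component a node belongs to.  */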
5833
5834 /* Leaf. */
5835 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5836 {
5837 info->on_stack = false;
5838 vect_schedule_slp_node (vinfo, node, instance);
5839 return;
5840 }
5841
5842 info->on_stack = true;
5843 stack.safe_push (node);
5844
5845 unsigned i;
5846 slp_tree child;
5847 /* DFS recurse. */
5848 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5849 {
5850 if (!child)
5851 continue;
5852 slp_scc_info *child_info = scc_info.get (child);
5853 if (!child_info)
5854 {
5855 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
5856 	  /* Recursion might have re-allocated the hash map; re-fetch the entries.  */
5857 info = scc_info.get (node);
5858 child_info = scc_info.get (child);
5859 info->lowlink = MIN (info->lowlink, child_info->lowlink);
5860 }
5861 else if (child_info->on_stack)
5862 info->lowlink = MIN (info->lowlink, child_info->dfs);
5863 }
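  /* If this is not the root of an SCC defer scheduling to when the
     root is reached.  */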
5864 if (info->lowlink != info->dfs)
5865 return;
5866
5867 auto_vec<slp_tree, 4> phis_to_fixup;
5868
5869 /* Singleton. */
5870 if (stack.last () == node)
5871 {
5872 stack.pop ();
5873 info->on_stack = false;
5874 vect_schedule_slp_node (vinfo, node, instance);
5875 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
5876 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
5877 phis_to_fixup.quick_push (node);
5878 }
5879 else
5880 {
5881 /* SCC. */
5882 int last_idx = stack.length () - 1;
5883 while (stack[last_idx] != node)
5884 last_idx--;
5885 	/* We can break the cycle at PHIs that have at least one
5886 	   code-generated child.  Then we could re-start the DFS walk until
5887 all nodes in the SCC are covered (we might have new entries
5888 for only back-reachable nodes). But it's simpler to just
5889 iterate and schedule those that are ready. */
5890 unsigned todo = stack.length () - last_idx;
5891 do
5892 {
5893 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
5894 {
5895 slp_tree entry = stack[idx];
5896 if (!entry)
5897 continue;
5898 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
5899 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
5900 bool ready = !phi;
5901 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
5902 if (!child)
5903 {
5904 gcc_assert (phi);
5905 ready = true;
5906 break;
5907 }
5908 else if (scc_info.get (child)->on_stack)
5909 {
5910 if (!phi)
5911 {
5912 ready = false;
5913 break;
5914 }
5915 }
5916 else
5917 {
5918 if (phi)
5919 {
5920 ready = true;
5921 break;
5922 }
5923 }
5924 if (ready)
5925 {
5926 vect_schedule_slp_node (vinfo, entry, instance);
5927 scc_info.get (entry)->on_stack = false;
5928 stack[idx] = NULL;
5929 todo--;
5930 if (phi)
5931 phis_to_fixup.safe_push (entry);
5932 }
5933 }
5934 }
5935 while (todo != 0);
5936
5937 /* Pop the SCC. */
5938 stack.truncate (last_idx);
5939 }
5940
5941   /* Now fix up the backedge defs of the vectorized PHIs in this SCC.  */
5942 slp_tree phi_node;
5943 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
5944 {
5945 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
5946 edge_iterator ei;
5947 edge e;
5948 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
5949 {
5950 unsigned dest_idx = e->dest_idx;
5951 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
5952 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
5953 continue;
5954 /* Simply fill all args. */
5955 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
5956 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
5957 vect_get_slp_vect_def (child, i),
5958 e, gimple_phi_arg_location (phi, dest_idx));
5959 }
5960 }
5961 }
5962
5963 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
5964
5965 void
5966 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
5967 {
5968 slp_instance instance;
5969 unsigned int i;
5970
5971 hash_map<slp_tree, slp_scc_info> scc_info;
5972 int maxdfs = 0;
5973 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5974 {
5975 slp_tree node = SLP_INSTANCE_TREE (instance);
5976 if (dump_enabled_p ())
5977 {
5978 dump_printf_loc (MSG_NOTE, vect_location,
5979 "Vectorizing SLP tree:\n");
5980 if (SLP_INSTANCE_ROOT_STMT (instance))
5981 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
5982 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
5983 vect_print_slp_graph (MSG_NOTE, vect_location,
5984 SLP_INSTANCE_TREE (instance));
5985 }
5986       /* Schedule the tree of INSTANCE, scheduling SCCs so that a PHI
5987 	 is the node breaking the cycle.  */
5988 auto_vec<slp_tree> stack;
5989 if (!scc_info.get (node))
5990 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
5991
5992 if (SLP_INSTANCE_ROOT_STMT (instance))
5993 vectorize_slp_instance_root_stmt (node, instance);
5994
5995 if (dump_enabled_p ())
5996 dump_printf_loc (MSG_NOTE, vect_location,
5997 "vectorizing stmts using SLP.\n");
5998 }
5999
6000 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6001 {
6002 slp_tree root = SLP_INSTANCE_TREE (instance);
6003 stmt_vec_info store_info;
6004 unsigned int j;
6005
6006 /* Remove scalar call stmts. Do not do this for basic-block
6007 vectorization as not all uses may be vectorized.
6008 ??? Why should this be necessary? DCE should be able to
6009 	   remove the stmts on its own.
6010 ??? For BB vectorization we can as well remove scalar
6011 stmts starting from the SLP tree root if they have no
6012 uses. */
6013 if (is_a <loop_vec_info> (vinfo))
6014 vect_remove_slp_scalar_calls (vinfo, root);
6015
6016       /* Remove the original scalar stmts of vectorized stores.  */
6017 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
6018 {
6019 if (!STMT_VINFO_DATA_REF (store_info)
6020 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
6021 break;
6022
6023 store_info = vect_orig_stmt (store_info);
6024 /* Free the attached stmt_vec_info and remove the stmt. */
6025 vinfo->remove_stmt (store_info);
6026 }
6027 }
6028 }