1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
52
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 slp_tree, stmt_vector_for_cost *);
55
56 static object_allocator<_slp_tree> *slp_tree_pool;
57 static slp_tree slp_first_node;
58
59 void
60 vect_slp_init (void)
61 {
62 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
63 }
64
65 void
66 vect_slp_fini (void)
67 {
68 while (slp_first_node)
69 delete slp_first_node;
70 delete slp_tree_pool;
71 slp_tree_pool = NULL;
72 }
73
74 void *
75 _slp_tree::operator new (size_t n)
76 {
77 gcc_assert (n == sizeof (_slp_tree));
78 return slp_tree_pool->allocate_raw ();
79 }
80
81 void
82 _slp_tree::operator delete (void *node, size_t n)
83 {
84 gcc_assert (n == sizeof (_slp_tree));
85 slp_tree_pool->remove_raw (node);
86 }
87
88
89 /* Initialize a SLP node. */
90
91 _slp_tree::_slp_tree ()
92 {
93 this->prev_node = NULL;
94 if (slp_first_node)
95 slp_first_node->prev_node = this;
96 this->next_node = slp_first_node;
97 slp_first_node = this;
98 SLP_TREE_SCALAR_STMTS (this) = vNULL;
99 SLP_TREE_SCALAR_OPS (this) = vNULL;
100 SLP_TREE_VEC_STMTS (this) = vNULL;
101 SLP_TREE_VEC_DEFS (this) = vNULL;
102 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
103 SLP_TREE_CHILDREN (this) = vNULL;
104 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
105 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
106 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
107 SLP_TREE_CODE (this) = ERROR_MARK;
108 SLP_TREE_VECTYPE (this) = NULL_TREE;
109 SLP_TREE_REPRESENTATIVE (this) = NULL;
110 SLP_TREE_REF_COUNT (this) = 1;
111 this->max_nunits = 1;
112 this->lanes = 0;
113 }
114
115 /* Tear down a SLP node. */
116
117 _slp_tree::~_slp_tree ()
118 {
119 if (this->prev_node)
120 this->prev_node->next_node = this->next_node;
121 else
122 slp_first_node = this->next_node;
123 if (this->next_node)
124 this->next_node->prev_node = this->prev_node;
125 SLP_TREE_CHILDREN (this).release ();
126 SLP_TREE_SCALAR_STMTS (this).release ();
127 SLP_TREE_SCALAR_OPS (this).release ();
128 SLP_TREE_VEC_STMTS (this).release ();
129 SLP_TREE_VEC_DEFS (this).release ();
130 SLP_TREE_LOAD_PERMUTATION (this).release ();
131 SLP_TREE_LANE_PERMUTATION (this).release ();
132 }
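/* A minimal lifecycle sketch (illustrative only, not code that appears
   elsewhere): nodes allocate from the slp_tree_pool and chain themselves
   onto slp_first_node so vect_slp_fini can reclaim anything that leaked:

     vect_slp_init ();
     slp_tree n = new _slp_tree;   (links itself at the list head)
     vect_free_slp_tree (n);       (drops the reference; the destructor
                                    unlinks it and operator delete returns
                                    it to the pool)
     vect_slp_fini ();             (deletes nodes still on the list and
                                    destroys the allocator)  */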
133
134 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
135
136 void
137 vect_free_slp_tree (slp_tree node)
138 {
139 int i;
140 slp_tree child;
141
142 if (--SLP_TREE_REF_COUNT (node) != 0)
143 return;
144
145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
146 if (child)
147 vect_free_slp_tree (child);
148
 149   /* If the node defines any SLP-only patterns then those patterns are no
150 longer valid and should be removed. */
151 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
152 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
153 {
154 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
155 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
156 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
157 }
158
159 delete node;
160 }
161
 162 /* Return a location suitable for dumps related to the SLP instance.  */
163
164 dump_user_location_t
165 _slp_instance::location () const
166 {
167 if (!root_stmts.is_empty ())
168 return root_stmts[0]->stmt;
169 else
170 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
171 }
172
173
174 /* Free the memory allocated for the SLP instance. */
175
176 void
177 vect_free_slp_instance (slp_instance instance)
178 {
179 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
180 SLP_INSTANCE_LOADS (instance).release ();
181 SLP_INSTANCE_ROOT_STMTS (instance).release ();
182 instance->subgraph_entries.release ();
183 instance->cost_vec.release ();
184 free (instance);
185 }
186
187
188 /* Create an SLP node for SCALAR_STMTS. */
189
190 slp_tree
191 vect_create_new_slp_node (unsigned nops, tree_code code)
192 {
193 slp_tree node = new _slp_tree;
194 SLP_TREE_SCALAR_STMTS (node) = vNULL;
195 SLP_TREE_CHILDREN (node).create (nops);
196 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
197 SLP_TREE_CODE (node) = code;
198 return node;
199 }
200 /* Create an SLP node for SCALAR_STMTS. */
201
202 static slp_tree
203 vect_create_new_slp_node (slp_tree node,
204 vec<stmt_vec_info> scalar_stmts, unsigned nops)
205 {
206 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
207 SLP_TREE_CHILDREN (node).create (nops);
208 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
209 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
210 SLP_TREE_LANES (node) = scalar_stmts.length ();
211 return node;
212 }
213
214 /* Create an SLP node for SCALAR_STMTS. */
215
216 static slp_tree
217 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
218 {
219 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
220 }
221
222 /* Create an SLP node for OPS. */
223
224 static slp_tree
225 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
226 {
227 SLP_TREE_SCALAR_OPS (node) = ops;
228 SLP_TREE_DEF_TYPE (node) = vect_external_def;
229 SLP_TREE_LANES (node) = ops.length ();
230 return node;
231 }
232
233 /* Create an SLP node for OPS. */
234
235 static slp_tree
236 vect_create_new_slp_node (vec<tree> ops)
237 {
238 return vect_create_new_slp_node (new _slp_tree, ops);
239 }
240
241
242 /* This structure is used in creation of an SLP tree. Each instance
243 corresponds to the same operand in a group of scalar stmts in an SLP
244 node. */
245 typedef struct _slp_oprnd_info
246 {
247 /* Def-stmts for the operands. */
248 vec<stmt_vec_info> def_stmts;
249 /* Operands. */
250 vec<tree> ops;
251 /* Information about the first statement, its vector def-type, type, the
252 operand itself in case it's constant, and an indication if it's a pattern
253 stmt. */
254 tree first_op_type;
255 enum vect_def_type first_dt;
256 bool any_pattern;
257 } *slp_oprnd_info;
258
259
260 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
261 operand. */
262 static vec<slp_oprnd_info>
263 vect_create_oprnd_info (int nops, int group_size)
264 {
265 int i;
266 slp_oprnd_info oprnd_info;
267 vec<slp_oprnd_info> oprnds_info;
268
269 oprnds_info.create (nops);
270 for (i = 0; i < nops; i++)
271 {
272 oprnd_info = XNEW (struct _slp_oprnd_info);
273 oprnd_info->def_stmts.create (group_size);
274 oprnd_info->ops.create (group_size);
275 oprnd_info->first_dt = vect_uninitialized_def;
276 oprnd_info->first_op_type = NULL_TREE;
277 oprnd_info->any_pattern = false;
278 oprnds_info.quick_push (oprnd_info);
279 }
280
281 return oprnds_info;
282 }
283
284
285 /* Free operands info. */
286
287 static void
288 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
289 {
290 int i;
291 slp_oprnd_info oprnd_info;
292
293 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
294 {
295 oprnd_info->def_stmts.release ();
296 oprnd_info->ops.release ();
297 XDELETE (oprnd_info);
298 }
299
300 oprnds_info.release ();
301 }
302
303
304 /* Return true if STMTS contains a pattern statement. */
305
306 static bool
307 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
308 {
309 stmt_vec_info stmt_info;
310 unsigned int i;
311 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
312 if (is_pattern_stmt_p (stmt_info))
313 return true;
314 return false;
315 }
316
317 /* Return true when all lanes in the external or constant NODE have
318 the same value. */
319
320 static bool
321 vect_slp_tree_uniform_p (slp_tree node)
322 {
323 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
324 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
325
 326   /* Pre-existing vectors.  */
327 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
328 return false;
329
330 unsigned i;
331 tree op, first = NULL_TREE;
332 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
333 if (!first)
334 first = op;
335 else if (!operand_equal_p (first, op, 0))
336 return false;
337
338 return true;
339 }
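/* Illustrative examples (not taken from real input): for an external or
   constant node with SLP_TREE_SCALAR_OPS = { 3, 3, 3, 3 } this returns
   true, for { a_1, 3, a_1, 3 } it returns false, and for a pre-existing
   vector def (empty SCALAR_OPS, only SLP_TREE_VEC_DEFS) it conservatively
   returns false as well.  */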
340
341 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
342 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
343 of the chain. */
344
345 int
346 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
347 stmt_vec_info first_stmt_info)
348 {
349 stmt_vec_info next_stmt_info = first_stmt_info;
350 int result = 0;
351
352 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
353 return -1;
354
355 do
356 {
357 if (next_stmt_info == stmt_info)
358 return result;
359 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
360 if (next_stmt_info)
361 result += DR_GROUP_GAP (next_stmt_info);
362 }
363 while (next_stmt_info);
364
365 return -1;
366 }
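/* Illustrative example with assumed DR_GROUP_GAP values: for a load group
   accessing a[0], a[1] and a[3], where DR_GROUP_GAP (a[1]) == 1 and
   DR_GROUP_GAP (a[3]) == 2, the place of a[3] in the chain starting at
   a[0] accumulates to 1 + 2 == 3, i.e. its element index within the
   group footprint, while a stmt from a different chain yields -1.  */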
367
368 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
369 using the method implemented by duplicate_and_interleave. Return true
370 if so, returning the number of intermediate vectors in *NVECTORS_OUT
371 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
372 (if nonnull). */
373
374 bool
375 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
376 tree elt_type, unsigned int *nvectors_out,
377 tree *vector_type_out,
378 tree *permutes)
379 {
380 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
381 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
382 return false;
383
384 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
385 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
386 unsigned int nvectors = 1;
387 for (;;)
388 {
389 scalar_int_mode int_mode;
390 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
391 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
392 {
393 /* Get the natural vector type for this SLP group size. */
394 tree int_type = build_nonstandard_integer_type
395 (GET_MODE_BITSIZE (int_mode), 1);
396 tree vector_type
397 = get_vectype_for_scalar_type (vinfo, int_type, count);
398 if (vector_type
399 && VECTOR_MODE_P (TYPE_MODE (vector_type))
400 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
401 GET_MODE_SIZE (base_vector_mode)))
402 {
403 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
404 together into elements of type INT_TYPE and using the result
405 to build NVECTORS vectors. */
406 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
407 vec_perm_builder sel1 (nelts, 2, 3);
408 vec_perm_builder sel2 (nelts, 2, 3);
409 poly_int64 half_nelts = exact_div (nelts, 2);
410 for (unsigned int i = 0; i < 3; ++i)
411 {
412 sel1.quick_push (i);
413 sel1.quick_push (i + nelts);
414 sel2.quick_push (half_nelts + i);
415 sel2.quick_push (half_nelts + i + nelts);
416 }
417 vec_perm_indices indices1 (sel1, 2, nelts);
418 vec_perm_indices indices2 (sel2, 2, nelts);
419 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
420 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
421 {
422 if (nvectors_out)
423 *nvectors_out = nvectors;
424 if (vector_type_out)
425 *vector_type_out = vector_type;
426 if (permutes)
427 {
428 permutes[0] = vect_gen_perm_mask_checked (vector_type,
429 indices1);
430 permutes[1] = vect_gen_perm_mask_checked (vector_type,
431 indices2);
432 }
433 return true;
434 }
435 }
436 }
437 if (!multiple_p (elt_bytes, 2, &elt_bytes))
438 return false;
439 nvectors *= 2;
440 }
441 }
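/* Worked example (sizes assumed for illustration): for COUNT == 4 and a
   4-byte ELT_TYPE, ELT_BYTES starts at 16; if no integer mode of that
   size is usable the loop halves ELT_BYTES to 8 and doubles NVECTORS to
   2, i.e. it tries to fuse pairs of adjacent elements into 64-bit
   integers and build two intermediate vectors, provided the target
   supports the two interleaving permutes on that vector type.  */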
442
443 /* Return true if DTA and DTB match. */
444
445 static bool
446 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
447 {
448 return (dta == dtb
449 || ((dta == vect_external_def || dta == vect_constant_def)
450 && (dtb == vect_external_def || dtb == vect_constant_def)));
451 }
452
453 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
454 they are of a valid type and that they match the defs of the first stmt of
455 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
456 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
457 indicates swap is required for cond_expr stmts. Specifically, *SWAP
458 is 1 if STMT is cond and operands of comparison need to be swapped;
459 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
 460    If any operand swap is performed in this function, *SWAP is set to a
 461    non-zero value.
 462    If there was a fatal error return -1; if the error could be corrected by
 463    swapping operands of the parent node of this one, return 1; if everything
 464    is OK return 0.  */
465 static int
466 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
467 bool *skip_args,
468 vec<stmt_vec_info> stmts, unsigned stmt_num,
469 vec<slp_oprnd_info> *oprnds_info)
470 {
471 stmt_vec_info stmt_info = stmts[stmt_num];
472 tree oprnd;
473 unsigned int i, number_of_oprnds;
474 enum vect_def_type dt = vect_uninitialized_def;
475 slp_oprnd_info oprnd_info;
476 int first_op_idx = 1;
477 unsigned int commutative_op = -1U;
478 bool first_op_cond = false;
479 bool first = stmt_num == 0;
480
481 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
482 {
483 number_of_oprnds = gimple_call_num_args (stmt);
484 first_op_idx = 3;
485 if (gimple_call_internal_p (stmt))
486 {
487 internal_fn ifn = gimple_call_internal_fn (stmt);
488 commutative_op = first_commutative_argument (ifn);
489
490 /* Masked load, only look at mask. */
491 if (ifn == IFN_MASK_LOAD)
492 {
493 number_of_oprnds = 1;
494 /* Mask operand index. */
495 first_op_idx = 5;
496 }
497 }
498 }
499 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
500 {
501 enum tree_code code = gimple_assign_rhs_code (stmt);
502 number_of_oprnds = gimple_num_ops (stmt) - 1;
 503       /* Swapping can only be done for cond_expr if asked to, otherwise we
 504          could end up with a different comparison code from the first stmt.  */
505 if (code == COND_EXPR
506 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
507 {
508 first_op_cond = true;
509 number_of_oprnds++;
510 }
511 else
512 commutative_op = commutative_tree_code (code) ? 0U : -1U;
513 }
514 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
515 number_of_oprnds = gimple_phi_num_args (stmt);
516 else
517 return -1;
518
519 bool swapped = (swap != 0);
520 bool backedge = false;
521 gcc_assert (!swapped || first_op_cond);
522 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
523 for (i = 0; i < number_of_oprnds; i++)
524 {
525 if (first_op_cond)
526 {
527 /* Map indicating how operands of cond_expr should be swapped. */
528 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
529 int *map = maps[swap];
530
531 if (i < 2)
532 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
533 first_op_idx), map[i]);
534 else
535 oprnd = gimple_op (stmt_info->stmt, map[i]);
536 }
537 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
538 {
539 oprnd = gimple_phi_arg_def (stmt, i);
540 backedge = dominated_by_p (CDI_DOMINATORS,
541 gimple_phi_arg_edge (stmt, i)->src,
542 gimple_bb (stmt_info->stmt));
543 }
544 else
545 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
546 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
547 oprnd = TREE_OPERAND (oprnd, 0);
548
549 oprnd_info = (*oprnds_info)[i];
550
551 stmt_vec_info def_stmt_info;
552 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
553 {
554 if (dump_enabled_p ())
555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
556 "Build SLP failed: can't analyze def for %T\n",
557 oprnd);
558
559 return -1;
560 }
561
562 if (skip_args[i])
563 {
564 oprnd_info->def_stmts.quick_push (NULL);
565 oprnd_info->ops.quick_push (NULL_TREE);
566 oprnd_info->first_dt = vect_uninitialized_def;
567 continue;
568 }
569
570 oprnd_info->def_stmts.quick_push (def_stmt_info);
571 oprnd_info->ops.quick_push (oprnd);
572
573 if (def_stmt_info
574 && is_pattern_stmt_p (def_stmt_info))
575 {
576 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
577 != def_stmt_info)
578 oprnd_info->any_pattern = true;
579 else
580 /* If we promote this to external use the original stmt def. */
581 oprnd_info->ops.last ()
582 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
583 }
584
 585       /* If there's an extern def on a backedge make sure we can
586 code-generate at the region start.
587 ??? This is another case that could be fixed by adjusting
588 how we split the function but at the moment we'd have conflicting
589 goals there. */
590 if (backedge
591 && dts[i] == vect_external_def
592 && is_a <bb_vec_info> (vinfo)
593 && TREE_CODE (oprnd) == SSA_NAME
594 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
595 && !dominated_by_p (CDI_DOMINATORS,
596 as_a <bb_vec_info> (vinfo)->bbs[0],
597 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
598 {
599 if (dump_enabled_p ())
600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
601 "Build SLP failed: extern def %T only defined "
602 "on backedge\n", oprnd);
603 return -1;
604 }
605
606 if (first)
607 {
608 tree type = TREE_TYPE (oprnd);
609 dt = dts[i];
610 if ((dt == vect_constant_def
611 || dt == vect_external_def)
612 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
613 && (TREE_CODE (type) == BOOLEAN_TYPE
614 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
615 type)))
616 {
617 if (dump_enabled_p ())
618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
619 "Build SLP failed: invalid type of def "
620 "for variable-length SLP %T\n", oprnd);
621 return -1;
622 }
623
624 /* For the swapping logic below force vect_reduction_def
625 for the reduction op in a SLP reduction group. */
626 if (!STMT_VINFO_DATA_REF (stmt_info)
627 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
628 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
629 && def_stmt_info)
630 dts[i] = dt = vect_reduction_def;
631
632 /* Check the types of the definition. */
633 switch (dt)
634 {
635 case vect_external_def:
636 case vect_constant_def:
637 case vect_internal_def:
638 case vect_reduction_def:
639 case vect_induction_def:
640 case vect_nested_cycle:
641 break;
642
643 default:
644 /* FORNOW: Not supported. */
645 if (dump_enabled_p ())
646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
647 "Build SLP failed: illegal type of def %T\n",
648 oprnd);
649 return -1;
650 }
651
652 oprnd_info->first_dt = dt;
653 oprnd_info->first_op_type = type;
654 }
655 }
656 if (first)
657 return 0;
658
659 /* Now match the operand definition types to that of the first stmt. */
660 for (i = 0; i < number_of_oprnds;)
661 {
662 if (skip_args[i])
663 {
664 ++i;
665 continue;
666 }
667
668 oprnd_info = (*oprnds_info)[i];
669 dt = dts[i];
670 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
671 oprnd = oprnd_info->ops[stmt_num];
672 tree type = TREE_TYPE (oprnd);
673
674 if (!types_compatible_p (oprnd_info->first_op_type, type))
675 {
676 if (dump_enabled_p ())
677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
678 "Build SLP failed: different operand types\n");
679 return 1;
680 }
681
682 /* Not first stmt of the group, check that the def-stmt/s match
683 the def-stmt/s of the first stmt. Allow different definition
684 types for reduction chains: the first stmt must be a
685 vect_reduction_def (a phi node), and the rest
686 end in the reduction chain. */
687 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
688 && !(oprnd_info->first_dt == vect_reduction_def
689 && !STMT_VINFO_DATA_REF (stmt_info)
690 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
691 && def_stmt_info
692 && !STMT_VINFO_DATA_REF (def_stmt_info)
693 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
694 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
695 || (!STMT_VINFO_DATA_REF (stmt_info)
696 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
697 && ((!def_stmt_info
698 || STMT_VINFO_DATA_REF (def_stmt_info)
699 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
700 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
701 != (oprnd_info->first_dt != vect_reduction_def))))
702 {
703 /* Try swapping operands if we got a mismatch. For BB
704 vectorization only in case it will clearly improve things. */
705 if (i == commutative_op && !swapped
706 && (!is_a <bb_vec_info> (vinfo)
707 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
708 dts[i+1])
709 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
710 || vect_def_types_match
711 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
712 {
713 if (dump_enabled_p ())
714 dump_printf_loc (MSG_NOTE, vect_location,
715 "trying swapped operands\n");
716 std::swap (dts[i], dts[i+1]);
717 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
718 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
719 std::swap ((*oprnds_info)[i]->ops[stmt_num],
720 (*oprnds_info)[i+1]->ops[stmt_num]);
721 swapped = true;
722 continue;
723 }
724
725 if (is_a <bb_vec_info> (vinfo)
726 && !oprnd_info->any_pattern)
727 {
728 /* Now for commutative ops we should see whether we can
729 make the other operand matching. */
730 if (dump_enabled_p ())
731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
732 "treating operand as external\n");
733 oprnd_info->first_dt = dt = vect_external_def;
734 }
735 else
736 {
737 if (dump_enabled_p ())
738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
739 "Build SLP failed: different types\n");
740 return 1;
741 }
742 }
743
744 /* Make sure to demote the overall operand to external. */
745 if (dt == vect_external_def)
746 oprnd_info->first_dt = vect_external_def;
747 /* For a SLP reduction chain we want to duplicate the reduction to
748 each of the chain members. That gets us a sane SLP graph (still
749 the stmts are not 100% correct wrt the initial values). */
750 else if ((dt == vect_internal_def
751 || dt == vect_reduction_def)
752 && oprnd_info->first_dt == vect_reduction_def
753 && !STMT_VINFO_DATA_REF (stmt_info)
754 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
755 && !STMT_VINFO_DATA_REF (def_stmt_info)
756 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
757 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
758 {
759 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
760 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
761 }
762
763 ++i;
764 }
765
766 /* Swap operands. */
767 if (swapped)
768 {
769 if (dump_enabled_p ())
770 dump_printf_loc (MSG_NOTE, vect_location,
771 "swapped operands to match def types in %G",
772 stmt_info->stmt);
773 }
774
775 return 0;
776 }
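/* Illustrative example (hypothetical stmts): for the group

     x0 = a0 + inv;   x1 = inv + a1;

   where INV is defined outside the vectorized region, the first stmt
   records operand 0 as vect_internal_def and operand 1 as
   vect_external_def.  The second stmt mismatches both, but because
   PLUS_EXPR is commutative its operands are swapped so that both lanes
   agree, instead of failing the SLP build.  */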
777
778 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
779 Return true if we can, meaning that this choice doesn't conflict with
780 existing SLP nodes that use STMT_INFO. */
781
782 bool
783 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
784 {
785 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
786 if (old_vectype)
787 return useless_type_conversion_p (vectype, old_vectype);
788
789 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
790 {
791 /* We maintain the invariant that if any statement in the group is
792 used, all other members of the group have the same vector type. */
793 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
794 stmt_vec_info member_info = first_info;
795 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
796 if (is_pattern_stmt_p (member_info)
797 && !useless_type_conversion_p (vectype,
798 STMT_VINFO_VECTYPE (member_info)))
799 break;
800
801 if (!member_info)
802 {
803 for (member_info = first_info; member_info;
804 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
805 STMT_VINFO_VECTYPE (member_info) = vectype;
806 return true;
807 }
808 }
809 else if (!is_pattern_stmt_p (stmt_info))
810 {
811 STMT_VINFO_VECTYPE (stmt_info) = vectype;
812 return true;
813 }
814
815 if (dump_enabled_p ())
816 {
817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
818 "Build SLP failed: incompatible vector"
819 " types for: %G", stmt_info->stmt);
820 dump_printf_loc (MSG_NOTE, vect_location,
821 " old vector type: %T\n", old_vectype);
822 dump_printf_loc (MSG_NOTE, vect_location,
823 " new vector type: %T\n", vectype);
824 }
825 return false;
826 }
827
828 /* Return true if call statements CALL1 and CALL2 are similar enough
829 to be combined into the same SLP group. */
830
831 static bool
832 compatible_calls_p (gcall *call1, gcall *call2)
833 {
834 unsigned int nargs = gimple_call_num_args (call1);
835 if (nargs != gimple_call_num_args (call2))
836 return false;
837
838 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
839 return false;
840
841 if (gimple_call_internal_p (call1))
842 {
843 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
844 TREE_TYPE (gimple_call_lhs (call2))))
845 return false;
846 for (unsigned int i = 0; i < nargs; ++i)
847 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
848 TREE_TYPE (gimple_call_arg (call2, i))))
849 return false;
850 }
851 else
852 {
853 if (!operand_equal_p (gimple_call_fn (call1),
854 gimple_call_fn (call2), 0))
855 return false;
856
857 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
858 return false;
859 }
860 return true;
861 }
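/* Illustrative example (hypothetical calls): two lanes calling

     t0 = sqrtf (a0);   t1 = sqrtf (a1);

   map to the same combined_fn with compatible types and can share an SLP
   group, whereas mixing sqrtf with sqrt, or calls through differing
   function pointers, is rejected.  */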
862
863 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
864 caller's attempt to find the vector type in STMT_INFO with the narrowest
865 element type. Return true if VECTYPE is nonnull and if it is valid
866 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
867 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
868 vect_build_slp_tree. */
869
870 static bool
871 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
872 unsigned int group_size,
873 tree vectype, poly_uint64 *max_nunits)
874 {
875 if (!vectype)
876 {
877 if (dump_enabled_p ())
878 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
879 "Build SLP failed: unsupported data-type in %G\n",
880 stmt_info->stmt);
881 /* Fatal mismatch. */
882 return false;
883 }
884
885 /* If populating the vector type requires unrolling then fail
886 before adjusting *max_nunits for basic-block vectorization. */
887 if (is_a <bb_vec_info> (vinfo)
888 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
889 {
890 if (dump_enabled_p ())
891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
892 "Build SLP failed: unrolling required "
893 "in basic block SLP\n");
894 /* Fatal mismatch. */
895 return false;
896 }
897
898 /* In case of multiple types we need to detect the smallest type. */
899 vect_update_max_nunits (max_nunits, vectype);
900 return true;
901 }
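/* Illustrative example: in basic-block SLP a group of 3 stmts with a
   4-element vector type would require unrolling (3 is not a multiple
   of 4), so the build fails here; in loop vectorization the same
   situation merely updates *MAX_NUNITS and lets the unrolling factor
   deal with it.  */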
902
 903 /* Check whether the scalar stmts STMTS are isomorphic, whether they
 904    require a data permutation and whether they use unsupported types of
 905    operation.  Return true if they can form an SLP node, otherwise return
 906    false and indicate in *MATCHES which stmts are not isomorphic to the
 907    first one.  If MATCHES[0] is false then this indicates the comparison
 908    could not be carried out or the stmts will never be vectorized by SLP.
909
910 Note COND_EXPR is possibly isomorphic to another one after swapping its
911 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
912 the first stmt by swapping the two operands of comparison; set SWAP[i]
 913    to 2 if stmt I is isomorphic to the first stmt by inverting the code
 914    of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
915 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
916
917 static bool
918 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
919 vec<stmt_vec_info> stmts, unsigned int group_size,
920 poly_uint64 *max_nunits, bool *matches,
921 bool *two_operators, tree *node_vectype)
922 {
923 unsigned int i;
924 stmt_vec_info first_stmt_info = stmts[0];
925 enum tree_code first_stmt_code = ERROR_MARK;
926 enum tree_code alt_stmt_code = ERROR_MARK;
927 enum tree_code rhs_code = ERROR_MARK;
928 enum tree_code first_cond_code = ERROR_MARK;
929 tree lhs;
930 bool need_same_oprnds = false;
931 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
932 optab optab;
933 int icode;
934 machine_mode optab_op2_mode;
935 machine_mode vec_mode;
936 stmt_vec_info first_load = NULL, prev_first_load = NULL;
937 bool first_stmt_load_p = false, load_p = false;
938 bool first_stmt_phi_p = false, phi_p = false;
939 bool maybe_soft_fail = false;
940 tree soft_fail_nunits_vectype = NULL_TREE;
941
942 /* For every stmt in NODE find its def stmt/s. */
943 stmt_vec_info stmt_info;
944 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
945 {
946 gimple *stmt = stmt_info->stmt;
947 swap[i] = 0;
948 matches[i] = false;
949
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
952
 953       /* Fail to vectorize statements marked as unvectorizable, that can
 954          throw internally or that have volatile operands.  */
955 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
956 || stmt_can_throw_internal (cfun, stmt)
957 || gimple_has_volatile_ops (stmt))
958 {
959 if (dump_enabled_p ())
960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
961 "Build SLP failed: unvectorizable statement %G",
962 stmt);
 963           /* ??? For BB vectorization we want to commute operands in a way
 964              that shuffles all unvectorizable defs into one operand and keeps
 965              the other vectorizable.  The following doesn't reliably
 966              work for that, but it's the easiest we can do here.  */
967 if (is_a <bb_vec_info> (vinfo) && i != 0)
968 continue;
969 /* Fatal mismatch. */
970 matches[0] = false;
971 return false;
972 }
973
974 lhs = gimple_get_lhs (stmt);
975 if (lhs == NULL_TREE)
976 {
977 if (dump_enabled_p ())
978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
979 "Build SLP failed: not GIMPLE_ASSIGN nor "
980 "GIMPLE_CALL %G", stmt);
981 if (is_a <bb_vec_info> (vinfo) && i != 0)
982 continue;
983 /* Fatal mismatch. */
984 matches[0] = false;
985 return false;
986 }
987
988 tree nunits_vectype;
989 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
990 &nunits_vectype, group_size))
991 {
992 if (is_a <bb_vec_info> (vinfo) && i != 0)
993 continue;
994 /* Fatal mismatch. */
995 matches[0] = false;
996 return false;
997 }
998 /* Record nunits required but continue analysis, producing matches[]
999 as if nunits was not an issue. This allows splitting of groups
1000 to happen. */
1001 if (nunits_vectype
1002 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1003 nunits_vectype, max_nunits))
1004 {
1005 gcc_assert (is_a <bb_vec_info> (vinfo));
1006 maybe_soft_fail = true;
1007 soft_fail_nunits_vectype = nunits_vectype;
1008 }
1009
1010 gcc_assert (vectype);
1011
1012 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1013 if (call_stmt)
1014 {
1015 rhs_code = CALL_EXPR;
1016
1017 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
1018 load_p = true;
1019 else if ((gimple_call_internal_p (call_stmt)
1020 && (!vectorizable_internal_fn_p
1021 (gimple_call_internal_fn (call_stmt))))
1022 || gimple_call_tail_p (call_stmt)
1023 || gimple_call_noreturn_p (call_stmt)
1024 || !gimple_call_nothrow_p (call_stmt)
1025 || gimple_call_chain (call_stmt))
1026 {
1027 if (dump_enabled_p ())
1028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1029 "Build SLP failed: unsupported call type %G",
1030 call_stmt);
1031 if (is_a <bb_vec_info> (vinfo) && i != 0)
1032 continue;
1033 /* Fatal mismatch. */
1034 matches[0] = false;
1035 return false;
1036 }
1037 }
1038 else if (gimple_code (stmt) == GIMPLE_PHI)
1039 {
1040 rhs_code = ERROR_MARK;
1041 phi_p = true;
1042 }
1043 else
1044 {
1045 rhs_code = gimple_assign_rhs_code (stmt);
1046 load_p = gimple_vuse (stmt);
1047 }
1048
1049 /* Check the operation. */
1050 if (i == 0)
1051 {
1052 *node_vectype = vectype;
1053 first_stmt_code = rhs_code;
1054 first_stmt_load_p = load_p;
1055 first_stmt_phi_p = phi_p;
1056
1057 /* Shift arguments should be equal in all the packed stmts for a
1058 vector shift with scalar shift operand. */
1059 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1060 || rhs_code == LROTATE_EXPR
1061 || rhs_code == RROTATE_EXPR)
1062 {
1063 vec_mode = TYPE_MODE (vectype);
1064
1065 /* First see if we have a vector/vector shift. */
1066 optab = optab_for_tree_code (rhs_code, vectype,
1067 optab_vector);
1068
1069 if (!optab
1070 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1071 {
1072 /* No vector/vector shift, try for a vector/scalar shift. */
1073 optab = optab_for_tree_code (rhs_code, vectype,
1074 optab_scalar);
1075
1076 if (!optab)
1077 {
1078 if (dump_enabled_p ())
1079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1080 "Build SLP failed: no optab.\n");
1081 if (is_a <bb_vec_info> (vinfo) && i != 0)
1082 continue;
1083 /* Fatal mismatch. */
1084 matches[0] = false;
1085 return false;
1086 }
1087 icode = (int) optab_handler (optab, vec_mode);
1088 if (icode == CODE_FOR_nothing)
1089 {
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1092 "Build SLP failed: "
1093 "op not supported by target.\n");
1094 if (is_a <bb_vec_info> (vinfo) && i != 0)
1095 continue;
1096 /* Fatal mismatch. */
1097 matches[0] = false;
1098 return false;
1099 }
1100 optab_op2_mode = insn_data[icode].operand[2].mode;
1101 if (!VECTOR_MODE_P (optab_op2_mode))
1102 {
1103 need_same_oprnds = true;
1104 first_op1 = gimple_assign_rhs2 (stmt);
1105 }
1106 }
1107 }
1108 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1109 {
1110 need_same_oprnds = true;
1111 first_op1 = gimple_assign_rhs2 (stmt);
1112 }
1113 else if (!load_p
1114 && rhs_code == BIT_FIELD_REF)
1115 {
1116 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1117 if (!is_a <bb_vec_info> (vinfo)
1118 || TREE_CODE (vec) != SSA_NAME
1119 || !operand_equal_p (TYPE_SIZE (vectype),
1120 TYPE_SIZE (TREE_TYPE (vec))))
1121 {
1122 if (dump_enabled_p ())
1123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1124 "Build SLP failed: "
1125 "BIT_FIELD_REF not supported\n");
1126 /* Fatal mismatch. */
1127 matches[0] = false;
1128 return false;
1129 }
1130 }
1131 else if (call_stmt
1132 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1133 {
1134 need_same_oprnds = true;
1135 first_op1 = gimple_call_arg (call_stmt, 1);
1136 }
1137 }
1138 else
1139 {
1140 if (first_stmt_code != rhs_code
1141 && alt_stmt_code == ERROR_MARK)
1142 alt_stmt_code = rhs_code;
1143 if ((first_stmt_code != rhs_code
1144 && (first_stmt_code != IMAGPART_EXPR
1145 || rhs_code != REALPART_EXPR)
1146 && (first_stmt_code != REALPART_EXPR
1147 || rhs_code != IMAGPART_EXPR)
1148 /* Handle mismatches in plus/minus by computing both
1149 and merging the results. */
1150 && !((first_stmt_code == PLUS_EXPR
1151 || first_stmt_code == MINUS_EXPR)
1152 && (alt_stmt_code == PLUS_EXPR
1153 || alt_stmt_code == MINUS_EXPR)
1154 && rhs_code == alt_stmt_code)
1155 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1156 && (first_stmt_code == ARRAY_REF
1157 || first_stmt_code == BIT_FIELD_REF
1158 || first_stmt_code == INDIRECT_REF
1159 || first_stmt_code == COMPONENT_REF
1160 || first_stmt_code == MEM_REF)))
1161 || first_stmt_load_p != load_p
1162 || first_stmt_phi_p != phi_p)
1163 {
1164 if (dump_enabled_p ())
1165 {
1166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1167 "Build SLP failed: different operation "
1168 "in stmt %G", stmt);
1169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1170 "original stmt %G", first_stmt_info->stmt);
1171 }
1172 /* Mismatch. */
1173 continue;
1174 }
1175
1176 if (need_same_oprnds)
1177 {
1178 tree other_op1 = (call_stmt
1179 ? gimple_call_arg (call_stmt, 1)
1180 : gimple_assign_rhs2 (stmt));
1181 if (!operand_equal_p (first_op1, other_op1, 0))
1182 {
1183 if (dump_enabled_p ())
1184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1185 "Build SLP failed: different shift "
1186 "arguments in %G", stmt);
1187 /* Mismatch. */
1188 continue;
1189 }
1190 }
1191 if (!load_p
1192 && first_stmt_code == BIT_FIELD_REF
1193 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1194 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1195 {
1196 if (dump_enabled_p ())
1197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1198 "Build SLP failed: different BIT_FIELD_REF "
1199 "arguments in %G", stmt);
1200 /* Mismatch. */
1201 continue;
1202 }
1203
1204 if (!load_p && rhs_code == CALL_EXPR)
1205 {
1206 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1207 as_a <gcall *> (stmt)))
1208 {
1209 if (dump_enabled_p ())
1210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1211 "Build SLP failed: different calls in %G",
1212 stmt);
1213 /* Mismatch. */
1214 continue;
1215 }
1216 }
1217
1218 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1219 && (gimple_bb (first_stmt_info->stmt)
1220 != gimple_bb (stmt_info->stmt)))
1221 {
1222 if (dump_enabled_p ())
1223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1224 "Build SLP failed: different BB for PHI "
1225 "or possibly trapping operation in %G", stmt);
1226 /* Mismatch. */
1227 continue;
1228 }
1229
1230 if (!types_compatible_p (vectype, *node_vectype))
1231 {
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234 "Build SLP failed: different vector type "
1235 "in %G", stmt);
1236 /* Mismatch. */
1237 continue;
1238 }
1239 }
1240
1241 /* Grouped store or load. */
1242 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1243 {
1244 if (REFERENCE_CLASS_P (lhs))
1245 {
1246 /* Store. */
1247 ;
1248 }
1249 else
1250 {
1251 /* Load. */
1252 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1253 if (prev_first_load)
1254 {
1255 /* Check that there are no loads from different interleaving
1256 chains in the same node. */
1257 if (prev_first_load != first_load)
1258 {
1259 if (dump_enabled_p ())
1260 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1261 vect_location,
1262 "Build SLP failed: different "
1263 "interleaving chains in one node %G",
1264 stmt);
1265 /* Mismatch. */
1266 continue;
1267 }
1268 }
1269 else
1270 prev_first_load = first_load;
1271 }
1272 } /* Grouped access. */
1273 else
1274 {
1275 if (load_p)
1276 {
 1277           /* Non-grouped load.  */
1278 if (dump_enabled_p ())
1279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1280 "Build SLP failed: not grouped load %G", stmt);
1281
 1282           /* FORNOW: Non-grouped loads are not supported.  */
1283 if (is_a <bb_vec_info> (vinfo) && i != 0)
1284 continue;
1285 /* Fatal mismatch. */
1286 matches[0] = false;
1287 return false;
1288 }
1289
 1290           /* Not a memory operation.  */
1291 if (!phi_p
1292 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1293 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1294 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1295 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1296 && rhs_code != VIEW_CONVERT_EXPR
1297 && rhs_code != CALL_EXPR
1298 && rhs_code != BIT_FIELD_REF)
1299 {
1300 if (dump_enabled_p ())
1301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1302 "Build SLP failed: operation unsupported %G",
1303 stmt);
1304 if (is_a <bb_vec_info> (vinfo) && i != 0)
1305 continue;
1306 /* Fatal mismatch. */
1307 matches[0] = false;
1308 return false;
1309 }
1310
1311 if (rhs_code == COND_EXPR)
1312 {
1313 tree cond_expr = gimple_assign_rhs1 (stmt);
1314 enum tree_code cond_code = TREE_CODE (cond_expr);
1315 enum tree_code swap_code = ERROR_MARK;
1316 enum tree_code invert_code = ERROR_MARK;
1317
1318 if (i == 0)
1319 first_cond_code = TREE_CODE (cond_expr);
1320 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1321 {
1322 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1323 swap_code = swap_tree_comparison (cond_code);
1324 invert_code = invert_tree_comparison (cond_code, honor_nans);
1325 }
1326
1327 if (first_cond_code == cond_code)
1328 ;
1329 /* Isomorphic can be achieved by swapping. */
1330 else if (first_cond_code == swap_code)
1331 swap[i] = 1;
1332 /* Isomorphic can be achieved by inverting. */
1333 else if (first_cond_code == invert_code)
1334 swap[i] = 2;
1335 else
1336 {
1337 if (dump_enabled_p ())
1338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1339 "Build SLP failed: different"
1340 " operation %G", stmt);
1341 /* Mismatch. */
1342 continue;
1343 }
1344 }
1345 }
1346
1347 matches[i] = true;
1348 }
1349
1350 for (i = 0; i < group_size; ++i)
1351 if (!matches[i])
1352 return false;
1353
1354 /* If we allowed a two-operation SLP node verify the target can cope
1355 with the permute we are going to use. */
1356 if (alt_stmt_code != ERROR_MARK
1357 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1358 {
1359 *two_operators = true;
1360 }
1361
1362 if (maybe_soft_fail)
1363 {
1364 unsigned HOST_WIDE_INT const_nunits;
1365 if (!TYPE_VECTOR_SUBPARTS
1366 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1367 || const_nunits > group_size)
1368 matches[0] = false;
1369 else
1370 {
 1371           /* With a constant number of vector elements, simulate a mismatch
 1372              at the point where we need to split.  */
1373 unsigned tail = group_size & (const_nunits - 1);
1374 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1375 }
1376 return false;
1377 }
1378
1379 return true;
1380 }
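/* Illustrative examples (hypothetical stmts): the group

     x0 = a0 + b0;  x1 = a1 - b1;  x2 = a2 + b2;  x3 = a3 - b3;

   is accepted with FIRST_STMT_CODE == PLUS_EXPR and ALT_STMT_CODE ==
   MINUS_EXPR and sets *TWO_OPERATORS, so both operations are generated
   and the lanes blended with a permute later.  Likewise for COND_EXPRs,

     y0 = a0 >= b0 ? p0 : q0;   y1 = b1 <= a1 ? p1 : q1;

   sets SWAP[1] to 1 because swapping the comparison operands of stmt 1
   makes it isomorphic to stmt 0.  */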
1381
 1382 /* Traits for the hash_map used to cache SLP builds for a stmt set.
1383 Note we never remove apart from at destruction time so we do not
1384 need a special value for deleted that differs from empty. */
1385 struct bst_traits
1386 {
1387 typedef vec <stmt_vec_info> value_type;
1388 typedef vec <stmt_vec_info> compare_type;
1389 static inline hashval_t hash (value_type);
1390 static inline bool equal (value_type existing, value_type candidate);
1391 static inline bool is_empty (value_type x) { return !x.exists (); }
1392 static inline bool is_deleted (value_type x) { return !x.exists (); }
1393 static const bool empty_zero_p = true;
1394 static inline void mark_empty (value_type &x) { x.release (); }
1395 static inline void mark_deleted (value_type &x) { x.release (); }
1396 static inline void remove (value_type &x) { x.release (); }
1397 };
1398 inline hashval_t
1399 bst_traits::hash (value_type x)
1400 {
1401 inchash::hash h;
1402 for (unsigned i = 0; i < x.length (); ++i)
1403 h.add_int (gimple_uid (x[i]->stmt));
1404 return h.end ();
1405 }
1406 inline bool
1407 bst_traits::equal (value_type existing, value_type candidate)
1408 {
1409 if (existing.length () != candidate.length ())
1410 return false;
1411 for (unsigned i = 0; i < existing.length (); ++i)
1412 if (existing[i] != candidate[i])
1413 return false;
1414 return true;
1415 }
1416
1417 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1418 simple_hashmap_traits <bst_traits, slp_tree> >
1419 scalar_stmts_to_slp_tree_map_t;
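/* The map above caches discovery results per scalar stmt set: the first
   vect_build_slp_tree call for a stmt vector records the built node (or
   NULL on failure), and any later request for the same stmts bumps the
   reference count and re-uses the cached node, turning SLP discovery into
   building a graph rather than a tree.  A rough sketch of the lookup,
   mirroring the code below:

     if (slp_tree *leader = bst_map->get (stmts))
       return *leader;  */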
1420
1421 static slp_tree
1422 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1423 vec<stmt_vec_info> stmts, unsigned int group_size,
1424 poly_uint64 *max_nunits,
1425 bool *matches, unsigned *limit, unsigned *tree_size,
1426 scalar_stmts_to_slp_tree_map_t *bst_map);
1427
1428 static slp_tree
1429 vect_build_slp_tree (vec_info *vinfo,
1430 vec<stmt_vec_info> stmts, unsigned int group_size,
1431 poly_uint64 *max_nunits,
1432 bool *matches, unsigned *limit, unsigned *tree_size,
1433 scalar_stmts_to_slp_tree_map_t *bst_map)
1434 {
1435 if (slp_tree *leader = bst_map->get (stmts))
1436 {
1437 if (dump_enabled_p ())
1438 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1439 *leader ? "" : "failed ", *leader);
1440 if (*leader)
1441 {
1442 SLP_TREE_REF_COUNT (*leader)++;
1443 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1444 stmts.release ();
1445 }
1446 return *leader;
1447 }
1448
1449 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1450 so we can pick up backedge destinations during discovery. */
1451 slp_tree res = new _slp_tree;
1452 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1453 SLP_TREE_SCALAR_STMTS (res) = stmts;
1454 bst_map->put (stmts.copy (), res);
1455
1456 if (*limit == 0)
1457 {
1458 if (dump_enabled_p ())
1459 dump_printf_loc (MSG_NOTE, vect_location,
1460 "SLP discovery limit exceeded\n");
1461 bool existed_p = bst_map->put (stmts, NULL);
1462 gcc_assert (existed_p);
1463 /* Mark the node invalid so we can detect those when still in use
1464 as backedge destinations. */
1465 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1466 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1467 vect_free_slp_tree (res);
1468 memset (matches, 0, sizeof (bool) * group_size);
1469 return NULL;
1470 }
1471 --*limit;
1472
1473 poly_uint64 this_max_nunits = 1;
1474 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1475 &this_max_nunits,
1476 matches, limit, tree_size, bst_map);
1477 if (!res_)
1478 {
1479 bool existed_p = bst_map->put (stmts, NULL);
1480 gcc_assert (existed_p);
1481 /* Mark the node invalid so we can detect those when still in use
1482 as backedge destinations. */
1483 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1484 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1485 vect_free_slp_tree (res);
1486 }
1487 else
1488 {
1489 gcc_assert (res_ == res);
1490 res->max_nunits = this_max_nunits;
1491 vect_update_max_nunits (max_nunits, this_max_nunits);
1492 /* Keep a reference for the bst_map use. */
1493 SLP_TREE_REF_COUNT (res)++;
1494 }
1495 return res_;
1496 }
1497
 1498 /* Recursively build an SLP tree starting from NODE.
 1499    Return NULL (with MATCHES indicating which stmts failed) if the
 1500    def-stmts are not isomorphic, require data permutation or are of
 1501    unsupported types of operation.  Otherwise return the built SLP
 1502    node; MATCHES[I] says whether STMTS[I] is isomorphic to the
 1503    first stmt of the group.  */
1504
1505 static slp_tree
1506 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1507 vec<stmt_vec_info> stmts, unsigned int group_size,
1508 poly_uint64 *max_nunits,
1509 bool *matches, unsigned *limit, unsigned *tree_size,
1510 scalar_stmts_to_slp_tree_map_t *bst_map)
1511 {
1512 unsigned nops, i, this_tree_size = 0;
1513 poly_uint64 this_max_nunits = *max_nunits;
1514
1515 matches[0] = false;
1516
1517 stmt_vec_info stmt_info = stmts[0];
1518 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1519 nops = gimple_call_num_args (stmt);
1520 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1521 {
1522 nops = gimple_num_ops (stmt) - 1;
1523 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1524 nops++;
1525 }
1526 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1527 nops = gimple_phi_num_args (phi);
1528 else
1529 return NULL;
1530
1531 /* If the SLP node is a PHI (induction or reduction), terminate
1532 the recursion. */
1533 bool *skip_args = XALLOCAVEC (bool, nops);
1534 memset (skip_args, 0, sizeof (bool) * nops);
1535 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1536 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1537 {
1538 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1539 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1540 group_size);
1541 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1542 max_nunits))
1543 return NULL;
1544
1545 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1546 if (def_type == vect_induction_def)
1547 {
1548 /* Induction PHIs are not cycles but walk the initial
 1549            value.  Only for inner loops though; for outer loops
1550 we need to pick up the value from the actual PHIs
1551 to more easily support peeling and epilogue vectorization. */
1552 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1553 if (!nested_in_vect_loop_p (loop, stmt_info))
1554 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1555 else
1556 loop = loop->inner;
1557 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1558 }
1559 else if (def_type == vect_reduction_def
1560 || def_type == vect_double_reduction_def
1561 || def_type == vect_nested_cycle)
1562 {
1563 /* Else def types have to match. */
1564 stmt_vec_info other_info;
1565 bool all_same = true;
1566 FOR_EACH_VEC_ELT (stmts, i, other_info)
1567 {
1568 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1569 return NULL;
1570 if (other_info != stmt_info)
1571 all_same = false;
1572 }
1573 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 1574         /* Reduction initial values are not explicitly represented.  */
1575 if (!nested_in_vect_loop_p (loop, stmt_info))
1576 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1577 /* Reduction chain backedge defs are filled manually.
1578 ??? Need a better way to identify a SLP reduction chain PHI.
1579 Or a better overall way to SLP match those. */
1580 if (all_same && def_type == vect_reduction_def)
1581 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1582 }
1583 else if (def_type != vect_internal_def)
1584 return NULL;
1585 }
1586
1587
1588 bool two_operators = false;
1589 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1590 tree vectype = NULL_TREE;
1591 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1592 &this_max_nunits, matches, &two_operators,
1593 &vectype))
1594 return NULL;
1595
1596 /* If the SLP node is a load, terminate the recursion unless masked. */
1597 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1598 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1599 {
1600 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1601 {
1602 /* Masked load. */
1603 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1604 nops = 1;
1605 }
1606 else
1607 {
1608 *max_nunits = this_max_nunits;
1609 (*tree_size)++;
1610 node = vect_create_new_slp_node (node, stmts, 0);
1611 SLP_TREE_VECTYPE (node) = vectype;
1612 /* And compute the load permutation. Whether it is actually
1613 a permutation depends on the unrolling factor which is
1614 decided later. */
1615 vec<unsigned> load_permutation;
1616 int j;
1617 stmt_vec_info load_info;
1618 load_permutation.create (group_size);
1619 stmt_vec_info first_stmt_info
1620 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1621 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1622 {
1623 int load_place = vect_get_place_in_interleaving_chain
1624 (load_info, first_stmt_info);
1625 gcc_assert (load_place != -1);
1626 load_permutation.safe_push (load_place);
1627 }
1628 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1629 return node;
1630 }
1631 }
1632 else if (gimple_assign_single_p (stmt_info->stmt)
1633 && !gimple_vuse (stmt_info->stmt)
1634 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1635 {
 1636       /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
 1637          the same SSA name vector of a type compatible with VECTYPE.  */
1638 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1639 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1640 stmt_vec_info estmt_info;
1641 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1642 {
1643 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1644 tree bfref = gimple_assign_rhs1 (estmt);
1645 HOST_WIDE_INT lane;
1646 if (!known_eq (bit_field_size (bfref),
1647 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1648 || !constant_multiple_p (bit_field_offset (bfref),
1649 bit_field_size (bfref), &lane))
1650 {
1651 lperm.release ();
1652 return NULL;
1653 }
1654 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1655 }
1656 slp_tree vnode = vect_create_new_slp_node (vNULL);
1657 /* ??? We record vectype here but we hide eventually necessary
1658 punning and instead rely on code generation to materialize
1659 VIEW_CONVERT_EXPRs as necessary. We instead should make
1660 this explicit somehow. */
1661 SLP_TREE_VECTYPE (vnode) = vectype;
1662 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1663 /* We are always building a permutation node even if it is an identity
1664 permute to shield the rest of the vectorizer from the odd node
1665 representing an actual vector without any scalar ops.
 1666          ??? We could hide it completely by making the permute node
1667 external? */
1668 node = vect_create_new_slp_node (node, stmts, 1);
1669 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1670 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1671 SLP_TREE_VECTYPE (node) = vectype;
1672 SLP_TREE_CHILDREN (node).quick_push (vnode);
1673 return node;
1674 }
1675
1676 /* Get at the operands, verifying they are compatible. */
1677 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1678 slp_oprnd_info oprnd_info;
1679 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1680 {
1681 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
1682 stmts, i, &oprnds_info);
1683 if (res != 0)
1684 matches[(res == -1) ? 0 : i] = false;
1685 if (!matches[0])
1686 break;
1687 }
1688 for (i = 0; i < group_size; ++i)
1689 if (!matches[i])
1690 {
1691 vect_free_oprnd_info (oprnds_info);
1692 return NULL;
1693 }
1694 swap = NULL;
1695
1696 auto_vec<slp_tree, 4> children;
1697
1698 stmt_info = stmts[0];
1699
1700 /* Create SLP_TREE nodes for the definition node/s. */
1701 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1702 {
1703 slp_tree child;
1704 unsigned int j;
1705
1706 /* We're skipping certain operands from processing, for example
1707 outer loop reduction initial defs. */
1708 if (skip_args[i])
1709 {
1710 children.safe_push (NULL);
1711 continue;
1712 }
1713
1714 if (oprnd_info->first_dt == vect_uninitialized_def)
1715 {
 1716           /* COND_EXPRs have one operand too many here when the condition
 1717              is an SSA name.  */
1718 gcc_assert (i == 3 && nops == 4);
1719 continue;
1720 }
1721
1722 if (is_a <bb_vec_info> (vinfo)
1723 && oprnd_info->first_dt == vect_internal_def
1724 && !oprnd_info->any_pattern)
1725 {
1726 /* For BB vectorization, if all defs are the same do not
1727 bother to continue the build along the single-lane
1728 graph but use a splat of the scalar value. */
1729 stmt_vec_info first_def = oprnd_info->def_stmts[0];
1730 for (j = 1; j < group_size; ++j)
1731 if (oprnd_info->def_stmts[j] != first_def)
1732 break;
1733 if (j == group_size
1734 /* But avoid doing this for loads where we may be
1735 able to CSE things, unless the stmt is not
1736 vectorizable. */
1737 && (!STMT_VINFO_VECTORIZABLE (first_def)
1738 || !gimple_vuse (first_def->stmt)))
1739 {
1740 if (dump_enabled_p ())
1741 dump_printf_loc (MSG_NOTE, vect_location,
1742 "Using a splat of the uniform operand\n");
1743 oprnd_info->first_dt = vect_external_def;
1744 }
1745 }
1746
1747 if (oprnd_info->first_dt == vect_external_def
1748 || oprnd_info->first_dt == vect_constant_def)
1749 {
1750 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1751 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1752 oprnd_info->ops = vNULL;
1753 children.safe_push (invnode);
1754 continue;
1755 }
1756
1757 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1758 group_size, &this_max_nunits,
1759 matches, limit,
1760 &this_tree_size, bst_map)) != NULL)
1761 {
1762 oprnd_info->def_stmts = vNULL;
1763 children.safe_push (child);
1764 continue;
1765 }
1766
1767 /* If the SLP build for operand zero failed and operand zero
 1768          and one can be commuted, try that for the scalar stmts
1769 that failed the match. */
1770 if (i == 0
1771 /* A first scalar stmt mismatch signals a fatal mismatch. */
1772 && matches[0]
1773 /* ??? For COND_EXPRs we can swap the comparison operands
1774 as well as the arms under some constraints. */
1775 && nops == 2
1776 && oprnds_info[1]->first_dt == vect_internal_def
1777 && is_gimple_assign (stmt_info->stmt)
1778 /* Swapping operands for reductions breaks assumptions later on. */
1779 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1780 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1781 {
1782 /* See whether we can swap the matching or the non-matching
1783 stmt operands. */
1784 bool swap_not_matching = true;
1785 do
1786 {
1787 for (j = 0; j < group_size; ++j)
1788 {
1789 if (matches[j] != !swap_not_matching)
1790 continue;
1791 stmt_vec_info stmt_info = stmts[j];
1792 /* Verify if we can swap operands of this stmt. */
1793 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1794 if (!stmt
1795 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1796 {
1797 if (!swap_not_matching)
1798 goto fail;
1799 swap_not_matching = false;
1800 break;
1801 }
1802 }
1803 }
1804 while (j != group_size);
1805
1806 /* Swap mismatched definition stmts. */
1807 if (dump_enabled_p ())
1808 dump_printf_loc (MSG_NOTE, vect_location,
1809 "Re-trying with swapped operands of stmts ");
1810 for (j = 0; j < group_size; ++j)
1811 if (matches[j] == !swap_not_matching)
1812 {
1813 std::swap (oprnds_info[0]->def_stmts[j],
1814 oprnds_info[1]->def_stmts[j]);
1815 std::swap (oprnds_info[0]->ops[j],
1816 oprnds_info[1]->ops[j]);
1817 if (dump_enabled_p ())
1818 dump_printf (MSG_NOTE, "%d ", j);
1819 }
1820 if (dump_enabled_p ())
1821 dump_printf (MSG_NOTE, "\n");
1822 /* And try again with scratch 'matches' ... */
1823 bool *tem = XALLOCAVEC (bool, group_size);
1824 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1825 group_size, &this_max_nunits,
1826 tem, limit,
1827 &this_tree_size, bst_map)) != NULL)
1828 {
1829 oprnd_info->def_stmts = vNULL;
1830 children.safe_push (child);
1831 continue;
1832 }
1833 }
1834 fail:
1835
1836 /* If the SLP build failed and we analyze a basic-block
1837 simply treat nodes we fail to build as externally defined
1838 (and thus build vectors from the scalar defs).
1839 The cost model will reject outright expensive cases.
1840 	     ??? This doesn't treat cases where permutation ultimately
1841 fails (or we don't try permutation below). Ideally we'd
1842 even compute a permutation that will end up with the maximum
1843 SLP tree size... */
1844 if (is_a <bb_vec_info> (vinfo)
1845 /* ??? Rejecting patterns this way doesn't work. We'd have to
1846 do extra work to cancel the pattern so the uses see the
1847 scalar version. */
1848 && !is_pattern_stmt_p (stmt_info)
1849 && !oprnd_info->any_pattern)
1850 {
1851 	  /* But if there's a leading vector-sized set of matching stmts
1852 	     fail here so we can split the group.  This matches the condition
1853 	     vect_analyze_slp_instance uses.  */
1854 /* ??? We might want to split here and combine the results to support
1855 multiple vector sizes better. */
1856 for (j = 0; j < group_size; ++j)
1857 if (!matches[j])
1858 break;
1859 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
1860 {
1861 if (dump_enabled_p ())
1862 dump_printf_loc (MSG_NOTE, vect_location,
1863 "Building vector operands from scalars\n");
1864 this_tree_size++;
1865 child = vect_create_new_slp_node (oprnd_info->ops);
1866 children.safe_push (child);
1867 oprnd_info->ops = vNULL;
1868 continue;
1869 }
1870 }
1871
1872 gcc_assert (child == NULL);
1873 FOR_EACH_VEC_ELT (children, j, child)
1874 if (child)
1875 vect_free_slp_tree (child);
1876 vect_free_oprnd_info (oprnds_info);
1877 return NULL;
1878 }
1879
1880 vect_free_oprnd_info (oprnds_info);
1881
1882   /* If all children of this node are built up from uniform scalars,
1883      or building the node requires more than one possibly expensive
1884      vector construction, throw it away so it is built up from scalars
1885      instead.  The exception is the SLP node for the vector store.  */
1886 if (is_a <bb_vec_info> (vinfo)
1887 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1888 /* ??? Rejecting patterns this way doesn't work. We'd have to
1889 do extra work to cancel the pattern so the uses see the
1890 scalar version. */
1891 && !is_pattern_stmt_p (stmt_info))
1892 {
1893 slp_tree child;
1894 unsigned j;
1895 bool all_uniform_p = true;
1896 unsigned n_vector_builds = 0;
1897 FOR_EACH_VEC_ELT (children, j, child)
1898 {
1899 if (!child)
1900 ;
1901 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1902 all_uniform_p = false;
1903 else if (!vect_slp_tree_uniform_p (child))
1904 {
1905 all_uniform_p = false;
1906 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
1907 n_vector_builds++;
1908 }
1909 }
1910 if (all_uniform_p
1911 || n_vector_builds > 1
1912 || (n_vector_builds == children.length ()
1913 && is_a <gphi *> (stmt_info->stmt)))
1914 {
1915 /* Roll back. */
1916 matches[0] = false;
1917 FOR_EACH_VEC_ELT (children, j, child)
1918 if (child)
1919 vect_free_slp_tree (child);
1920
1921 if (dump_enabled_p ())
1922 dump_printf_loc (MSG_NOTE, vect_location,
1923 "Building parent vector operands from "
1924 "scalars instead\n");
1925 return NULL;
1926 }
1927 }
1928
1929 *tree_size += this_tree_size + 1;
1930 *max_nunits = this_max_nunits;
1931
1932 if (two_operators)
1933 {
1934       /* ??? We'd likely want to either cache in bst_map something like
1935 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1936 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1937 explicit stmts to put in so the keying on 'stmts' doesn't
1938 work (but we have the same issue with nodes that use 'ops'). */
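      /* Illustrative example (not taken from a testcase): for the group
	 { a + b, a - b, c + d, c - d } node ONE gets code PLUS_EXPR, node
	 TWO gets MINUS_EXPR, and the lane permutation built below becomes
	 { (0,0), (1,1), (0,2), (1,3) }, i.e. even lanes are taken from ONE
	 and odd lanes from TWO.  */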
1939 slp_tree one = new _slp_tree;
1940 slp_tree two = new _slp_tree;
1941 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1942 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1943 SLP_TREE_VECTYPE (one) = vectype;
1944 SLP_TREE_VECTYPE (two) = vectype;
1945 SLP_TREE_CHILDREN (one).safe_splice (children);
1946 SLP_TREE_CHILDREN (two).safe_splice (children);
1947 slp_tree child;
1948 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1949 SLP_TREE_REF_COUNT (child)++;
1950
1951 /* Here we record the original defs since this
1952 node represents the final lane configuration. */
1953 node = vect_create_new_slp_node (node, stmts, 2);
1954 SLP_TREE_VECTYPE (node) = vectype;
1955 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1956 SLP_TREE_CHILDREN (node).quick_push (one);
1957 SLP_TREE_CHILDREN (node).quick_push (two);
1958 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1959 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1960 enum tree_code ocode = ERROR_MARK;
1961 stmt_vec_info ostmt_info;
1962 unsigned j = 0;
1963 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1964 {
1965 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1966 if (gimple_assign_rhs_code (ostmt) != code0)
1967 {
1968 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1969 ocode = gimple_assign_rhs_code (ostmt);
1970 j = i;
1971 }
1972 else
1973 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1974 }
1975 SLP_TREE_CODE (one) = code0;
1976 SLP_TREE_CODE (two) = ocode;
1977 SLP_TREE_LANES (one) = stmts.length ();
1978 SLP_TREE_LANES (two) = stmts.length ();
1979 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1980 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1981 return node;
1982 }
1983
1984 node = vect_create_new_slp_node (node, stmts, nops);
1985 SLP_TREE_VECTYPE (node) = vectype;
1986 SLP_TREE_CHILDREN (node).splice (children);
1987 return node;
1988 }
1989
1990 /* Dump a single SLP tree NODE. */
1991
1992 static void
1993 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1994 slp_tree node)
1995 {
1996 unsigned i, j;
1997 slp_tree child;
1998 stmt_vec_info stmt_info;
1999 tree op;
2000
2001 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2002 dump_user_location_t user_loc = loc.get_user_location ();
2003 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
2004 SLP_TREE_DEF_TYPE (node) == vect_external_def
2005 ? " (external)"
2006 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2007 ? " (constant)"
2008 : ""), node,
2009 estimated_poly_value (node->max_nunits),
2010 SLP_TREE_REF_COUNT (node));
2011 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2012 {
2013 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2014 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2015 else
2016 dump_printf_loc (metadata, user_loc, "op template: %G",
2017 SLP_TREE_REPRESENTATIVE (node)->stmt);
2018 }
2019 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2020 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2021 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2022 else
2023 {
2024 dump_printf_loc (metadata, user_loc, "\t{ ");
2025 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2026 dump_printf (metadata, "%T%s ", op,
2027 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2028 dump_printf (metadata, "}\n");
2029 }
2030 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2031 {
2032 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2033 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2034 dump_printf (dump_kind, " %u", j);
2035 dump_printf (dump_kind, " }\n");
2036 }
2037 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2038 {
2039 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2040 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2041 dump_printf (dump_kind, " %u[%u]",
2042 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2043 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2044 dump_printf (dump_kind, " }\n");
2045 }
2046 if (SLP_TREE_CHILDREN (node).is_empty ())
2047 return;
2048 dump_printf_loc (metadata, user_loc, "\tchildren");
2049 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2050 dump_printf (dump_kind, " %p", (void *)child);
2051 dump_printf (dump_kind, "\n");
2052 }
2053
2054 DEBUG_FUNCTION void
2055 debug (slp_tree node)
2056 {
2057 debug_dump_context ctx;
2058 vect_print_slp_tree (MSG_NOTE,
2059 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2060 node);
2061 }
2062
2063 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2064
2065 static void
2066 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2067 slp_tree node, hash_set<slp_tree> &visited)
2068 {
2069 unsigned i;
2070 slp_tree child;
2071
2072 if (visited.add (node))
2073 return;
2074
2075 vect_print_slp_tree (dump_kind, loc, node);
2076
2077 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2078 if (child)
2079 vect_print_slp_graph (dump_kind, loc, child, visited);
2080 }
2081
2082 static void
2083 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2084 slp_tree entry)
2085 {
2086 hash_set<slp_tree> visited;
2087 vect_print_slp_graph (dump_kind, loc, entry, visited);
2088 }
2089
2090 /* Mark the tree rooted at NODE with PURE_SLP. */
2091
2092 static void
2093 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2094 {
2095 int i;
2096 stmt_vec_info stmt_info;
2097 slp_tree child;
2098
2099 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2100 return;
2101
2102 if (visited.add (node))
2103 return;
2104
2105 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2106 STMT_SLP_TYPE (stmt_info) = pure_slp;
2107
2108 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2109 if (child)
2110 vect_mark_slp_stmts (child, visited);
2111 }
2112
2113 static void
2114 vect_mark_slp_stmts (slp_tree node)
2115 {
2116 hash_set<slp_tree> visited;
2117 vect_mark_slp_stmts (node, visited);
2118 }
2119
2120 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2121
2122 static void
2123 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2124 {
2125 int i;
2126 stmt_vec_info stmt_info;
2127 slp_tree child;
2128
2129 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2130 return;
2131
2132 if (visited.add (node))
2133 return;
2134
2135 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2136 {
2137 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2138 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2139 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2140 }
2141
2142 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2143 if (child)
2144 vect_mark_slp_stmts_relevant (child, visited);
2145 }
2146
2147 static void
2148 vect_mark_slp_stmts_relevant (slp_tree node)
2149 {
2150 hash_set<slp_tree> visited;
2151 vect_mark_slp_stmts_relevant (node, visited);
2152 }
2153
2154
2155 /* Gather loads in the SLP graph NODE and populate the LOADS array.  */
2156
2157 static void
2158 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2159 hash_set<slp_tree> &visited)
2160 {
2161 if (!node || visited.add (node))
2162 return;
2163
2164 if (SLP_TREE_CHILDREN (node).length () == 0)
2165 {
2166 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2167 return;
2168 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2169 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2170 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2171 loads.safe_push (node);
2172 }
2173 else
2174 {
2175 unsigned i;
2176 slp_tree child;
2177 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2178 vect_gather_slp_loads (loads, child, visited);
2179 }
2180 }
2181
2182
2183 /* Find the last scalar stmt in NODE.  */
2184
2185 stmt_vec_info
2186 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2187 {
2188 stmt_vec_info last = NULL;
2189 stmt_vec_info stmt_vinfo;
2190
2191 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2192 {
2193 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2194 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2195 }
2196
2197 return last;
2198 }
2199
2200 /* Find the first stmt in NODE. */
2201
2202 stmt_vec_info
2203 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2204 {
2205 stmt_vec_info first = NULL;
2206 stmt_vec_info stmt_vinfo;
2207
2208 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2209 {
2210 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2211 if (!first
2212 || get_later_stmt (stmt_vinfo, first) == first)
2213 first = stmt_vinfo;
2214 }
2215
2216 return first;
2217 }
2218
2219 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2220 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2221 (also containing the first GROUP1_SIZE stmts, since stores are
2222 consecutive), the second containing the remainder.
2223 Return the first stmt in the second group. */
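/* For example (hypothetical stmts): splitting { S0, S1, S2, S3, S4 } with
   GROUP1_SIZE 2 leaves { S0, S1 } as the first group and returns S2, which
   then heads the remaining group { S2, S3, S4 }.  */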
2224
2225 static stmt_vec_info
2226 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2227 {
2228 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2229 gcc_assert (group1_size > 0);
2230 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2231 gcc_assert (group2_size > 0);
2232 DR_GROUP_SIZE (first_vinfo) = group1_size;
2233
2234 stmt_vec_info stmt_info = first_vinfo;
2235 for (unsigned i = group1_size; i > 1; i--)
2236 {
2237 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2238 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2239 }
2240 /* STMT is now the last element of the first group. */
2241 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2242 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2243
2244 DR_GROUP_SIZE (group2) = group2_size;
2245 for (stmt_info = group2; stmt_info;
2246 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2247 {
2248 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2249 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2250 }
2251
2252 /* For the second group, the DR_GROUP_GAP is that before the original group,
2253 plus skipping over the first vector. */
2254 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2255
2256 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2257 DR_GROUP_GAP (first_vinfo) += group2_size;
2258
2259 if (dump_enabled_p ())
2260 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2261 group1_size, group2_size);
2262
2263 return group2;
2264 }
2265
2266 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2267 statements and a vector of NUNITS elements. */
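/* For instance, with NUNITS 8 and GROUP_SIZE 6 the common multiple is 24,
   so the computed unrolling factor is 24 / 6 == 4.  */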
2268
2269 static poly_uint64
2270 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2271 {
2272 return exact_div (common_multiple (nunits, group_size), group_size);
2273 }
2274
2275 /* Helper that checks whether a node is a load node.  */
2276
2277 static inline bool
2278 vect_is_slp_load_node (slp_tree root)
2279 {
2280 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2281 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2282 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2283 }
2284
2285
2286 /* Helper function of optimize_load_redistribution that performs the operation
2287 recursively. */
2288
2289 static slp_tree
2290 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2291 vec_info *vinfo, unsigned int group_size,
2292 hash_map<slp_tree, slp_tree> *load_map,
2293 slp_tree root)
2294 {
2295 if (slp_tree *leader = load_map->get (root))
2296 return *leader;
2297
2298 slp_tree node;
2299 unsigned i;
2300
2301 /* For now, we don't know anything about externals so do not do anything. */
2302 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2303 return NULL;
2304 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2305 {
2306 	  /* First convert this node into a load node, add it to the leaves
2307 	     list and flatten the permute from a lane permutation into a load
2308 	     permutation.  If it's unneeded it will be elided later.  */
2309 vec<stmt_vec_info> stmts;
2310 stmts.create (SLP_TREE_LANES (root));
2311 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2312 for (unsigned j = 0; j < lane_perm.length (); j++)
2313 {
2314 std::pair<unsigned, unsigned> perm = lane_perm[j];
2315 node = SLP_TREE_CHILDREN (root)[perm.first];
2316
2317 if (!vect_is_slp_load_node (node)
2318 || SLP_TREE_CHILDREN (node).exists ())
2319 {
2320 stmts.release ();
2321 goto next;
2322 }
2323
2324 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2325 }
2326
2327 if (dump_enabled_p ())
2328 dump_printf_loc (MSG_NOTE, vect_location,
2329 "converting stmts on permute node %p\n", root);
2330
2331 bool *matches = XALLOCAVEC (bool, group_size);
2332 poly_uint64 max_nunits = 1;
2333 unsigned tree_size = 0, limit = 1;
2334 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2335 matches, &limit, &tree_size, bst_map);
2336 if (!node)
2337 stmts.release ();
2338
2339 load_map->put (root, node);
2340 return node;
2341 }
2342
2343 next:
2344 load_map->put (root, NULL);
2345
2346 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2347 {
2348 slp_tree value
2349 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2350 node);
2351 if (value)
2352 {
2353 SLP_TREE_REF_COUNT (value)++;
2354 SLP_TREE_CHILDREN (root)[i] = value;
2355 /* ??? We know the original leafs of the replaced nodes will
2356 be referenced by bst_map, only the permutes created by
2357 pattern matching are not. */
2358 if (SLP_TREE_REF_COUNT (node) == 1)
2359 load_map->remove (node);
2360 vect_free_slp_tree (node);
2361 }
2362 }
2363
2364 return NULL;
2365 }
2366
2367 /* Temporary workaround for loads not being CSEd during SLP build. This
2368 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2369 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2370 same DR such that the final operation is equal to a permuted load. Such
2371 NODES are then directly converted into LOADS themselves. The nodes are
2372 CSEd using BST_MAP. */
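/* As a sketch (hypothetical array A): a VEC_PERM node taking lane 0 from a
   load of { A[0], A[1] } and lane 1 from a load of { A[2], A[3] } selects
   { A[0], A[3] } and is rebuilt as a single (permuted) load of those two
   elements.  */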
2373
2374 static void
2375 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2376 vec_info *vinfo, unsigned int group_size,
2377 hash_map<slp_tree, slp_tree> *load_map,
2378 slp_tree root)
2379 {
2380 slp_tree node;
2381 unsigned i;
2382
2383 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2384 {
2385 slp_tree value
2386 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2387 node);
2388 if (value)
2389 {
2390 SLP_TREE_REF_COUNT (value)++;
2391 SLP_TREE_CHILDREN (root)[i] = value;
2392 /* ??? We know the original leafs of the replaced nodes will
2393 be referenced by bst_map, only the permutes created by
2394 pattern matching are not. */
2395 if (SLP_TREE_REF_COUNT (node) == 1)
2396 load_map->remove (node);
2397 vect_free_slp_tree (node);
2398 }
2399 }
2400 }
2401
2402 /* Helper function of vect_match_slp_patterns.
2403
2404 Attempts to match patterns against the slp tree rooted in REF_NODE using
2405 VINFO. Patterns are matched in post-order traversal.
2406
2407    If matching is successful the value in REF_NODE is updated in place.
2408    Returns true if any pattern was matched, false otherwise. */
2409
2410 static bool
2411 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2412 slp_tree_to_load_perm_map_t *perm_cache,
2413 hash_set<slp_tree> *visited)
2414 {
2415 unsigned i;
2416 slp_tree node = *ref_node;
2417 bool found_p = false;
2418 if (!node || visited->add (node))
2419 return false;
2420
2421 slp_tree child;
2422 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2423 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2424 vinfo, perm_cache, visited);
2425
2426 for (unsigned x = 0; x < num__slp_patterns; x++)
2427 {
2428 vect_pattern *pattern = slp_patterns[x] (perm_cache, ref_node);
2429 if (pattern)
2430 {
2431 pattern->build (vinfo);
2432 delete pattern;
2433 found_p = true;
2434 }
2435 }
2436
2437 return found_p;
2438 }
2439
2440 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
2441 vec_info VINFO.
2442
2443 The tree referenced by the instance is modified in place. Patterns are tried
2444 in order and multiple patterns may match. Returns true if any did. */
2445
2446 static bool
2447 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2448 hash_set<slp_tree> *visited,
2449 slp_tree_to_load_perm_map_t *perm_cache)
2450 {
2451 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2452 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2453
2454 if (dump_enabled_p ())
2455 dump_printf_loc (MSG_NOTE, vect_location,
2456 "Analyzing SLP tree %p for patterns\n",
2457 SLP_INSTANCE_TREE (instance));
2458
2459 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
2460 }
2461
2462 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
2463 splitting into two, with the first split group having size NEW_GROUP_SIZE.
2464 Return true if we could use IFN_STORE_LANES instead and if that appears
2465 to be the better approach. */
2466
2467 static bool
2468 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
2469 unsigned int group_size,
2470 unsigned int new_group_size)
2471 {
2472 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2473 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
2474 if (!vectype)
2475 return false;
2476 /* Allow the split if one of the two new groups would operate on full
2477 vectors *within* rather than across one scalar loop iteration.
2478 This is purely a heuristic, but it should work well for group
2479 sizes of 3 and 4, where the possible splits are:
2480
2481 3->2+1: OK if the vector has exactly two elements
2482 4->2+2: Likewise
2483 4->3+1: Less clear-cut. */
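/* E.g. for a 4->3+1 split with two-element vectors neither 3 nor 1 is a
   multiple of the vector length, so IFN_STORE_LANES is preferred whenever
   the target supports it for the group.  */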
2484 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
2485 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
2486 return false;
2487 return vect_store_lanes_supported (vectype, group_size, false);
2488 }
2489
2490 /* Analyze an SLP instance starting from a group of grouped stores. Call
2491 vect_build_slp_tree to build a tree of packed stmts if possible.
2492 Return FALSE if it's impossible to SLP any stmt in the loop. */
2493
2494 static bool
2495 vect_analyze_slp_instance (vec_info *vinfo,
2496 scalar_stmts_to_slp_tree_map_t *bst_map,
2497 stmt_vec_info stmt_info, slp_instance_kind kind,
2498 unsigned max_tree_size, unsigned *limit);
2499
2500 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2501 of KIND. Return true if successful. */
2502
2503 static bool
2504 vect_build_slp_instance (vec_info *vinfo,
2505 slp_instance_kind kind,
2506 vec<stmt_vec_info> &scalar_stmts,
2507 vec<stmt_vec_info> &root_stmt_infos,
2508 unsigned max_tree_size, unsigned *limit,
2509 scalar_stmts_to_slp_tree_map_t *bst_map,
2510 /* ??? We need stmt_info for group splitting. */
2511 stmt_vec_info stmt_info_)
2512 {
2513 if (dump_enabled_p ())
2514 {
2515 dump_printf_loc (MSG_NOTE, vect_location,
2516 "Starting SLP discovery for\n");
2517 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
2518 dump_printf_loc (MSG_NOTE, vect_location,
2519 " %G", scalar_stmts[i]->stmt);
2520 }
2521
2522 /* Build the tree for the SLP instance. */
2523 unsigned int group_size = scalar_stmts.length ();
2524 bool *matches = XALLOCAVEC (bool, group_size);
2525 poly_uint64 max_nunits = 1;
2526 unsigned tree_size = 0;
2527 unsigned i;
2528 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2529 &max_nunits, matches, limit,
2530 &tree_size, bst_map);
2531 if (node != NULL)
2532 {
2533 /* Calculate the unrolling factor based on the smallest type. */
2534 poly_uint64 unrolling_factor
2535 = calculate_unrolling_factor (max_nunits, group_size);
2536
2537 if (maybe_ne (unrolling_factor, 1U)
2538 && is_a <bb_vec_info> (vinfo))
2539 {
2540 unsigned HOST_WIDE_INT const_max_nunits;
2541 if (!max_nunits.is_constant (&const_max_nunits)
2542 || const_max_nunits > group_size)
2543 {
2544 if (dump_enabled_p ())
2545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2546 "Build SLP failed: store group "
2547 "size not a multiple of the vector size "
2548 "in basic block SLP\n");
2549 vect_free_slp_tree (node);
2550 return false;
2551 }
2552 /* Fatal mismatch. */
2553 if (dump_enabled_p ())
2554 dump_printf_loc (MSG_NOTE, vect_location,
2555 "SLP discovery succeeded but node needs "
2556 "splitting\n");
2557 memset (matches, true, group_size);
2558 matches[group_size / const_max_nunits * const_max_nunits] = false;
2559 vect_free_slp_tree (node);
2560 }
2561 else
2562 {
2563 /* Create a new SLP instance. */
2564 slp_instance new_instance = XNEW (class _slp_instance);
2565 SLP_INSTANCE_TREE (new_instance) = node;
2566 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2567 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2568 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
2569 SLP_INSTANCE_KIND (new_instance) = kind;
2570 new_instance->reduc_phis = NULL;
2571 new_instance->cost_vec = vNULL;
2572 new_instance->subgraph_entries = vNULL;
2573
2574 if (dump_enabled_p ())
2575 dump_printf_loc (MSG_NOTE, vect_location,
2576 "SLP size %u vs. limit %u.\n",
2577 tree_size, max_tree_size);
2578
2579 /* Fixup SLP reduction chains. */
2580 if (kind == slp_inst_kind_reduc_chain)
2581 {
2582 /* If this is a reduction chain with a conversion in front
2583 amend the SLP tree with a node for that. */
2584 gimple *scalar_def
2585 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2586 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
2587 {
2588 /* Get at the conversion stmt - we know it's the single use
2589 of the last stmt of the reduction chain. */
2590 use_operand_p use_p;
2591 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
2592 &use_p, &scalar_def);
2593 gcc_assert (r);
2594 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
2595 next_info = vect_stmt_to_vectorize (next_info);
2596 scalar_stmts = vNULL;
2597 scalar_stmts.create (group_size);
2598 for (unsigned i = 0; i < group_size; ++i)
2599 scalar_stmts.quick_push (next_info);
2600 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2601 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2602 SLP_TREE_CHILDREN (conv).quick_push (node);
2603 SLP_INSTANCE_TREE (new_instance) = conv;
2604 /* We also have to fake this conversion stmt as SLP reduction
2605 group so we don't have to mess with too much code
2606 elsewhere. */
2607 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2608 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2609 }
2610 /* Fill the backedge child of the PHI SLP node. The
2611 general matching code cannot find it because the
2612 scalar code does not reflect how we vectorize the
2613 reduction. */
2614 use_operand_p use_p;
2615 imm_use_iterator imm_iter;
2616 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
2617 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
2618 gimple_get_lhs (scalar_def))
2619 /* There are exactly two non-debug uses, the reduction
2620 PHI and the loop-closed PHI node. */
2621 if (!is_gimple_debug (USE_STMT (use_p))
2622 && gimple_bb (USE_STMT (use_p)) == loop->header)
2623 {
2624 auto_vec<stmt_vec_info, 64> phis (group_size);
2625 stmt_vec_info phi_info
2626 = vinfo->lookup_stmt (USE_STMT (use_p));
2627 for (unsigned i = 0; i < group_size; ++i)
2628 phis.quick_push (phi_info);
2629 slp_tree *phi_node = bst_map->get (phis);
2630 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
2631 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
2632 = SLP_INSTANCE_TREE (new_instance);
2633 SLP_INSTANCE_TREE (new_instance)->refcnt++;
2634 }
2635 }
2636
2637 vinfo->slp_instances.safe_push (new_instance);
2638
2639 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2640 the number of scalar stmts in the root in a few places.
2641 Verify that assumption holds. */
2642 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2643 .length () == group_size);
2644
2645 if (dump_enabled_p ())
2646 {
2647 dump_printf_loc (MSG_NOTE, vect_location,
2648 "Final SLP tree for instance %p:\n", new_instance);
2649 vect_print_slp_graph (MSG_NOTE, vect_location,
2650 SLP_INSTANCE_TREE (new_instance));
2651 }
2652
2653 return true;
2654 }
2655 }
2656 else
2657 {
2658 /* Failed to SLP. */
2659 /* Free the allocated memory. */
2660 scalar_stmts.release ();
2661 }
2662
2663 stmt_vec_info stmt_info = stmt_info_;
2664 /* Try to break the group up into pieces. */
2665 if (kind == slp_inst_kind_store)
2666 {
2667 /* ??? We could delay all the actual splitting of store-groups
2668 until after SLP discovery of the original group completed.
2669 Then we can recurse to vect_build_slp_instance directly. */
2670 for (i = 0; i < group_size; i++)
2671 if (!matches[i])
2672 break;
2673
2674 /* For basic block SLP, try to break the group up into multiples of
2675 a vector size. */
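      /* A worked example (assuming 4-element vectors): for a store group of
	 size 8 whose first mismatch is at stmt 6, the code below first splits
	 off { 0, 1, 2, 3 } at the vector boundary, then splits the remainder
	 into the still-matching { 4, 5 } and the non-matching tail { 6, 7 },
	 re-analyzing each piece that has at least two lanes.  */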
2676 if (is_a <bb_vec_info> (vinfo)
2677 && (i > 1 && i < group_size))
2678 {
2679 tree scalar_type
2680 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2681 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2682 1 << floor_log2 (i));
2683 unsigned HOST_WIDE_INT const_nunits;
2684 if (vectype
2685 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2686 {
2687 /* Split into two groups at the first vector boundary. */
2688 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2689 unsigned group1_size = i & ~(const_nunits - 1);
2690
2691 if (dump_enabled_p ())
2692 dump_printf_loc (MSG_NOTE, vect_location,
2693 "Splitting SLP group at stmt %u\n", i);
2694 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2695 group1_size);
2696 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2697 kind, max_tree_size,
2698 limit);
2699 /* Split the rest at the failure point and possibly
2700 re-analyze the remaining matching part if it has
2701 at least two lanes. */
2702 if (group1_size < i
2703 && (i + 1 < group_size
2704 || i - group1_size > 1))
2705 {
2706 stmt_vec_info rest2 = rest;
2707 rest = vect_split_slp_store_group (rest, i - group1_size);
2708 if (i - group1_size > 1)
2709 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
2710 kind, max_tree_size,
2711 limit);
2712 }
2713 /* Re-analyze the non-matching tail if it has at least
2714 two lanes. */
2715 if (i + 1 < group_size)
2716 res |= vect_analyze_slp_instance (vinfo, bst_map,
2717 rest, kind, max_tree_size,
2718 limit);
2719 return res;
2720 }
2721 }
2722
2723 /* For loop vectorization split into arbitrary pieces of size > 1. */
2724 if (is_a <loop_vec_info> (vinfo)
2725 && (i > 1 && i < group_size)
2726 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
2727 {
2728 unsigned group1_size = i;
2729
2730 if (dump_enabled_p ())
2731 dump_printf_loc (MSG_NOTE, vect_location,
2732 "Splitting SLP group at stmt %u\n", i);
2733
2734 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2735 group1_size);
2736 /* Loop vectorization cannot handle gaps in stores, make sure
2737 the split group appears as strided. */
2738 STMT_VINFO_STRIDED_P (rest) = 1;
2739 DR_GROUP_GAP (rest) = 0;
2740 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2741 DR_GROUP_GAP (stmt_info) = 0;
2742
2743 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2744 kind, max_tree_size, limit);
2745 if (i + 1 < group_size)
2746 res |= vect_analyze_slp_instance (vinfo, bst_map,
2747 rest, kind, max_tree_size, limit);
2748
2749 return res;
2750 }
2751
2752 /* Even though the first vector did not all match, we might be able to SLP
2753 (some) of the remainder. FORNOW ignore this possibility. */
2754 }
2755
2756 /* Failed to SLP. */
2757 if (dump_enabled_p ())
2758 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2759 return false;
2760 }
2761
2762
2763 /* Analyze an SLP instance starting from a group of grouped stores. Call
2764 vect_build_slp_tree to build a tree of packed stmts if possible.
2765 Return FALSE if it's impossible to SLP any stmt in the loop. */
2766
2767 static bool
2768 vect_analyze_slp_instance (vec_info *vinfo,
2769 scalar_stmts_to_slp_tree_map_t *bst_map,
2770 stmt_vec_info stmt_info,
2771 slp_instance_kind kind,
2772 unsigned max_tree_size, unsigned *limit)
2773 {
2774 unsigned int i;
2775 vec<stmt_vec_info> scalar_stmts;
2776
2777 if (is_a <bb_vec_info> (vinfo))
2778 vect_location = stmt_info->stmt;
2779
2780 stmt_vec_info next_info = stmt_info;
2781 if (kind == slp_inst_kind_store)
2782 {
2783 /* Collect the stores and store them in scalar_stmts. */
2784 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
2785 while (next_info)
2786 {
2787 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2788 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2789 }
2790 }
2791 else if (kind == slp_inst_kind_reduc_chain)
2792 {
2793 /* Collect the reduction stmts and store them in scalar_stmts. */
2794 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
2795 while (next_info)
2796 {
2797 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2798 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2799 }
2800 /* Mark the first element of the reduction chain as reduction to properly
2801 transform the node. In the reduction analysis phase only the last
2802 element of the chain is marked as reduction. */
2803 STMT_VINFO_DEF_TYPE (stmt_info)
2804 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2805 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2806 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2807 }
2808 else if (kind == slp_inst_kind_ctor)
2809 {
2810 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2811 tree val;
2812 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
2813 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2814 {
2815 stmt_vec_info def_info = vinfo->lookup_def (val);
2816 def_info = vect_stmt_to_vectorize (def_info);
2817 scalar_stmts.quick_push (def_info);
2818 }
2819 if (dump_enabled_p ())
2820 dump_printf_loc (MSG_NOTE, vect_location,
2821 "Analyzing vectorizable constructor: %G\n",
2822 stmt_info->stmt);
2823 }
2824 else if (kind == slp_inst_kind_reduc_group)
2825 {
2826 /* Collect reduction statements. */
2827 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2828 scalar_stmts.create (reductions.length ());
2829 for (i = 0; reductions.iterate (i, &next_info); i++)
2830 if (STMT_VINFO_RELEVANT_P (next_info)
2831 || STMT_VINFO_LIVE_P (next_info))
2832 scalar_stmts.quick_push (next_info);
2833       /* If fewer than two were relevant/live there's nothing to SLP.  */
2834 if (scalar_stmts.length () < 2)
2835 return false;
2836 }
2837 else
2838 gcc_unreachable ();
2839
2840 vec<stmt_vec_info> roots = vNULL;
2841 if (kind == slp_inst_kind_ctor)
2842 {
2843 roots.create (1);
2844 roots.quick_push (stmt_info);
2845 }
2846 /* Build the tree for the SLP instance. */
2847 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
2848 roots,
2849 max_tree_size, limit, bst_map,
2850 kind == slp_inst_kind_store
2851 ? stmt_info : NULL);
2852 if (!res)
2853 roots.release ();
2854
2855 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2856 where we should do store group splitting. */
2857
2858 return res;
2859 }
2860
2861 /* Check if there are stmts in the loop that can be vectorized using SLP.
2862    Build SLP trees of packed scalar stmts if SLP is possible.  */
2863
2864 opt_result
2865 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2866 {
2867 unsigned int i;
2868 stmt_vec_info first_element;
2869 slp_instance instance;
2870
2871 DUMP_VECT_SCOPE ("vect_analyze_slp");
2872
2873 unsigned limit = max_tree_size;
2874
2875 scalar_stmts_to_slp_tree_map_t *bst_map
2876 = new scalar_stmts_to_slp_tree_map_t ();
2877
2878 /* Find SLP sequences starting from groups of grouped stores. */
2879 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2880 vect_analyze_slp_instance (vinfo, bst_map, first_element,
2881 STMT_VINFO_GROUPED_ACCESS (first_element)
2882 ? slp_inst_kind_store : slp_inst_kind_ctor,
2883 max_tree_size, &limit);
2884
2885 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
2886 {
2887 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
2888 {
2889 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
2890 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
2891 bb_vinfo->roots[i].stmts,
2892 bb_vinfo->roots[i].roots,
2893 max_tree_size, &limit, bst_map, NULL))
2894 {
2895 bb_vinfo->roots[i].stmts = vNULL;
2896 bb_vinfo->roots[i].roots = vNULL;
2897 }
2898 }
2899 }
2900
2901 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2902 {
2903 /* Find SLP sequences starting from reduction chains. */
2904 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2905 if (! STMT_VINFO_RELEVANT_P (first_element)
2906 && ! STMT_VINFO_LIVE_P (first_element))
2907 ;
2908 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2909 slp_inst_kind_reduc_chain,
2910 max_tree_size, &limit))
2911 {
2912 /* Dissolve reduction chain group. */
2913 stmt_vec_info vinfo = first_element;
2914 stmt_vec_info last = NULL;
2915 while (vinfo)
2916 {
2917 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2918 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2919 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2920 last = vinfo;
2921 vinfo = next;
2922 }
2923 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2924 	  /* It can still be vectorized as part of an SLP reduction.  */
2925 loop_vinfo->reductions.safe_push (last);
2926 }
2927
2928 /* Find SLP sequences starting from groups of reductions. */
2929 if (loop_vinfo->reductions.length () > 1)
2930 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2931 slp_inst_kind_reduc_group, max_tree_size,
2932 &limit);
2933 }
2934
2935 hash_set<slp_tree> visited_patterns;
2936 slp_tree_to_load_perm_map_t perm_cache;
2937
2938 /* See if any patterns can be found in the SLP tree. */
2939 bool pattern_found = false;
2940 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
2941 pattern_found |= vect_match_slp_patterns (instance, vinfo,
2942 &visited_patterns, &perm_cache);
2943
2944 /* If any were found optimize permutations of loads. */
2945 if (pattern_found)
2946 {
2947 hash_map<slp_tree, slp_tree> load_map;
2948 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
2949 {
2950 slp_tree root = SLP_INSTANCE_TREE (instance);
2951 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
2952 &load_map, root);
2953 }
2954 }
2955
2956
2957
2958 /* The map keeps a reference on SLP nodes built, release that. */
2959 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2960 it != bst_map->end (); ++it)
2961 if ((*it).second)
2962 vect_free_slp_tree ((*it).second);
2963 delete bst_map;
2964
2965 if (pattern_found && dump_enabled_p ())
2966 {
2967 dump_printf_loc (MSG_NOTE, vect_location,
2968 "Pattern matched SLP tree\n");
2969 hash_set<slp_tree> visited;
2970 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
2971 vect_print_slp_graph (MSG_NOTE, vect_location,
2972 SLP_INSTANCE_TREE (instance), visited);
2973 }
2974
2975 return opt_result::success ();
2976 }
2977
2978 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2979
2980 static void
2981 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2982 vec<slp_tree> &vertices, vec<int> &leafs)
2983 {
2984 unsigned i;
2985 slp_tree child;
2986
2987 if (visited.add (node))
2988 return;
2989
2990 node->vertex = vertices.length ();
2991 vertices.safe_push (node);
2992
2993 bool leaf = true;
2994 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2995 if (child)
2996 {
2997 leaf = false;
2998 vect_slp_build_vertices (visited, child, vertices, leafs);
2999 }
3000 if (leaf)
3001 leafs.safe_push (node->vertex);
3002 }
3003
3004 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3005
3006 static void
3007 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
3008 vec<int> &leafs)
3009 {
3010 hash_set<slp_tree> visited;
3011 unsigned i;
3012 slp_instance instance;
3013 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
3014 {
3015 unsigned n_v = vertices.length ();
3016 unsigned n_l = leafs.length ();
3017 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
3018 leafs);
3019 /* If we added vertices but no entries to the reverse graph we've
3020 added a cycle that is not backwards-reachable. Push the entry
3021 	 to mimic a leaf then.  */
3022 if (vertices.length () > n_v
3023 && leafs.length () == n_l)
3024 leafs.safe_push (SLP_INSTANCE_TREE (instance)->vertex);
3025 }
3026 }
3027
3028 /* Apply (reverse) bijective PERM to VEC.  */
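/* E.g. with PERM { 2, 0, 1 }: the forward application maps { a, b, c } to
   { c, a, b } (element I becomes the old VEC[PERM[I]]), while the reverse
   application maps { a, b, c } to { b, c, a }.  */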
3029
3030 template <class T>
3031 static void
3032 vect_slp_permute (vec<unsigned> perm,
3033 vec<T> &vec, bool reverse)
3034 {
3035 auto_vec<T, 64> saved;
3036 saved.create (vec.length ());
3037 for (unsigned i = 0; i < vec.length (); ++i)
3038 saved.quick_push (vec[i]);
3039
3040 if (reverse)
3041 {
3042 for (unsigned i = 0; i < vec.length (); ++i)
3043 vec[perm[i]] = saved[i];
3044 for (unsigned i = 0; i < vec.length (); ++i)
3045 gcc_assert (vec[perm[i]] == saved[i]);
3046 }
3047 else
3048 {
3049 for (unsigned i = 0; i < vec.length (); ++i)
3050 vec[i] = saved[perm[i]];
3051 for (unsigned i = 0; i < vec.length (); ++i)
3052 gcc_assert (vec[i] == saved[perm[i]]);
3053 }
3054 }
3055
3056 /* Return whether permutations PERM_A and PERM_B as recorded in the
3057 PERMS vector are equal. */
3058
3059 static bool
3060 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
3061 int perm_a, int perm_b)
3062 {
3063 return (perm_a == perm_b
3064 || (perms[perm_a].length () == perms[perm_b].length ()
3065 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
3066 sizeof (unsigned) * perms[perm_a].length ()) == 0));
3067 }
3068
3069 /* Optimize the SLP graph of VINFO. */
3070
3071 void
3072 vect_optimize_slp (vec_info *vinfo)
3073 {
3074 if (vinfo->slp_instances.is_empty ())
3075 return;
3076
3077 slp_tree node;
3078 unsigned i;
3079 auto_vec<slp_tree> vertices;
3080 auto_vec<int> leafs;
3081 vect_slp_build_vertices (vinfo, vertices, leafs);
3082
3083 struct graph *slpg = new_graph (vertices.length ());
3084 FOR_EACH_VEC_ELT (vertices, i, node)
3085 {
3086 unsigned j;
3087 slp_tree child;
3088 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3089 if (child)
3090 add_edge (slpg, i, child->vertex);
3091 }
3092
3093 /* Compute (reverse) postorder on the inverted graph. */
3094 auto_vec<int> ipo;
3095 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3096
3097 auto_sbitmap n_visited (vertices.length ());
3098 auto_sbitmap n_materialize (vertices.length ());
3099 auto_vec<int> n_perm (vertices.length ());
3100 auto_vec<vec<unsigned> > perms;
3101
3102 bitmap_clear (n_visited);
3103 bitmap_clear (n_materialize);
3104 n_perm.quick_grow_cleared (vertices.length ());
3105 perms.safe_push (vNULL); /* zero is no permute */
3106
3107 /* Produce initial permutations. */
3108 for (i = 0; i < leafs.length (); ++i)
3109 {
3110 int idx = leafs[i];
3111 slp_tree node = vertices[idx];
3112
3113 /* Handle externals and constants optimistically throughout the
3114 iteration. */
3115 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3116 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3117 continue;
3118
3119 /* Leafs do not change across iterations. Note leafs also double
3120 as entries to the reverse graph. */
3121 if (!slpg->vertices[idx].succ)
3122 bitmap_set_bit (n_visited, idx);
3123 /* Loads are the only thing generating permutes. */
3124 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3125 continue;
3126
3127 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3128 node unpermuted, record this permute. */
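      /* Illustration: a four-lane load with load permutation { 5, 4, 7, 6 }
	 has imin 4 and imax 7, spans exactly four group members and is
	 bijective, so the permute { 1, 0, 3, 2 } computed below is recorded
	 for it.  */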
3129 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3130 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3131 continue;
3132 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3133 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3134 bool any_permute = false;
3135 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3136 {
3137 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3138 imin = MIN (imin, idx);
3139 imax = MAX (imax, idx);
3140 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3141 any_permute = true;
3142 }
3143 /* If there's no permute no need to split one out. */
3144 if (!any_permute)
3145 continue;
3146       /* If the span doesn't match we'd disrupt VF computation; avoid
3147 	 that for now.  */
3148 if (imax - imin + 1 != SLP_TREE_LANES (node))
3149 continue;
3150
3151 /* For now only handle true permutes, like
3152 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3153 when permuting constants and invariants keeping the permute
3154 bijective. */
3155 auto_sbitmap load_index (SLP_TREE_LANES (node));
3156 bitmap_clear (load_index);
3157 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3158 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3159 unsigned j;
3160 for (j = 0; j < SLP_TREE_LANES (node); ++j)
3161 if (!bitmap_bit_p (load_index, j))
3162 break;
3163 if (j != SLP_TREE_LANES (node))
3164 continue;
3165
3166 vec<unsigned> perm = vNULL;
3167 perm.safe_grow (SLP_TREE_LANES (node), true);
3168 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3169 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
3170 perms.safe_push (perm);
3171 n_perm[idx] = perms.length () - 1;
3172 }
3173
3174 /* Propagate permutes along the graph and compute materialization points. */
3175 bool changed;
3176 unsigned iteration = 0;
3177 do
3178 {
3179 changed = false;
3180 ++iteration;
3181
3182 for (i = vertices.length (); i > 0 ; --i)
3183 {
3184 int idx = ipo[i-1];
3185 slp_tree node = vertices[idx];
3186 /* For leafs there's nothing to do - we've seeded permutes
3187 on those above. */
3188 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3189 continue;
3190
3191 bitmap_set_bit (n_visited, idx);
3192
3193 /* We cannot move a permute across a store. */
3194 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
3195 && DR_IS_WRITE
3196 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
3197 continue;
3198
3199 int perm = -1;
3200 for (graph_edge *succ = slpg->vertices[idx].succ;
3201 succ; succ = succ->succ_next)
3202 {
3203 int succ_idx = succ->dest;
3204 /* Handle unvisited nodes optimistically. */
3205 /* ??? But for constants once we want to handle non-bijective
3206 permutes we have to verify the permute, when unifying lanes,
3207 will not unify different constants. For example see
3208 gcc.dg/vect/bb-slp-14.c for a case that would break. */
3209 if (!bitmap_bit_p (n_visited, succ_idx))
3210 continue;
3211 int succ_perm = n_perm[succ_idx];
3212 	      /* Once we materialize the successor's permutation its output
3213 		 lanes appear unpermuted to us.  */
3214 if (bitmap_bit_p (n_materialize, succ_idx))
3215 succ_perm = 0;
3216 if (perm == -1)
3217 perm = succ_perm;
3218 else if (succ_perm == 0)
3219 {
3220 perm = 0;
3221 break;
3222 }
3223 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
3224 {
3225 perm = 0;
3226 break;
3227 }
3228 }
3229
3230 if (perm == -1)
3231 /* Pick up pre-computed leaf values. */
3232 perm = n_perm[idx];
3233 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
3234 {
3235 if (iteration > 1)
3236 /* Make sure we eventually converge. */
3237 gcc_checking_assert (perm == 0);
3238 n_perm[idx] = perm;
3239 if (perm == 0)
3240 bitmap_clear_bit (n_materialize, idx);
3241 changed = true;
3242 }
3243
3244 if (perm == 0)
3245 continue;
3246
3247 	  /* Elide pruning at materialization points in the first
3248 	     iteration so every node is visited at least once.  */
3249 if (iteration == 1)
3250 continue;
3251
3252 /* Decide on permute materialization. Look whether there's
3253 a use (pred) edge that is permuted differently than us.
3254 In that case mark ourselves so the permutation is applied.
3255 For VEC_PERM_EXPRs the permutation doesn't carry along
3256 from children to parents so force materialization at the
3257 point of the VEC_PERM_EXPR. In principle VEC_PERM_EXPRs
3258 are a source of an arbitrary permutation again, similar
3259 to constants/externals - that's something we do not yet
3260 optimally handle. */
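	  /* For instance, if this node carries permute 1 but one of its
	     users propagated permute 0 (or a different permute), the loop
	     below notices the disagreement and marks this node so the
	     permutation is materialized here instead of being pushed
	     further up.  */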
3261 bool all_preds_permuted = (SLP_TREE_CODE (node) != VEC_PERM_EXPR
3262 && slpg->vertices[idx].pred != NULL);
3263 if (all_preds_permuted)
3264 for (graph_edge *pred = slpg->vertices[idx].pred;
3265 pred; pred = pred->pred_next)
3266 {
3267 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
3268 int pred_perm = n_perm[pred->src];
3269 if (!vect_slp_perms_eq (perms, perm, pred_perm))
3270 {
3271 all_preds_permuted = false;
3272 break;
3273 }
3274 }
3275 if (!all_preds_permuted)
3276 {
3277 if (!bitmap_bit_p (n_materialize, idx))
3278 changed = true;
3279 bitmap_set_bit (n_materialize, idx);
3280 }
3281 }
3282 }
3283 while (changed || iteration == 1);
3284
3285 /* Materialize. */
3286 for (i = 0; i < vertices.length (); ++i)
3287 {
3288 int perm = n_perm[i];
3289 if (perm <= 0)
3290 continue;
3291
3292 slp_tree node = vertices[i];
3293
3294 /* First permute invariant/external original successors. */
3295 unsigned j;
3296 slp_tree child;
3297 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3298 {
3299 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3300 continue;
3301
3302 /* If the vector is uniform there's nothing to do. */
3303 if (vect_slp_tree_uniform_p (child))
3304 continue;
3305
3306 /* We can end up sharing some externals via two_operator
3307 handling. Be prepared to unshare those. */
3308 if (child->refcnt != 1)
3309 {
3310 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3311 SLP_TREE_CHILDREN (node)[j] = child
3312 = vect_create_new_slp_node
3313 (SLP_TREE_SCALAR_OPS (child).copy ());
3314 }
3315 vect_slp_permute (perms[perm],
3316 SLP_TREE_SCALAR_OPS (child), true);
3317 }
3318
3319 if (bitmap_bit_p (n_materialize, i))
3320 {
3321 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3322 /* For loads simply drop the permutation, the load permutation
3323 already performs the desired permutation. */
3324 ;
3325 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3326 {
3327 /* If the node is already a permute node we can apply
3328 the permutation to the lane selection, effectively
3329 materializing it on the incoming vectors. */
3330 if (dump_enabled_p ())
3331 dump_printf_loc (MSG_NOTE, vect_location,
3332 "simplifying permute node %p\n",
3333 node);
3334
3335 for (unsigned k = 0;
3336 k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3337 SLP_TREE_LANE_PERMUTATION (node)[k].second
3338 = perms[perm][SLP_TREE_LANE_PERMUTATION (node)[k].second];
3339 }
3340 else
3341 {
3342 if (dump_enabled_p ())
3343 dump_printf_loc (MSG_NOTE, vect_location,
3344 "inserting permute node in place of %p\n",
3345 node);
3346
3347 /* Make a copy of NODE and in-place change it to a
3348 VEC_PERM node to permute the lanes of the copy. */
3349 slp_tree copy = new _slp_tree;
3350 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3351 SLP_TREE_CHILDREN (node) = vNULL;
3352 SLP_TREE_SCALAR_STMTS (copy)
3353 = SLP_TREE_SCALAR_STMTS (node).copy ();
3354 vect_slp_permute (perms[perm],
3355 SLP_TREE_SCALAR_STMTS (copy), true);
3356 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3357 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3358 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3359 SLP_TREE_LANE_PERMUTATION (copy)
3360 = SLP_TREE_LANE_PERMUTATION (node);
3361 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3362 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3363 copy->refcnt = 1;
3364 copy->max_nunits = node->max_nunits;
3365 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3366 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3367 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3368
3369 /* Now turn NODE into a VEC_PERM. */
3370 SLP_TREE_CHILDREN (node).safe_push (copy);
3371 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3372 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3373 SLP_TREE_LANE_PERMUTATION (node)
3374 .quick_push (std::make_pair (0, perms[perm][j]));
3375 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3376 }
3377 }
3378 else
3379 {
3380 /* Apply the reverse permutation to our stmts. */
3381 vect_slp_permute (perms[perm],
3382 SLP_TREE_SCALAR_STMTS (node), true);
3383 /* And to the load permutation, which we can simply
3384 make regular by design. */
3385 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3386 {
3387 /* ??? When we handle non-bijective permutes the idea
3388 is that we can force the load-permutation to be
3389 { min, min + 1, min + 2, ... max }. But then the
3390 scalar defs might no longer match the lane content
3391 which means wrong-code with live lane vectorization.
3392 So we possibly have to have NULL entries for those. */
3393 vect_slp_permute (perms[perm],
3394 SLP_TREE_LOAD_PERMUTATION (node), true);
3395 }
3396 }
3397 }
3398
3399 /* Free the perms vector used for propagation. */
3400 while (!perms.is_empty ())
3401 perms.pop ().release ();
3402 free_graph (slpg);
3403
3404
3405 /* Now elide load permutations that are not necessary. */
3406 for (i = 0; i < leafs.length (); ++i)
3407 {
3408 node = vertices[leafs[i]];
3409 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3410 continue;
3411
3412 /* In basic block vectorization we allow any subchain of an interleaving
3413 chain.
3414 FORNOW: not in loop SLP because of realignment complications. */
3415 if (is_a <bb_vec_info> (vinfo))
3416 {
3417 bool subchain_p = true;
3418 stmt_vec_info next_load_info = NULL;
3419 stmt_vec_info load_info;
3420 unsigned j;
3421 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3422 {
3423 if (j != 0
3424 && (next_load_info != load_info
3425 || DR_GROUP_GAP (load_info) != 1))
3426 {
3427 subchain_p = false;
3428 break;
3429 }
3430 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3431 }
3432 if (subchain_p)
3433 {
3434 SLP_TREE_LOAD_PERMUTATION (node).release ();
3435 continue;
3436 }
3437 }
3438 else
3439 {
3440 stmt_vec_info load_info;
3441 bool this_load_permuted = false;
3442 unsigned j;
3443 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3444 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3445 {
3446 this_load_permuted = true;
3447 break;
3448 }
3449 stmt_vec_info first_stmt_info
3450 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3451 if (!this_load_permuted
3452 /* The load requires permutation when unrolling exposes
3453 a gap either because the group is larger than the SLP
3454 group-size or because there is a gap between the groups. */
3455 && (known_eq (LOOP_VINFO_VECT_FACTOR
3456 (as_a <loop_vec_info> (vinfo)), 1U)
3457 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3458 && DR_GROUP_GAP (first_stmt_info) == 0)))
3459 {
3460 SLP_TREE_LOAD_PERMUTATION (node).release ();
3461 continue;
3462 }
3463 }
3464 }
3465 }
3466
3467 /* Gather loads reachable from the individual SLP graph entries. */
3468
3469 void
3470 vect_gather_slp_loads (vec_info *vinfo)
3471 {
3472 unsigned i;
3473 slp_instance instance;
3474 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
3475 {
3476 hash_set<slp_tree> visited;
3477 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
3478 SLP_INSTANCE_TREE (instance), visited);
3479 }
3480 }
3481
3482
3483 /* For each possible SLP instance decide whether to SLP it and calculate the
3484    overall unrolling factor needed to SLP the loop.  Return TRUE if we decided
3485    to SLP at least one instance.  */
3486
3487 bool
3488 vect_make_slp_decision (loop_vec_info loop_vinfo)
3489 {
3490 unsigned int i;
3491 poly_uint64 unrolling_factor = 1;
3492 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3493 slp_instance instance;
3494 int decided_to_slp = 0;
3495
3496 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3497
3498 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3499 {
3500 /* FORNOW: SLP if you can. */
3501 /* All unroll factors have the form:
3502
3503 GET_MODE_SIZE (vinfo->vector_mode) * X
3504
3505 for some rational X, so they must have a common multiple. */
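/* E.g. (assumed values) an instance with unrolling factor 2 combined
   with one needing 3 forces an overall factor of 6, their least common
   multiple, so that every instance operates on whole groups.  */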
3506 unrolling_factor
3507 = force_common_multiple (unrolling_factor,
3508 SLP_INSTANCE_UNROLLING_FACTOR (instance));
3509
3510 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3511 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3512 loop-based vectorization. Such stmts will be marked as HYBRID. */
3513 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3514 decided_to_slp++;
3515 }
3516
3517 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
3518
3519 if (decided_to_slp && dump_enabled_p ())
3520 {
3521 dump_printf_loc (MSG_NOTE, vect_location,
3522 "Decided to SLP %d instances. Unrolling factor ",
3523 decided_to_slp);
3524 dump_dec (MSG_NOTE, unrolling_factor);
3525 dump_printf (MSG_NOTE, "\n");
3526 }
3527
3528 return (decided_to_slp > 0);
3529 }
3530
3531 /* Private data for vect_detect_hybrid_slp. */
3532 struct vdhs_data
3533 {
3534 loop_vec_info loop_vinfo;
3535 vec<stmt_vec_info> *worklist;
3536 };
3537
3538 /* Walker for walk_gimple_op. */
3539
3540 static tree
3541 vect_detect_hybrid_slp (tree *tp, int *, void *data)
3542 {
3543 walk_stmt_info *wi = (walk_stmt_info *)data;
3544 vdhs_data *dat = (vdhs_data *)wi->info;
3545
3546 if (wi->is_lhs)
3547 return NULL_TREE;
3548
3549 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
3550 if (!def_stmt_info)
3551 return NULL_TREE;
3552 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3553 if (PURE_SLP_STMT (def_stmt_info))
3554 {
3555 if (dump_enabled_p ())
3556 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3557 def_stmt_info->stmt);
3558 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3559 dat->worklist->safe_push (def_stmt_info);
3560 }
3561
3562 return NULL_TREE;
3563 }
3564
3565 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
3566 if so, otherwise push it to WORKLIST. */
3567
3568 static void
3569 maybe_push_to_hybrid_worklist (vec_info *vinfo,
3570 vec<stmt_vec_info> &worklist,
3571 stmt_vec_info stmt_info)
3572 {
3573 if (dump_enabled_p ())
3574 dump_printf_loc (MSG_NOTE, vect_location,
3575 "Processing hybrid candidate : %G", stmt_info->stmt);
3576 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
3577 imm_use_iterator iter2;
3578 ssa_op_iter iter1;
3579 use_operand_p use_p;
3580 def_operand_p def_p;
3581 bool any_def = false;
3582 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
3583 {
3584 any_def = true;
3585 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
3586 {
3587 if (is_gimple_debug (USE_STMT (use_p)))
3588 continue;
3589 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
3590 /* An out-of-loop use means this is a loop_vect sink. */
3591 if (!use_info)
3592 {
3593 if (dump_enabled_p ())
3594 dump_printf_loc (MSG_NOTE, vect_location,
3595 "Found loop_vect sink: %G", stmt_info->stmt);
3596 worklist.safe_push (stmt_info);
3597 return;
3598 }
3599 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
3600 {
3601 if (dump_enabled_p ())
3602 dump_printf_loc (MSG_NOTE, vect_location,
3603 "Found loop_vect use: %G", use_info->stmt);
3604 worklist.safe_push (stmt_info);
3605 return;
3606 }
3607 }
3608 }
3609 /* No def means this is a loop_vect sink. */
3610 if (!any_def)
3611 {
3612 if (dump_enabled_p ())
3613 dump_printf_loc (MSG_NOTE, vect_location,
3614 "Found loop_vect sink: %G", stmt_info->stmt);
3615 worklist.safe_push (stmt_info);
3616 return;
3617 }
3618 if (dump_enabled_p ())
3619 dump_printf_loc (MSG_NOTE, vect_location,
3620 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
3621 STMT_SLP_TYPE (stmt_info) = pure_slp;
3622 }
3623
3624 /* Find stmts that must be both vectorized and SLPed. */
3625
3626 void
3627 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3628 {
3629 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3630
3631 /* All stmts participating in SLP are marked pure_slp, all other
3632 stmts are loop_vect.
3633 First collect all loop_vect stmts into a worklist.
3634 SLP patterns cause not all original scalar stmts to appear in
3635 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
3636 Rectify this here and do a backward walk over the IL only considering
3637 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
3638 mark them as pure_slp. */
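/* As an illustration (names are made up): if a_1 = x[i] + y[i] belongs
   to an SLP instance but a_1 also feeds a non-SLP statement handled by
   loop-based vectorization, the use->def walk below marks a_1's
   statement as hybrid so it is vectorized both ways.  */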
3639 auto_vec<stmt_vec_info> worklist;
3640 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
3641 {
3642 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3643 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3644 gsi_next (&gsi))
3645 {
3646 gphi *phi = gsi.phi ();
3647 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3648 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3649 maybe_push_to_hybrid_worklist (loop_vinfo,
3650 worklist, stmt_info);
3651 }
3652 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
3653 gsi_prev (&gsi))
3654 {
3655 gimple *stmt = gsi_stmt (gsi);
3656 if (is_gimple_debug (stmt))
3657 continue;
3658 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3659 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3660 {
3661 for (gimple_stmt_iterator gsi2
3662 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3663 !gsi_end_p (gsi2); gsi_next (&gsi2))
3664 {
3665 stmt_vec_info patt_info
3666 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3667 if (!STMT_SLP_TYPE (patt_info)
3668 && STMT_VINFO_RELEVANT (patt_info))
3669 maybe_push_to_hybrid_worklist (loop_vinfo,
3670 worklist, patt_info);
3671 }
3672 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3673 }
3674 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3675 maybe_push_to_hybrid_worklist (loop_vinfo,
3676 worklist, stmt_info);
3677 }
3678 }
3679
3680 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3681 mark any SLP vectorized stmt as hybrid.
3682 ??? We're visiting def stmts N times (once for each non-SLP and
3683 once for each hybrid-SLP use). */
3684 walk_stmt_info wi;
3685 vdhs_data dat;
3686 dat.worklist = &worklist;
3687 dat.loop_vinfo = loop_vinfo;
3688 memset (&wi, 0, sizeof (wi));
3689 wi.info = (void *)&dat;
3690 while (!worklist.is_empty ())
3691 {
3692 stmt_vec_info stmt_info = worklist.pop ();
3693 /* Since SSA operands are not set up for pattern stmts we need
3694 to use walk_gimple_op. */
3695 wi.is_lhs = 0;
3696 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3697 }
3698 }
3699
3700
3701 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3702
3703 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3704 : vec_info (vec_info::bb, init_cost (NULL, false), shared),
3705 bbs (_bbs),
3706 roots (vNULL)
3707 {
3708 for (unsigned i = 0; i < bbs.length (); ++i)
3709 {
3710 if (i != 0)
3711 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3712 gsi_next (&si))
3713 {
3714 gphi *phi = si.phi ();
3715 gimple_set_uid (phi, 0);
3716 add_stmt (phi);
3717 }
3718 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3719 !gsi_end_p (gsi); gsi_next (&gsi))
3720 {
3721 gimple *stmt = gsi_stmt (gsi);
3722 gimple_set_uid (stmt, 0);
3723 if (is_gimple_debug (stmt))
3724 continue;
3725 add_stmt (stmt);
3726 }
3727 }
3728 }
3729
3730
3731 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3732 stmts in the basic block. */
3733
3734 _bb_vec_info::~_bb_vec_info ()
3735 {
3736 /* Reset region marker. */
3737 for (unsigned i = 0; i < bbs.length (); ++i)
3738 {
3739 if (i != 0)
3740 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3741 gsi_next (&si))
3742 {
3743 gphi *phi = si.phi ();
3744 gimple_set_uid (phi, -1);
3745 }
3746 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3747 !gsi_end_p (gsi); gsi_next (&gsi))
3748 {
3749 gimple *stmt = gsi_stmt (gsi);
3750 gimple_set_uid (stmt, -1);
3751 }
3752 }
3753
3754 for (unsigned i = 0; i < roots.length (); ++i)
3755 {
3756 roots[i].stmts.release ();
3757 roots[i].roots.release ();
3758 }
3759 roots.release ();
3760 }
3761
3762 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3763 given that child nodes have already been processed, and that
3764 their def types currently match their SLP node's def type. */
3765
3766 static bool
3767 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3768 slp_instance node_instance,
3769 stmt_vector_for_cost *cost_vec)
3770 {
3771 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3772
3773 /* Calculate the number of vector statements to be created for the
3774 scalar stmts in this node. For SLP reductions it is equal to the
3775 number of vector statements in the children (which has already been
3776 calculated by the recursive call). Otherwise it is the number of
3777 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3778 VF divided by the number of elements in a vector. */
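/* A worked example with assumed values: for SLP_TREE_LANES of 4, a
   vectorization factor of 2 and a V4SI vector type (4 elements per
   vector) this computes vect_get_num_vectors (2 * 4, V4SI) == 2.  */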
3779 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3780 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3781 {
3782 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3783 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3784 {
3785 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3786 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3787 break;
3788 }
3789 }
3790 else
3791 {
3792 poly_uint64 vf;
3793 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3794 vf = loop_vinfo->vectorization_factor;
3795 else
3796 vf = 1;
3797 unsigned int group_size = SLP_TREE_LANES (node);
3798 tree vectype = SLP_TREE_VECTYPE (node);
3799 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3800 = vect_get_num_vectors (vf * group_size, vectype);
3801 }
3802
3803 /* Handle purely internal nodes. */
3804 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3805 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3806
3807 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3808 if (is_a <bb_vec_info> (vinfo)
3809 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3810 {
3811 if (dump_enabled_p ())
3812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3813 "desired vector type conflicts with earlier one "
3814 "for %G", stmt_info->stmt);
3815 return false;
3816 }
3817
3818 bool dummy;
3819 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3820 node, node_instance, cost_vec);
3821 }
3822
3823 /* Try to build NODE from scalars, returning true on success.
3824 NODE_INSTANCE is the SLP instance that contains NODE. */
3825
3826 static bool
3827 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3828 slp_instance node_instance)
3829 {
3830 stmt_vec_info stmt_info;
3831 unsigned int i;
3832
3833 if (!is_a <bb_vec_info> (vinfo)
3834 || node == SLP_INSTANCE_TREE (node_instance)
3835 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3836 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3837 return false;
3838
3839 if (dump_enabled_p ())
3840 dump_printf_loc (MSG_NOTE, vect_location,
3841 "Building vector operands of %p from scalars instead\n", node);
3842
3843 /* Don't remove and free the child nodes here, since they could be
3844 referenced by other structures. The analysis and scheduling phases
3845 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3846 unsigned int group_size = SLP_TREE_LANES (node);
3847 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3848 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3849 SLP_TREE_LOAD_PERMUTATION (node).release ();
3850 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3851 {
3852 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3853 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3854 }
3855 return true;
3856 }
3857
3858 /* Compute the prologue cost for invariant or constant operands represented
3859 by NODE. */
3860
3861 static void
3862 vect_prologue_cost_for_slp (slp_tree node,
3863 stmt_vector_for_cost *cost_vec)
3864 {
3865 /* There's a special case of an existing vector, that costs nothing. */
3866 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3867 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3868 return;
3869 /* Without looking at the actual initializer a vector of
3870 constants can be implemented as a load from the constant pool.
3871 When all elements are the same we can use a splat. */
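/* Illustrative cases (operands are made up): a constant node like
   { 1, 2, 3, 4 } is costed as a vector_load from the constant pool,
   an external node { x_1, x_1, x_1, x_1 } as a scalar_to_vec splat
   and an external node { a_1, b_2, c_3, d_4 } as a vec_construct.  */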
3872 tree vectype = SLP_TREE_VECTYPE (node);
3873 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3874 unsigned num_vects_to_check;
3875 unsigned HOST_WIDE_INT const_nunits;
3876 unsigned nelt_limit;
3877 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3878 && ! multiple_p (const_nunits, group_size))
3879 {
3880 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3881 nelt_limit = const_nunits;
3882 }
3883 else
3884 {
3885 /* If either the vector has variable length or the vectors
3886 are composed of repeated whole groups we only need to
3887 cost construction once. All vectors will be the same. */
3888 num_vects_to_check = 1;
3889 nelt_limit = group_size;
3890 }
3891 tree elt = NULL_TREE;
3892 unsigned nelt = 0;
3893 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3894 {
3895 unsigned si = j % group_size;
3896 if (nelt == 0)
3897 elt = SLP_TREE_SCALAR_OPS (node)[si];
3898 /* ??? We're just tracking whether all operands of a single
3899 vector initializer are the same, ideally we'd check if
3900 we emitted the same one already. */
3901 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3902 elt = NULL_TREE;
3903 nelt++;
3904 if (nelt == nelt_limit)
3905 {
3906 record_stmt_cost (cost_vec, 1,
3907 SLP_TREE_DEF_TYPE (node) == vect_external_def
3908 ? (elt ? scalar_to_vec : vec_construct)
3909 : vector_load,
3910 NULL, vectype, 0, vect_prologue);
3911 nelt = 0;
3912 }
3913 }
3914 }
3915
3916 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3917 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3918
3919 Return true if the operations are supported. */
3920
3921 static bool
3922 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3923 slp_instance node_instance,
3924 hash_set<slp_tree> &visited_set,
3925 vec<slp_tree> &visited_vec,
3926 stmt_vector_for_cost *cost_vec)
3927 {
3928 int i, j;
3929 slp_tree child;
3930
3931 /* Assume we can code-generate all invariants. */
3932 if (!node
3933 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
3934 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
3935 return true;
3936
3937 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
3938 {
3939 if (dump_enabled_p ())
3940 dump_printf_loc (MSG_NOTE, vect_location,
3941 "Failed cyclic SLP reference in %p\n", node);
3942 return false;
3943 }
3944 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
3945
3946 /* If we already analyzed the exact same set of scalar stmts we're done.
3947 We share the generated vector stmts for those. */
3948 if (visited_set.add (node))
3949 return true;
3950 visited_vec.safe_push (node);
3951
3952 bool res = true;
3953 unsigned visited_rec_start = visited_vec.length ();
3954 unsigned cost_vec_rec_start = cost_vec->length ();
3955 bool seen_non_constant_child = false;
3956 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3957 {
3958 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3959 visited_set, visited_vec,
3960 cost_vec);
3961 if (!res)
3962 break;
3963 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
3964 seen_non_constant_child = true;
3965 }
3966 /* We're having difficulties scheduling nodes with just constant
3967 operands and no scalar stmts since we then cannot compute a stmt
3968 insertion place. */
3969 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
3970 {
3971 if (dump_enabled_p ())
3972 dump_printf_loc (MSG_NOTE, vect_location,
3973 "Cannot vectorize all-constant op node %p\n", node);
3974 res = false;
3975 }
3976
3977 if (res)
3978 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3979 cost_vec);
3980 /* If analysis failed we have to pop all recursive visited nodes
3981 plus ourselves. */
3982 if (!res)
3983 {
3984 while (visited_vec.length () >= visited_rec_start)
3985 visited_set.remove (visited_vec.pop ());
3986 cost_vec->truncate (cost_vec_rec_start);
3987 }
3988
3989 /* When the node can be vectorized, cost the invariant nodes it references.
3990 This is not done in DFS order to allow the referring node's
3991 vectorizable_* calls to nail down the invariant nodes' vector type
3992 and possibly unshare it if it needs a different vector type than
3993 other referrers. */
3994 if (res)
3995 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3996 if (child
3997 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
3998 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3999 /* Perform usual caching, note code-generation still
4000 code-gens these nodes multiple times but we expect
4001 to CSE them later. */
4002 && !visited_set.add (child))
4003 {
4004 visited_vec.safe_push (child);
4005 /* ??? After auditing more code paths make a "default"
4006 and push the vector type from NODE to all children
4007 if it is not already set. */
4008 /* Compute the number of vectors to be generated. */
4009 tree vector_type = SLP_TREE_VECTYPE (child);
4010 if (!vector_type)
4011 {
4012 /* For shifts with a scalar argument we don't need
4013 to cost or code-generate anything.
4014 ??? Represent this more explicitly. */
4015 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4016 == shift_vec_info_type)
4017 && j == 1);
4018 continue;
4019 }
4020 unsigned group_size = SLP_TREE_LANES (child);
4021 poly_uint64 vf = 1;
4022 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4023 vf = loop_vinfo->vectorization_factor;
4024 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
4025 = vect_get_num_vectors (vf * group_size, vector_type);
4026 /* And cost them. */
4027 vect_prologue_cost_for_slp (child, cost_vec);
4028 }
4029
4030 /* If this node or any of its children can't be vectorized, try pruning
4031 the tree here rather than felling the whole thing. */
4032 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
4033 {
4034 /* We'll need to revisit this for invariant costing and number
4035 of vectorized stmt setting. */
4036 res = true;
4037 }
4038
4039 return res;
4040 }
4041
4042
4043 /* Mark lanes of NODE that are live outside of the basic-block vectorized
4044 region and that can be vectorized using vectorizable_live_operation
4045 with STMT_VINFO_LIVE_P. Live operations that cannot be handled this way
4046 cause the scalar code computing them to be retained. */
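/* For example (illustrative): if the scalar def of lane 2 of NODE is also
   used by a statement outside the vectorized region, the lane can be
   extracted from the vector def with a BIT_FIELD_REF, provided a valid
   insertion place for the extract exists; otherwise the scalar statement
   computing it is kept.  */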
4047
4048 static void
4049 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
4050 slp_instance instance,
4051 stmt_vector_for_cost *cost_vec,
4052 hash_set<stmt_vec_info> &svisited,
4053 hash_set<slp_tree> &visited)
4054 {
4055 if (visited.add (node))
4056 return;
4057
4058 unsigned i;
4059 stmt_vec_info stmt_info;
4060 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
4061 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4062 {
4063 if (svisited.contains (stmt_info))
4064 continue;
4065 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4066 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
4067 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
4068 /* Only the pattern root stmt computes the original scalar value. */
4069 continue;
4070 bool mark_visited = true;
4071 gimple *orig_stmt = orig_stmt_info->stmt;
4072 ssa_op_iter op_iter;
4073 def_operand_p def_p;
4074 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4075 {
4076 imm_use_iterator use_iter;
4077 gimple *use_stmt;
4078 stmt_vec_info use_stmt_info;
4079 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4080 if (!is_gimple_debug (use_stmt))
4081 {
4082 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
4083 if (!use_stmt_info
4084 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4085 {
4086 STMT_VINFO_LIVE_P (stmt_info) = true;
4087 if (vectorizable_live_operation (bb_vinfo, stmt_info,
4088 NULL, node, instance, i,
4089 false, cost_vec))
4090 /* ??? So we know we can vectorize the live stmt
4091 from one SLP node. If we cannot do so from all
4092 or none consistently we'd have to record which
4093 SLP node (and lane) we want to use for the live
4094 operation. So make sure we can code-generate
4095 from all nodes. */
4096 mark_visited = false;
4097 else
4098 STMT_VINFO_LIVE_P (stmt_info) = false;
4099 break;
4100 }
4101 }
4102 /* We have to verify whether we can insert the lane extract
4103 before all uses. The following is a conservative approximation.
4104 We cannot put this into vectorizable_live_operation because
4105 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4106 doesn't work.
4107 Note that while the fact that we emit code for loads at the
4108 first load should make this a non-problem, leafs we construct
4109 from scalars are vectorized after the last scalar def.
4110 ??? If we'd actually compute the insert location during
4111 analysis we could use sth less conservative than the last
4112 scalar stmt in the node for the dominance check. */
4113 /* ??? What remains is "live" uses in vector CTORs in the same
4114 SLP graph which is where those uses can end up code-generated
4115 right after their definition instead of close to their original
4116 use. But that would restrict us to code-generate lane-extracts
4117 from the latest stmt in a node. So we compensate for this
4118 during code-generation, simply not replacing uses for those
4119 hopefully rare cases. */
4120 if (STMT_VINFO_LIVE_P (stmt_info))
4121 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4122 if (!is_gimple_debug (use_stmt)
4123 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
4124 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4125 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
4126 {
4127 if (dump_enabled_p ())
4128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4129 "Cannot determine insertion place for "
4130 "lane extract\n");
4131 STMT_VINFO_LIVE_P (stmt_info) = false;
4132 mark_visited = true;
4133 }
4134 }
4135 if (mark_visited)
4136 svisited.add (stmt_info);
4137 }
4138
4139 slp_tree child;
4140 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4141 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4142 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
4143 cost_vec, svisited, visited);
4144 }
4145
4146 /* Analyze statements in SLP instances of VINFO. Return true if the
4147 operations are supported. */
4148
4149 bool
4150 vect_slp_analyze_operations (vec_info *vinfo)
4151 {
4152 slp_instance instance;
4153 int i;
4154
4155 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4156
4157 hash_set<slp_tree> visited;
4158 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4159 {
4160 auto_vec<slp_tree> visited_vec;
4161 stmt_vector_for_cost cost_vec;
4162 cost_vec.create (2);
4163 if (is_a <bb_vec_info> (vinfo))
4164 vect_location = instance->location ();
4165 if (!vect_slp_analyze_node_operations (vinfo,
4166 SLP_INSTANCE_TREE (instance),
4167 instance, visited, visited_vec,
4168 &cost_vec)
4169 /* Instances with a root stmt require vectorized defs for the
4170 SLP tree root. */
4171 /* ??? Do inst->kind check instead. */
4172 || (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
4173 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
4174 != vect_internal_def)))
4175 {
4176 slp_tree node = SLP_INSTANCE_TREE (instance);
4177 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4178 if (dump_enabled_p ())
4179 dump_printf_loc (MSG_NOTE, vect_location,
4180 "removing SLP instance operations starting from: %G",
4181 stmt_info->stmt);
4182 vect_free_slp_instance (instance);
4183 vinfo->slp_instances.ordered_remove (i);
4184 cost_vec.release ();
4185 while (!visited_vec.is_empty ())
4186 visited.remove (visited_vec.pop ());
4187 }
4188 else
4189 {
4190 i++;
4191
4192 /* For BB vectorization remember the SLP graph entry
4193 cost for later. */
4194 if (is_a <bb_vec_info> (vinfo))
4195 instance->cost_vec = cost_vec;
4196 else
4197 {
4198 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
4199 cost_vec.release ();
4200 }
4201 }
4202 }
4203
4204 /* Compute vectorizable live stmts. */
4205 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
4206 {
4207 hash_set<stmt_vec_info> svisited;
4208 hash_set<slp_tree> visited;
4209 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4210 {
4211 vect_location = instance->location ();
4212 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
4213 instance, &instance->cost_vec, svisited,
4214 visited);
4215 }
4216 }
4217
4218 return !vinfo->slp_instances.is_empty ();
4219 }
4220
4221 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
4222 closing the eventual chain. */
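/* E.g. with a leader chain I1 -> I2 -> I3 -> I3 (illustrative names) this
   returns I3 and rewrites the map entries for I1 and I2 to point directly
   at I3, shortening later lookups.  */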
4223
4224 static slp_instance
4225 get_ultimate_leader (slp_instance instance,
4226 hash_map<slp_instance, slp_instance> &instance_leader)
4227 {
4228 auto_vec<slp_instance *, 8> chain;
4229 slp_instance *tem;
4230 while (*(tem = instance_leader.get (instance)) != instance)
4231 {
4232 chain.safe_push (tem);
4233 instance = *tem;
4234 }
4235 while (!chain.is_empty ())
4236 *chain.pop () = instance;
4237 return instance;
4238 }
4239
4240 /* Worker of vect_bb_partition_graph, recurse on NODE. */
4241
4242 static void
4243 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
4244 slp_instance instance, slp_tree node,
4245 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
4246 hash_map<slp_instance, slp_instance> &instance_leader,
4247 hash_set<slp_tree> &visited)
4248 {
4249 stmt_vec_info stmt_info;
4250 unsigned i;
4251
4252 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4253 {
4254 bool existed_p;
4255 slp_instance &stmt_instance
4256 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
4257 if (!existed_p)
4258 ;
4259 else if (stmt_instance != instance)
4260 {
4261 /* If we're running into a previously marked stmt make us the
4262 leader of the current ultimate leader. This keeps the
4263 leader chain acyclic and works even when the current instance
4264 connects two previously independent graph parts. */
4265 slp_instance stmt_leader
4266 = get_ultimate_leader (stmt_instance, instance_leader);
4267 if (stmt_leader != instance)
4268 instance_leader.put (stmt_leader, instance);
4269 }
4270 stmt_instance = instance;
4271 }
4272
4273 if (!SLP_TREE_SCALAR_STMTS (node).is_empty () && visited.add (node))
4274 return;
4275
4276 slp_tree child;
4277 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4278 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4279 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
4280 instance_leader, visited);
4281 }
4282
4283 /* Partition the SLP graph into pieces that can be costed independently. */
4284
4285 static void
4286 vect_bb_partition_graph (bb_vec_info bb_vinfo)
4287 {
4288 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
4289
4290 /* First walk the SLP graph assigning each involved scalar stmt a
4291 corresponding SLP graph entry and upon visiting a previously
4292 marked stmt, make the stmt's leader the current SLP graph entry. */
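/* For example (illustrative): if instances I1 and I2 both contain the
   scalar stmt S, visiting S while walking I2 makes I1's ultimate leader
   point to I2, so both instances end up in the same independently costed
   subgraph.  */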
4293 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
4294 hash_map<slp_instance, slp_instance> instance_leader;
4295 hash_set<slp_tree> visited;
4296 slp_instance instance;
4297 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4298 {
4299 instance_leader.put (instance, instance);
4300 vect_bb_partition_graph_r (bb_vinfo,
4301 instance, SLP_INSTANCE_TREE (instance),
4302 stmt_to_instance, instance_leader,
4303 visited);
4304 }
4305
4306 /* Then collect entries to each independent subgraph. */
4307 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4308 {
4309 slp_instance leader = get_ultimate_leader (instance, instance_leader);
4310 leader->subgraph_entries.safe_push (instance);
4311 if (dump_enabled_p ()
4312 && leader != instance)
4313 dump_printf_loc (MSG_NOTE, vect_location,
4314 "instance %p is leader of %p\n",
4315 leader, instance);
4316 }
4317 }
4318
4319 /* Compute the scalar cost of the SLP node NODE and its children
4320 and record it in COST_VEC. Do not account defs that are marked in LIFE and
4321 update LIFE according to uses of NODE. */
4322
4323 static void
4324 vect_bb_slp_scalar_cost (vec_info *vinfo,
4325 slp_tree node, vec<bool, va_heap> *life,
4326 stmt_vector_for_cost *cost_vec,
4327 hash_set<slp_tree> &visited)
4328 {
4329 unsigned i;
4330 stmt_vec_info stmt_info;
4331 slp_tree child;
4332
4333 if (visited.add (node))
4334 return;
4335
4336 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4337 {
4338 ssa_op_iter op_iter;
4339 def_operand_p def_p;
4340
4341 if ((*life)[i])
4342 continue;
4343
4344 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4345 gimple *orig_stmt = orig_stmt_info->stmt;
4346
4347 /* If there is a non-vectorized use of the defs then the scalar
4348 stmt is kept live in which case we do not account it or any
4349 required defs in the SLP children in the scalar cost. This
4350 way we make the vectorization more costly when compared to
4351 the scalar cost. */
4352 if (!STMT_VINFO_LIVE_P (stmt_info))
4353 {
4354 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4355 {
4356 imm_use_iterator use_iter;
4357 gimple *use_stmt;
4358 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4359 if (!is_gimple_debug (use_stmt))
4360 {
4361 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
4362 if (!use_stmt_info
4363 || !PURE_SLP_STMT
4364 (vect_stmt_to_vectorize (use_stmt_info)))
4365 {
4366 (*life)[i] = true;
4367 break;
4368 }
4369 }
4370 }
4371 if ((*life)[i])
4372 continue;
4373 }
4374
4375 /* Count scalar stmts only once. */
4376 if (gimple_visited_p (orig_stmt))
4377 continue;
4378 gimple_set_visited (orig_stmt, true);
4379
4380 vect_cost_for_stmt kind;
4381 if (STMT_VINFO_DATA_REF (orig_stmt_info))
4382 {
4383 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
4384 kind = scalar_load;
4385 else
4386 kind = scalar_store;
4387 }
4388 else if (vect_nop_conversion_p (orig_stmt_info))
4389 continue;
4390 /* For single-argument PHIs assume coalescing which means zero cost
4391 for the scalar and the vector PHIs. This avoids artificially
4392 favoring the vector path (but may pessimize it in some cases). */
4393 else if (is_a <gphi *> (orig_stmt_info->stmt)
4394 && gimple_phi_num_args
4395 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
4396 continue;
4397 else
4398 kind = scalar_stmt;
4399 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
4400 SLP_TREE_VECTYPE (node), 0, vect_body);
4401 }
4402
4403 auto_vec<bool, 20> subtree_life;
4404 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4405 {
4406 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4407 {
4408 /* Do not directly pass LIFE to the recursive call, copy it to
4409 confine changes in the callee to the current child/subtree. */
4410 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4411 {
4412 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
4413 for (unsigned j = 0;
4414 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
4415 {
4416 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
4417 if (perm.first == i)
4418 subtree_life[perm.second] = (*life)[j];
4419 }
4420 }
4421 else
4422 {
4423 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
4424 subtree_life.safe_splice (*life);
4425 }
4426 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
4427 visited);
4428 subtree_life.truncate (0);
4429 }
4430 }
4431 }
4432
4433 /* Comparator for the loop-index sorted cost vectors. */
4434
4435 static int
4436 li_cost_vec_cmp (const void *a_, const void *b_)
4437 {
4438 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
4439 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
4440 if (a->first < b->first)
4441 return -1;
4442 else if (a->first == b->first)
4443 return 0;
4444 return 1;
4445 }
4446
4447 /* Check if vectorization of the basic block is profitable for the
4448 subgraph denoted by SLP_INSTANCES. */
4449
4450 static bool
4451 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
4452 vec<slp_instance> slp_instances)
4453 {
4454 slp_instance instance;
4455 int i;
4456 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
4457 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
4458
4459 if (dump_enabled_p ())
4460 {
4461 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
4462 hash_set<slp_tree> visited;
4463 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4464 vect_print_slp_graph (MSG_NOTE, vect_location,
4465 SLP_INSTANCE_TREE (instance), visited);
4466 }
4467
4468 /* Calculate scalar cost and sum the cost for the vector stmts
4469 previously collected. */
4470 stmt_vector_for_cost scalar_costs = vNULL;
4471 stmt_vector_for_cost vector_costs = vNULL;
4472 hash_set<slp_tree> visited;
4473 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4474 {
4475 auto_vec<bool, 20> life;
4476 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
4477 true);
4478 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
4479 record_stmt_cost (&scalar_costs,
4480 SLP_INSTANCE_ROOT_STMTS (instance).length (),
4481 scalar_stmt,
4482 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
4483 vect_bb_slp_scalar_cost (bb_vinfo,
4484 SLP_INSTANCE_TREE (instance),
4485 &life, &scalar_costs, visited);
4486 vector_costs.safe_splice (instance->cost_vec);
4487 instance->cost_vec.release ();
4488 }
4489 /* Unset visited flag. */
4490 stmt_info_for_cost *cost;
4491 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
4492 gimple_set_visited (cost->stmt_info->stmt, false);
4493
4494 if (dump_enabled_p ())
4495 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4496
4497 /* When costing non-loop vectorization we need to consider each covered
4498 loop independently and make sure vectorization is profitable. For
4499 now we assume a loop may not be entered or may execute an arbitrary
4500 number of iterations (??? static information can provide more
4501 precise info here) which means we can simply cost each containing
4502 loop's stmts separately. */
4503
4504 /* First produce cost vectors sorted by loop index. */
4505 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
4506 li_scalar_costs (scalar_costs.length ());
4507 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
4508 li_vector_costs (vector_costs.length ());
4509 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
4510 {
4511 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
4512 li_scalar_costs.quick_push (std::make_pair (l, cost));
4513 }
4514 /* Use a random used loop as fallback in case the first vector_costs
4515 entry does not have a stmt_info associated with it. */
4516 unsigned l = li_scalar_costs[0].first;
4517 FOR_EACH_VEC_ELT (vector_costs, i, cost)
4518 {
4519 /* We inherit the loop from the previous COST; invariants, externals and
4520 extracts immediately follow the cost for the related stmt. */
4521 if (cost->stmt_info)
4522 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
4523 li_vector_costs.quick_push (std::make_pair (l, cost));
4524 }
4525 li_scalar_costs.qsort (li_cost_vec_cmp);
4526 li_vector_costs.qsort (li_cost_vec_cmp);
4527
4528 /* Now cost the portions individually. */
4529 unsigned vi = 0;
4530 unsigned si = 0;
4531 while (si < li_scalar_costs.length ()
4532 && vi < li_vector_costs.length ())
4533 {
4534 unsigned sl = li_scalar_costs[si].first;
4535 unsigned vl = li_vector_costs[vi].first;
4536 if (sl != vl)
4537 {
4538 if (dump_enabled_p ())
4539 dump_printf_loc (MSG_NOTE, vect_location,
4540 "Scalar %d and vector %d loop part do not "
4541 "match up, skipping scalar part\n", sl, vl);
4542 /* Skip the scalar part, assuming zero cost on the vector side. */
4543 do
4544 {
4545 si++;
4546 }
4547 while (si < li_scalar_costs.length ()
4548 && li_scalar_costs[si].first == sl);
4549 continue;
4550 }
4551
4552 void *scalar_target_cost_data = init_cost (NULL, true);
4553 do
4554 {
4555 add_stmt_cost (bb_vinfo, scalar_target_cost_data,
4556 li_scalar_costs[si].second);
4557 si++;
4558 }
4559 while (si < li_scalar_costs.length ()
4560 && li_scalar_costs[si].first == sl);
4561 unsigned dummy;
4562 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
4563 destroy_cost_data (scalar_target_cost_data);
4564
4565 /* Complete the target-specific vector cost calculation. */
4566 void *vect_target_cost_data = init_cost (NULL, false);
4567 do
4568 {
4569 add_stmt_cost (bb_vinfo, vect_target_cost_data,
4570 li_vector_costs[vi].second);
4571 vi++;
4572 }
4573 while (vi < li_vector_costs.length ()
4574 && li_vector_costs[vi].first == vl);
4575 finish_cost (vect_target_cost_data, &vec_prologue_cost,
4576 &vec_inside_cost, &vec_epilogue_cost);
4577 destroy_cost_data (vect_target_cost_data);
4578
4579 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
4580
4581 if (dump_enabled_p ())
4582 {
4583 dump_printf_loc (MSG_NOTE, vect_location,
4584 "Cost model analysis for part in loop %d:\n", sl);
4585 dump_printf (MSG_NOTE, " Vector cost: %d\n",
4586 vec_inside_cost + vec_outside_cost);
4587 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
4588 }
4589
4590 /* Vectorization is profitable if its cost is no more than the cost of the
4591 scalar version. Note that we err on the vector side for equal cost because
4592 the cost estimate is otherwise quite pessimistic (constant uses are
4593 free on the scalar side but cost a load on the vector side for
4594 example). */
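/* With assumed numbers: a scalar cost of 8 against a vector cost of
   4 inside plus 4 outside (total 8) is still accepted because the
   check below only rejects the subgraph when the vector total
   strictly exceeds the scalar cost.  */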
4595 if (vec_outside_cost + vec_inside_cost > scalar_cost)
4596 {
4597 scalar_costs.release ();
4598 vector_costs.release ();
4599 return false;
4600 }
4601 }
4602 if (vi < li_vector_costs.length ())
4603 {
4604 if (dump_enabled_p ())
4605 dump_printf_loc (MSG_NOTE, vect_location,
4606 "Excess vector cost for part in loop %d:\n",
4607 li_vector_costs[vi].first);
4608 scalar_costs.release ();
4609 vector_costs.release ();
4610 return false;
4611 }
4612
4613 scalar_costs.release ();
4614 vector_costs.release ();
4615 return true;
4616 }
4617
4618 /* qsort comparator for lane defs. */
4619
4620 static int
4621 vld_cmp (const void *a_, const void *b_)
4622 {
4623 auto *a = (const std::pair<unsigned, tree> *)a_;
4624 auto *b = (const std::pair<unsigned, tree> *)b_;
4625 return a->first - b->first;
4626 }
4627
4628 /* Return true if USE_STMT is a vector lane insert into VEC and set
4629 *THIS_LANE to the lane number that is set. */
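/* A matching statement looks roughly like (illustrative SSA names)
     v_2 = BIT_INSERT_EXPR <v_1, s_3, 64>;
   which for 32-bit vector elements inserts s_3 at lane 2, i.e. *THIS_LANE
   is the constant bit position divided by the element size.  */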
4630
4631 static bool
4632 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
4633 {
4634 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
4635 if (!use_ass
4636 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
4637 || (vec
4638 ? gimple_assign_rhs1 (use_ass) != vec
4639 : ((vec = gimple_assign_rhs1 (use_ass)), false))
4640 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
4641 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
4642 || !constant_multiple_p
4643 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
4644 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
4645 this_lane))
4646 return false;
4647 return true;
4648 }
4649
4650 /* Find any vectorizable constructors and add them to the grouped_store
4651 array. */
4652
4653 static void
4654 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
4655 {
4656 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
4657 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
4658 !gsi_end_p (gsi); gsi_next (&gsi))
4659 {
4660 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
4661 if (!assign)
4662 continue;
4663
4664 tree rhs = gimple_assign_rhs1 (assign);
4665 if (gimple_assign_rhs_code (assign) == CONSTRUCTOR)
4666 {
4667 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
4668 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
4669 CONSTRUCTOR_NELTS (rhs))
4670 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
4671 || uniform_vector_p (rhs))
4672 continue;
4673
4674 unsigned j;
4675 tree val;
4676 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
4677 if (TREE_CODE (val) != SSA_NAME
4678 || !bb_vinfo->lookup_def (val))
4679 break;
4680 if (j != CONSTRUCTOR_NELTS (rhs))
4681 continue;
4682
4683 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
4684 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
4685 }
4686 else if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR
4687 && VECTOR_TYPE_P (TREE_TYPE (rhs))
4688 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
4689 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
4690 && integer_zerop (gimple_assign_rhs3 (assign))
4691 && useless_type_conversion_p
4692 (TREE_TYPE (TREE_TYPE (rhs)),
4693 TREE_TYPE (gimple_assign_rhs2 (assign)))
4694 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
4695 {
4696 /* We start to match on insert to lane zero but since the
4697 inserts need not be ordered we'd have to search both
4698 the def and the use chains. */
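/* Schematically (illustrative names), a chain such as
     v1_1 = BIT_INSERT_EXPR <v0_5(D), a_2, 0>;
     v2_3 = BIT_INSERT_EXPR <v1_1, b_4, 32>;
   for a two-lane vector of 32-bit elements provides defs for lanes 0
   and 1, and the last insert becomes the SLP root.  */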
4699 tree vectype = TREE_TYPE (rhs);
4700 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4701 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
4702 auto_sbitmap lanes (nlanes);
4703 bitmap_clear (lanes);
4704 bitmap_set_bit (lanes, 0);
4705 tree def = gimple_assign_lhs (assign);
4706 lane_defs.quick_push
4707 (std::make_pair (0, gimple_assign_rhs2 (assign)));
4708 unsigned lanes_found = 1;
4709 /* Start with the use chains, the last stmt will be the root. */
4710 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
4711 vec<stmt_vec_info> roots = vNULL;
4712 roots.safe_push (last);
4713 do
4714 {
4715 use_operand_p use_p;
4716 gimple *use_stmt;
4717 if (!single_imm_use (def, &use_p, &use_stmt))
4718 break;
4719 unsigned this_lane;
4720 if (!bb_vinfo->lookup_stmt (use_stmt)
4721 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
4722 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
4723 break;
4724 if (bitmap_bit_p (lanes, this_lane))
4725 break;
4726 lanes_found++;
4727 bitmap_set_bit (lanes, this_lane);
4728 gassign *use_ass = as_a <gassign *> (use_stmt);
4729 lane_defs.quick_push (std::make_pair
4730 (this_lane, gimple_assign_rhs2 (use_ass)));
4731 last = bb_vinfo->lookup_stmt (use_ass);
4732 roots.safe_push (last);
4733 def = gimple_assign_lhs (use_ass);
4734 }
4735 while (lanes_found < nlanes);
4736 if (roots.length () > 1)
4737 std::swap (roots[0], roots[roots.length () - 1]);
4738 if (lanes_found < nlanes)
4739 {
4740 /* Now search the def chain. */
4741 def = gimple_assign_rhs1 (assign);
4742 do
4743 {
4744 if (TREE_CODE (def) != SSA_NAME
4745 || !has_single_use (def))
4746 break;
4747 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
4748 unsigned this_lane;
4749 if (!bb_vinfo->lookup_stmt (def_stmt)
4750 || !vect_slp_is_lane_insert (def_stmt,
4751 NULL_TREE, &this_lane)
4752 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
4753 break;
4754 if (bitmap_bit_p (lanes, this_lane))
4755 break;
4756 lanes_found++;
4757 bitmap_set_bit (lanes, this_lane);
4758 lane_defs.quick_push (std::make_pair
4759 (this_lane,
4760 gimple_assign_rhs2 (def_stmt)));
4761 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
4762 def = gimple_assign_rhs1 (def_stmt);
4763 }
4764 while (lanes_found < nlanes);
4765 }
4766 if (lanes_found == nlanes)
4767 {
4768 /* Sort lane_defs after the lane index and register the root. */
4769 lane_defs.qsort (vld_cmp);
4770 vec<stmt_vec_info> stmts;
4771 stmts.create (nlanes);
4772 for (unsigned i = 0; i < nlanes; ++i)
4773 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
4774 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
4775 stmts, roots));
4776 }
4777 else
4778 roots.release ();
4779 }
4780 }
4781 }
4782
4783 /* Walk the grouped store chains and replace entries with their
4784 pattern variant if any. */
4785
4786 static void
4787 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
4788 {
4789 stmt_vec_info first_element;
4790 unsigned i;
4791
4792 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4793 {
4794 /* We also have CTORs in this array. */
4795 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
4796 continue;
4797 if (STMT_VINFO_IN_PATTERN_P (first_element))
4798 {
4799 stmt_vec_info orig = first_element;
4800 first_element = STMT_VINFO_RELATED_STMT (first_element);
4801 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
4802 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
4803 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
4804 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
4805 vinfo->grouped_stores[i] = first_element;
4806 }
4807 stmt_vec_info prev = first_element;
4808 while (DR_GROUP_NEXT_ELEMENT (prev))
4809 {
4810 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
4811 if (STMT_VINFO_IN_PATTERN_P (elt))
4812 {
4813 stmt_vec_info orig = elt;
4814 elt = STMT_VINFO_RELATED_STMT (elt);
4815 DR_GROUP_NEXT_ELEMENT (prev) = elt;
4816 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
4817 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
4818 }
4819 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
4820 prev = elt;
4821 }
4822 }
4823 }
4824
4825 /* Check if the region described by BB_VINFO can be vectorized, returning
4826 true if so. When returning false, set FATAL to true if the same failure
4827 would prevent vectorization at other vector sizes, false if it is still
4828 worth trying other sizes. N_STMTS is the number of statements in the
4829 region. */
4830
4831 static bool
4832 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
4833 vec<int> *dataref_groups)
4834 {
4835 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4836
4837 slp_instance instance;
4838 int i;
4839 poly_uint64 min_vf = 2;
4840
4841 /* The first group of checks is independent of the vector size. */
4842 fatal = true;
4843
4844 /* Analyze the data references. */
4845
4846 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
4847 {
4848 if (dump_enabled_p ())
4849 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4850 "not vectorized: unhandled data-ref in basic "
4851 "block.\n");
4852 return false;
4853 }
4854
4855 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
4856 {
4857 if (dump_enabled_p ())
4858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4859 "not vectorized: unhandled data access in "
4860 "basic block.\n");
4861 return false;
4862 }
4863
4864 vect_slp_check_for_constructors (bb_vinfo);
4865
4866 /* If there are no grouped stores and no constructors in the region
4867 there is no need to continue with pattern recog as vect_analyze_slp
4868 will fail anyway. */
4869 if (bb_vinfo->grouped_stores.is_empty ()
4870 && bb_vinfo->roots.is_empty ())
4871 {
4872 if (dump_enabled_p ())
4873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4874 "not vectorized: no grouped stores in "
4875 "basic block.\n");
4876 return false;
4877 }
4878
4879 /* The rest of the analysis below depends on the vector size in some way. */
4880 fatal = false;
4881
4882 vect_pattern_recog (bb_vinfo);
4883
4884 /* Update store groups from pattern processing. */
4885 vect_fixup_store_groups_with_patterns (bb_vinfo);
4886
4887 /* Check the SLP opportunities in the basic block, analyze and build SLP
4888 trees. */
4889 if (!vect_analyze_slp (bb_vinfo, n_stmts))
4890 {
4891 if (dump_enabled_p ())
4892 {
4893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4894 "Failed to SLP the basic block.\n");
4895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4896 "not vectorized: failed to find SLP opportunities "
4897 "in basic block.\n");
4898 }
4899 return false;
4900 }
4901
4902 /* Optimize permutations. */
4903 vect_optimize_slp (bb_vinfo);
4904
4905 /* Gather the loads reachable from the SLP graph entries. */
4906 vect_gather_slp_loads (bb_vinfo);
4907
4908 vect_record_base_alignments (bb_vinfo);
4909
4910 /* Analyze and verify the alignment of data references and the
4911 dependence in the SLP instances. */
4912 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
4913 {
4914 vect_location = instance->location ();
4915 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
4916 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
4917 {
4918 slp_tree node = SLP_INSTANCE_TREE (instance);
4919 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4920 if (dump_enabled_p ())
4921 dump_printf_loc (MSG_NOTE, vect_location,
4922 "removing SLP instance operations starting from: %G",
4923 stmt_info->stmt);
4924 vect_free_slp_instance (instance);
4925 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
4926 continue;
4927 }
4928
4929 /* Mark all the statements that we want to vectorize as pure SLP and
4930 relevant. */
4931 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4932 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
4933 unsigned j;
4934 stmt_vec_info root;
4935 /* Likewise consider instance root stmts as vectorized. */
4936 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
4937 STMT_SLP_TYPE (root) = pure_slp;
4938
4939 i++;
4940 }
4941 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
4942 return false;
4943
4944 if (!vect_slp_analyze_operations (bb_vinfo))
4945 {
4946 if (dump_enabled_p ())
4947 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4948 "not vectorized: bad operation in basic block.\n");
4949 return false;
4950 }
4951
4952 vect_bb_partition_graph (bb_vinfo);
4953
4954 return true;
4955 }
4956
4957 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4958 basic blocks in BBS, returning true on success.
4959 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4960
4961 static bool
4962 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
4963 vec<int> *dataref_groups, unsigned int n_stmts)
4964 {
4965 bb_vec_info bb_vinfo;
4966 auto_vector_modes vector_modes;
4967
4968 /* Autodetect first vector size we try. */
4969 machine_mode next_vector_mode = VOIDmode;
4970 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
4971 unsigned int mode_i = 0;
4972
4973 vec_info_shared shared;
4974
4975 machine_mode autodetected_vector_mode = VOIDmode;
4976 while (1)
4977 {
4978 bool vectorized = false;
4979 bool fatal = false;
4980 bb_vinfo = new _bb_vec_info (bbs, &shared);
4981
4982 bool first_time_p = shared.datarefs.is_empty ();
4983 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
4984 if (first_time_p)
4985 bb_vinfo->shared->save_datarefs ();
4986 else
4987 bb_vinfo->shared->check_datarefs ();
4988 bb_vinfo->vector_mode = next_vector_mode;
4989
4990 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
4991 {
4992 if (dump_enabled_p ())
4993 {
4994 dump_printf_loc (MSG_NOTE, vect_location,
4995 "***** Analysis succeeded with vector mode"
4996 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4997 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4998 }
4999
5000 bb_vinfo->shared->check_datarefs ();
5001
5002 unsigned i;
5003 slp_instance instance;
5004 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
5005 {
5006 if (instance->subgraph_entries.is_empty ())
5007 continue;
5008
5009 vect_location = instance->location ();
5010 if (!unlimited_cost_model (NULL)
5011 && !vect_bb_vectorization_profitable_p
5012 (bb_vinfo, instance->subgraph_entries))
5013 {
5014 if (dump_enabled_p ())
5015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5016 "not vectorized: vectorization is not "
5017 "profitable.\n");
5018 continue;
5019 }
5020
5021 if (!dbg_cnt (vect_slp))
5022 continue;
5023
5024 if (!vectorized && dump_enabled_p ())
5025 dump_printf_loc (MSG_NOTE, vect_location,
5026 "Basic block will be vectorized "
5027 "using SLP\n");
5028 vectorized = true;
5029
5030 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
5031
5032 unsigned HOST_WIDE_INT bytes;
5033 if (dump_enabled_p ())
5034 {
5035 if (GET_MODE_SIZE
5036 (bb_vinfo->vector_mode).is_constant (&bytes))
5037 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5038 "basic block part vectorized using %wu "
5039 "byte vectors\n", bytes);
5040 else
5041 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5042 "basic block part vectorized using "
5043 "variable length vectors\n");
5044 }
5045 }
5046 }
5047 else
5048 {
5049 if (dump_enabled_p ())
5050 dump_printf_loc (MSG_NOTE, vect_location,
5051 "***** Analysis failed with vector mode %s\n",
5052 GET_MODE_NAME (bb_vinfo->vector_mode));
5053 }
5054
5055 if (mode_i == 0)
5056 autodetected_vector_mode = bb_vinfo->vector_mode;
5057
5058 if (!fatal)
5059 while (mode_i < vector_modes.length ()
5060 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
5061 {
5062 if (dump_enabled_p ())
5063 dump_printf_loc (MSG_NOTE, vect_location,
5064 "***** The result for vector mode %s would"
5065 " be the same\n",
5066 GET_MODE_NAME (vector_modes[mode_i]));
5067 mode_i += 1;
5068 }
5069
5070 delete bb_vinfo;
5071
5072 if (mode_i < vector_modes.length ()
5073 && VECTOR_MODE_P (autodetected_vector_mode)
5074 && (related_vector_mode (vector_modes[mode_i],
5075 GET_MODE_INNER (autodetected_vector_mode))
5076 == autodetected_vector_mode)
5077 && (related_vector_mode (autodetected_vector_mode,
5078 GET_MODE_INNER (vector_modes[mode_i]))
5079 == vector_modes[mode_i]))
5080 {
5081 if (dump_enabled_p ())
5082 dump_printf_loc (MSG_NOTE, vect_location,
5083 "***** Skipping vector mode %s, which would"
5084 " repeat the analysis for %s\n",
5085 GET_MODE_NAME (vector_modes[mode_i]),
5086 GET_MODE_NAME (autodetected_vector_mode));
5087 mode_i += 1;
5088 }
5089
5090 if (vectorized
5091 || mode_i == vector_modes.length ()
5092 || autodetected_vector_mode == VOIDmode
5093 /* If vect_slp_analyze_bb_1 signaled that analysis for all
5094 vector sizes will fail do not bother iterating. */
5095 || fatal)
5096 return vectorized;
5097
5098 /* Try the next biggest vector size. */
5099 next_vector_mode = vector_modes[mode_i++];
5100 if (dump_enabled_p ())
5101 dump_printf_loc (MSG_NOTE, vect_location,
5102 "***** Re-trying analysis with vector mode %s\n",
5103 GET_MODE_NAME (next_vector_mode));
5104 }
5105 }
5106
5107
5108 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
5109 true if anything in the basic-block was vectorized. */
5110
5111 static bool
5112 vect_slp_bbs (vec<basic_block> bbs)
5113 {
5114 vec<data_reference_p> datarefs = vNULL;
5115 auto_vec<int> dataref_groups;
5116 int insns = 0;
5117 int current_group = 0;
5118
5119 for (unsigned i = 0; i < bbs.length (); i++)
5120 {
5121 basic_block bb = bbs[i];
5122 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
5123 gsi_next (&gsi))
5124 {
5125 gimple *stmt = gsi_stmt (gsi);
5126 if (is_gimple_debug (stmt))
5127 continue;
5128
5129 insns++;
5130
5131 if (gimple_location (stmt) != UNKNOWN_LOCATION)
5132 vect_location = stmt;
5133
5134 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
5135 &dataref_groups, current_group))
5136 ++current_group;
5137 }
5138 }
5139
5140 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
5141 }
5142
5143 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5144 true if anything in the basic-block was vectorized. */
5145
5146 bool
5147 vect_slp_bb (basic_block bb)
5148 {
5149 auto_vec<basic_block> bbs;
5150 bbs.safe_push (bb);
5151 return vect_slp_bbs (bbs);
5152 }
5153
5154 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5155 true if anything in the basic-block was vectorized. */
5156
5157 bool
5158 vect_slp_function (function *fun)
5159 {
5160 bool r = false;
5161 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
5162 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
5163
5164 /* For the moment split the function into pieces to avoid making
5165 the iteration on the vector mode moot. Split at points we know
5166 we do not handle well, namely CFG merges (SLP discovery doesn't
5167 handle non-loop-header PHIs) and loop exits. Since pattern
5168 recog requires reverse iteration to visit uses before defs,
5169 simply chop the RPO into pieces. */
5170 auto_vec<basic_block> bbs;
5171 for (unsigned i = 0; i < n; i++)
5172 {
5173 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
5174 bool split = false;
5175
5176 /* Split when a BB is not dominated by the first block. */
5177 if (!bbs.is_empty ()
5178 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
5179 {
5180 if (dump_enabled_p ())
5181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5182 "splitting region at dominance boundary bb%d\n",
5183 bb->index);
5184 split = true;
5185 }
5186 /* Split when the loop determined by the first block
5187 is exited. This is because we eventually insert
5188 invariants at the start of the region. */
5189 else if (!bbs.is_empty ()
5190 && bbs[0]->loop_father != bb->loop_father
5191 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
5192 {
5193 if (dump_enabled_p ())
5194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5195 "splitting region at loop %d exit at bb%d\n",
5196 bbs[0]->loop_father->num, bb->index);
5197 split = true;
5198 }
5199
5200 if (split && !bbs.is_empty ())
5201 {
5202 r |= vect_slp_bbs (bbs);
5203 bbs.truncate (0);
5204 bbs.quick_push (bb);
5205 }
5206 else
5207 bbs.safe_push (bb);
5208
5209 /* When a stmt ends this block and defines a value, inserting
5210 a vector containing its definition after it would require
5211 inserting on edges. Avoid this for now. */
5212 if (gimple *last = last_stmt (bb))
5213 if (gimple_get_lhs (last)
5214 && is_ctrl_altering_stmt (last))
5215 {
5216 if (dump_enabled_p ())
5217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5218 "splitting region at control altering "
5219 "definition %G", last);
5220 r |= vect_slp_bbs (bbs);
5221 bbs.truncate (0);
5222 }
5223 }
5224
5225 if (!bbs.is_empty ())
5226 r |= vect_slp_bbs (bbs);
5227
5228 free (rpo);
5229
5230 return r;
5231 }
5232
5233 /* Build a variable-length vector in which the elements in ELTS are repeated
5234 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
5235 RESULTS and add any new instructions to SEQ.
5236
5237 The approach we use is:
5238
5239 (1) Find a vector mode VM with integer elements of mode IM.
5240
5241 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5242 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
5243 from small vectors to IM.
5244
5245 (3) Duplicate each ELTS'[I] into a vector of mode VM.
5246
5247 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
5248 correct byte contents.
5249
5250 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
5251
5252 We try to find the largest IM for which this sequence works, in order
5253 to cut down on the number of interleaves. */
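
/* A hedged illustration (values assumed, not from the sources): suppose
   ELTS is {a, b, c, d} with 32-bit elements and the largest IM that
   works is a 64-bit integer mode.  Then NELTS' == 2; step (2)
   view-converts {a, b} and {c, d} into two 64-bit scalars ab and cd,
   step (3) duplicates each into a full vector {ab, ab, ...} and
   {cd, cd, ...}, and a single interleaving VEC_PERM_EXPR in step (4)
   produces {ab, cd, ab, cd, ...}, whose bytes spell
   {a, b, c, d, a, b, c, d, ...}.  Step (5) view-converts that back to
   VECTOR_TYPE.  */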
5254
5255 void
5256 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
5257 vec<tree> elts, unsigned int nresults,
5258 vec<tree> &results)
5259 {
5260 unsigned int nelts = elts.length ();
5261 tree element_type = TREE_TYPE (vector_type);
5262
5263 /* (1) Find a vector mode VM with integer elements of mode IM. */
5264 unsigned int nvectors = 1;
5265 tree new_vector_type;
5266 tree permutes[2];
5267 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
5268 &nvectors, &new_vector_type,
5269 permutes))
5270 gcc_unreachable ();
5271
5272 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
5273 unsigned int partial_nelts = nelts / nvectors;
5274 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
5275
5276 tree_vector_builder partial_elts;
5277 auto_vec<tree, 32> pieces (nvectors * 2);
5278 pieces.quick_grow_cleared (nvectors * 2);
5279 for (unsigned int i = 0; i < nvectors; ++i)
5280 {
5281 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5282 ELTS' has mode IM. */
5283 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
5284 for (unsigned int j = 0; j < partial_nelts; ++j)
5285 partial_elts.quick_push (elts[i * partial_nelts + j]);
5286 tree t = gimple_build_vector (seq, &partial_elts);
5287 t = gimple_build (seq, VIEW_CONVERT_EXPR,
5288 TREE_TYPE (new_vector_type), t);
5289
5290 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
5291 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
5292 }
5293
5294 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
5295 correct byte contents.
5296
5297 Conceptually, we need to repeat the following operation log2(nvectors)
5298 times, where hi_start = nvectors / 2:
5299
5300 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
5301 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
5302
5303 However, if each input repeats every N elements and the VF is
5304 a multiple of N * 2, the HI result is the same as the LO result.
5305 This will be true for the first N1 iterations of the outer loop,
5306 followed by N2 iterations for which both the LO and HI results
5307 are needed. I.e.:
5308
5309 N1 + N2 = log2(nvectors)
5310
5311 Each "N1 iteration" doubles the number of redundant vectors and the
5312 effect of the process as a whole is to have a sequence of nvectors/2**N1
5313 vectors that repeats 2**N1 times. Rather than generate these redundant
5314 vectors, we halve the number of vectors for each N1 iteration. */
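
/* Worked numbers (assumed only for illustration): with nvectors == 4
   and an element count known to be a multiple of 2 but not of 4, the
   first iteration (in_repeat == 1) skips the HI permutes and combines
   pieces[0]/pieces[2] and pieces[1]/pieces[3] into two vectors, so
   N1 == 1 and new_nvectors drops from 4 to 2.  The second iteration
   (in_repeat == 2) keeps both the LO and HI results, so N2 == 1 and
   N1 + N2 == log2 (4) as required.  */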
5315 unsigned int in_start = 0;
5316 unsigned int out_start = nvectors;
5317 unsigned int new_nvectors = nvectors;
5318 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
5319 {
5320 unsigned int hi_start = new_nvectors / 2;
5321 unsigned int out_i = 0;
5322 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
5323 {
5324 if ((in_i & 1) != 0
5325 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
5326 2 * in_repeat))
5327 continue;
5328
5329 tree output = make_ssa_name (new_vector_type);
5330 tree input1 = pieces[in_start + (in_i / 2)];
5331 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
5332 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
5333 input1, input2,
5334 permutes[in_i & 1]);
5335 gimple_seq_add_stmt (seq, stmt);
5336 pieces[out_start + out_i] = output;
5337 out_i += 1;
5338 }
5339 std::swap (in_start, out_start);
5340 new_nvectors = out_i;
5341 }
5342
5343 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
5344 results.reserve (nresults);
5345 for (unsigned int i = 0; i < nresults; ++i)
5346 if (i < new_nvectors)
5347 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
5348 pieces[in_start + i]));
5349 else
5350 results.quick_push (results[i - new_nvectors]);
5351 }
5352
5353
5354 /* For constant and loop invariant defs in OP_NODE this function creates
5355 vector defs that will be used in the vectorized stmts and stores them
5356 to SLP_TREE_VEC_DEFS of OP_NODE. */
5357
5358 static void
5359 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
5360 {
5361 unsigned HOST_WIDE_INT nunits;
5362 tree vec_cst;
5363 unsigned j, number_of_places_left_in_vector;
5364 tree vector_type;
5365 tree vop;
5366 int group_size = op_node->ops.length ();
5367 unsigned int vec_num, i;
5368 unsigned number_of_copies = 1;
5369 bool constant_p;
5370 gimple_seq ctor_seq = NULL;
5371 auto_vec<tree, 16> permute_results;
5372
5373 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
5374 vector_type = SLP_TREE_VECTYPE (op_node);
5375
5376 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
5377 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
5378 auto_vec<tree> voprnds (number_of_vectors);
5379
5380 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5381 created vectors. It is greater than 1 if unrolling is performed.
5382
5383 For example, we have two scalar operands, s1 and s2 (e.g., group of
5384 strided accesses of size two), while NUNITS is four (i.e., four scalars
5385 of this type can be packed in a vector). The output vector will contain
5386 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5387 will be 2).
5388
5389 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
5390 containing the operands.
5391
5392 For example, NUNITS is four as before, and the group size is 8
5393 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5394 {s5, s6, s7, s8}. */
5395
5396 /* When using duplicate_and_interleave, we just need one element for
5397 each scalar statement. */
5398 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5399 nunits = group_size;
5400
5401 number_of_copies = nunits * number_of_vectors / group_size;
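/* E.g. (counts assumed for illustration): with group_size == 2,
   nunits == 4 and number_of_vectors == 2 this gives
   number_of_copies == 4 * 2 / 2 == 4, i.e. the pair {s1, s2} is used
   four times to fill the two vectors {s1, s2, s1, s2}.  */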
5402
5403 number_of_places_left_in_vector = nunits;
5404 constant_p = true;
5405 tree_vector_builder elts (vector_type, nunits, 1);
5406 elts.quick_grow (nunits);
5407 stmt_vec_info insert_after = NULL;
5408 for (j = 0; j < number_of_copies; j++)
5409 {
5410 tree op;
5411 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
5412 {
5413 /* Create 'vect_ = {op0,op1,...,opn}'. */
5414 number_of_places_left_in_vector--;
5415 tree orig_op = op;
5416 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
5417 {
5418 if (CONSTANT_CLASS_P (op))
5419 {
5420 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
5421 {
5422 /* Can't use VIEW_CONVERT_EXPR for booleans because
5423 of possibly different sizes of scalar value and
5424 vector element. */
5425 if (integer_zerop (op))
5426 op = build_int_cst (TREE_TYPE (vector_type), 0);
5427 else if (integer_onep (op))
5428 op = build_all_ones_cst (TREE_TYPE (vector_type));
5429 else
5430 gcc_unreachable ();
5431 }
5432 else
5433 op = fold_unary (VIEW_CONVERT_EXPR,
5434 TREE_TYPE (vector_type), op);
5435 gcc_assert (op && CONSTANT_CLASS_P (op));
5436 }
5437 else
5438 {
5439 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
5440 gimple *init_stmt;
5441 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
5442 {
5443 tree true_val
5444 = build_all_ones_cst (TREE_TYPE (vector_type));
5445 tree false_val
5446 = build_zero_cst (TREE_TYPE (vector_type));
5447 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
5448 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
5449 op, true_val,
5450 false_val);
5451 }
5452 else
5453 {
5454 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
5455 op);
5456 init_stmt
5457 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
5458 op);
5459 }
5460 gimple_seq_add_stmt (&ctor_seq, init_stmt);
5461 op = new_temp;
5462 }
5463 }
5464 elts[number_of_places_left_in_vector] = op;
5465 if (!CONSTANT_CLASS_P (op))
5466 constant_p = false;
5467 /* For BB vectorization we have to compute an insert location
5468 when a def is inside the analyzed region since we cannot
5469 simply insert at the BB start in this case. */
5470 stmt_vec_info opdef;
5471 if (TREE_CODE (orig_op) == SSA_NAME
5472 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
5473 && is_a <bb_vec_info> (vinfo)
5474 && (opdef = vinfo->lookup_def (orig_op)))
5475 {
5476 if (!insert_after)
5477 insert_after = opdef;
5478 else
5479 insert_after = get_later_stmt (insert_after, opdef);
5480 }
5481
5482 if (number_of_places_left_in_vector == 0)
5483 {
5484 if (constant_p
5485 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5486 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5487 vec_cst = gimple_build_vector (&ctor_seq, &elts);
5488 else
5489 {
5490 if (permute_results.is_empty ())
5491 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
5492 elts, number_of_vectors,
5493 permute_results);
5494 vec_cst = permute_results[number_of_vectors - j - 1];
5495 }
5496 if (!gimple_seq_empty_p (ctor_seq))
5497 {
5498 if (insert_after)
5499 {
5500 gimple_stmt_iterator gsi;
5501 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
5502 {
5503 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
5504 gsi_insert_seq_before (&gsi, ctor_seq,
5505 GSI_CONTINUE_LINKING);
5506 }
5507 else if (!stmt_ends_bb_p (insert_after->stmt))
5508 {
5509 gsi = gsi_for_stmt (insert_after->stmt);
5510 gsi_insert_seq_after (&gsi, ctor_seq,
5511 GSI_CONTINUE_LINKING);
5512 }
5513 else
5514 {
5515 /* When we want to insert after a def where the
5516 defining stmt throws then insert on the fallthru
5517 edge. */
5518 edge e = find_fallthru_edge
5519 (gimple_bb (insert_after->stmt)->succs);
5520 basic_block new_bb
5521 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
5522 gcc_assert (!new_bb);
5523 }
5524 }
5525 else
5526 vinfo->insert_seq_on_entry (NULL, ctor_seq);
5527 ctor_seq = NULL;
5528 }
5529 voprnds.quick_push (vec_cst);
5530 insert_after = NULL;
5531 number_of_places_left_in_vector = nunits;
5532 constant_p = true;
5533 elts.new_vector (vector_type, nunits, 1);
5534 elts.quick_grow (nunits);
5535 }
5536 }
5537 }
5538
5539 /* Since the vectors are created in the reverse order, we should invert
5540 them. */
5541 vec_num = voprnds.length ();
5542 for (j = vec_num; j != 0; j--)
5543 {
5544 vop = voprnds[j - 1];
5545 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5546 }
5547
5548 /* If VF is greater than the unrolling factor needed for the SLP
5549 group of stmts, NUMBER_OF_VECTORS to be created is greater than
5550 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
5551 to replicate the vectors. */
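/* As a sketch (vector names assumed): if four defs are requested but
   only v0 and v1 were created above, the loop below re-pushes them so
   that SLP_TREE_VEC_DEFS becomes {v0, v1, v0, v1}.  */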
5552 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
5553 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
5554 i++)
5555 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5556 }
5557
5558 /* Get the Ith vectorized definition from SLP_NODE. */
5559
5560 tree
5561 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
5562 {
5563 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
5564 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
5565 else
5566 return SLP_TREE_VEC_DEFS (slp_node)[i];
5567 }
5568
5569 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
5570
5571 void
5572 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
5573 {
5574 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
5575 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
5576 {
5577 unsigned j;
5578 gimple *vec_def_stmt;
5579 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
5580 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
5581 }
5582 else
5583 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
5584 }
5585
5586 /* Get N vectorized definitions for SLP_NODE. */
5587
5588 void
5589 vect_get_slp_defs (vec_info *,
5590 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
5591 {
5592 if (n == -1U)
5593 n = SLP_TREE_CHILDREN (slp_node).length ();
5594
5595 for (unsigned i = 0; i < n; ++i)
5596 {
5597 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
5598 vec<tree> vec_defs = vNULL;
5599 vect_get_slp_defs (child, &vec_defs);
5600 vec_oprnds->quick_push (vec_defs);
5601 }
5602 }
5603
5604 /* Generate vector permute statements from a list of loads in DR_CHAIN.
5605 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
5606 permute statements for the SLP node NODE. Store the number of vector
5607 permute instructions in *N_PERMS and the number of vector load
5608 instructions in *N_LOADS. */
5609
5610 bool
5611 vect_transform_slp_perm_load (vec_info *vinfo,
5612 slp_tree node, vec<tree> dr_chain,
5613 gimple_stmt_iterator *gsi, poly_uint64 vf,
5614 bool analyze_only, unsigned *n_perms,
5615 unsigned int *n_loads)
5616 {
5617 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5618 int vec_index = 0;
5619 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5620 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
5621 unsigned int mask_element;
5622 machine_mode mode;
5623
5624 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
5625 return false;
5626
5627 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
5628
5629 mode = TYPE_MODE (vectype);
5630 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5631
5632 /* Initialize the vect stmts of NODE to properly insert the generated
5633 stmts later. */
5634 if (! analyze_only)
5635 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
5636 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
5637 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
5638
5639 /* Generate permutation masks for every NODE. Number of masks for each NODE
5640 is equal to GROUP_SIZE.
5641 E.g., we have a group of three nodes with three loads from the same
5642 location in each node, and the vector size is 4. I.e., we have an
5643 a0b0c0a1b1c1... sequence and we need to create the following vectors:
5644 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
5645 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
5646 ...
5647
5648 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
5649 The last mask is illegal since we assume two operands for the permute
5650 operation, and the mask element values can't be outside that range.
5651 Hence, the last mask must be converted into {2,5,5,5}.
5652 For the first two permutations we need the first and the second input
5653 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
5654 we need the second and the third vectors: {b1,c1,a2,b2} and
5655 {c2,a3,b3,c3}. */
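
/* Restating the example above in terms of the two-operand
   VEC_PERM_EXPR (an explanatory note, not from the original sources):
   mask values 0..nunits-1 select from the first input and
   nunits..2*nunits-1 from the second, so relative to the input pair
   {b1,c1,a2,b2} and {c2,a3,b3,c3} the elements a2 and a3 are selected
   by 2 and 5 respectively, which is how {6,9,9,9} becomes {2,5,5,5}.  */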
5656
5657 int vect_stmts_counter = 0;
5658 unsigned int index = 0;
5659 int first_vec_index = -1;
5660 int second_vec_index = -1;
5661 bool noop_p = true;
5662 *n_perms = 0;
5663
5664 vec_perm_builder mask;
5665 unsigned int nelts_to_build;
5666 unsigned int nvectors_per_build;
5667 unsigned int in_nlanes;
5668 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
5669 && multiple_p (nunits, group_size));
5670 if (repeating_p)
5671 {
5672 /* A single vector contains a whole number of copies of the node, so:
5673 (a) all permutes can use the same mask; and
5674 (b) the permutes only need a single vector input. */
5675 mask.new_vector (nunits, group_size, 3);
5676 nelts_to_build = mask.encoded_nelts ();
5677 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
5678 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
5679 }
5680 else
5681 {
5682 /* We need to construct a separate mask for each vector statement. */
5683 unsigned HOST_WIDE_INT const_nunits, const_vf;
5684 if (!nunits.is_constant (&const_nunits)
5685 || !vf.is_constant (&const_vf))
5686 return false;
5687 mask.new_vector (const_nunits, const_nunits, 1);
5688 nelts_to_build = const_vf * group_size;
5689 nvectors_per_build = 1;
5690 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
5691 }
5692 auto_sbitmap used_in_lanes (in_nlanes);
5693 bitmap_clear (used_in_lanes);
5694
5695 unsigned int count = mask.encoded_nelts ();
5696 mask.quick_grow (count);
5697 vec_perm_indices indices;
5698
5699 for (unsigned int j = 0; j < nelts_to_build; j++)
5700 {
5701 unsigned int iter_num = j / group_size;
5702 unsigned int stmt_num = j % group_size;
5703 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
5704 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
5705 bitmap_set_bit (used_in_lanes, i);
5706 if (repeating_p)
5707 {
5708 first_vec_index = 0;
5709 mask_element = i;
5710 }
5711 else
5712 {
5713 /* Enforced before the loop when !repeating_p. */
5714 unsigned int const_nunits = nunits.to_constant ();
5715 vec_index = i / const_nunits;
5716 mask_element = i % const_nunits;
5717 if (vec_index == first_vec_index
5718 || first_vec_index == -1)
5719 {
5720 first_vec_index = vec_index;
5721 }
5722 else if (vec_index == second_vec_index
5723 || second_vec_index == -1)
5724 {
5725 second_vec_index = vec_index;
5726 mask_element += const_nunits;
5727 }
5728 else
5729 {
5730 if (dump_enabled_p ())
5731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5732 "permutation requires at "
5733 "least three vectors %G",
5734 stmt_info->stmt);
5735 gcc_assert (analyze_only);
5736 return false;
5737 }
5738
5739 gcc_assert (mask_element < 2 * const_nunits);
5740 }
5741
5742 if (mask_element != index)
5743 noop_p = false;
5744 mask[index++] = mask_element;
5745
5746 if (index == count && !noop_p)
5747 {
5748 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
5749 if (!can_vec_perm_const_p (mode, indices))
5750 {
5751 if (dump_enabled_p ())
5752 {
5753 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5754 vect_location,
5755 "unsupported vect permute { ");
5756 for (i = 0; i < count; ++i)
5757 {
5758 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5759 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5760 }
5761 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5762 }
5763 gcc_assert (analyze_only);
5764 return false;
5765 }
5766
5767 ++*n_perms;
5768 }
5769
5770 if (index == count)
5771 {
5772 if (!analyze_only)
5773 {
5774 tree mask_vec = NULL_TREE;
5775
5776 if (! noop_p)
5777 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5778
5779 if (second_vec_index == -1)
5780 second_vec_index = first_vec_index;
5781
5782 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
5783 {
5784 /* Generate the permute statement if necessary. */
5785 tree first_vec = dr_chain[first_vec_index + ri];
5786 tree second_vec = dr_chain[second_vec_index + ri];
5787 gimple *perm_stmt;
5788 if (! noop_p)
5789 {
5790 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5791 tree perm_dest
5792 = vect_create_destination_var (gimple_assign_lhs (stmt),
5793 vectype);
5794 perm_dest = make_ssa_name (perm_dest);
5795 perm_stmt
5796 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5797 first_vec, second_vec,
5798 mask_vec);
5799 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
5800 gsi);
5801 }
5802 else
5803 /* If mask was NULL_TREE generate the requested
5804 identity transform. */
5805 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
5806
5807 /* Store the vector statement in NODE. */
5808 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
5809 }
5810 }
5811
5812 index = 0;
5813 first_vec_index = -1;
5814 second_vec_index = -1;
5815 noop_p = true;
5816 }
5817 }
5818
5819 if (n_loads)
5820 {
5821 if (repeating_p)
5822 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5823 else
5824 {
5825 /* Enforced above when !repeating_p. */
5826 unsigned int const_nunits = nunits.to_constant ();
5827 *n_loads = 0;
5828 bool load_seen = false;
5829 for (unsigned i = 0; i < in_nlanes; ++i)
5830 {
5831 if (i % const_nunits == 0)
5832 {
5833 if (load_seen)
5834 *n_loads += 1;
5835 load_seen = false;
5836 }
5837 if (bitmap_bit_p (used_in_lanes, i))
5838 load_seen = true;
5839 }
5840 if (load_seen)
5841 *n_loads += 1;
5842 }
5843 }
5844
5845 return true;
5846 }
5847
5848 /* Produce the next vector result for SLP permutation NODE by adding a vector
5849 statement at GSI. If MASK_VEC is nonnull, add:
5850
5851 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
5852
5853 otherwise add:
5854
5855 <new SSA name> = FIRST_DEF. */
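
/* For example (SSA names assumed): with a MASK_VEC of { 0, 4, 1, 5 }
   on V4SI inputs the statement added is the interleave-low permute

     _7 = VEC_PERM_EXPR <first_def_2, second_def_3, { 0, 4, 1, 5 }>;

   and it is recorded in SLP_TREE_VEC_STMTS (NODE).  */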
5856
5857 static void
5858 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5859 slp_tree node, tree first_def, tree second_def,
5860 tree mask_vec)
5861 {
5862 tree vectype = SLP_TREE_VECTYPE (node);
5863
5864 /* ??? We SLP match existing vector element extracts but
5865 allow punning which we need to re-instantiate at uses
5866 but have no good way of explicitly representing. */
5867 if (!types_compatible_p (TREE_TYPE (first_def), vectype))
5868 {
5869 gassign *conv_stmt
5870 = gimple_build_assign (make_ssa_name (vectype),
5871 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
5872 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
5873 first_def = gimple_assign_lhs (conv_stmt);
5874 }
5875 gassign *perm_stmt;
5876 tree perm_dest = make_ssa_name (vectype);
5877 if (mask_vec)
5878 {
5879 if (!types_compatible_p (TREE_TYPE (second_def), vectype))
5880 {
5881 gassign *conv_stmt
5882 = gimple_build_assign (make_ssa_name (vectype),
5883 build1 (VIEW_CONVERT_EXPR,
5884 vectype, second_def));
5885 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
5886 second_def = gimple_assign_lhs (conv_stmt);
5887 }
5888 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5889 first_def, second_def,
5890 mask_vec);
5891 }
5892 else
5893 /* We need a copy here in case the def was external. */
5894 perm_stmt = gimple_build_assign (perm_dest, first_def);
5895 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
5896 /* Store the vector statement in NODE. */
5897 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
5898 }
5899
5900 /* Vectorize the SLP permutations in NODE as specified
5901 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5902 child number and lane number.
5903 Interleaving of two two-lane two-child SLP subtrees (not supported):
5904 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5905 A blend of two four-lane two-child SLP subtrees:
5906 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5907 Highpart of a four-lane one-child SLP subtree (not supported):
5908 [ { 0, 2 }, { 0, 3 } ]
5909 Currently only a subset of these forms is supported by the code generation below. */
5910
5911 static bool
5912 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5913 slp_tree node, stmt_vector_for_cost *cost_vec)
5914 {
5915 tree vectype = SLP_TREE_VECTYPE (node);
5916
5917 /* ??? We currently only support all same vector input and output types
5918 while the SLP IL should really do a concat + select and thus accept
5919 arbitrary mismatches. */
5920 slp_tree child;
5921 unsigned i;
5922 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5923 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
5924 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5925 {
5926 if (!vect_maybe_update_slp_op_vectype (child, vectype)
5927 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
5928 {
5929 if (dump_enabled_p ())
5930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5931 "Unsupported lane permutation\n");
5932 return false;
5933 }
5934 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
5935 repeating_p = false;
5936 }
5937
5938 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
5939 gcc_assert (perm.length () == SLP_TREE_LANES (node));
5940 if (dump_enabled_p ())
5941 {
5942 dump_printf_loc (MSG_NOTE, vect_location,
5943 "vectorizing permutation");
5944 for (unsigned i = 0; i < perm.length (); ++i)
5945 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
5946 if (repeating_p)
5947 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
5948 dump_printf (MSG_NOTE, "\n");
5949 }
5950
5951 /* REPEATING_P is true if every output vector is guaranteed to use the
5952 same permute vector. We can handle that case for both variable-length
5953 and constant-length vectors, but we only handle other cases for
5954 constant-length vectors.
5955
5956 Set:
5957
5958 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
5959 mask vector that we want to build.
5960
5961 - NCOPIES to the number of copies of PERM that we need in order
5962 to build the necessary permute mask vectors.
5963
5964 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
5965 for each permute mask vector. This is only relevant when GSI is
5966 nonnull. */
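
/* A worked instance (sizes assumed): for a two-lane node producing V8SI
   vectors REPEATING_P is true, NPATTERNS == 2 and NELTS_PER_PATTERN == 3,
   so the mask below is encoded from the six elements
   { X1, X2, X1 + 2, X2 + 2, X1 + 4, X2 + 4 } and extends itself to all
   eight lanes; the NCOPIES == 3 copies of PERM provide exactly those
   encoded elements.  */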
5967 uint64_t npatterns;
5968 unsigned nelts_per_pattern;
5969 uint64_t ncopies;
5970 unsigned noutputs_per_mask;
5971 if (repeating_p)
5972 {
5973 /* We need a single permute mask vector that has the form:
5974
5975 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
5976
5977 In other words, the original n-element permute in PERM is
5978 "unrolled" to fill a full vector. The stepped vector encoding
5979 that we use for permutes requires 3n elements. */
5980 npatterns = SLP_TREE_LANES (node);
5981 nelts_per_pattern = ncopies = 3;
5982 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5983 }
5984 else
5985 {
5986 /* Calculate every element of every permute mask vector explicitly,
5987 instead of relying on the pattern described above. */
5988 if (!nunits.is_constant (&npatterns))
5989 return false;
5990 nelts_per_pattern = ncopies = 1;
5991 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
5992 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
5993 return false;
5994 noutputs_per_mask = 1;
5995 }
5996 unsigned olanes = ncopies * SLP_TREE_LANES (node);
5997 gcc_assert (repeating_p || multiple_p (olanes, nunits));
5998
5999 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
6000 from the { SLP operand, scalar lane } permutation as recorded in the
6001 SLP node as intermediate step. This part should already work
6002 with SLP children with arbitrary number of lanes. */
6003 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
6004 auto_vec<unsigned> active_lane;
6005 vperm.create (olanes);
6006 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
6007 for (unsigned i = 0; i < ncopies; ++i)
6008 {
6009 for (unsigned pi = 0; pi < perm.length (); ++pi)
6010 {
6011 std::pair<unsigned, unsigned> p = perm[pi];
6012 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
6013 if (repeating_p)
6014 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
6015 else
6016 {
6017 /* We checked above that the vectors are constant-length. */
6018 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
6019 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
6020 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
6021 vperm.quick_push ({{p.first, vi}, vl});
6022 }
6023 }
6024 /* Advance to the next group. */
6025 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
6026 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
6027 }
6028
6029 if (dump_enabled_p ())
6030 {
6031 dump_printf_loc (MSG_NOTE, vect_location, "as");
6032 for (unsigned i = 0; i < vperm.length (); ++i)
6033 {
6034 if (i != 0
6035 && (repeating_p
6036 ? multiple_p (i, npatterns)
6037 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
6038 dump_printf (MSG_NOTE, ",");
6039 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
6040 vperm[i].first.first, vperm[i].first.second,
6041 vperm[i].second);
6042 }
6043 dump_printf (MSG_NOTE, "\n");
6044 }
6045
6046 /* We can only handle two-vector permutes; everything else should
6047 be lowered on the SLP level. The following is closely inspired
6048 by vect_transform_slp_perm_load and is supposed to eventually
6049 replace it.
6050 ??? As intermediate step do code-gen in the SLP tree representation
6051 somehow? */
6052 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
6053 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
6054 unsigned int index = 0;
6055 poly_uint64 mask_element;
6056 vec_perm_builder mask;
6057 mask.new_vector (nunits, npatterns, nelts_per_pattern);
6058 unsigned int count = mask.encoded_nelts ();
6059 mask.quick_grow (count);
6060 vec_perm_indices indices;
6061 unsigned nperms = 0;
6062 for (unsigned i = 0; i < vperm.length (); ++i)
6063 {
6064 mask_element = vperm[i].second;
6065 if (first_vec.first == -1U
6066 || first_vec == vperm[i].first)
6067 first_vec = vperm[i].first;
6068 else if (second_vec.first == -1U
6069 || second_vec == vperm[i].first)
6070 {
6071 second_vec = vperm[i].first;
6072 mask_element += nunits;
6073 }
6074 else
6075 {
6076 if (dump_enabled_p ())
6077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6078 "permutation requires at "
6079 "least three vectors\n");
6080 gcc_assert (!gsi);
6081 return false;
6082 }
6083
6084 mask[index++] = mask_element;
6085
6086 if (index == count)
6087 {
6088 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits);
6089 bool identity_p = indices.series_p (0, 1, 0, 1);
6090 if (!identity_p
6091 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6092 {
6093 if (dump_enabled_p ())
6094 {
6095 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
6096 vect_location,
6097 "unsupported vect permute { ");
6098 for (i = 0; i < count; ++i)
6099 {
6100 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
6101 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
6102 }
6103 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
6104 }
6105 gcc_assert (!gsi);
6106 return false;
6107 }
6108
6109 if (!identity_p)
6110 nperms++;
6111 if (gsi)
6112 {
6113 if (second_vec.first == -1U)
6114 second_vec = first_vec;
6115
6116 slp_tree
6117 first_node = SLP_TREE_CHILDREN (node)[first_vec.first],
6118 second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
6119
6120 tree mask_vec = NULL_TREE;
6121 if (!identity_p)
6122 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
6123
6124 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
6125 {
6126 tree first_def
6127 = vect_get_slp_vect_def (first_node,
6128 first_vec.second + vi);
6129 tree second_def
6130 = vect_get_slp_vect_def (second_node,
6131 second_vec.second + vi);
6132 vect_add_slp_permutation (vinfo, gsi, node, first_def,
6133 second_def, mask_vec);
6134 }
6135 }
6136
6137 index = 0;
6138 first_vec = std::make_pair (-1U, -1U);
6139 second_vec = std::make_pair (-1U, -1U);
6140 }
6141 }
6142
6143 if (!gsi)
6144 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
6145
6146 return true;
6147 }
6148
6149 /* Vectorize SLP NODE. */
6150
6151 static void
6152 vect_schedule_slp_node (vec_info *vinfo,
6153 slp_tree node, slp_instance instance)
6154 {
6155 gimple_stmt_iterator si;
6156 int i;
6157 slp_tree child;
6158
6159 /* For existing vectors there's nothing to do. */
6160 if (SLP_TREE_VEC_DEFS (node).exists ())
6161 return;
6162
6163 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
6164
6165 /* Vectorize externals and constants. */
6166 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
6167 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6168 {
6169 /* ??? vectorizable_shift can end up using a scalar operand which is
6170 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
6171 node in this case. */
6172 if (!SLP_TREE_VECTYPE (node))
6173 return;
6174
6175 vect_create_constant_vectors (vinfo, node);
6176 return;
6177 }
6178
6179 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6180
6181 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
6182 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6183
6184 if (dump_enabled_p ())
6185 dump_printf_loc (MSG_NOTE, vect_location,
6186 "------>vectorizing SLP node starting from: %G",
6187 stmt_info->stmt);
6188
6189 if (STMT_VINFO_DATA_REF (stmt_info)
6190 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
6191 {
6192 /* Vectorized loads go before the first scalar load to make it
6193 ready early; vectorized stores go before the last scalar
6194 stmt which is where all uses are ready. */
6195 stmt_vec_info last_stmt_info = NULL;
6196 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
6197 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
6198 else /* DR_IS_WRITE */
6199 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
6200 si = gsi_for_stmt (last_stmt_info->stmt);
6201 }
6202 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
6203 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
6204 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
6205 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
6206 {
6207 /* For PHI node vectorization we do not use the insertion iterator. */
6208 si = gsi_none ();
6209 }
6210 else
6211 {
6212 /* Emit other stmts after the children's vectorized defs, which is
6213 the earliest position possible. */
6214 gimple *last_stmt = NULL;
6215 bool seen_vector_def = false;
6216 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6217 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6218 {
6219 /* For fold-left reductions we are retaining the scalar
6220 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
6221 set so the representation isn't perfect. Resort to the
6222 last scalar def here. */
6223 if (SLP_TREE_VEC_STMTS (child).is_empty ())
6224 {
6225 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
6226 == cycle_phi_info_type);
6227 gphi *phi = as_a <gphi *>
6228 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
6229 if (!last_stmt
6230 || vect_stmt_dominates_stmt_p (last_stmt, phi))
6231 last_stmt = phi;
6232 }
6233 /* We are emitting all vectorized stmts in the same place and
6234 the last one is the last.
6235 ??? Unless we have a load permutation applied and that
6236 figures to re-use an earlier generated load. */
6237 unsigned j;
6238 gimple *vstmt;
6239 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
6240 if (!last_stmt
6241 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
6242 last_stmt = vstmt;
6243 }
6244 else if (!SLP_TREE_VECTYPE (child))
6245 {
6246 /* For externals we use the unvectorized scalar defs, so look at all of them. */
6247 unsigned j;
6248 tree def;
6249 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
6250 if (TREE_CODE (def) == SSA_NAME
6251 && !SSA_NAME_IS_DEFAULT_DEF (def))
6252 {
6253 gimple *stmt = SSA_NAME_DEF_STMT (def);
6254 if (!last_stmt
6255 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
6256 last_stmt = stmt;
6257 }
6258 }
6259 else
6260 {
6261 /* For externals we have to look at all defs since their
6262 insertion place is decided per vector. But beware
6263 of pre-existing vectors where we need to make sure
6264 we do not insert before the region boundary. */
6265 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
6266 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
6267 seen_vector_def = true;
6268 else
6269 {
6270 unsigned j;
6271 tree vdef;
6272 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
6273 if (TREE_CODE (vdef) == SSA_NAME
6274 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
6275 {
6276 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
6277 if (!last_stmt
6278 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
6279 last_stmt = vstmt;
6280 }
6281 }
6282 }
6283 /* This can happen when all children are pre-existing vectors or
6284 constants. */
6285 if (!last_stmt)
6286 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
6287 if (!last_stmt)
6288 {
6289 gcc_assert (seen_vector_def);
6290 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
6291 }
6292 else if (is_a <gphi *> (last_stmt))
6293 si = gsi_after_labels (gimple_bb (last_stmt));
6294 else
6295 {
6296 si = gsi_for_stmt (last_stmt);
6297 gsi_next (&si);
6298 }
6299 }
6300
6301 bool done_p = false;
6302
6303 /* Handle purely internal nodes. */
6304 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6305 {
6306 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
6307 be shared with different SLP nodes (but usually it's the same
6308 operation apart from the case the stmt is only there for denoting
6309 the actual scalar lane defs ...). So do not call vect_transform_stmt
6310 but open-code it here (partly). */
6311 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
6312 gcc_assert (done);
6313 done_p = true;
6314 }
6315 if (!done_p)
6316 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
6317 }
6318
6319 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
6320 For loop vectorization this is done in vectorizable_call, but for SLP
6321 it needs to be deferred until the end of vect_schedule_slp, because multiple
6322 SLP instances may refer to the same scalar stmt. */
6323
6324 static void
6325 vect_remove_slp_scalar_calls (vec_info *vinfo,
6326 slp_tree node, hash_set<slp_tree> &visited)
6327 {
6328 gimple *new_stmt;
6329 gimple_stmt_iterator gsi;
6330 int i;
6331 slp_tree child;
6332 tree lhs;
6333 stmt_vec_info stmt_info;
6334
6335 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
6336 return;
6337
6338 if (visited.add (node))
6339 return;
6340
6341 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6342 vect_remove_slp_scalar_calls (vinfo, child, visited);
6343
6344 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6345 {
6346 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
6347 if (!stmt || gimple_bb (stmt) == NULL)
6348 continue;
6349 if (is_pattern_stmt_p (stmt_info)
6350 || !PURE_SLP_STMT (stmt_info))
6351 continue;
6352 lhs = gimple_call_lhs (stmt);
6353 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
6354 gsi = gsi_for_stmt (stmt);
6355 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
6356 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
6357 }
6358 }
6359
6360 static void
6361 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
6362 {
6363 hash_set<slp_tree> visited;
6364 vect_remove_slp_scalar_calls (vinfo, node, visited);
6365 }
6366
6367 /* Vectorize the instance root. */
6368
6369 void
6370 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
6371 {
6372 gassign *rstmt = NULL;
6373
6374 if (instance->kind == slp_inst_kind_ctor)
6375 {
6376 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
6377 {
6378 gimple *child_stmt;
6379 int j;
6380
6381 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
6382 {
6383 tree vect_lhs = gimple_get_lhs (child_stmt);
6384 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
6385 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
6386 TREE_TYPE (vect_lhs)))
6387 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
6388 vect_lhs);
6389 rstmt = gimple_build_assign (root_lhs, vect_lhs);
6390 break;
6391 }
6392 }
6393 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
6394 {
6395 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6396 gimple *child_stmt;
6397 int j;
6398 vec<constructor_elt, va_gc> *v;
6399 vec_alloc (v, nelts);
6400
6401 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
6402 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
6403 gimple_get_lhs (child_stmt));
6404 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
6405 tree rtype
6406 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
6407 tree r_constructor = build_constructor (rtype, v);
6408 rstmt = gimple_build_assign (lhs, r_constructor);
6409 }
6410 }
6411 else
6412 gcc_unreachable ();
6413
6414 gcc_assert (rstmt);
6415
6416 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
6417 gsi_replace (&rgsi, rstmt, true);
6418 }
6419
6420 struct slp_scc_info
6421 {
6422 bool on_stack;
6423 int dfs;
6424 int lowlink;
6425 };
6426
6427 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
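
/* Informal note (not from the original sources): the dfs/lowlink/stack
   scheme below follows Tarjan's SCC algorithm; when lowlink == dfs the
   nodes still on the stack form an SCC.  For a typical reduction cycle
   PHI <-> vector op the PHI becomes ready as soon as one of its
   arguments (usually the initial value) has been generated, so it is
   scheduled first, then the op feeding the backedge, and the PHI's
   backedge arguments are filled in by the fixup loop at the end.  */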
6428
6429 static void
6430 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
6431 hash_map<slp_tree, slp_scc_info> &scc_info,
6432 int &maxdfs, vec<slp_tree> &stack)
6433 {
6434 bool existed_p;
6435 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
6436 gcc_assert (!existed_p);
6437 info->dfs = maxdfs;
6438 info->lowlink = maxdfs;
6439 maxdfs++;
6440
6441 /* Leaf. */
6442 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
6443 {
6444 info->on_stack = false;
6445 vect_schedule_slp_node (vinfo, node, instance);
6446 return;
6447 }
6448
6449 info->on_stack = true;
6450 stack.safe_push (node);
6451
6452 unsigned i;
6453 slp_tree child;
6454 /* DFS recurse. */
6455 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6456 {
6457 if (!child)
6458 continue;
6459 slp_scc_info *child_info = scc_info.get (child);
6460 if (!child_info)
6461 {
6462 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
6463 /* Recursion might have re-allocated the node. */
6464 info = scc_info.get (node);
6465 child_info = scc_info.get (child);
6466 info->lowlink = MIN (info->lowlink, child_info->lowlink);
6467 }
6468 else if (child_info->on_stack)
6469 info->lowlink = MIN (info->lowlink, child_info->dfs);
6470 }
6471 if (info->lowlink != info->dfs)
6472 return;
6473
6474 auto_vec<slp_tree, 4> phis_to_fixup;
6475
6476 /* Singleton. */
6477 if (stack.last () == node)
6478 {
6479 stack.pop ();
6480 info->on_stack = false;
6481 vect_schedule_slp_node (vinfo, node, instance);
6482 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6483 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
6484 phis_to_fixup.quick_push (node);
6485 }
6486 else
6487 {
6488 /* SCC. */
6489 int last_idx = stack.length () - 1;
6490 while (stack[last_idx] != node)
6491 last_idx--;
6492 /* We can break the cycle at PHIs that have at least one child
6493 code generated. Then we could re-start the DFS walk until
6494 all nodes in the SCC are covered (we might have new entries
6495 for only back-reachable nodes). But it's simpler to just
6496 iterate and schedule those that are ready. */
6497 unsigned todo = stack.length () - last_idx;
6498 do
6499 {
6500 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
6501 {
6502 slp_tree entry = stack[idx];
6503 if (!entry)
6504 continue;
6505 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
6506 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
6507 bool ready = !phi;
6508 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
6509 if (!child)
6510 {
6511 gcc_assert (phi);
6512 ready = true;
6513 break;
6514 }
6515 else if (scc_info.get (child)->on_stack)
6516 {
6517 if (!phi)
6518 {
6519 ready = false;
6520 break;
6521 }
6522 }
6523 else
6524 {
6525 if (phi)
6526 {
6527 ready = true;
6528 break;
6529 }
6530 }
6531 if (ready)
6532 {
6533 vect_schedule_slp_node (vinfo, entry, instance);
6534 scc_info.get (entry)->on_stack = false;
6535 stack[idx] = NULL;
6536 todo--;
6537 if (phi)
6538 phis_to_fixup.safe_push (entry);
6539 }
6540 }
6541 }
6542 while (todo != 0);
6543
6544 /* Pop the SCC. */
6545 stack.truncate (last_idx);
6546 }
6547
6548 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
6549 slp_tree phi_node;
6550 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
6551 {
6552 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
6553 edge_iterator ei;
6554 edge e;
6555 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
6556 {
6557 unsigned dest_idx = e->dest_idx;
6558 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
6559 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
6560 continue;
6561 /* Simply fill all args. */
6562 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
6563 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
6564 vect_get_slp_vect_def (child, i),
6565 e, gimple_phi_arg_location (phi, dest_idx));
6566 }
6567 }
6568 }
6569
6570 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
6571
6572 void
6573 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
6574 {
6575 slp_instance instance;
6576 unsigned int i;
6577
6578 hash_map<slp_tree, slp_scc_info> scc_info;
6579 int maxdfs = 0;
6580 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6581 {
6582 slp_tree node = SLP_INSTANCE_TREE (instance);
6583 if (dump_enabled_p ())
6584 {
6585 dump_printf_loc (MSG_NOTE, vect_location,
6586 "Vectorizing SLP tree:\n");
6587 /* ??? Dump all? */
6588 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6589 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
6590 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
6591 vect_print_slp_graph (MSG_NOTE, vect_location,
6592 SLP_INSTANCE_TREE (instance));
6593 }
6594 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
6595 have a PHI be the node breaking the cycle. */
6596 auto_vec<slp_tree> stack;
6597 if (!scc_info.get (node))
6598 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
6599
6600 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6601 vectorize_slp_instance_root_stmt (node, instance);
6602
6603 if (dump_enabled_p ())
6604 dump_printf_loc (MSG_NOTE, vect_location,
6605 "vectorizing stmts using SLP.\n");
6606 }
6607
6608 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6609 {
6610 slp_tree root = SLP_INSTANCE_TREE (instance);
6611 stmt_vec_info store_info;
6612 unsigned int j;
6613
6614 /* Remove scalar call stmts. Do not do this for basic-block
6615 vectorization as not all uses may be vectorized.
6616 ??? Why should this be necessary? DCE should be able to
6617 remove the stmts itself.
6618 ??? For BB vectorization we can as well remove scalar
6619 stmts starting from the SLP tree root if they have no
6620 uses. */
6621 if (is_a <loop_vec_info> (vinfo))
6622 vect_remove_slp_scalar_calls (vinfo, root);
6623
6624 /* Remove the original scalar stmts of vectorized stores. */
6625 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
6626 {
6627 if (!STMT_VINFO_DATA_REF (store_info)
6628 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
6629 break;
6630
6631 store_info = vect_orig_stmt (store_info);
6632 /* Free the attached stmt_vec_info and remove the stmt. */
6633 vinfo->remove_stmt (store_info);
6634
6635 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
6636 to not crash in vect_free_slp_tree later. */
6637 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
6638 SLP_TREE_REPRESENTATIVE (root) = NULL;
6639 }
6640 }
6641 }