[thirdparty/gcc.git] / gcc / tree-vect-slp.c
tree-optimization/101615 - SLP permute opt of existing vectors
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
52
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 slp_tree, stmt_vector_for_cost *);
55 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
56
57 static object_allocator<_slp_tree> *slp_tree_pool;
58 static slp_tree slp_first_node;
59
60 void
61 vect_slp_init (void)
62 {
63 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
64 }
65
66 void
67 vect_slp_fini (void)
68 {
69 while (slp_first_node)
70 delete slp_first_node;
71 delete slp_tree_pool;
72 slp_tree_pool = NULL;
73 }
74
75 void *
76 _slp_tree::operator new (size_t n)
77 {
78 gcc_assert (n == sizeof (_slp_tree));
79 return slp_tree_pool->allocate_raw ();
80 }
81
82 void
83 _slp_tree::operator delete (void *node, size_t n)
84 {
85 gcc_assert (n == sizeof (_slp_tree));
86 slp_tree_pool->remove_raw (node);
87 }
88
89
90 /* Initialize an SLP node. */
91
92 _slp_tree::_slp_tree ()
93 {
94 this->prev_node = NULL;
95 if (slp_first_node)
96 slp_first_node->prev_node = this;
97 this->next_node = slp_first_node;
98 slp_first_node = this;
99 SLP_TREE_SCALAR_STMTS (this) = vNULL;
100 SLP_TREE_SCALAR_OPS (this) = vNULL;
101 SLP_TREE_VEC_STMTS (this) = vNULL;
102 SLP_TREE_VEC_DEFS (this) = vNULL;
103 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
104 SLP_TREE_CHILDREN (this) = vNULL;
105 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
106 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
107 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
108 SLP_TREE_CODE (this) = ERROR_MARK;
109 SLP_TREE_VECTYPE (this) = NULL_TREE;
110 SLP_TREE_REPRESENTATIVE (this) = NULL;
111 SLP_TREE_REF_COUNT (this) = 1;
112 this->failed = NULL;
113 this->max_nunits = 1;
114 this->lanes = 0;
115 }
116
117 /* Tear down an SLP node. */
118
119 _slp_tree::~_slp_tree ()
120 {
121 if (this->prev_node)
122 this->prev_node->next_node = this->next_node;
123 else
124 slp_first_node = this->next_node;
125 if (this->next_node)
126 this->next_node->prev_node = this->prev_node;
127 SLP_TREE_CHILDREN (this).release ();
128 SLP_TREE_SCALAR_STMTS (this).release ();
129 SLP_TREE_SCALAR_OPS (this).release ();
130 SLP_TREE_VEC_STMTS (this).release ();
131 SLP_TREE_VEC_DEFS (this).release ();
132 SLP_TREE_LOAD_PERMUTATION (this).release ();
133 SLP_TREE_LANE_PERMUTATION (this).release ();
134 if (this->failed)
135 free (failed);
136 }
137
138 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
139
140 void
141 vect_free_slp_tree (slp_tree node)
142 {
143 int i;
144 slp_tree child;
145
146 if (--SLP_TREE_REF_COUNT (node) != 0)
147 return;
148
149 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
150 if (child)
151 vect_free_slp_tree (child);
152
153 /* If the node defines any SLP only patterns then those patterns are no
154 longer valid and should be removed. */
155 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
156 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
157 {
158 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
159 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
160 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
161 }
162
163 delete node;
164 }
165
166 /* Return a location suitable for dumps related to the SLP instance. */
167
168 dump_user_location_t
169 _slp_instance::location () const
170 {
171 if (!root_stmts.is_empty ())
172 return root_stmts[0]->stmt;
173 else
174 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
175 }
176
177
178 /* Free the memory allocated for the SLP instance. */
179
180 void
181 vect_free_slp_instance (slp_instance instance)
182 {
183 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
184 SLP_INSTANCE_LOADS (instance).release ();
185 SLP_INSTANCE_ROOT_STMTS (instance).release ();
186 instance->subgraph_entries.release ();
187 instance->cost_vec.release ();
188 free (instance);
189 }
190
191
192 /* Create an SLP node for SCALAR_STMTS. */
193
194 slp_tree
195 vect_create_new_slp_node (unsigned nops, tree_code code)
196 {
197 slp_tree node = new _slp_tree;
198 SLP_TREE_SCALAR_STMTS (node) = vNULL;
199 SLP_TREE_CHILDREN (node).create (nops);
200 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
201 SLP_TREE_CODE (node) = code;
202 return node;
203 }
204 /* Create an SLP node for SCALAR_STMTS. */
205
206 static slp_tree
207 vect_create_new_slp_node (slp_tree node,
208 vec<stmt_vec_info> scalar_stmts, unsigned nops)
209 {
210 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
211 SLP_TREE_CHILDREN (node).create (nops);
212 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
213 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
214 SLP_TREE_LANES (node) = scalar_stmts.length ();
215 return node;
216 }
217
218 /* Create an SLP node for SCALAR_STMTS. */
219
220 static slp_tree
221 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
222 {
223 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
224 }
225
226 /* Create an SLP node for OPS. */
227
228 static slp_tree
229 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
230 {
231 SLP_TREE_SCALAR_OPS (node) = ops;
232 SLP_TREE_DEF_TYPE (node) = vect_external_def;
233 SLP_TREE_LANES (node) = ops.length ();
234 return node;
235 }
236
237 /* Create an SLP node for OPS. */
238
239 static slp_tree
240 vect_create_new_slp_node (vec<tree> ops)
241 {
242 return vect_create_new_slp_node (new _slp_tree, ops);
243 }
244
245
246 /* This structure is used in creation of an SLP tree. Each instance
247 corresponds to the same operand in a group of scalar stmts in an SLP
248 node. */
249 typedef struct _slp_oprnd_info
250 {
251 /* Def-stmts for the operands. */
252 vec<stmt_vec_info> def_stmts;
253 /* Operands. */
254 vec<tree> ops;
255 /* Information about the first statement, its vector def-type, type, the
256 operand itself in case it's constant, and an indication if it's a pattern
257 stmt. */
258 tree first_op_type;
259 enum vect_def_type first_dt;
260 bool any_pattern;
261 } *slp_oprnd_info;
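
/* Editor's illustration (not part of the original source): for the two-lane
   group { a0 = b0 + c0;  a1 = b1 + c1; } discovery fills two slp_oprnd_info
   entries: oprnds_info[0] collects the defs/ops of b0 and b1 and
   oprnds_info[1] those of c0 and c1, with the def-type of the first lane
   recorded in first_dt.  */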
262
263
264 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
265 operand. */
266 static vec<slp_oprnd_info>
267 vect_create_oprnd_info (int nops, int group_size)
268 {
269 int i;
270 slp_oprnd_info oprnd_info;
271 vec<slp_oprnd_info> oprnds_info;
272
273 oprnds_info.create (nops);
274 for (i = 0; i < nops; i++)
275 {
276 oprnd_info = XNEW (struct _slp_oprnd_info);
277 oprnd_info->def_stmts.create (group_size);
278 oprnd_info->ops.create (group_size);
279 oprnd_info->first_dt = vect_uninitialized_def;
280 oprnd_info->first_op_type = NULL_TREE;
281 oprnd_info->any_pattern = false;
282 oprnds_info.quick_push (oprnd_info);
283 }
284
285 return oprnds_info;
286 }
287
288
289 /* Free operands info. */
290
291 static void
292 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
293 {
294 int i;
295 slp_oprnd_info oprnd_info;
296
297 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
298 {
299 oprnd_info->def_stmts.release ();
300 oprnd_info->ops.release ();
301 XDELETE (oprnd_info);
302 }
303
304 oprnds_info.release ();
305 }
306
307
308 /* Return true if STMTS contains a pattern statement. */
309
310 static bool
311 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
312 {
313 stmt_vec_info stmt_info;
314 unsigned int i;
315 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
316 if (is_pattern_stmt_p (stmt_info))
317 return true;
318 return false;
319 }
320
321 /* Return true when all lanes in the external or constant NODE have
322 the same value. */
323
324 static bool
325 vect_slp_tree_uniform_p (slp_tree node)
326 {
327 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
328 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
329
330 /* Pre-existing vectors. */
331 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
332 return false;
333
334 unsigned i;
335 tree op, first = NULL_TREE;
336 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
337 if (!first)
338 first = op;
339 else if (!operand_equal_p (first, op, 0))
340 return false;
341
342 return true;
343 }
344
345 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
346 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
347 of the chain. */
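
/* Editor's illustration (not part of the original source): for a group
   accessing a[0], a[1] and a[3] (a[2] unused), the walk below accumulates
   DR_GROUP_GAPs of 1 and 2, so the a[3] access gets place 3 in the chain.  */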
348
349 int
350 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
351 stmt_vec_info first_stmt_info)
352 {
353 stmt_vec_info next_stmt_info = first_stmt_info;
354 int result = 0;
355
356 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
357 return -1;
358
359 do
360 {
361 if (next_stmt_info == stmt_info)
362 return result;
363 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
364 if (next_stmt_info)
365 result += DR_GROUP_GAP (next_stmt_info);
366 }
367 while (next_stmt_info);
368
369 return -1;
370 }
371
372 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
373 using the method implemented by duplicate_and_interleave. Return true
374 if so, returning the number of intermediate vectors in *NVECTORS_OUT
375 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
376 (if nonnull). */
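
/* Editor's illustration (not part of the original source), assuming 4-byte
   elements, COUNT == 4 and 16-byte vectors: if no 128-bit integer vector
   element works out, the loop below halves ELT_BYTES and fuses pairs of
   elements into 64-bit integers, giving NVECTORS == 2 intermediate vectors
   (e.g. V2DI).  With NELTS == 2 the selectors become indices1 = { 0, 2 }
   and indices2 = { 1, 3 }, the low and high interleave of the two inputs.  */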
377
378 bool
379 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
380 tree elt_type, unsigned int *nvectors_out,
381 tree *vector_type_out,
382 tree *permutes)
383 {
384 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
385 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
386 return false;
387
388 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
389 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
390 unsigned int nvectors = 1;
391 for (;;)
392 {
393 scalar_int_mode int_mode;
394 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
395 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
396 {
397 /* Get the natural vector type for this SLP group size. */
398 tree int_type = build_nonstandard_integer_type
399 (GET_MODE_BITSIZE (int_mode), 1);
400 tree vector_type
401 = get_vectype_for_scalar_type (vinfo, int_type, count);
402 if (vector_type
403 && VECTOR_MODE_P (TYPE_MODE (vector_type))
404 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
405 GET_MODE_SIZE (base_vector_mode)))
406 {
407 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
408 together into elements of type INT_TYPE and using the result
409 to build NVECTORS vectors. */
410 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
411 vec_perm_builder sel1 (nelts, 2, 3);
412 vec_perm_builder sel2 (nelts, 2, 3);
413 poly_int64 half_nelts = exact_div (nelts, 2);
414 for (unsigned int i = 0; i < 3; ++i)
415 {
416 sel1.quick_push (i);
417 sel1.quick_push (i + nelts);
418 sel2.quick_push (half_nelts + i);
419 sel2.quick_push (half_nelts + i + nelts);
420 }
421 vec_perm_indices indices1 (sel1, 2, nelts);
422 vec_perm_indices indices2 (sel2, 2, nelts);
423 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
424 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
425 {
426 if (nvectors_out)
427 *nvectors_out = nvectors;
428 if (vector_type_out)
429 *vector_type_out = vector_type;
430 if (permutes)
431 {
432 permutes[0] = vect_gen_perm_mask_checked (vector_type,
433 indices1);
434 permutes[1] = vect_gen_perm_mask_checked (vector_type,
435 indices2);
436 }
437 return true;
438 }
439 }
440 }
441 if (!multiple_p (elt_bytes, 2, &elt_bytes))
442 return false;
443 nvectors *= 2;
444 }
445 }
446
447 /* Return true if DTA and DTB match. */
448
449 static bool
450 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
451 {
452 return (dta == dtb
453 || ((dta == vect_external_def || dta == vect_constant_def)
454 && (dtb == vect_external_def || dtb == vect_constant_def)));
455 }
456
457 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
458 they are of a valid type and that they match the defs of the first stmt of
459 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
460 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
461 indicates swap is required for cond_expr stmts. Specifically, *SWAP
462 is 1 if STMT is cond and operands of comparison need to be swapped;
463 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
464 If there is any operand swap in this function, *SWAP is set to non-zero
465 value.
466 If there was a fatal error return -1; if the error could be corrected by
467 swapping operands of the parent node of this one, return 1; if everything is
468 ok return 0. */
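
/* Editor's illustration (not part of the original source): if lane 0 is
   x0 = a0 < b0 ? c0 : d0, a lane x1 = b1 > a1 ? c1 : d1 is handled with
   *SWAP == 1 (comparison operands exchanged), while x1 = a1 >= b1 ? d1 : c1
   is handled with *SWAP == 2 (then/else values exchanged).  */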
469 static int
470 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
471 bool *skip_args,
472 vec<stmt_vec_info> stmts, unsigned stmt_num,
473 vec<slp_oprnd_info> *oprnds_info)
474 {
475 stmt_vec_info stmt_info = stmts[stmt_num];
476 tree oprnd;
477 unsigned int i, number_of_oprnds;
478 enum vect_def_type dt = vect_uninitialized_def;
479 slp_oprnd_info oprnd_info;
480 int first_op_idx = 1;
481 unsigned int commutative_op = -1U;
482 bool first_op_cond = false;
483 bool first = stmt_num == 0;
484
485 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
486 {
487 number_of_oprnds = gimple_call_num_args (stmt);
488 first_op_idx = 3;
489 if (gimple_call_internal_p (stmt))
490 {
491 internal_fn ifn = gimple_call_internal_fn (stmt);
492 commutative_op = first_commutative_argument (ifn);
493
494 /* Masked load, only look at mask. */
495 if (ifn == IFN_MASK_LOAD)
496 {
497 number_of_oprnds = 1;
498 /* Mask operand index. */
499 first_op_idx = 5;
500 }
501 }
502 }
503 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
504 {
505 enum tree_code code = gimple_assign_rhs_code (stmt);
506 number_of_oprnds = gimple_num_ops (stmt) - 1;
507 /* Swap can only be done for cond_expr if asked to, otherwise we
508 could result in different comparison code to the first stmt. */
509 if (code == COND_EXPR
510 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
511 {
512 first_op_cond = true;
513 number_of_oprnds++;
514 }
515 else
516 commutative_op = commutative_tree_code (code) ? 0U : -1U;
517 }
518 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
519 number_of_oprnds = gimple_phi_num_args (stmt);
520 else
521 return -1;
522
523 bool swapped = (swap != 0);
524 bool backedge = false;
525 gcc_assert (!swapped || first_op_cond);
526 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
527 for (i = 0; i < number_of_oprnds; i++)
528 {
529 if (first_op_cond)
530 {
531 /* Map indicating how operands of cond_expr should be swapped. */
532 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
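/* Editor's note: row 1 exchanges the two comparison operands
   (TREE_OPERAND indices 1 and 0), row 2 keeps them and instead
   exchanges the then/else values (gimple operands 3 and 2).  */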
533 int *map = maps[swap];
534
535 if (i < 2)
536 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
537 first_op_idx), map[i]);
538 else
539 oprnd = gimple_op (stmt_info->stmt, map[i]);
540 }
541 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
542 {
543 oprnd = gimple_phi_arg_def (stmt, i);
544 backedge = dominated_by_p (CDI_DOMINATORS,
545 gimple_phi_arg_edge (stmt, i)->src,
546 gimple_bb (stmt_info->stmt));
547 }
548 else
549 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
550 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
551 oprnd = TREE_OPERAND (oprnd, 0);
552
553 oprnd_info = (*oprnds_info)[i];
554
555 stmt_vec_info def_stmt_info;
556 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
557 {
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
560 "Build SLP failed: can't analyze def for %T\n",
561 oprnd);
562
563 return -1;
564 }
565
566 if (skip_args[i])
567 {
568 oprnd_info->def_stmts.quick_push (NULL);
569 oprnd_info->ops.quick_push (NULL_TREE);
570 oprnd_info->first_dt = vect_uninitialized_def;
571 continue;
572 }
573
574 oprnd_info->def_stmts.quick_push (def_stmt_info);
575 oprnd_info->ops.quick_push (oprnd);
576
577 if (def_stmt_info
578 && is_pattern_stmt_p (def_stmt_info))
579 {
580 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
581 != def_stmt_info)
582 oprnd_info->any_pattern = true;
583 else
584 /* If we promote this to external use the original stmt def. */
585 oprnd_info->ops.last ()
586 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
587 }
588
589 /* If there's an extern def on a backedge make sure we can
590 code-generate at the region start.
591 ??? This is another case that could be fixed by adjusting
592 how we split the function but at the moment we'd have conflicting
593 goals there. */
594 if (backedge
595 && dts[i] == vect_external_def
596 && is_a <bb_vec_info> (vinfo)
597 && TREE_CODE (oprnd) == SSA_NAME
598 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
599 && !dominated_by_p (CDI_DOMINATORS,
600 as_a <bb_vec_info> (vinfo)->bbs[0],
601 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
602 {
603 if (dump_enabled_p ())
604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
605 "Build SLP failed: extern def %T only defined "
606 "on backedge\n", oprnd);
607 return -1;
608 }
609
610 if (first)
611 {
612 tree type = TREE_TYPE (oprnd);
613 dt = dts[i];
614 if ((dt == vect_constant_def
615 || dt == vect_external_def)
616 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
617 && (TREE_CODE (type) == BOOLEAN_TYPE
618 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
619 type)))
620 {
621 if (dump_enabled_p ())
622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
623 "Build SLP failed: invalid type of def "
624 "for variable-length SLP %T\n", oprnd);
625 return -1;
626 }
627
628 /* For the swapping logic below force vect_reduction_def
629 for the reduction op in a SLP reduction group. */
630 if (!STMT_VINFO_DATA_REF (stmt_info)
631 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
632 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
633 && def_stmt_info)
634 dts[i] = dt = vect_reduction_def;
635
636 /* Check the types of the definition. */
637 switch (dt)
638 {
639 case vect_external_def:
640 case vect_constant_def:
641 case vect_internal_def:
642 case vect_reduction_def:
643 case vect_induction_def:
644 case vect_nested_cycle:
645 break;
646
647 default:
648 /* FORNOW: Not supported. */
649 if (dump_enabled_p ())
650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
651 "Build SLP failed: illegal type of def %T\n",
652 oprnd);
653 return -1;
654 }
655
656 oprnd_info->first_dt = dt;
657 oprnd_info->first_op_type = type;
658 }
659 }
660 if (first)
661 return 0;
662
663 /* Now match the operand definition types to that of the first stmt. */
664 for (i = 0; i < number_of_oprnds;)
665 {
666 if (skip_args[i])
667 {
668 ++i;
669 continue;
670 }
671
672 oprnd_info = (*oprnds_info)[i];
673 dt = dts[i];
674 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
675 oprnd = oprnd_info->ops[stmt_num];
676 tree type = TREE_TYPE (oprnd);
677
678 if (!types_compatible_p (oprnd_info->first_op_type, type))
679 {
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
682 "Build SLP failed: different operand types\n");
683 return 1;
684 }
685
686 /* Not first stmt of the group, check that the def-stmt/s match
687 the def-stmt/s of the first stmt. Allow different definition
688 types for reduction chains: the first stmt must be a
689 vect_reduction_def (a phi node), and the rest
690 end in the reduction chain. */
691 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
692 && !(oprnd_info->first_dt == vect_reduction_def
693 && !STMT_VINFO_DATA_REF (stmt_info)
694 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
695 && def_stmt_info
696 && !STMT_VINFO_DATA_REF (def_stmt_info)
697 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
698 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
699 || (!STMT_VINFO_DATA_REF (stmt_info)
700 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
701 && ((!def_stmt_info
702 || STMT_VINFO_DATA_REF (def_stmt_info)
703 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
704 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
705 != (oprnd_info->first_dt != vect_reduction_def))))
706 {
707 /* Try swapping operands if we got a mismatch. For BB
708 vectorization only in case it will clearly improve things. */
709 if (i == commutative_op && !swapped
710 && (!is_a <bb_vec_info> (vinfo)
711 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
712 dts[i+1])
713 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
714 || vect_def_types_match
715 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
716 {
717 if (dump_enabled_p ())
718 dump_printf_loc (MSG_NOTE, vect_location,
719 "trying swapped operands\n");
720 std::swap (dts[i], dts[i+1]);
721 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
722 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
723 std::swap ((*oprnds_info)[i]->ops[stmt_num],
724 (*oprnds_info)[i+1]->ops[stmt_num]);
725 swapped = true;
726 continue;
727 }
728
729 if (is_a <bb_vec_info> (vinfo)
730 && !oprnd_info->any_pattern)
731 {
732 /* Now for commutative ops we should see whether we can
733 make the other operand match. */
734 if (dump_enabled_p ())
735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
736 "treating operand as external\n");
737 oprnd_info->first_dt = dt = vect_external_def;
738 }
739 else
740 {
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
743 "Build SLP failed: different types\n");
744 return 1;
745 }
746 }
747
748 /* Make sure to demote the overall operand to external. */
749 if (dt == vect_external_def)
750 oprnd_info->first_dt = vect_external_def;
751 /* For a SLP reduction chain we want to duplicate the reduction to
752 each of the chain members. That gets us a sane SLP graph (still
753 the stmts are not 100% correct wrt the initial values). */
754 else if ((dt == vect_internal_def
755 || dt == vect_reduction_def)
756 && oprnd_info->first_dt == vect_reduction_def
757 && !STMT_VINFO_DATA_REF (stmt_info)
758 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
759 && !STMT_VINFO_DATA_REF (def_stmt_info)
760 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
761 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
762 {
763 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
764 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
765 }
766
767 ++i;
768 }
769
770 /* Swap operands. */
771 if (swapped)
772 {
773 if (dump_enabled_p ())
774 dump_printf_loc (MSG_NOTE, vect_location,
775 "swapped operands to match def types in %G",
776 stmt_info->stmt);
777 }
778
779 return 0;
780 }
781
782 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
783 Return true if we can, meaning that this choice doesn't conflict with
784 existing SLP nodes that use STMT_INFO. */
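
/* Editor's illustration (not part of the original source): if another SLP
   instance already assigned e.g. V4SI to STMT_INFO (or to any member of its
   DR group), a later attempt to use an incompatible vector type such as
   V8HI for the same statement is rejected by this function.  */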
785
786 bool
787 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
788 {
789 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
790 if (old_vectype)
791 return useless_type_conversion_p (vectype, old_vectype);
792
793 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
794 {
795 /* We maintain the invariant that if any statement in the group is
796 used, all other members of the group have the same vector type. */
797 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
798 stmt_vec_info member_info = first_info;
799 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
800 if (is_pattern_stmt_p (member_info)
801 && !useless_type_conversion_p (vectype,
802 STMT_VINFO_VECTYPE (member_info)))
803 break;
804
805 if (!member_info)
806 {
807 for (member_info = first_info; member_info;
808 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
809 STMT_VINFO_VECTYPE (member_info) = vectype;
810 return true;
811 }
812 }
813 else if (!is_pattern_stmt_p (stmt_info))
814 {
815 STMT_VINFO_VECTYPE (stmt_info) = vectype;
816 return true;
817 }
818
819 if (dump_enabled_p ())
820 {
821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
822 "Build SLP failed: incompatible vector"
823 " types for: %G", stmt_info->stmt);
824 dump_printf_loc (MSG_NOTE, vect_location,
825 " old vector type: %T\n", old_vectype);
826 dump_printf_loc (MSG_NOTE, vect_location,
827 " new vector type: %T\n", vectype);
828 }
829 return false;
830 }
831
832 /* Return true if call statements CALL1 and CALL2 are similar enough
833 to be combined into the same SLP group. */
834
835 static bool
836 compatible_calls_p (gcall *call1, gcall *call2)
837 {
838 unsigned int nargs = gimple_call_num_args (call1);
839 if (nargs != gimple_call_num_args (call2))
840 return false;
841
842 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
843 return false;
844
845 if (gimple_call_internal_p (call1))
846 {
847 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
848 TREE_TYPE (gimple_call_lhs (call2))))
849 return false;
850 for (unsigned int i = 0; i < nargs; ++i)
851 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
852 TREE_TYPE (gimple_call_arg (call2, i))))
853 return false;
854 }
855 else
856 {
857 if (!operand_equal_p (gimple_call_fn (call1),
858 gimple_call_fn (call2), 0))
859 return false;
860
861 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
862 return false;
863 }
864 return true;
865 }
866
867 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
868 caller's attempt to find the vector type in STMT_INFO with the narrowest
869 element type. Return true if VECTYPE is nonnull and if it is valid
870 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
871 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
872 vect_build_slp_tree. */
873
874 static bool
875 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
876 unsigned int group_size,
877 tree vectype, poly_uint64 *max_nunits)
878 {
879 if (!vectype)
880 {
881 if (dump_enabled_p ())
882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
883 "Build SLP failed: unsupported data-type in %G\n",
884 stmt_info->stmt);
885 /* Fatal mismatch. */
886 return false;
887 }
888
889 /* If populating the vector type requires unrolling then fail
890 before adjusting *max_nunits for basic-block vectorization. */
891 if (is_a <bb_vec_info> (vinfo)
892 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
893 {
894 if (dump_enabled_p ())
895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
896 "Build SLP failed: unrolling required "
897 "in basic block SLP\n");
898 /* Fatal mismatch. */
899 return false;
900 }
901
902 /* In case of multiple types we need to detect the smallest type. */
903 vect_update_max_nunits (max_nunits, vectype);
904 return true;
905 }
906
907 /* Verify if the scalar stmts STMTS are isomorphic, require data
908 permutation or are of unsupported types of operation. Return
909 true if they are, otherwise return false and indicate in *MATCHES
910 which stmts are not isomorphic to the first one. If MATCHES[0]
911 is false then this indicates the comparison could not be
912 carried out or the stmts will never be vectorized by SLP.
913
914 Note COND_EXPR is possibly isomorphic to another one after swapping its
915 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
916 the first stmt by swapping the two operands of comparison; set SWAP[i]
917 to 2 if stmt I is isomorphic to the first stmt by inverting the code
918 of comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
919 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
920
921 static bool
922 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
923 vec<stmt_vec_info> stmts, unsigned int group_size,
924 poly_uint64 *max_nunits, bool *matches,
925 bool *two_operators, tree *node_vectype)
926 {
927 unsigned int i;
928 stmt_vec_info first_stmt_info = stmts[0];
929 enum tree_code first_stmt_code = ERROR_MARK;
930 enum tree_code alt_stmt_code = ERROR_MARK;
931 enum tree_code rhs_code = ERROR_MARK;
932 enum tree_code first_cond_code = ERROR_MARK;
933 tree lhs;
934 bool need_same_oprnds = false;
935 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
936 optab optab;
937 int icode;
938 machine_mode optab_op2_mode;
939 machine_mode vec_mode;
940 stmt_vec_info first_load = NULL, prev_first_load = NULL;
941 bool first_stmt_load_p = false, load_p = false;
942 bool first_stmt_phi_p = false, phi_p = false;
943 bool maybe_soft_fail = false;
944 tree soft_fail_nunits_vectype = NULL_TREE;
945
946 /* For every stmt in NODE find its def stmt/s. */
947 stmt_vec_info stmt_info;
948 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
949 {
950 gimple *stmt = stmt_info->stmt;
951 swap[i] = 0;
952 matches[i] = false;
953
954 if (dump_enabled_p ())
955 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
956
957 /* Fail to vectorize statements marked as unvectorizable, that may
958 throw internally, or that have volatile operands. */
959 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
960 || stmt_can_throw_internal (cfun, stmt)
961 || gimple_has_volatile_ops (stmt))
962 {
963 if (dump_enabled_p ())
964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
965 "Build SLP failed: unvectorizable statement %G",
966 stmt);
967 /* ??? For BB vectorization we want to commutate operands in a way
968 to shuffle all unvectorizable defs into one operand and have
969 the other still vectorized. The following doesn't reliably
970 work for this though but it's the easiest we can do here. */
971 if (is_a <bb_vec_info> (vinfo) && i != 0)
972 continue;
973 /* Fatal mismatch. */
974 matches[0] = false;
975 return false;
976 }
977
978 lhs = gimple_get_lhs (stmt);
979 if (lhs == NULL_TREE)
980 {
981 if (dump_enabled_p ())
982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
983 "Build SLP failed: not GIMPLE_ASSIGN nor "
984 "GIMPLE_CALL %G", stmt);
985 if (is_a <bb_vec_info> (vinfo) && i != 0)
986 continue;
987 /* Fatal mismatch. */
988 matches[0] = false;
989 return false;
990 }
991
992 tree nunits_vectype;
993 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
994 &nunits_vectype, group_size))
995 {
996 if (is_a <bb_vec_info> (vinfo) && i != 0)
997 continue;
998 /* Fatal mismatch. */
999 matches[0] = false;
1000 return false;
1001 }
1002 /* Record nunits required but continue analysis, producing matches[]
1003 as if nunits was not an issue. This allows splitting of groups
1004 to happen. */
1005 if (nunits_vectype
1006 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1007 nunits_vectype, max_nunits))
1008 {
1009 gcc_assert (is_a <bb_vec_info> (vinfo));
1010 maybe_soft_fail = true;
1011 soft_fail_nunits_vectype = nunits_vectype;
1012 }
1013
1014 gcc_assert (vectype);
1015
1016 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1017 if (call_stmt)
1018 {
1019 rhs_code = CALL_EXPR;
1020
1021 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
1022 load_p = true;
1023 else if ((gimple_call_internal_p (call_stmt)
1024 && (!vectorizable_internal_fn_p
1025 (gimple_call_internal_fn (call_stmt))))
1026 || gimple_call_tail_p (call_stmt)
1027 || gimple_call_noreturn_p (call_stmt)
1028 || !gimple_call_nothrow_p (call_stmt)
1029 || gimple_call_chain (call_stmt))
1030 {
1031 if (dump_enabled_p ())
1032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1033 "Build SLP failed: unsupported call type %G",
1034 call_stmt);
1035 if (is_a <bb_vec_info> (vinfo) && i != 0)
1036 continue;
1037 /* Fatal mismatch. */
1038 matches[0] = false;
1039 return false;
1040 }
1041 }
1042 else if (gimple_code (stmt) == GIMPLE_PHI)
1043 {
1044 rhs_code = ERROR_MARK;
1045 phi_p = true;
1046 }
1047 else
1048 {
1049 rhs_code = gimple_assign_rhs_code (stmt);
1050 load_p = gimple_vuse (stmt);
1051 }
1052
1053 /* Check the operation. */
1054 if (i == 0)
1055 {
1056 *node_vectype = vectype;
1057 first_stmt_code = rhs_code;
1058 first_stmt_load_p = load_p;
1059 first_stmt_phi_p = phi_p;
1060
1061 /* Shift arguments should be equal in all the packed stmts for a
1062 vector shift with scalar shift operand. */
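/* Editor's illustration (not part of the original source): if the target
   only provides a vector/scalar shift, need_same_oprnds is set for
   { a0 << 3, a1 << 3 } and a lane such as a1 << 4 is later rejected as a
   mismatch; with a vector/vector shift the per-lane amounts may differ.  */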
1063 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1064 || rhs_code == LROTATE_EXPR
1065 || rhs_code == RROTATE_EXPR)
1066 {
1067 vec_mode = TYPE_MODE (vectype);
1068
1069 /* First see if we have a vector/vector shift. */
1070 optab = optab_for_tree_code (rhs_code, vectype,
1071 optab_vector);
1072
1073 if (!optab
1074 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1075 {
1076 /* No vector/vector shift, try for a vector/scalar shift. */
1077 optab = optab_for_tree_code (rhs_code, vectype,
1078 optab_scalar);
1079
1080 if (!optab)
1081 {
1082 if (dump_enabled_p ())
1083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1084 "Build SLP failed: no optab.\n");
1085 if (is_a <bb_vec_info> (vinfo) && i != 0)
1086 continue;
1087 /* Fatal mismatch. */
1088 matches[0] = false;
1089 return false;
1090 }
1091 icode = (int) optab_handler (optab, vec_mode);
1092 if (icode == CODE_FOR_nothing)
1093 {
1094 if (dump_enabled_p ())
1095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1096 "Build SLP failed: "
1097 "op not supported by target.\n");
1098 if (is_a <bb_vec_info> (vinfo) && i != 0)
1099 continue;
1100 /* Fatal mismatch. */
1101 matches[0] = false;
1102 return false;
1103 }
1104 optab_op2_mode = insn_data[icode].operand[2].mode;
1105 if (!VECTOR_MODE_P (optab_op2_mode))
1106 {
1107 need_same_oprnds = true;
1108 first_op1 = gimple_assign_rhs2 (stmt);
1109 }
1110 }
1111 }
1112 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1113 {
1114 need_same_oprnds = true;
1115 first_op1 = gimple_assign_rhs2 (stmt);
1116 }
1117 else if (!load_p
1118 && rhs_code == BIT_FIELD_REF)
1119 {
1120 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1121 if (!is_a <bb_vec_info> (vinfo)
1122 || TREE_CODE (vec) != SSA_NAME
1123 || !operand_equal_p (TYPE_SIZE (vectype),
1124 TYPE_SIZE (TREE_TYPE (vec))))
1125 {
1126 if (dump_enabled_p ())
1127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1128 "Build SLP failed: "
1129 "BIT_FIELD_REF not supported\n");
1130 /* Fatal mismatch. */
1131 matches[0] = false;
1132 return false;
1133 }
1134 }
1135 else if (call_stmt
1136 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1137 {
1138 need_same_oprnds = true;
1139 first_op1 = gimple_call_arg (call_stmt, 1);
1140 }
1141 }
1142 else
1143 {
1144 if (first_stmt_code != rhs_code
1145 && alt_stmt_code == ERROR_MARK)
1146 alt_stmt_code = rhs_code;
1147 if ((first_stmt_code != rhs_code
1148 && (first_stmt_code != IMAGPART_EXPR
1149 || rhs_code != REALPART_EXPR)
1150 && (first_stmt_code != REALPART_EXPR
1151 || rhs_code != IMAGPART_EXPR)
1152 /* Handle mismatches in plus/minus by computing both
1153 and merging the results. */
1154 && !((first_stmt_code == PLUS_EXPR
1155 || first_stmt_code == MINUS_EXPR)
1156 && (alt_stmt_code == PLUS_EXPR
1157 || alt_stmt_code == MINUS_EXPR)
1158 && rhs_code == alt_stmt_code)
1159 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1160 && (first_stmt_code == ARRAY_REF
1161 || first_stmt_code == BIT_FIELD_REF
1162 || first_stmt_code == INDIRECT_REF
1163 || first_stmt_code == COMPONENT_REF
1164 || first_stmt_code == MEM_REF)))
1165 || first_stmt_load_p != load_p
1166 || first_stmt_phi_p != phi_p)
1167 {
1168 if (dump_enabled_p ())
1169 {
1170 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1171 "Build SLP failed: different operation "
1172 "in stmt %G", stmt);
1173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1174 "original stmt %G", first_stmt_info->stmt);
1175 }
1176 /* Mismatch. */
1177 continue;
1178 }
1179
1180 if (!load_p
1181 && first_stmt_code == BIT_FIELD_REF
1182 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1183 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1184 {
1185 if (dump_enabled_p ())
1186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1187 "Build SLP failed: different BIT_FIELD_REF "
1188 "arguments in %G", stmt);
1189 /* Mismatch. */
1190 continue;
1191 }
1192
1193 if (!load_p && rhs_code == CALL_EXPR)
1194 {
1195 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1196 as_a <gcall *> (stmt)))
1197 {
1198 if (dump_enabled_p ())
1199 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1200 "Build SLP failed: different calls in %G",
1201 stmt);
1202 /* Mismatch. */
1203 continue;
1204 }
1205 }
1206
1207 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1208 && (gimple_bb (first_stmt_info->stmt)
1209 != gimple_bb (stmt_info->stmt)))
1210 {
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1213 "Build SLP failed: different BB for PHI "
1214 "or possibly trapping operation in %G", stmt);
1215 /* Mismatch. */
1216 continue;
1217 }
1218
1219 if (need_same_oprnds)
1220 {
1221 tree other_op1 = (call_stmt
1222 ? gimple_call_arg (call_stmt, 1)
1223 : gimple_assign_rhs2 (stmt));
1224 if (!operand_equal_p (first_op1, other_op1, 0))
1225 {
1226 if (dump_enabled_p ())
1227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1228 "Build SLP failed: different shift "
1229 "arguments in %G", stmt);
1230 /* Mismatch. */
1231 continue;
1232 }
1233 }
1234
1235 if (!types_compatible_p (vectype, *node_vectype))
1236 {
1237 if (dump_enabled_p ())
1238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1239 "Build SLP failed: different vector type "
1240 "in %G", stmt);
1241 /* Mismatch. */
1242 continue;
1243 }
1244 }
1245
1246 /* Grouped store or load. */
1247 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1248 {
1249 if (REFERENCE_CLASS_P (lhs))
1250 {
1251 /* Store. */
1252 ;
1253 }
1254 else
1255 {
1256 /* Load. */
1257 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1258 if (prev_first_load)
1259 {
1260 /* Check that there are no loads from different interleaving
1261 chains in the same node. */
1262 if (prev_first_load != first_load)
1263 {
1264 if (dump_enabled_p ())
1265 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1266 vect_location,
1267 "Build SLP failed: different "
1268 "interleaving chains in one node %G",
1269 stmt);
1270 /* Mismatch. */
1271 continue;
1272 }
1273 }
1274 else
1275 prev_first_load = first_load;
1276 }
1277 } /* Grouped access. */
1278 else
1279 {
1280 if (load_p)
1281 {
1282 /* Not grouped load. */
1283 if (dump_enabled_p ())
1284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1285 "Build SLP failed: not grouped load %G", stmt);
1286
1287 /* FORNOW: Not grouped loads are not supported. */
1288 if (is_a <bb_vec_info> (vinfo) && i != 0)
1289 continue;
1290 /* Fatal mismatch. */
1291 matches[0] = false;
1292 return false;
1293 }
1294
1295 /* Not memory operation. */
1296 if (!phi_p
1297 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1298 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1299 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1300 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1301 && rhs_code != VIEW_CONVERT_EXPR
1302 && rhs_code != CALL_EXPR
1303 && rhs_code != BIT_FIELD_REF)
1304 {
1305 if (dump_enabled_p ())
1306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1307 "Build SLP failed: operation unsupported %G",
1308 stmt);
1309 if (is_a <bb_vec_info> (vinfo) && i != 0)
1310 continue;
1311 /* Fatal mismatch. */
1312 matches[0] = false;
1313 return false;
1314 }
1315
1316 if (rhs_code == COND_EXPR)
1317 {
1318 tree cond_expr = gimple_assign_rhs1 (stmt);
1319 enum tree_code cond_code = TREE_CODE (cond_expr);
1320 enum tree_code swap_code = ERROR_MARK;
1321 enum tree_code invert_code = ERROR_MARK;
1322
1323 if (i == 0)
1324 first_cond_code = TREE_CODE (cond_expr);
1325 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1326 {
1327 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1328 swap_code = swap_tree_comparison (cond_code);
1329 invert_code = invert_tree_comparison (cond_code, honor_nans);
1330 }
1331
1332 if (first_cond_code == cond_code)
1333 ;
1334 /* Isomorphic can be achieved by swapping. */
1335 else if (first_cond_code == swap_code)
1336 swap[i] = 1;
1337 /* Isomorphic can be achieved by inverting. */
1338 else if (first_cond_code == invert_code)
1339 swap[i] = 2;
1340 else
1341 {
1342 if (dump_enabled_p ())
1343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1344 "Build SLP failed: different"
1345 " operation %G", stmt);
1346 /* Mismatch. */
1347 continue;
1348 }
1349 }
1350 }
1351
1352 matches[i] = true;
1353 }
1354
1355 for (i = 0; i < group_size; ++i)
1356 if (!matches[i])
1357 return false;
1358
1359 /* If we allowed a two-operation SLP node verify the target can cope
1360 with the permute we are going to use. */
1361 if (alt_stmt_code != ERROR_MARK
1362 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1363 {
1364 *two_operators = true;
1365 }
1366
1367 if (maybe_soft_fail)
1368 {
1369 unsigned HOST_WIDE_INT const_nunits;
1370 if (!TYPE_VECTOR_SUBPARTS
1371 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1372 || const_nunits > group_size)
1373 matches[0] = false;
1374 else
1375 {
1376 /* With constant vector elements simulate a mismatch at the
1377 point we need to split. */
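/* Editor's illustration (not part of the original source): with
   group_size == 6 and const_nunits == 4 (a power of two, which the
   masking below relies on), tail is 2 and matches[4] / matches[5] are
   cleared, so the caller splits the group after the first four lanes.  */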
1378 unsigned tail = group_size & (const_nunits - 1);
1379 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1380 }
1381 return false;
1382 }
1383
1384 return true;
1385 }
1386
1387 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1388 Note we never remove apart from at destruction time so we do not
1389 need a special value for deleted that differs from empty. */
1390 struct bst_traits
1391 {
1392 typedef vec <stmt_vec_info> value_type;
1393 typedef vec <stmt_vec_info> compare_type;
1394 static inline hashval_t hash (value_type);
1395 static inline bool equal (value_type existing, value_type candidate);
1396 static inline bool is_empty (value_type x) { return !x.exists (); }
1397 static inline bool is_deleted (value_type x) { return !x.exists (); }
1398 static const bool empty_zero_p = true;
1399 static inline void mark_empty (value_type &x) { x.release (); }
1400 static inline void mark_deleted (value_type &x) { x.release (); }
1401 static inline void remove (value_type &x) { x.release (); }
1402 };
1403 inline hashval_t
1404 bst_traits::hash (value_type x)
1405 {
1406 inchash::hash h;
1407 for (unsigned i = 0; i < x.length (); ++i)
1408 h.add_int (gimple_uid (x[i]->stmt));
1409 return h.end ();
1410 }
1411 inline bool
1412 bst_traits::equal (value_type existing, value_type candidate)
1413 {
1414 if (existing.length () != candidate.length ())
1415 return false;
1416 for (unsigned i = 0; i < existing.length (); ++i)
1417 if (existing[i] != candidate[i])
1418 return false;
1419 return true;
1420 }
1421
1422 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1423 but then vec::insert does memmove and that's not compatible with
1424 std::pair. */
1425 struct chain_op_t
1426 {
1427 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1428 : code (code_), dt (dt_), op (op_) {}
1429 tree_code code;
1430 vect_def_type dt;
1431 tree op;
1432 };
1433
1434 /* Comparator for sorting associatable chains. */
1435
1436 static int
1437 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1438 {
1439 auto *op1 = (const chain_op_t *) op1_;
1440 auto *op2 = (const chain_op_t *) op2_;
1441 if (op1->dt != op2->dt)
1442 return (int)op1->dt - (int)op2->dt;
1443 return (int)op1->code - (int)op2->code;
1444 }
1445
1446 /* Linearize the associatable expression chain at START with the
1447 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1448 filling CHAIN with the result and using WORKLIST as intermediate storage.
1449 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1450 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1451 stmts, starting with START. */
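
/* Editor's illustration (not part of the original source): for
   x = ((a - b) + c) - d and CODE == PLUS_EXPR the walk collects the leaves
   as { (+, a), (-, b), (+, c), (-, d) } in CHAIN, records the first
   PLUS_EXPR statement in CODE_STMT and the first MINUS_EXPR statement in
   ALT_CODE_STMT.  */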
1452
1453 static void
1454 vect_slp_linearize_chain (vec_info *vinfo,
1455 vec<std::pair<tree_code, gimple *> > &worklist,
1456 vec<chain_op_t> &chain,
1457 enum tree_code code, gimple *start,
1458 gimple *&code_stmt, gimple *&alt_code_stmt,
1459 vec<gimple *> *chain_stmts)
1460 {
1461 /* For each lane linearize the addition/subtraction (or other
1462 uniform associatable operation) expression tree. */
1463 worklist.safe_push (std::make_pair (code, start));
1464 while (!worklist.is_empty ())
1465 {
1466 auto entry = worklist.pop ();
1467 gassign *stmt = as_a <gassign *> (entry.second);
1468 enum tree_code in_code = entry.first;
1469 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1470 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1471 if (!code_stmt
1472 && gimple_assign_rhs_code (stmt) == code)
1473 code_stmt = stmt;
1474 else if (!alt_code_stmt
1475 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1476 alt_code_stmt = stmt;
1477 if (chain_stmts)
1478 chain_stmts->safe_push (stmt);
1479 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1480 {
1481 tree op = gimple_op (stmt, opnum);
1482 vect_def_type dt;
1483 stmt_vec_info def_stmt_info;
1484 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1485 gcc_assert (res);
1486 if (dt == vect_internal_def
1487 && is_pattern_stmt_p (def_stmt_info))
1488 op = gimple_get_lhs (def_stmt_info->stmt);
1489 gimple *use_stmt;
1490 use_operand_p use_p;
1491 if (dt == vect_internal_def
1492 && single_imm_use (op, &use_p, &use_stmt)
1493 && is_gimple_assign (def_stmt_info->stmt)
1494 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1495 || (code == PLUS_EXPR
1496 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1497 == MINUS_EXPR))))
1498 {
1499 tree_code op_def_code = this_code;
1500 if (op_def_code == MINUS_EXPR && opnum == 1)
1501 op_def_code = PLUS_EXPR;
1502 if (in_code == MINUS_EXPR)
1503 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1504 worklist.safe_push (std::make_pair (op_def_code,
1505 def_stmt_info->stmt));
1506 }
1507 else
1508 {
1509 tree_code op_def_code = this_code;
1510 if (op_def_code == MINUS_EXPR && opnum == 1)
1511 op_def_code = PLUS_EXPR;
1512 if (in_code == MINUS_EXPR)
1513 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1514 chain.safe_push (chain_op_t (op_def_code, dt, op));
1515 }
1516 }
1517 }
1518 }
1519
1520 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1521 simple_hashmap_traits <bst_traits, slp_tree> >
1522 scalar_stmts_to_slp_tree_map_t;
1523
1524 static slp_tree
1525 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1526 vec<stmt_vec_info> stmts, unsigned int group_size,
1527 poly_uint64 *max_nunits,
1528 bool *matches, unsigned *limit, unsigned *tree_size,
1529 scalar_stmts_to_slp_tree_map_t *bst_map);
1530
1531 static slp_tree
1532 vect_build_slp_tree (vec_info *vinfo,
1533 vec<stmt_vec_info> stmts, unsigned int group_size,
1534 poly_uint64 *max_nunits,
1535 bool *matches, unsigned *limit, unsigned *tree_size,
1536 scalar_stmts_to_slp_tree_map_t *bst_map)
1537 {
1538 if (slp_tree *leader = bst_map->get (stmts))
1539 {
1540 if (dump_enabled_p ())
1541 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1542 !(*leader)->failed ? "" : "failed ", *leader);
1543 if (!(*leader)->failed)
1544 {
1545 SLP_TREE_REF_COUNT (*leader)++;
1546 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1547 stmts.release ();
1548 return *leader;
1549 }
1550 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1551 return NULL;
1552 }
1553
1554 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1555 so we can pick up backedge destinations during discovery. */
1556 slp_tree res = new _slp_tree;
1557 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1558 SLP_TREE_SCALAR_STMTS (res) = stmts;
1559 bst_map->put (stmts.copy (), res);
1560
1561 if (*limit == 0)
1562 {
1563 if (dump_enabled_p ())
1564 dump_printf_loc (MSG_NOTE, vect_location,
1565 "SLP discovery limit exceeded\n");
1566 /* Mark the node invalid so we can detect those when still in use
1567 as backedge destinations. */
1568 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1569 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1570 res->failed = XNEWVEC (bool, group_size);
1571 memset (res->failed, 0, sizeof (bool) * group_size);
1572 memset (matches, 0, sizeof (bool) * group_size);
1573 return NULL;
1574 }
1575 --*limit;
1576
1577 if (dump_enabled_p ())
1578 dump_printf_loc (MSG_NOTE, vect_location,
1579 "starting SLP discovery for node %p\n", res);
1580
1581 poly_uint64 this_max_nunits = 1;
1582 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1583 &this_max_nunits,
1584 matches, limit, tree_size, bst_map);
1585 if (!res_)
1586 {
1587 if (dump_enabled_p ())
1588 dump_printf_loc (MSG_NOTE, vect_location,
1589 "SLP discovery for node %p failed\n", res);
1590 /* Mark the node invalid so we can detect those when still in use
1591 as backedge destinations. */
1592 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1593 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1594 res->failed = XNEWVEC (bool, group_size);
1595 if (flag_checking)
1596 {
1597 unsigned i;
1598 for (i = 0; i < group_size; ++i)
1599 if (!matches[i])
1600 break;
1601 gcc_assert (i < group_size);
1602 }
1603 memcpy (res->failed, matches, sizeof (bool) * group_size);
1604 }
1605 else
1606 {
1607 if (dump_enabled_p ())
1608 dump_printf_loc (MSG_NOTE, vect_location,
1609 "SLP discovery for node %p succeeded\n", res);
1610 gcc_assert (res_ == res);
1611 res->max_nunits = this_max_nunits;
1612 vect_update_max_nunits (max_nunits, this_max_nunits);
1613 /* Keep a reference for the bst_map use. */
1614 SLP_TREE_REF_COUNT (res)++;
1615 }
1616 return res_;
1617 }
1618
1619 /* Helper for building an associated SLP node chain. */
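
/* Editor's illustration (not part of the original source): for a lane set
   mixing two operations on the same operands, CHILD1 computes every lane
   with OPER1's operation and CHILD2 every lane with OPER2's, both from the
   shared children OP0 and OP1; PERM becomes a VEC_PERM_EXPR node whose
   LPERM entries (child index, lane) select per lane which result to use.  */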
1620
1621 static void
1622 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1623 slp_tree op0, slp_tree op1,
1624 stmt_vec_info oper1, stmt_vec_info oper2,
1625 vec<std::pair<unsigned, unsigned> > lperm)
1626 {
1627 unsigned group_size = SLP_TREE_LANES (op1);
1628
1629 slp_tree child1 = new _slp_tree;
1630 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1631 SLP_TREE_VECTYPE (child1) = vectype;
1632 SLP_TREE_LANES (child1) = group_size;
1633 SLP_TREE_CHILDREN (child1).create (2);
1634 SLP_TREE_CHILDREN (child1).quick_push (op0);
1635 SLP_TREE_CHILDREN (child1).quick_push (op1);
1636 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1637
1638 slp_tree child2 = new _slp_tree;
1639 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1640 SLP_TREE_VECTYPE (child2) = vectype;
1641 SLP_TREE_LANES (child2) = group_size;
1642 SLP_TREE_CHILDREN (child2).create (2);
1643 SLP_TREE_CHILDREN (child2).quick_push (op0);
1644 SLP_TREE_REF_COUNT (op0)++;
1645 SLP_TREE_CHILDREN (child2).quick_push (op1);
1646 SLP_TREE_REF_COUNT (op1)++;
1647 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1648
1649 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1650 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1651 SLP_TREE_VECTYPE (perm) = vectype;
1652 SLP_TREE_LANES (perm) = group_size;
1653 /* ??? We should set this NULL but that's not expected. */
1654 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1655 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1656 SLP_TREE_CHILDREN (perm).quick_push (child1);
1657 SLP_TREE_CHILDREN (perm).quick_push (child2);
1658 }
1659
1660 /* Recursively build an SLP tree starting from NODE.
1661 Fail (and return NULL) if def-stmts are not isomorphic, require
1662 data permutation or are of unsupported types of operation.
1663 Otherwise return NODE, filled in from STMTS.
1664 MATCHES indicates which lanes are consistent with the first one
1665 when the build fails. */
1666
1667 static slp_tree
1668 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1669 vec<stmt_vec_info> stmts, unsigned int group_size,
1670 poly_uint64 *max_nunits,
1671 bool *matches, unsigned *limit, unsigned *tree_size,
1672 scalar_stmts_to_slp_tree_map_t *bst_map)
1673 {
1674 unsigned nops, i, this_tree_size = 0;
1675 poly_uint64 this_max_nunits = *max_nunits;
1676
1677 matches[0] = false;
1678
1679 stmt_vec_info stmt_info = stmts[0];
1680 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1681 nops = gimple_call_num_args (stmt);
1682 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1683 {
1684 nops = gimple_num_ops (stmt) - 1;
1685 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1686 nops++;
1687 }
1688 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1689 nops = gimple_phi_num_args (phi);
1690 else
1691 return NULL;
1692
1693 /* If the SLP node is a PHI (induction or reduction), terminate
1694 the recursion. */
1695 bool *skip_args = XALLOCAVEC (bool, nops);
1696 memset (skip_args, 0, sizeof (bool) * nops);
1697 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1698 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1699 {
1700 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1701 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1702 group_size);
1703 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1704 max_nunits))
1705 return NULL;
1706
1707 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1708 if (def_type == vect_induction_def)
1709 {
1710 /* Induction PHIs are not cycles but walk the initial
1711 value. Only for inner loops though; for outer loops
1712 we need to pick up the value from the actual PHIs
1713 to more easily support peeling and epilogue vectorization. */
1714 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1715 if (!nested_in_vect_loop_p (loop, stmt_info))
1716 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1717 else
1718 loop = loop->inner;
1719 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1720 }
1721 else if (def_type == vect_reduction_def
1722 || def_type == vect_double_reduction_def
1723 || def_type == vect_nested_cycle)
1724 {
1725 /* Else def types have to match. */
1726 stmt_vec_info other_info;
1727 bool all_same = true;
1728 FOR_EACH_VEC_ELT (stmts, i, other_info)
1729 {
1730 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1731 return NULL;
1732 if (other_info != stmt_info)
1733 all_same = false;
1734 }
1735 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1736 /* Reduction initial values are not explicitly represented. */
1737 if (!nested_in_vect_loop_p (loop, stmt_info))
1738 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1739 /* Reduction chain backedge defs are filled manually.
1740 ??? Need a better way to identify a SLP reduction chain PHI.
1741 Or a better overall way to SLP match those. */
1742 if (all_same && def_type == vect_reduction_def)
1743 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1744 }
1745 else if (def_type != vect_internal_def)
1746 return NULL;
1747 }
1748
1749
1750 bool two_operators = false;
1751 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1752 tree vectype = NULL_TREE;
1753 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1754 &this_max_nunits, matches, &two_operators,
1755 &vectype))
1756 return NULL;
1757
1758 /* If the SLP node is a load, terminate the recursion unless masked. */
1759 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1760 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1761 {
1762 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1763 {
1764 /* Masked load. */
1765 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1766 nops = 1;
1767 }
1768 else
1769 {
1770 *max_nunits = this_max_nunits;
1771 (*tree_size)++;
1772 node = vect_create_new_slp_node (node, stmts, 0);
1773 SLP_TREE_VECTYPE (node) = vectype;
1774 /* And compute the load permutation. Whether it is actually
1775 a permutation depends on the unrolling factor which is
1776 decided later. */
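/* Editor's illustration (not part of the original source): if the group
   starting at a[0] is used by this node in the lane order a[2], a[0],
   a[3], a[1], the recorded load permutation is { 2, 0, 3, 1 }.  */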
1777 vec<unsigned> load_permutation;
1778 int j;
1779 stmt_vec_info load_info;
1780 load_permutation.create (group_size);
1781 stmt_vec_info first_stmt_info
1782 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1783 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1784 {
1785 int load_place = vect_get_place_in_interleaving_chain
1786 (load_info, first_stmt_info);
1787 gcc_assert (load_place != -1);
1788 load_permutation.safe_push (load_place);
1789 }
1790 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1791 return node;
1792 }
1793 }
1794 else if (gimple_assign_single_p (stmt_info->stmt)
1795 && !gimple_vuse (stmt_info->stmt)
1796 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1797 {
1798 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1799 the same SSA name vector of a type compatible with vectype. */
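      /* For example, with a V4SF vector V the lane extracts
	 BIT_FIELD_REF <V, 32, 64> and BIT_FIELD_REF <V, 32, 0> select
	 lanes 2 and 0, so the loop below records the lane permutation
	 { 0[2], 0[0] } over a single child node carrying V as an
	 existing vector def.  */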
1800 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1801 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1802 stmt_vec_info estmt_info;
1803 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1804 {
1805 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1806 tree bfref = gimple_assign_rhs1 (estmt);
1807 HOST_WIDE_INT lane;
1808 if (!known_eq (bit_field_size (bfref),
1809 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1810 || !constant_multiple_p (bit_field_offset (bfref),
1811 bit_field_size (bfref), &lane))
1812 {
1813 lperm.release ();
1814 return NULL;
1815 }
1816 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1817 }
1818 slp_tree vnode = vect_create_new_slp_node (vNULL);
1819 /* ??? We record vectype here but we hide eventually necessary
1820 punning and instead rely on code generation to materialize
1821 VIEW_CONVERT_EXPRs as necessary. We instead should make
1822 this explicit somehow. */
1823 SLP_TREE_VECTYPE (vnode) = vectype;
1824 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1825 /* We are always building a permutation node even if it is an identity
1826 permute to shield the rest of the vectorizer from the odd node
1827 representing an actual vector without any scalar ops.
1828 ??? We could hide it completely by making the permute node
1829 external? */
1830 node = vect_create_new_slp_node (node, stmts, 1);
1831 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1832 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1833 SLP_TREE_VECTYPE (node) = vectype;
1834 SLP_TREE_CHILDREN (node).quick_push (vnode);
1835 return node;
1836 }
1837 /* When discovery reaches an associatable operation, see whether we can
1838 improve that to match up lanes in a way superior to the operand
1839 swapping code which at most looks at two defs.
1840 ??? For BB vectorization we cannot do the brute-force search
1841 for matching as we can succeed by means of builds from scalars
1842 and have no good way to "cost" one build against another. */
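  /* As a sketch of the idea: for the two lanes { (a0 + b0) + c0,
     (a1 - b1) + c1 } the chains below linearize to { +a0, +b0, +c0 }
     and { +a1, -b1, +c1 }; the elements are then matched up position by
     position, and a mixed plus/minus (two-operator) node is built where
     the codes differ across lanes.  */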
1843 else if (is_a <loop_vec_info> (vinfo)
1844 /* ??? We don't handle !vect_internal_def defs below. */
1845 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1846 && is_gimple_assign (stmt_info->stmt)
1847 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1848 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1849 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1850 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1851 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1852 {
1853 /* See if we have a chain of (mixed) adds or subtracts or other
1854 associatable ops. */
1855 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1856 if (code == MINUS_EXPR)
1857 code = PLUS_EXPR;
1858 stmt_vec_info other_op_stmt_info = NULL;
1859 stmt_vec_info op_stmt_info = NULL;
1860 unsigned chain_len = 0;
1861 auto_vec<chain_op_t> chain;
1862 auto_vec<std::pair<tree_code, gimple *> > worklist;
1863 auto_vec<vec<chain_op_t> > chains (group_size);
1864 auto_vec<slp_tree, 4> children;
1865 bool hard_fail = true;
1866 for (unsigned lane = 0; lane < group_size; ++lane)
1867 {
1868 /* For each lane linearize the addition/subtraction (or other
1869 uniform associatable operation) expression tree. */
1870 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1871 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1872 stmts[lane]->stmt, op_stmt, other_op_stmt,
1873 NULL);
1874 if (!op_stmt_info && op_stmt)
1875 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1876 if (!other_op_stmt_info && other_op_stmt)
1877 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1878 if (chain.length () == 2)
1879 {
1880 /* In a chain of just two elements, resort to the regular
1881 operand swapping scheme. If we run into a length
1882 mismatch still hard-FAIL. */
1883 if (chain_len == 0)
1884 hard_fail = false;
1885 else
1886 {
1887 matches[lane] = false;
1888 /* ??? We might want to process the other lanes, but
1889 make sure to not give false matching hints to the
1890 caller for lanes we did not process. */
1891 if (lane != group_size - 1)
1892 matches[0] = false;
1893 }
1894 break;
1895 }
1896 else if (chain_len == 0)
1897 chain_len = chain.length ();
1898 else if (chain.length () != chain_len)
1899 {
1900 /* ??? Here we could slip in magic to compensate with
1901 neutral operands. */
1902 matches[lane] = false;
1903 if (lane != group_size - 1)
1904 matches[0] = false;
1905 break;
1906 }
1907 chains.quick_push (chain.copy ());
1908 chain.truncate (0);
1909 }
1910 if (chains.length () == group_size)
1911 {
1912 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
1913 if (!op_stmt_info)
1914 {
1915 hard_fail = false;
1916 goto out;
1917 }
1918 /* Now we have a set of chains with the same length. */
1919 /* 1. pre-sort according to def_type and operation. */
1920 for (unsigned lane = 0; lane < group_size; ++lane)
1921 chains[lane].stablesort (dt_sort_cmp, vinfo);
1922 if (dump_enabled_p ())
1923 {
1924 dump_printf_loc (MSG_NOTE, vect_location,
1925 "pre-sorted chains of %s\n",
1926 get_tree_code_name (code));
1927 for (unsigned lane = 0; lane < group_size; ++lane)
1928 {
1929 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1930 dump_printf (MSG_NOTE, "%s %T ",
1931 get_tree_code_name (chains[lane][opnum].code),
1932 chains[lane][opnum].op);
1933 dump_printf (MSG_NOTE, "\n");
1934 }
1935 }
1936 /* 2. try to build children nodes, associating as necessary. */
1937 for (unsigned n = 0; n < chain_len; ++n)
1938 {
1939 vect_def_type dt = chains[0][n].dt;
1940 unsigned lane;
1941 for (lane = 0; lane < group_size; ++lane)
1942 if (chains[lane][n].dt != dt)
1943 {
1944 if (dt == vect_constant_def
1945 && chains[lane][n].dt == vect_external_def)
1946 dt = vect_external_def;
1947 else if (dt == vect_external_def
1948 && chains[lane][n].dt == vect_constant_def)
1949 ;
1950 else
1951 break;
1952 }
1953 if (lane != group_size)
1954 {
1955 if (dump_enabled_p ())
1956 dump_printf_loc (MSG_NOTE, vect_location,
1957 "giving up on chain due to mismatched "
1958 "def types\n");
1959 matches[lane] = false;
1960 if (lane != group_size - 1)
1961 matches[0] = false;
1962 goto out;
1963 }
1964 if (dt == vect_constant_def
1965 || dt == vect_external_def)
1966 {
1967 /* We can always build those. Might want to sort last
1968 or defer building. */
1969 vec<tree> ops;
1970 ops.create (group_size);
1971 for (lane = 0; lane < group_size; ++lane)
1972 ops.quick_push (chains[lane][n].op);
1973 slp_tree child = vect_create_new_slp_node (ops);
1974 SLP_TREE_DEF_TYPE (child) = dt;
1975 children.safe_push (child);
1976 }
1977 else if (dt != vect_internal_def)
1978 {
1979 /* Not sure, we might need something special.
1980 gcc.dg/vect/pr96854.c,
1981 gfortran.dg/vect/fast-math-pr37021.f90
1982 and gfortran.dg/vect/pr61171.f trigger. */
1983 /* Soft-fail for now. */
1984 hard_fail = false;
1985 goto out;
1986 }
1987 else
1988 {
1989 vec<stmt_vec_info> op_stmts;
1990 op_stmts.create (group_size);
1991 slp_tree child = NULL;
1992 /* Brute-force our way. We have to consider a lane
1993 failing after fixing an earlier fail up in the
1994 SLP discovery recursion. So track the current
1995 permute per lane. */
1996 unsigned *perms = XALLOCAVEC (unsigned, group_size);
1997 memset (perms, 0, sizeof (unsigned) * group_size);
1998 do
1999 {
2000 op_stmts.truncate (0);
2001 for (lane = 0; lane < group_size; ++lane)
2002 op_stmts.quick_push
2003 (vinfo->lookup_def (chains[lane][n].op));
2004 child = vect_build_slp_tree (vinfo, op_stmts,
2005 group_size, &this_max_nunits,
2006 matches, limit,
2007 &this_tree_size, bst_map);
2008 /* ??? We're likely getting too many fatal mismatches
2009 here so maybe we want to ignore them (but then we
2010 have no idea which lanes fatally mismatched). */
2011 if (child || !matches[0])
2012 break;
2013 /* Swap another lane we have not yet matched up into
2014 lanes that did not match. If we run out of
2015 permute possibilities for a lane terminate the
2016 search. */
2017 bool term = false;
2018 for (lane = 1; lane < group_size; ++lane)
2019 if (!matches[lane])
2020 {
2021 if (n + perms[lane] + 1 == chain_len)
2022 {
2023 term = true;
2024 break;
2025 }
2026 std::swap (chains[lane][n],
2027 chains[lane][n + perms[lane] + 1]);
2028 perms[lane]++;
2029 }
2030 if (term)
2031 break;
2032 }
2033 while (1);
2034 if (!child)
2035 {
2036 if (dump_enabled_p ())
2037 dump_printf_loc (MSG_NOTE, vect_location,
2038 "failed to match up op %d\n", n);
2039 op_stmts.release ();
2040 if (lane != group_size - 1)
2041 matches[0] = false;
2042 else
2043 matches[lane] = false;
2044 goto out;
2045 }
2046 if (dump_enabled_p ())
2047 {
2048 dump_printf_loc (MSG_NOTE, vect_location,
2049 "matched up op %d to\n", n);
2050 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2051 }
2052 children.safe_push (child);
2053 }
2054 }
2055 /* 3. build SLP nodes to combine the chain. */
2056 for (unsigned lane = 0; lane < group_size; ++lane)
2057 if (chains[lane][0].code != code)
2058 {
2059 /* See if there's any alternate all-PLUS entry. */
2060 unsigned n;
2061 for (n = 1; n < chain_len; ++n)
2062 {
2063 for (lane = 0; lane < group_size; ++lane)
2064 if (chains[lane][n].code != code)
2065 break;
2066 if (lane == group_size)
2067 break;
2068 }
2069 if (n != chain_len)
2070 {
2071 /* Swap that in at first position. */
2072 std::swap (children[0], children[n]);
2073 for (lane = 0; lane < group_size; ++lane)
2074 std::swap (chains[lane][0], chains[lane][n]);
2075 }
2076 else
2077 {
2078 /* ??? When this triggers and we end up with two
2079 vect_constant/external_def up-front things break (ICE)
2080 spectacularly when finding an insertion place for the
2081 all-constant op. We should have a fully
2082 vect_internal_def operand though(?) so we can swap
2083 that into first place and then prepend the all-zero
2084 constant. */
2085 if (dump_enabled_p ())
2086 dump_printf_loc (MSG_NOTE, vect_location,
2087 "inserting constant zero to compensate "
2088 "for (partially) negated first "
2089 "operand\n");
2090 chain_len++;
2091 for (lane = 0; lane < group_size; ++lane)
2092 chains[lane].safe_insert
2093 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2094 vec<tree> zero_ops;
2095 zero_ops.create (group_size);
2096 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2097 for (lane = 1; lane < group_size; ++lane)
2098 zero_ops.quick_push (zero_ops[0]);
2099 slp_tree zero = vect_create_new_slp_node (zero_ops);
2100 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2101 children.safe_insert (0, zero);
2102 }
2103 break;
2104 }
2105 for (unsigned i = 1; i < children.length (); ++i)
2106 {
2107 slp_tree op0 = children[i - 1];
2108 slp_tree op1 = children[i];
2109 bool this_two_op = false;
2110 for (unsigned lane = 0; lane < group_size; ++lane)
2111 if (chains[lane][i].code != chains[0][i].code)
2112 {
2113 this_two_op = true;
2114 break;
2115 }
2116 slp_tree child;
2117 if (i == children.length () - 1)
2118 child = vect_create_new_slp_node (node, stmts, 2);
2119 else
2120 child = vect_create_new_slp_node (2, ERROR_MARK);
2121 if (this_two_op)
2122 {
2123 vec<std::pair<unsigned, unsigned> > lperm;
2124 lperm.create (group_size);
2125 for (unsigned lane = 0; lane < group_size; ++lane)
2126 lperm.quick_push (std::make_pair
2127 (chains[lane][i].code != chains[0][i].code, lane));
2128 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2129 (chains[0][i].code == code
2130 ? op_stmt_info
2131 : other_op_stmt_info),
2132 (chains[0][i].code == code
2133 ? other_op_stmt_info
2134 : op_stmt_info),
2135 lperm);
2136 }
2137 else
2138 {
2139 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2140 SLP_TREE_VECTYPE (child) = vectype;
2141 SLP_TREE_LANES (child) = group_size;
2142 SLP_TREE_CHILDREN (child).quick_push (op0);
2143 SLP_TREE_CHILDREN (child).quick_push (op1);
2144 SLP_TREE_REPRESENTATIVE (child)
2145 = (chains[0][i].code == code
2146 ? op_stmt_info : other_op_stmt_info);
2147 }
2148 children[i] = child;
2149 }
2150 *tree_size += this_tree_size + 1;
2151 *max_nunits = this_max_nunits;
2152 while (!chains.is_empty ())
2153 chains.pop ().release ();
2154 return node;
2155 }
2156 out:
2157 while (!children.is_empty ())
2158 vect_free_slp_tree (children.pop ());
2159 while (!chains.is_empty ())
2160 chains.pop ().release ();
2161 /* Hard-fail, otherwise we might run into quadratic processing of the
2162 chains by restarting discovery one stmt into the chain again. */
2163 if (hard_fail)
2164 return NULL;
2165 /* Fall thru to normal processing. */
2166 }
2167
2168 /* Get at the operands, verifying they are compatible. */
2169 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2170 slp_oprnd_info oprnd_info;
2171 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2172 {
2173 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2174 stmts, i, &oprnds_info);
2175 if (res != 0)
2176 matches[(res == -1) ? 0 : i] = false;
2177 if (!matches[0])
2178 break;
2179 }
2180 for (i = 0; i < group_size; ++i)
2181 if (!matches[i])
2182 {
2183 vect_free_oprnd_info (oprnds_info);
2184 return NULL;
2185 }
2186 swap = NULL;
2187
2188 auto_vec<slp_tree, 4> children;
2189
2190 stmt_info = stmts[0];
2191
2192 /* Create SLP_TREE nodes for the definition node/s. */
2193 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2194 {
2195 slp_tree child;
2196 unsigned int j;
2197
2198 /* We're skipping certain operands from processing, for example
2199 outer loop reduction initial defs. */
2200 if (skip_args[i])
2201 {
2202 children.safe_push (NULL);
2203 continue;
2204 }
2205
2206 if (oprnd_info->first_dt == vect_uninitialized_def)
2207 {
2208 /* COND_EXPRs have one operand too many here if the condition
2209 is an SSA name. */
2210 gcc_assert (i == 3 && nops == 4);
2211 continue;
2212 }
2213
2214 if (is_a <bb_vec_info> (vinfo)
2215 && oprnd_info->first_dt == vect_internal_def
2216 && !oprnd_info->any_pattern)
2217 {
2218 /* For BB vectorization, if all defs are the same do not
2219 bother to continue the build along the single-lane
2220 graph but use a splat of the scalar value. */
2221 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2222 for (j = 1; j < group_size; ++j)
2223 if (oprnd_info->def_stmts[j] != first_def)
2224 break;
2225 if (j == group_size
2226 /* But avoid doing this for loads where we may be
2227 able to CSE things, unless the stmt is not
2228 vectorizable. */
2229 && (!STMT_VINFO_VECTORIZABLE (first_def)
2230 || !gimple_vuse (first_def->stmt)))
2231 {
2232 if (dump_enabled_p ())
2233 dump_printf_loc (MSG_NOTE, vect_location,
2234 "Using a splat of the uniform operand\n");
2235 oprnd_info->first_dt = vect_external_def;
2236 }
2237 }
2238
2239 if (oprnd_info->first_dt == vect_external_def
2240 || oprnd_info->first_dt == vect_constant_def)
2241 {
2242 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2243 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2244 oprnd_info->ops = vNULL;
2245 children.safe_push (invnode);
2246 continue;
2247 }
2248
2249 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2250 group_size, &this_max_nunits,
2251 matches, limit,
2252 &this_tree_size, bst_map)) != NULL)
2253 {
2254 oprnd_info->def_stmts = vNULL;
2255 children.safe_push (child);
2256 continue;
2257 }
2258
2259 /* If the SLP build for operand zero failed and operands zero
2260 and one can be commuted, try that for the scalar stmts
2261 that failed the match. */
2262 if (i == 0
2263 /* A first scalar stmt mismatch signals a fatal mismatch. */
2264 && matches[0]
2265 /* ??? For COND_EXPRs we can swap the comparison operands
2266 as well as the arms under some constraints. */
2267 && nops == 2
2268 && oprnds_info[1]->first_dt == vect_internal_def
2269 && is_gimple_assign (stmt_info->stmt)
2270 /* Swapping operands for reductions breaks assumptions later on. */
2271 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2272 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2273 {
2274 /* See whether we can swap the matching or the non-matching
2275 stmt operands. */
2276 bool swap_not_matching = true;
2277 do
2278 {
2279 for (j = 0; j < group_size; ++j)
2280 {
2281 if (matches[j] != !swap_not_matching)
2282 continue;
2283 stmt_vec_info stmt_info = stmts[j];
2284 /* Verify if we can swap operands of this stmt. */
2285 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2286 if (!stmt
2287 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2288 {
2289 if (!swap_not_matching)
2290 goto fail;
2291 swap_not_matching = false;
2292 break;
2293 }
2294 }
2295 }
2296 while (j != group_size);
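	  /* For example, if only lane 1 of { a0 * b0, b1 * a1, a2 * b2,
	     a3 * b3 } failed to match, the commutative multiplication
	     allows swapping the two operands of just that lane before
	     retrying the build below.  */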
2297
2298 /* Swap mismatched definition stmts. */
2299 if (dump_enabled_p ())
2300 dump_printf_loc (MSG_NOTE, vect_location,
2301 "Re-trying with swapped operands of stmts ");
2302 for (j = 0; j < group_size; ++j)
2303 if (matches[j] == !swap_not_matching)
2304 {
2305 std::swap (oprnds_info[0]->def_stmts[j],
2306 oprnds_info[1]->def_stmts[j]);
2307 std::swap (oprnds_info[0]->ops[j],
2308 oprnds_info[1]->ops[j]);
2309 if (dump_enabled_p ())
2310 dump_printf (MSG_NOTE, "%d ", j);
2311 }
2312 if (dump_enabled_p ())
2313 dump_printf (MSG_NOTE, "\n");
2314 /* And try again with scratch 'matches' ... */
2315 bool *tem = XALLOCAVEC (bool, group_size);
2316 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2317 group_size, &this_max_nunits,
2318 tem, limit,
2319 &this_tree_size, bst_map)) != NULL)
2320 {
2321 oprnd_info->def_stmts = vNULL;
2322 children.safe_push (child);
2323 continue;
2324 }
2325 }
2326 fail:
2327
2328 /* If the SLP build failed and we analyze a basic-block
2329 simply treat nodes we fail to build as externally defined
2330 (and thus build vectors from the scalar defs).
2331 The cost model will reject outright expensive cases.
2332 ??? This doesn't treat cases where permutation ultimately
2333 fails (or we don't try permutation below). Ideally we'd
2334 even compute a permutation that will end up with the maximum
2335 SLP tree size... */
2336 if (is_a <bb_vec_info> (vinfo)
2337 /* ??? Rejecting patterns this way doesn't work. We'd have to
2338 do extra work to cancel the pattern so the uses see the
2339 scalar version. */
2340 && !is_pattern_stmt_p (stmt_info)
2341 && !oprnd_info->any_pattern)
2342 {
2343 /* But if there's a leading vector sized set of matching stmts
2344 fail here so we can split the group. This matches the condition
2345 vect_analyze_slp_instance uses. */
2346 /* ??? We might want to split here and combine the results to support
2347 multiple vector sizes better. */
2348 for (j = 0; j < group_size; ++j)
2349 if (!matches[j])
2350 break;
2351 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2352 {
2353 if (dump_enabled_p ())
2354 dump_printf_loc (MSG_NOTE, vect_location,
2355 "Building vector operands from scalars\n");
2356 this_tree_size++;
2357 child = vect_create_new_slp_node (oprnd_info->ops);
2358 children.safe_push (child);
2359 oprnd_info->ops = vNULL;
2360 continue;
2361 }
2362 }
2363
2364 gcc_assert (child == NULL);
2365 FOR_EACH_VEC_ELT (children, j, child)
2366 if (child)
2367 vect_free_slp_tree (child);
2368 vect_free_oprnd_info (oprnds_info);
2369 return NULL;
2370 }
2371
2372 vect_free_oprnd_info (oprnds_info);
2373
2374 /* If all children of this node are built up from uniform scalars,
2375 or building it requires more than one possibly expensive vector
2376 construction, just throw the node away, causing it to be built up
2377 from scalars instead. The exception is the SLP node for the vector store. */
2378 if (is_a <bb_vec_info> (vinfo)
2379 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2380 /* ??? Rejecting patterns this way doesn't work. We'd have to
2381 do extra work to cancel the pattern so the uses see the
2382 scalar version. */
2383 && !is_pattern_stmt_p (stmt_info))
2384 {
2385 slp_tree child;
2386 unsigned j;
2387 bool all_uniform_p = true;
2388 unsigned n_vector_builds = 0;
2389 FOR_EACH_VEC_ELT (children, j, child)
2390 {
2391 if (!child)
2392 ;
2393 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2394 all_uniform_p = false;
2395 else if (!vect_slp_tree_uniform_p (child))
2396 {
2397 all_uniform_p = false;
2398 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2399 n_vector_builds++;
2400 }
2401 }
2402 if (all_uniform_p
2403 || n_vector_builds > 1
2404 || (n_vector_builds == children.length ()
2405 && is_a <gphi *> (stmt_info->stmt)))
2406 {
2407 /* Roll back. */
2408 matches[0] = false;
2409 FOR_EACH_VEC_ELT (children, j, child)
2410 if (child)
2411 vect_free_slp_tree (child);
2412
2413 if (dump_enabled_p ())
2414 dump_printf_loc (MSG_NOTE, vect_location,
2415 "Building parent vector operands from "
2416 "scalars instead\n");
2417 return NULL;
2418 }
2419 }
2420
2421 *tree_size += this_tree_size + 1;
2422 *max_nunits = this_max_nunits;
2423
2424 if (two_operators)
2425 {
2426 /* ??? We'd likely want to either cache in bst_map something like
2427 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2428 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2429 explicit stmts to put in so the keying on 'stmts' doesn't
2430 work (but we have the same issue with nodes that use 'ops'). */
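    /* For example, for the group { a0 + b0, a1 - b1, a2 + b2, a3 - b3 }
       node ONE gets code PLUS_EXPR, node TWO gets MINUS_EXPR and the
       VEC_PERM_EXPR parent built below selects lanes
       { 0[0], 1[1], 0[2], 1[3] }.  */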
2431 slp_tree one = new _slp_tree;
2432 slp_tree two = new _slp_tree;
2433 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2434 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2435 SLP_TREE_VECTYPE (one) = vectype;
2436 SLP_TREE_VECTYPE (two) = vectype;
2437 SLP_TREE_CHILDREN (one).safe_splice (children);
2438 SLP_TREE_CHILDREN (two).safe_splice (children);
2439 slp_tree child;
2440 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2441 SLP_TREE_REF_COUNT (child)++;
2442
2443 /* Here we record the original defs since this
2444 node represents the final lane configuration. */
2445 node = vect_create_new_slp_node (node, stmts, 2);
2446 SLP_TREE_VECTYPE (node) = vectype;
2447 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2448 SLP_TREE_CHILDREN (node).quick_push (one);
2449 SLP_TREE_CHILDREN (node).quick_push (two);
2450 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2451 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2452 enum tree_code ocode = ERROR_MARK;
2453 stmt_vec_info ostmt_info;
2454 unsigned j = 0;
2455 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2456 {
2457 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2458 if (gimple_assign_rhs_code (ostmt) != code0)
2459 {
2460 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2461 ocode = gimple_assign_rhs_code (ostmt);
2462 j = i;
2463 }
2464 else
2465 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2466 }
2467 SLP_TREE_CODE (one) = code0;
2468 SLP_TREE_CODE (two) = ocode;
2469 SLP_TREE_LANES (one) = stmts.length ();
2470 SLP_TREE_LANES (two) = stmts.length ();
2471 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2472 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2473 return node;
2474 }
2475
2476 node = vect_create_new_slp_node (node, stmts, nops);
2477 SLP_TREE_VECTYPE (node) = vectype;
2478 SLP_TREE_CHILDREN (node).splice (children);
2479 return node;
2480 }
2481
2482 /* Dump a single SLP tree NODE. */
2483
2484 static void
2485 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2486 slp_tree node)
2487 {
2488 unsigned i, j;
2489 slp_tree child;
2490 stmt_vec_info stmt_info;
2491 tree op;
2492
2493 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2494 dump_user_location_t user_loc = loc.get_user_location ();
2495 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
2496 SLP_TREE_DEF_TYPE (node) == vect_external_def
2497 ? " (external)"
2498 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2499 ? " (constant)"
2500 : ""), node,
2501 estimated_poly_value (node->max_nunits),
2502 SLP_TREE_REF_COUNT (node));
2503 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2504 {
2505 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2506 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2507 else
2508 dump_printf_loc (metadata, user_loc, "op template: %G",
2509 SLP_TREE_REPRESENTATIVE (node)->stmt);
2510 }
2511 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2512 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2513 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2514 else
2515 {
2516 dump_printf_loc (metadata, user_loc, "\t{ ");
2517 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2518 dump_printf (metadata, "%T%s ", op,
2519 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2520 dump_printf (metadata, "}\n");
2521 }
2522 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2523 {
2524 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2525 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2526 dump_printf (dump_kind, " %u", j);
2527 dump_printf (dump_kind, " }\n");
2528 }
2529 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2530 {
2531 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2532 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2533 dump_printf (dump_kind, " %u[%u]",
2534 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2535 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2536 dump_printf (dump_kind, " }\n");
2537 }
2538 if (SLP_TREE_CHILDREN (node).is_empty ())
2539 return;
2540 dump_printf_loc (metadata, user_loc, "\tchildren");
2541 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2542 dump_printf (dump_kind, " %p", (void *)child);
2543 dump_printf (dump_kind, "\n");
2544 }
2545
2546 DEBUG_FUNCTION void
2547 debug (slp_tree node)
2548 {
2549 debug_dump_context ctx;
2550 vect_print_slp_tree (MSG_NOTE,
2551 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2552 node);
2553 }
2554
2555 /* Recursive helper for the dot producer below. */
2556
2557 static void
2558 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2559 {
2560 if (visited.add (node))
2561 return;
2562
2563 fprintf (f, "\"%p\" [label=\"", (void *)node);
2564 vect_print_slp_tree (MSG_NOTE,
2565 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2566 node);
2567 fprintf (f, "\"];\n");
2568
2569
2570 for (slp_tree child : SLP_TREE_CHILDREN (node))
2571 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2572
2573 for (slp_tree child : SLP_TREE_CHILDREN (node))
2574 dot_slp_tree (f, child, visited);
2575 }
2576
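/* This is meant to be invoked from the debugger, for example
   "call dot_slp_tree ("/tmp/slp.dot", node)", after which the file can
   be rendered with Graphviz, e.g. dot -Tpdf /tmp/slp.dot -o slp.pdf.  */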
2577 DEBUG_FUNCTION void
2578 dot_slp_tree (const char *fname, slp_tree node)
2579 {
2580 FILE *f = fopen (fname, "w");
2581 fprintf (f, "digraph {\n");
2582 fflush (f);
2583 {
2584 debug_dump_context ctx (f);
2585 hash_set<slp_tree> visited;
2586 dot_slp_tree (f, node, visited);
2587 }
2588 fflush (f);
2589 fprintf (f, "}\n");
2590 fclose (f);
2591 }
2592
2593 /* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND. */
2594
2595 static void
2596 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2597 slp_tree node, hash_set<slp_tree> &visited)
2598 {
2599 unsigned i;
2600 slp_tree child;
2601
2602 if (visited.add (node))
2603 return;
2604
2605 vect_print_slp_tree (dump_kind, loc, node);
2606
2607 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2608 if (child)
2609 vect_print_slp_graph (dump_kind, loc, child, visited);
2610 }
2611
2612 static void
2613 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2614 slp_tree entry)
2615 {
2616 hash_set<slp_tree> visited;
2617 vect_print_slp_graph (dump_kind, loc, entry, visited);
2618 }
2619
2620 /* Mark the tree rooted at NODE with PURE_SLP. */
2621
2622 static void
2623 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2624 {
2625 int i;
2626 stmt_vec_info stmt_info;
2627 slp_tree child;
2628
2629 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2630 return;
2631
2632 if (visited.add (node))
2633 return;
2634
2635 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2636 STMT_SLP_TYPE (stmt_info) = pure_slp;
2637
2638 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2639 if (child)
2640 vect_mark_slp_stmts (child, visited);
2641 }
2642
2643 static void
2644 vect_mark_slp_stmts (slp_tree node)
2645 {
2646 hash_set<slp_tree> visited;
2647 vect_mark_slp_stmts (node, visited);
2648 }
2649
2650 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2651
2652 static void
2653 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2654 {
2655 int i;
2656 stmt_vec_info stmt_info;
2657 slp_tree child;
2658
2659 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2660 return;
2661
2662 if (visited.add (node))
2663 return;
2664
2665 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2666 {
2667 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2668 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2669 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2670 }
2671
2672 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2673 if (child)
2674 vect_mark_slp_stmts_relevant (child, visited);
2675 }
2676
2677 static void
2678 vect_mark_slp_stmts_relevant (slp_tree node)
2679 {
2680 hash_set<slp_tree> visited;
2681 vect_mark_slp_stmts_relevant (node, visited);
2682 }
2683
2684
2685 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2686
2687 static void
2688 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2689 hash_set<slp_tree> &visited)
2690 {
2691 if (!node || visited.add (node))
2692 return;
2693
2694 if (SLP_TREE_CHILDREN (node).length () == 0)
2695 {
2696 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2697 return;
2698 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2699 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2700 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2701 loads.safe_push (node);
2702 }
2703 else
2704 {
2705 unsigned i;
2706 slp_tree child;
2707 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2708 vect_gather_slp_loads (loads, child, visited);
2709 }
2710 }
2711
2712
2713 /* Find the last scalar stmt in NODE. */
2714
2715 stmt_vec_info
2716 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2717 {
2718 stmt_vec_info last = NULL;
2719 stmt_vec_info stmt_vinfo;
2720
2721 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2722 {
2723 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2724 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2725 }
2726
2727 return last;
2728 }
2729
2730 /* Find the first stmt in NODE. */
2731
2732 stmt_vec_info
2733 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2734 {
2735 stmt_vec_info first = NULL;
2736 stmt_vec_info stmt_vinfo;
2737
2738 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2739 {
2740 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2741 if (!first
2742 || get_later_stmt (stmt_vinfo, first) == first)
2743 first = stmt_vinfo;
2744 }
2745
2746 return first;
2747 }
2748
2749 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2750 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2751 (also containing the first GROUP1_SIZE stmts, since stores are
2752 consecutive), the second containing the remainder.
2753 Return the first stmt in the second group. */
2754
2755 static stmt_vec_info
2756 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2757 {
2758 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2759 gcc_assert (group1_size > 0);
2760 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2761 gcc_assert (group2_size > 0);
2762 DR_GROUP_SIZE (first_vinfo) = group1_size;
2763
2764 stmt_vec_info stmt_info = first_vinfo;
2765 for (unsigned i = group1_size; i > 1; i--)
2766 {
2767 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2768 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2769 }
2770 /* STMT is now the last element of the first group. */
2771 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2772 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2773
2774 DR_GROUP_SIZE (group2) = group2_size;
2775 for (stmt_info = group2; stmt_info;
2776 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2777 {
2778 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2779 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2780 }
2781
2782 /* For the second group, the DR_GROUP_GAP is that before the original group,
2783 plus skipping over the first group's stmts. */
2784 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2785
2786 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2787 DR_GROUP_GAP (first_vinfo) += group2_size;
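  /* For example, splitting a gap-less group of 8 stores into 4 + 4 leaves
     both halves with a DR_GROUP_GAP of 4: within one scalar iteration each
     half skips over the other half's elements.  */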
2788
2789 if (dump_enabled_p ())
2790 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2791 group1_size, group2_size);
2792
2793 return group2;
2794 }
2795
2796 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2797 statements and a vector of NUNITS elements. */
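/* For example, with NUNITS = 4 and GROUP_SIZE = 6 the common multiple is
   12, so the unrolling factor is 12 / 6 = 2: two copies of the group are
   needed to fill an integral number of vectors.  */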
2798
2799 static poly_uint64
2800 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2801 {
2802 return exact_div (common_multiple (nunits, group_size), group_size);
2803 }
2804
2805 /* Helper that checks to see if a node is a load node. */
2806
2807 static inline bool
2808 vect_is_slp_load_node (slp_tree root)
2809 {
2810 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2811 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2812 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2813 }
2814
2815
2816 /* Helper function of optimize_load_redistribution that performs the operation
2817 recursively. */
2818
2819 static slp_tree
2820 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2821 vec_info *vinfo, unsigned int group_size,
2822 hash_map<slp_tree, slp_tree> *load_map,
2823 slp_tree root)
2824 {
2825 if (slp_tree *leader = load_map->get (root))
2826 return *leader;
2827
2828 slp_tree node;
2829 unsigned i;
2830
2831 /* For now, we don't know anything about externals so do not do anything. */
2832 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2833 return NULL;
2834 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2835 {
2836 /* First convert this node into a load node and add it to the leaves
2837 list, flattening the lane permutation into a load permutation. If it's
2838 unneeded it will be elided later. */
2839 vec<stmt_vec_info> stmts;
2840 stmts.create (SLP_TREE_LANES (root));
2841 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2842 for (unsigned j = 0; j < lane_perm.length (); j++)
2843 {
2844 std::pair<unsigned, unsigned> perm = lane_perm[j];
2845 node = SLP_TREE_CHILDREN (root)[perm.first];
2846
2847 if (!vect_is_slp_load_node (node)
2848 || SLP_TREE_CHILDREN (node).exists ())
2849 {
2850 stmts.release ();
2851 goto next;
2852 }
2853
2854 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2855 }
2856
2857 if (dump_enabled_p ())
2858 dump_printf_loc (MSG_NOTE, vect_location,
2859 "converting stmts on permute node %p\n", root);
2860
2861 bool *matches = XALLOCAVEC (bool, group_size);
2862 poly_uint64 max_nunits = 1;
2863 unsigned tree_size = 0, limit = 1;
2864 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2865 matches, &limit, &tree_size, bst_map);
2866 if (!node)
2867 stmts.release ();
2868
2869 load_map->put (root, node);
2870 return node;
2871 }
2872
2873 next:
2874 load_map->put (root, NULL);
2875
2876 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2877 {
2878 slp_tree value
2879 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2880 node);
2881 if (value)
2882 {
2883 SLP_TREE_REF_COUNT (value)++;
2884 SLP_TREE_CHILDREN (root)[i] = value;
2885 /* ??? We know the original leaves of the replaced nodes will
2886 be referenced by bst_map, only the permutes created by
2887 pattern matching are not. */
2888 if (SLP_TREE_REF_COUNT (node) == 1)
2889 load_map->remove (node);
2890 vect_free_slp_tree (node);
2891 }
2892 }
2893
2894 return NULL;
2895 }
2896
2897 /* Temporary workaround for loads not being CSEd during SLP build. This
2898 function will traverse the SLP tree rooted in ROOT and find
2899 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2900 same DR such that the final operation is equal to a permuted load. Such
2901 nodes are then directly converted into loads themselves. The nodes are
2902 CSEd using BST_MAP. */
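/* For example, a VEC_PERM node selecting lanes { 0[0], 1[0], 0[1], 1[1] }
   from two two-lane loads of the same DR group can be rebuilt as a single
   four-lane load node with an appropriate load permutation; this is a
   sketch of the intent, the actual rebuild goes through
   vect_build_slp_tree below.  */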
2903
2904 static void
2905 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2906 vec_info *vinfo, unsigned int group_size,
2907 hash_map<slp_tree, slp_tree> *load_map,
2908 slp_tree root)
2909 {
2910 slp_tree node;
2911 unsigned i;
2912
2913 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2914 {
2915 slp_tree value
2916 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2917 node);
2918 if (value)
2919 {
2920 SLP_TREE_REF_COUNT (value)++;
2921 SLP_TREE_CHILDREN (root)[i] = value;
2922 /* ??? We know the original leaves of the replaced nodes will
2923 be referenced by bst_map, only the permutes created by
2924 pattern matching are not. */
2925 if (SLP_TREE_REF_COUNT (node) == 1)
2926 load_map->remove (node);
2927 vect_free_slp_tree (node);
2928 }
2929 }
2930 }
2931
2932 /* Helper function of vect_match_slp_patterns.
2933
2934 Attempts to match patterns against the slp tree rooted in REF_NODE using
2935 VINFO. Patterns are matched in post-order traversal.
2936
2937 If any pattern matched, the value in REF_NODE is updated in place and
2938 true is returned; otherwise REF_NODE is left unchanged and false is returned. */
2939
2940 static bool
2941 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2942 slp_tree_to_load_perm_map_t *perm_cache,
2943 hash_set<slp_tree> *visited)
2944 {
2945 unsigned i;
2946 slp_tree node = *ref_node;
2947 bool found_p = false;
2948 if (!node || visited->add (node))
2949 return false;
2950
2951 slp_tree child;
2952 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2953 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2954 vinfo, perm_cache, visited);
2955
2956 for (unsigned x = 0; x < num__slp_patterns; x++)
2957 {
2958 vect_pattern *pattern = slp_patterns[x] (perm_cache, ref_node);
2959 if (pattern)
2960 {
2961 pattern->build (vinfo);
2962 delete pattern;
2963 found_p = true;
2964 }
2965 }
2966
2967 return found_p;
2968 }
2969
2970 /* Applies pattern matching to the SLP tree of INSTANCE using
2971 vec_info VINFO.
2972
2973 Returns true if any pattern matched. Patterns are tried in order and
2974 multiple patterns may match. */
2975
2976 static bool
2977 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2978 hash_set<slp_tree> *visited,
2979 slp_tree_to_load_perm_map_t *perm_cache)
2980 {
2981 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2982 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2983
2984 if (dump_enabled_p ())
2985 dump_printf_loc (MSG_NOTE, vect_location,
2986 "Analyzing SLP tree %p for patterns\n",
2987 SLP_INSTANCE_TREE (instance));
2988
2989 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
2990 }
2991
2992 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
2993 splitting into two, with the first split group having size NEW_GROUP_SIZE.
2994 Return true if we could use IFN_STORE_LANES instead and if that appears
2995 to be the better approach. */
2996
2997 static bool
2998 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
2999 unsigned int group_size,
3000 unsigned int new_group_size)
3001 {
3002 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3003 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3004 if (!vectype)
3005 return false;
3006 /* Allow the split if one of the two new groups would operate on full
3007 vectors *within* rather than across one scalar loop iteration.
3008 This is purely a heuristic, but it should work well for group
3009 sizes of 3 and 4, where the possible splits are:
3010
3011 3->2+1: OK if the vector has exactly two elements
3012 4->2+2: Likewise
3013 4->3+1: Less clear-cut. */
3014 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3015 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3016 return false;
3017 return vect_store_lanes_supported (vectype, group_size, false);
3018 }
3019
3020 /* Analyze an SLP instance starting from a group of grouped stores. Call
3021 vect_build_slp_tree to build a tree of packed stmts if possible.
3022 Return FALSE if it's impossible to SLP any stmt in the loop. */
3023
3024 static bool
3025 vect_analyze_slp_instance (vec_info *vinfo,
3026 scalar_stmts_to_slp_tree_map_t *bst_map,
3027 stmt_vec_info stmt_info, slp_instance_kind kind,
3028 unsigned max_tree_size, unsigned *limit);
3029
3030 /* Analyze an SLP instance starting from SCALAR_STMTS which form a group
3031 of kind KIND. Return true if successful. */
3032
3033 static bool
3034 vect_build_slp_instance (vec_info *vinfo,
3035 slp_instance_kind kind,
3036 vec<stmt_vec_info> &scalar_stmts,
3037 vec<stmt_vec_info> &root_stmt_infos,
3038 unsigned max_tree_size, unsigned *limit,
3039 scalar_stmts_to_slp_tree_map_t *bst_map,
3040 /* ??? We need stmt_info for group splitting. */
3041 stmt_vec_info stmt_info_)
3042 {
3043 if (dump_enabled_p ())
3044 {
3045 dump_printf_loc (MSG_NOTE, vect_location,
3046 "Starting SLP discovery for\n");
3047 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3048 dump_printf_loc (MSG_NOTE, vect_location,
3049 " %G", scalar_stmts[i]->stmt);
3050 }
3051
3052 /* Build the tree for the SLP instance. */
3053 unsigned int group_size = scalar_stmts.length ();
3054 bool *matches = XALLOCAVEC (bool, group_size);
3055 poly_uint64 max_nunits = 1;
3056 unsigned tree_size = 0;
3057 unsigned i;
3058 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3059 &max_nunits, matches, limit,
3060 &tree_size, bst_map);
3061 if (node != NULL)
3062 {
3063 /* Calculate the unrolling factor based on the smallest type. */
3064 poly_uint64 unrolling_factor
3065 = calculate_unrolling_factor (max_nunits, group_size);
3066
3067 if (maybe_ne (unrolling_factor, 1U)
3068 && is_a <bb_vec_info> (vinfo))
3069 {
3070 unsigned HOST_WIDE_INT const_max_nunits;
3071 if (!max_nunits.is_constant (&const_max_nunits)
3072 || const_max_nunits > group_size)
3073 {
3074 if (dump_enabled_p ())
3075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3076 "Build SLP failed: store group "
3077 "size not a multiple of the vector size "
3078 "in basic block SLP\n");
3079 vect_free_slp_tree (node);
3080 return false;
3081 }
3082 /* Fatal mismatch. */
3083 if (dump_enabled_p ())
3084 dump_printf_loc (MSG_NOTE, vect_location,
3085 "SLP discovery succeeded but node needs "
3086 "splitting\n");
3087 memset (matches, true, group_size);
3088 matches[group_size / const_max_nunits * const_max_nunits] = false;
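	  /* E.g. with a group size of 7 and const_max_nunits 4 this marks
	     lane 4 as mismatched, so the splitting code below breaks the
	     group into 4 + 3.  */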
3089 vect_free_slp_tree (node);
3090 }
3091 else
3092 {
3093 /* Create a new SLP instance. */
3094 slp_instance new_instance = XNEW (class _slp_instance);
3095 SLP_INSTANCE_TREE (new_instance) = node;
3096 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3097 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3098 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3099 SLP_INSTANCE_KIND (new_instance) = kind;
3100 new_instance->reduc_phis = NULL;
3101 new_instance->cost_vec = vNULL;
3102 new_instance->subgraph_entries = vNULL;
3103
3104 if (dump_enabled_p ())
3105 dump_printf_loc (MSG_NOTE, vect_location,
3106 "SLP size %u vs. limit %u.\n",
3107 tree_size, max_tree_size);
3108
3109 /* Fixup SLP reduction chains. */
3110 if (kind == slp_inst_kind_reduc_chain)
3111 {
3112 /* If this is a reduction chain with a conversion in front
3113 amend the SLP tree with a node for that. */
3114 gimple *scalar_def
3115 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3116 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3117 {
3118 /* Get at the conversion stmt - we know it's the single use
3119 of the last stmt of the reduction chain. */
3120 use_operand_p use_p;
3121 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3122 &use_p, &scalar_def);
3123 gcc_assert (r);
3124 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3125 next_info = vect_stmt_to_vectorize (next_info);
3126 scalar_stmts = vNULL;
3127 scalar_stmts.create (group_size);
3128 for (unsigned i = 0; i < group_size; ++i)
3129 scalar_stmts.quick_push (next_info);
3130 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3131 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3132 SLP_TREE_CHILDREN (conv).quick_push (node);
3133 SLP_INSTANCE_TREE (new_instance) = conv;
3134 /* We also have to fake this conversion stmt as SLP reduction
3135 group so we don't have to mess with too much code
3136 elsewhere. */
3137 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3138 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3139 }
3140 /* Fill the backedge child of the PHI SLP node. The
3141 general matching code cannot find it because the
3142 scalar code does not reflect how we vectorize the
3143 reduction. */
3144 use_operand_p use_p;
3145 imm_use_iterator imm_iter;
3146 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3147 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3148 gimple_get_lhs (scalar_def))
3149 /* There are exactly two non-debug uses, the reduction
3150 PHI and the loop-closed PHI node. */
3151 if (!is_gimple_debug (USE_STMT (use_p))
3152 && gimple_bb (USE_STMT (use_p)) == loop->header)
3153 {
3154 auto_vec<stmt_vec_info, 64> phis (group_size);
3155 stmt_vec_info phi_info
3156 = vinfo->lookup_stmt (USE_STMT (use_p));
3157 for (unsigned i = 0; i < group_size; ++i)
3158 phis.quick_push (phi_info);
3159 slp_tree *phi_node = bst_map->get (phis);
3160 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3161 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3162 = SLP_INSTANCE_TREE (new_instance);
3163 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3164 }
3165 }
3166
3167 vinfo->slp_instances.safe_push (new_instance);
3168
3169 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3170 the number of scalar stmts in the root in a few places.
3171 Verify that assumption holds. */
3172 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3173 .length () == group_size);
3174
3175 if (dump_enabled_p ())
3176 {
3177 dump_printf_loc (MSG_NOTE, vect_location,
3178 "Final SLP tree for instance %p:\n", new_instance);
3179 vect_print_slp_graph (MSG_NOTE, vect_location,
3180 SLP_INSTANCE_TREE (new_instance));
3181 }
3182
3183 return true;
3184 }
3185 }
3186 else
3187 {
3188 /* Failed to SLP. */
3189 /* Free the allocated memory. */
3190 scalar_stmts.release ();
3191 }
3192
3193 stmt_vec_info stmt_info = stmt_info_;
3194 /* Try to break the group up into pieces. */
3195 if (kind == slp_inst_kind_store)
3196 {
3197 /* ??? We could delay all the actual splitting of store-groups
3198 until after SLP discovery of the original group completed.
3199 Then we can recurse to vect_build_slp_instance directly. */
3200 for (i = 0; i < group_size; i++)
3201 if (!matches[i])
3202 break;
3203
3204 /* For basic block SLP, try to break the group up into multiples of
3205 a vector size. */
3206 if (is_a <bb_vec_info> (vinfo)
3207 && (i > 1 && i < group_size))
3208 {
3209 tree scalar_type
3210 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3211 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3212 1 << floor_log2 (i));
3213 unsigned HOST_WIDE_INT const_nunits;
3214 if (vectype
3215 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3216 {
3217 /* Split into two groups at the first vector boundary. */
3218 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3219 unsigned group1_size = i & ~(const_nunits - 1);
3220
3221 if (dump_enabled_p ())
3222 dump_printf_loc (MSG_NOTE, vect_location,
3223 "Splitting SLP group at stmt %u\n", i);
3224 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3225 group1_size);
3226 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3227 kind, max_tree_size,
3228 limit);
3229 /* Split the rest at the failure point and possibly
3230 re-analyze the remaining matching part if it has
3231 at least two lanes. */
3232 if (group1_size < i
3233 && (i + 1 < group_size
3234 || i - group1_size > 1))
3235 {
3236 stmt_vec_info rest2 = rest;
3237 rest = vect_split_slp_store_group (rest, i - group1_size);
3238 if (i - group1_size > 1)
3239 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3240 kind, max_tree_size,
3241 limit);
3242 }
3243 /* Re-analyze the non-matching tail if it has at least
3244 two lanes. */
3245 if (i + 1 < group_size)
3246 res |= vect_analyze_slp_instance (vinfo, bst_map,
3247 rest, kind, max_tree_size,
3248 limit);
3249 return res;
3250 }
3251 }
3252
3253 /* For loop vectorization split into arbitrary pieces of size > 1. */
3254 if (is_a <loop_vec_info> (vinfo)
3255 && (i > 1 && i < group_size)
3256 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3257 {
3258 unsigned group1_size = i;
3259
3260 if (dump_enabled_p ())
3261 dump_printf_loc (MSG_NOTE, vect_location,
3262 "Splitting SLP group at stmt %u\n", i);
3263
3264 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3265 group1_size);
3266 /* Loop vectorization cannot handle gaps in stores, make sure
3267 the split group appears as strided. */
3268 STMT_VINFO_STRIDED_P (rest) = 1;
3269 DR_GROUP_GAP (rest) = 0;
3270 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3271 DR_GROUP_GAP (stmt_info) = 0;
3272
3273 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3274 kind, max_tree_size, limit);
3275 if (i + 1 < group_size)
3276 res |= vect_analyze_slp_instance (vinfo, bst_map,
3277 rest, kind, max_tree_size, limit);
3278
3279 return res;
3280 }
3281
3282 /* Even though the first vector did not all match, we might be able to SLP
3283 (some) of the remainder. FORNOW ignore this possibility. */
3284 }
3285
3286 /* Failed to SLP. */
3287 if (dump_enabled_p ())
3288 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3289 return false;
3290 }
3291
3292
3293 /* Analyze an SLP instance starting from a group of grouped stores. Call
3294 vect_build_slp_tree to build a tree of packed stmts if possible.
3295 Return FALSE if it's impossible to SLP any stmt in the loop. */
3296
3297 static bool
3298 vect_analyze_slp_instance (vec_info *vinfo,
3299 scalar_stmts_to_slp_tree_map_t *bst_map,
3300 stmt_vec_info stmt_info,
3301 slp_instance_kind kind,
3302 unsigned max_tree_size, unsigned *limit)
3303 {
3304 unsigned int i;
3305 vec<stmt_vec_info> scalar_stmts;
3306
3307 if (is_a <bb_vec_info> (vinfo))
3308 vect_location = stmt_info->stmt;
3309
3310 stmt_vec_info next_info = stmt_info;
3311 if (kind == slp_inst_kind_store)
3312 {
3313 /* Collect the stores and store them in scalar_stmts. */
3314 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3315 while (next_info)
3316 {
3317 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3318 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3319 }
3320 }
3321 else if (kind == slp_inst_kind_reduc_chain)
3322 {
3323 /* Collect the reduction stmts and store them in scalar_stmts. */
3324 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3325 while (next_info)
3326 {
3327 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3328 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3329 }
3330 /* Mark the first element of the reduction chain as reduction to properly
3331 transform the node. In the reduction analysis phase only the last
3332 element of the chain is marked as reduction. */
3333 STMT_VINFO_DEF_TYPE (stmt_info)
3334 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3335 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3336 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3337 }
3338 else if (kind == slp_inst_kind_ctor)
3339 {
3340 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
3341 tree val;
3342 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
3343 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
3344 {
3345 stmt_vec_info def_info = vinfo->lookup_def (val);
3346 def_info = vect_stmt_to_vectorize (def_info);
3347 scalar_stmts.quick_push (def_info);
3348 }
3349 if (dump_enabled_p ())
3350 dump_printf_loc (MSG_NOTE, vect_location,
3351 "Analyzing vectorizable constructor: %G\n",
3352 stmt_info->stmt);
3353 }
3354 else if (kind == slp_inst_kind_reduc_group)
3355 {
3356 /* Collect reduction statements. */
3357 const vec<stmt_vec_info> &reductions
3358 = as_a <loop_vec_info> (vinfo)->reductions;
3359 scalar_stmts.create (reductions.length ());
3360 for (i = 0; reductions.iterate (i, &next_info); i++)
3361 if (STMT_VINFO_RELEVANT_P (next_info)
3362 || STMT_VINFO_LIVE_P (next_info))
3363 scalar_stmts.quick_push (next_info);
3364 /* If less than two were relevant/live there's nothing to SLP. */
3365 if (scalar_stmts.length () < 2)
3366 return false;
3367 }
3368 else
3369 gcc_unreachable ();
3370
3371 vec<stmt_vec_info> roots = vNULL;
3372 if (kind == slp_inst_kind_ctor)
3373 {
3374 roots.create (1);
3375 roots.quick_push (stmt_info);
3376 }
3377 /* Build the tree for the SLP instance. */
3378 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3379 roots,
3380 max_tree_size, limit, bst_map,
3381 kind == slp_inst_kind_store
3382 ? stmt_info : NULL);
3383 if (!res)
3384 roots.release ();
3385
3386 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3387 where we should do store group splitting. */
3388
3389 return res;
3390 }
3391
3392 /* Check if there are stmts in the loop that can be vectorized using SLP.
3393 Build SLP trees of packed scalar stmts if SLP is possible. */
3394
3395 opt_result
3396 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3397 {
3398 unsigned int i;
3399 stmt_vec_info first_element;
3400 slp_instance instance;
3401
3402 DUMP_VECT_SCOPE ("vect_analyze_slp");
3403
3404 unsigned limit = max_tree_size;
3405
3406 scalar_stmts_to_slp_tree_map_t *bst_map
3407 = new scalar_stmts_to_slp_tree_map_t ();
3408
3409 /* Find SLP sequences starting from groups of grouped stores. */
3410 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3411 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3412 STMT_VINFO_GROUPED_ACCESS (first_element)
3413 ? slp_inst_kind_store : slp_inst_kind_ctor,
3414 max_tree_size, &limit);
3415
3416 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3417 {
3418 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3419 {
3420 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3421 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3422 bb_vinfo->roots[i].stmts,
3423 bb_vinfo->roots[i].roots,
3424 max_tree_size, &limit, bst_map, NULL))
3425 {
3426 bb_vinfo->roots[i].stmts = vNULL;
3427 bb_vinfo->roots[i].roots = vNULL;
3428 }
3429 }
3430 }
3431
3432 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3433 {
3434 /* Find SLP sequences starting from reduction chains. */
3435 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3436 if (! STMT_VINFO_RELEVANT_P (first_element)
3437 && ! STMT_VINFO_LIVE_P (first_element))
3438 ;
3439 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3440 slp_inst_kind_reduc_chain,
3441 max_tree_size, &limit))
3442 {
3443 /* Dissolve reduction chain group. */
3444 stmt_vec_info vinfo = first_element;
3445 stmt_vec_info last = NULL;
3446 while (vinfo)
3447 {
3448 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3449 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3450 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3451 last = vinfo;
3452 vinfo = next;
3453 }
3454 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3455 /* It can be still vectorized as part of an SLP reduction. */
3456 loop_vinfo->reductions.safe_push (last);
3457 }
3458
3459 /* Find SLP sequences starting from groups of reductions. */
3460 if (loop_vinfo->reductions.length () > 1)
3461 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3462 slp_inst_kind_reduc_group, max_tree_size,
3463 &limit);
3464 }
3465
3466 hash_set<slp_tree> visited_patterns;
3467 slp_tree_to_load_perm_map_t perm_cache;
3468
3469 /* See if any patterns can be found in the SLP tree. */
3470 bool pattern_found = false;
3471 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3472 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3473 &visited_patterns, &perm_cache);
3474
3475 /* If any were found, optimize permutations of loads. */
3476 if (pattern_found)
3477 {
3478 hash_map<slp_tree, slp_tree> load_map;
3479 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3480 {
3481 slp_tree root = SLP_INSTANCE_TREE (instance);
3482 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3483 &load_map, root);
3484 }
3485 }
3486
3487
3488
3489 /* The map keeps a reference on SLP nodes built, release that. */
3490 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3491 it != bst_map->end (); ++it)
3492 if ((*it).second)
3493 vect_free_slp_tree ((*it).second);
3494 delete bst_map;
3495
3496 if (pattern_found && dump_enabled_p ())
3497 {
3498 dump_printf_loc (MSG_NOTE, vect_location,
3499 "Pattern matched SLP tree\n");
3500 hash_set<slp_tree> visited;
3501 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3502 vect_print_slp_graph (MSG_NOTE, vect_location,
3503 SLP_INSTANCE_TREE (instance), visited);
3504 }
3505
3506 return opt_result::success ();
3507 }
3508
3509 struct slpg_vertex
3510 {
3511 slpg_vertex (slp_tree node_)
3512 : node (node_), perm_in (-1), perm_out (-1) {}
3513
3514 int get_perm_materialized () const
3515 { return perm_in != perm_out ? perm_in : 0; }
3516
3517 slp_tree node;
3518 /* The common permutation on the incoming lanes (towards SLP children). */
3519 int perm_in;
3520 /* The permutation on the outgoing lanes (towards SLP parents). When
3521 the node is a materialization point for a permute this differs
3522 from perm_in (and is then usually zero). Materialization happens
3523 on the input side. */
3524 int perm_out;
3525 };
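/* Illustrative note (not part of the original sources): a node whose users
   all expect the same permute p keeps perm_in == perm_out == p, so the
   permute stays pending and keeps flowing towards the users. If propagation
   finds a user that needs a different permute, perm_out is reset to 0 and
   the node becomes a materialization point; get_perm_materialized () then
   returns perm_in. */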
3526
3527 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3528
3529 static void
3530 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
3531 vec<slpg_vertex> &vertices, vec<int> &leafs)
3532 {
3533 unsigned i;
3534 slp_tree child;
3535
3536 if (visited.add (node))
3537 return;
3538
3539 node->vertex = vertices.length ();
3540 vertices.safe_push (slpg_vertex (node));
3541
3542 bool leaf = true;
3543 bool force_leaf = false;
3544 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3545 if (child)
3546 {
3547 leaf = false;
3548 vect_slp_build_vertices (visited, child, vertices, leafs);
3549 }
3550 else
3551 force_leaf = true;
3552 /* Since SLP discovery works along use-def edges all cycles have an
3553 entry - but there's the exception of cycles where we do not handle
3554 the entry explicitly (but with a NULL SLP node), like some reductions
3555 and inductions. Force those SLP PHIs to act as leafs to make them
3556 backwards reachable. */
3557 if (leaf || force_leaf)
3558 leafs.safe_push (node->vertex);
3559 }
3560
3561 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3562
3563 static void
3564 vect_slp_build_vertices (vec_info *info, vec<slpg_vertex> &vertices,
3565 vec<int> &leafs)
3566 {
3567 hash_set<slp_tree> visited;
3568 unsigned i;
3569 slp_instance instance;
3570 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
3571 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
3572 leafs);
3573 }
3574
3575 /* Apply (reverse) bijective PERM to VEC. */
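/* Illustrative example (assumed data, not from the sources): with
   perm = { 2, 0, 1 } and vec = { a, b, c } a forward application
   (reverse == false) computes vec[i] = saved[perm[i]], yielding { c, a, b },
   while a reverse application computes vec[perm[i]] = saved[i], yielding
   { b, c, a }. Applying the permute forward and then in reverse restores
   the original vector. */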
3576
3577 template <class T>
3578 static void
3579 vect_slp_permute (vec<unsigned> perm,
3580 vec<T> &vec, bool reverse)
3581 {
3582 auto_vec<T, 64> saved;
3583 saved.create (vec.length ());
3584 for (unsigned i = 0; i < vec.length (); ++i)
3585 saved.quick_push (vec[i]);
3586
3587 if (reverse)
3588 {
3589 for (unsigned i = 0; i < vec.length (); ++i)
3590 vec[perm[i]] = saved[i];
3591 for (unsigned i = 0; i < vec.length (); ++i)
3592 gcc_assert (vec[perm[i]] == saved[i]);
3593 }
3594 else
3595 {
3596 for (unsigned i = 0; i < vec.length (); ++i)
3597 vec[i] = saved[perm[i]];
3598 for (unsigned i = 0; i < vec.length (); ++i)
3599 gcc_assert (vec[i] == saved[perm[i]]);
3600 }
3601 }
3602
3603 /* Return whether permutations PERM_A and PERM_B as recorded in the
3604 PERMS vector are equal. */
3605
3606 static bool
3607 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
3608 int perm_a, int perm_b)
3609 {
3610 return (perm_a == perm_b
3611 || (perm_a != -1 && perm_b != -1
3612 && perms[perm_a].length () == perms[perm_b].length ()
3613 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
3614 sizeof (unsigned) * perms[perm_a].length ()) == 0));
3615 }
3616
3617 /* Optimize the SLP graph of VINFO. */
3618
3619 void
3620 vect_optimize_slp (vec_info *vinfo)
3621 {
3622 if (vinfo->slp_instances.is_empty ())
3623 return;
3624
3625 slp_tree node;
3626 unsigned i;
3627 auto_vec<slpg_vertex> vertices;
3628 auto_vec<int> leafs;
3629 vect_slp_build_vertices (vinfo, vertices, leafs);
3630
3631 struct graph *slpg = new_graph (vertices.length ());
3632 for (slpg_vertex &v : vertices)
3633 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
3634 if (child)
3635 add_edge (slpg, v.node->vertex, child->vertex);
3636
3637 /* Compute (reverse) postorder on the inverted graph. */
3638 auto_vec<int> ipo;
3639 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3640
3641 auto_vec<vec<unsigned> > perms;
3642 perms.safe_push (vNULL); /* zero is no permute */
3643
3644 /* Produce initial permutations. */
3645 for (i = 0; i < leafs.length (); ++i)
3646 {
3647 int idx = leafs[i];
3648 slp_tree node = vertices[idx].node;
3649
3650 /* Handle externals and constants optimistically throughout the
3651 iteration. But treat existing vectors as fixed since we
3652 do not handle permuting them below. */
3653 if ((SLP_TREE_DEF_TYPE (node) == vect_external_def
3654 && !SLP_TREE_VEC_DEFS (node).exists ())
3655 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3656 continue;
3657
3658 /* Leafs do not change across iterations. Note leafs also double
3659 as entries to the reverse graph. */
3660 if (!slpg->vertices[idx].succ)
3661 {
3662 vertices[idx].perm_in = 0;
3663 vertices[idx].perm_out = 0;
3664 }
3665
3666 /* Loads are the only thing generating permutes. */
3667 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3668 continue;
3669
3670 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3671 node unpermuted, record this permute. */
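/* Illustrative example (assumed data): a load node with
   SLP_TREE_LOAD_PERMUTATION { 5, 4, 7, 6 } has imin == 4 and imax == 7,
   spans exactly SLP_TREE_LANES == 4 elements and is bijective, so we
   record the permute { 1, 0, 3, 2 } and can treat the load itself as the
   contiguous subgroup { 4, 5, 6, 7 }. */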
3672 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3673 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3674 continue;
3675 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3676 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3677 bool any_permute = false;
3678 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3679 {
3680 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3681 imin = MIN (imin, idx);
3682 imax = MAX (imax, idx);
3683 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3684 any_permute = true;
3685 }
3686 /* If there's no permute there's no need to split one out. */
3687 if (!any_permute)
3688 continue;
3689 /* If the span doesn't match we'd disrupt VF computation, avoid
3690 that for now. */
3691 if (imax - imin + 1 != SLP_TREE_LANES (node))
3692 continue;
3693
3694 /* For now only handle true permutes, like
3695 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3696 when permuting constants and invariants keeping the permute
3697 bijective. */
3698 auto_sbitmap load_index (SLP_TREE_LANES (node));
3699 bitmap_clear (load_index);
3700 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3701 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3702 unsigned j;
3703 for (j = 0; j < SLP_TREE_LANES (node); ++j)
3704 if (!bitmap_bit_p (load_index, j))
3705 break;
3706 if (j != SLP_TREE_LANES (node))
3707 continue;
3708
3709 vec<unsigned> perm = vNULL;
3710 perm.safe_grow (SLP_TREE_LANES (node), true);
3711 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3712 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
3713 perms.safe_push (perm);
3714 vertices[idx].perm_in = perms.length () - 1;
3715 vertices[idx].perm_out = perms.length () - 1;
3716 }
3717
3718 /* Propagate permutes along the graph and compute materialization points. */
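/* Illustrative summary (see the code below for the precise rules): a node's
   perm_in is the meet of its children's perm_out, where unknown (-1)
   children are handled optimistically, matching permutes propagate, and a
   mismatch or an identity (0) child collapses perm_in to 0. Once this first
   phase converges, materialization is switched on and a node keeps a
   nonzero perm_out only if all of its users agree on that permute;
   otherwise the permute is materialized at the node. For example, two
   loads both carrying { 1, 0 } feeding an add let the permute flow through
   the add, while mixing one of them with an unpermuted operand forces
   materialization. */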
3719 bool changed;
3720 bool do_materialization = false;
3721 unsigned iteration = 0;
3722 do
3723 {
3724 changed = false;
3725 ++iteration;
3726
3727 if (dump_enabled_p ())
3728 dump_printf_loc (MSG_NOTE, vect_location,
3729 "SLP optimize iteration %d\n", iteration);
3730
3731 for (i = vertices.length (); i > 0 ; --i)
3732 {
3733 int idx = ipo[i-1];
3734 slp_tree node = vertices[idx].node;
3735
3736 /* Handle externals and constants optimistically throughout the
3737 iteration. */
3738 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3739 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3740 continue;
3741
3742 /* We still eventually have failed backedge SLP nodes in the
3743 graph, those are only cancelled when analyzing operations.
3744 Simply treat them as transparent ops, propagating permutes
3745 through them. */
3746 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3747 {
3748 /* We do not handle stores with a permutation, so all
3749 incoming permutes must have been materialized. */
3750 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
3751 if (STMT_VINFO_DATA_REF (rep)
3752 && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
3753 {
3754 /* ??? We're forcing materialization in place
3755 of the child here, we'd need special handling
3756 in materialization to leave perm_in -1 here. */
3757 vertices[idx].perm_in = 0;
3758 vertices[idx].perm_out = 0;
3759 }
3760 /* We cannot move a permute across an operation that does
3761 not operate independently on lanes. Note this is an explicit
3762 negative list since that's much shorter than the respective
3763 positive one but it's critical to keep maintaining it. */
3764 if (is_gimple_call (STMT_VINFO_STMT (rep)))
3765 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
3766 {
3767 case CFN_COMPLEX_ADD_ROT90:
3768 case CFN_COMPLEX_ADD_ROT270:
3769 case CFN_COMPLEX_MUL:
3770 case CFN_COMPLEX_MUL_CONJ:
3771 case CFN_VEC_ADDSUB:
3772 case CFN_VEC_FMADDSUB:
3773 case CFN_VEC_FMSUBADD:
3774 vertices[idx].perm_in = 0;
3775 vertices[idx].perm_out = 0;
3776 default:;
3777 }
3778 }
3779
3780 if (!slpg->vertices[idx].succ)
3781 /* Pick up pre-computed leaf values. */
3782 ;
3783 else
3784 {
3785 bool any_succ_perm_out_m1 = false;
3786 int perm_in = vertices[idx].perm_in;
3787 for (graph_edge *succ = slpg->vertices[idx].succ;
3788 succ; succ = succ->succ_next)
3789 {
3790 int succ_idx = succ->dest;
3791 int succ_perm = vertices[succ_idx].perm_out;
3792 /* Handle unvisited (and constant) nodes optimistically. */
3793 /* ??? But for constants once we want to handle
3794 non-bijective permutes we have to verify the permute,
3795 when unifying lanes, will not unify different constants.
3796 For example see gcc.dg/vect/bb-slp-14.c for a case
3797 that would break. */
3798 if (succ_perm == -1)
3799 {
3800 /* When we handled a non-leaf optimistically, note
3801 that so we can adjust its outgoing permute below. */
3802 slp_tree succ_node = vertices[succ_idx].node;
3803 if (SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3804 && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3805 any_succ_perm_out_m1 = true;
3806 continue;
3807 }
3808 if (perm_in == -1)
3809 perm_in = succ_perm;
3810 else if (succ_perm == 0
3811 || !vect_slp_perms_eq (perms, perm_in, succ_perm))
3812 {
3813 perm_in = 0;
3814 break;
3815 }
3816 }
3817
3818 /* Adjust any incoming permutes we treated optimistically. */
3819 if (perm_in != -1 && any_succ_perm_out_m1)
3820 {
3821 for (graph_edge *succ = slpg->vertices[idx].succ;
3822 succ; succ = succ->succ_next)
3823 {
3824 slp_tree succ_node = vertices[succ->dest].node;
3825 if (vertices[succ->dest].perm_out == -1
3826 && SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3827 && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3828 {
3829 vertices[succ->dest].perm_out = perm_in;
3830 /* And ensure this propagates. */
3831 if (vertices[succ->dest].perm_in == -1)
3832 vertices[succ->dest].perm_in = perm_in;
3833 }
3834 }
3835 changed = true;
3836 }
3837
3838 if (!vect_slp_perms_eq (perms, perm_in,
3839 vertices[idx].perm_in))
3840 {
3841 /* Make sure we eventually converge. */
3842 gcc_checking_assert (vertices[idx].perm_in == -1
3843 || perm_in == 0);
3844 vertices[idx].perm_in = perm_in;
3845
3846 /* While we can handle VEC_PERM nodes as transparent
3847 pass-through they can be a cheap materialization
3848 point as well. In addition they can act as the source
3849 of an arbitrary permutation.
3850 The following ensures that former materialization
3851 points that now have zero incoming permutes no
3852 longer appear as such and that former "any" permutes
3853 get pass-through. We keep VEC_PERM nodes optimistic
3854 as "any" outgoing permute though. */
3855 if (vertices[idx].perm_out != 0
3856 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
3857 vertices[idx].perm_out = perm_in;
3858 changed = true;
3859 }
3860 }
3861
3862 /* Elide pruning at materialization points in the first
3863 iteration phase. */
3864 if (!do_materialization)
3865 continue;
3866
3867 int perm = vertices[idx].perm_out;
3868 if (perm == 0 || perm == -1)
3869 continue;
3870
3871 /* Decide on permute materialization. Look whether there's
3872 a use (pred) edge that is permuted differently than us.
3873 In that case mark ourselves so the permutation is applied. */
3874 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
3875 if (all_preds_permuted)
3876 for (graph_edge *pred = slpg->vertices[idx].pred;
3877 pred; pred = pred->pred_next)
3878 {
3879 int pred_perm = vertices[pred->src].perm_in;
3880 gcc_checking_assert (pred_perm != -1);
3881 if (!vect_slp_perms_eq (perms, perm, pred_perm))
3882 {
3883 all_preds_permuted = false;
3884 break;
3885 }
3886 }
3887 if (!all_preds_permuted)
3888 {
3889 vertices[idx].perm_out = 0;
3890 changed = true;
3891 }
3892 }
3893
3894 /* If the initial propagation converged, switch on materialization
3895 and re-propagate. */
3896 if (!changed && !do_materialization)
3897 {
3898 do_materialization = true;
3899 changed = true;
3900 }
3901 }
3902 while (changed);
3903 statistics_histogram_event (cfun, "SLP optimize perm iterations", iteration);
3904
3905 /* Materialize. */
3906 for (i = 0; i < vertices.length (); ++i)
3907 {
3908 int perm_in = vertices[i].perm_in;
3909 slp_tree node = vertices[i].node;
3910
3911 /* First permute invariant/external original successors; we handle
3912 those optimistically during propagation and duplicate them if
3913 they are used with different permutations. */
3914 unsigned j;
3915 slp_tree child;
3916 if (perm_in > 0)
3917 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3918 {
3919 if (!child
3920 || (SLP_TREE_DEF_TYPE (child) != vect_constant_def
3921 && SLP_TREE_DEF_TYPE (child) != vect_external_def))
3922 continue;
3923
3924 /* If the vector is uniform there's nothing to do. */
3925 if (vect_slp_tree_uniform_p (child))
3926 continue;
3927
3928 /* We can end up sharing some externals via two_operator
3929 handling. Be prepared to unshare those. */
3930 if (child->refcnt != 1)
3931 {
3932 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3933 SLP_TREE_CHILDREN (node)[j] = child
3934 = vect_create_new_slp_node
3935 (SLP_TREE_SCALAR_OPS (child).copy ());
3936 }
3937 vect_slp_permute (perms[perm_in],
3938 SLP_TREE_SCALAR_OPS (child), true);
3939 }
3940
3941 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3942 {
3943 /* Apply the common permutes to the input vectors. */
3944 if (perm_in > 0)
3945 {
3946 /* If the node is already a permute node we can apply
3947 the permutation to the lane selection, effectively
3948 materializing it on the incoming vectors. */
3949 if (dump_enabled_p ())
3950 dump_printf_loc (MSG_NOTE, vect_location,
3951 "simplifying permute node %p\n",
3952 node);
3953 for (unsigned k = 0;
3954 k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3955 SLP_TREE_LANE_PERMUTATION (node)[k].second
3956 = perms[perm_in][SLP_TREE_LANE_PERMUTATION (node)[k].second];
3957 }
3958 /* Apply the anticipated output permute to the permute and
3959 stmt vectors. */
3960 int perm_out = vertices[i].perm_out;
3961 if (perm_out > 0)
3962 {
3963 vect_slp_permute (perms[perm_out],
3964 SLP_TREE_SCALAR_STMTS (node), true);
3965 vect_slp_permute (perms[perm_out],
3966 SLP_TREE_LANE_PERMUTATION (node), true);
3967 }
3968 }
3969 else if (vertices[i].get_perm_materialized () != 0)
3970 {
3971 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3972 /* For loads simply drop the permutation, the load permutation
3973 already performs the desired permutation. */
3974 ;
3975 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3976 gcc_unreachable ();
3977 else
3978 {
3979 if (dump_enabled_p ())
3980 dump_printf_loc (MSG_NOTE, vect_location,
3981 "inserting permute node in place of %p\n",
3982 node);
3983
3984 /* Make a copy of NODE and in-place change it to a
3985 VEC_PERM node to permute the lanes of the copy. */
3986 slp_tree copy = new _slp_tree;
3987 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3988 SLP_TREE_CHILDREN (node) = vNULL;
3989 SLP_TREE_SCALAR_STMTS (copy)
3990 = SLP_TREE_SCALAR_STMTS (node).copy ();
3991 vect_slp_permute (perms[perm_in],
3992 SLP_TREE_SCALAR_STMTS (copy), true);
3993 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3994 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3995 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3996 SLP_TREE_LANE_PERMUTATION (copy)
3997 = SLP_TREE_LANE_PERMUTATION (node);
3998 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3999 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
4000 copy->refcnt = 1;
4001 copy->max_nunits = node->max_nunits;
4002 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
4003 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
4004 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
4005
4006 /* Now turn NODE into a VEC_PERM. */
4007 SLP_TREE_CHILDREN (node).safe_push (copy);
4008 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
4009 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4010 SLP_TREE_LANE_PERMUTATION (node)
4011 .quick_push (std::make_pair (0, perms[perm_in][j]));
4012 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
4013 }
4014 }
4015 else if (perm_in > 0) /* perm_in == perm_out */
4016 {
4017 /* Apply the reverse permutation to our stmts. */
4018 vect_slp_permute (perms[perm_in],
4019 SLP_TREE_SCALAR_STMTS (node), true);
4020 /* And to the lane/load permutation, which we can simply
4021 make regular by design. */
4022 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4023 {
4024 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
4025 /* ??? When we handle non-bijective permutes the idea
4026 is that we can force the load-permutation to be
4027 { min, min + 1, min + 2, ... max }. But then the
4028 scalar defs might no longer match the lane content
4029 which means wrong-code with live lane vectorization.
4030 So we possibly have to have NULL entries for those. */
4031 vect_slp_permute (perms[perm_in],
4032 SLP_TREE_LOAD_PERMUTATION (node), true);
4033 }
4034 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
4035 gcc_unreachable ();
4036 }
4037 }
4038
4039 /* Elide any permutations at BB reduction roots. */
4040 if (is_a <bb_vec_info> (vinfo))
4041 {
4042 for (slp_instance instance : vinfo->slp_instances)
4043 {
4044 if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
4045 continue;
4046 slp_tree old = SLP_INSTANCE_TREE (instance);
4047 if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
4048 && SLP_TREE_CHILDREN (old).length () == 1)
4049 {
4050 slp_tree child = SLP_TREE_CHILDREN (old)[0];
4051 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
4052 {
4053 /* Preserve the special VEC_PERM we use to shield existing
4054 vector defs from the rest. But make it a no-op. */
4055 unsigned i = 0;
4056 for (std::pair<unsigned, unsigned> &p
4057 : SLP_TREE_LANE_PERMUTATION (old))
4058 p.second = i++;
4059 }
4060 else
4061 {
4062 SLP_INSTANCE_TREE (instance) = child;
4063 SLP_TREE_REF_COUNT (child)++;
4064 vect_free_slp_tree (old);
4065 }
4066 }
4067 else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
4068 && SLP_TREE_REF_COUNT (old) == 1
4069 && vertices[old->vertex].get_perm_materialized () != 0)
4070 {
4071 /* ??? For loads the situation is more complex since
4072 we can't modify the permute in place in case the
4073 node is used multiple times. In fact for loads this
4074 should be somehow handled in the propagation engine. */
4075 /* Apply the reverse permutation to our stmts. */
4076 int perm = vertices[old->vertex].get_perm_materialized ();
4077 vect_slp_permute (perms[perm],
4078 SLP_TREE_SCALAR_STMTS (old), true);
4079 vect_slp_permute (perms[perm],
4080 SLP_TREE_LOAD_PERMUTATION (old), true);
4081 }
4082 }
4083 }
4084
4085 /* Free the perms vector used for propagation. */
4086 while (!perms.is_empty ())
4087 perms.pop ().release ();
4088 free_graph (slpg);
4089
4090
4091 /* Now elide load permutations that are not necessary. */
4092 for (i = 0; i < leafs.length (); ++i)
4093 {
4094 node = vertices[leafs[i]].node;
4095 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
4096 continue;
4097
4098 /* In basic block vectorization we allow any subchain of an interleaving
4099 chain.
4100 FORNOW: not in loop SLP because of realignment complications. */
4101 if (is_a <bb_vec_info> (vinfo))
4102 {
4103 bool subchain_p = true;
4104 stmt_vec_info next_load_info = NULL;
4105 stmt_vec_info load_info;
4106 unsigned j;
4107 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4108 {
4109 if (j != 0
4110 && (next_load_info != load_info
4111 || DR_GROUP_GAP (load_info) != 1))
4112 {
4113 subchain_p = false;
4114 break;
4115 }
4116 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
4117 }
4118 if (subchain_p)
4119 {
4120 SLP_TREE_LOAD_PERMUTATION (node).release ();
4121 continue;
4122 }
4123 }
4124 else
4125 {
4126 stmt_vec_info load_info;
4127 bool this_load_permuted = false;
4128 unsigned j;
4129 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4130 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
4131 {
4132 this_load_permuted = true;
4133 break;
4134 }
4135 stmt_vec_info first_stmt_info
4136 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
4137 if (!this_load_permuted
4138 /* The load requires permutation when unrolling exposes
4139 a gap either because the group is larger than the SLP
4140 group-size or because there is a gap between the groups. */
4141 && (known_eq (LOOP_VINFO_VECT_FACTOR
4142 (as_a <loop_vec_info> (vinfo)), 1U)
4143 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
4144 && DR_GROUP_GAP (first_stmt_info) == 0)))
4145 {
4146 SLP_TREE_LOAD_PERMUTATION (node).release ();
4147 continue;
4148 }
4149 }
4150 }
4151 }
4152
4153 /* Gather loads reachable from the individual SLP graph entries. */
4154
4155 void
4156 vect_gather_slp_loads (vec_info *vinfo)
4157 {
4158 unsigned i;
4159 slp_instance instance;
4160 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
4161 {
4162 hash_set<slp_tree> visited;
4163 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
4164 SLP_INSTANCE_TREE (instance), visited);
4165 }
4166 }
4167
4168
4169 /* For each possible SLP instance decide whether to SLP it and calculate the
4170 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
4171 to SLP at least one instance. */
4172
4173 bool
4174 vect_make_slp_decision (loop_vec_info loop_vinfo)
4175 {
4176 unsigned int i;
4177 poly_uint64 unrolling_factor = 1;
4178 const vec<slp_instance> &slp_instances
4179 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
4180 slp_instance instance;
4181 int decided_to_slp = 0;
4182
4183 DUMP_VECT_SCOPE ("vect_make_slp_decision");
4184
4185 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4186 {
4187 /* FORNOW: SLP if you can. */
4188 /* All unroll factors have the form:
4189
4190 GET_MODE_SIZE (vinfo->vector_mode) * X
4191
4192 for some rational X, so they must have a common multiple. */
4193 unrolling_factor
4194 = force_common_multiple (unrolling_factor,
4195 SLP_INSTANCE_UNROLLING_FACTOR (instance));
4196
4197 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
4198 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
4199 loop-based vectorization. Such stmts will be marked as HYBRID. */
4200 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4201 decided_to_slp++;
4202 }
4203
4204 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
4205
4206 if (decided_to_slp && dump_enabled_p ())
4207 {
4208 dump_printf_loc (MSG_NOTE, vect_location,
4209 "Decided to SLP %d instances. Unrolling factor ",
4210 decided_to_slp);
4211 dump_dec (MSG_NOTE, unrolling_factor);
4212 dump_printf (MSG_NOTE, "\n");
4213 }
4214
4215 return (decided_to_slp > 0);
4216 }
4217
4218 /* Private data for vect_detect_hybrid_slp. */
4219 struct vdhs_data
4220 {
4221 loop_vec_info loop_vinfo;
4222 vec<stmt_vec_info> *worklist;
4223 };
4224
4225 /* Walker for walk_gimple_op. */
4226
4227 static tree
4228 vect_detect_hybrid_slp (tree *tp, int *, void *data)
4229 {
4230 walk_stmt_info *wi = (walk_stmt_info *)data;
4231 vdhs_data *dat = (vdhs_data *)wi->info;
4232
4233 if (wi->is_lhs)
4234 return NULL_TREE;
4235
4236 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
4237 if (!def_stmt_info)
4238 return NULL_TREE;
4239 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
4240 if (PURE_SLP_STMT (def_stmt_info))
4241 {
4242 if (dump_enabled_p ())
4243 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
4244 def_stmt_info->stmt);
4245 STMT_SLP_TYPE (def_stmt_info) = hybrid;
4246 dat->worklist->safe_push (def_stmt_info);
4247 }
4248
4249 return NULL_TREE;
4250 }
4251
4252 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
4253 if so, otherwise push it to WORKLIST. */
4254
4255 static void
4256 maybe_push_to_hybrid_worklist (vec_info *vinfo,
4257 vec<stmt_vec_info> &worklist,
4258 stmt_vec_info stmt_info)
4259 {
4260 if (dump_enabled_p ())
4261 dump_printf_loc (MSG_NOTE, vect_location,
4262 "Processing hybrid candidate : %G", stmt_info->stmt);
4263 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
4264 imm_use_iterator iter2;
4265 ssa_op_iter iter1;
4266 use_operand_p use_p;
4267 def_operand_p def_p;
4268 bool any_def = false;
4269 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
4270 {
4271 any_def = true;
4272 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
4273 {
4274 if (is_gimple_debug (USE_STMT (use_p)))
4275 continue;
4276 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
4277 /* An out-of-loop use means this is a loop_vect sink. */
4278 if (!use_info)
4279 {
4280 if (dump_enabled_p ())
4281 dump_printf_loc (MSG_NOTE, vect_location,
4282 "Found loop_vect sink: %G", stmt_info->stmt);
4283 worklist.safe_push (stmt_info);
4284 return;
4285 }
4286 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
4287 {
4288 if (dump_enabled_p ())
4289 dump_printf_loc (MSG_NOTE, vect_location,
4290 "Found loop_vect use: %G", use_info->stmt);
4291 worklist.safe_push (stmt_info);
4292 return;
4293 }
4294 }
4295 }
4296 /* No def means this is a loop_vect sink. */
4297 if (!any_def)
4298 {
4299 if (dump_enabled_p ())
4300 dump_printf_loc (MSG_NOTE, vect_location,
4301 "Found loop_vect sink: %G", stmt_info->stmt);
4302 worklist.safe_push (stmt_info);
4303 return;
4304 }
4305 if (dump_enabled_p ())
4306 dump_printf_loc (MSG_NOTE, vect_location,
4307 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
4308 STMT_SLP_TYPE (stmt_info) = pure_slp;
4309 }
4310
4311 /* Find stmts that must be both vectorized and SLPed. */
4312
4313 void
4314 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
4315 {
4316 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
4317
4318 /* All stmts participating in SLP are marked pure_slp, all other
4319 stmts are loop_vect.
4320 First collect all loop_vect stmts into a worklist.
4321 SLP patterns cause not all original scalar stmts to appear in
4322 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
4323 Rectify this here and do a backward walk over the IL, considering
4324 stmts as loop_vect only when they are used by a loop_vect stmt and
4325 otherwise marking them as pure_slp. */
4326 auto_vec<stmt_vec_info> worklist;
4327 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
4328 {
4329 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
4330 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
4331 gsi_next (&gsi))
4332 {
4333 gphi *phi = gsi.phi ();
4334 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
4335 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4336 maybe_push_to_hybrid_worklist (loop_vinfo,
4337 worklist, stmt_info);
4338 }
4339 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
4340 gsi_prev (&gsi))
4341 {
4342 gimple *stmt = gsi_stmt (gsi);
4343 if (is_gimple_debug (stmt))
4344 continue;
4345 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
4346 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
4347 {
4348 for (gimple_stmt_iterator gsi2
4349 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
4350 !gsi_end_p (gsi2); gsi_next (&gsi2))
4351 {
4352 stmt_vec_info patt_info
4353 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
4354 if (!STMT_SLP_TYPE (patt_info)
4355 && STMT_VINFO_RELEVANT (patt_info))
4356 maybe_push_to_hybrid_worklist (loop_vinfo,
4357 worklist, patt_info);
4358 }
4359 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4360 }
4361 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4362 maybe_push_to_hybrid_worklist (loop_vinfo,
4363 worklist, stmt_info);
4364 }
4365 }
4366
4367 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
4368 mark any SLP vectorized stmt as hybrid.
4369 ??? We're visiting def stmts N times (once for each non-SLP and
4370 once for each hybrid-SLP use). */
4371 walk_stmt_info wi;
4372 vdhs_data dat;
4373 dat.worklist = &worklist;
4374 dat.loop_vinfo = loop_vinfo;
4375 memset (&wi, 0, sizeof (wi));
4376 wi.info = (void *)&dat;
4377 while (!worklist.is_empty ())
4378 {
4379 stmt_vec_info stmt_info = worklist.pop ();
4380 /* Since SSA operands are not set up for pattern stmts we need
4381 to use walk_gimple_op. */
4382 wi.is_lhs = 0;
4383 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
4384 }
4385 }
4386
4387
4388 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
4389
4390 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
4391 : vec_info (vec_info::bb, init_cost (NULL, false), shared),
4392 bbs (_bbs),
4393 roots (vNULL)
4394 {
4395 for (unsigned i = 0; i < bbs.length (); ++i)
4396 {
4397 if (i != 0)
4398 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4399 gsi_next (&si))
4400 {
4401 gphi *phi = si.phi ();
4402 gimple_set_uid (phi, 0);
4403 add_stmt (phi);
4404 }
4405 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4406 !gsi_end_p (gsi); gsi_next (&gsi))
4407 {
4408 gimple *stmt = gsi_stmt (gsi);
4409 gimple_set_uid (stmt, 0);
4410 if (is_gimple_debug (stmt))
4411 continue;
4412 add_stmt (stmt);
4413 }
4414 }
4415 }
4416
4417
4418 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
4419 stmts in the basic block. */
4420
4421 _bb_vec_info::~_bb_vec_info ()
4422 {
4423 /* Reset region marker. */
4424 for (unsigned i = 0; i < bbs.length (); ++i)
4425 {
4426 if (i != 0)
4427 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4428 gsi_next (&si))
4429 {
4430 gphi *phi = si.phi ();
4431 gimple_set_uid (phi, -1);
4432 }
4433 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4434 !gsi_end_p (gsi); gsi_next (&gsi))
4435 {
4436 gimple *stmt = gsi_stmt (gsi);
4437 gimple_set_uid (stmt, -1);
4438 }
4439 }
4440
4441 for (unsigned i = 0; i < roots.length (); ++i)
4442 {
4443 roots[i].stmts.release ();
4444 roots[i].roots.release ();
4445 }
4446 roots.release ();
4447 }
4448
4449 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
4450 given that child nodes have already been processed, and that
4451 their def types currently match their SLP node's def type. */
4452
4453 static bool
4454 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
4455 slp_instance node_instance,
4456 stmt_vector_for_cost *cost_vec)
4457 {
4458 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4459
4460 /* Calculate the number of vector statements to be created for the
4461 scalar stmts in this node. For SLP reductions it is equal to the
4462 number of vector statements in the children (which has already been
4463 calculated by the recursive call). Otherwise it is the number of
4464 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
4465 VF divided by the number of elements in a vector. */
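/* Illustrative example (assumed numbers): with a vectorization factor of 4,
   a group size of 4 and a vector type holding 8 elements this computes
   4 * 4 / 8 == 2 vector statements for the node. */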
4466 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
4467 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
4468 {
4469 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
4470 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
4471 {
4472 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4473 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
4474 break;
4475 }
4476 }
4477 else
4478 {
4479 poly_uint64 vf;
4480 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4481 vf = loop_vinfo->vectorization_factor;
4482 else
4483 vf = 1;
4484 unsigned int group_size = SLP_TREE_LANES (node);
4485 tree vectype = SLP_TREE_VECTYPE (node);
4486 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4487 = vect_get_num_vectors (vf * group_size, vectype);
4488 }
4489
4490 /* Handle purely internal nodes. */
4491 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4492 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
4493
4494 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
4495 if (is_a <bb_vec_info> (vinfo)
4496 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
4497 {
4498 if (dump_enabled_p ())
4499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4500 "desired vector type conflicts with earlier one "
4501 "for %G", stmt_info->stmt);
4502 return false;
4503 }
4504
4505 bool dummy;
4506 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
4507 node, node_instance, cost_vec);
4508 }
4509
4510 /* Try to build NODE from scalars, returning true on success.
4511 NODE_INSTANCE is the SLP instance that contains NODE. */
4512
4513 static bool
4514 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
4515 slp_instance node_instance)
4516 {
4517 stmt_vec_info stmt_info;
4518 unsigned int i;
4519
4520 if (!is_a <bb_vec_info> (vinfo)
4521 || node == SLP_INSTANCE_TREE (node_instance)
4522 || !SLP_TREE_SCALAR_STMTS (node).exists ()
4523 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
4524 return false;
4525
4526 if (dump_enabled_p ())
4527 dump_printf_loc (MSG_NOTE, vect_location,
4528 "Building vector operands of %p from scalars instead\n", node);
4529
4530 /* Don't remove and free the child nodes here, since they could be
4531 referenced by other structures. The analysis and scheduling phases
4532 (need to) ignore child nodes of anything that isn't vect_internal_def. */
4533 unsigned int group_size = SLP_TREE_LANES (node);
4534 SLP_TREE_DEF_TYPE (node) = vect_external_def;
4535 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
4536 SLP_TREE_LOAD_PERMUTATION (node).release ();
4537 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4538 {
4539 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
4540 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
4541 }
4542 return true;
4543 }
4544
4545 /* Compute the prologue cost for invariant or constant operands represented
4546 by NODE. */
4547
4548 static void
4549 vect_prologue_cost_for_slp (slp_tree node,
4550 stmt_vector_for_cost *cost_vec)
4551 {
4552 /* There's a special case of an existing vector, which costs nothing. */
4553 if (SLP_TREE_SCALAR_OPS (node).length () == 0
4554 && !SLP_TREE_VEC_DEFS (node).is_empty ())
4555 return;
4556 /* Without looking at the actual initializer a vector of
4557 constants can be implemented as a load from the constant pool.
4558 When all elements are the same we can use a splat. */
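/* Illustrative summary of the costing below: external (SSA name) operands
   are costed as a single scalar_to_vec when all lanes of a generated vector
   are the same scalar and as a vec_construct otherwise, while constant
   operands are costed as a vector_load from the constant pool. */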
4559 tree vectype = SLP_TREE_VECTYPE (node);
4560 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
4561 unsigned num_vects_to_check;
4562 unsigned HOST_WIDE_INT const_nunits;
4563 unsigned nelt_limit;
4564 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
4565 && ! multiple_p (const_nunits, group_size))
4566 {
4567 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
4568 nelt_limit = const_nunits;
4569 }
4570 else
4571 {
4572 /* If either the vector has variable length or the vectors
4573 are composed of repeated whole groups we only need to
4574 cost construction once. All vectors will be the same. */
4575 num_vects_to_check = 1;
4576 nelt_limit = group_size;
4577 }
4578 tree elt = NULL_TREE;
4579 unsigned nelt = 0;
4580 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
4581 {
4582 unsigned si = j % group_size;
4583 if (nelt == 0)
4584 elt = SLP_TREE_SCALAR_OPS (node)[si];
4585 /* ??? We're just tracking whether all operands of a single
4586 vector initializer are the same; ideally we'd check if
4587 we emitted the same one already. */
4588 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
4589 elt = NULL_TREE;
4590 nelt++;
4591 if (nelt == nelt_limit)
4592 {
4593 record_stmt_cost (cost_vec, 1,
4594 SLP_TREE_DEF_TYPE (node) == vect_external_def
4595 ? (elt ? scalar_to_vec : vec_construct)
4596 : vector_load,
4597 NULL, vectype, 0, vect_prologue);
4598 nelt = 0;
4599 }
4600 }
4601 }
4602
4603 /* Analyze statements contained in SLP tree NODE after recursively analyzing
4604 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
4605
4606 Return true if the operations are supported. */
4607
4608 static bool
4609 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
4610 slp_instance node_instance,
4611 hash_set<slp_tree> &visited_set,
4612 vec<slp_tree> &visited_vec,
4613 stmt_vector_for_cost *cost_vec)
4614 {
4615 int i, j;
4616 slp_tree child;
4617
4618 /* Assume we can code-generate all invariants. */
4619 if (!node
4620 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
4621 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4622 return true;
4623
4624 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
4625 {
4626 if (dump_enabled_p ())
4627 dump_printf_loc (MSG_NOTE, vect_location,
4628 "Failed cyclic SLP reference in %p\n", node);
4629 return false;
4630 }
4631 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
4632
4633 /* If we already analyzed the exact same set of scalar stmts we're done.
4634 We share the generated vector stmts for those. */
4635 if (visited_set.add (node))
4636 return true;
4637 visited_vec.safe_push (node);
4638
4639 bool res = true;
4640 unsigned visited_rec_start = visited_vec.length ();
4641 unsigned cost_vec_rec_start = cost_vec->length ();
4642 bool seen_non_constant_child = false;
4643 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4644 {
4645 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
4646 visited_set, visited_vec,
4647 cost_vec);
4648 if (!res)
4649 break;
4650 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
4651 seen_non_constant_child = true;
4652 }
4653 /* We're having difficulties scheduling nodes with just constant
4654 operands and no scalar stmts since we then cannot compute a stmt
4655 insertion place. */
4656 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
4657 {
4658 if (dump_enabled_p ())
4659 dump_printf_loc (MSG_NOTE, vect_location,
4660 "Cannot vectorize all-constant op node %p\n", node);
4661 res = false;
4662 }
4663
4664 if (res)
4665 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
4666 cost_vec);
4667 /* If analysis failed we have to pop all recursive visited nodes
4668 plus ourselves. */
4669 if (!res)
4670 {
4671 while (visited_vec.length () >= visited_rec_start)
4672 visited_set.remove (visited_vec.pop ());
4673 cost_vec->truncate (cost_vec_rec_start);
4674 }
4675
4676 /* When the node can be vectorized, cost invariant nodes it references.
4677 This is not done in DFS order to allow the referring node
4678 vectorizable_* calls to nail down the invariant nodes' vector type
4679 and possibly unshare it if it needs a different vector type than
4680 other referrers. */
4681 if (res)
4682 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
4683 if (child
4684 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
4685 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
4686 /* Perform usual caching, note code-generation still
4687 code-gens these nodes multiple times but we expect
4688 to CSE them later. */
4689 && !visited_set.add (child))
4690 {
4691 visited_vec.safe_push (child);
4692 /* ??? After auditing more code paths make a "default"
4693 and push the vector type from NODE to all children
4694 if it is not already set. */
4695 /* Compute the number of vectors to be generated. */
4696 tree vector_type = SLP_TREE_VECTYPE (child);
4697 if (!vector_type)
4698 {
4699 /* For shifts with a scalar argument we don't need
4700 to cost or code-generate anything.
4701 ??? Represent this more explicitly. */
4702 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4703 == shift_vec_info_type)
4704 && j == 1);
4705 continue;
4706 }
4707 unsigned group_size = SLP_TREE_LANES (child);
4708 poly_uint64 vf = 1;
4709 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4710 vf = loop_vinfo->vectorization_factor;
4711 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
4712 = vect_get_num_vectors (vf * group_size, vector_type);
4713 /* And cost them. */
4714 vect_prologue_cost_for_slp (child, cost_vec);
4715 }
4716
4717 /* If this node or any of its children can't be vectorized, try pruning
4718 the tree here rather than felling the whole thing. */
4719 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
4720 {
4721 /* We'll need to revisit this for invariant costing and number
4722 of vectorized stmt setting. */
4723 res = true;
4724 }
4725
4726 return res;
4727 }
4728
4729 /* Mark lanes of NODE that are live outside of the basic-block vectorized
4730 region and that can be vectorized using vectorizable_live_operation
4731 with STMT_VINFO_LIVE_P. Unhandled live operations will cause the
4732 scalar code computing them to be retained. */
4733
4734 static void
4735 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
4736 slp_instance instance,
4737 stmt_vector_for_cost *cost_vec,
4738 hash_set<stmt_vec_info> &svisited,
4739 hash_set<slp_tree> &visited)
4740 {
4741 if (visited.add (node))
4742 return;
4743
4744 unsigned i;
4745 stmt_vec_info stmt_info;
4746 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
4747 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4748 {
4749 if (svisited.contains (stmt_info))
4750 continue;
4751 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4752 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
4753 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
4754 /* Only the pattern root stmt computes the original scalar value. */
4755 continue;
4756 bool mark_visited = true;
4757 gimple *orig_stmt = orig_stmt_info->stmt;
4758 ssa_op_iter op_iter;
4759 def_operand_p def_p;
4760 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4761 {
4762 imm_use_iterator use_iter;
4763 gimple *use_stmt;
4764 stmt_vec_info use_stmt_info;
4765 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4766 if (!is_gimple_debug (use_stmt))
4767 {
4768 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
4769 if (!use_stmt_info
4770 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4771 {
4772 STMT_VINFO_LIVE_P (stmt_info) = true;
4773 if (vectorizable_live_operation (bb_vinfo, stmt_info,
4774 NULL, node, instance, i,
4775 false, cost_vec))
4776 /* ??? So we know we can vectorize the live stmt
4777 from one SLP node. If we cannot do so from all
4778 or none consistently we'd have to record which
4779 SLP node (and lane) we want to use for the live
4780 operation. So make sure we can code-generate
4781 from all nodes. */
4782 mark_visited = false;
4783 else
4784 STMT_VINFO_LIVE_P (stmt_info) = false;
4785 break;
4786 }
4787 }
4788 /* We have to verify whether we can insert the lane extract
4789 before all uses. The following is a conservative approximation.
4790 We cannot put this into vectorizable_live_operation because
4791 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4792 doesn't work.
4793 Note that while emitting code for loads at the first load should
4794 make this a non-problem, leafs we construct from scalars are
4795 vectorized after the last scalar def.
4796 ??? If we'd actually compute the insert location during
4797 analysis we could use something less conservative than the last
4798 scalar stmt in the node for the dominance check. */
4799 /* ??? What remains is "live" uses in vector CTORs in the same
4800 SLP graph which is where those uses can end up code-generated
4801 right after their definition instead of close to their original
4802 use. But that would restrict us to code-generate lane-extracts
4803 from the latest stmt in a node. So we compensate for this
4804 during code-generation, simply not replacing uses for those
4805 hopefully rare cases. */
4806 if (STMT_VINFO_LIVE_P (stmt_info))
4807 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4808 if (!is_gimple_debug (use_stmt)
4809 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
4810 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4811 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
4812 {
4813 if (dump_enabled_p ())
4814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4815 "Cannot determine insertion place for "
4816 "lane extract\n");
4817 STMT_VINFO_LIVE_P (stmt_info) = false;
4818 mark_visited = true;
4819 }
4820 }
4821 if (mark_visited)
4822 svisited.add (stmt_info);
4823 }
4824
4825 slp_tree child;
4826 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4827 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4828 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
4829 cost_vec, svisited, visited);
4830 }
4831
4832 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
4833
4834 static bool
4835 vectorizable_bb_reduc_epilogue (slp_instance instance,
4836 stmt_vector_for_cost *cost_vec)
4837 {
4838 enum tree_code reduc_code
4839 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
4840 if (reduc_code == MINUS_EXPR)
4841 reduc_code = PLUS_EXPR;
4842 internal_fn reduc_fn;
4843 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
4844 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
4845 || reduc_fn == IFN_LAST
4846 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH))
4847 return false;
4848
4849 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
4850 cost log2 vector operations plus shuffles. */
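/* Illustrative example (assumed vector type): for a 4-lane vector
   floor_log2 (4) == 2, so two vector_stmt and two vec_perm costs are
   recorded for the reduction epilogue. */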
4851 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
4852 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
4853 vectype, 0, vect_body);
4854 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
4855 vectype, 0, vect_body);
4856 return true;
4857 }
4858
4859 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
4860 and recurse to children. */
4861
4862 static void
4863 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
4864 hash_set<slp_tree> &visited)
4865 {
4866 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
4867 || visited.add (node))
4868 return;
4869
4870 stmt_vec_info stmt;
4871 unsigned i;
4872 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4873 roots.remove (vect_orig_stmt (stmt));
4874
4875 slp_tree child;
4876 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4877 if (child)
4878 vect_slp_prune_covered_roots (child, roots, visited);
4879 }
4880
4881 /* Analyze statements in SLP instances of VINFO. Return true if the
4882 operations are supported. */
4883
4884 bool
4885 vect_slp_analyze_operations (vec_info *vinfo)
4886 {
4887 slp_instance instance;
4888 int i;
4889
4890 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4891
4892 hash_set<slp_tree> visited;
4893 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4894 {
4895 auto_vec<slp_tree> visited_vec;
4896 stmt_vector_for_cost cost_vec;
4897 cost_vec.create (2);
4898 if (is_a <bb_vec_info> (vinfo))
4899 vect_location = instance->location ();
4900 if (!vect_slp_analyze_node_operations (vinfo,
4901 SLP_INSTANCE_TREE (instance),
4902 instance, visited, visited_vec,
4903 &cost_vec)
4904 /* CTOR instances require vectorized defs for the SLP tree root. */
4905 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
4906 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
4907 != vect_internal_def))
4908 /* Check we can vectorize the reduction. */
4909 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
4910 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
4911 {
4912 slp_tree node = SLP_INSTANCE_TREE (instance);
4913 stmt_vec_info stmt_info;
4914 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
4915 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
4916 else
4917 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4918 if (dump_enabled_p ())
4919 dump_printf_loc (MSG_NOTE, vect_location,
4920 "removing SLP instance operations starting from: %G",
4921 stmt_info->stmt);
4922 vect_free_slp_instance (instance);
4923 vinfo->slp_instances.ordered_remove (i);
4924 cost_vec.release ();
4925 while (!visited_vec.is_empty ())
4926 visited.remove (visited_vec.pop ());
4927 }
4928 else
4929 {
4930 i++;
4931
4932 /* For BB vectorization remember the SLP graph entry
4933 cost for later. */
4934 if (is_a <bb_vec_info> (vinfo))
4935 instance->cost_vec = cost_vec;
4936 else
4937 {
4938 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
4939 cost_vec.release ();
4940 }
4941 }
4942 }
4943
4944 /* Now look for SLP instances with a root that are covered by other
4945 instances and remove them. */
4946 hash_set<stmt_vec_info> roots;
4947 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4948 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
4949 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
4950 if (!roots.is_empty ())
4951 {
4952 visited.empty ();
4953 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4954 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
4955 visited);
4956 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4957 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
4958 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
4959 {
4960 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
4961 if (dump_enabled_p ())
4962 dump_printf_loc (MSG_NOTE, vect_location,
4963 "removing SLP instance operations starting "
4964 "from: %G", root->stmt);
4965 vect_free_slp_instance (instance);
4966 vinfo->slp_instances.ordered_remove (i);
4967 }
4968 else
4969 ++i;
4970 }
4971
4972 /* Compute vectorizable live stmts. */
4973 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
4974 {
4975 hash_set<stmt_vec_info> svisited;
4976 hash_set<slp_tree> visited;
4977 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4978 {
4979 vect_location = instance->location ();
4980 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
4981 instance, &instance->cost_vec, svisited,
4982 visited);
4983 }
4984 }
4985
4986 return !vinfo->slp_instances.is_empty ();
4987 }
4988
4989 /* Get the SLP instance leader from INSTANCE_LEADER, thereby transitively
4990 compressing any chain of leaders. */
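/* Illustrative example (assumed instances): with leader entries A -> B,
   B -> C and C -> C, calling this on A returns C and also rewrites the
   entries for A and B to point directly at C. */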
4991
4992 static slp_instance
4993 get_ultimate_leader (slp_instance instance,
4994 hash_map<slp_instance, slp_instance> &instance_leader)
4995 {
4996 auto_vec<slp_instance *, 8> chain;
4997 slp_instance *tem;
4998 while (*(tem = instance_leader.get (instance)) != instance)
4999 {
5000 chain.safe_push (tem);
5001 instance = *tem;
5002 }
5003 while (!chain.is_empty ())
5004 *chain.pop () = instance;
5005 return instance;
5006 }
5007
5008 /* Worker of vect_bb_partition_graph, recurse on NODE. */
5009
5010 static void
5011 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
5012 slp_instance instance, slp_tree node,
5013 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
5014 hash_map<slp_instance, slp_instance> &instance_leader,
5015 hash_set<slp_tree> &visited)
5016 {
5017 stmt_vec_info stmt_info;
5018 unsigned i;
5019
5020 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5021 {
5022 bool existed_p;
5023 slp_instance &stmt_instance
5024 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
5025 if (!existed_p)
5026 ;
5027 else if (stmt_instance != instance)
5028 {
5029 /* If we're running into a previously marked stmt make us the
5030 leader of the current ultimate leader. This keeps the
5031 leader chain acyclic and works even when the current instance
5032 connects two previously independent graph parts. */
5033 slp_instance stmt_leader
5034 = get_ultimate_leader (stmt_instance, instance_leader);
5035 if (stmt_leader != instance)
5036 instance_leader.put (stmt_leader, instance);
5037 }
5038 stmt_instance = instance;
5039 }
5040
5041 if (!SLP_TREE_SCALAR_STMTS (node).is_empty () && visited.add (node))
5042 return;
5043
5044 slp_tree child;
5045 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5046 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5047 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
5048 instance_leader, visited);
5049 }
5050
5051 /* Partition the SLP graph into pieces that can be costed independently. */
5052
5053 static void
5054 vect_bb_partition_graph (bb_vec_info bb_vinfo)
5055 {
5056 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
5057
5058 /* First walk the SLP graph assigning each involved scalar stmt a
5059 corresponding SLP graph entry and upon visiting a previously
5060 marked stmt, make the stmt's leader the current SLP graph entry. */
5061 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
5062 hash_map<slp_instance, slp_instance> instance_leader;
5063 hash_set<slp_tree> visited;
5064 slp_instance instance;
5065 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5066 {
5067 instance_leader.put (instance, instance);
5068 vect_bb_partition_graph_r (bb_vinfo,
5069 instance, SLP_INSTANCE_TREE (instance),
5070 stmt_to_instance, instance_leader,
5071 visited);
5072 }
5073
5074 /* Then collect entries to each independent subgraph. */
5075 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5076 {
5077 slp_instance leader = get_ultimate_leader (instance, instance_leader);
5078 leader->subgraph_entries.safe_push (instance);
5079 if (dump_enabled_p ()
5080 && leader != instance)
5081 dump_printf_loc (MSG_NOTE, vect_location,
5082 "instance %p is leader of %p\n",
5083 leader, instance);
5084 }
5085 }
5086
5087 /* Compute the scalar cost of the SLP node NODE and its children
5088 and return it. Do not account defs that are marked in LIFE and
5089 update LIFE according to uses of NODE. */
5090
5091 static void
5092 vect_bb_slp_scalar_cost (vec_info *vinfo,
5093 slp_tree node, vec<bool, va_heap> *life,
5094 stmt_vector_for_cost *cost_vec,
5095 hash_set<slp_tree> &visited)
5096 {
5097 unsigned i;
5098 stmt_vec_info stmt_info;
5099 slp_tree child;
5100
5101 if (visited.add (node))
5102 return;
5103
5104 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5105 {
5106 ssa_op_iter op_iter;
5107 def_operand_p def_p;
5108
5109 if ((*life)[i])
5110 continue;
5111
5112 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5113 gimple *orig_stmt = orig_stmt_info->stmt;
5114
5115 /* If there is a non-vectorized use of the defs then the scalar
5116 stmt is kept live in which case we do not account it or any
5117 required defs in the SLP children in the scalar cost. This
5118 way we make the vectorization more costly when compared to
5119 the scalar cost. */
5120 if (!STMT_VINFO_LIVE_P (stmt_info))
5121 {
5122 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
5123 {
5124 imm_use_iterator use_iter;
5125 gimple *use_stmt;
5126 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
5127 if (!is_gimple_debug (use_stmt))
5128 {
5129 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
5130 if (!use_stmt_info
5131 || !PURE_SLP_STMT
5132 (vect_stmt_to_vectorize (use_stmt_info)))
5133 {
5134 (*life)[i] = true;
5135 break;
5136 }
5137 }
5138 }
5139 if ((*life)[i])
5140 continue;
5141 }
5142
5143 /* Count scalar stmts only once. */
5144 if (gimple_visited_p (orig_stmt))
5145 continue;
5146 gimple_set_visited (orig_stmt, true);
5147
5148 vect_cost_for_stmt kind;
5149 if (STMT_VINFO_DATA_REF (orig_stmt_info))
5150 {
5151 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
5152 kind = scalar_load;
5153 else
5154 kind = scalar_store;
5155 }
5156 else if (vect_nop_conversion_p (orig_stmt_info))
5157 continue;
5158 /* For single-argument PHIs assume coalescing which means zero cost
5159 for the scalar and the vector PHIs. This avoids artificially
5160 favoring the vector path (but may pessimize it in some cases). */
5161 else if (is_a <gphi *> (orig_stmt_info->stmt)
5162 && gimple_phi_num_args
5163 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
5164 continue;
5165 else
5166 kind = scalar_stmt;
5167 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
5168 SLP_TREE_VECTYPE (node), 0, vect_body);
5169 }
5170
5171 auto_vec<bool, 20> subtree_life;
5172 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5173 {
5174 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5175 {
5176 /* Do not directly pass LIFE to the recursive call, copy it to
5177 confine changes in the callee to the current child/subtree. */
5178 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5179 {
5180 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
5181 for (unsigned j = 0;
5182 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
5183 {
5184 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
5185 if (perm.first == i)
5186 subtree_life[perm.second] = (*life)[j];
5187 }
5188 }
5189 else
5190 {
5191 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
5192 subtree_life.safe_splice (*life);
5193 }
5194 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
5195 visited);
5196 subtree_life.truncate (0);
5197 }
5198 }
5199 }
5200
5201 /* Comparator for the loop-index sorted cost vectors. */
5202
5203 static int
5204 li_cost_vec_cmp (const void *a_, const void *b_)
5205 {
5206 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
5207 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
5208 if (a->first < b->first)
5209 return -1;
5210 else if (a->first == b->first)
5211 return 0;
5212 return 1;
5213 }
5214
5215 /* Check if vectorization of the basic block is profitable for the
5216 subgraph denoted by SLP_INSTANCES. */
5217
5218 static bool
5219 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
5220 vec<slp_instance> slp_instances)
5221 {
5222 slp_instance instance;
5223 int i;
5224 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
5225 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
5226
5227 if (dump_enabled_p ())
5228 {
5229 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
5230 hash_set<slp_tree> visited;
5231 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5232 vect_print_slp_graph (MSG_NOTE, vect_location,
5233 SLP_INSTANCE_TREE (instance), visited);
5234 }
5235
5236 /* Calculate scalar cost and sum the cost for the vector stmts
5237 previously collected. */
5238 stmt_vector_for_cost scalar_costs = vNULL;
5239 stmt_vector_for_cost vector_costs = vNULL;
5240 hash_set<slp_tree> visited;
5241 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5242 {
5243 auto_vec<bool, 20> life;
5244 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
5245 true);
5246 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5247 record_stmt_cost (&scalar_costs,
5248 SLP_INSTANCE_ROOT_STMTS (instance).length (),
5249 scalar_stmt,
5250 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
5251 vect_bb_slp_scalar_cost (bb_vinfo,
5252 SLP_INSTANCE_TREE (instance),
5253 &life, &scalar_costs, visited);
5254 vector_costs.safe_splice (instance->cost_vec);
5255 instance->cost_vec.release ();
5256 }
5257 /* Unset visited flag. */
5258 stmt_info_for_cost *cost;
5259 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5260 gimple_set_visited (cost->stmt_info->stmt, false);
5261
5262 if (dump_enabled_p ())
5263 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5264
5265 /* When costing non-loop vectorization we need to consider each covered
5266 loop independently and make sure vectorization is profitable. For
5267 now we assume a loop may not be entered or may be executed an
5268 arbitrary number of iterations (??? static information can provide
5269 more precise info here) which means we can simply cost the stmts
5270 of each containing loop separately. */
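/* E.g. (illustrative): if a subgraph contains stmts from both loop 1 and
   loop 2, the loop 1 stmts are costed against their scalar counterparts on
   their own and likewise for loop 2, rather than comparing one grand total.  */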
5271
5272 /* First produce cost vectors sorted by loop index. */
5273 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5274 li_scalar_costs (scalar_costs.length ());
5275 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5276 li_vector_costs (vector_costs.length ());
5277 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5278 {
5279 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5280 li_scalar_costs.quick_push (std::make_pair (l, cost));
5281 }
5282 /* Use an arbitrary loop from the scalar costs as fallback in case the
5283 first vector_costs entry does not have a stmt_info associated with it. */
5284 unsigned l = li_scalar_costs[0].first;
5285 FOR_EACH_VEC_ELT (vector_costs, i, cost)
5286 {
5287 /* We inherit the loop from the previous COST; invariants, externals
5288 and extracts immediately follow the cost for the related stmt. */
5289 if (cost->stmt_info)
5290 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5291 li_vector_costs.quick_push (std::make_pair (l, cost));
5292 }
5293 li_scalar_costs.qsort (li_cost_vec_cmp);
5294 li_vector_costs.qsort (li_cost_vec_cmp);
5295
5296 /* Now cost the portions individually. */
5297 unsigned vi = 0;
5298 unsigned si = 0;
5299 while (si < li_scalar_costs.length ()
5300 && vi < li_vector_costs.length ())
5301 {
5302 unsigned sl = li_scalar_costs[si].first;
5303 unsigned vl = li_vector_costs[vi].first;
5304 if (sl != vl)
5305 {
5306 if (dump_enabled_p ())
5307 dump_printf_loc (MSG_NOTE, vect_location,
5308 "Scalar %d and vector %d loop part do not "
5309 "match up, skipping scalar part\n", sl, vl);
5310 /* Skip the scalar part, assuming zero cost on the vector side. */
5311 do
5312 {
5313 si++;
5314 }
5315 while (si < li_scalar_costs.length ()
5316 && li_scalar_costs[si].first == sl);
5317 continue;
5318 }
5319
5320 void *scalar_target_cost_data = init_cost (NULL, true);
5321 do
5322 {
5323 add_stmt_cost (bb_vinfo, scalar_target_cost_data,
5324 li_scalar_costs[si].second);
5325 si++;
5326 }
5327 while (si < li_scalar_costs.length ()
5328 && li_scalar_costs[si].first == sl);
5329 unsigned dummy;
5330 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
5331 destroy_cost_data (scalar_target_cost_data);
5332
5333 /* Complete the target-specific vector cost calculation. */
5334 void *vect_target_cost_data = init_cost (NULL, false);
5335 do
5336 {
5337 add_stmt_cost (bb_vinfo, vect_target_cost_data,
5338 li_vector_costs[vi].second);
5339 vi++;
5340 }
5341 while (vi < li_vector_costs.length ()
5342 && li_vector_costs[vi].first == vl);
5343 finish_cost (vect_target_cost_data, &vec_prologue_cost,
5344 &vec_inside_cost, &vec_epilogue_cost);
5345 destroy_cost_data (vect_target_cost_data);
5346
5347 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
5348
5349 if (dump_enabled_p ())
5350 {
5351 dump_printf_loc (MSG_NOTE, vect_location,
5352 "Cost model analysis for part in loop %d:\n", sl);
5353 dump_printf (MSG_NOTE, " Vector cost: %d\n",
5354 vec_inside_cost + vec_outside_cost);
5355 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
5356 }
5357
5358 /* Vectorization is profitable if its cost is no more than the cost of
5359 the scalar version. Note that we err on the vector side for equal
5360 cost because the cost estimate is otherwise quite pessimistic
5361 (constant uses are free on the scalar side but cost a load on the
5362 vector side, for example). */
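/* A made-up numeric example: with vec_inside_cost 10, vec_outside_cost 2
   and scalar_cost 12 the test below is 12 > 12, which is false, so this
   part is kept for vectorization; with scalar_cost 11 it would be dropped.  */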
5363 if (vec_outside_cost + vec_inside_cost > scalar_cost)
5364 {
5365 scalar_costs.release ();
5366 vector_costs.release ();
5367 return false;
5368 }
5369 }
5370 if (vi < li_vector_costs.length ())
5371 {
5372 if (dump_enabled_p ())
5373 dump_printf_loc (MSG_NOTE, vect_location,
5374 "Excess vector cost for part in loop %d:\n",
5375 li_vector_costs[vi].first);
5376 scalar_costs.release ();
5377 vector_costs.release ();
5378 return false;
5379 }
5380
5381 scalar_costs.release ();
5382 vector_costs.release ();
5383 return true;
5384 }
5385
5386 /* qsort comparator for lane defs. */
5387
5388 static int
5389 vld_cmp (const void *a_, const void *b_)
5390 {
5391 auto *a = (const std::pair<unsigned, tree> *)a_;
5392 auto *b = (const std::pair<unsigned, tree> *)b_;
5393 return a->first - b->first;
5394 }
5395
5396 /* Return true if USE_STMT is a vector lane insert into VEC and set
5397 *THIS_LANE to the lane number that is set. */
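/* Illustrative only (names and types assumed): for a V4SF VEC the stmt
   vec_1 = BIT_INSERT_EXPR <vec_0, s_3, 64> inserts s_3 at bit position 64,
   i.e. 64 / 32 == lane 2, so *THIS_LANE is set to 2.  */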
5398
5399 static bool
5400 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
5401 {
5402 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
5403 if (!use_ass
5404 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
5405 || (vec
5406 ? gimple_assign_rhs1 (use_ass) != vec
5407 : ((vec = gimple_assign_rhs1 (use_ass)), false))
5408 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
5409 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
5410 || !constant_multiple_p
5411 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
5412 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
5413 this_lane))
5414 return false;
5415 return true;
5416 }
5417
5418 /* Find any vectorizable constructors and add them to the grouped_stores
5419 array. */
5420
5421 static void
5422 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
5423 {
5424 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
5425 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
5426 !gsi_end_p (gsi); gsi_next (&gsi))
5427 {
5428 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
5429 if (!assign)
5430 continue;
5431
5432 tree rhs = gimple_assign_rhs1 (assign);
5433 enum tree_code code = gimple_assign_rhs_code (assign);
5434 use_operand_p use_p;
5435 gimple *use_stmt;
5436 if (code == CONSTRUCTOR)
5437 {
5438 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5439 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
5440 CONSTRUCTOR_NELTS (rhs))
5441 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
5442 || uniform_vector_p (rhs))
5443 continue;
5444
5445 unsigned j;
5446 tree val;
5447 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
5448 if (TREE_CODE (val) != SSA_NAME
5449 || !bb_vinfo->lookup_def (val))
5450 break;
5451 if (j != CONSTRUCTOR_NELTS (rhs))
5452 continue;
5453
5454 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
5455 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
5456 }
5457 else if (code == BIT_INSERT_EXPR
5458 && VECTOR_TYPE_P (TREE_TYPE (rhs))
5459 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
5460 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
5461 && integer_zerop (gimple_assign_rhs3 (assign))
5462 && useless_type_conversion_p
5463 (TREE_TYPE (TREE_TYPE (rhs)),
5464 TREE_TYPE (gimple_assign_rhs2 (assign)))
5465 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
5466 {
5467 /* We start matching at an insert to lane zero but since the
5468 inserts need not be ordered we have to search both
5469 the def and the use chains. */
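/* A hypothetical chain that would be matched here (SSA names made up),
   building a V4SI value:
     v_1 = BIT_INSERT_EXPR <v_0, s_0, 0>;
     v_2 = BIT_INSERT_EXPR <v_1, s_1, 32>;
     v_3 = BIT_INSERT_EXPR <v_2, s_2, 64>;
     v_4 = BIT_INSERT_EXPR <v_3, s_3, 96>;
   Starting from the lane-zero insert we follow the single-use chain
   forward and, if lanes are still missing, the rhs1 def chain backward,
   collecting one scalar def per lane.  */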
5470 tree vectype = TREE_TYPE (rhs);
5471 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5472 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
5473 auto_sbitmap lanes (nlanes);
5474 bitmap_clear (lanes);
5475 bitmap_set_bit (lanes, 0);
5476 tree def = gimple_assign_lhs (assign);
5477 lane_defs.quick_push
5478 (std::make_pair (0, gimple_assign_rhs2 (assign)));
5479 unsigned lanes_found = 1;
5480 /* Start with the use chains, the last stmt will be the root. */
5481 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
5482 vec<stmt_vec_info> roots = vNULL;
5483 roots.safe_push (last);
5484 do
5485 {
5486 use_operand_p use_p;
5487 gimple *use_stmt;
5488 if (!single_imm_use (def, &use_p, &use_stmt))
5489 break;
5490 unsigned this_lane;
5491 if (!bb_vinfo->lookup_stmt (use_stmt)
5492 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
5493 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
5494 break;
5495 if (bitmap_bit_p (lanes, this_lane))
5496 break;
5497 lanes_found++;
5498 bitmap_set_bit (lanes, this_lane);
5499 gassign *use_ass = as_a <gassign *> (use_stmt);
5500 lane_defs.quick_push (std::make_pair
5501 (this_lane, gimple_assign_rhs2 (use_ass)));
5502 last = bb_vinfo->lookup_stmt (use_ass);
5503 roots.safe_push (last);
5504 def = gimple_assign_lhs (use_ass);
5505 }
5506 while (lanes_found < nlanes);
5507 if (roots.length () > 1)
5508 std::swap(roots[0], roots[roots.length () - 1]);
5509 if (lanes_found < nlanes)
5510 {
5511 /* Now search the def chain. */
5512 def = gimple_assign_rhs1 (assign);
5513 do
5514 {
5515 if (TREE_CODE (def) != SSA_NAME
5516 || !has_single_use (def))
5517 break;
5518 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
5519 unsigned this_lane;
5520 if (!bb_vinfo->lookup_stmt (def_stmt)
5521 || !vect_slp_is_lane_insert (def_stmt,
5522 NULL_TREE, &this_lane)
5523 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
5524 break;
5525 if (bitmap_bit_p (lanes, this_lane))
5526 break;
5527 lanes_found++;
5528 bitmap_set_bit (lanes, this_lane);
5529 lane_defs.quick_push (std::make_pair
5530 (this_lane,
5531 gimple_assign_rhs2 (def_stmt)));
5532 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
5533 def = gimple_assign_rhs1 (def_stmt);
5534 }
5535 while (lanes_found < nlanes);
5536 }
5537 if (lanes_found == nlanes)
5538 {
5539 /* Sort lane_defs by the lane index and register the root. */
5540 lane_defs.qsort (vld_cmp);
5541 vec<stmt_vec_info> stmts;
5542 stmts.create (nlanes);
5543 for (unsigned i = 0; i < nlanes; ++i)
5544 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
5545 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
5546 stmts, roots));
5547 }
5548 else
5549 roots.release ();
5550 }
5551 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5552 && (associative_tree_code (code) || code == MINUS_EXPR)
5553 /* ??? The flag_associative_math and TYPE_OVERFLOW_WRAPS
5554 checks pessimize a two-element reduction. PR54400.
5555 ??? In-order reduction could be handled if we only
5556 traverse one operand chain in vect_slp_linearize_chain. */
5557 && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
5558 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
5559 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
5560 /* Ops with constants at the tail can be stripped here. */
5561 && TREE_CODE (rhs) == SSA_NAME
5562 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
5563 /* Should be the chain end. */
5564 && (!single_imm_use (gimple_assign_lhs (assign),
5565 &use_p, &use_stmt)
5566 || !is_gimple_assign (use_stmt)
5567 || (gimple_assign_rhs_code (use_stmt) != code
5568 && ((code != PLUS_EXPR && code != MINUS_EXPR)
5569 || (gimple_assign_rhs_code (use_stmt)
5570 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
5571 {
5572 /* We start the match at the end of a possible association
5573 chain. */
5574 auto_vec<chain_op_t> chain;
5575 auto_vec<std::pair<tree_code, gimple *> > worklist;
5576 auto_vec<gimple *> chain_stmts;
5577 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
5578 if (code == MINUS_EXPR)
5579 code = PLUS_EXPR;
5580 internal_fn reduc_fn;
5581 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
5582 || reduc_fn == IFN_LAST)
5583 continue;
5584 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
5585 /* ??? */
5586 code_stmt, alt_code_stmt, &chain_stmts);
5587 if (chain.length () > 1)
5588 {
5589 /* Sort the chain according to def_type and operation. */
5590 chain.sort (dt_sort_cmp, bb_vinfo);
5591 /* ??? Now we'd want to strip externals and constants
5592 but record those to be handled in the epilogue. */
5593 /* ??? For now do not allow mixing ops or externs/constants. */
5594 bool invalid = false;
5595 for (unsigned i = 0; i < chain.length (); ++i)
5596 if (chain[i].dt != vect_internal_def
5597 || chain[i].code != code)
5598 invalid = true;
5599 if (!invalid)
5600 {
5601 vec<stmt_vec_info> stmts;
5602 stmts.create (chain.length ());
5603 for (unsigned i = 0; i < chain.length (); ++i)
5604 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
5605 vec<stmt_vec_info> roots;
5606 roots.create (chain_stmts.length ());
5607 for (unsigned i = 0; i < chain_stmts.length (); ++i)
5608 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
5609 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
5610 stmts, roots));
5611 }
5612 }
5613 }
5614 }
5615 }
5616
5617 /* Walk the grouped store chains and replace entries with their
5618 pattern variant if any. */
5619
5620 static void
5621 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
5622 {
5623 stmt_vec_info first_element;
5624 unsigned i;
5625
5626 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
5627 {
5628 /* We also have CTORs in this array. */
5629 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
5630 continue;
5631 if (STMT_VINFO_IN_PATTERN_P (first_element))
5632 {
5633 stmt_vec_info orig = first_element;
5634 first_element = STMT_VINFO_RELATED_STMT (first_element);
5635 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
5636 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
5637 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
5638 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
5639 vinfo->grouped_stores[i] = first_element;
5640 }
5641 stmt_vec_info prev = first_element;
5642 while (DR_GROUP_NEXT_ELEMENT (prev))
5643 {
5644 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
5645 if (STMT_VINFO_IN_PATTERN_P (elt))
5646 {
5647 stmt_vec_info orig = elt;
5648 elt = STMT_VINFO_RELATED_STMT (elt);
5649 DR_GROUP_NEXT_ELEMENT (prev) = elt;
5650 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
5651 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
5652 }
5653 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
5654 prev = elt;
5655 }
5656 }
5657 }
5658
5659 /* Check if the region described by BB_VINFO can be vectorized, returning
5660 true if so. When returning false, set FATAL to true if the same failure
5661 would prevent vectorization at other vector sizes, false if it is still
5662 worth trying other sizes. N_STMTS is the number of statements in the
5663 region. */
5664
5665 static bool
5666 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
5667 vec<int> *dataref_groups)
5668 {
5669 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
5670
5671 slp_instance instance;
5672 int i;
5673 poly_uint64 min_vf = 2;
5674
5675 /* The first group of checks is independent of the vector size. */
5676 fatal = true;
5677
5678 /* Analyze the data references. */
5679
5680 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
5681 {
5682 if (dump_enabled_p ())
5683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5684 "not vectorized: unhandled data-ref in basic "
5685 "block.\n");
5686 return false;
5687 }
5688
5689 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
5690 {
5691 if (dump_enabled_p ())
5692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5693 "not vectorized: unhandled data access in "
5694 "basic block.\n");
5695 return false;
5696 }
5697
5698 vect_slp_check_for_constructors (bb_vinfo);
5699
5700 /* If there are no grouped stores and no constructors in the region
5701 there is no need to continue with pattern recog as vect_analyze_slp
5702 will fail anyway. */
5703 if (bb_vinfo->grouped_stores.is_empty ()
5704 && bb_vinfo->roots.is_empty ())
5705 {
5706 if (dump_enabled_p ())
5707 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5708 "not vectorized: no grouped stores in "
5709 "basic block.\n");
5710 return false;
5711 }
5712
5713 /* The rest of the analysis below depends on the vector size in some way. */
5714 fatal = false;
5715
5716 vect_pattern_recog (bb_vinfo);
5717
5718 /* Update store groups from pattern processing. */
5719 vect_fixup_store_groups_with_patterns (bb_vinfo);
5720
5721 /* Check the SLP opportunities in the basic block, analyze and build SLP
5722 trees. */
5723 if (!vect_analyze_slp (bb_vinfo, n_stmts))
5724 {
5725 if (dump_enabled_p ())
5726 {
5727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5728 "Failed to SLP the basic block.\n");
5729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5730 "not vectorized: failed to find SLP opportunities "
5731 "in basic block.\n");
5732 }
5733 return false;
5734 }
5735
5736 /* Optimize permutations. */
5737 vect_optimize_slp (bb_vinfo);
5738
5739 /* Gather the loads reachable from the SLP graph entries. */
5740 vect_gather_slp_loads (bb_vinfo);
5741
5742 vect_record_base_alignments (bb_vinfo);
5743
5744 /* Analyze and verify the alignment of data references and the
5745 dependence in the SLP instances. */
5746 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
5747 {
5748 vect_location = instance->location ();
5749 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
5750 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
5751 {
5752 slp_tree node = SLP_INSTANCE_TREE (instance);
5753 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5754 if (dump_enabled_p ())
5755 dump_printf_loc (MSG_NOTE, vect_location,
5756 "removing SLP instance operations starting from: %G",
5757 stmt_info->stmt);
5758 vect_free_slp_instance (instance);
5759 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
5760 continue;
5761 }
5762
5763 /* Mark all the statements that we want to vectorize as pure SLP and
5764 relevant. */
5765 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5766 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
5767 unsigned j;
5768 stmt_vec_info root;
5769 /* Likewise consider instance root stmts as vectorized. */
5770 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
5771 STMT_SLP_TYPE (root) = pure_slp;
5772
5773 i++;
5774 }
5775 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
5776 return false;
5777
5778 if (!vect_slp_analyze_operations (bb_vinfo))
5779 {
5780 if (dump_enabled_p ())
5781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5782 "not vectorized: bad operation in basic block.\n");
5783 return false;
5784 }
5785
5786 vect_bb_partition_graph (bb_vinfo);
5787
5788 return true;
5789 }
5790
5791 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
5792 basic blocks in BBS, returning true on success.
5793 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
5794
5795 static bool
5796 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
5797 vec<int> *dataref_groups, unsigned int n_stmts)
5798 {
5799 bb_vec_info bb_vinfo;
5800 auto_vector_modes vector_modes;
5801
5802 /* Autodetect first vector size we try. */
5803 machine_mode next_vector_mode = VOIDmode;
5804 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
5805 unsigned int mode_i = 0;
5806
5807 vec_info_shared shared;
5808
5809 machine_mode autodetected_vector_mode = VOIDmode;
5810 while (1)
5811 {
5812 bool vectorized = false;
5813 bool fatal = false;
5814 bb_vinfo = new _bb_vec_info (bbs, &shared);
5815
5816 bool first_time_p = shared.datarefs.is_empty ();
5817 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
5818 if (first_time_p)
5819 bb_vinfo->shared->save_datarefs ();
5820 else
5821 bb_vinfo->shared->check_datarefs ();
5822 bb_vinfo->vector_mode = next_vector_mode;
5823
5824 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
5825 {
5826 if (dump_enabled_p ())
5827 {
5828 dump_printf_loc (MSG_NOTE, vect_location,
5829 "***** Analysis succeeded with vector mode"
5830 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
5831 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
5832 }
5833
5834 bb_vinfo->shared->check_datarefs ();
5835
5836 unsigned i;
5837 slp_instance instance;
5838 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
5839 {
5840 if (instance->subgraph_entries.is_empty ())
5841 continue;
5842
5843 vect_location = instance->location ();
5844 if (!unlimited_cost_model (NULL)
5845 && !vect_bb_vectorization_profitable_p
5846 (bb_vinfo, instance->subgraph_entries))
5847 {
5848 if (dump_enabled_p ())
5849 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5850 "not vectorized: vectorization is not "
5851 "profitable.\n");
5852 continue;
5853 }
5854
5855 if (!dbg_cnt (vect_slp))
5856 continue;
5857
5858 if (!vectorized && dump_enabled_p ())
5859 dump_printf_loc (MSG_NOTE, vect_location,
5860 "Basic block will be vectorized "
5861 "using SLP\n");
5862 vectorized = true;
5863
5864 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
5865
5866 unsigned HOST_WIDE_INT bytes;
5867 if (dump_enabled_p ())
5868 {
5869 if (GET_MODE_SIZE
5870 (bb_vinfo->vector_mode).is_constant (&bytes))
5871 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5872 "basic block part vectorized using %wu "
5873 "byte vectors\n", bytes);
5874 else
5875 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5876 "basic block part vectorized using "
5877 "variable length vectors\n");
5878 }
5879 }
5880 }
5881 else
5882 {
5883 if (dump_enabled_p ())
5884 dump_printf_loc (MSG_NOTE, vect_location,
5885 "***** Analysis failed with vector mode %s\n",
5886 GET_MODE_NAME (bb_vinfo->vector_mode));
5887 }
5888
5889 if (mode_i == 0)
5890 autodetected_vector_mode = bb_vinfo->vector_mode;
5891
5892 if (!fatal)
5893 while (mode_i < vector_modes.length ()
5894 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
5895 {
5896 if (dump_enabled_p ())
5897 dump_printf_loc (MSG_NOTE, vect_location,
5898 "***** The result for vector mode %s would"
5899 " be the same\n",
5900 GET_MODE_NAME (vector_modes[mode_i]));
5901 mode_i += 1;
5902 }
5903
5904 delete bb_vinfo;
5905
5906 if (mode_i < vector_modes.length ()
5907 && VECTOR_MODE_P (autodetected_vector_mode)
5908 && (related_vector_mode (vector_modes[mode_i],
5909 GET_MODE_INNER (autodetected_vector_mode))
5910 == autodetected_vector_mode)
5911 && (related_vector_mode (autodetected_vector_mode,
5912 GET_MODE_INNER (vector_modes[mode_i]))
5913 == vector_modes[mode_i]))
5914 {
5915 if (dump_enabled_p ())
5916 dump_printf_loc (MSG_NOTE, vect_location,
5917 "***** Skipping vector mode %s, which would"
5918 " repeat the analysis for %s\n",
5919 GET_MODE_NAME (vector_modes[mode_i]),
5920 GET_MODE_NAME (autodetected_vector_mode));
5921 mode_i += 1;
5922 }
5923
5924 if (vectorized
5925 || mode_i == vector_modes.length ()
5926 || autodetected_vector_mode == VOIDmode
5927 /* If vect_slp_analyze_bb_1 signaled that analysis for all
5928 vector sizes will fail do not bother iterating. */
5929 || fatal)
5930 return vectorized;
5931
5932 /* Try the next biggest vector size. */
5933 next_vector_mode = vector_modes[mode_i++];
5934 if (dump_enabled_p ())
5935 dump_printf_loc (MSG_NOTE, vect_location,
5936 "***** Re-trying analysis with vector mode %s\n",
5937 GET_MODE_NAME (next_vector_mode));
5938 }
5939 }
5940
5941
5942 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
5943 true if anything in the basic blocks was vectorized. */
5944
5945 static bool
5946 vect_slp_bbs (const vec<basic_block> &bbs)
5947 {
5948 vec<data_reference_p> datarefs = vNULL;
5949 auto_vec<int> dataref_groups;
5950 int insns = 0;
5951 int current_group = 0;
5952
5953 for (unsigned i = 0; i < bbs.length (); i++)
5954 {
5955 basic_block bb = bbs[i];
5956 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
5957 gsi_next (&gsi))
5958 {
5959 gimple *stmt = gsi_stmt (gsi);
5960 if (is_gimple_debug (stmt))
5961 continue;
5962
5963 insns++;
5964
5965 if (gimple_location (stmt) != UNKNOWN_LOCATION)
5966 vect_location = stmt;
5967
5968 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
5969 &dataref_groups, current_group))
5970 ++current_group;
5971 }
5972 }
5973
5974 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
5975 }
5976
5977 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5978 true if anything in the basic-block was vectorized. */
5979
5980 bool
5981 vect_slp_bb (basic_block bb)
5982 {
5983 auto_vec<basic_block> bbs;
5984 bbs.safe_push (bb);
5985 return vect_slp_bbs (bbs);
5986 }
5987
5988 /* Main entry for the BB vectorizer. Analyze and transform the basic
5989 blocks of FUN, returns true if anything was vectorized. */
5990
5991 bool
5992 vect_slp_function (function *fun)
5993 {
5994 bool r = false;
5995 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
5996 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
5997
5998 /* For the moment split the function into pieces to avoid making
5999 the iteration on the vector mode moot. Split at points we know
6000 to not handle well which is CFG merges (SLP discovery doesn't
6001 handle non-loop-header PHIs) and loop exits. Since pattern
6002 recog requires reverse iteration to visit uses before defs
6003 simply chop RPO into pieces. */
6004 auto_vec<basic_block> bbs;
6005 for (unsigned i = 0; i < n; i++)
6006 {
6007 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
6008 bool split = false;
6009
6010 /* Split when a BB is not dominated by the first block. */
6011 if (!bbs.is_empty ()
6012 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
6013 {
6014 if (dump_enabled_p ())
6015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6016 "splitting region at dominance boundary bb%d\n",
6017 bb->index);
6018 split = true;
6019 }
6020 /* Split when the loop determined by the first block
6021 is exited. This is because we eventually insert
6022 invariants at region begin. */
6023 else if (!bbs.is_empty ()
6024 && bbs[0]->loop_father != bb->loop_father
6025 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
6026 {
6027 if (dump_enabled_p ())
6028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6029 "splitting region at loop %d exit at bb%d\n",
6030 bbs[0]->loop_father->num, bb->index);
6031 split = true;
6032 }
6033
6034 if (split && !bbs.is_empty ())
6035 {
6036 r |= vect_slp_bbs (bbs);
6037 bbs.truncate (0);
6038 bbs.quick_push (bb);
6039 }
6040 else
6041 bbs.safe_push (bb);
6042
6043 /* When the stmt ending this block defines a value, inserting after
6044 it for a vector containing its definition would require inserting
6045 on edges. Avoid this for now. */
6046 if (gimple *last = last_stmt (bb))
6047 if (gimple_get_lhs (last)
6048 && is_ctrl_altering_stmt (last))
6049 {
6050 if (dump_enabled_p ())
6051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6052 "splitting region at control altering "
6053 "definition %G", last);
6054 r |= vect_slp_bbs (bbs);
6055 bbs.truncate (0);
6056 }
6057 }
6058
6059 if (!bbs.is_empty ())
6060 r |= vect_slp_bbs (bbs);
6061
6062 free (rpo);
6063
6064 return r;
6065 }
6066
6067 /* Build a variable-length vector in which the elements in ELTS are repeated
6068 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
6069 RESULTS and add any new instructions to SEQ.
6070
6071 The approach we use is:
6072
6073 (1) Find a vector mode VM with integer elements of mode IM.
6074
6075 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6076 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
6077 from small vectors to IM.
6078
6079 (3) Duplicate each ELTS'[I] into a vector of mode VM.
6080
6081 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
6082 correct byte contents.
6083
6084 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
6085
6086 We try to find the largest IM for which this sequence works, in order
6087 to cut down on the number of interleaves. */
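/* A hedged worked example (modes and values assumed, not taken from a
   particular target): duplicating four 32-bit ints {a, b, c, d} with
   IM == DImode gives NVECTORS == 2 and PARTIAL_NELTS == 2; step (2)
   view-converts {a, b} and {c, d} into two DImode scalars, step (3)
   broadcasts each across a DImode vector, and step (4) interleaves the
   two so the byte pattern a, b, c, d repeats across the whole
   variable-length result, which step (5) view-converts back to
   VECTOR_TYPE.  */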
6088
6089 void
6090 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
6091 const vec<tree> &elts, unsigned int nresults,
6092 vec<tree> &results)
6093 {
6094 unsigned int nelts = elts.length ();
6095 tree element_type = TREE_TYPE (vector_type);
6096
6097 /* (1) Find a vector mode VM with integer elements of mode IM. */
6098 unsigned int nvectors = 1;
6099 tree new_vector_type;
6100 tree permutes[2];
6101 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
6102 &nvectors, &new_vector_type,
6103 permutes))
6104 gcc_unreachable ();
6105
6106 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
6107 unsigned int partial_nelts = nelts / nvectors;
6108 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
6109
6110 tree_vector_builder partial_elts;
6111 auto_vec<tree, 32> pieces (nvectors * 2);
6112 pieces.quick_grow_cleared (nvectors * 2);
6113 for (unsigned int i = 0; i < nvectors; ++i)
6114 {
6115 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6116 ELTS' has mode IM. */
6117 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
6118 for (unsigned int j = 0; j < partial_nelts; ++j)
6119 partial_elts.quick_push (elts[i * partial_nelts + j]);
6120 tree t = gimple_build_vector (seq, &partial_elts);
6121 t = gimple_build (seq, VIEW_CONVERT_EXPR,
6122 TREE_TYPE (new_vector_type), t);
6123
6124 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
6125 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
6126 }
6127
6128 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
6129 correct byte contents.
6130
6131 Conceptually, we need to repeat the following operation log2(nvectors)
6132 times, where hi_start = nvectors / 2:
6133
6134 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
6135 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
6136
6137 However, if each input repeats every N elements and the VF is
6138 a multiple of N * 2, the HI result is the same as the LO result.
6139 This will be true for the first N1 iterations of the outer loop,
6140 followed by N2 iterations for which both the LO and HI results
6141 are needed. I.e.:
6142
6143 N1 + N2 = log2(nvectors)
6144
6145 Each "N1 iteration" doubles the number of redundant vectors and the
6146 effect of the process as a whole is to have a sequence of nvectors/2**N1
6147 vectors that repeats 2**N1 times. Rather than generate these redundant
6148 vectors, we halve the number of vectors for each N1 iteration. */
6149 unsigned int in_start = 0;
6150 unsigned int out_start = nvectors;
6151 unsigned int new_nvectors = nvectors;
6152 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
6153 {
6154 unsigned int hi_start = new_nvectors / 2;
6155 unsigned int out_i = 0;
6156 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
6157 {
6158 if ((in_i & 1) != 0
6159 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
6160 2 * in_repeat))
6161 continue;
6162
6163 tree output = make_ssa_name (new_vector_type);
6164 tree input1 = pieces[in_start + (in_i / 2)];
6165 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
6166 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
6167 input1, input2,
6168 permutes[in_i & 1]);
6169 gimple_seq_add_stmt (seq, stmt);
6170 pieces[out_start + out_i] = output;
6171 out_i += 1;
6172 }
6173 std::swap (in_start, out_start);
6174 new_nvectors = out_i;
6175 }
6176
6177 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
6178 results.reserve (nresults);
6179 for (unsigned int i = 0; i < nresults; ++i)
6180 if (i < new_nvectors)
6181 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
6182 pieces[in_start + i]));
6183 else
6184 results.quick_push (results[i - new_nvectors]);
6185 }
6186
6187
6188 /* For constant and loop invariant defs in OP_NODE this function creates
6189 vector defs that will be used in the vectorized stmts and stores them
6190 to SLP_TREE_VEC_DEFS of OP_NODE. */
6191
6192 static void
6193 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
6194 {
6195 unsigned HOST_WIDE_INT nunits;
6196 tree vec_cst;
6197 unsigned j, number_of_places_left_in_vector;
6198 tree vector_type;
6199 tree vop;
6200 int group_size = op_node->ops.length ();
6201 unsigned int vec_num, i;
6202 unsigned number_of_copies = 1;
6203 bool constant_p;
6204 gimple_seq ctor_seq = NULL;
6205 auto_vec<tree, 16> permute_results;
6206
6207 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
6208 vector_type = SLP_TREE_VECTYPE (op_node);
6209
6210 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
6211 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
6212 auto_vec<tree> voprnds (number_of_vectors);
6213
6214 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
6215 created vectors. It is greater than 1 if unrolling is performed.
6216
6217 For example, we have two scalar operands, s1 and s2 (e.g., group of
6218 strided accesses of size two), while NUNITS is four (i.e., four scalars
6219 of this type can be packed in a vector). The output vector will contain
6220 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
6221 will be 2).
6222
6223 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
6224 containing the operands.
6225
6226 For example, NUNITS is four as before, and the group size is 8
6227 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
6228 {s5, s6, s7, s8}. */
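/* Checking the formula against the examples above (illustrative only): with
   NUNITS == 4, NUMBER_OF_VECTORS == 1 and GROUP_SIZE == 2 the computation of
   NUMBER_OF_COPIES below yields 4 * 1 / 2 == 2; with GROUP_SIZE == 8 and
   NUMBER_OF_VECTORS == 2 it yields 4 * 2 / 8 == 1.  */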
6229
6230 /* When using duplicate_and_interleave, we just need one element for
6231 each scalar statement. */
6232 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
6233 nunits = group_size;
6234
6235 number_of_copies = nunits * number_of_vectors / group_size;
6236
6237 number_of_places_left_in_vector = nunits;
6238 constant_p = true;
6239 tree_vector_builder elts (vector_type, nunits, 1);
6240 elts.quick_grow (nunits);
6241 stmt_vec_info insert_after = NULL;
6242 for (j = 0; j < number_of_copies; j++)
6243 {
6244 tree op;
6245 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
6246 {
6247 /* Create 'vect_ = {op0,op1,...,opn}'. */
6248 number_of_places_left_in_vector--;
6249 tree orig_op = op;
6250 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
6251 {
6252 if (CONSTANT_CLASS_P (op))
6253 {
6254 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6255 {
6256 /* Can't use VIEW_CONVERT_EXPR for booleans because
6257 of possibly different sizes of scalar value and
6258 vector element. */
6259 if (integer_zerop (op))
6260 op = build_int_cst (TREE_TYPE (vector_type), 0);
6261 else if (integer_onep (op))
6262 op = build_all_ones_cst (TREE_TYPE (vector_type));
6263 else
6264 gcc_unreachable ();
6265 }
6266 else
6267 op = fold_unary (VIEW_CONVERT_EXPR,
6268 TREE_TYPE (vector_type), op);
6269 gcc_assert (op && CONSTANT_CLASS_P (op));
6270 }
6271 else
6272 {
6273 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
6274 gimple *init_stmt;
6275 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6276 {
6277 tree true_val
6278 = build_all_ones_cst (TREE_TYPE (vector_type));
6279 tree false_val
6280 = build_zero_cst (TREE_TYPE (vector_type));
6281 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
6282 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
6283 op, true_val,
6284 false_val);
6285 }
6286 else
6287 {
6288 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
6289 op);
6290 init_stmt
6291 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
6292 op);
6293 }
6294 gimple_seq_add_stmt (&ctor_seq, init_stmt);
6295 op = new_temp;
6296 }
6297 }
6298 elts[number_of_places_left_in_vector] = op;
6299 if (!CONSTANT_CLASS_P (op))
6300 constant_p = false;
6301 /* For BB vectorization we have to compute an insert location
6302 when a def is inside the analyzed region since we cannot
6303 simply insert at the BB start in this case. */
6304 stmt_vec_info opdef;
6305 if (TREE_CODE (orig_op) == SSA_NAME
6306 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
6307 && is_a <bb_vec_info> (vinfo)
6308 && (opdef = vinfo->lookup_def (orig_op)))
6309 {
6310 if (!insert_after)
6311 insert_after = opdef;
6312 else
6313 insert_after = get_later_stmt (insert_after, opdef);
6314 }
6315
6316 if (number_of_places_left_in_vector == 0)
6317 {
6318 if (constant_p
6319 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
6320 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
6321 vec_cst = gimple_build_vector (&ctor_seq, &elts);
6322 else
6323 {
6324 if (permute_results.is_empty ())
6325 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
6326 elts, number_of_vectors,
6327 permute_results);
6328 vec_cst = permute_results[number_of_vectors - j - 1];
6329 }
6330 if (!gimple_seq_empty_p (ctor_seq))
6331 {
6332 if (insert_after)
6333 {
6334 gimple_stmt_iterator gsi;
6335 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
6336 {
6337 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
6338 gsi_insert_seq_before (&gsi, ctor_seq,
6339 GSI_CONTINUE_LINKING);
6340 }
6341 else if (!stmt_ends_bb_p (insert_after->stmt))
6342 {
6343 gsi = gsi_for_stmt (insert_after->stmt);
6344 gsi_insert_seq_after (&gsi, ctor_seq,
6345 GSI_CONTINUE_LINKING);
6346 }
6347 else
6348 {
6349 /* When we want to insert after a def where the
6350 defining stmt throws then insert on the fallthru
6351 edge. */
6352 edge e = find_fallthru_edge
6353 (gimple_bb (insert_after->stmt)->succs);
6354 basic_block new_bb
6355 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
6356 gcc_assert (!new_bb);
6357 }
6358 }
6359 else
6360 vinfo->insert_seq_on_entry (NULL, ctor_seq);
6361 ctor_seq = NULL;
6362 }
6363 voprnds.quick_push (vec_cst);
6364 insert_after = NULL;
6365 number_of_places_left_in_vector = nunits;
6366 constant_p = true;
6367 elts.new_vector (vector_type, nunits, 1);
6368 elts.quick_grow (nunits);
6369 }
6370 }
6371 }
6372
6373 /* Since the vectors are created in the reverse order, we should invert
6374 them. */
6375 vec_num = voprnds.length ();
6376 for (j = vec_num; j != 0; j--)
6377 {
6378 vop = voprnds[j - 1];
6379 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6380 }
6381
6382 /* In case that VF is greater than the unrolling factor needed for the SLP
6383 group of stmts, NUMBER_OF_VECTORS to be created is greater than
6384 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
6385 to replicate the vectors. */
6386 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
6387 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
6388 i++)
6389 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6390 }
6391
6392 /* Get the Ith vectorized definition from SLP_NODE. */
6393
6394 tree
6395 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
6396 {
6397 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
6398 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
6399 else
6400 return SLP_TREE_VEC_DEFS (slp_node)[i];
6401 }
6402
6403 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
6404
6405 void
6406 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
6407 {
6408 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
6409 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
6410 {
6411 unsigned j;
6412 gimple *vec_def_stmt;
6413 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
6414 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
6415 }
6416 else
6417 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
6418 }
6419
6420 /* Get N vectorized definitions for SLP_NODE. */
6421
6422 void
6423 vect_get_slp_defs (vec_info *,
6424 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
6425 {
6426 if (n == -1U)
6427 n = SLP_TREE_CHILDREN (slp_node).length ();
6428
6429 for (unsigned i = 0; i < n; ++i)
6430 {
6431 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
6432 vec<tree> vec_defs = vNULL;
6433 vect_get_slp_defs (child, &vec_defs);
6434 vec_oprnds->quick_push (vec_defs);
6435 }
6436 }
6437
6438 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6439 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6440 permute statements for the SLP node NODE. Store the number of vector
6441 permute instructions in *N_PERMS and the number of vector load
6442 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
6443 that were not needed. */
6444
6445 bool
6446 vect_transform_slp_perm_load (vec_info *vinfo,
6447 slp_tree node, const vec<tree> &dr_chain,
6448 gimple_stmt_iterator *gsi, poly_uint64 vf,
6449 bool analyze_only, unsigned *n_perms,
6450 unsigned int *n_loads, bool dce_chain)
6451 {
6452 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6453 int vec_index = 0;
6454 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6455 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
6456 unsigned int mask_element;
6457 machine_mode mode;
6458
6459 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
6460 return false;
6461
6462 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6463
6464 mode = TYPE_MODE (vectype);
6465 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6466
6467 /* Initialize the vect stmts of NODE to properly insert the generated
6468 stmts later. */
6469 if (! analyze_only)
6470 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
6471 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6472 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
6473
6474 /* Generate permutation masks for every NODE. Number of masks for each NODE
6475 is equal to GROUP_SIZE.
6476 E.g., we have a group of three nodes with three loads from the same
6477 location in each node, and the vector size is 4. I.e., we have an
6478 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6479 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6480 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6481 ...
6482
6483 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
6484 The last mask is illegal since we assume two operands for permute
6485 operation, and the mask element values can't be outside that range.
6486 Hence, the last mask must be converted into {2,5,5,5}.
6487 For the first two permutations we need the first and the second input
6488 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6489 we need the second and the third vectors: {b1,c1,a2,b2} and
6490 {c2,a3,b3,c3}. */
6491
6492 int vect_stmts_counter = 0;
6493 unsigned int index = 0;
6494 int first_vec_index = -1;
6495 int second_vec_index = -1;
6496 bool noop_p = true;
6497 *n_perms = 0;
6498
6499 vec_perm_builder mask;
6500 unsigned int nelts_to_build;
6501 unsigned int nvectors_per_build;
6502 unsigned int in_nlanes;
6503 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
6504 && multiple_p (nunits, group_size));
6505 if (repeating_p)
6506 {
6507 /* A single vector contains a whole number of copies of the node, so:
6508 (a) all permutes can use the same mask; and
6509 (b) the permutes only need a single vector input. */
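/* Illustrative example (sizes assumed): for a two-element group with load
   permutation {1, 0} the loop below builds the encoded elements
   {1, 0, 3, 2, 5, 4}, a stepped pattern that the encoding extends to
   {1, 0, 3, 2, 5, 4, 7, 6, ...} for any vector length.  */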
6510 mask.new_vector (nunits, group_size, 3);
6511 nelts_to_build = mask.encoded_nelts ();
6512 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
6513 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
6514 }
6515 else
6516 {
6517 /* We need to construct a separate mask for each vector statement. */
6518 unsigned HOST_WIDE_INT const_nunits, const_vf;
6519 if (!nunits.is_constant (&const_nunits)
6520 || !vf.is_constant (&const_vf))
6521 return false;
6522 mask.new_vector (const_nunits, const_nunits, 1);
6523 nelts_to_build = const_vf * group_size;
6524 nvectors_per_build = 1;
6525 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
6526 }
6527 auto_sbitmap used_in_lanes (in_nlanes);
6528 bitmap_clear (used_in_lanes);
6529 auto_bitmap used_defs;
6530
6531 unsigned int count = mask.encoded_nelts ();
6532 mask.quick_grow (count);
6533 vec_perm_indices indices;
6534
6535 for (unsigned int j = 0; j < nelts_to_build; j++)
6536 {
6537 unsigned int iter_num = j / group_size;
6538 unsigned int stmt_num = j % group_size;
6539 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
6540 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
6541 bitmap_set_bit (used_in_lanes, i);
6542 if (repeating_p)
6543 {
6544 first_vec_index = 0;
6545 mask_element = i;
6546 }
6547 else
6548 {
6549 /* Enforced before the loop when !repeating_p. */
6550 unsigned int const_nunits = nunits.to_constant ();
6551 vec_index = i / const_nunits;
6552 mask_element = i % const_nunits;
6553 if (vec_index == first_vec_index
6554 || first_vec_index == -1)
6555 {
6556 first_vec_index = vec_index;
6557 }
6558 else if (vec_index == second_vec_index
6559 || second_vec_index == -1)
6560 {
6561 second_vec_index = vec_index;
6562 mask_element += const_nunits;
6563 }
6564 else
6565 {
6566 if (dump_enabled_p ())
6567 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6568 "permutation requires at "
6569 "least three vectors %G",
6570 stmt_info->stmt);
6571 gcc_assert (analyze_only);
6572 return false;
6573 }
6574
6575 gcc_assert (mask_element < 2 * const_nunits);
6576 }
6577
6578 if (mask_element != index)
6579 noop_p = false;
6580 mask[index++] = mask_element;
6581
6582 if (index == count && !noop_p)
6583 {
6584 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
6585 if (!can_vec_perm_const_p (mode, indices))
6586 {
6587 if (dump_enabled_p ())
6588 {
6589 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
6590 vect_location,
6591 "unsupported vect permute { ");
6592 for (i = 0; i < count; ++i)
6593 {
6594 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
6595 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
6596 }
6597 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
6598 }
6599 gcc_assert (analyze_only);
6600 return false;
6601 }
6602
6603 ++*n_perms;
6604 }
6605
6606 if (index == count)
6607 {
6608 if (!analyze_only)
6609 {
6610 tree mask_vec = NULL_TREE;
6611
6612 if (! noop_p)
6613 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
6614
6615 if (second_vec_index == -1)
6616 second_vec_index = first_vec_index;
6617
6618 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
6619 {
6620 /* Generate the permute statement if necessary. */
6621 tree first_vec = dr_chain[first_vec_index + ri];
6622 tree second_vec = dr_chain[second_vec_index + ri];
6623 gimple *perm_stmt;
6624 if (! noop_p)
6625 {
6626 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6627 tree perm_dest
6628 = vect_create_destination_var (gimple_assign_lhs (stmt),
6629 vectype);
6630 perm_dest = make_ssa_name (perm_dest);
6631 perm_stmt
6632 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6633 first_vec, second_vec,
6634 mask_vec);
6635 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
6636 gsi);
6637 if (dce_chain)
6638 {
6639 bitmap_set_bit (used_defs, first_vec_index + ri);
6640 bitmap_set_bit (used_defs, second_vec_index + ri);
6641 }
6642 }
6643 else
6644 {
6645 /* If mask was NULL_TREE generate the requested
6646 identity transform. */
6647 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
6648 if (dce_chain)
6649 bitmap_set_bit (used_defs, first_vec_index + ri);
6650 }
6651
6652 /* Store the vector statement in NODE. */
6653 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
6654 }
6655 }
6656
6657 index = 0;
6658 first_vec_index = -1;
6659 second_vec_index = -1;
6660 noop_p = true;
6661 }
6662 }
6663
6664 if (n_loads)
6665 {
6666 if (repeating_p)
6667 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6668 else
6669 {
6670 /* Enforced above when !repeating_p. */
6671 unsigned int const_nunits = nunits.to_constant ();
6672 *n_loads = 0;
6673 bool load_seen = false;
6674 for (unsigned i = 0; i < in_nlanes; ++i)
6675 {
6676 if (i % const_nunits == 0)
6677 {
6678 if (load_seen)
6679 *n_loads += 1;
6680 load_seen = false;
6681 }
6682 if (bitmap_bit_p (used_in_lanes, i))
6683 load_seen = true;
6684 }
6685 if (load_seen)
6686 *n_loads += 1;
6687 }
6688 }
6689
6690 if (dce_chain)
6691 for (unsigned i = 0; i < dr_chain.length (); ++i)
6692 if (!bitmap_bit_p (used_defs, i))
6693 {
6694 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
6695 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
6696 gsi_remove (&rgsi, true);
6697 release_defs (stmt);
6698 }
6699
6700 return true;
6701 }
6702
6703 /* Produce the next vector result for SLP permutation NODE by adding a vector
6704 statement at GSI. If MASK_VEC is nonnull, add:
6705
6706 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
6707
6708 otherwise add:
6709
6710 <new SSA name> = FIRST_DEF. */
6711
6712 static void
6713 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6714 slp_tree node, tree first_def, tree second_def,
6715 tree mask_vec)
6716 {
6717 tree vectype = SLP_TREE_VECTYPE (node);
6718
6719 /* ??? We SLP match existing vector element extracts but
6720 allow punning which we need to re-instantiate at uses
6721 but have no good way of explicitly representing. */
6722 if (!types_compatible_p (TREE_TYPE (first_def), vectype))
6723 {
6724 gassign *conv_stmt
6725 = gimple_build_assign (make_ssa_name (vectype),
6726 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
6727 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6728 first_def = gimple_assign_lhs (conv_stmt);
6729 }
6730 gassign *perm_stmt;
6731 tree perm_dest = make_ssa_name (vectype);
6732 if (mask_vec)
6733 {
6734 if (!types_compatible_p (TREE_TYPE (second_def), vectype))
6735 {
6736 gassign *conv_stmt
6737 = gimple_build_assign (make_ssa_name (vectype),
6738 build1 (VIEW_CONVERT_EXPR,
6739 vectype, second_def));
6740 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6741 second_def = gimple_assign_lhs (conv_stmt);
6742 }
6743 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6744 first_def, second_def,
6745 mask_vec);
6746 }
6747 else
6748 /* We need a copy here in case the def was external. */
6749 perm_stmt = gimple_build_assign (perm_dest, first_def);
6750 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
6751 /* Store the vector statement in NODE. */
6752 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
6753 }
6754
6755 /* Vectorize the SLP permutations in NODE as specified
6756 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
6757 child number and lane number.
6758 Interleaving of two two-lane two-child SLP subtrees (not supported):
6759 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
6760 A blend of two four-lane two-child SLP subtrees:
6761 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
6762 Highpart of a four-lane one-child SLP subtree (not supported):
6763 [ { 0, 2 }, { 0, 3 } ]
6764 Currently only a subset of these is supported by the code generation below. */
6765
6766 static bool
6767 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6768 slp_tree node, stmt_vector_for_cost *cost_vec)
6769 {
6770 tree vectype = SLP_TREE_VECTYPE (node);
6771
6772 /* ??? We currently only support all same vector input and output types
6773 while the SLP IL should really do a concat + select and thus accept
6774 arbitrary mismatches. */
6775 slp_tree child;
6776 unsigned i;
6777 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6778 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
6779 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6780 {
6781 if (!vect_maybe_update_slp_op_vectype (child, vectype)
6782 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
6783 {
6784 if (dump_enabled_p ())
6785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6786 "Unsupported lane permutation\n");
6787 return false;
6788 }
6789 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
6790 repeating_p = false;
6791 }
6792
6793 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
6794 gcc_assert (perm.length () == SLP_TREE_LANES (node));
6795 if (dump_enabled_p ())
6796 {
6797 dump_printf_loc (MSG_NOTE, vect_location,
6798 "vectorizing permutation");
6799 for (unsigned i = 0; i < perm.length (); ++i)
6800 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
6801 if (repeating_p)
6802 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
6803 dump_printf (MSG_NOTE, "\n");
6804 }
6805
6806 /* REPEATING_P is true if every output vector is guaranteed to use the
6807 same permute vector. We can handle that case for both variable-length
6808 and constant-length vectors, but we only handle other cases for
6809 constant-length vectors.
6810
6811 Set:
6812
6813 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
6814 mask vector that we want to build.
6815
6816 - NCOPIES to the number of copies of PERM that we need in order
6817 to build the necessary permute mask vectors.
6818
6819 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
6820 for each permute mask vector. This is only relevant when GSI is
6821 nonnull. */
6822 uint64_t npatterns;
6823 unsigned nelts_per_pattern;
6824 uint64_t ncopies;
6825 unsigned noutputs_per_mask;
6826 if (repeating_p)
6827 {
6828 /* We need a single permute mask vector that has the form:
6829
6830 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
6831
6832 In other words, the original n-element permute in PERM is
6833 "unrolled" to fill a full vector. The stepped vector encoding
6834 that we use for permutes requires 3n elements. */
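      /* For instance (illustrative): a two-lane swap PERM = { op0[1], op0[0] }
	 unrolls to the encoded mask { 1, 0, 3, 2, 5, 4 } (n == 2, three
	 groups), from which the mask for the actual number of vector elements
	 is extrapolated.  */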
6835 npatterns = SLP_TREE_LANES (node);
6836 nelts_per_pattern = ncopies = 3;
6837 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6838 }
6839 else
6840 {
6841 /* Calculate every element of every permute mask vector explicitly,
6842 instead of relying on the pattern described above. */
6843 if (!nunits.is_constant (&npatterns))
6844 return false;
6845 nelts_per_pattern = ncopies = 1;
6846 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
6847 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
6848 return false;
6849 noutputs_per_mask = 1;
6850 }
6851 unsigned olanes = ncopies * SLP_TREE_LANES (node);
6852 gcc_assert (repeating_p || multiple_p (olanes, nunits));
6853
6854   /* Compute the { { SLP operand, vector index }, lane } permutation sequence
6855      from the { SLP operand, scalar lane } permutation as recorded in the
6856      SLP node as an intermediate step.  This part should already work
6857      with SLP children that have an arbitrary number of lanes.  */
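  /* Example with hypothetical numbers: for a V4SI child (vnunits == 4),
     scalar lane 2 in the second copy (active lane offset 4) maps to
     { vector index (4 + 2) / 4 == 1, vector lane (4 + 2) % 4 == 2 }.  */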
6858 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
6859 auto_vec<unsigned> active_lane;
6860 vperm.create (olanes);
6861 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
6862 for (unsigned i = 0; i < ncopies; ++i)
6863 {
6864 for (unsigned pi = 0; pi < perm.length (); ++pi)
6865 {
6866 std::pair<unsigned, unsigned> p = perm[pi];
6867 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
6868 if (repeating_p)
6869 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
6870 else
6871 {
6872 /* We checked above that the vectors are constant-length. */
6873 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
6874 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
6875 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
6876 vperm.quick_push ({{p.first, vi}, vl});
6877 }
6878 }
6879 /* Advance to the next group. */
6880 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
6881 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
6882 }
6883
6884 if (dump_enabled_p ())
6885 {
6886 dump_printf_loc (MSG_NOTE, vect_location, "as");
6887 for (unsigned i = 0; i < vperm.length (); ++i)
6888 {
6889 if (i != 0
6890 && (repeating_p
6891 ? multiple_p (i, npatterns)
6892 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
6893 dump_printf (MSG_NOTE, ",");
6894 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
6895 vperm[i].first.first, vperm[i].first.second,
6896 vperm[i].second);
6897 }
6898 dump_printf (MSG_NOTE, "\n");
6899 }
6900
6901   /* We can only handle two-vector permutes; everything else should
6902      be lowered on the SLP level.  The following is closely inspired
6903      by vect_transform_slp_perm_load and is supposed to eventually
6904      replace it.
6905      ??? As an intermediate step, do code-gen in the SLP tree
6906      representation somehow?  */
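  /* E.g. (illustrative): with V4SI vectors a lane permutation like
     [ { 0, 0 }, { 1, 0 }, { 2, 0 }, { 0, 1 } ] would need lanes from three
     distinct input vectors for a single output and is rejected below with
     "permutation requires at least three vectors".  */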
6907 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
6908 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
6909 unsigned int index = 0;
6910 poly_uint64 mask_element;
6911 vec_perm_builder mask;
6912 mask.new_vector (nunits, npatterns, nelts_per_pattern);
6913 unsigned int count = mask.encoded_nelts ();
6914 mask.quick_grow (count);
6915 vec_perm_indices indices;
6916 unsigned nperms = 0;
6917 for (unsigned i = 0; i < vperm.length (); ++i)
6918 {
6919 mask_element = vperm[i].second;
6920 if (first_vec.first == -1U
6921 || first_vec == vperm[i].first)
6922 first_vec = vperm[i].first;
6923 else if (second_vec.first == -1U
6924 || second_vec == vperm[i].first)
6925 {
6926 second_vec = vperm[i].first;
6927 mask_element += nunits;
6928 }
6929 else
6930 {
6931 if (dump_enabled_p ())
6932 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6933 "permutation requires at "
6934 "least three vectors\n");
6935 gcc_assert (!gsi);
6936 return false;
6937 }
6938
6939 mask[index++] = mask_element;
6940
6941 if (index == count)
6942 {
6943 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits);
6944 bool identity_p = indices.series_p (0, 1, 0, 1);
6945 if (!identity_p
6946 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6947 {
6948 if (dump_enabled_p ())
6949 {
6950 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
6951 vect_location,
6952 "unsupported vect permute { ");
6953 for (i = 0; i < count; ++i)
6954 {
6955 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
6956 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
6957 }
6958 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
6959 }
6960 gcc_assert (!gsi);
6961 return false;
6962 }
6963
6964 if (!identity_p)
6965 nperms++;
6966 if (gsi)
6967 {
6968 if (second_vec.first == -1U)
6969 second_vec = first_vec;
6970
6971 slp_tree
6972 first_node = SLP_TREE_CHILDREN (node)[first_vec.first],
6973 second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
6974
6975 tree mask_vec = NULL_TREE;
6976 if (!identity_p)
6977 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
6978
6979 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
6980 {
6981 tree first_def
6982 = vect_get_slp_vect_def (first_node,
6983 first_vec.second + vi);
6984 tree second_def
6985 = vect_get_slp_vect_def (second_node,
6986 second_vec.second + vi);
6987 vect_add_slp_permutation (vinfo, gsi, node, first_def,
6988 second_def, mask_vec);
6989 }
6990 }
6991
6992 index = 0;
6993 first_vec = std::make_pair (-1U, -1U);
6994 second_vec = std::make_pair (-1U, -1U);
6995 }
6996 }
6997
6998 if (!gsi)
6999 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
7000
7001 return true;
7002 }
7003
7004 /* Vectorize SLP NODE. */
7005
7006 static void
7007 vect_schedule_slp_node (vec_info *vinfo,
7008 slp_tree node, slp_instance instance)
7009 {
7010 gimple_stmt_iterator si;
7011 int i;
7012 slp_tree child;
7013
7014 /* For existing vectors there's nothing to do. */
7015 if (SLP_TREE_VEC_DEFS (node).exists ())
7016 return;
7017
7018 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
7019
7020 /* Vectorize externals and constants. */
7021 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
7022 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
7023 {
7024 /* ??? vectorizable_shift can end up using a scalar operand which is
7025 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
7026 node in this case. */
7027 if (!SLP_TREE_VECTYPE (node))
7028 return;
7029
7030 vect_create_constant_vectors (vinfo, node);
7031 return;
7032 }
7033
7034 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
7035
7036 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
7037 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
7038
7039 if (dump_enabled_p ())
7040 dump_printf_loc (MSG_NOTE, vect_location,
7041 "------>vectorizing SLP node starting from: %G",
7042 stmt_info->stmt);
7043
7044 if (STMT_VINFO_DATA_REF (stmt_info)
7045 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
7046 {
7047       /* Vectorized loads go before the first scalar load to make the result
7048	  ready early; vectorized stores go before the last scalar stmt,
7049	  which is where all the scalar values they use are ready.  */
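      /* E.g. (illustrative): for a two-element scalar store group the vector
	 store is emitted at the second (last) scalar store, by which point
	 both stored values have been computed; a vector load is instead
	 emitted at the first scalar load of its group.  */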
7050 stmt_vec_info last_stmt_info = NULL;
7051 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
7052 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
7053 else /* DR_IS_WRITE */
7054 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
7055 si = gsi_for_stmt (last_stmt_info->stmt);
7056 }
7057 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
7058 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
7059 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
7060 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
7061 {
7062 /* For PHI node vectorization we do not use the insertion iterator. */
7063 si = gsi_none ();
7064 }
7065 else
7066 {
7067       /* Emit other stmts after the children's vectorized defs, which is
7068	  the earliest possible place.  */
7069 gimple *last_stmt = NULL;
7070 bool seen_vector_def = false;
7071 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7072 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7073 {
7074	    /* For fold-left reductions we retain the scalar reduction
7075	       PHI but still have SLP_TREE_NUM_VEC_STMTS set, so the
7076	       representation isn't perfect.  Resort to the last
7077	       scalar def here.  */
7078 if (SLP_TREE_VEC_STMTS (child).is_empty ())
7079 {
7080 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
7081 == cycle_phi_info_type);
7082 gphi *phi = as_a <gphi *>
7083 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
7084 if (!last_stmt
7085 || vect_stmt_dominates_stmt_p (last_stmt, phi))
7086 last_stmt = phi;
7087 }
7088	    /* We emit all vectorized stmts of a child in the same place,
7089	       so the last one pushed is also the last in program order.
7090	       ??? Unless we have a load permutation applied and that ends
7091	       up re-using an earlier generated load.  */
7092 unsigned j;
7093 gimple *vstmt;
7094 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
7095 if (!last_stmt
7096 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
7097 last_stmt = vstmt;
7098 }
7099 else if (!SLP_TREE_VECTYPE (child))
7100 {
7101	    /* For externals without a vectype we look at all their scalar defs.  */
7102 unsigned j;
7103 tree def;
7104 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
7105 if (TREE_CODE (def) == SSA_NAME
7106 && !SSA_NAME_IS_DEFAULT_DEF (def))
7107 {
7108 gimple *stmt = SSA_NAME_DEF_STMT (def);
7109 if (!last_stmt
7110 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
7111 last_stmt = stmt;
7112 }
7113 }
7114 else
7115 {
7116 /* For externals we have to look at all defs since their
7117 insertion place is decided per vector. But beware
7118 of pre-existing vectors where we need to make sure
7119 we do not insert before the region boundary. */
7120 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
7121 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
7122 seen_vector_def = true;
7123 else
7124 {
7125 unsigned j;
7126 tree vdef;
7127 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
7128 if (TREE_CODE (vdef) == SSA_NAME
7129 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
7130 {
7131 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
7132 if (!last_stmt
7133 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
7134 last_stmt = vstmt;
7135 }
7136 }
7137 }
7138 /* This can happen when all children are pre-existing vectors or
7139 constants. */
7140 if (!last_stmt)
7141 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
7142 if (!last_stmt)
7143 {
7144 gcc_assert (seen_vector_def);
7145 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
7146 }
7147 else if (is_a <bb_vec_info> (vinfo)
7148 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
7149 && gimple_could_trap_p (stmt_info->stmt))
7150 {
7151	  /* We've constrained possibly trapping operations to all come
7152	     from the same basic-block; if vectorized defs would allow earlier
7153	     scheduling, still force the vectorized stmts to the original block.
7154	     This is only necessary for BB vectorization since for loop vect all
7155	     operations are in a single BB and scalar-stmt-based placement
7156	     doesn't play well with epilogue vectorization.  */
7157 gcc_assert (dominated_by_p (CDI_DOMINATORS,
7158 gimple_bb (stmt_info->stmt),
7159 gimple_bb (last_stmt)));
7160 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
7161 }
7162 else if (is_a <gphi *> (last_stmt))
7163 si = gsi_after_labels (gimple_bb (last_stmt));
7164 else
7165 {
7166 si = gsi_for_stmt (last_stmt);
7167 gsi_next (&si);
7168 }
7169 }
7170
7171 bool done_p = false;
7172
7173 /* Handle purely internal nodes. */
7174 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7175 {
7176       /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
7177	 be shared with different SLP nodes (but usually it's the same
7178	 operation apart from the case where the stmt is only there to denote
7179	 the actual scalar lane defs ...).  So do not call vect_transform_stmt
7180	 but open-code it here (partly).  */
7181 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
7182 gcc_assert (done);
7183 done_p = true;
7184 }
7185 if (!done_p)
7186 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
7187 }
7188
7189 /* Replace scalar calls from SLP node NODE by setting their lhs to zero.
7190    For loop vectorization this is done in vectorizable_call, but for SLP
7191    it needs to be deferred until the end of vect_schedule_slp, because
7192    multiple SLP instances may refer to the same scalar stmt.  */
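/* For example (illustrative only): a scalar call "x_1 = __builtin_sqrtf (a_2);"
   whose result is only used by vectorized code has its scalar copy rewritten
   to "x_1 = 0.0f;" below; the now-dead assignment is expected to be removed
   by later cleanup passes.  */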
7193
7194 static void
7195 vect_remove_slp_scalar_calls (vec_info *vinfo,
7196 slp_tree node, hash_set<slp_tree> &visited)
7197 {
7198 gimple *new_stmt;
7199 gimple_stmt_iterator gsi;
7200 int i;
7201 slp_tree child;
7202 tree lhs;
7203 stmt_vec_info stmt_info;
7204
7205 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7206 return;
7207
7208 if (visited.add (node))
7209 return;
7210
7211 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7212 vect_remove_slp_scalar_calls (vinfo, child, visited);
7213
7214 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7215 {
7216 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
7217 if (!stmt || gimple_bb (stmt) == NULL)
7218 continue;
7219 if (is_pattern_stmt_p (stmt_info)
7220 || !PURE_SLP_STMT (stmt_info))
7221 continue;
7222 lhs = gimple_call_lhs (stmt);
7223 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
7224 gsi = gsi_for_stmt (stmt);
7225 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
7226 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
7227 }
7228 }
7229
7230 static void
7231 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
7232 {
7233 hash_set<slp_tree> visited;
7234 vect_remove_slp_scalar_calls (vinfo, node, visited);
7235 }
7236
7237 /* Vectorize the instance root. */
7238
7239 void
7240 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
7241 {
7242 gassign *rstmt = NULL;
7243
7244 if (instance->kind == slp_inst_kind_ctor)
7245 {
7246 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
7247 {
7248 gimple *child_stmt;
7249 int j;
7250
7251 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
7252 {
7253 tree vect_lhs = gimple_get_lhs (child_stmt);
7254 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7255 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
7256 TREE_TYPE (vect_lhs)))
7257 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
7258 vect_lhs);
7259 rstmt = gimple_build_assign (root_lhs, vect_lhs);
7260 break;
7261 }
7262 }
7263 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
7264 {
7265 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
7266 gimple *child_stmt;
7267 int j;
7268 vec<constructor_elt, va_gc> *v;
7269 vec_alloc (v, nelts);
7270
7271 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
7272 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
7273 gimple_get_lhs (child_stmt));
7274 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7275 tree rtype
7276 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
7277 tree r_constructor = build_constructor (rtype, v);
7278 rstmt = gimple_build_assign (lhs, r_constructor);
7279 }
7280 }
7281 else if (instance->kind == slp_inst_kind_bb_reduc)
7282 {
7283 /* Largely inspired by reduction chain epilogue handling in
7284 vect_create_epilog_for_reduction. */
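      /* Illustrative sketch only: with two V4SI partial results v0 and v1
	 and a PLUS_EXPR root stmt, the epilogue built below is roughly
	   tmp_3 = v0 + v1;
	   res_4 = .REDUC_PLUS (tmp_3);
	 and res_4 replaces the rhs of the root stmt.  */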
7285 vec<tree> vec_defs = vNULL;
7286 vect_get_slp_defs (node, &vec_defs);
7287 enum tree_code reduc_code
7288 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
7289 /* ??? We actually have to reflect signs somewhere. */
7290 if (reduc_code == MINUS_EXPR)
7291 reduc_code = PLUS_EXPR;
7292 gimple_seq epilogue = NULL;
7293       /* We may end up with more than one vector result; reduce them
7294	  to a single vector.  */
7295 tree vec_def = vec_defs[0];
7296 for (unsigned i = 1; i < vec_defs.length (); ++i)
7297 vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
7298 vec_def, vec_defs[i]);
7299 vec_defs.release ();
7300       /* ??? Support schemes other than a direct internal fn.  */
7301 internal_fn reduc_fn;
7302 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
7303 || reduc_fn == IFN_LAST)
7304 gcc_unreachable ();
7305 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
7306 TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
7307
7308 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7309 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
7310 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
7311 update_stmt (gsi_stmt (rgsi));
7312 return;
7313 }
7314 else
7315 gcc_unreachable ();
7316
7317 gcc_assert (rstmt);
7318
7319 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7320 gsi_replace (&rgsi, rstmt, true);
7321 }
7322
7323 struct slp_scc_info
7324 {
7325 bool on_stack;
7326 int dfs;
7327 int lowlink;
7328 };
7329
7330 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
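/* The DFS number / lowlink / on-stack bookkeeping below follows the usual
   Tarjan SCC scheme: a node whose lowlink still equals its DFS number after
   all children have been visited is an SCC root and pops its component off
   STACK.  */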
7331
7332 static void
7333 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
7334 hash_map<slp_tree, slp_scc_info> &scc_info,
7335 int &maxdfs, vec<slp_tree> &stack)
7336 {
7337 bool existed_p;
7338 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
7339 gcc_assert (!existed_p);
7340 info->dfs = maxdfs;
7341 info->lowlink = maxdfs;
7342 maxdfs++;
7343
7344 /* Leaf. */
7345 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7346 {
7347 info->on_stack = false;
7348 vect_schedule_slp_node (vinfo, node, instance);
7349 return;
7350 }
7351
7352 info->on_stack = true;
7353 stack.safe_push (node);
7354
7355 unsigned i;
7356 slp_tree child;
7357 /* DFS recurse. */
7358 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7359 {
7360 if (!child)
7361 continue;
7362 slp_scc_info *child_info = scc_info.get (child);
7363 if (!child_info)
7364 {
7365 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
7366	  /* The recursion might have re-allocated the hash map; re-fetch the infos.  */
7367 info = scc_info.get (node);
7368 child_info = scc_info.get (child);
7369 info->lowlink = MIN (info->lowlink, child_info->lowlink);
7370 }
7371 else if (child_info->on_stack)
7372 info->lowlink = MIN (info->lowlink, child_info->dfs);
7373 }
7374 if (info->lowlink != info->dfs)
7375 return;
7376
7377 auto_vec<slp_tree, 4> phis_to_fixup;
7378
7379 /* Singleton. */
7380 if (stack.last () == node)
7381 {
7382 stack.pop ();
7383 info->on_stack = false;
7384 vect_schedule_slp_node (vinfo, node, instance);
7385 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
7386 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
7387 phis_to_fixup.quick_push (node);
7388 }
7389 else
7390 {
7391 /* SCC. */
7392 int last_idx = stack.length () - 1;
7393 while (stack[last_idx] != node)
7394 last_idx--;
7395       /* We can break the cycle at PHIs that have at least one child
7396	 already code generated.  Then we could re-start the DFS walk until
7397	 all nodes in the SCC are covered (we might have new entries
7398	 for only back-reachable nodes).  But it's simpler to just
7399	 iterate and schedule those that are ready.  */
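      /* E.g. (illustrative): for a simple reduction cycle PHI <-> add, the
	 PHI is ready first (its already-scheduled init child makes it ready
	 and an on-stack backedge child does not block PHIs), the add becomes
	 ready once the PHI is scheduled, and the PHI's backedge argument is
	 filled in by the fix-up loop below.  */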
7400 unsigned todo = stack.length () - last_idx;
7401 do
7402 {
7403 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
7404 {
7405 slp_tree entry = stack[idx];
7406 if (!entry)
7407 continue;
7408 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
7409 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
7410 bool ready = !phi;
7411 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
7412 if (!child)
7413 {
7414 gcc_assert (phi);
7415 ready = true;
7416 break;
7417 }
7418 else if (scc_info.get (child)->on_stack)
7419 {
7420 if (!phi)
7421 {
7422 ready = false;
7423 break;
7424 }
7425 }
7426 else
7427 {
7428 if (phi)
7429 {
7430 ready = true;
7431 break;
7432 }
7433 }
7434 if (ready)
7435 {
7436 vect_schedule_slp_node (vinfo, entry, instance);
7437 scc_info.get (entry)->on_stack = false;
7438 stack[idx] = NULL;
7439 todo--;
7440 if (phi)
7441 phis_to_fixup.safe_push (entry);
7442 }
7443 }
7444 }
7445 while (todo != 0);
7446
7447 /* Pop the SCC. */
7448 stack.truncate (last_idx);
7449 }
7450
7451   /* Now fix up the backedge defs of the vectorized PHIs in this SCC.  */
7452 slp_tree phi_node;
7453 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
7454 {
7455 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
7456 edge_iterator ei;
7457 edge e;
7458 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
7459 {
7460 unsigned dest_idx = e->dest_idx;
7461 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
7462 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
7463 continue;
7464 /* Simply fill all args. */
7465 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
7466 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
7467 vect_get_slp_vect_def (child, i),
7468 e, gimple_phi_arg_location (phi, dest_idx));
7469 }
7470 }
7471 }
7472
7473 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
7474
7475 void
7476 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
7477 {
7478 slp_instance instance;
7479 unsigned int i;
7480
7481 hash_map<slp_tree, slp_scc_info> scc_info;
7482 int maxdfs = 0;
7483 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7484 {
7485 slp_tree node = SLP_INSTANCE_TREE (instance);
7486 if (dump_enabled_p ())
7487 {
7488 dump_printf_loc (MSG_NOTE, vect_location,
7489 "Vectorizing SLP tree:\n");
7490 /* ??? Dump all? */
7491 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7492 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
7493 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
7494 vect_print_slp_graph (MSG_NOTE, vect_location,
7495 SLP_INSTANCE_TREE (instance));
7496 }
7497       /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
7498	 makes a PHI the node breaking the cycle.  */
7499 auto_vec<slp_tree> stack;
7500 if (!scc_info.get (node))
7501 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
7502
7503 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7504 vectorize_slp_instance_root_stmt (node, instance);
7505
7506 if (dump_enabled_p ())
7507 dump_printf_loc (MSG_NOTE, vect_location,
7508 "vectorizing stmts using SLP.\n");
7509 }
7510
7511 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7512 {
7513 slp_tree root = SLP_INSTANCE_TREE (instance);
7514 stmt_vec_info store_info;
7515 unsigned int j;
7516
7517 /* Remove scalar call stmts. Do not do this for basic-block
7518 vectorization as not all uses may be vectorized.
7519 ??? Why should this be necessary? DCE should be able to
7520 remove the stmts itself.
7521	 ??? For BB vectorization we could just as well remove scalar
7522 stmts starting from the SLP tree root if they have no
7523 uses. */
7524 if (is_a <loop_vec_info> (vinfo))
7525 vect_remove_slp_scalar_calls (vinfo, root);
7526
7527       /* Remove the vectorized stores' original scalar stmts.  */
7528 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
7529 {
7530 if (!STMT_VINFO_DATA_REF (store_info)
7531 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
7532 break;
7533
7534 store_info = vect_orig_stmt (store_info);
7535 /* Free the attached stmt_vec_info and remove the stmt. */
7536 vinfo->remove_stmt (store_info);
7537
7538 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
7539 to not crash in vect_free_slp_tree later. */
7540 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
7541 SLP_TREE_REPRESENTATIVE (root) = NULL;
7542 }
7543 }
7544 }