1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
52
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 slp_tree, stmt_vector_for_cost *);
55 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
56
57 static object_allocator<_slp_tree> *slp_tree_pool;
58 static slp_tree slp_first_node;
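/* Every _slp_tree allocated from the pool is also chained into the list
headed by slp_first_node (see the constructor and destructor below) so
that vect_slp_fini can delete nodes that are still live when the pass
finishes. */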
59
60 void
61 vect_slp_init (void)
62 {
63 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
64 }
65
66 void
67 vect_slp_fini (void)
68 {
69 while (slp_first_node)
70 delete slp_first_node;
71 delete slp_tree_pool;
72 slp_tree_pool = NULL;
73 }
74
75 void *
76 _slp_tree::operator new (size_t n)
77 {
78 gcc_assert (n == sizeof (_slp_tree));
79 return slp_tree_pool->allocate_raw ();
80 }
81
82 void
83 _slp_tree::operator delete (void *node, size_t n)
84 {
85 gcc_assert (n == sizeof (_slp_tree));
86 slp_tree_pool->remove_raw (node);
87 }
88
89
90 /* Initialize an SLP node. */
91
92 _slp_tree::_slp_tree ()
93 {
94 this->prev_node = NULL;
95 if (slp_first_node)
96 slp_first_node->prev_node = this;
97 this->next_node = slp_first_node;
98 slp_first_node = this;
99 SLP_TREE_SCALAR_STMTS (this) = vNULL;
100 SLP_TREE_SCALAR_OPS (this) = vNULL;
101 SLP_TREE_VEC_STMTS (this) = vNULL;
102 SLP_TREE_VEC_DEFS (this) = vNULL;
103 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
104 SLP_TREE_CHILDREN (this) = vNULL;
105 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
106 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
107 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
108 SLP_TREE_CODE (this) = ERROR_MARK;
109 SLP_TREE_VECTYPE (this) = NULL_TREE;
110 SLP_TREE_REPRESENTATIVE (this) = NULL;
111 SLP_TREE_REF_COUNT (this) = 1;
112 this->failed = NULL;
113 this->max_nunits = 1;
114 this->lanes = 0;
115 }
116
117 /* Tear down an SLP node. */
118
119 _slp_tree::~_slp_tree ()
120 {
121 if (this->prev_node)
122 this->prev_node->next_node = this->next_node;
123 else
124 slp_first_node = this->next_node;
125 if (this->next_node)
126 this->next_node->prev_node = this->prev_node;
127 SLP_TREE_CHILDREN (this).release ();
128 SLP_TREE_SCALAR_STMTS (this).release ();
129 SLP_TREE_SCALAR_OPS (this).release ();
130 SLP_TREE_VEC_STMTS (this).release ();
131 SLP_TREE_VEC_DEFS (this).release ();
132 SLP_TREE_LOAD_PERMUTATION (this).release ();
133 SLP_TREE_LANE_PERMUTATION (this).release ();
134 if (this->failed)
135 free (failed);
136 }
137
138 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
139
140 void
141 vect_free_slp_tree (slp_tree node)
142 {
143 int i;
144 slp_tree child;
145
146 if (--SLP_TREE_REF_COUNT (node) != 0)
147 return;
148
149 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
150 if (child)
151 vect_free_slp_tree (child);
152
153 /* If the node defines any SLP-only patterns then those patterns are no
154 longer valid and should be removed. */
155 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
156 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
157 {
158 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
159 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
160 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
161 }
162
163 delete node;
164 }
165
166 /* Return a location suitable for dumps related to the SLP instance. */
167
168 dump_user_location_t
169 _slp_instance::location () const
170 {
171 if (!root_stmts.is_empty ())
172 return root_stmts[0]->stmt;
173 else
174 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
175 }
176
177
178 /* Free the memory allocated for the SLP instance. */
179
180 void
181 vect_free_slp_instance (slp_instance instance)
182 {
183 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
184 SLP_INSTANCE_LOADS (instance).release ();
185 SLP_INSTANCE_ROOT_STMTS (instance).release ();
186 instance->subgraph_entries.release ();
187 instance->cost_vec.release ();
188 free (instance);
189 }
190
191
192 /* Create a new SLP node with NOPS children and operation code CODE. */
193
194 slp_tree
195 vect_create_new_slp_node (unsigned nops, tree_code code)
196 {
197 slp_tree node = new _slp_tree;
198 SLP_TREE_SCALAR_STMTS (node) = vNULL;
199 SLP_TREE_CHILDREN (node).create (nops);
200 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
201 SLP_TREE_CODE (node) = code;
202 return node;
203 }
204 /* Create an SLP node for SCALAR_STMTS. */
205
206 static slp_tree
207 vect_create_new_slp_node (slp_tree node,
208 vec<stmt_vec_info> scalar_stmts, unsigned nops)
209 {
210 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
211 SLP_TREE_CHILDREN (node).create (nops);
212 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
213 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
214 SLP_TREE_LANES (node) = scalar_stmts.length ();
215 return node;
216 }
217
218 /* Create an SLP node for SCALAR_STMTS. */
219
220 static slp_tree
221 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
222 {
223 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
224 }
225
226 /* Create an SLP node for OPS. */
227
228 static slp_tree
229 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
230 {
231 SLP_TREE_SCALAR_OPS (node) = ops;
232 SLP_TREE_DEF_TYPE (node) = vect_external_def;
233 SLP_TREE_LANES (node) = ops.length ();
234 return node;
235 }
236
237 /* Create an SLP node for OPS. */
238
239 static slp_tree
240 vect_create_new_slp_node (vec<tree> ops)
241 {
242 return vect_create_new_slp_node (new _slp_tree, ops);
243 }
244
245
246 /* This structure is used in creation of an SLP tree. Each instance
247 corresponds to the same operand in a group of scalar stmts in an SLP
248 node. */
249 typedef struct _slp_oprnd_info
250 {
251 /* Def-stmts for the operands. */
252 vec<stmt_vec_info> def_stmts;
253 /* Operands. */
254 vec<tree> ops;
255 /* Information about the first statement, its vector def-type, type, the
256 operand itself in case it's constant, and an indication if it's a pattern
257 stmt. */
258 tree first_op_type;
259 enum vect_def_type first_dt;
260 bool any_pattern;
261 } *slp_oprnd_info;
262
263
264 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
265 operand. */
266 static vec<slp_oprnd_info>
267 vect_create_oprnd_info (int nops, int group_size)
268 {
269 int i;
270 slp_oprnd_info oprnd_info;
271 vec<slp_oprnd_info> oprnds_info;
272
273 oprnds_info.create (nops);
274 for (i = 0; i < nops; i++)
275 {
276 oprnd_info = XNEW (struct _slp_oprnd_info);
277 oprnd_info->def_stmts.create (group_size);
278 oprnd_info->ops.create (group_size);
279 oprnd_info->first_dt = vect_uninitialized_def;
280 oprnd_info->first_op_type = NULL_TREE;
281 oprnd_info->any_pattern = false;
282 oprnds_info.quick_push (oprnd_info);
283 }
284
285 return oprnds_info;
286 }
287
288
289 /* Free operands info. */
290
291 static void
292 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
293 {
294 int i;
295 slp_oprnd_info oprnd_info;
296
297 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
298 {
299 oprnd_info->def_stmts.release ();
300 oprnd_info->ops.release ();
301 XDELETE (oprnd_info);
302 }
303
304 oprnds_info.release ();
305 }
306
307
308 /* Return true if STMTS contains a pattern statement. */
309
310 static bool
311 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
312 {
313 stmt_vec_info stmt_info;
314 unsigned int i;
315 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
316 if (is_pattern_stmt_p (stmt_info))
317 return true;
318 return false;
319 }
320
321 /* Return true when all lanes in the external or constant NODE have
322 the same value. */
323
324 static bool
325 vect_slp_tree_uniform_p (slp_tree node)
326 {
327 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
328 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
329
330 /* Pre-existing vectors. */
331 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
332 return false;
333
334 unsigned i;
335 tree op, first = NULL_TREE;
336 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
337 if (!first)
338 first = op;
339 else if (!operand_equal_p (first, op, 0))
340 return false;
341
342 return true;
343 }
344
345 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
346 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
347 of the chain. */
348
349 int
350 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
351 stmt_vec_info first_stmt_info)
352 {
353 stmt_vec_info next_stmt_info = first_stmt_info;
354 int result = 0;
355
356 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
357 return -1;
358
359 do
360 {
361 if (next_stmt_info == stmt_info)
362 return result;
363 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
364 if (next_stmt_info)
365 result += DR_GROUP_GAP (next_stmt_info);
366 }
367 while (next_stmt_info);
368
369 return -1;
370 }
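/* In other words, the place of STMT_INFO is the sum of the DR_GROUP_GAPs of
the chain members after FIRST_STMT_INFO up to and including STMT_INFO; for a
group without gaps this is simply its index in the interleaving chain. */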
371
372 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
373 using the method implemented by duplicate_and_interleave. Return true
374 if so, returning the number of intermediate vectors in *NVECTORS_OUT
375 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
376 (if nonnull). */
377
378 bool
379 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
380 tree elt_type, unsigned int *nvectors_out,
381 tree *vector_type_out,
382 tree *permutes)
383 {
384 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
385 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
386 return false;
387
388 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
389 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
390 unsigned int nvectors = 1;
391 for (;;)
392 {
393 scalar_int_mode int_mode;
394 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
395 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
396 {
397 /* Get the natural vector type for this SLP group size. */
398 tree int_type = build_nonstandard_integer_type
399 (GET_MODE_BITSIZE (int_mode), 1);
400 tree vector_type
401 = get_vectype_for_scalar_type (vinfo, int_type, count);
402 if (vector_type
403 && VECTOR_MODE_P (TYPE_MODE (vector_type))
404 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
405 GET_MODE_SIZE (base_vector_mode)))
406 {
407 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
408 together into elements of type INT_TYPE and using the result
409 to build NVECTORS vectors. */
410 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
411 vec_perm_builder sel1 (nelts, 2, 3);
412 vec_perm_builder sel2 (nelts, 2, 3);
413 poly_int64 half_nelts = exact_div (nelts, 2);
414 for (unsigned int i = 0; i < 3; ++i)
415 {
416 sel1.quick_push (i);
417 sel1.quick_push (i + nelts);
418 sel2.quick_push (half_nelts + i);
419 sel2.quick_push (half_nelts + i + nelts);
420 }
421 vec_perm_indices indices1 (sel1, 2, nelts);
422 vec_perm_indices indices2 (sel2, 2, nelts);
423 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
424 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
425 {
426 if (nvectors_out)
427 *nvectors_out = nvectors;
428 if (vector_type_out)
429 *vector_type_out = vector_type;
430 if (permutes)
431 {
432 permutes[0] = vect_gen_perm_mask_checked (vector_type,
433 indices1);
434 permutes[1] = vect_gen_perm_mask_checked (vector_type,
435 indices2);
436 }
437 return true;
438 }
439 }
440 }
441 if (!multiple_p (elt_bytes, 2, &elt_bytes))
442 return false;
443 nvectors *= 2;
444 }
445 }
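/* Note the two permutations built above are the usual interleave-low and
interleave-high patterns: with NELTS lanes per vector, indices1 selects
{ 0, NELTS, 1, NELTS + 1, ... } and indices2 selects
{ NELTS/2, NELTS/2 + NELTS, NELTS/2 + 1, ... }. */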
446
447 /* Return true if DTA and DTB match. */
448
449 static bool
450 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
451 {
452 return (dta == dtb
453 || ((dta == vect_external_def || dta == vect_constant_def)
454 && (dtb == vect_external_def || dtb == vect_constant_def)));
455 }
456
457 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
458 they are of a valid type and that they match the defs of the first stmt of
459 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
460 by swapping operands of STMTS[STMT_NUM] when possible. A non-zero SWAP
461 indicates that a swap is required for cond_expr stmts. Specifically, SWAP
462 is 1 if STMT is a cond and the operands of the comparison need to be
463 swapped; SWAP is 2 if STMT is a cond and the comparison code needs to be
464 inverted. Operands of commutative stmts may additionally be swapped in
465 OPRNDS_INFO to match the defs of the first stmt.
466 Return -1 on a fatal error; return 1 if the error could be corrected by
467 swapping operands of the parent node of this one; return 0 if everything
468 is ok. */
469 static int
470 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
471 bool *skip_args,
472 vec<stmt_vec_info> stmts, unsigned stmt_num,
473 vec<slp_oprnd_info> *oprnds_info)
474 {
475 stmt_vec_info stmt_info = stmts[stmt_num];
476 tree oprnd;
477 unsigned int i, number_of_oprnds;
478 enum vect_def_type dt = vect_uninitialized_def;
479 slp_oprnd_info oprnd_info;
480 int first_op_idx = 1;
481 unsigned int commutative_op = -1U;
482 bool first_op_cond = false;
483 bool first = stmt_num == 0;
484
485 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
486 {
487 number_of_oprnds = gimple_call_num_args (stmt);
488 first_op_idx = 3;
489 if (gimple_call_internal_p (stmt))
490 {
491 internal_fn ifn = gimple_call_internal_fn (stmt);
492 commutative_op = first_commutative_argument (ifn);
493
494 /* Masked load, only look at mask. */
495 if (ifn == IFN_MASK_LOAD)
496 {
497 number_of_oprnds = 1;
498 /* Mask operand index. */
499 first_op_idx = 5;
500 }
501 }
502 }
503 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
504 {
505 enum tree_code code = gimple_assign_rhs_code (stmt);
506 number_of_oprnds = gimple_num_ops (stmt) - 1;
507 /* Swap can only be done for cond_expr if asked to, otherwise we
508 could end up with a different comparison code from the first stmt. */
509 if (code == COND_EXPR
510 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
511 {
512 first_op_cond = true;
513 number_of_oprnds++;
514 }
515 else
516 commutative_op = commutative_tree_code (code) ? 0U : -1U;
517 }
518 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
519 number_of_oprnds = gimple_phi_num_args (stmt);
520 else
521 return -1;
522
523 bool swapped = (swap != 0);
524 bool backedge = false;
525 gcc_assert (!swapped || first_op_cond);
526 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
527 for (i = 0; i < number_of_oprnds; i++)
528 {
529 if (first_op_cond)
530 {
531 /* Map indicating how operands of cond_expr should be swapped. */
532 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
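/* maps[0] is the identity; maps[1] (SWAP == 1) exchanges the two operands
of the comparison; maps[2] (SWAP == 2) exchanges the then and else values,
which together with the inverted comparison code yields an equivalent
COND_EXPR. */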
533 int *map = maps[swap];
534
535 if (i < 2)
536 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
537 first_op_idx), map[i]);
538 else
539 oprnd = gimple_op (stmt_info->stmt, map[i]);
540 }
541 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
542 {
543 oprnd = gimple_phi_arg_def (stmt, i);
544 backedge = dominated_by_p (CDI_DOMINATORS,
545 gimple_phi_arg_edge (stmt, i)->src,
546 gimple_bb (stmt_info->stmt));
547 }
548 else
549 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
550 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
551 oprnd = TREE_OPERAND (oprnd, 0);
552
553 oprnd_info = (*oprnds_info)[i];
554
555 stmt_vec_info def_stmt_info;
556 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
557 {
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
560 "Build SLP failed: can't analyze def for %T\n",
561 oprnd);
562
563 return -1;
564 }
565
566 if (skip_args[i])
567 {
568 oprnd_info->def_stmts.quick_push (NULL);
569 oprnd_info->ops.quick_push (NULL_TREE);
570 oprnd_info->first_dt = vect_uninitialized_def;
571 continue;
572 }
573
574 oprnd_info->def_stmts.quick_push (def_stmt_info);
575 oprnd_info->ops.quick_push (oprnd);
576
577 if (def_stmt_info
578 && is_pattern_stmt_p (def_stmt_info))
579 {
580 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
581 != def_stmt_info)
582 oprnd_info->any_pattern = true;
583 else
584 /* If we promote this to external use the original stmt def. */
585 oprnd_info->ops.last ()
586 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
587 }
588
589 /* If there's an extern def on a backedge make sure we can
590 code-generate at the region start.
591 ??? This is another case that could be fixed by adjusting
592 how we split the function but at the moment we'd have conflicting
593 goals there. */
594 if (backedge
595 && dts[i] == vect_external_def
596 && is_a <bb_vec_info> (vinfo)
597 && TREE_CODE (oprnd) == SSA_NAME
598 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
599 && !dominated_by_p (CDI_DOMINATORS,
600 as_a <bb_vec_info> (vinfo)->bbs[0],
601 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
602 {
603 if (dump_enabled_p ())
604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
605 "Build SLP failed: extern def %T only defined "
606 "on backedge\n", oprnd);
607 return -1;
608 }
609
610 if (first)
611 {
612 tree type = TREE_TYPE (oprnd);
613 dt = dts[i];
614 if ((dt == vect_constant_def
615 || dt == vect_external_def)
616 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
617 && (TREE_CODE (type) == BOOLEAN_TYPE
618 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
619 type)))
620 {
621 if (dump_enabled_p ())
622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
623 "Build SLP failed: invalid type of def "
624 "for variable-length SLP %T\n", oprnd);
625 return -1;
626 }
627
628 /* For the swapping logic below force vect_reduction_def
629 for the reduction op in an SLP reduction group. */
630 if (!STMT_VINFO_DATA_REF (stmt_info)
631 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
632 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
633 && def_stmt_info)
634 dts[i] = dt = vect_reduction_def;
635
636 /* Check the types of the definition. */
637 switch (dt)
638 {
639 case vect_external_def:
640 case vect_constant_def:
641 case vect_internal_def:
642 case vect_reduction_def:
643 case vect_induction_def:
644 case vect_nested_cycle:
645 break;
646
647 default:
648 /* FORNOW: Not supported. */
649 if (dump_enabled_p ())
650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
651 "Build SLP failed: illegal type of def %T\n",
652 oprnd);
653 return -1;
654 }
655
656 oprnd_info->first_dt = dt;
657 oprnd_info->first_op_type = type;
658 }
659 }
660 if (first)
661 return 0;
662
663 /* Now match the operand definition types to that of the first stmt. */
664 for (i = 0; i < number_of_oprnds;)
665 {
666 if (skip_args[i])
667 {
668 ++i;
669 continue;
670 }
671
672 oprnd_info = (*oprnds_info)[i];
673 dt = dts[i];
674 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
675 oprnd = oprnd_info->ops[stmt_num];
676 tree type = TREE_TYPE (oprnd);
677
678 if (!types_compatible_p (oprnd_info->first_op_type, type))
679 {
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
682 "Build SLP failed: different operand types\n");
683 return 1;
684 }
685
686 /* Not first stmt of the group, check that the def-stmt/s match
687 the def-stmt/s of the first stmt. Allow different definition
688 types for reduction chains: the first stmt must be a
689 vect_reduction_def (a phi node), and the rest
690 end in the reduction chain. */
691 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
692 && !(oprnd_info->first_dt == vect_reduction_def
693 && !STMT_VINFO_DATA_REF (stmt_info)
694 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
695 && def_stmt_info
696 && !STMT_VINFO_DATA_REF (def_stmt_info)
697 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
698 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
699 || (!STMT_VINFO_DATA_REF (stmt_info)
700 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
701 && ((!def_stmt_info
702 || STMT_VINFO_DATA_REF (def_stmt_info)
703 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
704 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
705 != (oprnd_info->first_dt != vect_reduction_def))))
706 {
707 /* Try swapping operands if we got a mismatch. For BB
708 vectorization only in case it will clearly improve things. */
709 if (i == commutative_op && !swapped
710 && (!is_a <bb_vec_info> (vinfo)
711 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
712 dts[i+1])
713 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
714 || vect_def_types_match
715 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
716 {
717 if (dump_enabled_p ())
718 dump_printf_loc (MSG_NOTE, vect_location,
719 "trying swapped operands\n");
720 std::swap (dts[i], dts[i+1]);
721 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
722 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
723 std::swap ((*oprnds_info)[i]->ops[stmt_num],
724 (*oprnds_info)[i+1]->ops[stmt_num]);
725 swapped = true;
726 continue;
727 }
728
729 if (is_a <bb_vec_info> (vinfo)
730 && !oprnd_info->any_pattern)
731 {
732 /* Now for commutative ops we should see whether we can
733 make the other operand match. */
734 if (dump_enabled_p ())
735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
736 "treating operand as external\n");
737 oprnd_info->first_dt = dt = vect_external_def;
738 }
739 else
740 {
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
743 "Build SLP failed: different types\n");
744 return 1;
745 }
746 }
747
748 /* Make sure to demote the overall operand to external. */
749 if (dt == vect_external_def)
750 oprnd_info->first_dt = vect_external_def;
751 /* For an SLP reduction chain we want to duplicate the reduction to
752 each of the chain members. That gets us a sane SLP graph (though
753 the stmts are not 100% correct wrt the initial values). */
754 else if ((dt == vect_internal_def
755 || dt == vect_reduction_def)
756 && oprnd_info->first_dt == vect_reduction_def
757 && !STMT_VINFO_DATA_REF (stmt_info)
758 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
759 && !STMT_VINFO_DATA_REF (def_stmt_info)
760 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
761 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
762 {
763 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
764 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
765 }
766
767 ++i;
768 }
769
770 /* Swap operands. */
771 if (swapped)
772 {
773 if (dump_enabled_p ())
774 dump_printf_loc (MSG_NOTE, vect_location,
775 "swapped operands to match def types in %G",
776 stmt_info->stmt);
777 }
778
779 return 0;
780 }
781
782 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
783 Return true if we can, meaning that this choice doesn't conflict with
784 existing SLP nodes that use STMT_INFO. */
785
786 bool
787 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
788 {
789 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
790 if (old_vectype)
791 return useless_type_conversion_p (vectype, old_vectype);
792
793 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
794 {
795 /* We maintain the invariant that if any statement in the group is
796 used, all other members of the group have the same vector type. */
797 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
798 stmt_vec_info member_info = first_info;
799 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
800 if (is_pattern_stmt_p (member_info)
801 && !useless_type_conversion_p (vectype,
802 STMT_VINFO_VECTYPE (member_info)))
803 break;
804
805 if (!member_info)
806 {
807 for (member_info = first_info; member_info;
808 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
809 STMT_VINFO_VECTYPE (member_info) = vectype;
810 return true;
811 }
812 }
813 else if (!is_pattern_stmt_p (stmt_info))
814 {
815 STMT_VINFO_VECTYPE (stmt_info) = vectype;
816 return true;
817 }
818
819 if (dump_enabled_p ())
820 {
821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
822 "Build SLP failed: incompatible vector"
823 " types for: %G", stmt_info->stmt);
824 dump_printf_loc (MSG_NOTE, vect_location,
825 " old vector type: %T\n", old_vectype);
826 dump_printf_loc (MSG_NOTE, vect_location,
827 " new vector type: %T\n", vectype);
828 }
829 return false;
830 }
831
832 /* Return true if call statements CALL1 and CALL2 are similar enough
833 to be combined into the same SLP group. */
834
835 static bool
836 compatible_calls_p (gcall *call1, gcall *call2)
837 {
838 unsigned int nargs = gimple_call_num_args (call1);
839 if (nargs != gimple_call_num_args (call2))
840 return false;
841
842 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
843 return false;
844
845 if (gimple_call_internal_p (call1))
846 {
847 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
848 TREE_TYPE (gimple_call_lhs (call2))))
849 return false;
850 for (unsigned int i = 0; i < nargs; ++i)
851 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
852 TREE_TYPE (gimple_call_arg (call2, i))))
853 return false;
854 }
855 else
856 {
857 if (!operand_equal_p (gimple_call_fn (call1),
858 gimple_call_fn (call2), 0))
859 return false;
860
861 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
862 return false;
863 }
864 return true;
865 }
866
867 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
868 caller's attempt to find the vector type in STMT_INFO with the narrowest
869 element type. Return true if VECTYPE is nonnull and if it is valid
870 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
871 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
872 vect_build_slp_tree. */
873
874 static bool
875 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
876 unsigned int group_size,
877 tree vectype, poly_uint64 *max_nunits)
878 {
879 if (!vectype)
880 {
881 if (dump_enabled_p ())
882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
883 "Build SLP failed: unsupported data-type in %G\n",
884 stmt_info->stmt);
885 /* Fatal mismatch. */
886 return false;
887 }
888
889 /* If populating the vector type requires unrolling then fail
890 before adjusting *max_nunits for basic-block vectorization. */
891 if (is_a <bb_vec_info> (vinfo)
892 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
893 {
894 if (dump_enabled_p ())
895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
896 "Build SLP failed: unrolling required "
897 "in basic block SLP\n");
898 /* Fatal mismatch. */
899 return false;
900 }
901
902 /* In case of multiple types we need to detect the smallest type. */
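/* For instance, if the group mixes int and short operations the recorded
*MAX_NUNITS ends up as the lane count of the short (narrower element)
vector type. */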
903 vect_update_max_nunits (max_nunits, vectype);
904 return true;
905 }
906
907 /* Verify whether the scalar stmts STMTS are isomorphic, do not require a
908 data permutation and are of supported types of operation. Return
909 true if they are, otherwise return false and indicate in *MATCHES
910 which stmts are not isomorphic to the first one. If MATCHES[0]
911 is false then this indicates that the comparison could not be
912 carried out or that the stmts will never be vectorized by SLP.
913
914 Note a COND_EXPR is possibly isomorphic to another one after swapping its
915 operands. Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
916 the first stmt by swapping the two operands of the comparison; set SWAP[i]
917 to 2 if stmt I is isomorphic to the first stmt by inverting the code
918 of the comparison. Taking A1 >= B1 ? X1 : Y1 as an example, it can be
919 swapped to (B1 <= A1 ? X1 : Y1) or inverted to (A1 < B1) ? Y1 : X1. */
920
921 static bool
922 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
923 vec<stmt_vec_info> stmts, unsigned int group_size,
924 poly_uint64 *max_nunits, bool *matches,
925 bool *two_operators, tree *node_vectype)
926 {
927 unsigned int i;
928 stmt_vec_info first_stmt_info = stmts[0];
929 enum tree_code first_stmt_code = ERROR_MARK;
930 enum tree_code alt_stmt_code = ERROR_MARK;
931 enum tree_code rhs_code = ERROR_MARK;
932 enum tree_code first_cond_code = ERROR_MARK;
933 tree lhs;
934 bool need_same_oprnds = false;
935 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
936 optab optab;
937 int icode;
938 machine_mode optab_op2_mode;
939 machine_mode vec_mode;
940 stmt_vec_info first_load = NULL, prev_first_load = NULL;
941 bool first_stmt_load_p = false, load_p = false;
942 bool first_stmt_phi_p = false, phi_p = false;
943 bool maybe_soft_fail = false;
944 tree soft_fail_nunits_vectype = NULL_TREE;
945
946 /* For every stmt in NODE find its def stmt/s. */
947 stmt_vec_info stmt_info;
948 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
949 {
950 gimple *stmt = stmt_info->stmt;
951 swap[i] = 0;
952 matches[i] = false;
953
954 if (dump_enabled_p ())
955 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
956
957 /* Fail to vectorize statements marked as unvectorizable, that can
958 throw or that have volatile operands. */
959 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
960 || stmt_can_throw_internal (cfun, stmt)
961 || gimple_has_volatile_ops (stmt))
962 {
963 if (dump_enabled_p ())
964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
965 "Build SLP failed: unvectorizable statement %G",
966 stmt);
967 /* ??? For BB vectorization we want to commute operands in a way
968 that shuffles all unvectorizable defs into one operand and keeps
969 the other one vectorizable. The following doesn't reliably
970 work for that, but it's the easiest we can do here. */
971 if (is_a <bb_vec_info> (vinfo) && i != 0)
972 continue;
973 /* Fatal mismatch. */
974 matches[0] = false;
975 return false;
976 }
977
978 lhs = gimple_get_lhs (stmt);
979 if (lhs == NULL_TREE)
980 {
981 if (dump_enabled_p ())
982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
983 "Build SLP failed: not GIMPLE_ASSIGN nor "
984 "GIMPLE_CALL %G", stmt);
985 if (is_a <bb_vec_info> (vinfo) && i != 0)
986 continue;
987 /* Fatal mismatch. */
988 matches[0] = false;
989 return false;
990 }
991
992 tree nunits_vectype;
993 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
994 &nunits_vectype, group_size))
995 {
996 if (is_a <bb_vec_info> (vinfo) && i != 0)
997 continue;
998 /* Fatal mismatch. */
999 matches[0] = false;
1000 return false;
1001 }
1002 /* Record nunits required but continue analysis, producing matches[]
1003 as if nunits was not an issue. This allows splitting of groups
1004 to happen. */
1005 if (nunits_vectype
1006 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1007 nunits_vectype, max_nunits))
1008 {
1009 gcc_assert (is_a <bb_vec_info> (vinfo));
1010 maybe_soft_fail = true;
1011 soft_fail_nunits_vectype = nunits_vectype;
1012 }
1013
1014 gcc_assert (vectype);
1015
1016 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1017 if (call_stmt)
1018 {
1019 rhs_code = CALL_EXPR;
1020
1021 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
1022 load_p = true;
1023 else if ((gimple_call_internal_p (call_stmt)
1024 && (!vectorizable_internal_fn_p
1025 (gimple_call_internal_fn (call_stmt))))
1026 || gimple_call_tail_p (call_stmt)
1027 || gimple_call_noreturn_p (call_stmt)
1028 || !gimple_call_nothrow_p (call_stmt)
1029 || gimple_call_chain (call_stmt))
1030 {
1031 if (dump_enabled_p ())
1032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1033 "Build SLP failed: unsupported call type %G",
1034 call_stmt);
1035 if (is_a <bb_vec_info> (vinfo) && i != 0)
1036 continue;
1037 /* Fatal mismatch. */
1038 matches[0] = false;
1039 return false;
1040 }
1041 }
1042 else if (gimple_code (stmt) == GIMPLE_PHI)
1043 {
1044 rhs_code = ERROR_MARK;
1045 phi_p = true;
1046 }
1047 else
1048 {
1049 rhs_code = gimple_assign_rhs_code (stmt);
1050 load_p = gimple_vuse (stmt);
1051 }
1052
1053 /* Check the operation. */
1054 if (i == 0)
1055 {
1056 *node_vectype = vectype;
1057 first_stmt_code = rhs_code;
1058 first_stmt_load_p = load_p;
1059 first_stmt_phi_p = phi_p;
1060
1061 /* Shift arguments should be equal in all the packed stmts for a
1062 vector shift with scalar shift operand. */
1063 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1064 || rhs_code == LROTATE_EXPR
1065 || rhs_code == RROTATE_EXPR)
1066 {
1067 vec_mode = TYPE_MODE (vectype);
1068
1069 /* First see if we have a vector/vector shift. */
1070 optab = optab_for_tree_code (rhs_code, vectype,
1071 optab_vector);
1072
1073 if (!optab
1074 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1075 {
1076 /* No vector/vector shift, try for a vector/scalar shift. */
1077 optab = optab_for_tree_code (rhs_code, vectype,
1078 optab_scalar);
1079
1080 if (!optab)
1081 {
1082 if (dump_enabled_p ())
1083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1084 "Build SLP failed: no optab.\n");
1085 if (is_a <bb_vec_info> (vinfo) && i != 0)
1086 continue;
1087 /* Fatal mismatch. */
1088 matches[0] = false;
1089 return false;
1090 }
1091 icode = (int) optab_handler (optab, vec_mode);
1092 if (icode == CODE_FOR_nothing)
1093 {
1094 if (dump_enabled_p ())
1095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1096 "Build SLP failed: "
1097 "op not supported by target.\n");
1098 if (is_a <bb_vec_info> (vinfo) && i != 0)
1099 continue;
1100 /* Fatal mismatch. */
1101 matches[0] = false;
1102 return false;
1103 }
1104 optab_op2_mode = insn_data[icode].operand[2].mode;
1105 if (!VECTOR_MODE_P (optab_op2_mode))
1106 {
1107 need_same_oprnds = true;
1108 first_op1 = gimple_assign_rhs2 (stmt);
1109 }
1110 }
1111 }
1112 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1113 {
1114 need_same_oprnds = true;
1115 first_op1 = gimple_assign_rhs2 (stmt);
1116 }
1117 else if (!load_p
1118 && rhs_code == BIT_FIELD_REF)
1119 {
1120 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1121 if (!is_a <bb_vec_info> (vinfo)
1122 || TREE_CODE (vec) != SSA_NAME
1123 || !operand_equal_p (TYPE_SIZE (vectype),
1124 TYPE_SIZE (TREE_TYPE (vec))))
1125 {
1126 if (dump_enabled_p ())
1127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1128 "Build SLP failed: "
1129 "BIT_FIELD_REF not supported\n");
1130 /* Fatal mismatch. */
1131 matches[0] = false;
1132 return false;
1133 }
1134 }
1135 else if (call_stmt
1136 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1137 {
1138 need_same_oprnds = true;
1139 first_op1 = gimple_call_arg (call_stmt, 1);
1140 }
1141 }
1142 else
1143 {
1144 if (first_stmt_code != rhs_code
1145 && alt_stmt_code == ERROR_MARK)
1146 alt_stmt_code = rhs_code;
1147 if ((first_stmt_code != rhs_code
1148 && (first_stmt_code != IMAGPART_EXPR
1149 || rhs_code != REALPART_EXPR)
1150 && (first_stmt_code != REALPART_EXPR
1151 || rhs_code != IMAGPART_EXPR)
1152 /* Handle mismatches in plus/minus by computing both
1153 and merging the results. */
1154 && !((first_stmt_code == PLUS_EXPR
1155 || first_stmt_code == MINUS_EXPR)
1156 && (alt_stmt_code == PLUS_EXPR
1157 || alt_stmt_code == MINUS_EXPR)
1158 && rhs_code == alt_stmt_code)
1159 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1160 && (first_stmt_code == ARRAY_REF
1161 || first_stmt_code == BIT_FIELD_REF
1162 || first_stmt_code == INDIRECT_REF
1163 || first_stmt_code == COMPONENT_REF
1164 || first_stmt_code == MEM_REF)))
1165 || first_stmt_load_p != load_p
1166 || first_stmt_phi_p != phi_p)
1167 {
1168 if (dump_enabled_p ())
1169 {
1170 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1171 "Build SLP failed: different operation "
1172 "in stmt %G", stmt);
1173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1174 "original stmt %G", first_stmt_info->stmt);
1175 }
1176 /* Mismatch. */
1177 continue;
1178 }
1179
1180 if (need_same_oprnds)
1181 {
1182 tree other_op1 = (call_stmt
1183 ? gimple_call_arg (call_stmt, 1)
1184 : gimple_assign_rhs2 (stmt));
1185 if (!operand_equal_p (first_op1, other_op1, 0))
1186 {
1187 if (dump_enabled_p ())
1188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1189 "Build SLP failed: different shift "
1190 "arguments in %G", stmt);
1191 /* Mismatch. */
1192 continue;
1193 }
1194 }
1195 if (!load_p
1196 && first_stmt_code == BIT_FIELD_REF
1197 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1198 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1199 {
1200 if (dump_enabled_p ())
1201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1202 "Build SLP failed: different BIT_FIELD_REF "
1203 "arguments in %G", stmt);
1204 /* Mismatch. */
1205 continue;
1206 }
1207
1208 if (!load_p && rhs_code == CALL_EXPR)
1209 {
1210 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1211 as_a <gcall *> (stmt)))
1212 {
1213 if (dump_enabled_p ())
1214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1215 "Build SLP failed: different calls in %G",
1216 stmt);
1217 /* Mismatch. */
1218 continue;
1219 }
1220 }
1221
1222 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1223 && (gimple_bb (first_stmt_info->stmt)
1224 != gimple_bb (stmt_info->stmt)))
1225 {
1226 if (dump_enabled_p ())
1227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1228 "Build SLP failed: different BB for PHI "
1229 "or possibly trapping operation in %G", stmt);
1230 /* Mismatch. */
1231 continue;
1232 }
1233
1234 if (!types_compatible_p (vectype, *node_vectype))
1235 {
1236 if (dump_enabled_p ())
1237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1238 "Build SLP failed: different vector type "
1239 "in %G", stmt);
1240 /* Mismatch. */
1241 continue;
1242 }
1243 }
1244
1245 /* Grouped store or load. */
1246 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1247 {
1248 if (REFERENCE_CLASS_P (lhs))
1249 {
1250 /* Store. */
1251 ;
1252 }
1253 else
1254 {
1255 /* Load. */
1256 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1257 if (prev_first_load)
1258 {
1259 /* Check that there are no loads from different interleaving
1260 chains in the same node. */
1261 if (prev_first_load != first_load)
1262 {
1263 if (dump_enabled_p ())
1264 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1265 vect_location,
1266 "Build SLP failed: different "
1267 "interleaving chains in one node %G",
1268 stmt);
1269 /* Mismatch. */
1270 continue;
1271 }
1272 }
1273 else
1274 prev_first_load = first_load;
1275 }
1276 } /* Grouped access. */
1277 else
1278 {
1279 if (load_p)
1280 {
1281 /* Not grouped load. */
1282 if (dump_enabled_p ())
1283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1284 "Build SLP failed: not grouped load %G", stmt);
1285
1286 /* FORNOW: Non-grouped loads are not supported. */
1287 if (is_a <bb_vec_info> (vinfo) && i != 0)
1288 continue;
1289 /* Fatal mismatch. */
1290 matches[0] = false;
1291 return false;
1292 }
1293
1294 /* Not memory operation. */
1295 if (!phi_p
1296 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1297 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1298 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1299 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1300 && rhs_code != VIEW_CONVERT_EXPR
1301 && rhs_code != CALL_EXPR
1302 && rhs_code != BIT_FIELD_REF)
1303 {
1304 if (dump_enabled_p ())
1305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1306 "Build SLP failed: operation unsupported %G",
1307 stmt);
1308 if (is_a <bb_vec_info> (vinfo) && i != 0)
1309 continue;
1310 /* Fatal mismatch. */
1311 matches[0] = false;
1312 return false;
1313 }
1314
1315 if (rhs_code == COND_EXPR)
1316 {
1317 tree cond_expr = gimple_assign_rhs1 (stmt);
1318 enum tree_code cond_code = TREE_CODE (cond_expr);
1319 enum tree_code swap_code = ERROR_MARK;
1320 enum tree_code invert_code = ERROR_MARK;
1321
1322 if (i == 0)
1323 first_cond_code = TREE_CODE (cond_expr);
1324 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1325 {
1326 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1327 swap_code = swap_tree_comparison (cond_code);
1328 invert_code = invert_tree_comparison (cond_code, honor_nans);
1329 }
1330
1331 if (first_cond_code == cond_code)
1332 ;
1333 /* Isomorphism can be achieved by swapping. */
1334 else if (first_cond_code == swap_code)
1335 swap[i] = 1;
1336 /* Isomorphism can be achieved by inverting. */
1337 else if (first_cond_code == invert_code)
1338 swap[i] = 2;
1339 else
1340 {
1341 if (dump_enabled_p ())
1342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1343 "Build SLP failed: different"
1344 " operation %G", stmt);
1345 /* Mismatch. */
1346 continue;
1347 }
1348 }
1349 }
1350
1351 matches[i] = true;
1352 }
1353
1354 for (i = 0; i < group_size; ++i)
1355 if (!matches[i])
1356 return false;
1357
1358 /* If we allowed a two-operation SLP node verify the target can cope
1359 with the permute we are going to use. */
1360 if (alt_stmt_code != ERROR_MARK
1361 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1362 {
1363 *two_operators = true;
1364 }
1365
1366 if (maybe_soft_fail)
1367 {
1368 unsigned HOST_WIDE_INT const_nunits;
1369 if (!TYPE_VECTOR_SUBPARTS
1370 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1371 || const_nunits > group_size)
1372 matches[0] = false;
1373 else
1374 {
1375 /* With a constant number of vector elements simulate a mismatch at
1376 the point where we need to split. */
1377 unsigned tail = group_size & (const_nunits - 1);
1378 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
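/* For instance, with a group of 6 lanes and 4-lane vectors this clears
matches[4] and matches[5] so the caller splits the group after the
first full vector. */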
1379 }
1380 return false;
1381 }
1382
1383 return true;
1384 }
1385
1386 /* Traits for the hash_map used to record (failed) SLP builds for a stmt set.
1387 Note we never remove apart from at destruction time so we do not
1388 need a special value for deleted that differs from empty. */
1389 struct bst_traits
1390 {
1391 typedef vec <stmt_vec_info> value_type;
1392 typedef vec <stmt_vec_info> compare_type;
1393 static inline hashval_t hash (value_type);
1394 static inline bool equal (value_type existing, value_type candidate);
1395 static inline bool is_empty (value_type x) { return !x.exists (); }
1396 static inline bool is_deleted (value_type x) { return !x.exists (); }
1397 static const bool empty_zero_p = true;
1398 static inline void mark_empty (value_type &x) { x.release (); }
1399 static inline void mark_deleted (value_type &x) { x.release (); }
1400 static inline void remove (value_type &x) { x.release (); }
1401 };
1402 inline hashval_t
1403 bst_traits::hash (value_type x)
1404 {
1405 inchash::hash h;
1406 for (unsigned i = 0; i < x.length (); ++i)
1407 h.add_int (gimple_uid (x[i]->stmt));
1408 return h.end ();
1409 }
1410 inline bool
1411 bst_traits::equal (value_type existing, value_type candidate)
1412 {
1413 if (existing.length () != candidate.length ())
1414 return false;
1415 for (unsigned i = 0; i < existing.length (); ++i)
1416 if (existing[i] != candidate[i])
1417 return false;
1418 return true;
1419 }
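/* The above traits are used by the scalar_stmts_to_slp_tree_map_t below
which memoizes SLP discovery: a stmt set is hashed via the gimple UIDs of
its members so a repeated attempt to build a node for the same stmts can
re-use the recorded SLP tree or the recorded failure
(see vect_build_slp_tree). */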
1420
1421 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1422 but then vec::insert does memmove and that's not compatible with
1423 std::pair. */
1424 struct chain_op_t
1425 {
1426 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1427 : code (code_), dt (dt_), op (op_) {}
1428 tree_code code;
1429 vect_def_type dt;
1430 tree op;
1431 };
1432
1433 /* Comparator for sorting associatable chains. */
1434
1435 static int
1436 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1437 {
1438 auto *op1 = (const chain_op_t *) op1_;
1439 auto *op2 = (const chain_op_t *) op2_;
1440 if (op1->dt != op2->dt)
1441 return (int)op1->dt - (int)op2->dt;
1442 return (int)op1->code - (int)op2->code;
1443 }
1444
1445 /* Linearize the associatable expression chain at START with the
1446 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1447 filling CHAIN with the result and using WORKLIST as intermediate storage.
1448 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1449 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1450 stmts, starting with START. */
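/* For example, linearizing the single-use chain for
x = (a + b) - c
with CODE == PLUS_EXPR fills CHAIN with { -c, +a, +b }, each leaf carrying
its effective sign in its chain_op_t code. */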
1451
1452 static void
1453 vect_slp_linearize_chain (vec_info *vinfo,
1454 vec<std::pair<tree_code, gimple *> > &worklist,
1455 vec<chain_op_t> &chain,
1456 enum tree_code code, gimple *start,
1457 gimple *&code_stmt, gimple *&alt_code_stmt,
1458 vec<gimple *> *chain_stmts)
1459 {
1460 /* For each lane linearize the addition/subtraction (or other
1461 uniform associatable operation) expression tree. */
1462 worklist.safe_push (std::make_pair (code, start));
1463 while (!worklist.is_empty ())
1464 {
1465 auto entry = worklist.pop ();
1466 gassign *stmt = as_a <gassign *> (entry.second);
1467 enum tree_code in_code = entry.first;
1468 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1469 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1470 if (!code_stmt
1471 && gimple_assign_rhs_code (stmt) == code)
1472 code_stmt = stmt;
1473 else if (!alt_code_stmt
1474 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1475 alt_code_stmt = stmt;
1476 if (chain_stmts)
1477 chain_stmts->safe_push (stmt);
1478 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1479 {
1480 tree op = gimple_op (stmt, opnum);
1481 vect_def_type dt;
1482 stmt_vec_info def_stmt_info;
1483 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1484 gcc_assert (res);
1485 if (dt == vect_internal_def
1486 && is_pattern_stmt_p (def_stmt_info))
1487 op = gimple_get_lhs (def_stmt_info->stmt);
1488 gimple *use_stmt;
1489 use_operand_p use_p;
1490 if (dt == vect_internal_def
1491 && single_imm_use (op, &use_p, &use_stmt)
1492 && is_gimple_assign (def_stmt_info->stmt)
1493 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1494 || (code == PLUS_EXPR
1495 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1496 == MINUS_EXPR))))
1497 {
1498 tree_code op_def_code = this_code;
1499 if (op_def_code == MINUS_EXPR && opnum == 1)
1500 op_def_code = PLUS_EXPR;
1501 if (in_code == MINUS_EXPR)
1502 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1503 worklist.safe_push (std::make_pair (op_def_code,
1504 def_stmt_info->stmt));
1505 }
1506 else
1507 {
1508 tree_code op_def_code = this_code;
1509 if (op_def_code == MINUS_EXPR && opnum == 1)
1510 op_def_code = PLUS_EXPR;
1511 if (in_code == MINUS_EXPR)
1512 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1513 chain.safe_push (chain_op_t (op_def_code, dt, op));
1514 }
1515 }
1516 }
1517 }
1518
1519 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1520 simple_hashmap_traits <bst_traits, slp_tree> >
1521 scalar_stmts_to_slp_tree_map_t;
1522
1523 static slp_tree
1524 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1525 vec<stmt_vec_info> stmts, unsigned int group_size,
1526 poly_uint64 *max_nunits,
1527 bool *matches, unsigned *limit, unsigned *tree_size,
1528 scalar_stmts_to_slp_tree_map_t *bst_map);
1529
1530 static slp_tree
1531 vect_build_slp_tree (vec_info *vinfo,
1532 vec<stmt_vec_info> stmts, unsigned int group_size,
1533 poly_uint64 *max_nunits,
1534 bool *matches, unsigned *limit, unsigned *tree_size,
1535 scalar_stmts_to_slp_tree_map_t *bst_map)
1536 {
1537 if (slp_tree *leader = bst_map->get (stmts))
1538 {
1539 if (dump_enabled_p ())
1540 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1541 !(*leader)->failed ? "" : "failed ", *leader);
1542 if (!(*leader)->failed)
1543 {
1544 SLP_TREE_REF_COUNT (*leader)++;
1545 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1546 stmts.release ();
1547 return *leader;
1548 }
1549 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1550 return NULL;
1551 }
1552
1553 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1554 so we can pick up backedge destinations during discovery. */
1555 slp_tree res = new _slp_tree;
1556 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1557 SLP_TREE_SCALAR_STMTS (res) = stmts;
1558 bst_map->put (stmts.copy (), res);
1559
1560 if (*limit == 0)
1561 {
1562 if (dump_enabled_p ())
1563 dump_printf_loc (MSG_NOTE, vect_location,
1564 "SLP discovery limit exceeded\n");
1565 /* Mark the node invalid so we can detect those when still in use
1566 as backedge destinations. */
1567 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1568 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1569 res->failed = XNEWVEC (bool, group_size);
1570 memset (res->failed, 0, sizeof (bool) * group_size);
1571 memset (matches, 0, sizeof (bool) * group_size);
1572 return NULL;
1573 }
1574 --*limit;
1575
1576 if (dump_enabled_p ())
1577 dump_printf_loc (MSG_NOTE, vect_location,
1578 "starting SLP discovery for node %p\n", res);
1579
1580 poly_uint64 this_max_nunits = 1;
1581 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1582 &this_max_nunits,
1583 matches, limit, tree_size, bst_map);
1584 if (!res_)
1585 {
1586 if (dump_enabled_p ())
1587 dump_printf_loc (MSG_NOTE, vect_location,
1588 "SLP discovery for node %p failed\n", res);
1589 /* Mark the node invalid so we can detect those when still in use
1590 as backedge destinations. */
1591 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1592 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1593 res->failed = XNEWVEC (bool, group_size);
1594 if (flag_checking)
1595 {
1596 unsigned i;
1597 for (i = 0; i < group_size; ++i)
1598 if (!matches[i])
1599 break;
1600 gcc_assert (i < group_size);
1601 }
1602 memcpy (res->failed, matches, sizeof (bool) * group_size);
1603 }
1604 else
1605 {
1606 if (dump_enabled_p ())
1607 dump_printf_loc (MSG_NOTE, vect_location,
1608 "SLP discovery for node %p succeeded\n", res);
1609 gcc_assert (res_ == res);
1610 res->max_nunits = this_max_nunits;
1611 vect_update_max_nunits (max_nunits, this_max_nunits);
1612 /* Keep a reference for the bst_map use. */
1613 SLP_TREE_REF_COUNT (res)++;
1614 }
1615 return res_;
1616 }
1617
1618 /* Helper for building an associated SLP node chain. */
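/* It creates two children that compute OPER1 resp. OPER2 on the shared
operands OP0 and OP1 and turns PERM into a VEC_PERM_EXPR node with vector
type VECTYPE that selects lanes from the two according to LPERM. */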
1619
1620 static void
1621 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1622 slp_tree op0, slp_tree op1,
1623 stmt_vec_info oper1, stmt_vec_info oper2,
1624 vec<std::pair<unsigned, unsigned> > lperm)
1625 {
1626 unsigned group_size = SLP_TREE_LANES (op1);
1627
1628 slp_tree child1 = new _slp_tree;
1629 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1630 SLP_TREE_VECTYPE (child1) = vectype;
1631 SLP_TREE_LANES (child1) = group_size;
1632 SLP_TREE_CHILDREN (child1).create (2);
1633 SLP_TREE_CHILDREN (child1).quick_push (op0);
1634 SLP_TREE_CHILDREN (child1).quick_push (op1);
1635 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1636
1637 slp_tree child2 = new _slp_tree;
1638 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1639 SLP_TREE_VECTYPE (child2) = vectype;
1640 SLP_TREE_LANES (child2) = group_size;
1641 SLP_TREE_CHILDREN (child2).create (2);
1642 SLP_TREE_CHILDREN (child2).quick_push (op0);
1643 SLP_TREE_REF_COUNT (op0)++;
1644 SLP_TREE_CHILDREN (child2).quick_push (op1);
1645 SLP_TREE_REF_COUNT (op1)++;
1646 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1647
1648 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1649 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1650 SLP_TREE_VECTYPE (perm) = vectype;
1651 SLP_TREE_LANES (perm) = group_size;
1652 /* ??? We should set this to NULL but that's not expected. */
1653 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1654 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1655 SLP_TREE_CHILDREN (perm).quick_push (child1);
1656 SLP_TREE_CHILDREN (perm).quick_push (child2);
1657 }
1658
1659 /* Recursively build an SLP tree for the group of stmts STMTS rooted
1660 at NODE. Return NODE on success; return NULL if the def-stmts are
1661 not isomorphic, require a data permutation or are of unsupported
1662 types of operation. On failure MATCHES indicates which stmts are
1663 not isomorphic to the first one, with MATCHES[0] being false for a
1664 fatal, non-correctable mismatch. */
1665
1666 static slp_tree
1667 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1668 vec<stmt_vec_info> stmts, unsigned int group_size,
1669 poly_uint64 *max_nunits,
1670 bool *matches, unsigned *limit, unsigned *tree_size,
1671 scalar_stmts_to_slp_tree_map_t *bst_map)
1672 {
1673 unsigned nops, i, this_tree_size = 0;
1674 poly_uint64 this_max_nunits = *max_nunits;
1675
1676 matches[0] = false;
1677
1678 stmt_vec_info stmt_info = stmts[0];
1679 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1680 nops = gimple_call_num_args (stmt);
1681 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1682 {
1683 nops = gimple_num_ops (stmt) - 1;
1684 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1685 nops++;
1686 }
1687 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1688 nops = gimple_phi_num_args (phi);
1689 else
1690 return NULL;
1691
1692 /* If the SLP node is a PHI (induction or reduction), terminate
1693 the recursion. */
1694 bool *skip_args = XALLOCAVEC (bool, nops);
1695 memset (skip_args, 0, sizeof (bool) * nops);
1696 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1697 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1698 {
1699 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1700 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1701 group_size);
1702 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1703 max_nunits))
1704 return NULL;
1705
1706 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1707 if (def_type == vect_induction_def)
1708 {
1709 /* Induction PHIs are not cycles but walk the initial
1710 value. Only for inner loops though; for outer loops
1711 we need to pick up the value from the actual PHIs
1712 to more easily support peeling and epilogue vectorization. */
1713 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1714 if (!nested_in_vect_loop_p (loop, stmt_info))
1715 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1716 else
1717 loop = loop->inner;
1718 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1719 }
1720 else if (def_type == vect_reduction_def
1721 || def_type == vect_double_reduction_def
1722 || def_type == vect_nested_cycle)
1723 {
1724 /* Else def types have to match. */
1725 stmt_vec_info other_info;
1726 bool all_same = true;
1727 FOR_EACH_VEC_ELT (stmts, i, other_info)
1728 {
1729 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1730 return NULL;
1731 if (other_info != stmt_info)
1732 all_same = false;
1733 }
1734 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1735 /* Reduction initial values are not explicitly represented. */
1736 if (!nested_in_vect_loop_p (loop, stmt_info))
1737 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1738 /* Reduction chain backedge defs are filled manually.
1739 ??? Need a better way to identify an SLP reduction chain PHI.
1740 Or a better overall way to SLP match those. */
1741 if (all_same && def_type == vect_reduction_def)
1742 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1743 }
1744 else if (def_type != vect_internal_def)
1745 return NULL;
1746 }
1747
1748
1749 bool two_operators = false;
1750 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1751 tree vectype = NULL_TREE;
1752 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1753 &this_max_nunits, matches, &two_operators,
1754 &vectype))
1755 return NULL;
1756
1757 /* If the SLP node is a load, terminate the recursion unless masked. */
1758 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1759 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1760 {
1761 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1762 {
1763 /* Masked load. */
1764 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1765 nops = 1;
1766 }
1767 else
1768 {
1769 *max_nunits = this_max_nunits;
1770 (*tree_size)++;
1771 node = vect_create_new_slp_node (node, stmts, 0);
1772 SLP_TREE_VECTYPE (node) = vectype;
1773 /* And compute the load permutation. Whether it is actually
1774 a permutation depends on the unrolling factor which is
1775 decided later. */
1776 vec<unsigned> load_permutation;
1777 int j;
1778 stmt_vec_info load_info;
1779 load_permutation.create (group_size);
1780 stmt_vec_info first_stmt_info
1781 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1782 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1783 {
1784 int load_place = vect_get_place_in_interleaving_chain
1785 (load_info, first_stmt_info);
1786 gcc_assert (load_place != -1);
1787 load_permutation.safe_push (load_place);
1788 }
1789 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1790 return node;
1791 }
1792 }
1793 else if (gimple_assign_single_p (stmt_info->stmt)
1794 && !gimple_vuse (stmt_info->stmt)
1795 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1796 {
1797 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1798 the same SSA name vector of a type compatible with vectype. */
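/* For instance, with a V4SI vectype the extract
   _1 = BIT_FIELD_REF <v_2, 32, 64> selects lane 64/32 == 2 of v_2.  */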
1799 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1800 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1801 stmt_vec_info estmt_info;
1802 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1803 {
1804 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1805 tree bfref = gimple_assign_rhs1 (estmt);
1806 HOST_WIDE_INT lane;
1807 if (!known_eq (bit_field_size (bfref),
1808 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1809 || !constant_multiple_p (bit_field_offset (bfref),
1810 bit_field_size (bfref), &lane))
1811 {
1812 lperm.release ();
1813 return NULL;
1814 }
1815 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1816 }
1817 slp_tree vnode = vect_create_new_slp_node (vNULL);
1818 /* ??? We record vectype here but we hide eventually necessary
1819 punning and instead rely on code generation to materialize
1820 VIEW_CONVERT_EXPRs as necessary. We instead should make
1821 this explicit somehow. */
1822 SLP_TREE_VECTYPE (vnode) = vectype;
1823 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1824 /* We are always building a permutation node even if it is an identity
1825 permute to shield the rest of the vectorizer from the odd node
1826 representing an actual vector without any scalar ops.
1827 ??? We could hide it completely by making the permute node
1828 external? */
1829 node = vect_create_new_slp_node (node, stmts, 1);
1830 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1831 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1832 SLP_TREE_VECTYPE (node) = vectype;
1833 SLP_TREE_CHILDREN (node).quick_push (vnode);
1834 return node;
1835 }
1836 /* When discovery reaches an associatable operation see whether we can
1837 improve that to match up lanes in a way superior to the operand
1838 swapping code which at most looks at two defs.
1839 ??? For BB vectorization we cannot do the brute-force search
1840 for matching as we can succeed by means of builds from scalars
1841 and have no good way to "cost" one build against another. */
1842 else if (is_a <loop_vec_info> (vinfo)
1843 /* ??? We don't handle !vect_internal_def defs below. */
1844 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1845 && is_gimple_assign (stmt_info->stmt)
1846 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1847 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1848 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1849 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1850 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1851 {
1852 /* See if we have a chain of (mixed) adds or subtracts or other
1853 associatable ops. */
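/* For example, the lane stmt x = a + b - c + d linearizes to the
   chain { +a, +b, -c, +d } with CODE canonicalized to PLUS_EXPR.  */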
1854 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1855 if (code == MINUS_EXPR)
1856 code = PLUS_EXPR;
1857 stmt_vec_info other_op_stmt_info = NULL;
1858 stmt_vec_info op_stmt_info = NULL;
1859 unsigned chain_len = 0;
1860 auto_vec<chain_op_t> chain;
1861 auto_vec<std::pair<tree_code, gimple *> > worklist;
1862 auto_vec<vec<chain_op_t> > chains (group_size);
1863 auto_vec<slp_tree, 4> children;
1864 bool hard_fail = true;
1865 for (unsigned lane = 0; lane < group_size; ++lane)
1866 {
1867 /* For each lane linearize the addition/subtraction (or other
1868 uniform associatable operation) expression tree. */
1869 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1870 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1871 stmts[lane]->stmt, op_stmt, other_op_stmt,
1872 NULL);
1873 if (!op_stmt_info && op_stmt)
1874 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1875 if (!other_op_stmt_info && other_op_stmt)
1876 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1877 if (chain.length () == 2)
1878 {
1879 /* In a chain of just two elements resort to the regular
1880 operand swapping scheme. If we run into a length
1881 mismatch still hard-FAIL. */
1882 if (chain_len == 0)
1883 hard_fail = false;
1884 else
1885 {
1886 matches[lane] = false;
1887 /* ??? We might want to process the other lanes, but
1888 make sure to not give false matching hints to the
1889 caller for lanes we did not process. */
1890 if (lane != group_size - 1)
1891 matches[0] = false;
1892 }
1893 break;
1894 }
1895 else if (chain_len == 0)
1896 chain_len = chain.length ();
1897 else if (chain.length () != chain_len)
1898 {
1899 /* ??? Here we could slip in magic to compensate with
1900 neutral operands. */
1901 matches[lane] = false;
1902 if (lane != group_size - 1)
1903 matches[0] = false;
1904 break;
1905 }
1906 chains.quick_push (chain.copy ());
1907 chain.truncate (0);
1908 }
1909 if (chains.length () == group_size)
1910 {
1911 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
1912 if (!op_stmt_info)
1913 {
1914 hard_fail = false;
1915 goto out;
1916 }
1917 /* Now we have a set of chains with the same length. */
1918 /* 1. pre-sort according to def_type and operation. */
1919 for (unsigned lane = 0; lane < group_size; ++lane)
1920 chains[lane].stablesort (dt_sort_cmp, vinfo);
1921 if (dump_enabled_p ())
1922 {
1923 dump_printf_loc (MSG_NOTE, vect_location,
1924 "pre-sorted chains of %s\n",
1925 get_tree_code_name (code));
1926 for (unsigned lane = 0; lane < group_size; ++lane)
1927 {
1928 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1929 dump_printf (MSG_NOTE, "%s %T ",
1930 get_tree_code_name (chains[lane][opnum].code),
1931 chains[lane][opnum].op);
1932 dump_printf (MSG_NOTE, "\n");
1933 }
1934 }
1935 /* 2. try to build children nodes, associating as necessary. */
1936 for (unsigned n = 0; n < chain_len; ++n)
1937 {
1938 vect_def_type dt = chains[0][n].dt;
1939 unsigned lane;
1940 for (lane = 0; lane < group_size; ++lane)
1941 if (chains[lane][n].dt != dt)
1942 {
1943 if (dt == vect_constant_def
1944 && chains[lane][n].dt == vect_external_def)
1945 dt = vect_external_def;
1946 else if (dt == vect_external_def
1947 && chains[lane][n].dt == vect_constant_def)
1948 ;
1949 else
1950 break;
1951 }
1952 if (lane != group_size)
1953 {
1954 if (dump_enabled_p ())
1955 dump_printf_loc (MSG_NOTE, vect_location,
1956 "giving up on chain due to mismatched "
1957 "def types\n");
1958 matches[lane] = false;
1959 if (lane != group_size - 1)
1960 matches[0] = false;
1961 goto out;
1962 }
1963 if (dt == vect_constant_def
1964 || dt == vect_external_def)
1965 {
1966 /* We can always build those. Might want to sort last
1967 or defer building. */
1968 vec<tree> ops;
1969 ops.create (group_size);
1970 for (lane = 0; lane < group_size; ++lane)
1971 ops.quick_push (chains[lane][n].op);
1972 slp_tree child = vect_create_new_slp_node (ops);
1973 SLP_TREE_DEF_TYPE (child) = dt;
1974 children.safe_push (child);
1975 }
1976 else if (dt != vect_internal_def)
1977 {
1978 /* Not sure, we might need something special.
1979 gcc.dg/vect/pr96854.c,
1980 gfortran.dg/vect/fast-math-pr37021.f90
1981 and gfortran.dg/vect/pr61171.f trigger. */
1982 /* Soft-fail for now. */
1983 hard_fail = false;
1984 goto out;
1985 }
1986 else
1987 {
1988 vec<stmt_vec_info> op_stmts;
1989 op_stmts.create (group_size);
1990 slp_tree child = NULL;
1991 /* Brute-force our way. We have to consider a lane
1992 failing after fixing an earlier fail up in the
1993 SLP discovery recursion. So track the current
1994 permute per lane. */
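/* For example, if lane 2 fails to match for operand N we swap in
   chains[2][N + 1], then chains[2][N + 2] and so on, retrying SLP
   discovery until it succeeds or the candidates are exhausted.  */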
1995 unsigned *perms = XALLOCAVEC (unsigned, group_size);
1996 memset (perms, 0, sizeof (unsigned) * group_size);
1997 do
1998 {
1999 op_stmts.truncate (0);
2000 for (lane = 0; lane < group_size; ++lane)
2001 op_stmts.quick_push
2002 (vinfo->lookup_def (chains[lane][n].op));
2003 child = vect_build_slp_tree (vinfo, op_stmts,
2004 group_size, &this_max_nunits,
2005 matches, limit,
2006 &this_tree_size, bst_map);
2007 /* ??? We're likely getting too many fatal mismatches
2008 here so maybe we want to ignore them (but then we
2009 have no idea which lanes fatally mismatched). */
2010 if (child || !matches[0])
2011 break;
2012 /* Swap another lane we have not yet matched up into
2013 lanes that did not match. If we run out of
2014 permute possibilities for a lane terminate the
2015 search. */
2016 bool term = false;
2017 for (lane = 1; lane < group_size; ++lane)
2018 if (!matches[lane])
2019 {
2020 if (n + perms[lane] + 1 == chain_len)
2021 {
2022 term = true;
2023 break;
2024 }
2025 std::swap (chains[lane][n],
2026 chains[lane][n + perms[lane] + 1]);
2027 perms[lane]++;
2028 }
2029 if (term)
2030 break;
2031 }
2032 while (1);
2033 if (!child)
2034 {
2035 if (dump_enabled_p ())
2036 dump_printf_loc (MSG_NOTE, vect_location,
2037 "failed to match up op %d\n", n);
2038 op_stmts.release ();
2039 if (lane != group_size - 1)
2040 matches[0] = false;
2041 else
2042 matches[lane] = false;
2043 goto out;
2044 }
2045 if (dump_enabled_p ())
2046 {
2047 dump_printf_loc (MSG_NOTE, vect_location,
2048 "matched up op %d to\n", n);
2049 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2050 }
2051 children.safe_push (child);
2052 }
2053 }
2054 /* 3. build SLP nodes to combine the chain. */
2055 for (unsigned lane = 0; lane < group_size; ++lane)
2056 if (chains[lane][0].code != code)
2057 {
2058 /* See if there's any alternate all-PLUS entry. */
2059 unsigned n;
2060 for (n = 1; n < chain_len; ++n)
2061 {
2062 for (lane = 0; lane < group_size; ++lane)
2063 if (chains[lane][n].code != code)
2064 break;
2065 if (lane == group_size)
2066 break;
2067 }
2068 if (n != chain_len)
2069 {
2070 /* Swap that in at first position. */
2071 std::swap (children[0], children[n]);
2072 for (lane = 0; lane < group_size; ++lane)
2073 std::swap (chains[lane][0], chains[lane][n]);
2074 }
2075 else
2076 {
2077 /* ??? When this triggers and we end up with two
2078 vect_constant/external_def up-front things break (ICE)
2079 spectacularly finding an insertion place for the
2080 all-constant op. We should have a fully
2081 vect_internal_def operand though(?) so we can swap
2082 that into first place and then prepend the all-zero
2083 constant. */
2084 if (dump_enabled_p ())
2085 dump_printf_loc (MSG_NOTE, vect_location,
2086 "inserting constant zero to compensate "
2087 "for (partially) negated first "
2088 "operand\n");
2089 chain_len++;
2090 for (lane = 0; lane < group_size; ++lane)
2091 chains[lane].safe_insert
2092 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2093 vec<tree> zero_ops;
2094 zero_ops.create (group_size);
2095 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2096 for (lane = 1; lane < group_size; ++lane)
2097 zero_ops.quick_push (zero_ops[0]);
2098 slp_tree zero = vect_create_new_slp_node (zero_ops);
2099 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2100 children.safe_insert (0, zero);
2101 }
2102 break;
2103 }
2104 for (unsigned i = 1; i < children.length (); ++i)
2105 {
2106 slp_tree op0 = children[i - 1];
2107 slp_tree op1 = children[i];
2108 bool this_two_op = false;
2109 for (unsigned lane = 0; lane < group_size; ++lane)
2110 if (chains[lane][i].code != chains[0][i].code)
2111 {
2112 this_two_op = true;
2113 break;
2114 }
2115 slp_tree child;
2116 if (i == children.length () - 1)
2117 child = vect_create_new_slp_node (node, stmts, 2);
2118 else
2119 child = vect_create_new_slp_node (2, ERROR_MARK);
2120 if (this_two_op)
2121 {
2122 vec<std::pair<unsigned, unsigned> > lperm;
2123 lperm.create (group_size);
2124 for (unsigned lane = 0; lane < group_size; ++lane)
2125 lperm.quick_push (std::make_pair
2126 (chains[lane][i].code != chains[0][i].code, lane));
2127 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2128 (chains[0][i].code == code
2129 ? op_stmt_info
2130 : other_op_stmt_info),
2131 (chains[0][i].code == code
2132 ? other_op_stmt_info
2133 : op_stmt_info),
2134 lperm);
2135 }
2136 else
2137 {
2138 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2139 SLP_TREE_VECTYPE (child) = vectype;
2140 SLP_TREE_LANES (child) = group_size;
2141 SLP_TREE_CHILDREN (child).quick_push (op0);
2142 SLP_TREE_CHILDREN (child).quick_push (op1);
2143 SLP_TREE_REPRESENTATIVE (child)
2144 = (chains[0][i].code == code
2145 ? op_stmt_info : other_op_stmt_info);
2146 }
2147 children[i] = child;
2148 }
2149 *tree_size += this_tree_size + 1;
2150 *max_nunits = this_max_nunits;
2151 while (!chains.is_empty ())
2152 chains.pop ().release ();
2153 return node;
2154 }
2155 out:
2156 while (!children.is_empty ())
2157 vect_free_slp_tree (children.pop ());
2158 while (!chains.is_empty ())
2159 chains.pop ().release ();
2160 /* Hard-fail, otherwise we might run into quadratic processing of the
2161 chains starting one stmt into the chain again. */
2162 if (hard_fail)
2163 return NULL;
2164 /* Fall thru to normal processing. */
2165 }
2166
2167 /* Get at the operands, verifying they are compatible. */
2168 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2169 slp_oprnd_info oprnd_info;
2170 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2171 {
2172 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2173 stmts, i, &oprnds_info);
2174 if (res != 0)
2175 matches[(res == -1) ? 0 : i] = false;
2176 if (!matches[0])
2177 break;
2178 }
2179 for (i = 0; i < group_size; ++i)
2180 if (!matches[i])
2181 {
2182 vect_free_oprnd_info (oprnds_info);
2183 return NULL;
2184 }
2185 swap = NULL;
2186
2187 auto_vec<slp_tree, 4> children;
2188
2189 stmt_info = stmts[0];
2190
2191 /* Create SLP_TREE nodes for the definition node/s. */
2192 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2193 {
2194 slp_tree child;
2195 unsigned int j;
2196
2197 /* We're skipping certain operands from processing, for example
2198 outer loop reduction initial defs. */
2199 if (skip_args[i])
2200 {
2201 children.safe_push (NULL);
2202 continue;
2203 }
2204
2205 if (oprnd_info->first_dt == vect_uninitialized_def)
2206 {
2207 /* COND_EXPRs may have one operand too many if the condition
2208 is an SSA name. */
2209 gcc_assert (i == 3 && nops == 4);
2210 continue;
2211 }
2212
2213 if (is_a <bb_vec_info> (vinfo)
2214 && oprnd_info->first_dt == vect_internal_def
2215 && !oprnd_info->any_pattern)
2216 {
2217 /* For BB vectorization, if all defs are the same do not
2218 bother to continue the build along the single-lane
2219 graph but use a splat of the scalar value. */
2220 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2221 for (j = 1; j < group_size; ++j)
2222 if (oprnd_info->def_stmts[j] != first_def)
2223 break;
2224 if (j == group_size
2225 /* But avoid doing this for loads where we may be
2226 able to CSE things, unless the stmt is not
2227 vectorizable. */
2228 && (!STMT_VINFO_VECTORIZABLE (first_def)
2229 || !gimple_vuse (first_def->stmt)))
2230 {
2231 if (dump_enabled_p ())
2232 dump_printf_loc (MSG_NOTE, vect_location,
2233 "Using a splat of the uniform operand\n");
2234 oprnd_info->first_dt = vect_external_def;
2235 }
2236 }
2237
2238 if (oprnd_info->first_dt == vect_external_def
2239 || oprnd_info->first_dt == vect_constant_def)
2240 {
2241 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2242 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2243 oprnd_info->ops = vNULL;
2244 children.safe_push (invnode);
2245 continue;
2246 }
2247
2248 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2249 group_size, &this_max_nunits,
2250 matches, limit,
2251 &this_tree_size, bst_map)) != NULL)
2252 {
2253 oprnd_info->def_stmts = vNULL;
2254 children.safe_push (child);
2255 continue;
2256 }
2257
2258 /* If the SLP build for operand zero failed and operand zero
2259 and one can be commuted, try that for the scalar stmts
2260 that failed the match. */
2261 if (i == 0
2262 /* A first scalar stmt mismatch signals a fatal mismatch. */
2263 && matches[0]
2264 /* ??? For COND_EXPRs we can swap the comparison operands
2265 as well as the arms under some constraints. */
2266 && nops == 2
2267 && oprnds_info[1]->first_dt == vect_internal_def
2268 && is_gimple_assign (stmt_info->stmt)
2269 /* Swapping operands for reductions breaks assumptions later on. */
2270 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2271 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2272 {
2273 /* See whether we can swap the matching or the non-matching
2274 stmt operands. */
2275 bool swap_not_matching = true;
2276 do
2277 {
2278 for (j = 0; j < group_size; ++j)
2279 {
2280 if (matches[j] != !swap_not_matching)
2281 continue;
2282 stmt_vec_info stmt_info = stmts[j];
2283 /* Verify if we can swap operands of this stmt. */
2284 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2285 if (!stmt
2286 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2287 {
2288 if (!swap_not_matching)
2289 goto fail;
2290 swap_not_matching = false;
2291 break;
2292 }
2293 }
2294 }
2295 while (j != group_size);
2296
2297 /* Swap mismatched definition stmts. */
2298 if (dump_enabled_p ())
2299 dump_printf_loc (MSG_NOTE, vect_location,
2300 "Re-trying with swapped operands of stmts ");
2301 for (j = 0; j < group_size; ++j)
2302 if (matches[j] == !swap_not_matching)
2303 {
2304 std::swap (oprnds_info[0]->def_stmts[j],
2305 oprnds_info[1]->def_stmts[j]);
2306 std::swap (oprnds_info[0]->ops[j],
2307 oprnds_info[1]->ops[j]);
2308 if (dump_enabled_p ())
2309 dump_printf (MSG_NOTE, "%d ", j);
2310 }
2311 if (dump_enabled_p ())
2312 dump_printf (MSG_NOTE, "\n");
2313 /* And try again with scratch 'matches' ... */
2314 bool *tem = XALLOCAVEC (bool, group_size);
2315 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2316 group_size, &this_max_nunits,
2317 tem, limit,
2318 &this_tree_size, bst_map)) != NULL)
2319 {
2320 oprnd_info->def_stmts = vNULL;
2321 children.safe_push (child);
2322 continue;
2323 }
2324 }
2325 fail:
2326
2327 /* If the SLP build failed and we analyze a basic-block
2328 simply treat nodes we fail to build as externally defined
2329 (and thus build vectors from the scalar defs).
2330 The cost model will reject outright expensive cases.
2331 ??? This doesn't treat cases where permutation ultimately
2332 fails (or we don't try permutation below). Ideally we'd
2333 even compute a permutation that will end up with the maximum
2334 SLP tree size... */
2335 if (is_a <bb_vec_info> (vinfo)
2336 /* ??? Rejecting patterns this way doesn't work. We'd have to
2337 do extra work to cancel the pattern so the uses see the
2338 scalar version. */
2339 && !is_pattern_stmt_p (stmt_info)
2340 && !oprnd_info->any_pattern)
2341 {
2342 /* But if there's a leading vector-sized set of matching stmts
2343 fail here so we can split the group. This matches the condition
2344 vect_analyze_slp_instance uses. */
2345 /* ??? We might want to split here and combine the results to support
2346 multiple vector sizes better. */
2347 for (j = 0; j < group_size; ++j)
2348 if (!matches[j])
2349 break;
2350 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2351 {
2352 if (dump_enabled_p ())
2353 dump_printf_loc (MSG_NOTE, vect_location,
2354 "Building vector operands from scalars\n");
2355 this_tree_size++;
2356 child = vect_create_new_slp_node (oprnd_info->ops);
2357 children.safe_push (child);
2358 oprnd_info->ops = vNULL;
2359 continue;
2360 }
2361 }
2362
2363 gcc_assert (child == NULL);
2364 FOR_EACH_VEC_ELT (children, j, child)
2365 if (child)
2366 vect_free_slp_tree (child);
2367 vect_free_oprnd_info (oprnds_info);
2368 return NULL;
2369 }
2370
2371 vect_free_oprnd_info (oprnds_info);
2372
2373 /* If all children of this node are built up from uniform scalars,
2374 or it requires more than one possibly expensive vector construction,
2375 then just throw the node away, causing it to be built up from scalars.
2376 The exception is the SLP node for the vector store. */
2377 if (is_a <bb_vec_info> (vinfo)
2378 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2379 /* ??? Rejecting patterns this way doesn't work. We'd have to
2380 do extra work to cancel the pattern so the uses see the
2381 scalar version. */
2382 && !is_pattern_stmt_p (stmt_info))
2383 {
2384 slp_tree child;
2385 unsigned j;
2386 bool all_uniform_p = true;
2387 unsigned n_vector_builds = 0;
2388 FOR_EACH_VEC_ELT (children, j, child)
2389 {
2390 if (!child)
2391 ;
2392 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2393 all_uniform_p = false;
2394 else if (!vect_slp_tree_uniform_p (child))
2395 {
2396 all_uniform_p = false;
2397 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2398 n_vector_builds++;
2399 }
2400 }
2401 if (all_uniform_p
2402 || n_vector_builds > 1
2403 || (n_vector_builds == children.length ()
2404 && is_a <gphi *> (stmt_info->stmt)))
2405 {
2406 /* Roll back. */
2407 matches[0] = false;
2408 FOR_EACH_VEC_ELT (children, j, child)
2409 if (child)
2410 vect_free_slp_tree (child);
2411
2412 if (dump_enabled_p ())
2413 dump_printf_loc (MSG_NOTE, vect_location,
2414 "Building parent vector operands from "
2415 "scalars instead\n");
2416 return NULL;
2417 }
2418 }
2419
2420 *tree_size += this_tree_size + 1;
2421 *max_nunits = this_max_nunits;
2422
2423 if (two_operators)
2424 {
2425 /* ??? We'd likely want to either cache in bst_map something like
2426 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2427 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2428 explicit stmts to put in so the keying on 'stmts' doesn't
2429 work (but we have the same issue with nodes that use 'ops'). */
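/* For the classic addsub pattern { a0 + b0, a1 - b1, a2 + b2, a3 - b3 }
   node ONE ends up performing all additions, node TWO all subtractions
   and the parent VEC_PERM_EXPR selects lanes { 0[0], 1[1], 0[2], 1[3] }.  */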
2430 slp_tree one = new _slp_tree;
2431 slp_tree two = new _slp_tree;
2432 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2433 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2434 SLP_TREE_VECTYPE (one) = vectype;
2435 SLP_TREE_VECTYPE (two) = vectype;
2436 SLP_TREE_CHILDREN (one).safe_splice (children);
2437 SLP_TREE_CHILDREN (two).safe_splice (children);
2438 slp_tree child;
2439 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2440 SLP_TREE_REF_COUNT (child)++;
2441
2442 /* Here we record the original defs since this
2443 node represents the final lane configuration. */
2444 node = vect_create_new_slp_node (node, stmts, 2);
2445 SLP_TREE_VECTYPE (node) = vectype;
2446 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2447 SLP_TREE_CHILDREN (node).quick_push (one);
2448 SLP_TREE_CHILDREN (node).quick_push (two);
2449 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2450 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2451 enum tree_code ocode = ERROR_MARK;
2452 stmt_vec_info ostmt_info;
2453 unsigned j = 0;
2454 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2455 {
2456 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2457 if (gimple_assign_rhs_code (ostmt) != code0)
2458 {
2459 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2460 ocode = gimple_assign_rhs_code (ostmt);
2461 j = i;
2462 }
2463 else
2464 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2465 }
2466 SLP_TREE_CODE (one) = code0;
2467 SLP_TREE_CODE (two) = ocode;
2468 SLP_TREE_LANES (one) = stmts.length ();
2469 SLP_TREE_LANES (two) = stmts.length ();
2470 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2471 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2472 return node;
2473 }
2474
2475 node = vect_create_new_slp_node (node, stmts, nops);
2476 SLP_TREE_VECTYPE (node) = vectype;
2477 SLP_TREE_CHILDREN (node).splice (children);
2478 return node;
2479 }
2480
2481 /* Dump a single SLP tree NODE. */
2482
2483 static void
2484 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2485 slp_tree node)
2486 {
2487 unsigned i, j;
2488 slp_tree child;
2489 stmt_vec_info stmt_info;
2490 tree op;
2491
2492 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2493 dump_user_location_t user_loc = loc.get_user_location ();
2494 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
2495 SLP_TREE_DEF_TYPE (node) == vect_external_def
2496 ? " (external)"
2497 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2498 ? " (constant)"
2499 : ""), node,
2500 estimated_poly_value (node->max_nunits),
2501 SLP_TREE_REF_COUNT (node));
2502 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2503 {
2504 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2505 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2506 else
2507 dump_printf_loc (metadata, user_loc, "op template: %G",
2508 SLP_TREE_REPRESENTATIVE (node)->stmt);
2509 }
2510 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2511 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2512 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2513 else
2514 {
2515 dump_printf_loc (metadata, user_loc, "\t{ ");
2516 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2517 dump_printf (metadata, "%T%s ", op,
2518 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2519 dump_printf (metadata, "}\n");
2520 }
2521 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2522 {
2523 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2524 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2525 dump_printf (dump_kind, " %u", j);
2526 dump_printf (dump_kind, " }\n");
2527 }
2528 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2529 {
2530 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2531 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2532 dump_printf (dump_kind, " %u[%u]",
2533 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2534 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2535 dump_printf (dump_kind, " }\n");
2536 }
2537 if (SLP_TREE_CHILDREN (node).is_empty ())
2538 return;
2539 dump_printf_loc (metadata, user_loc, "\tchildren");
2540 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2541 dump_printf (dump_kind, " %p", (void *)child);
2542 dump_printf (dump_kind, "\n");
2543 }
2544
2545 DEBUG_FUNCTION void
2546 debug (slp_tree node)
2547 {
2548 debug_dump_context ctx;
2549 vect_print_slp_tree (MSG_NOTE,
2550 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2551 node);
2552 }
2553
2554 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2555
2556 static void
2557 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2558 slp_tree node, hash_set<slp_tree> &visited)
2559 {
2560 unsigned i;
2561 slp_tree child;
2562
2563 if (visited.add (node))
2564 return;
2565
2566 vect_print_slp_tree (dump_kind, loc, node);
2567
2568 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2569 if (child)
2570 vect_print_slp_graph (dump_kind, loc, child, visited);
2571 }
2572
2573 static void
2574 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2575 slp_tree entry)
2576 {
2577 hash_set<slp_tree> visited;
2578 vect_print_slp_graph (dump_kind, loc, entry, visited);
2579 }
2580
2581 /* Mark the tree rooted at NODE with PURE_SLP. */
2582
2583 static void
2584 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2585 {
2586 int i;
2587 stmt_vec_info stmt_info;
2588 slp_tree child;
2589
2590 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2591 return;
2592
2593 if (visited.add (node))
2594 return;
2595
2596 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2597 STMT_SLP_TYPE (stmt_info) = pure_slp;
2598
2599 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2600 if (child)
2601 vect_mark_slp_stmts (child, visited);
2602 }
2603
2604 static void
2605 vect_mark_slp_stmts (slp_tree node)
2606 {
2607 hash_set<slp_tree> visited;
2608 vect_mark_slp_stmts (node, visited);
2609 }
2610
2611 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2612
2613 static void
2614 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2615 {
2616 int i;
2617 stmt_vec_info stmt_info;
2618 slp_tree child;
2619
2620 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2621 return;
2622
2623 if (visited.add (node))
2624 return;
2625
2626 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2627 {
2628 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2629 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2630 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2631 }
2632
2633 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2634 if (child)
2635 vect_mark_slp_stmts_relevant (child, visited);
2636 }
2637
2638 static void
2639 vect_mark_slp_stmts_relevant (slp_tree node)
2640 {
2641 hash_set<slp_tree> visited;
2642 vect_mark_slp_stmts_relevant (node, visited);
2643 }
2644
2645
2646 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2647
2648 static void
2649 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2650 hash_set<slp_tree> &visited)
2651 {
2652 if (!node || visited.add (node))
2653 return;
2654
2655 if (SLP_TREE_CHILDREN (node).length () == 0)
2656 {
2657 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2658 return;
2659 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2660 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2661 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2662 loads.safe_push (node);
2663 }
2664 else
2665 {
2666 unsigned i;
2667 slp_tree child;
2668 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2669 vect_gather_slp_loads (loads, child, visited);
2670 }
2671 }
2672
2673
2674 /* Find the last scalar stmt in NODE. */
2675
2676 stmt_vec_info
2677 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2678 {
2679 stmt_vec_info last = NULL;
2680 stmt_vec_info stmt_vinfo;
2681
2682 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2683 {
2684 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2685 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2686 }
2687
2688 return last;
2689 }
2690
2691 /* Find the first stmt in NODE. */
2692
2693 stmt_vec_info
2694 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2695 {
2696 stmt_vec_info first = NULL;
2697 stmt_vec_info stmt_vinfo;
2698
2699 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2700 {
2701 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2702 if (!first
2703 || get_later_stmt (stmt_vinfo, first) == first)
2704 first = stmt_vinfo;
2705 }
2706
2707 return first;
2708 }
2709
2710 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2711 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2712 (also containing the first GROUP1_SIZE stmts, since stores are
2713 consecutive), the second containing the remainder.
2714 Return the first stmt in the second group. */
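/* For example, splitting an 8-store group with GROUP1_SIZE 5 produces a
   first group of 5 stores still headed by FIRST_VINFO and a second group
   of the remaining 3 stores headed by the returned stmt.  */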
2715
2716 static stmt_vec_info
2717 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2718 {
2719 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2720 gcc_assert (group1_size > 0);
2721 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2722 gcc_assert (group2_size > 0);
2723 DR_GROUP_SIZE (first_vinfo) = group1_size;
2724
2725 stmt_vec_info stmt_info = first_vinfo;
2726 for (unsigned i = group1_size; i > 1; i--)
2727 {
2728 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2729 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2730 }
2731 /* STMT_INFO is now the last element of the first group. */
2732 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2733 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2734
2735 DR_GROUP_SIZE (group2) = group2_size;
2736 for (stmt_info = group2; stmt_info;
2737 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2738 {
2739 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2740 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2741 }
2742
2743 /* For the second group, the DR_GROUP_GAP is that before the original group,
2744 plus skipping over the first vector. */
2745 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2746
2747 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2748 DR_GROUP_GAP (first_vinfo) += group2_size;
2749
2750 if (dump_enabled_p ())
2751 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2752 group1_size, group2_size);
2753
2754 return group2;
2755 }
2756
2757 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2758 statements and a vector of NUNITS elements. */
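/* For instance, a group of 6 stores with 4-element vectors gives
   common_multiple (4, 6) / 6 == 12 / 6 == 2, i.e. two copies of the
   group are needed to fill whole vectors.  */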
2759
2760 static poly_uint64
2761 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2762 {
2763 return exact_div (common_multiple (nunits, group_size), group_size);
2764 }
2765
2766 /* Helper that checks to see if a node is a load node. */
2767
2768 static inline bool
2769 vect_is_slp_load_node (slp_tree root)
2770 {
2771 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2772 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2773 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2774 }
2775
2776
2777 /* Helper function of optimize_load_redistribution that performs the operation
2778 recursively. */
2779
2780 static slp_tree
2781 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2782 vec_info *vinfo, unsigned int group_size,
2783 hash_map<slp_tree, slp_tree> *load_map,
2784 slp_tree root)
2785 {
2786 if (slp_tree *leader = load_map->get (root))
2787 return *leader;
2788
2789 slp_tree node;
2790 unsigned i;
2791
2792 /* For now, we don't know anything about externals so do not do anything. */
2793 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2794 return NULL;
2795 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2796 {
2797 /* First convert this node into a load node, add it to the leaves
2798 list and flatten the permute from a lane permutation to a load
2799 permutation. If it's unneeded it will be elided later. */
2800 vec<stmt_vec_info> stmts;
2801 stmts.create (SLP_TREE_LANES (root));
2802 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2803 for (unsigned j = 0; j < lane_perm.length (); j++)
2804 {
2805 std::pair<unsigned, unsigned> perm = lane_perm[j];
2806 node = SLP_TREE_CHILDREN (root)[perm.first];
2807
2808 if (!vect_is_slp_load_node (node)
2809 || SLP_TREE_CHILDREN (node).exists ())
2810 {
2811 stmts.release ();
2812 goto next;
2813 }
2814
2815 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2816 }
2817
2818 if (dump_enabled_p ())
2819 dump_printf_loc (MSG_NOTE, vect_location,
2820 "converting stmts on permute node %p\n", root);
2821
2822 bool *matches = XALLOCAVEC (bool, group_size);
2823 poly_uint64 max_nunits = 1;
2824 unsigned tree_size = 0, limit = 1;
2825 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2826 matches, &limit, &tree_size, bst_map);
2827 if (!node)
2828 stmts.release ();
2829
2830 load_map->put (root, node);
2831 return node;
2832 }
2833
2834 next:
2835 load_map->put (root, NULL);
2836
2837 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2838 {
2839 slp_tree value
2840 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2841 node);
2842 if (value)
2843 {
2844 SLP_TREE_REF_COUNT (value)++;
2845 SLP_TREE_CHILDREN (root)[i] = value;
2846 /* ??? We know the original leafs of the replaced nodes will
2847 be referenced by bst_map, only the permutes created by
2848 pattern matching are not. */
2849 if (SLP_TREE_REF_COUNT (node) == 1)
2850 load_map->remove (node);
2851 vect_free_slp_tree (node);
2852 }
2853 }
2854
2855 return NULL;
2856 }
2857
2858 /* Temporary workaround for loads not being CSEd during SLP build. This
2859 function will traverse the SLP tree rooted in ROOT and find
2860 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2861 same DR such that the final operation is equal to a permuted load. Such
2862 NODES are then directly converted into LOADS themselves. The nodes are
2863 CSEd using BST_MAP. */
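/* For instance, a VEC_PERM_EXPR node picking lane 1 of one load node and
   lane 0 of another load node that reads from the same DR group can be
   replaced by a single load node with load permutation { 1, 0 }.  */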
2864
2865 static void
2866 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2867 vec_info *vinfo, unsigned int group_size,
2868 hash_map<slp_tree, slp_tree> *load_map,
2869 slp_tree root)
2870 {
2871 slp_tree node;
2872 unsigned i;
2873
2874 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2875 {
2876 slp_tree value
2877 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2878 node);
2879 if (value)
2880 {
2881 SLP_TREE_REF_COUNT (value)++;
2882 SLP_TREE_CHILDREN (root)[i] = value;
2883 /* ??? We know the original leafs of the replaced nodes will
2884 be referenced by bst_map, only the permutes created by
2885 pattern matching are not. */
2886 if (SLP_TREE_REF_COUNT (node) == 1)
2887 load_map->remove (node);
2888 vect_free_slp_tree (node);
2889 }
2890 }
2891 }
2892
2893 /* Helper function of vect_match_slp_patterns.
2894
2895 Attempts to match patterns against the slp tree rooted in REF_NODE using
2896 VINFO. Patterns are matched in post-order traversal.
2897
2898 If matching is successful the value in REF_NODE is updated in place and
2899 true is returned, otherwise false is returned and REF_NODE is left unchanged. */
2900
2901 static bool
2902 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2903 slp_tree_to_load_perm_map_t *perm_cache,
2904 hash_set<slp_tree> *visited)
2905 {
2906 unsigned i;
2907 slp_tree node = *ref_node;
2908 bool found_p = false;
2909 if (!node || visited->add (node))
2910 return false;
2911
2912 slp_tree child;
2913 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2914 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2915 vinfo, perm_cache, visited);
2916
2917 for (unsigned x = 0; x < num__slp_patterns; x++)
2918 {
2919 vect_pattern *pattern = slp_patterns[x] (perm_cache, ref_node);
2920 if (pattern)
2921 {
2922 pattern->build (vinfo);
2923 delete pattern;
2924 found_p = true;
2925 }
2926 }
2927
2928 return found_p;
2929 }
2930
2931 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
2932 vec_info VINFO.
2933
2934 The tree is modified in place. Patterns are tried in order and multiple
2935 patterns may match; true is returned if any pattern matched. */
2936
2937 static bool
2938 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2939 hash_set<slp_tree> *visited,
2940 slp_tree_to_load_perm_map_t *perm_cache)
2941 {
2942 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2943 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2944
2945 if (dump_enabled_p ())
2946 dump_printf_loc (MSG_NOTE, vect_location,
2947 "Analyzing SLP tree %p for patterns\n",
2948 SLP_INSTANCE_TREE (instance));
2949
2950 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
2951 }
2952
2953 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
2954 splitting into two, with the first split group having size NEW_GROUP_SIZE.
2955 Return true if we could use IFN_STORE_LANES instead and if that appears
2956 to be the better approach. */
2957
2958 static bool
2959 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
2960 unsigned int group_size,
2961 unsigned int new_group_size)
2962 {
2963 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2964 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
2965 if (!vectype)
2966 return false;
2967 /* Allow the split if one of the two new groups would operate on full
2968 vectors *within* rather than across one scalar loop iteration.
2969 This is purely a heuristic, but it should work well for group
2970 sizes of 3 and 4, where the possible splits are:
2971
2972 3->2+1: OK if the vector has exactly two elements
2973 4->2+2: Likewise
2974 4->3+1: Less clear-cut. */
2975 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
2976 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
2977 return false;
2978 return vect_store_lanes_supported (vectype, group_size, false);
2979 }
2980
2981 /* Analyze an SLP instance starting from a group of grouped stores. Call
2982 vect_build_slp_tree to build a tree of packed stmts if possible.
2983 Return FALSE if it's impossible to SLP any stmt in the loop. */
2984
2985 static bool
2986 vect_analyze_slp_instance (vec_info *vinfo,
2987 scalar_stmts_to_slp_tree_map_t *bst_map,
2988 stmt_vec_info stmt_info, slp_instance_kind kind,
2989 unsigned max_tree_size, unsigned *limit);
2990
2991 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2992 of KIND. Return true if successful. */
2993
2994 static bool
2995 vect_build_slp_instance (vec_info *vinfo,
2996 slp_instance_kind kind,
2997 vec<stmt_vec_info> &scalar_stmts,
2998 vec<stmt_vec_info> &root_stmt_infos,
2999 unsigned max_tree_size, unsigned *limit,
3000 scalar_stmts_to_slp_tree_map_t *bst_map,
3001 /* ??? We need stmt_info for group splitting. */
3002 stmt_vec_info stmt_info_)
3003 {
3004 if (dump_enabled_p ())
3005 {
3006 dump_printf_loc (MSG_NOTE, vect_location,
3007 "Starting SLP discovery for\n");
3008 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3009 dump_printf_loc (MSG_NOTE, vect_location,
3010 " %G", scalar_stmts[i]->stmt);
3011 }
3012
3013 /* Build the tree for the SLP instance. */
3014 unsigned int group_size = scalar_stmts.length ();
3015 bool *matches = XALLOCAVEC (bool, group_size);
3016 poly_uint64 max_nunits = 1;
3017 unsigned tree_size = 0;
3018 unsigned i;
3019 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3020 &max_nunits, matches, limit,
3021 &tree_size, bst_map);
3022 if (node != NULL)
3023 {
3024 /* Calculate the unrolling factor based on the smallest type. */
3025 poly_uint64 unrolling_factor
3026 = calculate_unrolling_factor (max_nunits, group_size);
3027
3028 if (maybe_ne (unrolling_factor, 1U)
3029 && is_a <bb_vec_info> (vinfo))
3030 {
3031 unsigned HOST_WIDE_INT const_max_nunits;
3032 if (!max_nunits.is_constant (&const_max_nunits)
3033 || const_max_nunits > group_size)
3034 {
3035 if (dump_enabled_p ())
3036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3037 "Build SLP failed: store group "
3038 "size not a multiple of the vector size "
3039 "in basic block SLP\n");
3040 vect_free_slp_tree (node);
3041 return false;
3042 }
3043 /* Fatal mismatch. */
3044 if (dump_enabled_p ())
3045 dump_printf_loc (MSG_NOTE, vect_location,
3046 "SLP discovery succeeded but node needs "
3047 "splitting\n");
3048 memset (matches, true, group_size);
3049 matches[group_size / const_max_nunits * const_max_nunits] = false;
3050 vect_free_slp_tree (node);
3051 }
3052 else
3053 {
3054 /* Create a new SLP instance. */
3055 slp_instance new_instance = XNEW (class _slp_instance);
3056 SLP_INSTANCE_TREE (new_instance) = node;
3057 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3058 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3059 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3060 SLP_INSTANCE_KIND (new_instance) = kind;
3061 new_instance->reduc_phis = NULL;
3062 new_instance->cost_vec = vNULL;
3063 new_instance->subgraph_entries = vNULL;
3064
3065 if (dump_enabled_p ())
3066 dump_printf_loc (MSG_NOTE, vect_location,
3067 "SLP size %u vs. limit %u.\n",
3068 tree_size, max_tree_size);
3069
3070 /* Fixup SLP reduction chains. */
3071 if (kind == slp_inst_kind_reduc_chain)
3072 {
3073 /* If this is a reduction chain with a conversion in front
3074 amend the SLP tree with a node for that. */
3075 gimple *scalar_def
3076 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3077 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3078 {
3079 /* Get at the conversion stmt - we know it's the single use
3080 of the last stmt of the reduction chain. */
3081 use_operand_p use_p;
3082 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3083 &use_p, &scalar_def);
3084 gcc_assert (r);
3085 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3086 next_info = vect_stmt_to_vectorize (next_info);
3087 scalar_stmts = vNULL;
3088 scalar_stmts.create (group_size);
3089 for (unsigned i = 0; i < group_size; ++i)
3090 scalar_stmts.quick_push (next_info);
3091 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3092 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3093 SLP_TREE_CHILDREN (conv).quick_push (node);
3094 SLP_INSTANCE_TREE (new_instance) = conv;
3095 /* We also have to fake this conversion stmt as SLP reduction
3096 group so we don't have to mess with too much code
3097 elsewhere. */
3098 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3099 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3100 }
3101 /* Fill the backedge child of the PHI SLP node. The
3102 general matching code cannot find it because the
3103 scalar code does not reflect how we vectorize the
3104 reduction. */
3105 use_operand_p use_p;
3106 imm_use_iterator imm_iter;
3107 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3108 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3109 gimple_get_lhs (scalar_def))
3110 /* There are exactly two non-debug uses, the reduction
3111 PHI and the loop-closed PHI node. */
3112 if (!is_gimple_debug (USE_STMT (use_p))
3113 && gimple_bb (USE_STMT (use_p)) == loop->header)
3114 {
3115 auto_vec<stmt_vec_info, 64> phis (group_size);
3116 stmt_vec_info phi_info
3117 = vinfo->lookup_stmt (USE_STMT (use_p));
3118 for (unsigned i = 0; i < group_size; ++i)
3119 phis.quick_push (phi_info);
3120 slp_tree *phi_node = bst_map->get (phis);
3121 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3122 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3123 = SLP_INSTANCE_TREE (new_instance);
3124 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3125 }
3126 }
3127
3128 vinfo->slp_instances.safe_push (new_instance);
3129
3130 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3131 the number of scalar stmts in the root in a few places.
3132 Verify that assumption holds. */
3133 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3134 .length () == group_size);
3135
3136 if (dump_enabled_p ())
3137 {
3138 dump_printf_loc (MSG_NOTE, vect_location,
3139 "Final SLP tree for instance %p:\n", new_instance);
3140 vect_print_slp_graph (MSG_NOTE, vect_location,
3141 SLP_INSTANCE_TREE (new_instance));
3142 }
3143
3144 return true;
3145 }
3146 }
3147 else
3148 {
3149 /* Failed to SLP. */
3150 /* Free the allocated memory. */
3151 scalar_stmts.release ();
3152 }
3153
3154 stmt_vec_info stmt_info = stmt_info_;
3155 /* Try to break the group up into pieces. */
3156 if (kind == slp_inst_kind_store)
3157 {
3158 /* ??? We could delay all the actual splitting of store-groups
3159 until after SLP discovery of the original group completed.
3160 Then we can recurse to vect_build_slp_instance directly. */
3161 for (i = 0; i < group_size; i++)
3162 if (!matches[i])
3163 break;
3164
3165 /* For basic block SLP, try to break the group up into multiples of
3166 a vector size. */
3167 if (is_a <bb_vec_info> (vinfo)
3168 && (i > 1 && i < group_size))
3169 {
3170 tree scalar_type
3171 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3172 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3173 1 << floor_log2 (i));
3174 unsigned HOST_WIDE_INT const_nunits;
3175 if (vectype
3176 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3177 {
3178 /* Split into two groups at the first vector boundary. */
3179 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3180 unsigned group1_size = i & ~(const_nunits - 1);
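/* E.g. with the first mismatch at i == 6 and 4-element vectors the
   group is split after the first 4 stmts (6 & ~3 == 4).  */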
3181
3182 if (dump_enabled_p ())
3183 dump_printf_loc (MSG_NOTE, vect_location,
3184 "Splitting SLP group at stmt %u\n", i);
3185 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3186 group1_size);
3187 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3188 kind, max_tree_size,
3189 limit);
3190 /* Split the rest at the failure point and possibly
3191 re-analyze the remaining matching part if it has
3192 at least two lanes. */
3193 if (group1_size < i
3194 && (i + 1 < group_size
3195 || i - group1_size > 1))
3196 {
3197 stmt_vec_info rest2 = rest;
3198 rest = vect_split_slp_store_group (rest, i - group1_size);
3199 if (i - group1_size > 1)
3200 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3201 kind, max_tree_size,
3202 limit);
3203 }
3204 /* Re-analyze the non-matching tail if it has at least
3205 two lanes. */
3206 if (i + 1 < group_size)
3207 res |= vect_analyze_slp_instance (vinfo, bst_map,
3208 rest, kind, max_tree_size,
3209 limit);
3210 return res;
3211 }
3212 }
3213
3214 /* For loop vectorization split into arbitrary pieces of size > 1. */
3215 if (is_a <loop_vec_info> (vinfo)
3216 && (i > 1 && i < group_size)
3217 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3218 {
3219 unsigned group1_size = i;
3220
3221 if (dump_enabled_p ())
3222 dump_printf_loc (MSG_NOTE, vect_location,
3223 "Splitting SLP group at stmt %u\n", i);
3224
3225 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3226 group1_size);
3227 /* Loop vectorization cannot handle gaps in stores, make sure
3228 the split group appears as strided. */
3229 STMT_VINFO_STRIDED_P (rest) = 1;
3230 DR_GROUP_GAP (rest) = 0;
3231 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3232 DR_GROUP_GAP (stmt_info) = 0;
3233
3234 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3235 kind, max_tree_size, limit);
3236 if (i + 1 < group_size)
3237 res |= vect_analyze_slp_instance (vinfo, bst_map,
3238 rest, kind, max_tree_size, limit);
3239
3240 return res;
3241 }
3242
3243 /* Even though the first vector did not all match, we might be able to SLP
3244 (some) of the remainder. FORNOW ignore this possibility. */
3245 }
3246
3247 /* Failed to SLP. */
3248 if (dump_enabled_p ())
3249 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3250 return false;
3251 }
3252
3253
3254 /* Analyze an SLP instance starting from a group of grouped stores. Call
3255 vect_build_slp_tree to build a tree of packed stmts if possible.
3256 Return FALSE if it's impossible to SLP any stmt in the loop. */
3257
3258 static bool
3259 vect_analyze_slp_instance (vec_info *vinfo,
3260 scalar_stmts_to_slp_tree_map_t *bst_map,
3261 stmt_vec_info stmt_info,
3262 slp_instance_kind kind,
3263 unsigned max_tree_size, unsigned *limit)
3264 {
3265 unsigned int i;
3266 vec<stmt_vec_info> scalar_stmts;
3267
3268 if (is_a <bb_vec_info> (vinfo))
3269 vect_location = stmt_info->stmt;
3270
3271 stmt_vec_info next_info = stmt_info;
3272 if (kind == slp_inst_kind_store)
3273 {
3274 /* Collect the stores and store them in scalar_stmts. */
3275 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3276 while (next_info)
3277 {
3278 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3279 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3280 }
3281 }
3282 else if (kind == slp_inst_kind_reduc_chain)
3283 {
3284 /* Collect the reduction stmts and store them in scalar_stmts. */
3285 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3286 while (next_info)
3287 {
3288 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3289 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3290 }
3291 /* Mark the first element of the reduction chain as reduction to properly
3292 transform the node. In the reduction analysis phase only the last
3293 element of the chain is marked as reduction. */
3294 STMT_VINFO_DEF_TYPE (stmt_info)
3295 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3296 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3297 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3298 }
3299 else if (kind == slp_inst_kind_ctor)
3300 {
3301 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
3302 tree val;
3303 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
3304 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
3305 {
3306 stmt_vec_info def_info = vinfo->lookup_def (val);
3307 def_info = vect_stmt_to_vectorize (def_info);
3308 scalar_stmts.quick_push (def_info);
3309 }
3310 if (dump_enabled_p ())
3311 dump_printf_loc (MSG_NOTE, vect_location,
3312 "Analyzing vectorizable constructor: %G\n",
3313 stmt_info->stmt);
3314 }
3315 else if (kind == slp_inst_kind_reduc_group)
3316 {
3317 /* Collect reduction statements. */
3318 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
3319 scalar_stmts.create (reductions.length ());
3320 for (i = 0; reductions.iterate (i, &next_info); i++)
3321 if (STMT_VINFO_RELEVANT_P (next_info)
3322 || STMT_VINFO_LIVE_P (next_info))
3323 scalar_stmts.quick_push (next_info);
3324 /* If less than two were relevant/live there's nothing to SLP. */
3325 if (scalar_stmts.length () < 2)
3326 return false;
3327 }
3328 else
3329 gcc_unreachable ();
3330
3331 vec<stmt_vec_info> roots = vNULL;
3332 if (kind == slp_inst_kind_ctor)
3333 {
3334 roots.create (1);
3335 roots.quick_push (stmt_info);
3336 }
3337 /* Build the tree for the SLP instance. */
3338 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3339 roots,
3340 max_tree_size, limit, bst_map,
3341 kind == slp_inst_kind_store
3342 ? stmt_info : NULL);
3343 if (!res)
3344 roots.release ();
3345
3346 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3347 where we should do store group splitting. */
3348
3349 return res;
3350 }
3351
3352 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
3353 trees of packed scalar stmts if SLP is possible. */
3354
3355 opt_result
3356 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3357 {
3358 unsigned int i;
3359 stmt_vec_info first_element;
3360 slp_instance instance;
3361
3362 DUMP_VECT_SCOPE ("vect_analyze_slp");
3363
3364 unsigned limit = max_tree_size;
3365
3366 scalar_stmts_to_slp_tree_map_t *bst_map
3367 = new scalar_stmts_to_slp_tree_map_t ();
3368
3369 /* Find SLP sequences starting from groups of grouped stores. */
3370 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3371 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3372 STMT_VINFO_GROUPED_ACCESS (first_element)
3373 ? slp_inst_kind_store : slp_inst_kind_ctor,
3374 max_tree_size, &limit);
3375
3376 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3377 {
3378 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3379 {
3380 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3381 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3382 bb_vinfo->roots[i].stmts,
3383 bb_vinfo->roots[i].roots,
3384 max_tree_size, &limit, bst_map, NULL))
3385 {
3386 bb_vinfo->roots[i].stmts = vNULL;
3387 bb_vinfo->roots[i].roots = vNULL;
3388 }
3389 }
3390 }
3391
3392 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3393 {
3394 /* Find SLP sequences starting from reduction chains. */
3395 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3396 if (! STMT_VINFO_RELEVANT_P (first_element)
3397 && ! STMT_VINFO_LIVE_P (first_element))
3398 ;
3399 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3400 slp_inst_kind_reduc_chain,
3401 max_tree_size, &limit))
3402 {
3403 /* Dissolve reduction chain group. */
3404 stmt_vec_info vinfo = first_element;
3405 stmt_vec_info last = NULL;
3406 while (vinfo)
3407 {
3408 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3409 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3410 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3411 last = vinfo;
3412 vinfo = next;
3413 }
3414 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3415 /* It can still be vectorized as part of an SLP reduction. */
3416 loop_vinfo->reductions.safe_push (last);
3417 }
3418
3419 /* Find SLP sequences starting from groups of reductions. */
3420 if (loop_vinfo->reductions.length () > 1)
3421 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3422 slp_inst_kind_reduc_group, max_tree_size,
3423 &limit);
3424 }
3425
3426 hash_set<slp_tree> visited_patterns;
3427 slp_tree_to_load_perm_map_t perm_cache;
3428
3429 /* See if any patterns can be found in the SLP tree. */
3430 bool pattern_found = false;
3431 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3432 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3433 &visited_patterns, &perm_cache);
3434
3435 /* If any were found optimize permutations of loads. */
3436 if (pattern_found)
3437 {
3438 hash_map<slp_tree, slp_tree> load_map;
3439 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3440 {
3441 slp_tree root = SLP_INSTANCE_TREE (instance);
3442 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3443 &load_map, root);
3444 }
3445 }
3446
3447
3448
3449 /* The map keeps a reference on SLP nodes built, release that. */
3450 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3451 it != bst_map->end (); ++it)
3452 if ((*it).second)
3453 vect_free_slp_tree ((*it).second);
3454 delete bst_map;
3455
3456 if (pattern_found && dump_enabled_p ())
3457 {
3458 dump_printf_loc (MSG_NOTE, vect_location,
3459 "Pattern matched SLP tree\n");
3460 hash_set<slp_tree> visited;
3461 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3462 vect_print_slp_graph (MSG_NOTE, vect_location,
3463 SLP_INSTANCE_TREE (instance), visited);
3464 }
3465
3466 return opt_result::success ();
3467 }
3468
3469 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3470
3471 static void
3472 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
3473 vec<slp_tree> &vertices, vec<int> &leafs)
3474 {
3475 unsigned i;
3476 slp_tree child;
3477
3478 if (visited.add (node))
3479 return;
3480
3481 node->vertex = vertices.length ();
3482 vertices.safe_push (node);
3483
3484 bool leaf = true;
3485 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3486 if (child)
3487 {
3488 leaf = false;
3489 vect_slp_build_vertices (visited, child, vertices, leafs);
3490 }
3491 if (leaf)
3492 leafs.safe_push (node->vertex);
3493 }
3494
3495 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3496
3497 static void
3498 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
3499 vec<int> &leafs)
3500 {
3501 hash_set<slp_tree> visited;
3502 unsigned i;
3503 slp_instance instance;
3504 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
3505 {
3506 unsigned n_v = vertices.length ();
3507 unsigned n_l = leafs.length ();
3508 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
3509 leafs);
3510 /* If we added vertices but no entries to the reverse graph we've
3511 added a cycle that is not backwards-reachable. Push the entry
3512 to act as a leaf then. */
3513 if (vertices.length () > n_v
3514 && leafs.length () == n_l)
3515 leafs.safe_push (SLP_INSTANCE_TREE (instance)->vertex);
3516 }
3517 }
3518
3519 /* Apply (reverse) bijective PERM to VEC. */
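/* For example, with PERM = {2, 0, 1} the forward direction turns
   {a, b, c} into {c, a, b} while the reverse direction turns it into
   {b, c, a}; the two directions are inverses of each other. */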
3520
3521 template <class T>
3522 static void
3523 vect_slp_permute (vec<unsigned> perm,
3524 vec<T> &vec, bool reverse)
3525 {
3526 auto_vec<T, 64> saved;
3527 saved.create (vec.length ());
3528 for (unsigned i = 0; i < vec.length (); ++i)
3529 saved.quick_push (vec[i]);
3530
3531 if (reverse)
3532 {
3533 for (unsigned i = 0; i < vec.length (); ++i)
3534 vec[perm[i]] = saved[i];
3535 for (unsigned i = 0; i < vec.length (); ++i)
3536 gcc_assert (vec[perm[i]] == saved[i]);
3537 }
3538 else
3539 {
3540 for (unsigned i = 0; i < vec.length (); ++i)
3541 vec[i] = saved[perm[i]];
3542 for (unsigned i = 0; i < vec.length (); ++i)
3543 gcc_assert (vec[i] == saved[perm[i]]);
3544 }
3545 }
3546
3547 /* Return whether permutations PERM_A and PERM_B as recorded in the
3548 PERMS vector are equal. */
3549
3550 static bool
3551 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
3552 int perm_a, int perm_b)
3553 {
3554 return (perm_a == perm_b
3555 || (perms[perm_a].length () == perms[perm_b].length ()
3556 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
3557 sizeof (unsigned) * perms[perm_a].length ()) == 0));
3558 }
3559
3560 /* Optimize the SLP graph of VINFO. */
3561
3562 void
3563 vect_optimize_slp (vec_info *vinfo)
3564 {
3565 if (vinfo->slp_instances.is_empty ())
3566 return;
3567
3568 slp_tree node;
3569 unsigned i;
3570 auto_vec<slp_tree> vertices;
3571 auto_vec<int> leafs;
3572 vect_slp_build_vertices (vinfo, vertices, leafs);
3573
3574 struct graph *slpg = new_graph (vertices.length ());
3575 FOR_EACH_VEC_ELT (vertices, i, node)
3576 {
3577 unsigned j;
3578 slp_tree child;
3579 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3580 if (child)
3581 add_edge (slpg, i, child->vertex);
3582 }
3583
3584 /* Compute (reverse) postorder on the inverted graph. */
3585 auto_vec<int> ipo;
3586 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3587
3588 auto_sbitmap n_visited (vertices.length ());
3589 auto_sbitmap n_materialize (vertices.length ());
3590 auto_vec<int> n_perm (vertices.length ());
3591 auto_vec<vec<unsigned> > perms;
3592
3593 bitmap_clear (n_visited);
3594 bitmap_clear (n_materialize);
3595 n_perm.quick_grow_cleared (vertices.length ());
3596 perms.safe_push (vNULL); /* zero is no permute */
3597
3598 /* Produce initial permutations. */
3599 for (i = 0; i < leafs.length (); ++i)
3600 {
3601 int idx = leafs[i];
3602 slp_tree node = vertices[idx];
3603
3604 /* Handle externals and constants optimistically throughout the
3605 iteration. */
3606 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3607 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3608 continue;
3609
3610 /* Leafs do not change across iterations. Note leafs also double
3611 as entries to the reverse graph. */
3612 if (!slpg->vertices[idx].succ)
3613 bitmap_set_bit (n_visited, idx);
3614 /* Loads are the only thing generating permutes. */
3615 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3616 continue;
3617
3618 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3619 node unpermuted, record this permute. */
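/* For example, a load permutation {3, 2, 1, 0} covering a whole group
   of size four is recorded below as the bijective permute {3, 2, 1, 0},
   whereas a permutation like {0, 2} on a two-lane node spans three
   group elements and is left alone so the VF computation is not
   disturbed. */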
3620 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3621 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3622 continue;
3623 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3624 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3625 bool any_permute = false;
3626 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3627 {
3628 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3629 imin = MIN (imin, idx);
3630 imax = MAX (imax, idx);
3631 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3632 any_permute = true;
3633 }
3634 /* If there's no permute no need to split one out. */
3635 if (!any_permute)
3636 continue;
3637 /* If the span doesn't match we'd disrupt VF computation, avoid
3638 that for now. */
3639 if (imax - imin + 1 != SLP_TREE_LANES (node))
3640 continue;
3641
3642 /* For now only handle true permutes, like
3643 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3644 when permuting constants and invariants keeping the permute
3645 bijective. */
3646 auto_sbitmap load_index (SLP_TREE_LANES (node));
3647 bitmap_clear (load_index);
3648 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3649 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3650 unsigned j;
3651 for (j = 0; j < SLP_TREE_LANES (node); ++j)
3652 if (!bitmap_bit_p (load_index, j))
3653 break;
3654 if (j != SLP_TREE_LANES (node))
3655 continue;
3656
3657 vec<unsigned> perm = vNULL;
3658 perm.safe_grow (SLP_TREE_LANES (node), true);
3659 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3660 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
3661 perms.safe_push (perm);
3662 n_perm[idx] = perms.length () - 1;
3663 }
3664
3665 /* Propagate permutes along the graph and compute materialization points. */
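/* Note the encoding used below: n_perm[] values index into PERMS where
   entry zero means "no permute" (also used when successors disagree),
   while the local value -1 stands for "no visited successor seen yet". */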
3666 bool changed;
3667 unsigned iteration = 0;
3668 do
3669 {
3670 changed = false;
3671 ++iteration;
3672
3673 for (i = vertices.length (); i > 0 ; --i)
3674 {
3675 int idx = ipo[i-1];
3676 slp_tree node = vertices[idx];
3677 /* For leafs there's nothing to do - we've seeded permutes
3678 on those above. */
3679 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3680 continue;
3681
3682 bitmap_set_bit (n_visited, idx);
3683
3684 /* We cannot move a permute across a store. */
3685 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
3686 && DR_IS_WRITE
3687 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
3688 continue;
3689
3690 int perm = -1;
3691 for (graph_edge *succ = slpg->vertices[idx].succ;
3692 succ; succ = succ->succ_next)
3693 {
3694 int succ_idx = succ->dest;
3695 /* Handle unvisited nodes optimistically. */
3696 /* ??? But for constants once we want to handle non-bijective
3697 permutes we have to verify the permute, when unifying lanes,
3698 will not unify different constants. For example see
3699 gcc.dg/vect/bb-slp-14.c for a case that would break. */
3700 if (!bitmap_bit_p (n_visited, succ_idx))
3701 continue;
3702 int succ_perm = n_perm[succ_idx];
3703 /* Once we materialize succ's permutation its output lanes
3704 appear unpermuted to us. */
3705 if (bitmap_bit_p (n_materialize, succ_idx))
3706 succ_perm = 0;
3707 if (perm == -1)
3708 perm = succ_perm;
3709 else if (succ_perm == 0)
3710 {
3711 perm = 0;
3712 break;
3713 }
3714 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
3715 {
3716 perm = 0;
3717 break;
3718 }
3719 }
3720
3721 if (perm == -1)
3722 /* Pick up pre-computed leaf values. */
3723 perm = n_perm[idx];
3724 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
3725 {
3726 if (iteration > 1)
3727 /* Make sure we eventually converge. */
3728 gcc_checking_assert (perm == 0);
3729 n_perm[idx] = perm;
3730 if (perm == 0)
3731 bitmap_clear_bit (n_materialize, idx);
3732 changed = true;
3733 }
3734
3735 if (perm == 0)
3736 continue;
3737
3738 /* Elide pruning at materialization points in the first
3739 iteration so every node is visited at least once. */
3740 if (iteration == 1)
3741 continue;
3742
3743 /* Decide on permute materialization. Look whether there's
3744 a use (pred) edge that is permuted differently than us.
3745 In that case mark ourselves so the permutation is applied.
3746 For VEC_PERM_EXPRs the permutation doesn't carry along
3747 from children to parents so force materialization at the
3748 point of the VEC_PERM_EXPR. In principle VEC_PERM_EXPRs
3749 are a source of an arbitrary permutation again, similar
3750 to constants/externals - that's something we do not yet
3751 optimally handle. */
3752 bool all_preds_permuted = (SLP_TREE_CODE (node) != VEC_PERM_EXPR
3753 && slpg->vertices[idx].pred != NULL);
3754 if (all_preds_permuted)
3755 for (graph_edge *pred = slpg->vertices[idx].pred;
3756 pred; pred = pred->pred_next)
3757 {
3758 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
3759 int pred_perm = n_perm[pred->src];
3760 if (!vect_slp_perms_eq (perms, perm, pred_perm))
3761 {
3762 all_preds_permuted = false;
3763 break;
3764 }
3765 }
3766 if (!all_preds_permuted)
3767 {
3768 if (!bitmap_bit_p (n_materialize, idx))
3769 changed = true;
3770 bitmap_set_bit (n_materialize, idx);
3771 }
3772 }
3773 }
3774 while (changed || iteration == 1);
3775
3776 /* Materialize. */
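/* Roughly, for each node carrying a non-trivial permute we either rely
   on an existing load permutation (which already produces the permuted
   order), fold the permute into an existing VEC_PERM lane selection, or
   wrap the node in a new VEC_PERM copy; nodes that are not
   materialization points instead get their scalar stmts (and any load
   permutation) reverse-permuted.  Invariant and external children have
   their scalar operands permuted up front in all cases. */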
3777 for (i = 0; i < vertices.length (); ++i)
3778 {
3779 int perm = n_perm[i];
3780 if (perm <= 0)
3781 continue;
3782
3783 slp_tree node = vertices[i];
3784
3785 /* First permute invariant/external original successors. */
3786 unsigned j;
3787 slp_tree child;
3788 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3789 {
3790 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3791 continue;
3792
3793 /* If the vector is uniform there's nothing to do. */
3794 if (vect_slp_tree_uniform_p (child))
3795 continue;
3796
3797 /* We can end up sharing some externals via two_operator
3798 handling. Be prepared to unshare those. */
3799 if (child->refcnt != 1)
3800 {
3801 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3802 SLP_TREE_CHILDREN (node)[j] = child
3803 = vect_create_new_slp_node
3804 (SLP_TREE_SCALAR_OPS (child).copy ());
3805 }
3806 vect_slp_permute (perms[perm],
3807 SLP_TREE_SCALAR_OPS (child), true);
3808 }
3809
3810 if (bitmap_bit_p (n_materialize, i))
3811 {
3812 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3813 /* For loads simply drop the permutation, the load permutation
3814 already performs the desired permutation. */
3815 ;
3816 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3817 {
3818 /* If the node is already a permute node we can apply
3819 the permutation to the lane selection, effectively
3820 materializing it on the incoming vectors. */
3821 if (dump_enabled_p ())
3822 dump_printf_loc (MSG_NOTE, vect_location,
3823 "simplifying permute node %p\n",
3824 node);
3825
3826 for (unsigned k = 0;
3827 k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3828 SLP_TREE_LANE_PERMUTATION (node)[k].second
3829 = perms[perm][SLP_TREE_LANE_PERMUTATION (node)[k].second];
3830 }
3831 else
3832 {
3833 if (dump_enabled_p ())
3834 dump_printf_loc (MSG_NOTE, vect_location,
3835 "inserting permute node in place of %p\n",
3836 node);
3837
3838 /* Make a copy of NODE and in-place change it to a
3839 VEC_PERM node to permute the lanes of the copy. */
3840 slp_tree copy = new _slp_tree;
3841 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3842 SLP_TREE_CHILDREN (node) = vNULL;
3843 SLP_TREE_SCALAR_STMTS (copy)
3844 = SLP_TREE_SCALAR_STMTS (node).copy ();
3845 vect_slp_permute (perms[perm],
3846 SLP_TREE_SCALAR_STMTS (copy), true);
3847 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3848 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3849 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3850 SLP_TREE_LANE_PERMUTATION (copy)
3851 = SLP_TREE_LANE_PERMUTATION (node);
3852 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3853 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3854 copy->refcnt = 1;
3855 copy->max_nunits = node->max_nunits;
3856 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3857 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3858 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3859
3860 /* Now turn NODE into a VEC_PERM. */
3861 SLP_TREE_CHILDREN (node).safe_push (copy);
3862 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3863 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3864 SLP_TREE_LANE_PERMUTATION (node)
3865 .quick_push (std::make_pair (0, perms[perm][j]));
3866 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3867 }
3868 }
3869 else
3870 {
3871 /* Apply the reverse permutation to our stmts. */
3872 vect_slp_permute (perms[perm],
3873 SLP_TREE_SCALAR_STMTS (node), true);
3874 /* And to the load permutation, which we can simply
3875 make regular by design. */
3876 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3877 {
3878 /* ??? When we handle non-bijective permutes the idea
3879 is that we can force the load-permutation to be
3880 { min, min + 1, min + 2, ... max }. But then the
3881 scalar defs might no longer match the lane content
3882 which means wrong-code with live lane vectorization.
3883 So we possibly have to have NULL entries for those. */
3884 vect_slp_permute (perms[perm],
3885 SLP_TREE_LOAD_PERMUTATION (node), true);
3886 }
3887 }
3888 }
3889
3890 /* Free the perms vector used for propagation. */
3891 while (!perms.is_empty ())
3892 perms.pop ().release ();
3893 free_graph (slpg);
3894
3895
3896 /* Now elide load permutations that are not necessary. */
3897 for (i = 0; i < leafs.length (); ++i)
3898 {
3899 node = vertices[leafs[i]];
3900 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3901 continue;
3902
3903 /* In basic block vectorization we allow any subchain of an interleaving
3904 chain.
3905 FORNOW: not in loop SLP because of realignment complications. */
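/* For example, a node loading just a[1] and a[2] out of an interleaving
   group a[0] ... a[3] forms such a subchain and needs no explicit
   permutation. */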
3906 if (is_a <bb_vec_info> (vinfo))
3907 {
3908 bool subchain_p = true;
3909 stmt_vec_info next_load_info = NULL;
3910 stmt_vec_info load_info;
3911 unsigned j;
3912 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3913 {
3914 if (j != 0
3915 && (next_load_info != load_info
3916 || DR_GROUP_GAP (load_info) != 1))
3917 {
3918 subchain_p = false;
3919 break;
3920 }
3921 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3922 }
3923 if (subchain_p)
3924 {
3925 SLP_TREE_LOAD_PERMUTATION (node).release ();
3926 continue;
3927 }
3928 }
3929 else
3930 {
3931 stmt_vec_info load_info;
3932 bool this_load_permuted = false;
3933 unsigned j;
3934 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3935 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3936 {
3937 this_load_permuted = true;
3938 break;
3939 }
3940 stmt_vec_info first_stmt_info
3941 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3942 if (!this_load_permuted
3943 /* The load requires permutation when unrolling exposes
3944 a gap either because the group is larger than the SLP
3945 group-size or because there is a gap between the groups. */
3946 && (known_eq (LOOP_VINFO_VECT_FACTOR
3947 (as_a <loop_vec_info> (vinfo)), 1U)
3948 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3949 && DR_GROUP_GAP (first_stmt_info) == 0)))
3950 {
3951 SLP_TREE_LOAD_PERMUTATION (node).release ();
3952 continue;
3953 }
3954 }
3955 }
3956
3957 /* And elide any permutations of BB reductions. */
3958 if (is_a <bb_vec_info> (vinfo))
3959 {
3960 for (slp_instance instance : vinfo->slp_instances)
3961 {
3962 if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
3963 continue;
3964 slp_tree old = SLP_INSTANCE_TREE (instance);
3965 if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
3966 && SLP_TREE_CHILDREN (old).length () == 1)
3967 {
3968 slp_tree child = SLP_TREE_CHILDREN (old)[0];
3969 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
3970 {
3971 /* Preserve the special VEC_PERM we use to shield existing
3972 vector defs from the rest. But make it a no-op. */
3973 unsigned i = 0;
3974 for (std::pair<unsigned, unsigned> &p
3975 : SLP_TREE_LANE_PERMUTATION (old))
3976 p.second = i++;
3977 }
3978 else
3979 {
3980 SLP_INSTANCE_TREE (instance) = child;
3981 SLP_TREE_REF_COUNT (child)++;
3982 vect_free_slp_tree (old);
3983 }
3984 }
3985 else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
3986 && SLP_TREE_REF_COUNT (old) == 1)
3987 {
3988 /* ??? For loads the situation is more complex since
3989 we can't modify the permute in place in case the
3990 node is used multiple times. In fact for loads this
3991 should be somehow handled in the propagation engine. */
3992 auto fn = [] (const void *a, const void *b)
3993 { return *(const int *)a - *(const int *)b; };
3994 SLP_TREE_LOAD_PERMUTATION (old).qsort (fn);
3995 }
3996 }
3997 }
3998 }
3999
4000 /* Gather loads reachable from the individual SLP graph entries. */
4001
4002 void
4003 vect_gather_slp_loads (vec_info *vinfo)
4004 {
4005 unsigned i;
4006 slp_instance instance;
4007 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
4008 {
4009 hash_set<slp_tree> visited;
4010 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
4011 SLP_INSTANCE_TREE (instance), visited);
4012 }
4013 }
4014
4015
4016 /* For each possible SLP instance decide whether to SLP it and calculate overall
4017 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
4018 least one instance. */
4019
4020 bool
4021 vect_make_slp_decision (loop_vec_info loop_vinfo)
4022 {
4023 unsigned int i;
4024 poly_uint64 unrolling_factor = 1;
4025 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
4026 slp_instance instance;
4027 int decided_to_slp = 0;
4028
4029 DUMP_VECT_SCOPE ("vect_make_slp_decision");
4030
4031 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4032 {
4033 /* FORNOW: SLP if you can. */
4034 /* All unroll factors have the form:
4035
4036 GET_MODE_SIZE (vinfo->vector_mode) * X
4037
4038 for some rational X, so they must have a common multiple. */
4039 unrolling_factor
4040 = force_common_multiple (unrolling_factor,
4041 SLP_INSTANCE_UNROLLING_FACTOR (instance));
4042
4043 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
4044 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
4045 loop-based vectorization. Such stmts will be marked as HYBRID. */
4046 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4047 decided_to_slp++;
4048 }
4049
4050 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
4051
4052 if (decided_to_slp && dump_enabled_p ())
4053 {
4054 dump_printf_loc (MSG_NOTE, vect_location,
4055 "Decided to SLP %d instances. Unrolling factor ",
4056 decided_to_slp);
4057 dump_dec (MSG_NOTE, unrolling_factor);
4058 dump_printf (MSG_NOTE, "\n");
4059 }
4060
4061 return (decided_to_slp > 0);
4062 }
4063
4064 /* Private data for vect_detect_hybrid_slp. */
4065 struct vdhs_data
4066 {
4067 loop_vec_info loop_vinfo;
4068 vec<stmt_vec_info> *worklist;
4069 };
4070
4071 /* Walker for walk_gimple_op. */
4072
4073 static tree
4074 vect_detect_hybrid_slp (tree *tp, int *, void *data)
4075 {
4076 walk_stmt_info *wi = (walk_stmt_info *)data;
4077 vdhs_data *dat = (vdhs_data *)wi->info;
4078
4079 if (wi->is_lhs)
4080 return NULL_TREE;
4081
4082 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
4083 if (!def_stmt_info)
4084 return NULL_TREE;
4085 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
4086 if (PURE_SLP_STMT (def_stmt_info))
4087 {
4088 if (dump_enabled_p ())
4089 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
4090 def_stmt_info->stmt);
4091 STMT_SLP_TYPE (def_stmt_info) = hybrid;
4092 dat->worklist->safe_push (def_stmt_info);
4093 }
4094
4095 return NULL_TREE;
4096 }
4097
4098 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it
4099 pure_slp if so, otherwise push it to WORKLIST. */
4100
4101 static void
4102 maybe_push_to_hybrid_worklist (vec_info *vinfo,
4103 vec<stmt_vec_info> &worklist,
4104 stmt_vec_info stmt_info)
4105 {
4106 if (dump_enabled_p ())
4107 dump_printf_loc (MSG_NOTE, vect_location,
4108 "Processing hybrid candidate : %G", stmt_info->stmt);
4109 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
4110 imm_use_iterator iter2;
4111 ssa_op_iter iter1;
4112 use_operand_p use_p;
4113 def_operand_p def_p;
4114 bool any_def = false;
4115 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
4116 {
4117 any_def = true;
4118 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
4119 {
4120 if (is_gimple_debug (USE_STMT (use_p)))
4121 continue;
4122 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
4123 /* An out-of-loop use means this is a loop_vect sink. */
4124 if (!use_info)
4125 {
4126 if (dump_enabled_p ())
4127 dump_printf_loc (MSG_NOTE, vect_location,
4128 "Found loop_vect sink: %G", stmt_info->stmt);
4129 worklist.safe_push (stmt_info);
4130 return;
4131 }
4132 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
4133 {
4134 if (dump_enabled_p ())
4135 dump_printf_loc (MSG_NOTE, vect_location,
4136 "Found loop_vect use: %G", use_info->stmt);
4137 worklist.safe_push (stmt_info);
4138 return;
4139 }
4140 }
4141 }
4142 /* No def means this is a loop_vect sink. */
4143 if (!any_def)
4144 {
4145 if (dump_enabled_p ())
4146 dump_printf_loc (MSG_NOTE, vect_location,
4147 "Found loop_vect sink: %G", stmt_info->stmt);
4148 worklist.safe_push (stmt_info);
4149 return;
4150 }
4151 if (dump_enabled_p ())
4152 dump_printf_loc (MSG_NOTE, vect_location,
4153 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
4154 STMT_SLP_TYPE (stmt_info) = pure_slp;
4155 }
4156
4157 /* Find stmts that must be both vectorized and SLPed. */
4158
4159 void
4160 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
4161 {
4162 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
4163
4164 /* All stmts participating in SLP are marked pure_slp, all other
4165 stmts are loop_vect.
4166 First collect all loop_vect stmts into a worklist.
4167 SLP patterns cause not all original scalar stmts to appear in
4168 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
4169 Rectify this here and do a backward walk over the IL, only considering
4170 stmts as loop_vect when they are used by a loop_vect stmt, and otherwise
4171 mark them as pure_slp. */
4172 auto_vec<stmt_vec_info> worklist;
4173 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
4174 {
4175 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
4176 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
4177 gsi_next (&gsi))
4178 {
4179 gphi *phi = gsi.phi ();
4180 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
4181 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4182 maybe_push_to_hybrid_worklist (loop_vinfo,
4183 worklist, stmt_info);
4184 }
4185 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
4186 gsi_prev (&gsi))
4187 {
4188 gimple *stmt = gsi_stmt (gsi);
4189 if (is_gimple_debug (stmt))
4190 continue;
4191 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
4192 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
4193 {
4194 for (gimple_stmt_iterator gsi2
4195 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
4196 !gsi_end_p (gsi2); gsi_next (&gsi2))
4197 {
4198 stmt_vec_info patt_info
4199 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
4200 if (!STMT_SLP_TYPE (patt_info)
4201 && STMT_VINFO_RELEVANT (patt_info))
4202 maybe_push_to_hybrid_worklist (loop_vinfo,
4203 worklist, patt_info);
4204 }
4205 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4206 }
4207 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4208 maybe_push_to_hybrid_worklist (loop_vinfo,
4209 worklist, stmt_info);
4210 }
4211 }
4212
4213 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
4214 mark any SLP vectorized stmt as hybrid.
4215 ??? We're visiting def stmts N times (once for each non-SLP and
4216 once for each hybrid-SLP use). */
4217 walk_stmt_info wi;
4218 vdhs_data dat;
4219 dat.worklist = &worklist;
4220 dat.loop_vinfo = loop_vinfo;
4221 memset (&wi, 0, sizeof (wi));
4222 wi.info = (void *)&dat;
4223 while (!worklist.is_empty ())
4224 {
4225 stmt_vec_info stmt_info = worklist.pop ();
4226 /* Since SSA operands are not set up for pattern stmts we need
4227 to use walk_gimple_op. */
4228 wi.is_lhs = 0;
4229 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
4230 }
4231 }
4232
4233
4234 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
4235
4236 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
4237 : vec_info (vec_info::bb, init_cost (NULL, false), shared),
4238 bbs (_bbs),
4239 roots (vNULL)
4240 {
4241 for (unsigned i = 0; i < bbs.length (); ++i)
4242 {
4243 if (i != 0)
4244 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4245 gsi_next (&si))
4246 {
4247 gphi *phi = si.phi ();
4248 gimple_set_uid (phi, 0);
4249 add_stmt (phi);
4250 }
4251 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4252 !gsi_end_p (gsi); gsi_next (&gsi))
4253 {
4254 gimple *stmt = gsi_stmt (gsi);
4255 gimple_set_uid (stmt, 0);
4256 if (is_gimple_debug (stmt))
4257 continue;
4258 add_stmt (stmt);
4259 }
4260 }
4261 }
4262
4263
4264 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
4265 stmts in the basic block. */
4266
4267 _bb_vec_info::~_bb_vec_info ()
4268 {
4269 /* Reset region marker. */
4270 for (unsigned i = 0; i < bbs.length (); ++i)
4271 {
4272 if (i != 0)
4273 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4274 gsi_next (&si))
4275 {
4276 gphi *phi = si.phi ();
4277 gimple_set_uid (phi, -1);
4278 }
4279 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4280 !gsi_end_p (gsi); gsi_next (&gsi))
4281 {
4282 gimple *stmt = gsi_stmt (gsi);
4283 gimple_set_uid (stmt, -1);
4284 }
4285 }
4286
4287 for (unsigned i = 0; i < roots.length (); ++i)
4288 {
4289 roots[i].stmts.release ();
4290 roots[i].roots.release ();
4291 }
4292 roots.release ();
4293 }
4294
4295 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
4296 given that child nodes have already been processed, and that
4297 their def types currently match their SLP node's def type. */
4298
4299 static bool
4300 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
4301 slp_instance node_instance,
4302 stmt_vector_for_cost *cost_vec)
4303 {
4304 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4305
4306 /* Calculate the number of vector statements to be created for the
4307 scalar stmts in this node. For SLP reductions it is equal to the
4308 number of vector statements in the children (which has already been
4309 calculated by the recursive call). Otherwise it is the number of
4310 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
4311 VF divided by the number of elements in a vector. */
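/* For example, with a vectorization factor of two, four lanes and a
   four-element vector type this yields (2 * 4) / 4 = 2 vector
   statements. */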
4312 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
4313 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
4314 {
4315 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
4316 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
4317 {
4318 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4319 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
4320 break;
4321 }
4322 }
4323 else
4324 {
4325 poly_uint64 vf;
4326 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4327 vf = loop_vinfo->vectorization_factor;
4328 else
4329 vf = 1;
4330 unsigned int group_size = SLP_TREE_LANES (node);
4331 tree vectype = SLP_TREE_VECTYPE (node);
4332 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4333 = vect_get_num_vectors (vf * group_size, vectype);
4334 }
4335
4336 /* Handle purely internal nodes. */
4337 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4338 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
4339
4340 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
4341 if (is_a <bb_vec_info> (vinfo)
4342 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
4343 {
4344 if (dump_enabled_p ())
4345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4346 "desired vector type conflicts with earlier one "
4347 "for %G", stmt_info->stmt);
4348 return false;
4349 }
4350
4351 bool dummy;
4352 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
4353 node, node_instance, cost_vec);
4354 }
4355
4356 /* Try to build NODE from scalars, returning true on success.
4357 NODE_INSTANCE is the SLP instance that contains NODE. */
4358
4359 static bool
4360 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
4361 slp_instance node_instance)
4362 {
4363 stmt_vec_info stmt_info;
4364 unsigned int i;
4365
4366 if (!is_a <bb_vec_info> (vinfo)
4367 || node == SLP_INSTANCE_TREE (node_instance)
4368 || !SLP_TREE_SCALAR_STMTS (node).exists ()
4369 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
4370 return false;
4371
4372 if (dump_enabled_p ())
4373 dump_printf_loc (MSG_NOTE, vect_location,
4374 "Building vector operands of %p from scalars instead\n", node);
4375
4376 /* Don't remove and free the child nodes here, since they could be
4377 referenced by other structures. The analysis and scheduling phases
4378 (need to) ignore child nodes of anything that isn't vect_internal_def. */
4379 unsigned int group_size = SLP_TREE_LANES (node);
4380 SLP_TREE_DEF_TYPE (node) = vect_external_def;
4381 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
4382 SLP_TREE_LOAD_PERMUTATION (node).release ();
4383 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4384 {
4385 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
4386 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
4387 }
4388 return true;
4389 }
4390
4391 /* Compute the prologue cost for invariant or constant operands represented
4392 by NODE. */
4393
4394 static void
4395 vect_prologue_cost_for_slp (slp_tree node,
4396 stmt_vector_for_cost *cost_vec)
4397 {
4398 /* There's a special case of an existing vector, that costs nothing. */
4399 if (SLP_TREE_SCALAR_OPS (node).length () == 0
4400 && !SLP_TREE_VEC_DEFS (node).is_empty ())
4401 return;
4402 /* Without looking at the actual initializer a vector of
4403 constants can be implemented as load from the constant pool.
4404 When all elements are the same we can use a splat. */
4405 tree vectype = SLP_TREE_VECTYPE (node);
4406 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
4407 unsigned num_vects_to_check;
4408 unsigned HOST_WIDE_INT const_nunits;
4409 unsigned nelt_limit;
4410 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
4411 && ! multiple_p (const_nunits, group_size))
4412 {
4413 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
4414 nelt_limit = const_nunits;
4415 }
4416 else
4417 {
4418 /* If either the vector has variable length or the vectors
4419 are composed of repeated whole groups we only need to
4420 cost construction once. All vectors will be the same. */
4421 num_vects_to_check = 1;
4422 nelt_limit = group_size;
4423 }
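/* For example, an external operand vector {x, x, x, x} is costed below
   as a single splat (scalar_to_vec), {x, y, x, y} as a vec_construct,
   and any constant vector as a load from the constant pool. */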
4424 tree elt = NULL_TREE;
4425 unsigned nelt = 0;
4426 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
4427 {
4428 unsigned si = j % group_size;
4429 if (nelt == 0)
4430 elt = SLP_TREE_SCALAR_OPS (node)[si];
4431 /* ??? We're just tracking whether all operands of a single
4432 vector initializer are the same, ideally we'd check if
4433 we emitted the same one already. */
4434 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
4435 elt = NULL_TREE;
4436 nelt++;
4437 if (nelt == nelt_limit)
4438 {
4439 record_stmt_cost (cost_vec, 1,
4440 SLP_TREE_DEF_TYPE (node) == vect_external_def
4441 ? (elt ? scalar_to_vec : vec_construct)
4442 : vector_load,
4443 NULL, vectype, 0, vect_prologue);
4444 nelt = 0;
4445 }
4446 }
4447 }
4448
4449 /* Analyze statements contained in SLP tree NODE after recursively analyzing
4450 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
4451
4452 Return true if the operations are supported. */
4453
4454 static bool
4455 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
4456 slp_instance node_instance,
4457 hash_set<slp_tree> &visited_set,
4458 vec<slp_tree> &visited_vec,
4459 stmt_vector_for_cost *cost_vec)
4460 {
4461 int i, j;
4462 slp_tree child;
4463
4464 /* Assume we can code-generate all invariants. */
4465 if (!node
4466 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
4467 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4468 return true;
4469
4470 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
4471 {
4472 if (dump_enabled_p ())
4473 dump_printf_loc (MSG_NOTE, vect_location,
4474 "Failed cyclic SLP reference in %p\n", node);
4475 return false;
4476 }
4477 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
4478
4479 /* If we already analyzed the exact same set of scalar stmts we're done.
4480 We share the generated vector stmts for those. */
4481 if (visited_set.add (node))
4482 return true;
4483 visited_vec.safe_push (node);
4484
4485 bool res = true;
4486 unsigned visited_rec_start = visited_vec.length ();
4487 unsigned cost_vec_rec_start = cost_vec->length ();
4488 bool seen_non_constant_child = false;
4489 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4490 {
4491 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
4492 visited_set, visited_vec,
4493 cost_vec);
4494 if (!res)
4495 break;
4496 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
4497 seen_non_constant_child = true;
4498 }
4499 /* We're having difficulties scheduling nodes with just constant
4500 operands and no scalar stmts since we then cannot compute a stmt
4501 insertion place. */
4502 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
4503 {
4504 if (dump_enabled_p ())
4505 dump_printf_loc (MSG_NOTE, vect_location,
4506 "Cannot vectorize all-constant op node %p\n", node);
4507 res = false;
4508 }
4509
4510 if (res)
4511 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
4512 cost_vec);
4513 /* If analysis failed we have to pop all recursive visited nodes
4514 plus ourselves. */
4515 if (!res)
4516 {
4517 while (visited_vec.length () >= visited_rec_start)
4518 visited_set.remove (visited_vec.pop ());
4519 cost_vec->truncate (cost_vec_rec_start);
4520 }
4521
4522 /* When the node can be vectorized cost invariant nodes it references.
4523 This is not done in DFS order to allow the referring node's
4524 vectorizable_* calls to nail down the invariant node's vector type
4525 and possibly unshare it if it needs a different vector type than
4526 other referrers. */
4527 if (res)
4528 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
4529 if (child
4530 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
4531 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
4532 /* Perform usual caching, note code-generation still
4533 code-gens these nodes multiple times but we expect
4534 to CSE them later. */
4535 && !visited_set.add (child))
4536 {
4537 visited_vec.safe_push (child);
4538 /* ??? After auditing more code paths make a "default"
4539 and push the vector type from NODE to all children
4540 if it is not already set. */
4541 /* Compute the number of vectors to be generated. */
4542 tree vector_type = SLP_TREE_VECTYPE (child);
4543 if (!vector_type)
4544 {
4545 /* For shifts with a scalar argument we don't need
4546 to cost or code-generate anything.
4547 ??? Represent this more explicitly. */
4548 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4549 == shift_vec_info_type)
4550 && j == 1);
4551 continue;
4552 }
4553 unsigned group_size = SLP_TREE_LANES (child);
4554 poly_uint64 vf = 1;
4555 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4556 vf = loop_vinfo->vectorization_factor;
4557 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
4558 = vect_get_num_vectors (vf * group_size, vector_type);
4559 /* And cost them. */
4560 vect_prologue_cost_for_slp (child, cost_vec);
4561 }
4562
4563 /* If this node or any of its children can't be vectorized, try pruning
4564 the tree here rather than felling the whole thing. */
4565 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
4566 {
4567 /* We'll need to revisit this for invariant costing and number
4568 of vectorized stmt setting. */
4569 res = true;
4570 }
4571
4572 return res;
4573 }
4574
4575 /* Mark lanes of NODE that are live outside of the basic-block vectorized
4576 region and that can be vectorized using vectorizable_live_operation
4577 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
4578 the scalar code computing them to be retained. */
4579
4580 static void
4581 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
4582 slp_instance instance,
4583 stmt_vector_for_cost *cost_vec,
4584 hash_set<stmt_vec_info> &svisited,
4585 hash_set<slp_tree> &visited)
4586 {
4587 if (visited.add (node))
4588 return;
4589
4590 unsigned i;
4591 stmt_vec_info stmt_info;
4592 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
4593 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4594 {
4595 if (svisited.contains (stmt_info))
4596 continue;
4597 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4598 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
4599 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
4600 /* Only the pattern root stmt computes the original scalar value. */
4601 continue;
4602 bool mark_visited = true;
4603 gimple *orig_stmt = orig_stmt_info->stmt;
4604 ssa_op_iter op_iter;
4605 def_operand_p def_p;
4606 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4607 {
4608 imm_use_iterator use_iter;
4609 gimple *use_stmt;
4610 stmt_vec_info use_stmt_info;
4611 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4612 if (!is_gimple_debug (use_stmt))
4613 {
4614 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
4615 if (!use_stmt_info
4616 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4617 {
4618 STMT_VINFO_LIVE_P (stmt_info) = true;
4619 if (vectorizable_live_operation (bb_vinfo, stmt_info,
4620 NULL, node, instance, i,
4621 false, cost_vec))
4622 /* ??? So we know we can vectorize the live stmt
4623 from one SLP node. If we cannot do so from all
4624 or none consistently we'd have to record which
4625 SLP node (and lane) we want to use for the live
4626 operation. So make sure we can code-generate
4627 from all nodes. */
4628 mark_visited = false;
4629 else
4630 STMT_VINFO_LIVE_P (stmt_info) = false;
4631 break;
4632 }
4633 }
4634 /* We have to verify whether we can insert the lane extract
4635 before all uses. The following is a conservative approximation.
4636 We cannot put this into vectorizable_live_operation because
4637 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4638 doesn't work.
4639 Note that while the fact that we emit code for loads at the
4640 first load should make this a non-problem, leafs we construct
4641 from scalars are vectorized after the last scalar def.
4642 ??? If we'd actually compute the insert location during
4643 analysis we could use something less conservative than the last
4644 scalar stmt in the node for the dominance check. */
4645 /* ??? What remains is "live" uses in vector CTORs in the same
4646 SLP graph which is where those uses can end up code-generated
4647 right after their definition instead of close to their original
4648 use. But that would restrict us to code-generate lane-extracts
4649 from the latest stmt in a node. So we compensate for this
4650 during code-generation, simply not replacing uses for those
4651 hopefully rare cases. */
4652 if (STMT_VINFO_LIVE_P (stmt_info))
4653 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4654 if (!is_gimple_debug (use_stmt)
4655 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
4656 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4657 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
4658 {
4659 if (dump_enabled_p ())
4660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4661 "Cannot determine insertion place for "
4662 "lane extract\n");
4663 STMT_VINFO_LIVE_P (stmt_info) = false;
4664 mark_visited = true;
4665 }
4666 }
4667 if (mark_visited)
4668 svisited.add (stmt_info);
4669 }
4670
4671 slp_tree child;
4672 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4673 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4674 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
4675 cost_vec, svisited, visited);
4676 }
4677
4678 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
4679
4680 static bool
4681 vectorizable_bb_reduc_epilogue (slp_instance instance,
4682 stmt_vector_for_cost *cost_vec)
4683 {
4684 enum tree_code reduc_code
4685 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
4686 if (reduc_code == MINUS_EXPR)
4687 reduc_code = PLUS_EXPR;
4688 internal_fn reduc_fn;
4689 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
4690 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
4691 || reduc_fn == IFN_LAST
4692 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH))
4693 return false;
4694
4695 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
4696 cost log2 vector operations plus shuffles. */
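/* For example, reducing a four-element vector is costed as
   floor_log2 (4) = 2 vector stmts plus 2 shuffles. */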
4697 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
4698 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
4699 vectype, 0, vect_body);
4700 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
4701 vectype, 0, vect_body);
4702 return true;
4703 }
4704
4705 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
4706 and recurse to children. */
4707
4708 static void
4709 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
4710 hash_set<slp_tree> &visited)
4711 {
4712 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
4713 || visited.add (node))
4714 return;
4715
4716 stmt_vec_info stmt;
4717 unsigned i;
4718 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4719 roots.remove (vect_orig_stmt (stmt));
4720
4721 slp_tree child;
4722 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4723 if (child)
4724 vect_slp_prune_covered_roots (child, roots, visited);
4725 }
4726
4727 /* Analyze statements in SLP instances of VINFO. Return true if the
4728 operations are supported. */
4729
4730 bool
4731 vect_slp_analyze_operations (vec_info *vinfo)
4732 {
4733 slp_instance instance;
4734 int i;
4735
4736 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4737
4738 hash_set<slp_tree> visited;
4739 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4740 {
4741 auto_vec<slp_tree> visited_vec;
4742 stmt_vector_for_cost cost_vec;
4743 cost_vec.create (2);
4744 if (is_a <bb_vec_info> (vinfo))
4745 vect_location = instance->location ();
4746 if (!vect_slp_analyze_node_operations (vinfo,
4747 SLP_INSTANCE_TREE (instance),
4748 instance, visited, visited_vec,
4749 &cost_vec)
4750 /* CTOR instances require vectorized defs for the SLP tree root. */
4751 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
4752 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
4753 != vect_internal_def))
4754 /* Check we can vectorize the reduction. */
4755 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
4756 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
4757 {
4758 slp_tree node = SLP_INSTANCE_TREE (instance);
4759 stmt_vec_info stmt_info;
4760 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
4761 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
4762 else
4763 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4764 if (dump_enabled_p ())
4765 dump_printf_loc (MSG_NOTE, vect_location,
4766 "removing SLP instance operations starting from: %G",
4767 stmt_info->stmt);
4768 vect_free_slp_instance (instance);
4769 vinfo->slp_instances.ordered_remove (i);
4770 cost_vec.release ();
4771 while (!visited_vec.is_empty ())
4772 visited.remove (visited_vec.pop ());
4773 }
4774 else
4775 {
4776 i++;
4777
4778 /* For BB vectorization remember the SLP graph entry
4779 cost for later. */
4780 if (is_a <bb_vec_info> (vinfo))
4781 instance->cost_vec = cost_vec;
4782 else
4783 {
4784 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
4785 cost_vec.release ();
4786 }
4787 }
4788 }
4789
4790 /* Now look for SLP instances whose root is covered by other
4791 instances and remove them. */
4792 hash_set<stmt_vec_info> roots;
4793 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4794 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
4795 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
4796 if (!roots.is_empty ())
4797 {
4798 visited.empty ();
4799 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4800 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
4801 visited);
4802 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4803 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
4804 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
4805 {
4806 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
4807 if (dump_enabled_p ())
4808 dump_printf_loc (MSG_NOTE, vect_location,
4809 "removing SLP instance operations starting "
4810 "from: %G", root->stmt);
4811 vect_free_slp_instance (instance);
4812 vinfo->slp_instances.ordered_remove (i);
4813 }
4814 else
4815 ++i;
4816 }
4817
4818 /* Compute vectorizable live stmts. */
4819 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
4820 {
4821 hash_set<stmt_vec_info> svisited;
4822 hash_set<slp_tree> visited;
4823 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4824 {
4825 vect_location = instance->location ();
4826 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
4827 instance, &instance->cost_vec, svisited,
4828 visited);
4829 }
4830 }
4831
4832 return !vinfo->slp_instances.is_empty ();
4833 }
4834
4835 /* Get the SLP instance leader from INSTANCE_LEADER, transitively
4836 closing any leader chain along the way. */
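/* In effect this performs union-find style path compression: follow the
   leader chain to its root and rewrite every visited entry to point
   directly at it. */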
4837
4838 static slp_instance
4839 get_ultimate_leader (slp_instance instance,
4840 hash_map<slp_instance, slp_instance> &instance_leader)
4841 {
4842 auto_vec<slp_instance *, 8> chain;
4843 slp_instance *tem;
4844 while (*(tem = instance_leader.get (instance)) != instance)
4845 {
4846 chain.safe_push (tem);
4847 instance = *tem;
4848 }
4849 while (!chain.is_empty ())
4850 *chain.pop () = instance;
4851 return instance;
4852 }
4853
4854 /* Worker of vect_bb_partition_graph, recurse on NODE. */
4855
4856 static void
4857 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
4858 slp_instance instance, slp_tree node,
4859 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
4860 hash_map<slp_instance, slp_instance> &instance_leader,
4861 hash_set<slp_tree> &visited)
4862 {
4863 stmt_vec_info stmt_info;
4864 unsigned i;
4865
4866 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4867 {
4868 bool existed_p;
4869 slp_instance &stmt_instance
4870 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
4871 if (!existed_p)
4872 ;
4873 else if (stmt_instance != instance)
4874 {
4875 /* If we're running into a previously marked stmt make us the
4876 leader of the current ultimate leader. This keeps the
4877 leader chain acyclic and works even when the current instance
4878 connects two previously independent graph parts. */
4879 slp_instance stmt_leader
4880 = get_ultimate_leader (stmt_instance, instance_leader);
4881 if (stmt_leader != instance)
4882 instance_leader.put (stmt_leader, instance);
4883 }
4884 stmt_instance = instance;
4885 }
4886
4887 if (!SLP_TREE_SCALAR_STMTS (node).is_empty () && visited.add (node))
4888 return;
4889
4890 slp_tree child;
4891 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4892 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4893 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
4894 instance_leader, visited);
4895 }
4896
4897 /* Partition the SLP graph into pieces that can be costed independently. */
4898
4899 static void
4900 vect_bb_partition_graph (bb_vec_info bb_vinfo)
4901 {
4902 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
4903
4904 /* First walk the SLP graph assigning each involved scalar stmt a
4905 corresponding SLP graph entry and upon visiting a previously
4906 marked stmt, make the stmt's leader the current SLP graph entry. */
4907 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
4908 hash_map<slp_instance, slp_instance> instance_leader;
4909 hash_set<slp_tree> visited;
4910 slp_instance instance;
4911 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4912 {
4913 instance_leader.put (instance, instance);
4914 vect_bb_partition_graph_r (bb_vinfo,
4915 instance, SLP_INSTANCE_TREE (instance),
4916 stmt_to_instance, instance_leader,
4917 visited);
4918 }
4919
4920 /* Then collect entries to each independent subgraph. */
4921 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4922 {
4923 slp_instance leader = get_ultimate_leader (instance, instance_leader);
4924 leader->subgraph_entries.safe_push (instance);
4925 if (dump_enabled_p ()
4926 && leader != instance)
4927 dump_printf_loc (MSG_NOTE, vect_location,
4928 "instance %p is leader of %p\n",
4929 leader, instance);
4930 }
4931 }
4932
4933 /* Compute the scalar cost of the SLP node NODE and its children
4934 and record it in COST_VEC. Do not account defs that are marked in LIFE
4935 and update LIFE according to uses of NODE. */
4936
4937 static void
4938 vect_bb_slp_scalar_cost (vec_info *vinfo,
4939 slp_tree node, vec<bool, va_heap> *life,
4940 stmt_vector_for_cost *cost_vec,
4941 hash_set<slp_tree> &visited)
4942 {
4943 unsigned i;
4944 stmt_vec_info stmt_info;
4945 slp_tree child;
4946
4947 if (visited.add (node))
4948 return;
4949
4950 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4951 {
4952 ssa_op_iter op_iter;
4953 def_operand_p def_p;
4954
4955 if ((*life)[i])
4956 continue;
4957
4958 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4959 gimple *orig_stmt = orig_stmt_info->stmt;
4960
4961 /* If there is a non-vectorized use of the defs then the scalar
4962 stmt is kept live in which case we do not account it or any
4963 required defs in the SLP children in the scalar cost. This
4964 way we make the vectorization more costly when compared to
4965 the scalar cost. */
4966 if (!STMT_VINFO_LIVE_P (stmt_info))
4967 {
4968 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4969 {
4970 imm_use_iterator use_iter;
4971 gimple *use_stmt;
4972 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4973 if (!is_gimple_debug (use_stmt))
4974 {
4975 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
4976 if (!use_stmt_info
4977 || !PURE_SLP_STMT
4978 (vect_stmt_to_vectorize (use_stmt_info)))
4979 {
4980 (*life)[i] = true;
4981 break;
4982 }
4983 }
4984 }
4985 if ((*life)[i])
4986 continue;
4987 }
4988
4989 /* Count scalar stmts only once. */
4990 if (gimple_visited_p (orig_stmt))
4991 continue;
4992 gimple_set_visited (orig_stmt, true);
4993
4994 vect_cost_for_stmt kind;
4995 if (STMT_VINFO_DATA_REF (orig_stmt_info))
4996 {
4997 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
4998 kind = scalar_load;
4999 else
5000 kind = scalar_store;
5001 }
5002 else if (vect_nop_conversion_p (orig_stmt_info))
5003 continue;
5004 /* For single-argument PHIs assume coalescing which means zero cost
5005 for the scalar and the vector PHIs. This avoids artificially
5006 favoring the vector path (but may pessimize it in some cases). */
5007 else if (is_a <gphi *> (orig_stmt_info->stmt)
5008 && gimple_phi_num_args
5009 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
5010 continue;
5011 else
5012 kind = scalar_stmt;
5013 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
5014 SLP_TREE_VECTYPE (node), 0, vect_body);
5015 }
5016
5017 auto_vec<bool, 20> subtree_life;
5018 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5019 {
5020 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5021 {
5022 /* Do not directly pass LIFE to the recursive call, copy it to
5023 confine changes in the callee to the current child/subtree. */
5024 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5025 {
5026 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
5027 for (unsigned j = 0;
5028 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
5029 {
5030 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
5031 if (perm.first == i)
5032 subtree_life[perm.second] = (*life)[j];
5033 }
5034 }
5035 else
5036 {
5037 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
5038 subtree_life.safe_splice (*life);
5039 }
5040 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
5041 visited);
5042 subtree_life.truncate (0);
5043 }
5044 }
5045 }
5046
5047 /* Comparator for the loop-index sorted cost vectors. */
5048
5049 static int
5050 li_cost_vec_cmp (const void *a_, const void *b_)
5051 {
5052 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
5053 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
5054 if (a->first < b->first)
5055 return -1;
5056 else if (a->first == b->first)
5057 return 0;
5058 return 1;
5059 }
5060
5061 /* Check if vectorization of the basic block is profitable for the
5062 subgraph denoted by SLP_INSTANCES. */
5063
5064 static bool
5065 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
5066 vec<slp_instance> slp_instances)
5067 {
5068 slp_instance instance;
5069 int i;
5070 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
5071 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
5072
5073 if (dump_enabled_p ())
5074 {
5075 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
5076 hash_set<slp_tree> visited;
5077 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5078 vect_print_slp_graph (MSG_NOTE, vect_location,
5079 SLP_INSTANCE_TREE (instance), visited);
5080 }
5081
5082 /* Calculate scalar cost and sum the cost for the vector stmts
5083 previously collected. */
5084 stmt_vector_for_cost scalar_costs = vNULL;
5085 stmt_vector_for_cost vector_costs = vNULL;
5086 hash_set<slp_tree> visited;
5087 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5088 {
5089 auto_vec<bool, 20> life;
5090 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
5091 true);
5092 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5093 record_stmt_cost (&scalar_costs,
5094 SLP_INSTANCE_ROOT_STMTS (instance).length (),
5095 scalar_stmt,
5096 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
5097 vect_bb_slp_scalar_cost (bb_vinfo,
5098 SLP_INSTANCE_TREE (instance),
5099 &life, &scalar_costs, visited);
5100 vector_costs.safe_splice (instance->cost_vec);
5101 instance->cost_vec.release ();
5102 }
5103 /* Unset visited flag. */
5104 stmt_info_for_cost *cost;
5105 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5106 gimple_set_visited (cost->stmt_info->stmt, false);
5107
5108 if (dump_enabled_p ())
5109 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5110
5111 /* When costing non-loop vectorization we need to consider each covered
5112 loop independently and make sure vectorization is profitable. For
5113 now we assume a loop may not be entered or may execute an arbitrary
5114 number of iterations (??? static information can provide more
5115 precise info here) which means we can simply cost each containing
5116 loop's stmts separately. */
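/* For example, stmts sitting in an inner loop and stmts in the enclosing
   loop end up in separate parts below and each part has to be profitable
   on its own. */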
5117
5118 /* First produce cost vectors sorted by loop index. */
5119 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5120 li_scalar_costs (scalar_costs.length ());
5121 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5122 li_vector_costs (vector_costs.length ());
5123 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5124 {
5125 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5126 li_scalar_costs.quick_push (std::make_pair (l, cost));
5127 }
5128 /* Use a random used loop as fallback in case the first vector_costs
5129 entry does not have a stmt_info associated with it. */
5130 unsigned l = li_scalar_costs[0].first;
5131 FOR_EACH_VEC_ELT (vector_costs, i, cost)
5132 {
5133 /* We inherit the loop index from the previous COST; invariants,
5134 externals and extracts immediately follow the cost for the related stmt. */
5135 if (cost->stmt_info)
5136 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5137 li_vector_costs.quick_push (std::make_pair (l, cost));
5138 }
5139 li_scalar_costs.qsort (li_cost_vec_cmp);
5140 li_vector_costs.qsort (li_cost_vec_cmp);
5141
5142 /* Now cost the portions individually. */
5143 unsigned vi = 0;
5144 unsigned si = 0;
5145 while (si < li_scalar_costs.length ()
5146 && vi < li_vector_costs.length ())
5147 {
5148 unsigned sl = li_scalar_costs[si].first;
5149 unsigned vl = li_vector_costs[vi].first;
5150 if (sl != vl)
5151 {
5152 if (dump_enabled_p ())
5153 dump_printf_loc (MSG_NOTE, vect_location,
5154 "Scalar %d and vector %d loop part do not "
5155 "match up, skipping scalar part\n", sl, vl);
5156 /* Skip the scalar part, assuming zero cost on the vector side. */
5157 do
5158 {
5159 si++;
5160 }
5161 while (si < li_scalar_costs.length ()
5162 && li_scalar_costs[si].first == sl);
5163 continue;
5164 }
5165
5166 void *scalar_target_cost_data = init_cost (NULL, true);
5167 do
5168 {
5169 add_stmt_cost (bb_vinfo, scalar_target_cost_data,
5170 li_scalar_costs[si].second);
5171 si++;
5172 }
5173 while (si < li_scalar_costs.length ()
5174 && li_scalar_costs[si].first == sl);
5175 unsigned dummy;
5176 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
5177 destroy_cost_data (scalar_target_cost_data);
5178
5179 /* Complete the target-specific vector cost calculation. */
5180 void *vect_target_cost_data = init_cost (NULL, false);
5181 do
5182 {
5183 add_stmt_cost (bb_vinfo, vect_target_cost_data,
5184 li_vector_costs[vi].second);
5185 vi++;
5186 }
5187 while (vi < li_vector_costs.length ()
5188 && li_vector_costs[vi].first == vl);
5189 finish_cost (vect_target_cost_data, &vec_prologue_cost,
5190 &vec_inside_cost, &vec_epilogue_cost);
5191 destroy_cost_data (vect_target_cost_data);
5192
5193 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
5194
5195 if (dump_enabled_p ())
5196 {
5197 dump_printf_loc (MSG_NOTE, vect_location,
5198 "Cost model analysis for part in loop %d:\n", sl);
5199 dump_printf (MSG_NOTE, " Vector cost: %d\n",
5200 vec_inside_cost + vec_outside_cost);
5201 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
5202 }
5203
5204 /* Vectorization is profitable if its cost is no more than the cost of
5205 the scalar version. Note that we err on the vector side for equal cost
5206 because the cost estimate is otherwise quite pessimistic (constant uses
5207 are free on the scalar side but cost a load on the vector side for
5208 example). */
5209 if (vec_outside_cost + vec_inside_cost > scalar_cost)
5210 {
5211 scalar_costs.release ();
5212 vector_costs.release ();
5213 return false;
5214 }
5215 }
5216 if (vi < li_vector_costs.length ())
5217 {
5218 if (dump_enabled_p ())
5219 dump_printf_loc (MSG_NOTE, vect_location,
5220 "Excess vector cost for part in loop %d:\n",
5221 li_vector_costs[vi].first);
5222 scalar_costs.release ();
5223 vector_costs.release ();
5224 return false;
5225 }
5226
5227 scalar_costs.release ();
5228 vector_costs.release ();
5229 return true;
5230 }
5231
5232 /* qsort comparator for lane defs. */
5233
5234 static int
5235 vld_cmp (const void *a_, const void *b_)
5236 {
5237 auto *a = (const std::pair<unsigned, tree> *)a_;
5238 auto *b = (const std::pair<unsigned, tree> *)b_;
5239 return a->first - b->first;
5240 }
5241
5242 /* Return true if USE_STMT is a vector lane insert into VEC and set
5243 *THIS_LANE to the lane number that is set. */
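/* For example (illustrative only), given a V4SI vector VEC with 32-bit
   elements, the stmt
     vec_2 = BIT_INSERT_EXPR <vec_1, s_3, 64>;
   inserts s_3 at bit position 64, so *THIS_LANE is set to 64 / 32 == 2.  */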
5244
5245 static bool
5246 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
5247 {
5248 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
5249 if (!use_ass
5250 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
5251 || (vec
5252 ? gimple_assign_rhs1 (use_ass) != vec
5253 : ((vec = gimple_assign_rhs1 (use_ass)), false))
5254 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
5255 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
5256 || !constant_multiple_p
5257 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
5258 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
5259 this_lane))
5260 return false;
5261 return true;
5262 }
5263
5264 /* Find any vectorizable constructors and add them to the grouped_store
5265 array. */
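/* For instance (an illustrative sketch, not from the testsuite), a stmt
     v_5 = {a_1, b_2, c_3, d_4};
   where the LHS has vector type, the number of elements matches the
   vector type and every element is an SSA name defined in the region is
   queued as a grouped-store root below; chains of BIT_INSERT_EXPRs that
   build up a vector lane by lane are matched as well.  */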
5266
5267 static void
5268 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
5269 {
5270 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
5271 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
5272 !gsi_end_p (gsi); gsi_next (&gsi))
5273 {
5274 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
5275 if (!assign)
5276 continue;
5277
5278 tree rhs = gimple_assign_rhs1 (assign);
5279 enum tree_code code = gimple_assign_rhs_code (assign);
5280 use_operand_p use_p;
5281 gimple *use_stmt;
5282 if (code == CONSTRUCTOR)
5283 {
5284 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5285 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
5286 CONSTRUCTOR_NELTS (rhs))
5287 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
5288 || uniform_vector_p (rhs))
5289 continue;
5290
5291 unsigned j;
5292 tree val;
5293 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
5294 if (TREE_CODE (val) != SSA_NAME
5295 || !bb_vinfo->lookup_def (val))
5296 break;
5297 if (j != CONSTRUCTOR_NELTS (rhs))
5298 continue;
5299
5300 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
5301 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
5302 }
5303 else if (code == BIT_INSERT_EXPR
5304 && VECTOR_TYPE_P (TREE_TYPE (rhs))
5305 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
5306 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
5307 && integer_zerop (gimple_assign_rhs3 (assign))
5308 && useless_type_conversion_p
5309 (TREE_TYPE (TREE_TYPE (rhs)),
5310 TREE_TYPE (gimple_assign_rhs2 (assign)))
5311 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
5312 {
5313 /* We start to match on insert to lane zero but since the
5314 inserts need not be ordered we'd have to search both
5315 the def and the use chains. */
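/* For instance (illustrative only), starting from the lane-zero insert
     _1 = BIT_INSERT_EXPR <_0(D), a_2, 0>;
   we first walk the use chain
     _3 = BIT_INSERT_EXPR <_1, b_4, 32>;
     _5 = BIT_INSERT_EXPR <_3, c_6, 64>;
     _7 = BIT_INSERT_EXPR <_5, d_8, 96>;
   and, if lanes are still missing, search the def chain feeding the
   initial insert as well.  */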
5316 tree vectype = TREE_TYPE (rhs);
5317 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5318 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
5319 auto_sbitmap lanes (nlanes);
5320 bitmap_clear (lanes);
5321 bitmap_set_bit (lanes, 0);
5322 tree def = gimple_assign_lhs (assign);
5323 lane_defs.quick_push
5324 (std::make_pair (0, gimple_assign_rhs2 (assign)));
5325 unsigned lanes_found = 1;
5326 /* Start with the use chains; the last stmt will be the root. */
5327 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
5328 vec<stmt_vec_info> roots = vNULL;
5329 roots.safe_push (last);
5330 do
5331 {
5332 use_operand_p use_p;
5333 gimple *use_stmt;
5334 if (!single_imm_use (def, &use_p, &use_stmt))
5335 break;
5336 unsigned this_lane;
5337 if (!bb_vinfo->lookup_stmt (use_stmt)
5338 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
5339 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
5340 break;
5341 if (bitmap_bit_p (lanes, this_lane))
5342 break;
5343 lanes_found++;
5344 bitmap_set_bit (lanes, this_lane);
5345 gassign *use_ass = as_a <gassign *> (use_stmt);
5346 lane_defs.quick_push (std::make_pair
5347 (this_lane, gimple_assign_rhs2 (use_ass)));
5348 last = bb_vinfo->lookup_stmt (use_ass);
5349 roots.safe_push (last);
5350 def = gimple_assign_lhs (use_ass);
5351 }
5352 while (lanes_found < nlanes);
5353 if (roots.length () > 1)
5354 std::swap (roots[0], roots[roots.length () - 1]);
5355 if (lanes_found < nlanes)
5356 {
5357 /* Now search the def chain. */
5358 def = gimple_assign_rhs1 (assign);
5359 do
5360 {
5361 if (TREE_CODE (def) != SSA_NAME
5362 || !has_single_use (def))
5363 break;
5364 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
5365 unsigned this_lane;
5366 if (!bb_vinfo->lookup_stmt (def_stmt)
5367 || !vect_slp_is_lane_insert (def_stmt,
5368 NULL_TREE, &this_lane)
5369 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
5370 break;
5371 if (bitmap_bit_p (lanes, this_lane))
5372 break;
5373 lanes_found++;
5374 bitmap_set_bit (lanes, this_lane);
5375 lane_defs.quick_push (std::make_pair
5376 (this_lane,
5377 gimple_assign_rhs2 (def_stmt)));
5378 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
5379 def = gimple_assign_rhs1 (def_stmt);
5380 }
5381 while (lanes_found < nlanes);
5382 }
5383 if (lanes_found == nlanes)
5384 {
5385 /* Sort lane_defs by the lane index and register the root. */
5386 lane_defs.qsort (vld_cmp);
5387 vec<stmt_vec_info> stmts;
5388 stmts.create (nlanes);
5389 for (unsigned i = 0; i < nlanes; ++i)
5390 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
5391 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
5392 stmts, roots));
5393 }
5394 else
5395 roots.release ();
5396 }
5397 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5398 && (associative_tree_code (code) || code == MINUS_EXPR)
5399 /* ??? The flag_associative_math and TYPE_OVERFLOW_WRAPS
5400 checks pessimize a two-element reduction. PR54400.
5401 ??? In-order reduction could be handled if we only
5402 traverse one operand chain in vect_slp_linearize_chain. */
5403 && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
5404 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
5405 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
5406 /* Ops with constants at the tail can be stripped here. */
5407 && TREE_CODE (rhs) == SSA_NAME
5408 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
5409 /* Should be the chain end. */
5410 && (!single_imm_use (gimple_assign_lhs (assign),
5411 &use_p, &use_stmt)
5412 || !is_gimple_assign (use_stmt)
5413 || (gimple_assign_rhs_code (use_stmt) != code
5414 && ((code != PLUS_EXPR && code != MINUS_EXPR)
5415 || (gimple_assign_rhs_code (use_stmt)
5416 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
5417 {
5418 /* We start the match at the end of a possible association
5419 chain. */
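/* For instance (an illustrative sketch, assuming a wrapping integer
   type), for the chain
     x_1 = a_2 + b_3;
     x_4 = x_1 + c_5;
     x_6 = x_4 + d_7;
   where x_6 has no further use continuing the chain, the chain is
   linearized to the leaf operands { a_2, b_3, c_5, d_7 } and the three
   adds are recorded as the roots of a bb-reduction SLP instance.  */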
5420 auto_vec<chain_op_t> chain;
5421 auto_vec<std::pair<tree_code, gimple *> > worklist;
5422 auto_vec<gimple *> chain_stmts;
5423 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
5424 if (code == MINUS_EXPR)
5425 code = PLUS_EXPR;
5426 internal_fn reduc_fn;
5427 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
5428 || reduc_fn == IFN_LAST)
5429 continue;
5430 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
5431 /* ??? */
5432 code_stmt, alt_code_stmt, &chain_stmts);
5433 if (chain.length () > 1)
5434 {
5435 /* Sort the chain according to def_type and operation. */
5436 chain.sort (dt_sort_cmp, bb_vinfo);
5437 /* ??? Now we'd want to strip externals and constants
5438 but record those to be handled in the epilogue. */
5439 /* ??? For now do not allow mixing ops or externs/constants. */
5440 bool invalid = false;
5441 for (unsigned i = 0; i < chain.length (); ++i)
5442 if (chain[i].dt != vect_internal_def
5443 || chain[i].code != code)
5444 invalid = true;
5445 if (!invalid)
5446 {
5447 vec<stmt_vec_info> stmts;
5448 stmts.create (chain.length ());
5449 for (unsigned i = 0; i < chain.length (); ++i)
5450 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
5451 vec<stmt_vec_info> roots;
5452 roots.create (chain_stmts.length ());
5453 for (unsigned i = 0; i < chain_stmts.length (); ++i)
5454 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
5455 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
5456 stmts, roots));
5457 }
5458 }
5459 }
5460 }
5461 }
5462
5463 /* Walk the grouped store chains and replace entries with their
5464 pattern variant if any. */
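/* For instance (an illustrative sketch), if the grouped store
   { S1, S2, S3 } had S2 replaced by a pattern stmt S2', the group is
   rewired to { S1, S2', S3 } with the DR_GROUP_* fields copied over
   from S2.  */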
5465
5466 static void
5467 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
5468 {
5469 stmt_vec_info first_element;
5470 unsigned i;
5471
5472 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
5473 {
5474 /* We also have CTORs in this array. */
5475 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
5476 continue;
5477 if (STMT_VINFO_IN_PATTERN_P (first_element))
5478 {
5479 stmt_vec_info orig = first_element;
5480 first_element = STMT_VINFO_RELATED_STMT (first_element);
5481 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
5482 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
5483 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
5484 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
5485 vinfo->grouped_stores[i] = first_element;
5486 }
5487 stmt_vec_info prev = first_element;
5488 while (DR_GROUP_NEXT_ELEMENT (prev))
5489 {
5490 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
5491 if (STMT_VINFO_IN_PATTERN_P (elt))
5492 {
5493 stmt_vec_info orig = elt;
5494 elt = STMT_VINFO_RELATED_STMT (elt);
5495 DR_GROUP_NEXT_ELEMENT (prev) = elt;
5496 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
5497 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
5498 }
5499 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
5500 prev = elt;
5501 }
5502 }
5503 }
5504
5505 /* Check if the region described by BB_VINFO can be vectorized, returning
5506 true if so. When returning false, set FATAL to true if the same failure
5507 would prevent vectorization at other vector sizes, false if it is still
5508 worth trying other sizes. N_STMTS is the number of statements in the
5509 region. */
5510
5511 static bool
5512 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
5513 vec<int> *dataref_groups)
5514 {
5515 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
5516
5517 slp_instance instance;
5518 int i;
5519 poly_uint64 min_vf = 2;
5520
5521 /* The first group of checks is independent of the vector size. */
5522 fatal = true;
5523
5524 /* Analyze the data references. */
5525
5526 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
5527 {
5528 if (dump_enabled_p ())
5529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5530 "not vectorized: unhandled data-ref in basic "
5531 "block.\n");
5532 return false;
5533 }
5534
5535 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
5536 {
5537 if (dump_enabled_p ())
5538 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5539 "not vectorized: unhandled data access in "
5540 "basic block.\n");
5541 return false;
5542 }
5543
5544 vect_slp_check_for_constructors (bb_vinfo);
5545
5546 /* If there are no grouped stores and no constructors in the region
5547 there is no need to continue with pattern recog as vect_analyze_slp
5548 will fail anyway. */
5549 if (bb_vinfo->grouped_stores.is_empty ()
5550 && bb_vinfo->roots.is_empty ())
5551 {
5552 if (dump_enabled_p ())
5553 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5554 "not vectorized: no grouped stores in "
5555 "basic block.\n");
5556 return false;
5557 }
5558
5559 /* The rest of the analysis below depends on the vector size in some way. */
5560 fatal = false;
5561
5562 vect_pattern_recog (bb_vinfo);
5563
5564 /* Update store groups from pattern processing. */
5565 vect_fixup_store_groups_with_patterns (bb_vinfo);
5566
5567 /* Check the SLP opportunities in the basic block, analyze and build SLP
5568 trees. */
5569 if (!vect_analyze_slp (bb_vinfo, n_stmts))
5570 {
5571 if (dump_enabled_p ())
5572 {
5573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5574 "Failed to SLP the basic block.\n");
5575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5576 "not vectorized: failed to find SLP opportunities "
5577 "in basic block.\n");
5578 }
5579 return false;
5580 }
5581
5582 /* Optimize permutations. */
5583 vect_optimize_slp (bb_vinfo);
5584
5585 /* Gather the loads reachable from the SLP graph entries. */
5586 vect_gather_slp_loads (bb_vinfo);
5587
5588 vect_record_base_alignments (bb_vinfo);
5589
5590 /* Analyze and verify the alignment of data references and the
5591 dependence in the SLP instances. */
5592 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
5593 {
5594 vect_location = instance->location ();
5595 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
5596 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
5597 {
5598 slp_tree node = SLP_INSTANCE_TREE (instance);
5599 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5600 if (dump_enabled_p ())
5601 dump_printf_loc (MSG_NOTE, vect_location,
5602 "removing SLP instance operations starting from: %G",
5603 stmt_info->stmt);
5604 vect_free_slp_instance (instance);
5605 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
5606 continue;
5607 }
5608
5609 /* Mark all the statements that we want to vectorize as pure SLP and
5610 relevant. */
5611 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5612 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
5613 unsigned j;
5614 stmt_vec_info root;
5615 /* Likewise consider instance root stmts as vectorized. */
5616 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
5617 STMT_SLP_TYPE (root) = pure_slp;
5618
5619 i++;
5620 }
5621 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
5622 return false;
5623
5624 if (!vect_slp_analyze_operations (bb_vinfo))
5625 {
5626 if (dump_enabled_p ())
5627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5628 "not vectorized: bad operation in basic block.\n");
5629 return false;
5630 }
5631
5632 vect_bb_partition_graph (bb_vinfo);
5633
5634 return true;
5635 }
5636
5637 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
5638 basic blocks in BBS, returning true on success.
5639 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
5640
5641 static bool
5642 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
5643 vec<int> *dataref_groups, unsigned int n_stmts)
5644 {
5645 bb_vec_info bb_vinfo;
5646 auto_vector_modes vector_modes;
5647
5648 /* Autodetect first vector size we try. */
5649 machine_mode next_vector_mode = VOIDmode;
5650 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
5651 unsigned int mode_i = 0;
5652
5653 vec_info_shared shared;
5654
5655 machine_mode autodetected_vector_mode = VOIDmode;
5656 while (1)
5657 {
5658 bool vectorized = false;
5659 bool fatal = false;
5660 bb_vinfo = new _bb_vec_info (bbs, &shared);
5661
5662 bool first_time_p = shared.datarefs.is_empty ();
5663 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
5664 if (first_time_p)
5665 bb_vinfo->shared->save_datarefs ();
5666 else
5667 bb_vinfo->shared->check_datarefs ();
5668 bb_vinfo->vector_mode = next_vector_mode;
5669
5670 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
5671 {
5672 if (dump_enabled_p ())
5673 {
5674 dump_printf_loc (MSG_NOTE, vect_location,
5675 "***** Analysis succeeded with vector mode"
5676 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
5677 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
5678 }
5679
5680 bb_vinfo->shared->check_datarefs ();
5681
5682 unsigned i;
5683 slp_instance instance;
5684 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
5685 {
5686 if (instance->subgraph_entries.is_empty ())
5687 continue;
5688
5689 vect_location = instance->location ();
5690 if (!unlimited_cost_model (NULL)
5691 && !vect_bb_vectorization_profitable_p
5692 (bb_vinfo, instance->subgraph_entries))
5693 {
5694 if (dump_enabled_p ())
5695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5696 "not vectorized: vectorization is not "
5697 "profitable.\n");
5698 continue;
5699 }
5700
5701 if (!dbg_cnt (vect_slp))
5702 continue;
5703
5704 if (!vectorized && dump_enabled_p ())
5705 dump_printf_loc (MSG_NOTE, vect_location,
5706 "Basic block will be vectorized "
5707 "using SLP\n");
5708 vectorized = true;
5709
5710 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
5711
5712 unsigned HOST_WIDE_INT bytes;
5713 if (dump_enabled_p ())
5714 {
5715 if (GET_MODE_SIZE
5716 (bb_vinfo->vector_mode).is_constant (&bytes))
5717 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5718 "basic block part vectorized using %wu "
5719 "byte vectors\n", bytes);
5720 else
5721 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5722 "basic block part vectorized using "
5723 "variable length vectors\n");
5724 }
5725 }
5726 }
5727 else
5728 {
5729 if (dump_enabled_p ())
5730 dump_printf_loc (MSG_NOTE, vect_location,
5731 "***** Analysis failed with vector mode %s\n",
5732 GET_MODE_NAME (bb_vinfo->vector_mode));
5733 }
5734
5735 if (mode_i == 0)
5736 autodetected_vector_mode = bb_vinfo->vector_mode;
5737
5738 if (!fatal)
5739 while (mode_i < vector_modes.length ()
5740 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
5741 {
5742 if (dump_enabled_p ())
5743 dump_printf_loc (MSG_NOTE, vect_location,
5744 "***** The result for vector mode %s would"
5745 " be the same\n",
5746 GET_MODE_NAME (vector_modes[mode_i]));
5747 mode_i += 1;
5748 }
5749
5750 delete bb_vinfo;
5751
5752 if (mode_i < vector_modes.length ()
5753 && VECTOR_MODE_P (autodetected_vector_mode)
5754 && (related_vector_mode (vector_modes[mode_i],
5755 GET_MODE_INNER (autodetected_vector_mode))
5756 == autodetected_vector_mode)
5757 && (related_vector_mode (autodetected_vector_mode,
5758 GET_MODE_INNER (vector_modes[mode_i]))
5759 == vector_modes[mode_i]))
5760 {
5761 if (dump_enabled_p ())
5762 dump_printf_loc (MSG_NOTE, vect_location,
5763 "***** Skipping vector mode %s, which would"
5764 " repeat the analysis for %s\n",
5765 GET_MODE_NAME (vector_modes[mode_i]),
5766 GET_MODE_NAME (autodetected_vector_mode));
5767 mode_i += 1;
5768 }
5769
5770 if (vectorized
5771 || mode_i == vector_modes.length ()
5772 || autodetected_vector_mode == VOIDmode
5773 /* If vect_slp_analyze_bb_1 signaled that analysis for all
5774 vector sizes will fail do not bother iterating. */
5775 || fatal)
5776 return vectorized;
5777
5778 /* Try the next biggest vector size. */
5779 next_vector_mode = vector_modes[mode_i++];
5780 if (dump_enabled_p ())
5781 dump_printf_loc (MSG_NOTE, vect_location,
5782 "***** Re-trying analysis with vector mode %s\n",
5783 GET_MODE_NAME (next_vector_mode));
5784 }
5785 }
5786
5787
5788 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
5789 true if anything in the basic blocks was vectorized. */
5790
5791 static bool
5792 vect_slp_bbs (vec<basic_block> bbs)
5793 {
5794 vec<data_reference_p> datarefs = vNULL;
5795 auto_vec<int> dataref_groups;
5796 int insns = 0;
5797 int current_group = 0;
5798
5799 for (unsigned i = 0; i < bbs.length (); i++)
5800 {
5801 basic_block bb = bbs[i];
5802 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
5803 gsi_next (&gsi))
5804 {
5805 gimple *stmt = gsi_stmt (gsi);
5806 if (is_gimple_debug (stmt))
5807 continue;
5808
5809 insns++;
5810
5811 if (gimple_location (stmt) != UNKNOWN_LOCATION)
5812 vect_location = stmt;
5813
5814 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
5815 &dataref_groups, current_group))
5816 ++current_group;
5817 }
5818 }
5819
5820 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
5821 }
5822
5823 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5824 true if anything in the basic-block was vectorized. */
5825
5826 bool
5827 vect_slp_bb (basic_block bb)
5828 {
5829 auto_vec<basic_block> bbs;
5830 bbs.safe_push (bb);
5831 return vect_slp_bbs (bbs);
5832 }
5833
5834 /* Main entry for the BB vectorizer. Analyze and transform the basic
5835 blocks of FUN, returning true if anything was vectorized. */
5836
5837 bool
5838 vect_slp_function (function *fun)
5839 {
5840 bool r = false;
5841 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
5842 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
5843
5844 /* For the moment split the function into pieces to avoid making
5845 the iteration on the vector mode moot. Split at points we know
5846 we do not handle well, which are CFG merges (SLP discovery doesn't
5847 handle non-loop-header PHIs) and loop exits. Since pattern
5848 recog requires reverse iteration to visit uses before defs,
5849 simply chop the RPO into pieces. */
5850 auto_vec<basic_block> bbs;
5851 for (unsigned i = 0; i < n; i++)
5852 {
5853 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
5854 bool split = false;
5855
5856 /* Split when a BB is not dominated by the first block. */
5857 if (!bbs.is_empty ()
5858 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
5859 {
5860 if (dump_enabled_p ())
5861 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5862 "splitting region at dominance boundary bb%d\n",
5863 bb->index);
5864 split = true;
5865 }
5866 /* Split when the loop determined by the first block
5867 is exited. This is because we eventually insert
5868 invariants at region begin. */
5869 else if (!bbs.is_empty ()
5870 && bbs[0]->loop_father != bb->loop_father
5871 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
5872 {
5873 if (dump_enabled_p ())
5874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5875 "splitting region at loop %d exit at bb%d\n",
5876 bbs[0]->loop_father->num, bb->index);
5877 split = true;
5878 }
5879
5880 if (split && !bbs.is_empty ())
5881 {
5882 r |= vect_slp_bbs (bbs);
5883 bbs.truncate (0);
5884 bbs.quick_push (bb);
5885 }
5886 else
5887 bbs.safe_push (bb);
5888
5889 /* When a stmt ends this block and defines a value, inserting
5890 after it for a vector containing its definition would require
5891 inserting on edges. Avoid this for now. */
5892 if (gimple *last = last_stmt (bb))
5893 if (gimple_get_lhs (last)
5894 && is_ctrl_altering_stmt (last))
5895 {
5896 if (dump_enabled_p ())
5897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5898 "splitting region at control altering "
5899 "definition %G", last);
5900 r |= vect_slp_bbs (bbs);
5901 bbs.truncate (0);
5902 }
5903 }
5904
5905 if (!bbs.is_empty ())
5906 r |= vect_slp_bbs (bbs);
5907
5908 free (rpo);
5909
5910 return r;
5911 }
5912
5913 /* Build a variable-length vector in which the elements in ELTS are repeated
5914 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
5915 RESULTS and add any new instructions to SEQ.
5916
5917 The approach we use is:
5918
5919 (1) Find a vector mode VM with integer elements of mode IM.
5920
5921 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5922 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
5923 from small vectors to IM.
5924
5925 (3) Duplicate each ELTS'[I] into a vector of mode VM.
5926
5927 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
5928 correct byte contents.
5929
5930 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
5931
5932 We try to find the largest IM for which this sequence works, in order
5933 to cut down on the number of interleaves. */
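/* A concrete illustration (not taken from the sources, assuming DImode
   works as IM): for ELTS = { a, b, c, d } with 32-bit elements and a
   variable-length VECTOR_TYPE, the pairs { a, b } and { c, d } are built
   as two-element vectors and view-converted to DImode values x and y;
   each is duplicated into a VM vector { x, x, ... } and { y, y, ... };
   a single interleaving VEC_PERM_EXPR then yields { x, y, x, y, ... },
   which view-converted back to VECTOR_TYPE is the desired
   { a, b, c, d, a, b, c, d, ... }.  */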
5934
5935 void
5936 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
5937 vec<tree> elts, unsigned int nresults,
5938 vec<tree> &results)
5939 {
5940 unsigned int nelts = elts.length ();
5941 tree element_type = TREE_TYPE (vector_type);
5942
5943 /* (1) Find a vector mode VM with integer elements of mode IM. */
5944 unsigned int nvectors = 1;
5945 tree new_vector_type;
5946 tree permutes[2];
5947 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
5948 &nvectors, &new_vector_type,
5949 permutes))
5950 gcc_unreachable ();
5951
5952 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
5953 unsigned int partial_nelts = nelts / nvectors;
5954 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
5955
5956 tree_vector_builder partial_elts;
5957 auto_vec<tree, 32> pieces (nvectors * 2);
5958 pieces.quick_grow_cleared (nvectors * 2);
5959 for (unsigned int i = 0; i < nvectors; ++i)
5960 {
5961 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5962 ELTS' has mode IM. */
5963 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
5964 for (unsigned int j = 0; j < partial_nelts; ++j)
5965 partial_elts.quick_push (elts[i * partial_nelts + j]);
5966 tree t = gimple_build_vector (seq, &partial_elts);
5967 t = gimple_build (seq, VIEW_CONVERT_EXPR,
5968 TREE_TYPE (new_vector_type), t);
5969
5970 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
5971 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
5972 }
5973
5974 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
5975 correct byte contents.
5976
5977 Conceptually, we need to repeat the following operation log2(nvectors)
5978 times, where hi_start = nvectors / 2:
5979
5980 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
5981 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
5982
5983 However, if each input repeats every N elements and the VF is
5984 a multiple of N * 2, the HI result is the same as the LO result.
5985 This will be true for the first N1 iterations of the outer loop,
5986 followed by N2 iterations for which both the LO and HI results
5987 are needed. I.e.:
5988
5989 N1 + N2 = log2(nvectors)
5990
5991 Each "N1 iteration" doubles the number of redundant vectors and the
5992 effect of the process as a whole is to have a sequence of nvectors/2**N1
5993 vectors that repeats 2**N1 times. Rather than generate these redundant
5994 vectors, we halve the number of vectors for each N1 iteration. */
5995 unsigned int in_start = 0;
5996 unsigned int out_start = nvectors;
5997 unsigned int new_nvectors = nvectors;
5998 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
5999 {
6000 unsigned int hi_start = new_nvectors / 2;
6001 unsigned int out_i = 0;
6002 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
6003 {
6004 if ((in_i & 1) != 0
6005 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
6006 2 * in_repeat))
6007 continue;
6008
6009 tree output = make_ssa_name (new_vector_type);
6010 tree input1 = pieces[in_start + (in_i / 2)];
6011 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
6012 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
6013 input1, input2,
6014 permutes[in_i & 1]);
6015 gimple_seq_add_stmt (seq, stmt);
6016 pieces[out_start + out_i] = output;
6017 out_i += 1;
6018 }
6019 std::swap (in_start, out_start);
6020 new_nvectors = out_i;
6021 }
6022
6023 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
6024 results.reserve (nresults);
6025 for (unsigned int i = 0; i < nresults; ++i)
6026 if (i < new_nvectors)
6027 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
6028 pieces[in_start + i]));
6029 else
6030 results.quick_push (results[i - new_nvectors]);
6031 }
6032
6033
6034 /* For constant and loop invariant defs in OP_NODE this function creates
6035 vector defs that will be used in the vectorized stmts and stores them
6036 to SLP_TREE_VEC_DEFS of OP_NODE. */
6037
6038 static void
6039 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
6040 {
6041 unsigned HOST_WIDE_INT nunits;
6042 tree vec_cst;
6043 unsigned j, number_of_places_left_in_vector;
6044 tree vector_type;
6045 tree vop;
6046 int group_size = op_node->ops.length ();
6047 unsigned int vec_num, i;
6048 unsigned number_of_copies = 1;
6049 bool constant_p;
6050 gimple_seq ctor_seq = NULL;
6051 auto_vec<tree, 16> permute_results;
6052
6053 /* We always want SLP_TREE_VECTYPE (op_node) to be correctly set here. */
6054 vector_type = SLP_TREE_VECTYPE (op_node);
6055
6056 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
6057 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
6058 auto_vec<tree> voprnds (number_of_vectors);
6059
6060 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
6061 created vectors. It is greater than 1 if unrolling is performed.
6062
6063 For example, we have two scalar operands, s1 and s2 (e.g., group of
6064 strided accesses of size two), while NUNITS is four (i.e., four scalars
6065 of this type can be packed in a vector). The output vector will contain
6066 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
6067 will be 2).
6068
6069 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
6070 containing the operands.
6071
6072 For example, NUNITS is four as before, and the group size is 8
6073 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
6074 {s5, s6, s7, s8}. */
6075
6076 /* When using duplicate_and_interleave, we just need one element for
6077 each scalar statement. */
6078 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
6079 nunits = group_size;
6080
6081 number_of_copies = nunits * number_of_vectors / group_size;
6082
6083 number_of_places_left_in_vector = nunits;
6084 constant_p = true;
6085 tree_vector_builder elts (vector_type, nunits, 1);
6086 elts.quick_grow (nunits);
6087 stmt_vec_info insert_after = NULL;
6088 for (j = 0; j < number_of_copies; j++)
6089 {
6090 tree op;
6091 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
6092 {
6093 /* Create 'vect_ = {op0,op1,...,opn}'. */
6094 number_of_places_left_in_vector--;
6095 tree orig_op = op;
6096 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
6097 {
6098 if (CONSTANT_CLASS_P (op))
6099 {
6100 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6101 {
6102 /* Can't use VIEW_CONVERT_EXPR for booleans because
6103 of possibly different sizes of scalar value and
6104 vector element. */
6105 if (integer_zerop (op))
6106 op = build_int_cst (TREE_TYPE (vector_type), 0);
6107 else if (integer_onep (op))
6108 op = build_all_ones_cst (TREE_TYPE (vector_type));
6109 else
6110 gcc_unreachable ();
6111 }
6112 else
6113 op = fold_unary (VIEW_CONVERT_EXPR,
6114 TREE_TYPE (vector_type), op);
6115 gcc_assert (op && CONSTANT_CLASS_P (op));
6116 }
6117 else
6118 {
6119 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
6120 gimple *init_stmt;
6121 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6122 {
6123 tree true_val
6124 = build_all_ones_cst (TREE_TYPE (vector_type));
6125 tree false_val
6126 = build_zero_cst (TREE_TYPE (vector_type));
6127 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
6128 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
6129 op, true_val,
6130 false_val);
6131 }
6132 else
6133 {
6134 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
6135 op);
6136 init_stmt
6137 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
6138 op);
6139 }
6140 gimple_seq_add_stmt (&ctor_seq, init_stmt);
6141 op = new_temp;
6142 }
6143 }
6144 elts[number_of_places_left_in_vector] = op;
6145 if (!CONSTANT_CLASS_P (op))
6146 constant_p = false;
6147 /* For BB vectorization we have to compute an insert location
6148 when a def is inside the analyzed region since we cannot
6149 simply insert at the BB start in this case. */
6150 stmt_vec_info opdef;
6151 if (TREE_CODE (orig_op) == SSA_NAME
6152 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
6153 && is_a <bb_vec_info> (vinfo)
6154 && (opdef = vinfo->lookup_def (orig_op)))
6155 {
6156 if (!insert_after)
6157 insert_after = opdef;
6158 else
6159 insert_after = get_later_stmt (insert_after, opdef);
6160 }
6161
6162 if (number_of_places_left_in_vector == 0)
6163 {
6164 if (constant_p
6165 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
6166 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
6167 vec_cst = gimple_build_vector (&ctor_seq, &elts);
6168 else
6169 {
6170 if (permute_results.is_empty ())
6171 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
6172 elts, number_of_vectors,
6173 permute_results);
6174 vec_cst = permute_results[number_of_vectors - j - 1];
6175 }
6176 if (!gimple_seq_empty_p (ctor_seq))
6177 {
6178 if (insert_after)
6179 {
6180 gimple_stmt_iterator gsi;
6181 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
6182 {
6183 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
6184 gsi_insert_seq_before (&gsi, ctor_seq,
6185 GSI_CONTINUE_LINKING);
6186 }
6187 else if (!stmt_ends_bb_p (insert_after->stmt))
6188 {
6189 gsi = gsi_for_stmt (insert_after->stmt);
6190 gsi_insert_seq_after (&gsi, ctor_seq,
6191 GSI_CONTINUE_LINKING);
6192 }
6193 else
6194 {
6195 /* When we want to insert after a def whose defining
6196 stmt throws, insert on the fallthru edge
6197 instead. */
6198 edge e = find_fallthru_edge
6199 (gimple_bb (insert_after->stmt)->succs);
6200 basic_block new_bb
6201 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
6202 gcc_assert (!new_bb);
6203 }
6204 }
6205 else
6206 vinfo->insert_seq_on_entry (NULL, ctor_seq);
6207 ctor_seq = NULL;
6208 }
6209 voprnds.quick_push (vec_cst);
6210 insert_after = NULL;
6211 number_of_places_left_in_vector = nunits;
6212 constant_p = true;
6213 elts.new_vector (vector_type, nunits, 1);
6214 elts.quick_grow (nunits);
6215 }
6216 }
6217 }
6218
6219 /* Since the vectors are created in the reverse order, we should invert
6220 them. */
6221 vec_num = voprnds.length ();
6222 for (j = vec_num; j != 0; j--)
6223 {
6224 vop = voprnds[j - 1];
6225 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6226 }
6227
6228 /* In case that VF is greater than the unrolling factor needed for the SLP
6229 group of stmts, NUMBER_OF_VECTORS to be created is greater than
6230 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
6231 to replicate the vectors. */
6232 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
6233 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
6234 i++)
6235 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6236 }
6237
6238 /* Get the Ith vectorized definition from SLP_NODE. */
6239
6240 tree
6241 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
6242 {
6243 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
6244 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
6245 else
6246 return SLP_TREE_VEC_DEFS (slp_node)[i];
6247 }
6248
6249 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
6250
6251 void
6252 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
6253 {
6254 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
6255 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
6256 {
6257 unsigned j;
6258 gimple *vec_def_stmt;
6259 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
6260 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
6261 }
6262 else
6263 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
6264 }
6265
6266 /* Get N vectorized definitions for SLP_NODE. */
6267
6268 void
6269 vect_get_slp_defs (vec_info *,
6270 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
6271 {
6272 if (n == -1U)
6273 n = SLP_TREE_CHILDREN (slp_node).length ();
6274
6275 for (unsigned i = 0; i < n; ++i)
6276 {
6277 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
6278 vec<tree> vec_defs = vNULL;
6279 vect_get_slp_defs (child, &vec_defs);
6280 vec_oprnds->quick_push (vec_defs);
6281 }
6282 }
6283
6284 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6285 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6286 permute statements for the SLP node NODE. Store the number of vector
6287 permute instructions in *N_PERMS and the number of vector load
6288 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
6289 that were not needed. */
6290
6291 bool
6292 vect_transform_slp_perm_load (vec_info *vinfo,
6293 slp_tree node, vec<tree> dr_chain,
6294 gimple_stmt_iterator *gsi, poly_uint64 vf,
6295 bool analyze_only, unsigned *n_perms,
6296 unsigned int *n_loads, bool dce_chain)
6297 {
6298 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6299 int vec_index = 0;
6300 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6301 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
6302 unsigned int mask_element;
6303 machine_mode mode;
6304
6305 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
6306 return false;
6307
6308 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6309
6310 mode = TYPE_MODE (vectype);
6311 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6312
6313 /* Initialize the vect stmts of NODE to properly insert the generated
6314 stmts later. */
6315 if (! analyze_only)
6316 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
6317 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6318 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
6319
6320 /* Generate permutation masks for every NODE. Number of masks for each NODE
6321 is equal to GROUP_SIZE.
6322 E.g., we have a group of three nodes with three loads from the same
6323 location in each node, and the vector size is 4. I.e., we have an
6324 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6325 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6326 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6327 ...
6328
6329 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
6330 The last mask is illegal since we assume two operands for permute
6331 operation, and the mask element values can't be outside that range.
6332 Hence, the last mask must be converted into {2,5,5,5}.
6333 For the first two permutations we need the first and the second input
6334 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6335 we need the second and the third vectors: {b1,c1,a2,b2} and
6336 {c2,a3,b3,c3}. */
6337
6338 int vect_stmts_counter = 0;
6339 unsigned int index = 0;
6340 int first_vec_index = -1;
6341 int second_vec_index = -1;
6342 bool noop_p = true;
6343 *n_perms = 0;
6344
6345 vec_perm_builder mask;
6346 unsigned int nelts_to_build;
6347 unsigned int nvectors_per_build;
6348 unsigned int in_nlanes;
6349 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
6350 && multiple_p (nunits, group_size));
6351 if (repeating_p)
6352 {
6353 /* A single vector contains a whole number of copies of the node, so:
6354 (a) all permutes can use the same mask; and
6355 (b) the permutes only need a single vector input. */
6356 mask.new_vector (nunits, group_size, 3);
6357 nelts_to_build = mask.encoded_nelts ();
6358 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
6359 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
6360 }
6361 else
6362 {
6363 /* We need to construct a separate mask for each vector statement. */
6364 unsigned HOST_WIDE_INT const_nunits, const_vf;
6365 if (!nunits.is_constant (&const_nunits)
6366 || !vf.is_constant (&const_vf))
6367 return false;
6368 mask.new_vector (const_nunits, const_nunits, 1);
6369 nelts_to_build = const_vf * group_size;
6370 nvectors_per_build = 1;
6371 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
6372 }
6373 auto_sbitmap used_in_lanes (in_nlanes);
6374 bitmap_clear (used_in_lanes);
6375 auto_bitmap used_defs;
6376
6377 unsigned int count = mask.encoded_nelts ();
6378 mask.quick_grow (count);
6379 vec_perm_indices indices;
6380
6381 for (unsigned int j = 0; j < nelts_to_build; j++)
6382 {
6383 unsigned int iter_num = j / group_size;
6384 unsigned int stmt_num = j % group_size;
6385 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
6386 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
6387 bitmap_set_bit (used_in_lanes, i);
6388 if (repeating_p)
6389 {
6390 first_vec_index = 0;
6391 mask_element = i;
6392 }
6393 else
6394 {
6395 /* Enforced before the loop when !repeating_p. */
6396 unsigned int const_nunits = nunits.to_constant ();
6397 vec_index = i / const_nunits;
6398 mask_element = i % const_nunits;
6399 if (vec_index == first_vec_index
6400 || first_vec_index == -1)
6401 {
6402 first_vec_index = vec_index;
6403 }
6404 else if (vec_index == second_vec_index
6405 || second_vec_index == -1)
6406 {
6407 second_vec_index = vec_index;
6408 mask_element += const_nunits;
6409 }
6410 else
6411 {
6412 if (dump_enabled_p ())
6413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6414 "permutation requires at "
6415 "least three vectors %G",
6416 stmt_info->stmt);
6417 gcc_assert (analyze_only);
6418 return false;
6419 }
6420
6421 gcc_assert (mask_element < 2 * const_nunits);
6422 }
6423
6424 if (mask_element != index)
6425 noop_p = false;
6426 mask[index++] = mask_element;
6427
6428 if (index == count && !noop_p)
6429 {
6430 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
6431 if (!can_vec_perm_const_p (mode, indices))
6432 {
6433 if (dump_enabled_p ())
6434 {
6435 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
6436 vect_location,
6437 "unsupported vect permute { ");
6438 for (i = 0; i < count; ++i)
6439 {
6440 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
6441 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
6442 }
6443 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
6444 }
6445 gcc_assert (analyze_only);
6446 return false;
6447 }
6448
6449 ++*n_perms;
6450 }
6451
6452 if (index == count)
6453 {
6454 if (!analyze_only)
6455 {
6456 tree mask_vec = NULL_TREE;
6457
6458 if (! noop_p)
6459 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
6460
6461 if (second_vec_index == -1)
6462 second_vec_index = first_vec_index;
6463
6464 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
6465 {
6466 /* Generate the permute statement if necessary. */
6467 tree first_vec = dr_chain[first_vec_index + ri];
6468 tree second_vec = dr_chain[second_vec_index + ri];
6469 gimple *perm_stmt;
6470 if (! noop_p)
6471 {
6472 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6473 tree perm_dest
6474 = vect_create_destination_var (gimple_assign_lhs (stmt),
6475 vectype);
6476 perm_dest = make_ssa_name (perm_dest);
6477 perm_stmt
6478 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6479 first_vec, second_vec,
6480 mask_vec);
6481 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
6482 gsi);
6483 if (dce_chain)
6484 {
6485 bitmap_set_bit (used_defs, first_vec_index + ri);
6486 bitmap_set_bit (used_defs, second_vec_index + ri);
6487 }
6488 }
6489 else
6490 {
6491 /* If mask was NULL_TREE generate the requested
6492 identity transform. */
6493 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
6494 if (dce_chain)
6495 bitmap_set_bit (used_defs, first_vec_index + ri);
6496 }
6497
6498 /* Store the vector statement in NODE. */
6499 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
6500 }
6501 }
6502
6503 index = 0;
6504 first_vec_index = -1;
6505 second_vec_index = -1;
6506 noop_p = true;
6507 }
6508 }
6509
6510 if (n_loads)
6511 {
6512 if (repeating_p)
6513 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6514 else
6515 {
6516 /* Enforced above when !repeating_p. */
6517 unsigned int const_nunits = nunits.to_constant ();
6518 *n_loads = 0;
6519 bool load_seen = false;
6520 for (unsigned i = 0; i < in_nlanes; ++i)
6521 {
6522 if (i % const_nunits == 0)
6523 {
6524 if (load_seen)
6525 *n_loads += 1;
6526 load_seen = false;
6527 }
6528 if (bitmap_bit_p (used_in_lanes, i))
6529 load_seen = true;
6530 }
6531 if (load_seen)
6532 *n_loads += 1;
6533 }
6534 }
6535
6536 if (dce_chain)
6537 for (unsigned i = 0; i < dr_chain.length (); ++i)
6538 if (!bitmap_bit_p (used_defs, i))
6539 {
6540 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
6541 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
6542 gsi_remove (&rgsi, true);
6543 release_defs (stmt);
6544 }
6545
6546 return true;
6547 }
6548
6549 /* Produce the next vector result for SLP permutation NODE by adding a vector
6550 statement at GSI. If MASK_VEC is nonnull, add:
6551
6552 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
6553
6554 otherwise add:
6555
6556 <new SSA name> = FIRST_DEF. */
6557
6558 static void
6559 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6560 slp_tree node, tree first_def, tree second_def,
6561 tree mask_vec)
6562 {
6563 tree vectype = SLP_TREE_VECTYPE (node);
6564
6565 /* ??? We SLP match existing vector element extracts but
6566 allow punning which we need to re-instantiate at uses
6567 but have no good way of explicitly representing. */
6568 if (!types_compatible_p (TREE_TYPE (first_def), vectype))
6569 {
6570 gassign *conv_stmt
6571 = gimple_build_assign (make_ssa_name (vectype),
6572 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
6573 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6574 first_def = gimple_assign_lhs (conv_stmt);
6575 }
6576 gassign *perm_stmt;
6577 tree perm_dest = make_ssa_name (vectype);
6578 if (mask_vec)
6579 {
6580 if (!types_compatible_p (TREE_TYPE (second_def), vectype))
6581 {
6582 gassign *conv_stmt
6583 = gimple_build_assign (make_ssa_name (vectype),
6584 build1 (VIEW_CONVERT_EXPR,
6585 vectype, second_def));
6586 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6587 second_def = gimple_assign_lhs (conv_stmt);
6588 }
6589 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6590 first_def, second_def,
6591 mask_vec);
6592 }
6593 else
6594 /* We need a copy here in case the def was external. */
6595 perm_stmt = gimple_build_assign (perm_dest, first_def);
6596 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
6597 /* Store the vector statement in NODE. */
6598 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
6599 }
6600
6601 /* Vectorize the SLP permutations in NODE as specified
6602 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
6603 child number and lane number.
6604 Interleaving of two two-lane two-child SLP subtrees (not supported):
6605 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
6606 A blend of two four-lane two-child SLP subtrees:
6607 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
6608 Highpart of a four-lane one-child SLP subtree (not supported):
6609 [ { 0, 2 }, { 0, 3 } ]
6610 Currently only a subset is supported by the code generation below. */
6611
6612 static bool
6613 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6614 slp_tree node, stmt_vector_for_cost *cost_vec)
6615 {
6616 tree vectype = SLP_TREE_VECTYPE (node);
6617
6618 /* ??? We currently only support all same vector input and output types
6619 while the SLP IL should really do a concat + select and thus accept
6620 arbitrary mismatches. */
6621 slp_tree child;
6622 unsigned i;
6623 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6624 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
6625 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6626 {
6627 if (!vect_maybe_update_slp_op_vectype (child, vectype)
6628 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
6629 {
6630 if (dump_enabled_p ())
6631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6632 "Unsupported lane permutation\n");
6633 return false;
6634 }
6635 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
6636 repeating_p = false;
6637 }
6638
6639 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
6640 gcc_assert (perm.length () == SLP_TREE_LANES (node));
6641 if (dump_enabled_p ())
6642 {
6643 dump_printf_loc (MSG_NOTE, vect_location,
6644 "vectorizing permutation");
6645 for (unsigned i = 0; i < perm.length (); ++i)
6646 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
6647 if (repeating_p)
6648 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
6649 dump_printf (MSG_NOTE, "\n");
6650 }
6651
6652 /* REPEATING_P is true if every output vector is guaranteed to use the
6653 same permute vector. We can handle that case for both variable-length
6654 and constant-length vectors, but we only handle other cases for
6655 constant-length vectors.
6656
6657 Set:
6658
6659 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
6660 mask vector that we want to build.
6661
6662 - NCOPIES to the number of copies of PERM that we need in order
6663 to build the necessary permute mask vectors.
6664
6665 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
6666 for each permute mask vector. This is only relevant when GSI is
6667 nonnull. */
6668 uint64_t npatterns;
6669 unsigned nelts_per_pattern;
6670 uint64_t ncopies;
6671 unsigned noutputs_per_mask;
6672 if (repeating_p)
6673 {
6674 /* We need a single permute mask vector that has the form:
6675
6676 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
6677
6678 In other words, the original n-element permute in PERM is
6679 "unrolled" to fill a full vector. The stepped vector encoding
6680 that we use for permutes requires 3n elements. */
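/* For example (illustrative only), for the two-lane permute { 1, 0 }
   the encoded mask is { 1, 0, 3, 2, 5, 4 }, i.e. the original permute
   repeated with an offset of n added for each copy.  */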
6681 npatterns = SLP_TREE_LANES (node);
6682 nelts_per_pattern = ncopies = 3;
6683 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6684 }
6685 else
6686 {
6687 /* Calculate every element of every permute mask vector explicitly,
6688 instead of relying on the pattern described above. */
6689 if (!nunits.is_constant (&npatterns))
6690 return false;
6691 nelts_per_pattern = ncopies = 1;
6692 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
6693 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
6694 return false;
6695 noutputs_per_mask = 1;
6696 }
6697 unsigned olanes = ncopies * SLP_TREE_LANES (node);
6698 gcc_assert (repeating_p || multiple_p (olanes, nunits));
6699
6700 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
6701 from the { SLP operand, scalar lane } permutation as recorded in the
6702 SLP node as an intermediate step. This part should already work
6703 with SLP children with an arbitrary number of lanes. */
6704 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
6705 auto_vec<unsigned> active_lane;
6706 vperm.create (olanes);
6707 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
6708 for (unsigned i = 0; i < ncopies; ++i)
6709 {
6710 for (unsigned pi = 0; pi < perm.length (); ++pi)
6711 {
6712 std::pair<unsigned, unsigned> p = perm[pi];
6713 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
6714 if (repeating_p)
6715 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
6716 else
6717 {
6718 /* We checked above that the vectors are constant-length. */
6719 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
6720 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
6721 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
6722 vperm.quick_push ({{p.first, vi}, vl});
6723 }
6724 }
6725 /* Advance to the next group. */
6726 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
6727 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
6728 }
6729
6730 if (dump_enabled_p ())
6731 {
6732 dump_printf_loc (MSG_NOTE, vect_location, "as");
6733 for (unsigned i = 0; i < vperm.length (); ++i)
6734 {
6735 if (i != 0
6736 && (repeating_p
6737 ? multiple_p (i, npatterns)
6738 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
6739 dump_printf (MSG_NOTE, ",");
6740 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
6741 vperm[i].first.first, vperm[i].first.second,
6742 vperm[i].second);
6743 }
6744 dump_printf (MSG_NOTE, "\n");
6745 }
6746
6747 /* We can only handle two-vector permutes; everything else should
6748 be lowered on the SLP level. The following is closely inspired
6749 by vect_transform_slp_perm_load and is supposed to eventually
6750 replace it.
6751 ??? As intermediate step do code-gen in the SLP tree representation
6752 somehow? */
6753 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
6754 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
6755 unsigned int index = 0;
6756 poly_uint64 mask_element;
6757 vec_perm_builder mask;
6758 mask.new_vector (nunits, npatterns, nelts_per_pattern);
6759 unsigned int count = mask.encoded_nelts ();
6760 mask.quick_grow (count);
6761 vec_perm_indices indices;
6762 unsigned nperms = 0;
6763 for (unsigned i = 0; i < vperm.length (); ++i)
6764 {
6765 mask_element = vperm[i].second;
6766 if (first_vec.first == -1U
6767 || first_vec == vperm[i].first)
6768 first_vec = vperm[i].first;
6769 else if (second_vec.first == -1U
6770 || second_vec == vperm[i].first)
6771 {
6772 second_vec = vperm[i].first;
6773 mask_element += nunits;
6774 }
6775 else
6776 {
6777 if (dump_enabled_p ())
6778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6779 "permutation requires at "
6780 "least three vectors\n");
6781 gcc_assert (!gsi);
6782 return false;
6783 }
6784
6785 mask[index++] = mask_element;
6786
6787 if (index == count)
6788 {
6789 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits);
6790 bool identity_p = indices.series_p (0, 1, 0, 1);
6791 if (!identity_p
6792 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6793 {
6794 if (dump_enabled_p ())
6795 {
6796 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
6797 vect_location,
6798 "unsupported vect permute { ");
6799 for (i = 0; i < count; ++i)
6800 {
6801 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
6802 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
6803 }
6804 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
6805 }
6806 gcc_assert (!gsi);
6807 return false;
6808 }
6809
6810 if (!identity_p)
6811 nperms++;
6812 if (gsi)
6813 {
6814 if (second_vec.first == -1U)
6815 second_vec = first_vec;
6816
6817 slp_tree
6818 first_node = SLP_TREE_CHILDREN (node)[first_vec.first],
6819 second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
6820
6821 tree mask_vec = NULL_TREE;
6822 if (!identity_p)
6823 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
6824
6825 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
6826 {
6827 tree first_def
6828 = vect_get_slp_vect_def (first_node,
6829 first_vec.second + vi);
6830 tree second_def
6831 = vect_get_slp_vect_def (second_node,
6832 second_vec.second + vi);
6833 vect_add_slp_permutation (vinfo, gsi, node, first_def,
6834 second_def, mask_vec);
6835 }
6836 }
6837
6838 index = 0;
6839 first_vec = std::make_pair (-1U, -1U);
6840 second_vec = std::make_pair (-1U, -1U);
6841 }
6842 }
6843
6844 if (!gsi)
6845 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
6846
6847 return true;
6848 }
6849
6850 /* Vectorize SLP NODE. */
6851
6852 static void
6853 vect_schedule_slp_node (vec_info *vinfo,
6854 slp_tree node, slp_instance instance)
6855 {
6856 gimple_stmt_iterator si;
6857 int i;
6858 slp_tree child;
6859
6860 /* For existing vectors there's nothing to do. */
6861 if (SLP_TREE_VEC_DEFS (node).exists ())
6862 return;
6863
6864 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
6865
6866 /* Vectorize externals and constants. */
6867 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
6868 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6869 {
6870 /* ??? vectorizable_shift can end up using a scalar operand which is
6871 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
6872 node in this case. */
6873 if (!SLP_TREE_VECTYPE (node))
6874 return;
6875
6876 vect_create_constant_vectors (vinfo, node);
6877 return;
6878 }
6879
6880 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6881
6882 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
6883 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6884
6885 if (dump_enabled_p ())
6886 dump_printf_loc (MSG_NOTE, vect_location,
6887 "------>vectorizing SLP node starting from: %G",
6888 stmt_info->stmt);
6889
6890 if (STMT_VINFO_DATA_REF (stmt_info)
6891 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
6892 {
6893 /* Vectorized loads go before the first scalar load to make it
6894 ready early; vectorized stores go before the last scalar
6895 stmt, which is where all uses are ready. */
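/* For example (illustrative only), a group of four scalar int loads
from a[0] ... a[3] has its single V4SI load emitted before the first
scalar load, while a vector store for a group of stores is emitted at
the last scalar store, where all stored values are available.  */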
6896 stmt_vec_info last_stmt_info = NULL;
6897 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
6898 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
6899 else /* DR_IS_WRITE */
6900 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
6901 si = gsi_for_stmt (last_stmt_info->stmt);
6902 }
6903 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
6904 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
6905 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
6906 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
6907 {
6908 /* For PHI node vectorization we do not use the insertion iterator. */
6909 si = gsi_none ();
6910 }
6911 else
6912 {
6913 /* Emit other stmts after the children's vectorized defs, which
6914 is the earliest position possible. */
6915 gimple *last_stmt = NULL;
6916 bool seen_vector_def = false;
6917 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6918 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6919 {
6920 /* For fold-left reductions we are retaining the scalar
6921 reduction PHI but we still have SLP_TREE_NUMBER_OF_VEC_STMTS
6922 set, so the representation isn't perfect. Resort to the
6923 last scalar def here. */
6924 if (SLP_TREE_VEC_STMTS (child).is_empty ())
6925 {
6926 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
6927 == cycle_phi_info_type);
6928 gphi *phi = as_a <gphi *>
6929 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
6930 if (!last_stmt
6931 || vect_stmt_dominates_stmt_p (last_stmt, phi))
6932 last_stmt = phi;
6933 }
6934 /* We are emitting all vectorized stmts in the same place, so
6935 the last one emitted is the latest.
6936 ??? Unless we have a load permutation applied that happens
6937 to re-use an earlier generated load. */
6938 unsigned j;
6939 gimple *vstmt;
6940 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
6941 if (!last_stmt
6942 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
6943 last_stmt = vstmt;
6944 }
6945 else if (!SLP_TREE_VECTYPE (child))
6946 {
6947 /* For unvectorized externals we have to look at all the scalar defs. */
6948 unsigned j;
6949 tree def;
6950 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
6951 if (TREE_CODE (def) == SSA_NAME
6952 && !SSA_NAME_IS_DEFAULT_DEF (def))
6953 {
6954 gimple *stmt = SSA_NAME_DEF_STMT (def);
6955 if (!last_stmt
6956 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
6957 last_stmt = stmt;
6958 }
6959 }
6960 else
6961 {
6962 /* For externals we have to look at all defs since their
6963 insertion place is decided per vector. But beware
6964 of pre-existing vectors where we need to make sure
6965 we do not insert before the region boundary. */
6966 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
6967 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
6968 seen_vector_def = true;
6969 else
6970 {
6971 unsigned j;
6972 tree vdef;
6973 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
6974 if (TREE_CODE (vdef) == SSA_NAME
6975 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
6976 {
6977 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
6978 if (!last_stmt
6979 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
6980 last_stmt = vstmt;
6981 }
6982 }
6983 }
6984 /* This can happen when all children are pre-existing vectors or
6985 constants. */
6986 if (!last_stmt)
6987 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
6988 if (!last_stmt)
6989 {
6990 gcc_assert (seen_vector_def);
6991 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
6992 }
6993 else if (is_a <gphi *> (last_stmt))
6994 si = gsi_after_labels (gimple_bb (last_stmt));
6995 else
6996 {
6997 si = gsi_for_stmt (last_stmt);
6998 gsi_next (&si);
6999 }
7000 }
7001
7002 bool done_p = false;
7003
7004 /* Handle purely internal nodes. */
7005 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7006 {
7007 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
7008 be shared with different SLP nodes (but usually it's the same
7009 operation, apart from the case where the stmt is only there to
7010 denote the actual scalar lane defs ...). So do not call
7011 vect_transform_stmt but open-code it here (partly). */
7012 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
7013 gcc_assert (done);
7014 done_p = true;
7015 }
7016 if (!done_p)
7017 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
7018 }
7019
7020 /* Replace the scalar calls in SLP node NODE with assignments setting
7021 their lhs to zero. For loop vectorization this is done in
7022 vectorizable_call, but for SLP it needs to be deferred until the end of
7023 vect_schedule_slp, because multiple SLP instances may refer to the same scalar stmt. */
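/* Illustrative example (added commentary): a scalar call such as
_1 = sqrtf (_2) whose uses have all been vectorized is rewritten to
_1 = 0.0f, leaving a trivially dead definition for later DCE to
clean up.  */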
7024
7025 static void
7026 vect_remove_slp_scalar_calls (vec_info *vinfo,
7027 slp_tree node, hash_set<slp_tree> &visited)
7028 {
7029 gimple *new_stmt;
7030 gimple_stmt_iterator gsi;
7031 int i;
7032 slp_tree child;
7033 tree lhs;
7034 stmt_vec_info stmt_info;
7035
7036 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7037 return;
7038
7039 if (visited.add (node))
7040 return;
7041
7042 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7043 vect_remove_slp_scalar_calls (vinfo, child, visited);
7044
7045 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7046 {
7047 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
7048 if (!stmt || gimple_bb (stmt) == NULL)
7049 continue;
7050 if (is_pattern_stmt_p (stmt_info)
7051 || !PURE_SLP_STMT (stmt_info))
7052 continue;
7053 lhs = gimple_call_lhs (stmt);
7054 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
7055 gsi = gsi_for_stmt (stmt);
7056 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
7057 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
7058 }
7059 }
7060
7061 static void
7062 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
7063 {
7064 hash_set<slp_tree> visited;
7065 vect_remove_slp_scalar_calls (vinfo, node, visited);
7066 }
7067
7068 /* Vectorize the instance root. */
7069
7070 void
7071 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
7072 {
7073 gassign *rstmt = NULL;
7074
7075 if (instance->kind == slp_inst_kind_ctor)
7076 {
7077 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
7078 {
7079 gimple *child_stmt;
7080 int j;
7081
7082 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
7083 {
7084 tree vect_lhs = gimple_get_lhs (child_stmt);
7085 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7086 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
7087 TREE_TYPE (vect_lhs)))
7088 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
7089 vect_lhs);
7090 rstmt = gimple_build_assign (root_lhs, vect_lhs);
7091 break;
7092 }
7093 }
7094 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
7095 {
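/* With multiple vector stmts the original scalar CONSTRUCTOR is
rebuilt from the vector defs, e.g. (illustrative) lhs = { v0, v1 }
where v0 and v1 are the lhs of the vectorized stmts.  */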
7096 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
7097 gimple *child_stmt;
7098 int j;
7099 vec<constructor_elt, va_gc> *v;
7100 vec_alloc (v, nelts);
7101
7102 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
7103 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
7104 gimple_get_lhs (child_stmt));
7105 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7106 tree rtype
7107 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
7108 tree r_constructor = build_constructor (rtype, v);
7109 rstmt = gimple_build_assign (lhs, r_constructor);
7110 }
7111 }
7112 else if (instance->kind == slp_inst_kind_bb_reduc)
7113 {
7114 /* Largely inspired by reduction chain epilogue handling in
7115 vect_create_epilog_for_reduction. */
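/* Sketch of the generated epilogue (illustrative only), assuming a
PLUS reduction with two vector defs v0 and v1:
tmp_1 = v0 + v1;
scalar_2 = .REDUC_PLUS (tmp_1);
scalar_2 then replaces the rhs of the original root stmt.  */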
7116 vec<tree> vec_defs = vNULL;
7117 vect_get_slp_defs (node, &vec_defs);
7118 enum tree_code reduc_code
7119 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
7120 /* ??? We actually have to reflect signs somewhere. */
7121 if (reduc_code == MINUS_EXPR)
7122 reduc_code = PLUS_EXPR;
7123 gimple_seq epilogue = NULL;
7124 /* We may end up with more than one vector result; reduce them
7125 to a single vector. */
7126 tree vec_def = vec_defs[0];
7127 for (unsigned i = 1; i < vec_defs.length (); ++i)
7128 vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
7129 vec_def, vec_defs[i]);
7130 vec_defs.release ();
7131 /* ??? Support schemes other than a direct internal fn. */
7132 internal_fn reduc_fn;
7133 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
7134 || reduc_fn == IFN_LAST)
7135 gcc_unreachable ();
7136 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
7137 TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
7138
7139 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7140 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
7141 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
7142 update_stmt (gsi_stmt (rgsi));
7143 return;
7144 }
7145 else
7146 gcc_unreachable ();
7147
7148 gcc_assert (rstmt);
7149
7150 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7151 gsi_replace (&rgsi, rstmt, true);
7152 }
7153
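/* Added commentary: vect_schedule_scc below performs a Tarjan-style
SCC discovery; the fields below record the DFS pre-order number, the
lowest DFS number reachable from the node and whether the node is
still on the walk stack.  */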
7154 struct slp_scc_info
7155 {
7156 bool on_stack;
7157 int dfs;
7158 int lowlink;
7159 };
7160
7161 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
7162
7163 static void
7164 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
7165 hash_map<slp_tree, slp_scc_info> &scc_info,
7166 int &maxdfs, vec<slp_tree> &stack)
7167 {
7168 bool existed_p;
7169 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
7170 gcc_assert (!existed_p);
7171 info->dfs = maxdfs;
7172 info->lowlink = maxdfs;
7173 maxdfs++;
7174
7175 /* Leaf. */
7176 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7177 {
7178 info->on_stack = false;
7179 vect_schedule_slp_node (vinfo, node, instance);
7180 return;
7181 }
7182
7183 info->on_stack = true;
7184 stack.safe_push (node);
7185
7186 unsigned i;
7187 slp_tree child;
7188 /* DFS recurse. */
7189 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7190 {
7191 if (!child)
7192 continue;
7193 slp_scc_info *child_info = scc_info.get (child);
7194 if (!child_info)
7195 {
7196 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
7197 /* Recursion might have re-allocated the hash map, invalidating the entry pointers. */
7198 info = scc_info.get (node);
7199 child_info = scc_info.get (child);
7200 info->lowlink = MIN (info->lowlink, child_info->lowlink);
7201 }
7202 else if (child_info->on_stack)
7203 info->lowlink = MIN (info->lowlink, child_info->dfs);
7204 }
7205 if (info->lowlink != info->dfs)
7206 return;
7207
7208 auto_vec<slp_tree, 4> phis_to_fixup;
7209
7210 /* Singleton. */
7211 if (stack.last () == node)
7212 {
7213 stack.pop ();
7214 info->on_stack = false;
7215 vect_schedule_slp_node (vinfo, node, instance);
7216 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
7217 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
7218 phis_to_fixup.quick_push (node);
7219 }
7220 else
7221 {
7222 /* SCC. */
7223 int last_idx = stack.length () - 1;
7224 while (stack[last_idx] != node)
7225 last_idx--;
7226 /* We can break the cycle at PHIs that have at least one
7227 code-generated child. Then we could re-start the DFS walk until
7228 all nodes in the SCC are covered (we might have new entries
7229 for only back-reachable nodes). But it's simpler to just
7230 iterate and schedule those that are ready. */
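/* For instance (added commentary), in a simple reduction cycle
PHI -> add -> PHI the PHI is ready first because its preheader
child was already code generated as a leaf, and the add becomes
ready once the PHI has left the stack.  */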
7231 unsigned todo = stack.length () - last_idx;
7232 do
7233 {
7234 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
7235 {
7236 slp_tree entry = stack[idx];
7237 if (!entry)
7238 continue;
7239 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
7240 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
7241 bool ready = !phi;
7242 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
7243 if (!child)
7244 {
7245 gcc_assert (phi);
7246 ready = true;
7247 break;
7248 }
7249 else if (scc_info.get (child)->on_stack)
7250 {
7251 if (!phi)
7252 {
7253 ready = false;
7254 break;
7255 }
7256 }
7257 else
7258 {
7259 if (phi)
7260 {
7261 ready = true;
7262 break;
7263 }
7264 }
7265 if (ready)
7266 {
7267 vect_schedule_slp_node (vinfo, entry, instance);
7268 scc_info.get (entry)->on_stack = false;
7269 stack[idx] = NULL;
7270 todo--;
7271 if (phi)
7272 phis_to_fixup.safe_push (entry);
7273 }
7274 }
7275 }
7276 while (todo != 0);
7277
7278 /* Pop the SCC. */
7279 stack.truncate (last_idx);
7280 }
7281
7282 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
7283 slp_tree phi_node;
7284 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
7285 {
7286 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
7287 edge_iterator ei;
7288 edge e;
7289 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
7290 {
7291 unsigned dest_idx = e->dest_idx;
7292 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
7293 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
7294 continue;
7295 /* Simply fill all args. */
7296 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
7297 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
7298 vect_get_slp_vect_def (child, i),
7299 e, gimple_phi_arg_location (phi, dest_idx));
7300 }
7301 }
7302 }
7303
7304 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
7305
7306 void
7307 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
7308 {
7309 slp_instance instance;
7310 unsigned int i;
7311
7312 hash_map<slp_tree, slp_scc_info> scc_info;
7313 int maxdfs = 0;
7314 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7315 {
7316 slp_tree node = SLP_INSTANCE_TREE (instance);
7317 if (dump_enabled_p ())
7318 {
7319 dump_printf_loc (MSG_NOTE, vect_location,
7320 "Vectorizing SLP tree:\n");
7321 /* ??? Dump all? */
7322 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7323 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
7324 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
7325 vect_print_slp_graph (MSG_NOTE, vect_location,
7326 SLP_INSTANCE_TREE (instance));
7327 }
7328 /* Schedule the tree of INSTANCE, scheduling SCCs so that a PHI
7329 is the node breaking the cycle. */
7330 auto_vec<slp_tree> stack;
7331 if (!scc_info.get (node))
7332 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
7333
7334 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7335 vectorize_slp_instance_root_stmt (node, instance);
7336
7337 if (dump_enabled_p ())
7338 dump_printf_loc (MSG_NOTE, vect_location,
7339 "vectorizing stmts using SLP.\n");
7340 }
7341
7342 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7343 {
7344 slp_tree root = SLP_INSTANCE_TREE (instance);
7345 stmt_vec_info store_info;
7346 unsigned int j;
7347
7348 /* Remove scalar call stmts. Do not do this for basic-block
7349 vectorization as not all uses may be vectorized.
7350 ??? Why should this be necessary? DCE should be able to
7351 remove the stmts itself.
7352 ??? For BB vectorization we could as well remove scalar
7353 stmts starting from the SLP tree root if they have no
7354 uses. */
7355 if (is_a <loop_vec_info> (vinfo))
7356 vect_remove_slp_scalar_calls (vinfo, root);
7357
7358 /* Remove the original scalar stmts of vectorized stores. */
7359 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
7360 {
7361 if (!STMT_VINFO_DATA_REF (store_info)
7362 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
7363 break;
7364
7365 store_info = vect_orig_stmt (store_info);
7366 /* Free the attached stmt_vec_info and remove the stmt. */
7367 vinfo->remove_stmt (store_info);
7368
7369 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
7370 so we do not crash in vect_free_slp_tree later. */
7371 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
7372 SLP_TREE_REPRESENTATIVE (root) = NULL;
7373 }
7374 }
7375 }