]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/tree-vect-slp.c
Add x86 addsub SLP pattern
[thirdparty/gcc.git] / gcc / tree-vect-slp.c
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
52
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 slp_tree, stmt_vector_for_cost *);
55 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
56
57 static object_allocator<_slp_tree> *slp_tree_pool;
58 static slp_tree slp_first_node;
59
/* Set up the global allocation pool that backs operator new/delete of
   _slp_tree nodes.  Must run before any SLP node is created.  */
void
vect_slp_init (void)
{
  slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
}
65
66 void
67 vect_slp_fini (void)
68 {
69 while (slp_first_node)
70 delete slp_first_node;
71 delete slp_tree_pool;
72 slp_tree_pool = NULL;
73 }
74
/* Allocate storage for one _slp_tree from the global SLP node pool.
   N is checked against the node size since the pool hands out
   fixed-size raw chunks.  */
void *
_slp_tree::operator new (size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  return slp_tree_pool->allocate_raw ();
}
81
/* Return the storage of one _slp_tree to the global SLP node pool.  */
void
_slp_tree::operator delete (void *node, size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  slp_tree_pool->remove_raw (node);
}
88
89
/* Initialize a SLP node.  Links the node onto the head of the global
   doubly-linked node list and clears all fields to their empty state.  */

_slp_tree::_slp_tree ()
{
  /* Push onto the front of the global node list so vect_slp_fini can
     reclaim any nodes that are still live at the end.  */
  this->prev_node = NULL;
  if (slp_first_node)
    slp_first_node->prev_node = this;
  this->next_node = slp_first_node;
  slp_first_node = this;
  /* Start with all vectors empty and scalar fields zeroed.  */
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_STMTS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  /* A fresh node carries one reference, owned by the creator.  */
  SLP_TREE_REF_COUNT (this) = 1;
  this->failed = NULL;
  this->max_nunits = 1;
  this->lanes = 0;
}
116
117 /* Tear down a SLP node. */
118
119 _slp_tree::~_slp_tree ()
120 {
121 if (this->prev_node)
122 this->prev_node->next_node = this->next_node;
123 else
124 slp_first_node = this->next_node;
125 if (this->next_node)
126 this->next_node->prev_node = this->prev_node;
127 SLP_TREE_CHILDREN (this).release ();
128 SLP_TREE_SCALAR_STMTS (this).release ();
129 SLP_TREE_SCALAR_OPS (this).release ();
130 SLP_TREE_VEC_STMTS (this).release ();
131 SLP_TREE_VEC_DEFS (this).release ();
132 SLP_TREE_LOAD_PERMUTATION (this).release ();
133 SLP_TREE_LANE_PERMUTATION (this).release ();
134 if (this->failed)
135 free (failed);
136 }
137
/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   Nodes are reference-counted; only the last reference actually
   releases the node and recurses into its children.  */

void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  /* Drop one reference; other owners keep the node alive.  */
  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_free_slp_tree (child);

  /* If the node defines any SLP only patterns then those patterns are no
     longer valid and should be removed.  */
  stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
  if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
    {
      /* Revert the original stmt to its non-pattern state.  */
      stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
      STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
      STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
    }

  delete node;
}
165
166 /* Return a location suitable for dumpings related to the SLP instance. */
167
168 dump_user_location_t
169 _slp_instance::location () const
170 {
171 if (!root_stmts.is_empty ())
172 return root_stmts[0]->stmt;
173 else
174 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
175 }
176
177
/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  /* Drop the instance's reference to its SLP tree (which may free it).  */
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_ROOT_STMTS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}
190
191
192 /* Create an SLP node for SCALAR_STMTS. */
193
194 slp_tree
195 vect_create_new_slp_node (unsigned nops, tree_code code)
196 {
197 slp_tree node = new _slp_tree;
198 SLP_TREE_SCALAR_STMTS (node) = vNULL;
199 SLP_TREE_CHILDREN (node).create (nops);
200 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
201 SLP_TREE_CODE (node) = code;
202 return node;
203 }
204 /* Create an SLP node for SCALAR_STMTS. */
205
206 static slp_tree
207 vect_create_new_slp_node (slp_tree node,
208 vec<stmt_vec_info> scalar_stmts, unsigned nops)
209 {
210 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
211 SLP_TREE_CHILDREN (node).create (nops);
212 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
213 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
214 SLP_TREE_LANES (node) = scalar_stmts.length ();
215 return node;
216 }
217
/* Create an SLP node for SCALAR_STMTS, allocating the node itself from
   the SLP node pool.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
}
225
226 /* Create an SLP node for OPS. */
227
228 static slp_tree
229 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
230 {
231 SLP_TREE_SCALAR_OPS (node) = ops;
232 SLP_TREE_DEF_TYPE (node) = vect_external_def;
233 SLP_TREE_LANES (node) = ops.length ();
234 return node;
235 }
236
/* Create an SLP node for OPS, allocating the node itself from the SLP
   node pool.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  return vect_create_new_slp_node (new _slp_tree, ops);
}
244
245
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands, one entry per scalar stmt in the group.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands, one entry per scalar stmt in the group.  */
  vec<tree> ops;
  /* Scalar type of this operand in the first stmt of the group.  */
  tree first_op_type;
  /* Vector def-type of this operand in the first stmt of the group.  */
  enum vect_def_type first_dt;
  /* Whether any collected def is a pattern stmt (set while gathering
     defs in vect_get_and_check_slp_defs).  */
  bool any_pattern;
} *slp_oprnd_info;
262
263
264 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
265 operand. */
266 static vec<slp_oprnd_info>
267 vect_create_oprnd_info (int nops, int group_size)
268 {
269 int i;
270 slp_oprnd_info oprnd_info;
271 vec<slp_oprnd_info> oprnds_info;
272
273 oprnds_info.create (nops);
274 for (i = 0; i < nops; i++)
275 {
276 oprnd_info = XNEW (struct _slp_oprnd_info);
277 oprnd_info->def_stmts.create (group_size);
278 oprnd_info->ops.create (group_size);
279 oprnd_info->first_dt = vect_uninitialized_def;
280 oprnd_info->first_op_type = NULL_TREE;
281 oprnd_info->any_pattern = false;
282 oprnds_info.quick_push (oprnd_info);
283 }
284
285 return oprnds_info;
286 }
287
288
289 /* Free operands info. */
290
291 static void
292 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
293 {
294 int i;
295 slp_oprnd_info oprnd_info;
296
297 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
298 {
299 oprnd_info->def_stmts.release ();
300 oprnd_info->ops.release ();
301 XDELETE (oprnd_info);
302 }
303
304 oprnds_info.release ();
305 }
306
307
308 /* Return true if STMTS contains a pattern statement. */
309
310 static bool
311 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
312 {
313 stmt_vec_info stmt_info;
314 unsigned int i;
315 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
316 if (is_pattern_stmt_p (stmt_info))
317 return true;
318 return false;
319 }
320
321 /* Return true when all lanes in the external or constant NODE have
322 the same value. */
323
324 static bool
325 vect_slp_tree_uniform_p (slp_tree node)
326 {
327 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
328 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
329
330 /* Pre-exsting vectors. */
331 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
332 return false;
333
334 unsigned i;
335 tree op, first = NULL_TREE;
336 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
337 if (!first)
338 first = op;
339 else if (!operand_equal_p (first, op, 0))
340 return false;
341
342 return true;
343 }
344
345 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
346 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
347 of the chain. */
348
349 int
350 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
351 stmt_vec_info first_stmt_info)
352 {
353 stmt_vec_info next_stmt_info = first_stmt_info;
354 int result = 0;
355
356 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
357 return -1;
358
359 do
360 {
361 if (next_stmt_info == stmt_info)
362 return result;
363 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
364 if (next_stmt_info)
365 result += DR_GROUP_GAP (next_stmt_info);
366 }
367 while (next_stmt_info);
368
369 return -1;
370 }
371
/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  /* Search for an integer mode that can fuse COUNT / NVECTORS scalar
     elements into one; each iteration halves the fused element size and
     doubles NVECTORS.  */
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode)))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      /* Build the two interleaving permutations (low and high
	         halves) as 3-pattern encodings.  */
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      /* Both permutations must be supported by the target.  */
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      /* Halve the fused element size; give up when it no longer divides.  */
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}
446
447 /* Return true if DTA and DTB match. */
448
449 static bool
450 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
451 {
452 return (dta == dtb
453 || ((dta == vect_external_def || dta == vect_constant_def)
454 && (dtb == vect_external_def || dtb == vect_constant_def)));
455 }
456
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
   If there is any operand swap in this function, *SWAP is set to non-zero
   value.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of father node of this one, return 1; if everything is
   ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
			     bool *skip_args,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  /* The first stmt of the group establishes the reference operand
     types/def-types all later stmts are matched against.  */
  bool first = stmt_num == 0;

  /* Determine the operand count and where operands live depending on
     the statement kind.  */
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);

	  /* Masked load, only look at mask.  */
	  if (ifn == IFN_MASK_LOAD)
	    {
	      number_of_oprnds = 1;
	      /* Mask operand index.  */
	      first_op_idx = 5;
	    }
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
	 could result in different comparison code to the first stmt.  */
      if (code == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    number_of_oprnds = gimple_phi_num_args (stmt);
  else
    return -1;

  bool swapped = (swap != 0);
  bool backedge = false;
  gcc_assert (!swapped || first_op_cond);
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  /* First pass: collect each operand and its def-stmt/def-type.  */
  for (i = 0; i < number_of_oprnds; i++)
    {
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
					     first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt_info->stmt, map[i]);
	}
      else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
	{
	  oprnd = gimple_phi_arg_def (stmt, i);
	  /* Remember whether this PHI argument arrives over a backedge.  */
	  backedge = dominated_by_p (CDI_DOMINATORS,
				     gimple_phi_arg_edge (stmt, i)->src,
				     gimple_bb (stmt_info->stmt));
	}
      else
	oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      /* Arguments the caller asked us to ignore are recorded as empty
	 placeholder lanes.  */
      if (skip_args[i])
	{
	  oprnd_info->def_stmts.quick_push (NULL);
	  oprnd_info->ops.quick_push (NULL_TREE);
	  oprnd_info->first_dt = vect_uninitialized_def;
	  continue;
	}

      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

      if (def_stmt_info
	  && is_pattern_stmt_p (def_stmt_info))
	{
	  if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
	      != def_stmt_info)
	    oprnd_info->any_pattern = true;
	  else
	    /* If we promote this to external use the original stmt def.  */
	    oprnd_info->ops.last ()
	      = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
	}

      /* If there's a extern def on a backedge make sure we can
	 code-generate at the region start.
	 ??? This is another case that could be fixed by adjusting
	 how we split the function but at the moment we'd have conflicting
	 goals there.  */
      if (backedge
	  && dts[i] == vect_external_def
	  && is_a <bb_vec_info> (vinfo)
	  && TREE_CODE (oprnd) == SSA_NAME
	  && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
	  && !dominated_by_p (CDI_DOMINATORS,
			      as_a <bb_vec_info> (vinfo)->bbs[0],
			      gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: extern def %T only defined "
			     "on backedge\n", oprnd);
	  return -1;
	}

      if (first)
	{
	  tree type = TREE_TYPE (oprnd);
	  dt = dts[i];
	  /* Variable-length vectors can only materialize constant or
	     external defs via duplicate_and_interleave.  */
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
						      type)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }

	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in a SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dts[i] = dt = vect_reduction_def;

	  /* Check the types of the definition.  */
	  switch (dt)
	    {
	    case vect_external_def:
	    case vect_constant_def:
	    case vect_internal_def:
	    case vect_reduction_def:
	    case vect_induction_def:
	    case vect_nested_cycle:
	      break;

	    default:
	      /* FORNOW: Not supported.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: illegal type of def %T\n",
				 oprnd);
	      return -1;
	    }

	  /* Record the reference type/def-type for later stmts.  */
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = type;
	}
    }
  if (first)
    return 0;

  /* Now match the operand definition types to that of the first stmt.  */
  for (i = 0; i < number_of_oprnds;)
    {
      if (skip_args[i])
	{
	  ++i;
	  continue;
	}

      oprnd_info = (*oprnds_info)[i];
      dt = dts[i];
      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: different operand types\n");
	  return 1;
	}

      /* Not first stmt of the group, check that the def-stmt/s match
	 the def-stmt/s of the first stmt.  Allow different definition
	 types for reduction chains: the first stmt must be a
	 vect_reduction_def (a phi node), and the rest
	 end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
	   && !(oprnd_info->first_dt == vect_reduction_def
		&& !STMT_VINFO_DATA_REF (stmt_info)
		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		&& def_stmt_info
		&& !STMT_VINFO_DATA_REF (def_stmt_info)
		&& (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
	  || (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && ((!def_stmt_info
		   || STMT_VINFO_DATA_REF (def_stmt_info)
		   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		  != (oprnd_info->first_dt != vect_reduction_def))))
	{
	  /* Try swapping operands if we got a mismatch.  For BB
	     vectorization only in case it will clearly improve things.  */
	  if (i == commutative_op && !swapped
	      && (!is_a <bb_vec_info> (vinfo)
		  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
					     dts[i+1])
		      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
			  || vect_def_types_match
			       ((*oprnds_info)[i+1]->first_dt, dts[i])))))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "trying swapped operands\n");
	      /* Swap this lane between operand I and I+1, then re-check
		 operand I (no ++i).  */
	      std::swap (dts[i], dts[i+1]);
	      std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
			 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
	      std::swap ((*oprnds_info)[i]->ops[stmt_num],
			 (*oprnds_info)[i+1]->ops[stmt_num]);
	      swapped = true;
	      continue;
	    }

	  if (is_a <bb_vec_info> (vinfo)
	      && !oprnd_info->any_pattern)
	    {
	      /* Now for commutative ops we should see whether we can
		 make the other operand matching.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "treating operand as external\n");
	      oprnd_info->first_dt = dt = vect_external_def;
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");
	      return 1;
	    }
	}

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
	oprnd_info->first_dt = vect_external_def;
      /* For a SLP reduction chain we want to duplicate the reduction to
	 each of the chain members.  That gets us a sane SLP graph (still
	 the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
		|| dt == vect_reduction_def)
	       && oprnd_info->first_dt == vect_reduction_def
	       && !STMT_VINFO_DATA_REF (stmt_info)
	       && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	       && !STMT_VINFO_DATA_REF (def_stmt_info)
	       && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	{
	  oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
	  oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
	}

      ++i;
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  return 0;
}
781
/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
   Return true if we can, meaning that this choice doesn't conflict with
   existing SLP nodes that use STMT_INFO.  */

bool
vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
{
  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
  if (old_vectype)
    /* A vector type was already chosen; VECTYPE must be compatible.  */
    return useless_type_conversion_p (vectype, old_vectype);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* We maintain the invariant that if any statement in the group is
	 used, all other members of the group have the same vector type.  */
      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      stmt_vec_info member_info = first_info;
      /* Look for a pattern member whose already-set type conflicts.  */
      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	if (is_pattern_stmt_p (member_info)
	    && !useless_type_conversion_p (vectype,
					   STMT_VINFO_VECTYPE (member_info)))
	  break;

      /* No conflict found: install VECTYPE on every group member.  */
      if (!member_info)
	{
	  for (member_info = first_info; member_info;
	       member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	    STMT_VINFO_VECTYPE (member_info) = vectype;
	  return true;
	}
    }
  else if (!is_pattern_stmt_p (stmt_info))
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype;
      return true;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "Build SLP failed: incompatible vector"
		       " types for: %G", stmt_info->stmt);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  old vector type: %T\n", old_vectype);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  new vector type: %T\n", vectype);
    }
  return false;
}
831
832 /* Return true if call statements CALL1 and CALL2 are similar enough
833 to be combined into the same SLP group. */
834
835 static bool
836 compatible_calls_p (gcall *call1, gcall *call2)
837 {
838 unsigned int nargs = gimple_call_num_args (call1);
839 if (nargs != gimple_call_num_args (call2))
840 return false;
841
842 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
843 return false;
844
845 if (gimple_call_internal_p (call1))
846 {
847 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
848 TREE_TYPE (gimple_call_lhs (call2))))
849 return false;
850 for (unsigned int i = 0; i < nargs; ++i)
851 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
852 TREE_TYPE (gimple_call_arg (call2, i))))
853 return false;
854 }
855 else
856 {
857 if (!operand_equal_p (gimple_call_fn (call1),
858 gimple_call_fn (call2), 0))
859 return false;
860
861 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
862 return false;
863 }
864 return true;
865 }
866
867 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
868 caller's attempt to find the vector type in STMT_INFO with the narrowest
869 element type. Return true if VECTYPE is nonnull and if it is valid
870 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
871 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
872 vect_build_slp_tree. */
873
874 static bool
875 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
876 unsigned int group_size,
877 tree vectype, poly_uint64 *max_nunits)
878 {
879 if (!vectype)
880 {
881 if (dump_enabled_p ())
882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
883 "Build SLP failed: unsupported data-type in %G\n",
884 stmt_info->stmt);
885 /* Fatal mismatch. */
886 return false;
887 }
888
889 /* If populating the vector type requires unrolling then fail
890 before adjusting *max_nunits for basic-block vectorization. */
891 if (is_a <bb_vec_info> (vinfo)
892 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
893 {
894 if (dump_enabled_p ())
895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
896 "Build SLP failed: unrolling required "
897 "in basic block SLP\n");
898 /* Fatal mismatch. */
899 return false;
900 }
901
902 /* In case of multiple types we need to detect the smallest type. */
903 vect_update_max_nunits (max_nunits, vectype);
904 return true;
905 }
906
907 /* Verify if the scalar stmts STMTS are isomorphic, require data
908 permutation or are of unsupported types of operation. Return
909 true if they are, otherwise return false and indicate in *MATCHES
910 which stmts are not isomorphic to the first one. If MATCHES[0]
911 is false then this indicates the comparison could not be
912 carried out or the stmts will never be vectorized by SLP.
913
914 Note COND_EXPR is possibly isomorphic to another one after swapping its
915 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
916 the first stmt by swapping the two operands of comparison; set SWAP[i]
 917    to 2 if stmt I is isomorphic to the first stmt by inverting the code
 918    of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
919 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
920
921 static bool
922 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
923 vec<stmt_vec_info> stmts, unsigned int group_size,
924 poly_uint64 *max_nunits, bool *matches,
925 bool *two_operators, tree *node_vectype)
926 {
927 unsigned int i;
928 stmt_vec_info first_stmt_info = stmts[0];
929 enum tree_code first_stmt_code = ERROR_MARK;
930 enum tree_code alt_stmt_code = ERROR_MARK;
931 enum tree_code rhs_code = ERROR_MARK;
932 enum tree_code first_cond_code = ERROR_MARK;
933 tree lhs;
934 bool need_same_oprnds = false;
935 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
936 optab optab;
937 int icode;
938 machine_mode optab_op2_mode;
939 machine_mode vec_mode;
940 stmt_vec_info first_load = NULL, prev_first_load = NULL;
941 bool first_stmt_load_p = false, load_p = false;
942 bool first_stmt_phi_p = false, phi_p = false;
943 bool maybe_soft_fail = false;
944 tree soft_fail_nunits_vectype = NULL_TREE;
945
946 /* For every stmt in NODE find its def stmt/s. */
947 stmt_vec_info stmt_info;
948 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
949 {
950 gimple *stmt = stmt_info->stmt;
951 swap[i] = 0;
952 matches[i] = false;
953
954 if (dump_enabled_p ())
955 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
956
957 /* Fail to vectorize statements marked as unvectorizable, throw
958 or are volatile. */
959 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
960 || stmt_can_throw_internal (cfun, stmt)
961 || gimple_has_volatile_ops (stmt))
962 {
963 if (dump_enabled_p ())
964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
965 "Build SLP failed: unvectorizable statement %G",
966 stmt);
967 /* ??? For BB vectorization we want to commutate operands in a way
968 to shuffle all unvectorizable defs into one operand and have
969 the other still vectorized. The following doesn't reliably
970 work for this though but it's the easiest we can do here. */
971 if (is_a <bb_vec_info> (vinfo) && i != 0)
972 continue;
973 /* Fatal mismatch. */
974 matches[0] = false;
975 return false;
976 }
977
978 lhs = gimple_get_lhs (stmt);
979 if (lhs == NULL_TREE)
980 {
981 if (dump_enabled_p ())
982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
983 "Build SLP failed: not GIMPLE_ASSIGN nor "
984 "GIMPLE_CALL %G", stmt);
985 if (is_a <bb_vec_info> (vinfo) && i != 0)
986 continue;
987 /* Fatal mismatch. */
988 matches[0] = false;
989 return false;
990 }
991
992 tree nunits_vectype;
993 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
994 &nunits_vectype, group_size))
995 {
996 if (is_a <bb_vec_info> (vinfo) && i != 0)
997 continue;
998 /* Fatal mismatch. */
999 matches[0] = false;
1000 return false;
1001 }
1002 /* Record nunits required but continue analysis, producing matches[]
1003 as if nunits was not an issue. This allows splitting of groups
1004 to happen. */
1005 if (nunits_vectype
1006 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1007 nunits_vectype, max_nunits))
1008 {
1009 gcc_assert (is_a <bb_vec_info> (vinfo));
1010 maybe_soft_fail = true;
1011 soft_fail_nunits_vectype = nunits_vectype;
1012 }
1013
1014 gcc_assert (vectype);
1015
1016 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1017 if (call_stmt)
1018 {
1019 rhs_code = CALL_EXPR;
1020
1021 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
1022 load_p = true;
1023 else if ((gimple_call_internal_p (call_stmt)
1024 && (!vectorizable_internal_fn_p
1025 (gimple_call_internal_fn (call_stmt))))
1026 || gimple_call_tail_p (call_stmt)
1027 || gimple_call_noreturn_p (call_stmt)
1028 || !gimple_call_nothrow_p (call_stmt)
1029 || gimple_call_chain (call_stmt))
1030 {
1031 if (dump_enabled_p ())
1032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1033 "Build SLP failed: unsupported call type %G",
1034 call_stmt);
1035 if (is_a <bb_vec_info> (vinfo) && i != 0)
1036 continue;
1037 /* Fatal mismatch. */
1038 matches[0] = false;
1039 return false;
1040 }
1041 }
1042 else if (gimple_code (stmt) == GIMPLE_PHI)
1043 {
1044 rhs_code = ERROR_MARK;
1045 phi_p = true;
1046 }
1047 else
1048 {
1049 rhs_code = gimple_assign_rhs_code (stmt);
1050 load_p = gimple_vuse (stmt);
1051 }
1052
1053 /* Check the operation. */
1054 if (i == 0)
1055 {
1056 *node_vectype = vectype;
1057 first_stmt_code = rhs_code;
1058 first_stmt_load_p = load_p;
1059 first_stmt_phi_p = phi_p;
1060
1061 /* Shift arguments should be equal in all the packed stmts for a
1062 vector shift with scalar shift operand. */
1063 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1064 || rhs_code == LROTATE_EXPR
1065 || rhs_code == RROTATE_EXPR)
1066 {
1067 vec_mode = TYPE_MODE (vectype);
1068
1069 /* First see if we have a vector/vector shift. */
1070 optab = optab_for_tree_code (rhs_code, vectype,
1071 optab_vector);
1072
1073 if (!optab
1074 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1075 {
1076 /* No vector/vector shift, try for a vector/scalar shift. */
1077 optab = optab_for_tree_code (rhs_code, vectype,
1078 optab_scalar);
1079
1080 if (!optab)
1081 {
1082 if (dump_enabled_p ())
1083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1084 "Build SLP failed: no optab.\n");
1085 if (is_a <bb_vec_info> (vinfo) && i != 0)
1086 continue;
1087 /* Fatal mismatch. */
1088 matches[0] = false;
1089 return false;
1090 }
1091 icode = (int) optab_handler (optab, vec_mode);
1092 if (icode == CODE_FOR_nothing)
1093 {
1094 if (dump_enabled_p ())
1095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1096 "Build SLP failed: "
1097 "op not supported by target.\n");
1098 if (is_a <bb_vec_info> (vinfo) && i != 0)
1099 continue;
1100 /* Fatal mismatch. */
1101 matches[0] = false;
1102 return false;
1103 }
1104 optab_op2_mode = insn_data[icode].operand[2].mode;
1105 if (!VECTOR_MODE_P (optab_op2_mode))
1106 {
1107 need_same_oprnds = true;
1108 first_op1 = gimple_assign_rhs2 (stmt);
1109 }
1110 }
1111 }
1112 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1113 {
1114 need_same_oprnds = true;
1115 first_op1 = gimple_assign_rhs2 (stmt);
1116 }
1117 else if (!load_p
1118 && rhs_code == BIT_FIELD_REF)
1119 {
1120 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1121 if (!is_a <bb_vec_info> (vinfo)
1122 || TREE_CODE (vec) != SSA_NAME
1123 || !operand_equal_p (TYPE_SIZE (vectype),
1124 TYPE_SIZE (TREE_TYPE (vec))))
1125 {
1126 if (dump_enabled_p ())
1127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1128 "Build SLP failed: "
1129 "BIT_FIELD_REF not supported\n");
1130 /* Fatal mismatch. */
1131 matches[0] = false;
1132 return false;
1133 }
1134 }
1135 else if (call_stmt
1136 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1137 {
1138 need_same_oprnds = true;
1139 first_op1 = gimple_call_arg (call_stmt, 1);
1140 }
1141 }
1142 else
1143 {
1144 if (first_stmt_code != rhs_code
1145 && alt_stmt_code == ERROR_MARK)
1146 alt_stmt_code = rhs_code;
1147 if ((first_stmt_code != rhs_code
1148 && (first_stmt_code != IMAGPART_EXPR
1149 || rhs_code != REALPART_EXPR)
1150 && (first_stmt_code != REALPART_EXPR
1151 || rhs_code != IMAGPART_EXPR)
1152 /* Handle mismatches in plus/minus by computing both
1153 and merging the results. */
1154 && !((first_stmt_code == PLUS_EXPR
1155 || first_stmt_code == MINUS_EXPR)
1156 && (alt_stmt_code == PLUS_EXPR
1157 || alt_stmt_code == MINUS_EXPR)
1158 && rhs_code == alt_stmt_code)
1159 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1160 && (first_stmt_code == ARRAY_REF
1161 || first_stmt_code == BIT_FIELD_REF
1162 || first_stmt_code == INDIRECT_REF
1163 || first_stmt_code == COMPONENT_REF
1164 || first_stmt_code == MEM_REF)))
1165 || first_stmt_load_p != load_p
1166 || first_stmt_phi_p != phi_p)
1167 {
1168 if (dump_enabled_p ())
1169 {
1170 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1171 "Build SLP failed: different operation "
1172 "in stmt %G", stmt);
1173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1174 "original stmt %G", first_stmt_info->stmt);
1175 }
1176 /* Mismatch. */
1177 continue;
1178 }
1179
1180 if (!load_p
1181 && first_stmt_code == BIT_FIELD_REF
1182 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1183 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1184 {
1185 if (dump_enabled_p ())
1186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1187 "Build SLP failed: different BIT_FIELD_REF "
1188 "arguments in %G", stmt);
1189 /* Mismatch. */
1190 continue;
1191 }
1192
1193 if (!load_p && rhs_code == CALL_EXPR)
1194 {
1195 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1196 as_a <gcall *> (stmt)))
1197 {
1198 if (dump_enabled_p ())
1199 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1200 "Build SLP failed: different calls in %G",
1201 stmt);
1202 /* Mismatch. */
1203 continue;
1204 }
1205 }
1206
1207 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1208 && (gimple_bb (first_stmt_info->stmt)
1209 != gimple_bb (stmt_info->stmt)))
1210 {
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1213 "Build SLP failed: different BB for PHI "
1214 "or possibly trapping operation in %G", stmt);
1215 /* Mismatch. */
1216 continue;
1217 }
1218
1219 if (need_same_oprnds)
1220 {
1221 tree other_op1 = (call_stmt
1222 ? gimple_call_arg (call_stmt, 1)
1223 : gimple_assign_rhs2 (stmt));
1224 if (!operand_equal_p (first_op1, other_op1, 0))
1225 {
1226 if (dump_enabled_p ())
1227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1228 "Build SLP failed: different shift "
1229 "arguments in %G", stmt);
1230 /* Mismatch. */
1231 continue;
1232 }
1233 }
1234
1235 if (!types_compatible_p (vectype, *node_vectype))
1236 {
1237 if (dump_enabled_p ())
1238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1239 "Build SLP failed: different vector type "
1240 "in %G", stmt);
1241 /* Mismatch. */
1242 continue;
1243 }
1244 }
1245
1246 /* Grouped store or load. */
1247 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1248 {
1249 if (REFERENCE_CLASS_P (lhs))
1250 {
1251 /* Store. */
1252 ;
1253 }
1254 else
1255 {
1256 /* Load. */
1257 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1258 if (prev_first_load)
1259 {
1260 /* Check that there are no loads from different interleaving
1261 chains in the same node. */
1262 if (prev_first_load != first_load)
1263 {
1264 if (dump_enabled_p ())
1265 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1266 vect_location,
1267 "Build SLP failed: different "
1268 "interleaving chains in one node %G",
1269 stmt);
1270 /* Mismatch. */
1271 continue;
1272 }
1273 }
1274 else
1275 prev_first_load = first_load;
1276 }
1277 } /* Grouped access. */
1278 else
1279 {
1280 if (load_p)
1281 {
1282 /* Not grouped load. */
1283 if (dump_enabled_p ())
1284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1285 "Build SLP failed: not grouped load %G", stmt);
1286
1287 /* FORNOW: Not grouped loads are not supported. */
1288 if (is_a <bb_vec_info> (vinfo) && i != 0)
1289 continue;
1290 /* Fatal mismatch. */
1291 matches[0] = false;
1292 return false;
1293 }
1294
1295 /* Not memory operation. */
1296 if (!phi_p
1297 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1298 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1299 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1300 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1301 && rhs_code != VIEW_CONVERT_EXPR
1302 && rhs_code != CALL_EXPR
1303 && rhs_code != BIT_FIELD_REF)
1304 {
1305 if (dump_enabled_p ())
1306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1307 "Build SLP failed: operation unsupported %G",
1308 stmt);
1309 if (is_a <bb_vec_info> (vinfo) && i != 0)
1310 continue;
1311 /* Fatal mismatch. */
1312 matches[0] = false;
1313 return false;
1314 }
1315
1316 if (rhs_code == COND_EXPR)
1317 {
1318 tree cond_expr = gimple_assign_rhs1 (stmt);
1319 enum tree_code cond_code = TREE_CODE (cond_expr);
1320 enum tree_code swap_code = ERROR_MARK;
1321 enum tree_code invert_code = ERROR_MARK;
1322
1323 if (i == 0)
1324 first_cond_code = TREE_CODE (cond_expr);
1325 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1326 {
1327 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1328 swap_code = swap_tree_comparison (cond_code);
1329 invert_code = invert_tree_comparison (cond_code, honor_nans);
1330 }
1331
1332 if (first_cond_code == cond_code)
1333 ;
1334 /* Isomorphic can be achieved by swapping. */
1335 else if (first_cond_code == swap_code)
1336 swap[i] = 1;
1337 /* Isomorphic can be achieved by inverting. */
1338 else if (first_cond_code == invert_code)
1339 swap[i] = 2;
1340 else
1341 {
1342 if (dump_enabled_p ())
1343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1344 "Build SLP failed: different"
1345 " operation %G", stmt);
1346 /* Mismatch. */
1347 continue;
1348 }
1349 }
1350 }
1351
1352 matches[i] = true;
1353 }
1354
1355 for (i = 0; i < group_size; ++i)
1356 if (!matches[i])
1357 return false;
1358
1359 /* If we allowed a two-operation SLP node verify the target can cope
1360 with the permute we are going to use. */
1361 if (alt_stmt_code != ERROR_MARK
1362 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1363 {
1364 *two_operators = true;
1365 }
1366
1367 if (maybe_soft_fail)
1368 {
1369 unsigned HOST_WIDE_INT const_nunits;
1370 if (!TYPE_VECTOR_SUBPARTS
1371 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1372 || const_nunits > group_size)
1373 matches[0] = false;
1374 else
1375 {
1376 /* With constant vector elements simulate a mismatch at the
1377 point we need to split. */
1378 unsigned tail = group_size & (const_nunits - 1);
1379 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1380 }
1381 return false;
1382 }
1383
1384 return true;
1385 }
1386
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  /* A slot is empty (or deleted) exactly when its stmt vector was never
     allocated or has been released; a live entry always has storage.  */
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  /* All-zero bits form a valid empty vec<>, so zeroed storage is OK.  */
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
1403 inline hashval_t
1404 bst_traits::hash (value_type x)
1405 {
1406 inchash::hash h;
1407 for (unsigned i = 0; i < x.length (); ++i)
1408 h.add_int (gimple_uid (x[i]->stmt));
1409 return h.end ();
1410 }
1411 inline bool
1412 bst_traits::equal (value_type existing, value_type candidate)
1413 {
1414 if (existing.length () != candidate.length ())
1415 return false;
1416 for (unsigned i = 0; i < existing.length (); ++i)
1417 if (existing[i] != candidate[i])
1418 return false;
1419 return true;
1420 }
1421
/* One element of a linearized associatable expression chain.
   ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
   but then vec::insert does memmove and that's not compatible with
   std::pair.  */
struct chain_op_t
{
  chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
      : code (code_), dt (dt_), op (op_) {}
  /* Operation with which OP enters the chain (for add/sub chains this
     is PLUS_EXPR or MINUS_EXPR after sign propagation).  */
  tree_code code;
  /* Vectorizer definition type of OP.  */
  vect_def_type dt;
  /* The operand itself.  */
  tree op;
};
1433
1434 /* Comparator for sorting associatable chains. */
1435
1436 static int
1437 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1438 {
1439 auto *op1 = (const chain_op_t *) op1_;
1440 auto *op2 = (const chain_op_t *) op2_;
1441 if (op1->dt != op2->dt)
1442 return (int)op1->dt - (int)op2->dt;
1443 return (int)op1->code - (int)op2->code;
1444 }
1445
/* Linearize the associatable expression chain at START with the
   associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
   filling CHAIN with the result and using WORKLIST as intermediate storage.
   CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
   or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
   stmts, starting with START.  */

static void
vect_slp_linearize_chain (vec_info *vinfo,
			  vec<std::pair<tree_code, gimple *> > &worklist,
			  vec<chain_op_t> &chain,
			  enum tree_code code, gimple *start,
			  gimple *&code_stmt, gimple *&alt_code_stmt,
			  vec<gimple *> *chain_stmts)
{
  /* For each lane linearize the addition/subtraction (or other
     uniform associatable operation) expression tree.  */
  worklist.safe_push (std::make_pair (code, start));
  while (!worklist.is_empty ())
    {
      auto entry = worklist.pop ();
      gassign *stmt = as_a <gassign *> (entry.second);
      /* IN_CODE is the effective sign with which STMT's result enters
	 the overall chain; THIS_CODE is STMT's own operation.  */
      enum tree_code in_code = entry.first;
      enum tree_code this_code = gimple_assign_rhs_code (stmt);
      /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
      if (!code_stmt
	  && gimple_assign_rhs_code (stmt) == code)
	code_stmt = stmt;
      else if (!alt_code_stmt
	       && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
	alt_code_stmt = stmt;
      if (chain_stmts)
	chain_stmts->safe_push (stmt);
      /* Walk both RHS operands of the binary operation.  */
      for (unsigned opnum = 1; opnum <= 2; ++opnum)
	{
	  tree op = gimple_op (stmt, opnum);
	  vect_def_type dt;
	  stmt_vec_info def_stmt_info;
	  bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
	  gcc_assert (res);
	  /* For pattern stmts operate on the pattern def's LHS.  */
	  if (dt == vect_internal_def
	      && is_pattern_stmt_p (def_stmt_info))
	    op = gimple_get_lhs (def_stmt_info->stmt);
	  gimple *use_stmt;
	  use_operand_p use_p;
	  /* Recurse into a def that is itself part of the chain: an
	     internal def with a single use whose operation is CODE
	     (or MINUS_EXPR when CODE is PLUS_EXPR).  */
	  if (dt == vect_internal_def
	      && single_imm_use (op, &use_p, &use_stmt)
	      && is_gimple_assign (def_stmt_info->stmt)
	      && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
		  || (code == PLUS_EXPR
		      && (gimple_assign_rhs_code (def_stmt_info->stmt)
			  == MINUS_EXPR))))
	    {
	      /* Compute the sign with which the def enters the chain:
		 for a - b the first operand contributes positively,
		 and entering through a negated context flips it.  */
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      worklist.safe_push (std::make_pair (op_def_code,
						  def_stmt_info->stmt));
	    }
	  else
	    {
	      /* A leaf of the chain; record it with its effective sign
		 computed the same way as above.  */
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      chain.safe_push (chain_op_t (op_def_code, dt, op));
	    }
	}
    }
}
1519
/* Map from a set of scalar stmts to the SLP node discovered for them
   (possibly a node marked failed), used to re-use discovery results
   and to resolve backedge destinations.  */
typedef hash_map <vec <stmt_vec_info>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

/* Forward declaration; the definition follows below.  */
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *limit, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);
1530
/* Caching wrapper around vect_build_slp_tree_2.  Looks up STMTS in
   BST_MAP first, re-using a previous success (adding a reference) or
   replaying a previous failure into MATCHES.  On a fresh discovery a
   stub node is seeded into BST_MAP up-front so backedges can reach it
   during recursion.  Returns the SLP node (taking ownership of STMTS)
   or NULL on failure with MATCHES filled with per-lane information.  */

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *limit, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 !(*leader)->failed ? "" : "failed ", *leader);
      if (!(*leader)->failed)
	{
	  SLP_TREE_REF_COUNT (*leader)++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	  /* The cached node owns its own copy of the stmts.  */
	  stmts.release ();
	  return *leader;
	}
      /* Replay the recorded per-lane failure state.  */
      memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
      return NULL;
    }

  /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
     so we can pick up backedge destinations during discovery.  */
  slp_tree res = new _slp_tree;
  SLP_TREE_DEF_TYPE (res) = vect_internal_def;
  SLP_TREE_SCALAR_STMTS (res) = stmts;
  bst_map->put (stmts.copy (), res);

  if (*limit == 0)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery limit exceeded\n");
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
      memset (res->failed, 0, sizeof (bool) * group_size);
      memset (matches, 0, sizeof (bool) * group_size);
      return NULL;
    }
  --*limit;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "starting SLP discovery for node %p\n", res);

  poly_uint64 this_max_nunits = 1;
  slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
					 &this_max_nunits,
					 matches, limit, tree_size, bst_map);
  if (!res_)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p failed\n", res);
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
      if (flag_checking)
	{
	  /* A failure must have flagged at least one lane.  */
	  unsigned i;
	  for (i = 0; i < group_size; ++i)
	    if (!matches[i])
	      break;
	  gcc_assert (i < group_size);
	}
      /* Record the failure pattern for later cache hits.  */
      memcpy (res->failed, matches, sizeof (bool) * group_size);
    }
  else
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p succeeded\n", res);
      gcc_assert (res_ == res);
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
    }
  return res_;
}
1618
/* Helper for building an associated SLP node chain.  Turns PERM into a
   VEC_PERM_EXPR node over two new children, each computing both OP0 and
   OP1 with the operations of OPER1 resp. OPER2, selecting lanes via
   LPERM.  CHILD1 consumes the caller's references to OP0/OP1; CHILD2
   takes additional references of its own.  */

static void
vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
				   slp_tree op0, slp_tree op1,
				   stmt_vec_info oper1, stmt_vec_info oper2,
				   vec<std::pair<unsigned, unsigned> > lperm)
{
  unsigned group_size = SLP_TREE_LANES (op1);

  /* First operation node; absorbs the incoming references to OP0/OP1.  */
  slp_tree child1 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
  SLP_TREE_VECTYPE (child1) = vectype;
  SLP_TREE_LANES (child1) = group_size;
  SLP_TREE_CHILDREN (child1).create (2);
  SLP_TREE_CHILDREN (child1).quick_push (op0);
  SLP_TREE_CHILDREN (child1).quick_push (op1);
  SLP_TREE_REPRESENTATIVE (child1) = oper1;

  /* Second operation node; shares OP0/OP1, so bump their refcounts.  */
  slp_tree child2 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
  SLP_TREE_VECTYPE (child2) = vectype;
  SLP_TREE_LANES (child2) = group_size;
  SLP_TREE_CHILDREN (child2).create (2);
  SLP_TREE_CHILDREN (child2).quick_push (op0);
  SLP_TREE_REF_COUNT (op0)++;
  SLP_TREE_CHILDREN (child2).quick_push (op1);
  SLP_TREE_REF_COUNT (op1)++;
  SLP_TREE_REPRESENTATIVE (child2) = oper2;

  /* Blend the two results lane-wise according to LPERM.  */
  SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
  SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
  SLP_TREE_VECTYPE (perm) = vectype;
  SLP_TREE_LANES (perm) = group_size;
  /* ??? We should set this NULL but that's not expected.  */
  SLP_TREE_REPRESENTATIVE (perm) = oper1;
  SLP_TREE_LANE_PERMUTATION (perm) = lperm;
  SLP_TREE_CHILDREN (perm).quick_push (child1);
  SLP_TREE_CHILDREN (perm).quick_push (child2);
}
1659
1660 /* Recursively build an SLP tree starting from NODE.
1661 Fail (and return a value not equal to zero) if def-stmts are not
1662 isomorphic, require data permutation or are of unsupported types of
1663 operation. Otherwise, return 0.
1664 The value returned is the depth in the SLP tree where a mismatch
1665 was found. */
1666
1667 static slp_tree
1668 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1669 vec<stmt_vec_info> stmts, unsigned int group_size,
1670 poly_uint64 *max_nunits,
1671 bool *matches, unsigned *limit, unsigned *tree_size,
1672 scalar_stmts_to_slp_tree_map_t *bst_map)
1673 {
1674 unsigned nops, i, this_tree_size = 0;
1675 poly_uint64 this_max_nunits = *max_nunits;
1676
1677 matches[0] = false;
1678
1679 stmt_vec_info stmt_info = stmts[0];
1680 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1681 nops = gimple_call_num_args (stmt);
1682 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1683 {
1684 nops = gimple_num_ops (stmt) - 1;
1685 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1686 nops++;
1687 }
1688 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1689 nops = gimple_phi_num_args (phi);
1690 else
1691 return NULL;
1692
1693 /* If the SLP node is a PHI (induction or reduction), terminate
1694 the recursion. */
1695 bool *skip_args = XALLOCAVEC (bool, nops);
1696 memset (skip_args, 0, sizeof (bool) * nops);
1697 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1698 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1699 {
1700 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1701 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1702 group_size);
1703 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1704 max_nunits))
1705 return NULL;
1706
1707 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1708 if (def_type == vect_induction_def)
1709 {
1710 /* Induction PHIs are not cycles but walk the initial
1711 value. Only for inner loops through, for outer loops
1712 we need to pick up the value from the actual PHIs
1713 to more easily support peeling and epilogue vectorization. */
1714 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1715 if (!nested_in_vect_loop_p (loop, stmt_info))
1716 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1717 else
1718 loop = loop->inner;
1719 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1720 }
1721 else if (def_type == vect_reduction_def
1722 || def_type == vect_double_reduction_def
1723 || def_type == vect_nested_cycle)
1724 {
1725 /* Else def types have to match. */
1726 stmt_vec_info other_info;
1727 bool all_same = true;
1728 FOR_EACH_VEC_ELT (stmts, i, other_info)
1729 {
1730 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1731 return NULL;
1732 if (other_info != stmt_info)
1733 all_same = false;
1734 }
1735 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1736 /* Reduction initial values are not explicitely represented. */
1737 if (!nested_in_vect_loop_p (loop, stmt_info))
1738 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1739 /* Reduction chain backedge defs are filled manually.
1740 ??? Need a better way to identify a SLP reduction chain PHI.
1741 Or a better overall way to SLP match those. */
1742 if (all_same && def_type == vect_reduction_def)
1743 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1744 }
1745 else if (def_type != vect_internal_def)
1746 return NULL;
1747 }
1748
1749
1750 bool two_operators = false;
1751 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1752 tree vectype = NULL_TREE;
1753 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1754 &this_max_nunits, matches, &two_operators,
1755 &vectype))
1756 return NULL;
1757
1758 /* If the SLP node is a load, terminate the recursion unless masked. */
1759 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1760 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1761 {
1762 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1763 {
1764 /* Masked load. */
1765 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1766 nops = 1;
1767 }
1768 else
1769 {
1770 *max_nunits = this_max_nunits;
1771 (*tree_size)++;
1772 node = vect_create_new_slp_node (node, stmts, 0);
1773 SLP_TREE_VECTYPE (node) = vectype;
1774 /* And compute the load permutation. Whether it is actually
1775 a permutation depends on the unrolling factor which is
1776 decided later. */
1777 vec<unsigned> load_permutation;
1778 int j;
1779 stmt_vec_info load_info;
1780 load_permutation.create (group_size);
1781 stmt_vec_info first_stmt_info
1782 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1783 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1784 {
1785 int load_place = vect_get_place_in_interleaving_chain
1786 (load_info, first_stmt_info);
1787 gcc_assert (load_place != -1);
1788 load_permutation.safe_push (load_place);
1789 }
1790 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1791 return node;
1792 }
1793 }
1794 else if (gimple_assign_single_p (stmt_info->stmt)
1795 && !gimple_vuse (stmt_info->stmt)
1796 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1797 {
1798 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1799 the same SSA name vector of a compatible type to vectype. */
1800 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1801 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1802 stmt_vec_info estmt_info;
1803 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1804 {
1805 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1806 tree bfref = gimple_assign_rhs1 (estmt);
1807 HOST_WIDE_INT lane;
1808 if (!known_eq (bit_field_size (bfref),
1809 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1810 || !constant_multiple_p (bit_field_offset (bfref),
1811 bit_field_size (bfref), &lane))
1812 {
1813 lperm.release ();
1814 return NULL;
1815 }
1816 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1817 }
1818 slp_tree vnode = vect_create_new_slp_node (vNULL);
1819 /* ??? We record vectype here but we hide eventually necessary
1820 punning and instead rely on code generation to materialize
1821 VIEW_CONVERT_EXPRs as necessary. We instead should make
1822 this explicit somehow. */
1823 SLP_TREE_VECTYPE (vnode) = vectype;
1824 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1825 /* We are always building a permutation node even if it is an identity
1826 permute to shield the rest of the vectorizer from the odd node
1827 representing an actual vector without any scalar ops.
1828 ??? We could hide it completely with making the permute node
1829 external? */
1830 node = vect_create_new_slp_node (node, stmts, 1);
1831 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1832 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1833 SLP_TREE_VECTYPE (node) = vectype;
1834 SLP_TREE_CHILDREN (node).quick_push (vnode);
1835 return node;
1836 }
1837 /* When discovery reaches an associatable operation see whether we can
1838 improve that to match up lanes in a way superior to the operand
1839 swapping code which at most looks at two defs.
1840 ??? For BB vectorization we cannot do the brute-force search
1841 for matching as we can succeed by means of builds from scalars
1842 and have no good way to "cost" one build against another. */
1843 else if (is_a <loop_vec_info> (vinfo)
1844 /* ??? We don't handle !vect_internal_def defs below. */
1845 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1846 && is_gimple_assign (stmt_info->stmt)
1847 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1848 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1849 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1850 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1851 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1852 {
1853 /* See if we have a chain of (mixed) adds or subtracts or other
1854 associatable ops. */
1855 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1856 if (code == MINUS_EXPR)
1857 code = PLUS_EXPR;
1858 stmt_vec_info other_op_stmt_info = NULL;
1859 stmt_vec_info op_stmt_info = NULL;
1860 unsigned chain_len = 0;
1861 auto_vec<chain_op_t> chain;
1862 auto_vec<std::pair<tree_code, gimple *> > worklist;
1863 auto_vec<vec<chain_op_t> > chains (group_size);
1864 auto_vec<slp_tree, 4> children;
1865 bool hard_fail = true;
1866 for (unsigned lane = 0; lane < group_size; ++lane)
1867 {
1868 /* For each lane linearize the addition/subtraction (or other
1869 uniform associatable operation) expression tree. */
1870 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1871 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1872 stmts[lane]->stmt, op_stmt, other_op_stmt,
1873 NULL);
1874 if (!op_stmt_info && op_stmt)
1875 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1876 if (!other_op_stmt_info && other_op_stmt)
1877 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1878 if (chain.length () == 2)
1879 {
1880 /* In a chain of just two elements resort to the regular
1881 operand swapping scheme. If we run into a length
1882 mismatch still hard-FAIL. */
1883 if (chain_len == 0)
1884 hard_fail = false;
1885 else
1886 {
1887 matches[lane] = false;
1888 /* ??? We might want to process the other lanes, but
1889 make sure to not give false matching hints to the
1890 caller for lanes we did not process. */
1891 if (lane != group_size - 1)
1892 matches[0] = false;
1893 }
1894 break;
1895 }
1896 else if (chain_len == 0)
1897 chain_len = chain.length ();
1898 else if (chain.length () != chain_len)
1899 {
1900 /* ??? Here we could slip in magic to compensate with
1901 neutral operands. */
1902 matches[lane] = false;
1903 if (lane != group_size - 1)
1904 matches[0] = false;
1905 break;
1906 }
1907 chains.quick_push (chain.copy ());
1908 chain.truncate (0);
1909 }
1910 if (chains.length () == group_size)
1911 {
1912 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
1913 if (!op_stmt_info)
1914 {
1915 hard_fail = false;
1916 goto out;
1917 }
1918 /* Now we have a set of chains with the same length. */
1919 /* 1. pre-sort according to def_type and operation. */
1920 for (unsigned lane = 0; lane < group_size; ++lane)
1921 chains[lane].stablesort (dt_sort_cmp, vinfo);
1922 if (dump_enabled_p ())
1923 {
1924 dump_printf_loc (MSG_NOTE, vect_location,
1925 "pre-sorted chains of %s\n",
1926 get_tree_code_name (code));
1927 for (unsigned lane = 0; lane < group_size; ++lane)
1928 {
1929 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1930 dump_printf (MSG_NOTE, "%s %T ",
1931 get_tree_code_name (chains[lane][opnum].code),
1932 chains[lane][opnum].op);
1933 dump_printf (MSG_NOTE, "\n");
1934 }
1935 }
1936 /* 2. try to build children nodes, associating as necessary. */
1937 for (unsigned n = 0; n < chain_len; ++n)
1938 {
1939 vect_def_type dt = chains[0][n].dt;
1940 unsigned lane;
1941 for (lane = 0; lane < group_size; ++lane)
1942 if (chains[lane][n].dt != dt)
1943 {
1944 if (dt == vect_constant_def
1945 && chains[lane][n].dt == vect_external_def)
1946 dt = vect_external_def;
1947 else if (dt == vect_external_def
1948 && chains[lane][n].dt == vect_constant_def)
1949 ;
1950 else
1951 break;
1952 }
1953 if (lane != group_size)
1954 {
1955 if (dump_enabled_p ())
1956 dump_printf_loc (MSG_NOTE, vect_location,
1957 "giving up on chain due to mismatched "
1958 "def types\n");
1959 matches[lane] = false;
1960 if (lane != group_size - 1)
1961 matches[0] = false;
1962 goto out;
1963 }
1964 if (dt == vect_constant_def
1965 || dt == vect_external_def)
1966 {
1967 /* We can always build those. Might want to sort last
1968 or defer building. */
1969 vec<tree> ops;
1970 ops.create (group_size);
1971 for (lane = 0; lane < group_size; ++lane)
1972 ops.quick_push (chains[lane][n].op);
1973 slp_tree child = vect_create_new_slp_node (ops);
1974 SLP_TREE_DEF_TYPE (child) = dt;
1975 children.safe_push (child);
1976 }
1977 else if (dt != vect_internal_def)
1978 {
1979 /* Not sure, we might need sth special.
1980 gcc.dg/vect/pr96854.c,
1981 gfortran.dg/vect/fast-math-pr37021.f90
1982 and gfortran.dg/vect/pr61171.f trigger. */
1983 /* Soft-fail for now. */
1984 hard_fail = false;
1985 goto out;
1986 }
1987 else
1988 {
1989 vec<stmt_vec_info> op_stmts;
1990 op_stmts.create (group_size);
1991 slp_tree child = NULL;
1992 /* Brute-force our way. We have to consider a lane
1993 failing after fixing an earlier fail up in the
1994 SLP discovery recursion. So track the current
1995 permute per lane. */
1996 unsigned *perms = XALLOCAVEC (unsigned, group_size);
1997 memset (perms, 0, sizeof (unsigned) * group_size);
1998 do
1999 {
2000 op_stmts.truncate (0);
2001 for (lane = 0; lane < group_size; ++lane)
2002 op_stmts.quick_push
2003 (vinfo->lookup_def (chains[lane][n].op));
2004 child = vect_build_slp_tree (vinfo, op_stmts,
2005 group_size, &this_max_nunits,
2006 matches, limit,
2007 &this_tree_size, bst_map);
2008 /* ??? We're likely getting too many fatal mismatches
2009 here so maybe we want to ignore them (but then we
2010 have no idea which lanes fatally mismatched). */
2011 if (child || !matches[0])
2012 break;
2013 /* Swap another lane we have not yet matched up into
2014 lanes that did not match. If we run out of
2015 permute possibilities for a lane terminate the
2016 search. */
2017 bool term = false;
2018 for (lane = 1; lane < group_size; ++lane)
2019 if (!matches[lane])
2020 {
2021 if (n + perms[lane] + 1 == chain_len)
2022 {
2023 term = true;
2024 break;
2025 }
2026 std::swap (chains[lane][n],
2027 chains[lane][n + perms[lane] + 1]);
2028 perms[lane]++;
2029 }
2030 if (term)
2031 break;
2032 }
2033 while (1);
2034 if (!child)
2035 {
2036 if (dump_enabled_p ())
2037 dump_printf_loc (MSG_NOTE, vect_location,
2038 "failed to match up op %d\n", n);
2039 op_stmts.release ();
2040 if (lane != group_size - 1)
2041 matches[0] = false;
2042 else
2043 matches[lane] = false;
2044 goto out;
2045 }
2046 if (dump_enabled_p ())
2047 {
2048 dump_printf_loc (MSG_NOTE, vect_location,
2049 "matched up op %d to\n", n);
2050 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2051 }
2052 children.safe_push (child);
2053 }
2054 }
2055 /* 3. build SLP nodes to combine the chain. */
2056 for (unsigned lane = 0; lane < group_size; ++lane)
2057 if (chains[lane][0].code != code)
2058 {
2059 /* See if there's any alternate all-PLUS entry. */
2060 unsigned n;
2061 for (n = 1; n < chain_len; ++n)
2062 {
2063 for (lane = 0; lane < group_size; ++lane)
2064 if (chains[lane][n].code != code)
2065 break;
2066 if (lane == group_size)
2067 break;
2068 }
2069 if (n != chain_len)
2070 {
2071 /* Swap that in at first position. */
2072 std::swap (children[0], children[n]);
2073 for (lane = 0; lane < group_size; ++lane)
2074 std::swap (chains[lane][0], chains[lane][n]);
2075 }
2076 else
2077 {
2078 /* ??? When this triggers and we end up with two
2079 vect_constant/external_def up-front things break (ICE)
2080 spectacularly finding an insertion place for the
2081 all-constant op. We should have a fully
2082 vect_internal_def operand though(?) so we can swap
2083 that into first place and then prepend the all-zero
2084 constant. */
2085 if (dump_enabled_p ())
2086 dump_printf_loc (MSG_NOTE, vect_location,
2087 "inserting constant zero to compensate "
2088 "for (partially) negated first "
2089 "operand\n");
2090 chain_len++;
2091 for (lane = 0; lane < group_size; ++lane)
2092 chains[lane].safe_insert
2093 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2094 vec<tree> zero_ops;
2095 zero_ops.create (group_size);
2096 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2097 for (lane = 1; lane < group_size; ++lane)
2098 zero_ops.quick_push (zero_ops[0]);
2099 slp_tree zero = vect_create_new_slp_node (zero_ops);
2100 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2101 children.safe_insert (0, zero);
2102 }
2103 break;
2104 }
2105 for (unsigned i = 1; i < children.length (); ++i)
2106 {
2107 slp_tree op0 = children[i - 1];
2108 slp_tree op1 = children[i];
2109 bool this_two_op = false;
2110 for (unsigned lane = 0; lane < group_size; ++lane)
2111 if (chains[lane][i].code != chains[0][i].code)
2112 {
2113 this_two_op = true;
2114 break;
2115 }
2116 slp_tree child;
2117 if (i == children.length () - 1)
2118 child = vect_create_new_slp_node (node, stmts, 2);
2119 else
2120 child = vect_create_new_slp_node (2, ERROR_MARK);
2121 if (this_two_op)
2122 {
2123 vec<std::pair<unsigned, unsigned> > lperm;
2124 lperm.create (group_size);
2125 for (unsigned lane = 0; lane < group_size; ++lane)
2126 lperm.quick_push (std::make_pair
2127 (chains[lane][i].code != chains[0][i].code, lane));
2128 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2129 (chains[0][i].code == code
2130 ? op_stmt_info
2131 : other_op_stmt_info),
2132 (chains[0][i].code == code
2133 ? other_op_stmt_info
2134 : op_stmt_info),
2135 lperm);
2136 }
2137 else
2138 {
2139 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2140 SLP_TREE_VECTYPE (child) = vectype;
2141 SLP_TREE_LANES (child) = group_size;
2142 SLP_TREE_CHILDREN (child).quick_push (op0);
2143 SLP_TREE_CHILDREN (child).quick_push (op1);
2144 SLP_TREE_REPRESENTATIVE (child)
2145 = (chains[0][i].code == code
2146 ? op_stmt_info : other_op_stmt_info);
2147 }
2148 children[i] = child;
2149 }
2150 *tree_size += this_tree_size + 1;
2151 *max_nunits = this_max_nunits;
2152 while (!chains.is_empty ())
2153 chains.pop ().release ();
2154 return node;
2155 }
2156 out:
2157 while (!children.is_empty ())
2158 vect_free_slp_tree (children.pop ());
2159 while (!chains.is_empty ())
2160 chains.pop ().release ();
2161 /* Hard-fail, otherwise we might run into quadratic processing of the
2162 chains starting one stmt into the chain again. */
2163 if (hard_fail)
2164 return NULL;
2165 /* Fall thru to normal processing. */
2166 }
2167
2168 /* Get at the operands, verifying they are compatible. */
2169 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2170 slp_oprnd_info oprnd_info;
2171 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2172 {
2173 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2174 stmts, i, &oprnds_info);
2175 if (res != 0)
2176 matches[(res == -1) ? 0 : i] = false;
2177 if (!matches[0])
2178 break;
2179 }
2180 for (i = 0; i < group_size; ++i)
2181 if (!matches[i])
2182 {
2183 vect_free_oprnd_info (oprnds_info);
2184 return NULL;
2185 }
2186 swap = NULL;
2187
2188 auto_vec<slp_tree, 4> children;
2189
2190 stmt_info = stmts[0];
2191
2192 /* Create SLP_TREE nodes for the definition node/s. */
2193 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2194 {
2195 slp_tree child;
2196 unsigned int j;
2197
2198 /* We're skipping certain operands from processing, for example
2199 outer loop reduction initial defs. */
2200 if (skip_args[i])
2201 {
2202 children.safe_push (NULL);
2203 continue;
2204 }
2205
2206 if (oprnd_info->first_dt == vect_uninitialized_def)
2207 {
2208 /* COND_EXPR have one too many eventually if the condition
2209 is a SSA name. */
2210 gcc_assert (i == 3 && nops == 4);
2211 continue;
2212 }
2213
2214 if (is_a <bb_vec_info> (vinfo)
2215 && oprnd_info->first_dt == vect_internal_def
2216 && !oprnd_info->any_pattern)
2217 {
2218 /* For BB vectorization, if all defs are the same do not
2219 bother to continue the build along the single-lane
2220 graph but use a splat of the scalar value. */
2221 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2222 for (j = 1; j < group_size; ++j)
2223 if (oprnd_info->def_stmts[j] != first_def)
2224 break;
2225 if (j == group_size
2226 /* But avoid doing this for loads where we may be
2227 able to CSE things, unless the stmt is not
2228 vectorizable. */
2229 && (!STMT_VINFO_VECTORIZABLE (first_def)
2230 || !gimple_vuse (first_def->stmt)))
2231 {
2232 if (dump_enabled_p ())
2233 dump_printf_loc (MSG_NOTE, vect_location,
2234 "Using a splat of the uniform operand\n");
2235 oprnd_info->first_dt = vect_external_def;
2236 }
2237 }
2238
2239 if (oprnd_info->first_dt == vect_external_def
2240 || oprnd_info->first_dt == vect_constant_def)
2241 {
2242 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2243 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2244 oprnd_info->ops = vNULL;
2245 children.safe_push (invnode);
2246 continue;
2247 }
2248
2249 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2250 group_size, &this_max_nunits,
2251 matches, limit,
2252 &this_tree_size, bst_map)) != NULL)
2253 {
2254 oprnd_info->def_stmts = vNULL;
2255 children.safe_push (child);
2256 continue;
2257 }
2258
2259 /* If the SLP build for operand zero failed and operand zero
2260 and one can be commutated try that for the scalar stmts
2261 that failed the match. */
2262 if (i == 0
2263 /* A first scalar stmt mismatch signals a fatal mismatch. */
2264 && matches[0]
2265 /* ??? For COND_EXPRs we can swap the comparison operands
2266 as well as the arms under some constraints. */
2267 && nops == 2
2268 && oprnds_info[1]->first_dt == vect_internal_def
2269 && is_gimple_assign (stmt_info->stmt)
2270 /* Swapping operands for reductions breaks assumptions later on. */
2271 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2272 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2273 {
2274 /* See whether we can swap the matching or the non-matching
2275 stmt operands. */
2276 bool swap_not_matching = true;
2277 do
2278 {
2279 for (j = 0; j < group_size; ++j)
2280 {
2281 if (matches[j] != !swap_not_matching)
2282 continue;
2283 stmt_vec_info stmt_info = stmts[j];
2284 /* Verify if we can swap operands of this stmt. */
2285 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2286 if (!stmt
2287 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2288 {
2289 if (!swap_not_matching)
2290 goto fail;
2291 swap_not_matching = false;
2292 break;
2293 }
2294 }
2295 }
2296 while (j != group_size);
2297
2298 /* Swap mismatched definition stmts. */
2299 if (dump_enabled_p ())
2300 dump_printf_loc (MSG_NOTE, vect_location,
2301 "Re-trying with swapped operands of stmts ");
2302 for (j = 0; j < group_size; ++j)
2303 if (matches[j] == !swap_not_matching)
2304 {
2305 std::swap (oprnds_info[0]->def_stmts[j],
2306 oprnds_info[1]->def_stmts[j]);
2307 std::swap (oprnds_info[0]->ops[j],
2308 oprnds_info[1]->ops[j]);
2309 if (dump_enabled_p ())
2310 dump_printf (MSG_NOTE, "%d ", j);
2311 }
2312 if (dump_enabled_p ())
2313 dump_printf (MSG_NOTE, "\n");
2314 /* And try again with scratch 'matches' ... */
2315 bool *tem = XALLOCAVEC (bool, group_size);
2316 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2317 group_size, &this_max_nunits,
2318 tem, limit,
2319 &this_tree_size, bst_map)) != NULL)
2320 {
2321 oprnd_info->def_stmts = vNULL;
2322 children.safe_push (child);
2323 continue;
2324 }
2325 }
2326 fail:
2327
2328 /* If the SLP build failed and we analyze a basic-block
2329 simply treat nodes we fail to build as externally defined
2330 (and thus build vectors from the scalar defs).
2331 The cost model will reject outright expensive cases.
2332 ??? This doesn't treat cases where permutation ultimatively
2333 fails (or we don't try permutation below). Ideally we'd
2334 even compute a permutation that will end up with the maximum
2335 SLP tree size... */
2336 if (is_a <bb_vec_info> (vinfo)
2337 /* ??? Rejecting patterns this way doesn't work. We'd have to
2338 do extra work to cancel the pattern so the uses see the
2339 scalar version. */
2340 && !is_pattern_stmt_p (stmt_info)
2341 && !oprnd_info->any_pattern)
2342 {
2343 /* But if there's a leading vector sized set of matching stmts
2344 fail here so we can split the group. This matches the condition
2345 vect_analyze_slp_instance uses. */
2346 /* ??? We might want to split here and combine the results to support
2347 multiple vector sizes better. */
2348 for (j = 0; j < group_size; ++j)
2349 if (!matches[j])
2350 break;
2351 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2352 {
2353 if (dump_enabled_p ())
2354 dump_printf_loc (MSG_NOTE, vect_location,
2355 "Building vector operands from scalars\n");
2356 this_tree_size++;
2357 child = vect_create_new_slp_node (oprnd_info->ops);
2358 children.safe_push (child);
2359 oprnd_info->ops = vNULL;
2360 continue;
2361 }
2362 }
2363
2364 gcc_assert (child == NULL);
2365 FOR_EACH_VEC_ELT (children, j, child)
2366 if (child)
2367 vect_free_slp_tree (child);
2368 vect_free_oprnd_info (oprnds_info);
2369 return NULL;
2370 }
2371
2372 vect_free_oprnd_info (oprnds_info);
2373
2374 /* If we have all children of a child built up from uniform scalars
2375 or does more than one possibly expensive vector construction then
2376 just throw that away, causing it built up from scalars.
2377 The exception is the SLP node for the vector store. */
2378 if (is_a <bb_vec_info> (vinfo)
2379 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2380 /* ??? Rejecting patterns this way doesn't work. We'd have to
2381 do extra work to cancel the pattern so the uses see the
2382 scalar version. */
2383 && !is_pattern_stmt_p (stmt_info))
2384 {
2385 slp_tree child;
2386 unsigned j;
2387 bool all_uniform_p = true;
2388 unsigned n_vector_builds = 0;
2389 FOR_EACH_VEC_ELT (children, j, child)
2390 {
2391 if (!child)
2392 ;
2393 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2394 all_uniform_p = false;
2395 else if (!vect_slp_tree_uniform_p (child))
2396 {
2397 all_uniform_p = false;
2398 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2399 n_vector_builds++;
2400 }
2401 }
2402 if (all_uniform_p
2403 || n_vector_builds > 1
2404 || (n_vector_builds == children.length ()
2405 && is_a <gphi *> (stmt_info->stmt)))
2406 {
2407 /* Roll back. */
2408 matches[0] = false;
2409 FOR_EACH_VEC_ELT (children, j, child)
2410 if (child)
2411 vect_free_slp_tree (child);
2412
2413 if (dump_enabled_p ())
2414 dump_printf_loc (MSG_NOTE, vect_location,
2415 "Building parent vector operands from "
2416 "scalars instead\n");
2417 return NULL;
2418 }
2419 }
2420
2421 *tree_size += this_tree_size + 1;
2422 *max_nunits = this_max_nunits;
2423
2424 if (two_operators)
2425 {
2426 /* ??? We'd likely want to either cache in bst_map sth like
2427 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2428 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2429 explicit stmts to put in so the keying on 'stmts' doesn't
2430 work (but we have the same issue with nodes that use 'ops'). */
2431 slp_tree one = new _slp_tree;
2432 slp_tree two = new _slp_tree;
2433 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2434 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2435 SLP_TREE_VECTYPE (one) = vectype;
2436 SLP_TREE_VECTYPE (two) = vectype;
2437 SLP_TREE_CHILDREN (one).safe_splice (children);
2438 SLP_TREE_CHILDREN (two).safe_splice (children);
2439 slp_tree child;
2440 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2441 SLP_TREE_REF_COUNT (child)++;
2442
2443 /* Here we record the original defs since this
2444 node represents the final lane configuration. */
2445 node = vect_create_new_slp_node (node, stmts, 2);
2446 SLP_TREE_VECTYPE (node) = vectype;
2447 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2448 SLP_TREE_CHILDREN (node).quick_push (one);
2449 SLP_TREE_CHILDREN (node).quick_push (two);
2450 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2451 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2452 enum tree_code ocode = ERROR_MARK;
2453 stmt_vec_info ostmt_info;
2454 unsigned j = 0;
2455 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2456 {
2457 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2458 if (gimple_assign_rhs_code (ostmt) != code0)
2459 {
2460 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2461 ocode = gimple_assign_rhs_code (ostmt);
2462 j = i;
2463 }
2464 else
2465 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2466 }
2467 SLP_TREE_CODE (one) = code0;
2468 SLP_TREE_CODE (two) = ocode;
2469 SLP_TREE_LANES (one) = stmts.length ();
2470 SLP_TREE_LANES (two) = stmts.length ();
2471 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2472 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2473 return node;
2474 }
2475
2476 node = vect_create_new_slp_node (node, stmts, nops);
2477 SLP_TREE_VECTYPE (node) = vectype;
2478 SLP_TREE_CHILDREN (node).splice (children);
2479 return node;
2480 }
2481
/* Dump a single SLP tree NODE to the active dump stream, without
   recursing into its children (only their addresses are printed).
   Shows the node's def kind, representative stmt or VEC_PERM marker,
   scalar stmts or ops, and any load/lane permutations.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node)
{
  unsigned i, j;
  slp_tree child;
  stmt_vec_info stmt_info;
  tree op;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  /* Header line: node kind tag, address, max nunits and refcount.  */
  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
		   SLP_TREE_DEF_TYPE (node) == vect_external_def
		   ? " (external)"
		   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
		      ? " (constant)"
		      : ""), node,
		   estimated_poly_value (node->max_nunits),
		   SLP_TREE_REF_COUNT (node));
  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    {
      /* VEC_PERM nodes have no representative stmt, print the code name
	 instead of a stmt template.  */
      if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
	dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
      else
	dump_printf_loc (metadata, user_loc, "op template: %G",
			 SLP_TREE_REPRESENTATIVE (node)->stmt);
    }
  /* Internal nodes carry scalar stmts; external/constant nodes carry
     scalar ops which are printed as a brace-enclosed list.  */
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	dump_printf (metadata, "%T%s ", op,
		     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tload permutation {");
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
	dump_printf (dump_kind, " %u", j);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    {
      /* Lane permutes are (operand, lane) pairs, shown as op[lane].  */
      dump_printf_loc (metadata, user_loc, "\tlane permutation {");
      for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
	dump_printf (dump_kind, " %u[%u]",
		     SLP_TREE_LANE_PERMUTATION (node)[i].first,
		     SLP_TREE_LANE_PERMUTATION (node)[i].second);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  /* Only the child addresses; the caller recurses if it wants more.  */
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
}
2545
2546 DEBUG_FUNCTION void
2547 debug (slp_tree node)
2548 {
2549 debug_dump_context ctx;
2550 vect_print_slp_tree (MSG_NOTE,
2551 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2552 node);
2553 }
2554
2555 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2556
2557 static void
2558 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2559 slp_tree node, hash_set<slp_tree> &visited)
2560 {
2561 unsigned i;
2562 slp_tree child;
2563
2564 if (visited.add (node))
2565 return;
2566
2567 vect_print_slp_tree (dump_kind, loc, node);
2568
2569 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2570 if (child)
2571 vect_print_slp_graph (dump_kind, loc, child, visited);
2572 }
2573
2574 static void
2575 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2576 slp_tree entry)
2577 {
2578 hash_set<slp_tree> visited;
2579 vect_print_slp_graph (dump_kind, loc, entry, visited);
2580 }
2581
2582 /* Mark the tree rooted at NODE with PURE_SLP. */
2583
2584 static void
2585 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2586 {
2587 int i;
2588 stmt_vec_info stmt_info;
2589 slp_tree child;
2590
2591 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2592 return;
2593
2594 if (visited.add (node))
2595 return;
2596
2597 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2598 STMT_SLP_TYPE (stmt_info) = pure_slp;
2599
2600 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2601 if (child)
2602 vect_mark_slp_stmts (child, visited);
2603 }
2604
2605 static void
2606 vect_mark_slp_stmts (slp_tree node)
2607 {
2608 hash_set<slp_tree> visited;
2609 vect_mark_slp_stmts (node, visited);
2610 }
2611
/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  /* Only internal nodes carry scalar stmts to mark.  */
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  /* Visit each node of the SLP graph at most once.  */
  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      /* A participating stmt is either not yet marked relevant or was
	 already marked used-in-scope by an earlier visit.  */
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_mark_slp_stmts_relevant (child, visited);
}
2638
2639 static void
2640 vect_mark_slp_stmts_relevant (slp_tree node)
2641 {
2642 hash_set<slp_tree> visited;
2643 vect_mark_slp_stmts_relevant (node, visited);
2644 }
2645
2646
/* Gather loads in the SLP graph NODE and populate the INST loads array.  */

static void
vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
		       hash_set<slp_tree> &visited)
{
  if (!node || visited.add (node))
    return;

  /* Leaf nodes (no children) are the load candidates.  */
  if (SLP_TREE_CHILDREN (node).length () == 0)
    {
      /* External and constant leafs are not loads.  */
      if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
	return;
      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      /* Record only grouped read accesses.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	loads.safe_push (node);
    }
  else
    {
      unsigned i;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	vect_gather_slp_loads (loads, child, visited);
    }
}
2673
2674
/* Find the last scalar stmt of NODE in original statement order,
   looking through pattern stmts via vect_orig_stmt.  (The historical
   comment said "last store in SLP INSTANCE"; the function works on any
   node's scalar stmts.)  */

stmt_vec_info
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  stmt_vec_info last = NULL;
  stmt_vec_info stmt_vinfo;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    {
      /* Compare positions of the original (pre-pattern) stmts.  */
      stmt_vinfo = vect_orig_stmt (stmt_vinfo);
      last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
    }

  return last;
}
2691
2692 /* Find the first stmt in NODE. */
2693
2694 stmt_vec_info
2695 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2696 {
2697 stmt_vec_info first = NULL;
2698 stmt_vec_info stmt_vinfo;
2699
2700 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2701 {
2702 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2703 if (!first
2704 || get_later_stmt (stmt_vinfo, first) == first)
2705 first = stmt_vinfo;
2706 }
2707
2708 return first;
2709 }
2710
/* Splits a group of stores, currently beginning at FIRST_VINFO, into
   two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
   (also containing the first GROUP1_SIZE stmts, since stores are
   consecutive), the second containing the remainder.
   Return the first stmt in the second group.  */

static stmt_vec_info
vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
{
  gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
  gcc_assert (group1_size > 0);
  int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
  gcc_assert (group2_size > 0);
  DR_GROUP_SIZE (first_vinfo) = group1_size;

  /* Walk the NEXT chain to the last element of the first group.  */
  stmt_vec_info stmt_info = first_vinfo;
  for (unsigned i = group1_size; i > 1; i--)
    {
      stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
      /* Splitting only supports consecutive stores (gap of 1).  */
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }
  /* STMT is now the last element of the first group.  */
  stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
  /* Cut the chain so the first group ends here.  */
  DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;

  /* Re-link the remainder as a new group headed by GROUP2.  */
  DR_GROUP_SIZE (group2) = group2_size;
  for (stmt_info = group2; stmt_info;
       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
    {
      DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }

  /* For the second group, the DR_GROUP_GAP is that before the original group,
     plus skipping over the first vector.  */
  DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;

  /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
  DR_GROUP_GAP (first_vinfo) += group2_size;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
		     group1_size, group2_size);

  return group2;
}
2757
2758 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2759 statements and a vector of NUNITS elements. */
2760
2761 static poly_uint64
2762 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2763 {
2764 return exact_div (common_multiple (nunits, group_size), group_size);
2765 }
2766
2767 /* Helper that checks to see if a node is a load node. */
2768
2769 static inline bool
2770 vect_is_slp_load_node (slp_tree root)
2771 {
2772 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2773 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2774 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2775 }
2776
2777
/* Helper function of optimize_load_redistribution that performs the operation
   recursively.  Returns the replacement (load) node for ROOT when ROOT is a
   VEC_PERM over plain load nodes, otherwise NULL after recursing into the
   children.  LOAD_MAP caches per-node results so each node is handled once.  */

static slp_tree
optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
				vec_info *vinfo, unsigned int group_size,
				hash_map<slp_tree, slp_tree> *load_map,
				slp_tree root)
{
  /* Re-use a previously computed replacement (or NULL marker).  */
  if (slp_tree *leader = load_map->get (root))
    return *leader;

  slp_tree node;
  unsigned i;

  /* For now, we don't know anything about externals so do not do anything.  */
  if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
    return NULL;
  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
    {
      /* First convert this node into a load node and add it to the leaves
         list and flatten the permute from a lane to a load one.  If it's
         unneeded it will be elided later.  */
      vec<stmt_vec_info> stmts;
      stmts.create (SLP_TREE_LANES (root));
      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
      for (unsigned j = 0; j < lane_perm.length (); j++)
	{
	  std::pair<unsigned, unsigned> perm = lane_perm[j];
	  node = SLP_TREE_CHILDREN (root)[perm.first];

	  /* Every selected child must be a childless load node, otherwise
	     the permute cannot be flattened into a load.  */
	  if (!vect_is_slp_load_node (node)
	      || SLP_TREE_CHILDREN (node).exists ())
	    {
	      stmts.release ();
	      goto next;
	    }

	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "converting stmts on permute node %p\n", root);

      /* Build (and CSE via BST_MAP) a load node for the gathered stmts.  */
      bool *matches = XALLOCAVEC (bool, group_size);
      poly_uint64 max_nunits = 1;
      unsigned tree_size = 0, limit = 1;
      node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
				  matches, &limit, &tree_size, bst_map);
      if (!node)
	stmts.release ();

      load_map->put (root, node);
      return node;
    }

next:
  /* ROOT stays in place; record that and process the children.  */
  load_map->put (root, NULL);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
    {
      slp_tree value
	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
					  node);
      if (value)
	{
	  /* Install the replacement and release the old child.  */
	  SLP_TREE_REF_COUNT (value)++;
	  SLP_TREE_CHILDREN (root)[i] = value;
	  /* ??? We know the original leafs of the replaced nodes will
	     be referenced by bst_map, only the permutes created by
	     pattern matching are not.  */
	  if (SLP_TREE_REF_COUNT (node) == 1)
	    load_map->remove (node);
	  vect_free_slp_tree (node);
	}
    }

  return NULL;
}
2858
2859 /* Temporary workaround for loads not being CSEd during SLP build. This
2860 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2861 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2862 same DR such that the final operation is equal to a permuted load. Such
2863 NODES are then directly converted into LOADS themselves. The nodes are
2864 CSEd using BST_MAP. */
2865
2866 static void
2867 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2868 vec_info *vinfo, unsigned int group_size,
2869 hash_map<slp_tree, slp_tree> *load_map,
2870 slp_tree root)
2871 {
2872 slp_tree node;
2873 unsigned i;
2874
2875 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2876 {
2877 slp_tree value
2878 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2879 node);
2880 if (value)
2881 {
2882 SLP_TREE_REF_COUNT (value)++;
2883 SLP_TREE_CHILDREN (root)[i] = value;
2884 /* ??? We know the original leafs of the replaced nodes will
2885 be referenced by bst_map, only the permutes created by
2886 pattern matching are not. */
2887 if (SLP_TREE_REF_COUNT (node) == 1)
2888 load_map->remove (node);
2889 vect_free_slp_tree (node);
2890 }
2891 }
2892 }
2893
/* Helper function of vect_match_slp_patterns.

   Attempts to match patterns against the slp tree rooted in REF_NODE using
   VINFO.  Patterns are matched in post-order traversal.

   If matching is successful the value in REF_NODE is updated and returned, if
   not then it is returned unchanged.  */

static bool
vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
			   slp_tree_to_load_perm_map_t *perm_cache,
			   hash_set<slp_tree> *visited)
{
  unsigned i;
  slp_tree node = *ref_node;
  bool found_p = false;
  /* Nothing to do for empty nodes or ones we already processed.  */
  if (!node || visited->add (node))
    return false;

  /* Post-order: first try to match patterns in the children ...  */
  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
					  vinfo, perm_cache, visited);

  /* ... then run each pattern recognizer on this node, in order.  */
  for (unsigned x = 0; x < num__slp_patterns; x++)
    {
      vect_pattern *pattern = slp_patterns[x] (perm_cache, ref_node);
      if (pattern)
	{
	  /* The recognizer matched; build the pattern into the tree
	     (this may replace *REF_NODE) and dispose of the object.  */
	  pattern->build (vinfo);
	  delete pattern;
	  found_p = true;
	}
    }

  return found_p;
}
2931
2932 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
2933 vec_info VINFO.
2934
2935 The modified tree is returned. Patterns are tried in order and multiple
2936 patterns may match. */
2937
2938 static bool
2939 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2940 hash_set<slp_tree> *visited,
2941 slp_tree_to_load_perm_map_t *perm_cache)
2942 {
2943 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2944 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2945
2946 if (dump_enabled_p ())
2947 dump_printf_loc (MSG_NOTE, vect_location,
2948 "Analyzing SLP tree %p for patterns\n",
2949 SLP_INSTANCE_TREE (instance));
2950
2951 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
2952 }
2953
/* STMT_INFO is a store group of size GROUP_SIZE that we are considering
   splitting into two, with the first split group having size NEW_GROUP_SIZE.
   Return true if we could use IFN_STORE_LANES instead and if that appears
   to be the better approach.  */

static bool
vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
			       unsigned int group_size,
			       unsigned int new_group_size)
{
  tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
  /* Without a vector type we cannot query store-lanes support.  */
  if (!vectype)
    return false;
  /* Allow the split if one of the two new groups would operate on full
     vectors *within* rather than across one scalar loop iteration.
     This is purely a heuristic, but it should work well for group
     sizes of 3 and 4, where the possible splits are:

       3->2+1:  OK if the vector has exactly two elements
       4->2+2:  Likewise
       4->3+1:  Less clear-cut.  */
  /* Returning false here means "don't prefer store-lanes", i.e. the
     split goes ahead.  */
  if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
      || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    return false;
  /* Otherwise prefer IFN_STORE_LANES if the target supports it for this
     vector type and the full group size.  */
  return vect_store_lanes_supported (vectype, group_size, false);
}
2981
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.

   Forward declaration: vect_build_slp_instance below recurses into
   this function when splitting store groups; the definition follows
   it.  */

static bool
vect_analyze_slp_instance (vec_info *vinfo,
			   scalar_stmts_to_slp_tree_map_t *bst_map,
			   stmt_vec_info stmt_info, slp_instance_kind kind,
			   unsigned max_tree_size, unsigned *limit);
2991
/* Analyze an SLP instance starting from SCALAR_STMTS which are a group
   of KIND.  Return true if successful.

   ROOT_STMT_INFOS becomes the new instance's SLP_INSTANCE_ROOT_STMTS.
   MAX_TREE_SIZE and LIMIT bound the size of the discovered SLP tree,
   BST_MAP caches already built subtrees keyed on their scalar stmts.
   STMT_INFO_ is the first store of the group when KIND is
   slp_inst_kind_store and is used for splitting the group when
   discovery fails; callers pass NULL for other kinds.  */

static bool
vect_build_slp_instance (vec_info *vinfo,
			 slp_instance_kind kind,
			 vec<stmt_vec_info> &scalar_stmts,
			 vec<stmt_vec_info> &root_stmt_infos,
			 unsigned max_tree_size, unsigned *limit,
			 scalar_stmts_to_slp_tree_map_t *bst_map,
			 /* ??? We need stmt_info for group splitting.  */
			 stmt_vec_info stmt_info_)
{
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Starting SLP discovery for\n");
      for (unsigned i = 0; i < scalar_stmts.length (); ++i)
	dump_printf_loc (MSG_NOTE, vect_location,
			 "  %G", scalar_stmts[i]->stmt);
    }

  /* Build the tree for the SLP instance.  */
  unsigned int group_size = scalar_stmts.length ();
  bool *matches = XALLOCAVEC (bool, group_size);
  poly_uint64 max_nunits = 1;
  unsigned tree_size = 0;
  unsigned i;
  slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
				       &max_nunits, matches, limit,
				       &tree_size, bst_map);
  if (node != NULL)
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      poly_uint64 unrolling_factor
	= calculate_unrolling_factor (max_nunits, group_size);

      if (maybe_ne (unrolling_factor, 1U)
	  && is_a <bb_vec_info> (vinfo))
	{
	  /* BB vectorization cannot unroll, so the group size has to be
	     a multiple of the vector element count.  */
	  unsigned HOST_WIDE_INT const_max_nunits;
	  if (!max_nunits.is_constant (&const_max_nunits)
	      || const_max_nunits > group_size)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: store group "
				 "size not a multiple of the vector size "
				 "in basic block SLP\n");
	      vect_free_slp_tree (node);
	      return false;
	    }
	  /* Fatal mismatch.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP discovery succeeded but node needs "
			     "splitting\n");
	  /* Fake a mismatch at the largest multiple of the vector size
	     so the store splitting code below splits the group there.  */
	  memset (matches, true, group_size);
	  matches[group_size / const_max_nunits * const_max_nunits] = false;
	  vect_free_slp_tree (node);
	}
      else
	{
	  /* Create a new SLP instance.  */
	  slp_instance new_instance = XNEW (class _slp_instance);
	  SLP_INSTANCE_TREE (new_instance) = node;
	  SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
	  SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
	  SLP_INSTANCE_KIND (new_instance) = kind;
	  new_instance->reduc_phis = NULL;
	  new_instance->cost_vec = vNULL;
	  new_instance->subgraph_entries = vNULL;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP size %u vs. limit %u.\n",
			     tree_size, max_tree_size);

	  /* Fixup SLP reduction chains.  */
	  if (kind == slp_inst_kind_reduc_chain)
	    {
	      /* If this is a reduction chain with a conversion in front
		 amend the SLP tree with a node for that.  */
	      gimple *scalar_def
		= vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
	      if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
		{
		  /* Get at the conversion stmt - we know it's the single use
		     of the last stmt of the reduction chain.  */
		  use_operand_p use_p;
		  bool r = single_imm_use (gimple_assign_lhs (scalar_def),
					   &use_p, &scalar_def);
		  gcc_assert (r);
		  stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
		  next_info = vect_stmt_to_vectorize (next_info);
		  /* Build a conversion node with the conversion stmt
		     replicated over all lanes and wrap the original
		     tree as its single child.  */
		  scalar_stmts = vNULL;
		  scalar_stmts.create (group_size);
		  for (unsigned i = 0; i < group_size; ++i)
		    scalar_stmts.quick_push (next_info);
		  slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
		  SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
		  SLP_TREE_CHILDREN (conv).quick_push (node);
		  SLP_INSTANCE_TREE (new_instance) = conv;
		  /* We also have to fake this conversion stmt as SLP reduction
		     group so we don't have to mess with too much code
		     elsewhere.  */
		  REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
		  REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
		}
	      /* Fill the backedge child of the PHI SLP node.  The
		 general matching code cannot find it because the
		 scalar code does not reflect how we vectorize the
		 reduction.  */
	      use_operand_p use_p;
	      imm_use_iterator imm_iter;
	      class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
				     gimple_get_lhs (scalar_def))
		/* There are exactly two non-debug uses, the reduction
		   PHI and the loop-closed PHI node.  */
		if (!is_gimple_debug (USE_STMT (use_p))
		    && gimple_bb (USE_STMT (use_p)) == loop->header)
		  {
		    /* Look up the PHI node's SLP node in the cache using
		       the replicated PHI stmt as key.  */
		    auto_vec<stmt_vec_info, 64> phis (group_size);
		    stmt_vec_info phi_info
		      = vinfo->lookup_stmt (USE_STMT (use_p));
		    for (unsigned i = 0; i < group_size; ++i)
		      phis.quick_push (phi_info);
		    slp_tree *phi_node = bst_map->get (phis);
		    unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
		    SLP_TREE_CHILDREN (*phi_node)[dest_idx]
		      = SLP_INSTANCE_TREE (new_instance);
		    SLP_INSTANCE_TREE (new_instance)->refcnt++;
		  }
	    }

	  vinfo->slp_instances.safe_push (new_instance);

	  /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
	     the number of scalar stmts in the root in a few places.
	     Verify that assumption holds.  */
	  gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
		      .length () == group_size);

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Final SLP tree for instance %p:\n", new_instance);
	      vect_print_slp_graph (MSG_NOTE, vect_location,
				    SLP_INSTANCE_TREE (new_instance));
	    }

	  return true;
	}
    }
  else
    {
      /* Failed to SLP.  */
      /* Free the allocated memory.  */
      scalar_stmts.release ();
    }

  stmt_vec_info stmt_info = stmt_info_;
  /* Try to break the group up into pieces.  */
  if (kind == slp_inst_kind_store)
    {
      /* ??? We could delay all the actual splitting of store-groups
	 until after SLP discovery of the original group completed.
	 Then we can recurse to vect_build_slp_instance directly.  */
      /* I is the first lane vect_build_slp_tree failed to match.  */
      for (i = 0; i < group_size; i++)
	if (!matches[i])
	  break;

      /* For basic block SLP, try to break the group up into multiples of
	 a vector size.  */
      if (is_a <bb_vec_info> (vinfo)
	  && (i > 1 && i < group_size))
	{
	  tree scalar_type
	    = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
	  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
						      1 << floor_log2 (i));
	  unsigned HOST_WIDE_INT const_nunits;
	  if (vectype
	      && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
	    {
	      /* Split into two groups at the first vector boundary.  */
	      gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
	      unsigned group1_size = i & ~(const_nunits - 1);

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Splitting SLP group at stmt %u\n", i);
	      stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
							       group1_size);
	      bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
						    kind, max_tree_size,
						    limit);
	      /* Split the rest at the failure point and possibly
		 re-analyze the remaining matching part if it has
		 at least two lanes.  */
	      if (group1_size < i
		  && (i + 1 < group_size
		      || i - group1_size > 1))
		{
		  stmt_vec_info rest2 = rest;
		  rest = vect_split_slp_store_group (rest, i - group1_size);
		  if (i - group1_size > 1)
		    res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
						      kind, max_tree_size,
						      limit);
		}
	      /* Re-analyze the non-matching tail if it has at least
		 two lanes.  */
	      if (i + 1 < group_size)
		res |= vect_analyze_slp_instance (vinfo, bst_map,
						  rest, kind, max_tree_size,
						  limit);
	      return res;
	    }
	}

      /* For loop vectorization split into arbitrary pieces of size > 1.  */
      if (is_a <loop_vec_info> (vinfo)
	  && (i > 1 && i < group_size)
	  && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
	{
	  unsigned group1_size = i;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Splitting SLP group at stmt %u\n", i);

	  stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
							   group1_size);
	  /* Loop vectorization cannot handle gaps in stores, make sure
	     the split group appears as strided.  */
	  STMT_VINFO_STRIDED_P (rest) = 1;
	  DR_GROUP_GAP (rest) = 0;
	  STMT_VINFO_STRIDED_P (stmt_info) = 1;
	  DR_GROUP_GAP (stmt_info) = 0;

	  bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
						kind, max_tree_size, limit);
	  if (i + 1 < group_size)
	    res |= vect_analyze_slp_instance (vinfo, bst_map,
					      rest, kind, max_tree_size, limit);

	  return res;
	}

      /* Even though the first vector did not all match, we might be able to SLP
	 (some) of the remainder.  FORNOW ignore this possibility.  */
    }

  /* Failed to SLP.  */
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
  return false;
}
3253
3254
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.

   STMT_INFO is the seed statement, interpreted according to KIND: the
   first store of a DR group, the head of a reduction chain, a vector
   CONSTRUCTOR assignment, or an element of loop_vinfo->reductions.
   This wrapper collects the group's scalar stmts and hands off to
   vect_build_slp_instance.  */

static bool
vect_analyze_slp_instance (vec_info *vinfo,
			   scalar_stmts_to_slp_tree_map_t *bst_map,
			   stmt_vec_info stmt_info,
			   slp_instance_kind kind,
			   unsigned max_tree_size, unsigned *limit)
{
  unsigned int i;
  vec<stmt_vec_info> scalar_stmts;

  if (is_a <bb_vec_info> (vinfo))
    vect_location = stmt_info->stmt;

  stmt_vec_info next_info = stmt_info;
  if (kind == slp_inst_kind_store)
    {
      /* Collect the stores and store them in scalar_stmts.  */
      scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
      while (next_info)
	{
	  scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
	  next_info = DR_GROUP_NEXT_ELEMENT (next_info);
	}
    }
  else if (kind == slp_inst_kind_reduc_chain)
    {
      /* Collect the reduction stmts and store them in scalar_stmts.  */
      scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
      while (next_info)
	{
	  scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
	  next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
	}
      /* Mark the first element of the reduction chain as reduction to properly
	 transform the node.  In the reduction analysis phase only the last
	 element of the chain is marked as reduction.  */
      STMT_VINFO_DEF_TYPE (stmt_info)
	= STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
      STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
	= STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
    }
  else if (kind == slp_inst_kind_ctor)
    {
      /* Collect the defs of the CONSTRUCTOR elements.  */
      tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
      tree val;
      scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
      FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
	{
	  stmt_vec_info def_info = vinfo->lookup_def (val);
	  def_info = vect_stmt_to_vectorize (def_info);
	  scalar_stmts.quick_push (def_info);
	}
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Analyzing vectorizable constructor: %G\n",
			 stmt_info->stmt);
    }
  else if (kind == slp_inst_kind_reduc_group)
    {
      /* Collect reduction statements.  */
      vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
      scalar_stmts.create (reductions.length ());
      for (i = 0; reductions.iterate (i, &next_info); i++)
	if (STMT_VINFO_RELEVANT_P (next_info)
	    || STMT_VINFO_LIVE_P (next_info))
	  scalar_stmts.quick_push (next_info);
      /* If less than two were relevant/live there's nothing to SLP.  */
      if (scalar_stmts.length () < 2)
	return false;
    }
  else
    gcc_unreachable ();

  /* For a CTOR the root is the CONSTRUCTOR assignment itself.  */
  vec<stmt_vec_info> roots = vNULL;
  if (kind == slp_inst_kind_ctor)
    {
      roots.create (1);
      roots.quick_push (stmt_info);
    }
  /* Build the tree for the SLP instance.  */
  bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
				      roots,
				      max_tree_size, limit, bst_map,
				      kind == slp_inst_kind_store
				      ? stmt_info : NULL);
  if (!res)
    roots.release ();

  /* ??? If this is slp_inst_kind_store and the above succeeded here's
     where we should do store group splitting.  */

  return res;
}
3352
/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.

   Seeds SLP discovery from grouped stores, BB vectorization roots,
   reduction chains and reduction groups, then runs SLP pattern
   matching over all discovered instances.  */

opt_result
vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
{
  unsigned int i;
  stmt_vec_info first_element;
  slp_instance instance;

  DUMP_VECT_SCOPE ("vect_analyze_slp");

  /* LIMIT is decremented by discovery; MAX_TREE_SIZE stays the
     per-instance bound.  */
  unsigned limit = max_tree_size;

  scalar_stmts_to_slp_tree_map_t *bst_map
    = new scalar_stmts_to_slp_tree_map_t ();

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    vect_analyze_slp_instance (vinfo, bst_map, first_element,
			       STMT_VINFO_GROUPED_ACCESS (first_element)
			       ? slp_inst_kind_store : slp_inst_kind_ctor,
			       max_tree_size, &limit);

  if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
    {
      for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
	{
	  vect_location = bb_vinfo->roots[i].roots[0]->stmt;
	  /* On success the instance takes ownership of the stmt and
	     root vectors, so clear them here.  */
	  if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
				       bb_vinfo->roots[i].stmts,
				       bb_vinfo->roots[i].roots,
				       max_tree_size, &limit, bst_map, NULL))
	    {
	      bb_vinfo->roots[i].stmts = vNULL;
	      bb_vinfo->roots[i].roots = vNULL;
	    }
	}
    }

  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    {
      /* Find SLP sequences starting from reduction chains.  */
      FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
	if (! STMT_VINFO_RELEVANT_P (first_element)
	    && ! STMT_VINFO_LIVE_P (first_element))
	  ;
	else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
					      slp_inst_kind_reduc_chain,
					      max_tree_size, &limit))
	  {
	    /* Dissolve reduction chain group.  */
	    /* Note: this local shadows the function's VINFO parameter.  */
	    stmt_vec_info vinfo = first_element;
	    stmt_vec_info last = NULL;
	    while (vinfo)
	      {
		stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
		REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
		REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
		last = vinfo;
		vinfo = next;
	      }
	    STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
	    /* It can be still vectorized as part of an SLP reduction.  */
	    loop_vinfo->reductions.safe_push (last);
	  }

      /* Find SLP sequences starting from groups of reductions.  */
      if (loop_vinfo->reductions.length () > 1)
	vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
				   slp_inst_kind_reduc_group, max_tree_size,
				   &limit);
    }

  hash_set<slp_tree> visited_patterns;
  slp_tree_to_load_perm_map_t perm_cache;

  /* See if any patterns can be found in the SLP tree.  */
  bool pattern_found = false;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    pattern_found |= vect_match_slp_patterns (instance, vinfo,
					      &visited_patterns, &perm_cache);

  /* If any were found optimize permutations of loads.  */
  if (pattern_found)
    {
      hash_map<slp_tree, slp_tree> load_map;
      FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
	{
	  slp_tree root = SLP_INSTANCE_TREE (instance);
	  optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
					&load_map, root);
	}
    }



  /* The map keeps a reference on SLP nodes built, release that.  */
  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
       it != bst_map->end (); ++it)
    if ((*it).second)
      vect_free_slp_tree ((*it).second);
  delete bst_map;

  if (pattern_found && dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Pattern matched SLP tree\n");
      hash_set<slp_tree> visited;
      FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
	vect_print_slp_graph (MSG_NOTE, vect_location,
			      SLP_INSTANCE_TREE (instance), visited);
    }

  return opt_result::success ();
}
3469
/* A vertex of the SLP graph built for permute optimization: wraps one
   SLP node together with the per-node propagation state used by
   vect_optimize_slp.  Permutes are referred to by index into a shared
   perms vector, with 0 meaning "no permute".  */

struct slpg_vertex
{
  slpg_vertex (slp_tree node_)
    : node (node_), visited (0), perm_out (0), materialize (0) {}

  /* The permute to account for on this node's input: the materialized
     one if set, otherwise the outgoing one.  */
  int get_perm_in () const { return materialize ? materialize : perm_out; }

  /* The SLP node this vertex represents.  */
  slp_tree node;
  /* Whether the node has been processed by the propagation.  */
  unsigned visited : 1;
  /* The permutation on the outgoing lanes (towards SLP parents).  */
  int perm_out;
  /* The permutation that is applied by this node.  perm_out is
     relative to this.  */
  int materialize;
};
3485
3486 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3487
3488 static void
3489 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
3490 vec<slpg_vertex> &vertices, vec<int> &leafs)
3491 {
3492 unsigned i;
3493 slp_tree child;
3494
3495 if (visited.add (node))
3496 return;
3497
3498 node->vertex = vertices.length ();
3499 vertices.safe_push (slpg_vertex (node));
3500
3501 bool leaf = true;
3502 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3503 if (child)
3504 {
3505 leaf = false;
3506 vect_slp_build_vertices (visited, child, vertices, leafs);
3507 }
3508 if (leaf)
3509 leafs.safe_push (node->vertex);
3510 }
3511
3512 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3513
3514 static void
3515 vect_slp_build_vertices (vec_info *info, vec<slpg_vertex> &vertices,
3516 vec<int> &leafs)
3517 {
3518 hash_set<slp_tree> visited;
3519 unsigned i;
3520 slp_instance instance;
3521 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
3522 {
3523 unsigned n_v = vertices.length ();
3524 unsigned n_l = leafs.length ();
3525 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
3526 leafs);
3527 /* If we added vertices but no entries to the reverse graph we've
3528 added a cycle that is not backwards-reachable. Push the entry
3529 to mimic as leaf then. */
3530 if (vertices.length () > n_v
3531 && leafs.length () == n_l)
3532 leafs.safe_push (SLP_INSTANCE_TREE (instance)->vertex);
3533 }
3534 }
3535
3536 /* Apply (reverse) bijectite PERM to VEC. */
3537
3538 template <class T>
3539 static void
3540 vect_slp_permute (vec<unsigned> perm,
3541 vec<T> &vec, bool reverse)
3542 {
3543 auto_vec<T, 64> saved;
3544 saved.create (vec.length ());
3545 for (unsigned i = 0; i < vec.length (); ++i)
3546 saved.quick_push (vec[i]);
3547
3548 if (reverse)
3549 {
3550 for (unsigned i = 0; i < vec.length (); ++i)
3551 vec[perm[i]] = saved[i];
3552 for (unsigned i = 0; i < vec.length (); ++i)
3553 gcc_assert (vec[perm[i]] == saved[i]);
3554 }
3555 else
3556 {
3557 for (unsigned i = 0; i < vec.length (); ++i)
3558 vec[i] = saved[perm[i]];
3559 for (unsigned i = 0; i < vec.length (); ++i)
3560 gcc_assert (vec[i] == saved[perm[i]]);
3561 }
3562 }
3563
3564 /* Return whether permutations PERM_A and PERM_B as recorded in the
3565 PERMS vector are equal. */
3566
3567 static bool
3568 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
3569 int perm_a, int perm_b)
3570 {
3571 return (perm_a == perm_b
3572 || (perms[perm_a].length () == perms[perm_b].length ()
3573 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
3574 sizeof (unsigned) * perms[perm_a].length ()) == 0));
3575 }
3576
3577 /* Optimize the SLP graph of VINFO. */
3578
3579 void
3580 vect_optimize_slp (vec_info *vinfo)
3581 {
3582 if (vinfo->slp_instances.is_empty ())
3583 return;
3584
3585 slp_tree node;
3586 unsigned i;
3587 auto_vec<slpg_vertex> vertices;
3588 auto_vec<int> leafs;
3589 vect_slp_build_vertices (vinfo, vertices, leafs);
3590
3591 struct graph *slpg = new_graph (vertices.length ());
3592 for (slpg_vertex &v : vertices)
3593 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
3594 if (child)
3595 add_edge (slpg, v.node->vertex, child->vertex);
3596
3597 /* Compute (reverse) postorder on the inverted graph. */
3598 auto_vec<int> ipo;
3599 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3600
3601 auto_vec<vec<unsigned> > perms;
3602 perms.safe_push (vNULL); /* zero is no permute */
3603
3604 /* Produce initial permutations. */
3605 for (i = 0; i < leafs.length (); ++i)
3606 {
3607 int idx = leafs[i];
3608 slp_tree node = vertices[idx].node;
3609
3610 /* Handle externals and constants optimistically throughout the
3611 iteration. */
3612 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3613 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3614 continue;
3615
3616 /* Leafs do not change across iterations. Note leafs also double
3617 as entries to the reverse graph. */
3618 if (!slpg->vertices[idx].succ)
3619 vertices[idx].visited = 1;
3620 /* Loads are the only thing generating permutes. */
3621 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3622 continue;
3623
3624 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3625 node unpermuted, record this permute. */
3626 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3627 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3628 continue;
3629 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3630 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3631 bool any_permute = false;
3632 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3633 {
3634 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3635 imin = MIN (imin, idx);
3636 imax = MAX (imax, idx);
3637 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3638 any_permute = true;
3639 }
3640 /* If there's no permute no need to split one out. */
3641 if (!any_permute)
3642 continue;
3643 /* If the span doesn't match we'd disrupt VF computation, avoid
3644 that for now. */
3645 if (imax - imin + 1 != SLP_TREE_LANES (node))
3646 continue;
3647
3648 /* For now only handle true permutes, like
3649 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3650 when permuting constants and invariants keeping the permute
3651 bijective. */
3652 auto_sbitmap load_index (SLP_TREE_LANES (node));
3653 bitmap_clear (load_index);
3654 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3655 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3656 unsigned j;
3657 for (j = 0; j < SLP_TREE_LANES (node); ++j)
3658 if (!bitmap_bit_p (load_index, j))
3659 break;
3660 if (j != SLP_TREE_LANES (node))
3661 continue;
3662
3663 vec<unsigned> perm = vNULL;
3664 perm.safe_grow (SLP_TREE_LANES (node), true);
3665 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3666 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
3667 perms.safe_push (perm);
3668 vertices[idx].perm_out = perms.length () - 1;
3669 }
3670
3671 /* Propagate permutes along the graph and compute materialization points. */
3672 bool changed;
3673 unsigned iteration = 0;
3674 do
3675 {
3676 changed = false;
3677 ++iteration;
3678
3679 for (i = vertices.length (); i > 0 ; --i)
3680 {
3681 int idx = ipo[i-1];
3682 slp_tree node = vertices[idx].node;
3683
3684 /* Handle externals and constants optimistically throughout the
3685 iteration. */
3686 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3687 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3688 continue;
3689
3690 vertices[idx].visited = 1;
3691
3692 /* We do not handle stores with a permutation. */
3693 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
3694 if (STMT_VINFO_DATA_REF (rep)
3695 && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
3696 continue;
3697 /* We cannot move a permute across an operation that is
3698 not independent on lanes. Note this is an explicit
3699 negative list since that's much shorter than the respective
3700 positive one but it's critical to keep maintaining it. */
3701 if (is_gimple_call (STMT_VINFO_STMT (rep)))
3702 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
3703 {
3704 case CFN_COMPLEX_ADD_ROT90:
3705 case CFN_COMPLEX_ADD_ROT270:
3706 case CFN_COMPLEX_MUL:
3707 case CFN_COMPLEX_MUL_CONJ:
3708 case CFN_VEC_ADDSUB:
3709 continue;
3710 default:;
3711 }
3712
3713 int perm = -1;
3714 for (graph_edge *succ = slpg->vertices[idx].succ;
3715 succ; succ = succ->succ_next)
3716 {
3717 int succ_idx = succ->dest;
3718 /* Handle unvisited nodes optimistically. */
3719 /* ??? But for constants once we want to handle non-bijective
3720 permutes we have to verify the permute, when unifying lanes,
3721 will not unify different constants. For example see
3722 gcc.dg/vect/bb-slp-14.c for a case that would break. */
3723 if (!vertices[succ_idx].visited)
3724 continue;
3725 int succ_perm = vertices[succ_idx].perm_out;
3726 if (perm == -1)
3727 perm = succ_perm;
3728 else if (succ_perm == 0)
3729 {
3730 perm = 0;
3731 break;
3732 }
3733 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
3734 {
3735 perm = 0;
3736 break;
3737 }
3738 }
3739
3740 if (perm == -1)
3741 /* Pick up pre-computed leaf values. */
3742 perm = vertices[idx].perm_out;
3743 else if (!vect_slp_perms_eq (perms, perm,
3744 vertices[idx].get_perm_in ()))
3745 {
3746 if (iteration > 1)
3747 /* Make sure we eventually converge. */
3748 gcc_checking_assert (perm == 0);
3749 if (perm == 0)
3750 {
3751 vertices[idx].perm_out = 0;
3752 vertices[idx].materialize = 0;
3753 }
3754 if (!vertices[idx].materialize)
3755 vertices[idx].perm_out = perm;
3756 changed = true;
3757 }
3758
3759 if (perm == 0)
3760 continue;
3761
3762 /* Elide pruning at materialization points in the first
3763 iteration so every node was visited once at least. */
3764 if (iteration == 1)
3765 continue;
3766
3767 /* Decide on permute materialization. Look whether there's
3768 a use (pred) edge that is permuted differently than us.
3769 In that case mark ourselves so the permutation is applied.
3770 For VEC_PERM_EXPRs the permutation doesn't carry along
3771 from children to parents so force materialization at the
3772 point of the VEC_PERM_EXPR. In principle VEC_PERM_EXPRs
3773 are a source of an arbitrary permutation again, similar
3774 to constants/externals - that's something we do not yet
3775 optimally handle. */
3776 bool all_preds_permuted = (SLP_TREE_CODE (node) != VEC_PERM_EXPR
3777 && slpg->vertices[idx].pred != NULL);
3778 if (all_preds_permuted)
3779 for (graph_edge *pred = slpg->vertices[idx].pred;
3780 pred; pred = pred->pred_next)
3781 {
3782 gcc_checking_assert (vertices[pred->src].visited);
3783 int pred_perm = vertices[pred->src].get_perm_in ();
3784 if (!vect_slp_perms_eq (perms, perm, pred_perm))
3785 {
3786 all_preds_permuted = false;
3787 break;
3788 }
3789 }
3790 if (!all_preds_permuted)
3791 {
3792 if (!vertices[idx].materialize)
3793 changed = true;
3794 vertices[idx].materialize = perm;
3795 vertices[idx].perm_out = 0;
3796 }
3797 }
3798 }
3799 while (changed || iteration == 1);
3800
3801 /* Materialize. */
3802 for (i = 0; i < vertices.length (); ++i)
3803 {
3804 int perm = vertices[i].get_perm_in ();
3805 if (perm <= 0)
3806 continue;
3807
3808 slp_tree node = vertices[i].node;
3809
3810 /* First permute invariant/external original successors. */
3811 unsigned j;
3812 slp_tree child;
3813 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3814 {
3815 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3816 continue;
3817
3818 /* If the vector is uniform there's nothing to do. */
3819 if (vect_slp_tree_uniform_p (child))
3820 continue;
3821
3822 /* We can end up sharing some externals via two_operator
3823 handling. Be prepared to unshare those. */
3824 if (child->refcnt != 1)
3825 {
3826 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3827 SLP_TREE_CHILDREN (node)[j] = child
3828 = vect_create_new_slp_node
3829 (SLP_TREE_SCALAR_OPS (child).copy ());
3830 }
3831 vect_slp_permute (perms[perm],
3832 SLP_TREE_SCALAR_OPS (child), true);
3833 }
3834
3835 if (vertices[i].materialize)
3836 {
3837 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3838 /* For loads simply drop the permutation, the load permutation
3839 already performs the desired permutation. */
3840 ;
3841 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3842 {
3843 /* If the node is already a permute node we can apply
3844 the permutation to the lane selection, effectively
3845 materializing it on the incoming vectors. */
3846 if (dump_enabled_p ())
3847 dump_printf_loc (MSG_NOTE, vect_location,
3848 "simplifying permute node %p\n",
3849 node);
3850
3851 for (unsigned k = 0;
3852 k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3853 SLP_TREE_LANE_PERMUTATION (node)[k].second
3854 = perms[perm][SLP_TREE_LANE_PERMUTATION (node)[k].second];
3855 }
3856 else
3857 {
3858 if (dump_enabled_p ())
3859 dump_printf_loc (MSG_NOTE, vect_location,
3860 "inserting permute node in place of %p\n",
3861 node);
3862
3863 /* Make a copy of NODE and in-place change it to a
3864 VEC_PERM node to permute the lanes of the copy. */
3865 slp_tree copy = new _slp_tree;
3866 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3867 SLP_TREE_CHILDREN (node) = vNULL;
3868 SLP_TREE_SCALAR_STMTS (copy)
3869 = SLP_TREE_SCALAR_STMTS (node).copy ();
3870 vect_slp_permute (perms[perm],
3871 SLP_TREE_SCALAR_STMTS (copy), true);
3872 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3873 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3874 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3875 SLP_TREE_LANE_PERMUTATION (copy)
3876 = SLP_TREE_LANE_PERMUTATION (node);
3877 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3878 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3879 copy->refcnt = 1;
3880 copy->max_nunits = node->max_nunits;
3881 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3882 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3883 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3884
3885 /* Now turn NODE into a VEC_PERM. */
3886 SLP_TREE_CHILDREN (node).safe_push (copy);
3887 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3888 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3889 SLP_TREE_LANE_PERMUTATION (node)
3890 .quick_push (std::make_pair (0, perms[perm][j]));
3891 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3892 }
3893 }
3894 else
3895 {
3896 /* Apply the reverse permutation to our stmts. */
3897 vect_slp_permute (perms[perm],
3898 SLP_TREE_SCALAR_STMTS (node), true);
3899 /* And to the load permutation, which we can simply
3900 make regular by design. */
3901 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3902 {
3903 /* ??? When we handle non-bijective permutes the idea
3904 is that we can force the load-permutation to be
3905 { min, min + 1, min + 2, ... max }. But then the
3906 scalar defs might no longer match the lane content
3907 which means wrong-code with live lane vectorization.
3908 So we possibly have to have NULL entries for those. */
3909 vect_slp_permute (perms[perm],
3910 SLP_TREE_LOAD_PERMUTATION (node), true);
3911 }
3912 }
3913 }
3914
3915 /* Free the perms vector used for propagation. */
3916 while (!perms.is_empty ())
3917 perms.pop ().release ();
3918 free_graph (slpg);
3919
3920
3921 /* Now elide load permutations that are not necessary. */
3922 for (i = 0; i < leafs.length (); ++i)
3923 {
3924 node = vertices[leafs[i]].node;
3925 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3926 continue;
3927
3928 /* In basic block vectorization we allow any subchain of an interleaving
3929 chain.
3930 FORNOW: not in loop SLP because of realignment complications. */
3931 if (is_a <bb_vec_info> (vinfo))
3932 {
3933 bool subchain_p = true;
3934 stmt_vec_info next_load_info = NULL;
3935 stmt_vec_info load_info;
3936 unsigned j;
3937 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3938 {
3939 if (j != 0
3940 && (next_load_info != load_info
3941 || DR_GROUP_GAP (load_info) != 1))
3942 {
3943 subchain_p = false;
3944 break;
3945 }
3946 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3947 }
3948 if (subchain_p)
3949 {
3950 SLP_TREE_LOAD_PERMUTATION (node).release ();
3951 continue;
3952 }
3953 }
3954 else
3955 {
3956 stmt_vec_info load_info;
3957 bool this_load_permuted = false;
3958 unsigned j;
3959 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3960 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3961 {
3962 this_load_permuted = true;
3963 break;
3964 }
3965 stmt_vec_info first_stmt_info
3966 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3967 if (!this_load_permuted
3968 /* The load requires permutation when unrolling exposes
3969 a gap either because the group is larger than the SLP
3970 group-size or because there is a gap between the groups. */
3971 && (known_eq (LOOP_VINFO_VECT_FACTOR
3972 (as_a <loop_vec_info> (vinfo)), 1U)
3973 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3974 && DR_GROUP_GAP (first_stmt_info) == 0)))
3975 {
3976 SLP_TREE_LOAD_PERMUTATION (node).release ();
3977 continue;
3978 }
3979 }
3980 }
3981
3982 /* And any permutations of BB reductions. */
3983 if (is_a <bb_vec_info> (vinfo))
3984 {
3985 for (slp_instance instance : vinfo->slp_instances)
3986 {
3987 if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
3988 continue;
3989 slp_tree old = SLP_INSTANCE_TREE (instance);
3990 if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
3991 && SLP_TREE_CHILDREN (old).length () == 1)
3992 {
3993 slp_tree child = SLP_TREE_CHILDREN (old)[0];
3994 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
3995 {
3996 /* Preserve the special VEC_PERM we use to shield existing
3997 vector defs from the rest. But make it a no-op. */
3998 unsigned i = 0;
3999 for (std::pair<unsigned, unsigned> &p
4000 : SLP_TREE_LANE_PERMUTATION (old))
4001 p.second = i++;
4002 }
4003 else
4004 {
4005 SLP_INSTANCE_TREE (instance) = child;
4006 SLP_TREE_REF_COUNT (child)++;
4007 vect_free_slp_tree (old);
4008 }
4009 }
4010 else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
4011 && SLP_TREE_REF_COUNT (old) == 1)
4012 {
4013 /* ??? For loads the situation is more complex since
4014 we can't modify the permute in place in case the
4015 node is used multiple times. In fact for loads this
4016 should be somehow handled in the propagation engine. */
4017 auto fn = [] (const void *a, const void *b)
4018 { return *(const int *)a - *(const int *)b; };
4019 SLP_TREE_LOAD_PERMUTATION (old).qsort (fn);
4020 }
4021 }
4022 }
4023 }
4024
4025 /* Gather loads reachable from the individual SLP graph entries. */
4026
4027 void
4028 vect_gather_slp_loads (vec_info *vinfo)
4029 {
4030 unsigned i;
4031 slp_instance instance;
4032 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
4033 {
4034 hash_set<slp_tree> visited;
4035 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
4036 SLP_INSTANCE_TREE (instance), visited);
4037 }
4038 }
4039
4040
4041 /* For each possible SLP instance decide whether to SLP it and calculate overall
4042 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
4043 least one instance. */
4044
4045 bool
4046 vect_make_slp_decision (loop_vec_info loop_vinfo)
4047 {
4048 unsigned int i;
4049 poly_uint64 unrolling_factor = 1;
4050 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
4051 slp_instance instance;
4052 int decided_to_slp = 0;
4053
4054 DUMP_VECT_SCOPE ("vect_make_slp_decision");
4055
4056 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4057 {
4058 /* FORNOW: SLP if you can. */
4059 /* All unroll factors have the form:
4060
4061 GET_MODE_SIZE (vinfo->vector_mode) * X
4062
4063 for some rational X, so they must have a common multiple. */
4064 unrolling_factor
4065 = force_common_multiple (unrolling_factor,
4066 SLP_INSTANCE_UNROLLING_FACTOR (instance));
4067
4068 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
4069 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
4070 loop-based vectorization. Such stmts will be marked as HYBRID. */
4071 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4072 decided_to_slp++;
4073 }
4074
4075 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
4076
4077 if (decided_to_slp && dump_enabled_p ())
4078 {
4079 dump_printf_loc (MSG_NOTE, vect_location,
4080 "Decided to SLP %d instances. Unrolling factor ",
4081 decided_to_slp);
4082 dump_dec (MSG_NOTE, unrolling_factor);
4083 dump_printf (MSG_NOTE, "\n");
4084 }
4085
4086 return (decided_to_slp > 0);
4087 }
4088
/* Private data for vect_detect_hybrid_slp.  */
struct vdhs_data
{
  /* The loop_vec_info the walk operates on.  */
  loop_vec_info loop_vinfo;
  /* Worklist the walker callback pushes newly discovered hybrid
     stmts onto for further processing.  */
  vec<stmt_vec_info> *worklist;
};
4095
4096 /* Walker for walk_gimple_op. */
4097
4098 static tree
4099 vect_detect_hybrid_slp (tree *tp, int *, void *data)
4100 {
4101 walk_stmt_info *wi = (walk_stmt_info *)data;
4102 vdhs_data *dat = (vdhs_data *)wi->info;
4103
4104 if (wi->is_lhs)
4105 return NULL_TREE;
4106
4107 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
4108 if (!def_stmt_info)
4109 return NULL_TREE;
4110 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
4111 if (PURE_SLP_STMT (def_stmt_info))
4112 {
4113 if (dump_enabled_p ())
4114 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
4115 def_stmt_info->stmt);
4116 STMT_SLP_TYPE (def_stmt_info) = hybrid;
4117 dat->worklist->safe_push (def_stmt_info);
4118 }
4119
4120 return NULL_TREE;
4121 }
4122
/* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
   if so, otherwise pushing it to WORKLIST.  STMT_INFO is only pushed
   when one of its defs has a use outside of the loop or a use by a
   stmt that is not going to be SLP vectorized.  */

static void
maybe_push_to_hybrid_worklist (vec_info *vinfo,
			       vec<stmt_vec_info> &worklist,
			       stmt_vec_info stmt_info)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Processing hybrid candidate : %G", stmt_info->stmt);
  /* Walk the defs of the original stmt - for pattern stmts the
     original is the one present in the IL with immediate uses.  */
  stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
  imm_use_iterator iter2;
  ssa_op_iter iter1;
  use_operand_p use_p;
  def_operand_p def_p;
  bool any_def = false;
  FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
    {
      any_def = true;
      FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
	{
	  if (is_gimple_debug (USE_STMT (use_p)))
	    continue;
	  stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
	  /* An out-of loop use means this is a loop_vect sink.  */
	  if (!use_info)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Found loop_vect sink: %G", stmt_info->stmt);
	      worklist.safe_push (stmt_info);
	      return;
	    }
	  /* A use by a stmt that is not SLP vectorized keeps this
	     stmt loop_vect as well.  */
	  else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Found loop_vect use: %G", use_info->stmt);
	      worklist.safe_push (stmt_info);
	      return;
	    }
	}
    }
  /* No def means this is a loop_vect sink.  */
  if (!any_def)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Found loop_vect sink: %G", stmt_info->stmt);
      worklist.safe_push (stmt_info);
      return;
    }
  /* All uses are consumed by SLP stmts, so the stmt itself can become
     pure_slp and does not need to be retained as scalar code.  */
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
  STMT_SLP_TYPE (stmt_info) = pure_slp;
}
4181
/* Find stmts that must be both vectorized and SLPed.  */

void
vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
{
  DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");

  /* All stmts participating in SLP are marked pure_slp, all other
     stmts are loop_vect.
     First collect all loop_vect stmts into a worklist.
     SLP patterns cause not all original scalar stmts to appear in
     SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
     Rectify this here and do a backward walk over the IL only considering
     stmts as loop_vect when they are used by a loop_vect stmt and otherwise
     mark them as pure_slp.  */
  auto_vec<stmt_vec_info> worklist;
  for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
    {
      basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
      /* PHIs are not reached by the backward stmt walk below, handle
	 them separately.  */
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
	    maybe_push_to_hybrid_worklist (loop_vinfo,
					   worklist, stmt_info);
	}
      /* Walk the stmts backward so uses are processed before defs.  */
      for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
	   gsi_prev (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (is_gimple_debug (stmt))
	    continue;
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
	    {
	      /* For pattern stmts process the stmts of the pattern def
		 sequence as well ...  */
	      for (gimple_stmt_iterator gsi2
		     = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
		   !gsi_end_p (gsi2); gsi_next (&gsi2))
		{
		  stmt_vec_info patt_info
		    = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
		  if (!STMT_SLP_TYPE (patt_info)
		      && STMT_VINFO_RELEVANT (patt_info))
		    maybe_push_to_hybrid_worklist (loop_vinfo,
						   worklist, patt_info);
		}
	      /* ... and continue with the main pattern stmt in place of
		 the original.  */
	      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
	    }
	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
	    maybe_push_to_hybrid_worklist (loop_vinfo,
					   worklist, stmt_info);
	}
    }

  /* Now we have a worklist of non-SLP stmts, follow use->def chains and
     mark any SLP vectorized stmt as hybrid.
     ??? We're visiting def stmts N times (once for each non-SLP and
     once for each hybrid-SLP use).  */
  walk_stmt_info wi;
  vdhs_data dat;
  dat.worklist = &worklist;
  dat.loop_vinfo = loop_vinfo;
  memset (&wi, 0, sizeof (wi));
  wi.info = (void *)&dat;
  while (!worklist.is_empty ())
    {
      stmt_vec_info stmt_info = worklist.pop ();
      /* Since SSA operands are not set up for pattern stmts we need
	 to use walk_gimple_op.  */
      wi.is_lhs = 0;
      walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
    }
}
4257
4258
/* Initialize a bb_vec_info struct for the statements in BBS basic blocks.
   Marks all stmts in the region with UID zero and registers them with
   the vec_info; the destructor resets the UIDs to -1 again.  */

_bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
  : vec_info (vec_info::bb, init_cost (NULL, false), shared),
    bbs (_bbs),
    roots (vNULL)
{
  for (unsigned i = 0; i < bbs.length (); ++i)
    {
      /* PHIs of the first block are skipped — presumably their defs are
	 inputs to the region rather than part of it; the destructor
	 mirrors this (NOTE(review): confirm against region construction).  */
      if (i != 0)
	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	     gsi_next (&si))
	  {
	    gphi *phi = si.phi ();
	    gimple_set_uid (phi, 0);
	    add_stmt (phi);
	  }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, 0);
	  /* Debug stmts are marked but get no stmt_info.  */
	  if (is_gimple_debug (stmt))
	    continue;
	  add_stmt (stmt);
	}
    }
}
4287
4288
4289 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
4290 stmts in the basic block. */
4291
4292 _bb_vec_info::~_bb_vec_info ()
4293 {
4294 /* Reset region marker. */
4295 for (unsigned i = 0; i < bbs.length (); ++i)
4296 {
4297 if (i != 0)
4298 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4299 gsi_next (&si))
4300 {
4301 gphi *phi = si.phi ();
4302 gimple_set_uid (phi, -1);
4303 }
4304 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4305 !gsi_end_p (gsi); gsi_next (&gsi))
4306 {
4307 gimple *stmt = gsi_stmt (gsi);
4308 gimple_set_uid (stmt, -1);
4309 }
4310 }
4311
4312 for (unsigned i = 0; i < roots.length (); ++i)
4313 {
4314 roots[i].stmts.release ();
4315 roots[i].roots.release ();
4316 }
4317 roots.release ();
4318 }
4319
/* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
   given then that child nodes have already been processed, and that
   their def types currently match their SLP node's def type.  */

static bool
vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
				    slp_instance node_instance,
				    stmt_vector_for_cost *cost_vec)
{
  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);

  /* Calculate the number of vector statements to be created for the
     scalar stmts in this node.  For SLP reductions it is equal to the
     number of vector statements in the children (which has already been
     calculated by the recursive call).  Otherwise it is the number of
     scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
     VF divided by the number of elements in a vector.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      /* Take the count from the first internal-def child.  */
      for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
	if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
	  {
	    SLP_TREE_NUMBER_OF_VEC_STMTS (node)
	      = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
	    break;
	  }
    }
  else
    {
      /* BB vectorization has an implicit VF of one.  */
      poly_uint64 vf;
      if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
	vf = loop_vinfo->vectorization_factor;
      else
	vf = 1;
      unsigned int group_size = SLP_TREE_LANES (node);
      tree vectype = SLP_TREE_VECTYPE (node);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node)
	= vect_get_num_vectors (vf * group_size, vectype);
    }

  /* Handle purely internal nodes.  */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);

  gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
  /* For BB vectorization all participating stmts have to agree on the
     vector type; fail if NODE's desired type conflicts.  */
  if (is_a <bb_vec_info> (vinfo)
      && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "desired vector type conflicts with earlier one "
			 "for %G", stmt_info->stmt);
      return false;
    }

  bool dummy;
  return vect_analyze_stmt (vinfo, stmt_info, &dummy,
			    node, node_instance, cost_vec);
}
4380
4381 /* Try to build NODE from scalars, returning true on success.
4382 NODE_INSTANCE is the SLP instance that contains NODE. */
4383
4384 static bool
4385 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
4386 slp_instance node_instance)
4387 {
4388 stmt_vec_info stmt_info;
4389 unsigned int i;
4390
4391 if (!is_a <bb_vec_info> (vinfo)
4392 || node == SLP_INSTANCE_TREE (node_instance)
4393 || !SLP_TREE_SCALAR_STMTS (node).exists ()
4394 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
4395 return false;
4396
4397 if (dump_enabled_p ())
4398 dump_printf_loc (MSG_NOTE, vect_location,
4399 "Building vector operands of %p from scalars instead\n", node);
4400
4401 /* Don't remove and free the child nodes here, since they could be
4402 referenced by other structures. The analysis and scheduling phases
4403 (need to) ignore child nodes of anything that isn't vect_internal_def. */
4404 unsigned int group_size = SLP_TREE_LANES (node);
4405 SLP_TREE_DEF_TYPE (node) = vect_external_def;
4406 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
4407 SLP_TREE_LOAD_PERMUTATION (node).release ();
4408 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4409 {
4410 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
4411 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
4412 }
4413 return true;
4414 }
4415
/* Compute the prologue cost for invariant or constant operands represented
   by NODE.  */

static void
vect_prologue_cost_for_slp (slp_tree node,
			    stmt_vector_for_cost *cost_vec)
{
  /* There's a special case of an existing vector, that costs nothing.  */
  if (SLP_TREE_SCALAR_OPS (node).length () == 0
      && !SLP_TREE_VEC_DEFS (node).is_empty ())
    return;
  /* Without looking at the actual initializer a vector of
     constants can be implemented as load from the constant pool.
     When all elements are the same we can use a splat.  */
  tree vectype = SLP_TREE_VECTYPE (node);
  unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
  unsigned num_vects_to_check;
  unsigned HOST_WIDE_INT const_nunits;
  unsigned nelt_limit;
  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
      && ! multiple_p (const_nunits, group_size))
    {
      /* The group does not fill whole vectors, so each generated
	 vector gets a distinct element mix - cost them all.  */
      num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
      nelt_limit = const_nunits;
    }
  else
    {
      /* If either the vector has variable length or the vectors
	 are composed of repeated whole groups we only need to
	 cost construction once.  All vectors will be the same.  */
      num_vects_to_check = 1;
      nelt_limit = group_size;
    }
  /* ELT tracks the common operand of the current vector, or NULL_TREE
     once two differing operands have been seen; NELT counts lanes of
     the vector being filled.  */
  tree elt = NULL_TREE;
  unsigned nelt = 0;
  for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
    {
      unsigned si = j % group_size;
      if (nelt == 0)
	elt = SLP_TREE_SCALAR_OPS (node)[si];
      /* ??? We're just tracking whether all operands of a single
	 vector initializer are the same, ideally we'd check if
	 we emitted the same one already.  */
      else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
	elt = NULL_TREE;
      nelt++;
      if (nelt == nelt_limit)
	{
	  /* External defs cost a splat (all lanes equal) or a general
	     vector construction; constants cost a constant-pool load.  */
	  record_stmt_cost (cost_vec, 1,
			    SLP_TREE_DEF_TYPE (node) == vect_external_def
			    ? (elt ? scalar_to_vec : vec_construct)
			    : vector_load,
			    NULL, vectype, 0, vect_prologue);
	  nelt = 0;
	}
    }
}
4473
/* Analyze statements contained in SLP tree NODE after recursively analyzing
   the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
   VISITED_SET and VISITED_VEC track nodes analyzed so far; entries pushed
   during a failed analysis are removed again so another instance can
   retry them.  COST_VEC accumulates the cost of the analyzed stmts.

   Return true if the operations are supported.  */

static bool
vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
				  slp_instance node_instance,
				  hash_set<slp_tree> &visited_set,
				  vec<slp_tree> &visited_vec,
				  stmt_vector_for_cost *cost_vec)
{
  int i, j;
  slp_tree child;

  /* Assume we can code-generate all invariants.  */
  if (!node
      || SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    return true;

  if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Failed cyclic SLP reference in %p\n", node);
      return false;
    }
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);

  /* If we already analyzed the exact same set of scalar stmts we're done.
     We share the generated vector stmts for those.  */
  if (visited_set.add (node))
    return true;
  visited_vec.safe_push (node);

  bool res = true;
  /* Remember the rollback points - everything pushed/recorded from here
     on is undone if analysis of this subtree fails.  */
  unsigned visited_rec_start = visited_vec.length ();
  unsigned cost_vec_rec_start = cost_vec->length ();
  bool seen_non_constant_child = false;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
					      visited_set, visited_vec,
					      cost_vec);
      if (!res)
	break;
      if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
	seen_non_constant_child = true;
    }
  /* We're having difficulties scheduling nodes with just constant
     operands and no scalar stmts since we then cannot compute a stmt
     insertion place.  */
  if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Cannot vectorize all-constant op node %p\n", node);
      res = false;
    }

  if (res)
    res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
					      cost_vec);
  /* If analysis failed we have to pop all recursive visited nodes
     plus ourselves.  Note the >= pops down to one below
     VISITED_REC_START, which removes NODE itself as well.  */
  if (!res)
    {
      while (visited_vec.length () >= visited_rec_start)
	visited_set.remove (visited_vec.pop ());
      cost_vec->truncate (cost_vec_rec_start);
    }

  /* When the node can be vectorized cost invariant nodes it references.
     This is not done in DFS order to allow the referring node
     vectorizable_* calls to nail down the invariant nodes vector type
     and possibly unshare it if it needs a different vector type than
     other referrers.  */
  if (res)
    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
      if (child
	  && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (child) == vect_external_def)
	  /* Perform usual caching, note code-generation still
	     code-gens these nodes multiple times but we expect
	     to CSE them later.  */
	  && !visited_set.add (child))
	{
	  visited_vec.safe_push (child);
	  /* ??? After auditing more code paths make a "default"
	     and push the vector type from NODE to all children
	     if it is not already set.  */
	  /* Compute the number of vectors to be generated.  */
	  tree vector_type = SLP_TREE_VECTYPE (child);
	  if (!vector_type)
	    {
	      /* For shifts with a scalar argument we don't need
		 to cost or code-generate anything.
		 ??? Represent this more explicitly.  */
	      gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
			   == shift_vec_info_type)
			  && j == 1);
	      continue;
	    }
	  unsigned group_size = SLP_TREE_LANES (child);
	  poly_uint64 vf = 1;
	  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
	    vf = loop_vinfo->vectorization_factor;
	  SLP_TREE_NUMBER_OF_VEC_STMTS (child)
	    = vect_get_num_vectors (vf * group_size, vector_type);
	  /* And cost them.  */
	  vect_prologue_cost_for_slp (child, cost_vec);
	}

  /* If this node or any of its children can't be vectorized, try pruning
     the tree here rather than felling the whole thing.  */
  if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
    {
      /* We'll need to revisit this for invariant costing and number
	 of vectorized stmt setting.  */
      res = true;
    }

  return res;
}
4599
/* Mark lanes of NODE that are live outside of the basic-block vectorized
   region and that can be vectorized using vectorizable_live_operation
   with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
   scalar code computing it to be retained.  SVISITED tracks scalar stmts
   already decided upon; VISITED tracks SLP nodes already walked.  */

static void
vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
			     slp_instance instance,
			     stmt_vector_for_cost *cost_vec,
			     hash_set<stmt_vec_info> &svisited,
			     hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  unsigned i;
  stmt_vec_info stmt_info;
  stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      if (svisited.contains (stmt_info))
	continue;
      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
	  && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
	/* Only the pattern root stmt computes the original scalar value.  */
	continue;
      bool mark_visited = true;
      gimple *orig_stmt = orig_stmt_info->stmt;
      ssa_op_iter op_iter;
      def_operand_p def_p;
      FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
	{
	  imm_use_iterator use_iter;
	  gimple *use_stmt;
	  stmt_vec_info use_stmt_info;
	  /* A use outside the vectorized region (no stmt_info or not
	     pure SLP) makes the def live.  */
	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
	    if (!is_gimple_debug (use_stmt))
	      {
		use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
		if (!use_stmt_info
		    || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
		  {
		    STMT_VINFO_LIVE_P (stmt_info) = true;
		    if (vectorizable_live_operation (bb_vinfo, stmt_info,
						     NULL, node, instance, i,
						     false, cost_vec))
		      /* ??? So we know we can vectorize the live stmt
			 from one SLP node.  If we cannot do so from all
			 or none consistently we'd have to record which
			 SLP node (and lane) we want to use for the live
			 operation.  So make sure we can code-generate
			 from all nodes.  */
		      mark_visited = false;
		    else
		      STMT_VINFO_LIVE_P (stmt_info) = false;
		    break;
		  }
	      }
	  /* We have to verify whether we can insert the lane extract
	     before all uses.  The following is a conservative approximation.
	     We cannot put this into vectorizable_live_operation because
	     iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
	     doesn't work.
	     Note that while the fact that we emit code for loads at the
	     first load should make this a non-problem, leafs we construct
	     from scalars are vectorized after the last scalar def.
	     ??? If we'd actually compute the insert location during
	     analysis we could use sth less conservative than the last
	     scalar stmt in the node for the dominance check.  */
	  /* ??? What remains is "live" uses in vector CTORs in the same
	     SLP graph which is where those uses can end up code-generated
	     right after their definition instead of close to their original
	     use.  But that would restrict us to code-generate lane-extracts
	     from the latest stmt in a node.  So we compensate for this
	     during code-generation, simply not replacing uses for those
	     hopefully rare cases.  */
	  if (STMT_VINFO_LIVE_P (stmt_info))
	    FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
	      if (!is_gimple_debug (use_stmt)
		  && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
		      || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
		  && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Cannot determine insertion place for "
				     "lane extract\n");
		  STMT_VINFO_LIVE_P (stmt_info) = false;
		  mark_visited = true;
		}
	}
      if (mark_visited)
	svisited.add (stmt_info);
    }

  /* Recurse to internal-def children.  */
  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
				   cost_vec, svisited, visited);
}
4702
4703 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
4704
4705 static bool
4706 vectorizable_bb_reduc_epilogue (slp_instance instance,
4707 stmt_vector_for_cost *cost_vec)
4708 {
4709 enum tree_code reduc_code
4710 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
4711 if (reduc_code == MINUS_EXPR)
4712 reduc_code = PLUS_EXPR;
4713 internal_fn reduc_fn;
4714 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
4715 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
4716 || reduc_fn == IFN_LAST
4717 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH))
4718 return false;
4719
4720 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
4721 cost log2 vector operations plus shuffles. */
4722 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
4723 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
4724 vectype, 0, vect_body);
4725 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
4726 vectype, 0, vect_body);
4727 return true;
4728 }
4729
4730 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
4731 and recurse to children. */
4732
4733 static void
4734 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
4735 hash_set<slp_tree> &visited)
4736 {
4737 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
4738 || visited.add (node))
4739 return;
4740
4741 stmt_vec_info stmt;
4742 unsigned i;
4743 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4744 roots.remove (vect_orig_stmt (stmt));
4745
4746 slp_tree child;
4747 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4748 if (child)
4749 vect_slp_prune_covered_roots (child, roots, visited);
4750 }
4751
/* Analyze statements in SLP instances of VINFO.  Failed instances are
   removed from VINFO->slp_instances.  Return true if the operations
   of at least one remaining instance are supported.  */

bool
vect_slp_analyze_operations (vec_info *vinfo)
{
  slp_instance instance;
  int i;

  DUMP_VECT_SCOPE ("vect_slp_analyze_operations");

  /* VISITED is shared between instances so analysis of a node shared
     between SLP graph entries is done only once.  */
  hash_set<slp_tree> visited;
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
    {
      auto_vec<slp_tree> visited_vec;
      stmt_vector_for_cost cost_vec;
      cost_vec.create (2);
      if (is_a <bb_vec_info> (vinfo))
	vect_location = instance->location ();
      if (!vect_slp_analyze_node_operations (vinfo,
					     SLP_INSTANCE_TREE (instance),
					     instance, visited, visited_vec,
					     &cost_vec)
	  /* CTOR instances require vectorized defs for the SLP tree root.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
	      && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
		  != vect_internal_def))
	  /* Check we can vectorize the reduction.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
	      && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
	{
	  /* Analysis failed - dump, free the instance and remove it.  */
	  slp_tree node = SLP_INSTANCE_TREE (instance);
	  stmt_vec_info stmt_info;
	  if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	    stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
	  else
	    stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "removing SLP instance operations starting from: %G",
			     stmt_info->stmt);
	  vect_free_slp_instance (instance);
	  vinfo->slp_instances.ordered_remove (i);
	  cost_vec.release ();
	  /* Un-cache the nodes this instance analyzed so another
	     instance can retry them.  */
	  while (!visited_vec.is_empty ())
	    visited.remove (visited_vec.pop ());
	}
      else
	{
	  i++;

	  /* For BB vectorization remember the SLP graph entry
	     cost for later.  */
	  if (is_a <bb_vec_info> (vinfo))
	    instance->cost_vec = cost_vec;
	  else
	    {
	      add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
	      cost_vec.release ();
	    }
	}
    }

  /* Now look for SLP instances with a root that are covered by other
     instances and remove them.  */
  hash_set<stmt_vec_info> roots;
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
      roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
  if (!roots.is_empty ())
    {
      visited.empty ();
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
	vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
				      visited);
      /* Roots pruned above are covered - remove their instances.  */
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
	if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
	    && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
	  {
	    stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "removing SLP instance operations starting "
			       "from: %G", root->stmt);
	    vect_free_slp_instance (instance);
	    vinfo->slp_instances.ordered_remove (i);
	  }
	else
	  ++i;
    }

  /* Compute vectorizable live stmts.  */
  if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
    {
      hash_set<stmt_vec_info> svisited;
      hash_set<slp_tree> visited;
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
	{
	  vect_location = instance->location ();
	  vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
				       instance, &instance->cost_vec, svisited,
				       visited);
	}
    }

  return !vinfo->slp_instances.is_empty ();
}
4859
4860 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
4861 closing the eventual chain. */
4862
4863 static slp_instance
4864 get_ultimate_leader (slp_instance instance,
4865 hash_map<slp_instance, slp_instance> &instance_leader)
4866 {
4867 auto_vec<slp_instance *, 8> chain;
4868 slp_instance *tem;
4869 while (*(tem = instance_leader.get (instance)) != instance)
4870 {
4871 chain.safe_push (tem);
4872 instance = *tem;
4873 }
4874 while (!chain.is_empty ())
4875 *chain.pop () = instance;
4876 return instance;
4877 }
4878
/* Worker of vect_bb_partition_graph, recurse on NODE.  Marks every
   scalar stmt reachable from NODE as belonging to INSTANCE in
   STMT_TO_INSTANCE and, when a stmt is already claimed by another
   instance, unions the two instances via INSTANCE_LEADER.  VISITED
   avoids walking shared sub-trees more than once.  */

static void
vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
			   slp_instance instance, slp_tree node,
			   hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
			   hash_map<slp_instance, slp_instance> &instance_leader,
			   hash_set<slp_tree> &visited)
{
  stmt_vec_info stmt_info;
  unsigned i;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      bool existed_p;
      /* Note this is a reference into the map slot; the final
	 "stmt_instance = instance" below writes through it.  */
      slp_instance &stmt_instance
	= stmt_to_instance.get_or_insert (stmt_info, &existed_p);
      if (!existed_p)
	;
      else if (stmt_instance != instance)
	{
	  /* If we're running into a previously marked stmt make us the
	     leader of the current ultimate leader.  This keeps the
	     leader chain acyclic and works even when the current instance
	     connects two previously independent graph parts.  */
	  slp_instance stmt_leader
	    = get_ultimate_leader (stmt_instance, instance_leader);
	  if (stmt_leader != instance)
	    instance_leader.put (stmt_leader, instance);
	}
      stmt_instance = instance;
    }

  /* The visited check is done only after re-claiming the scalar stmts
     above so a shared node's stmts always end up owned by the last
     instance visiting them; nodes without scalar stmts are never
     entered into VISITED and are walked again each time.  */
  if (!SLP_TREE_SCALAR_STMTS (node).is_empty () && visited.add (node))
    return;

  slp_tree child;
  /* Recurse into internal defs only; externals/constants carry no
     scalar stmts to partition.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
				 instance_leader, visited);
}
4921
4922 /* Partition the SLP graph into pieces that can be costed independently. */
4923
4924 static void
4925 vect_bb_partition_graph (bb_vec_info bb_vinfo)
4926 {
4927 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
4928
4929 /* First walk the SLP graph assigning each involved scalar stmt a
4930 corresponding SLP graph entry and upon visiting a previously
4931 marked stmt, make the stmts leader the current SLP graph entry. */
4932 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
4933 hash_map<slp_instance, slp_instance> instance_leader;
4934 hash_set<slp_tree> visited;
4935 slp_instance instance;
4936 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4937 {
4938 instance_leader.put (instance, instance);
4939 vect_bb_partition_graph_r (bb_vinfo,
4940 instance, SLP_INSTANCE_TREE (instance),
4941 stmt_to_instance, instance_leader,
4942 visited);
4943 }
4944
4945 /* Then collect entries to each independent subgraph. */
4946 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4947 {
4948 slp_instance leader = get_ultimate_leader (instance, instance_leader);
4949 leader->subgraph_entries.safe_push (instance);
4950 if (dump_enabled_p ()
4951 && leader != instance)
4952 dump_printf_loc (MSG_NOTE, vect_location,
4953 "instance %p is leader of %p\n",
4954 leader, instance);
4955 }
4956 }
4957
/* Compute the scalar cost of the SLP node NODE and its children
   and record it in COST_VEC.  Do not account defs that are marked
   in LIFE and update LIFE according to uses of NODE.  Uses the
   gimple visited flag to cost each scalar stmt at most once across
   the whole subgraph walk; the caller is responsible for clearing
   those flags afterwards.  */

static void
vect_bb_slp_scalar_cost (vec_info *vinfo,
			 slp_tree node, vec<bool, va_heap> *life,
			 stmt_vector_for_cost *cost_vec,
			 hash_set<slp_tree> &visited)
{
  unsigned i;
  stmt_vec_info stmt_info;
  slp_tree child;

  /* Shared sub-trees are costed only once.  */
  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      ssa_op_iter op_iter;
      def_operand_p def_p;

      /* Lanes already known to stay live scalar-wise are not accounted.  */
      if ((*life)[i])
	continue;

      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      gimple *orig_stmt = orig_stmt_info->stmt;

      /* If there is a non-vectorized use of the defs then the scalar
         stmt is kept live in which case we do not account it or any
         required defs in the SLP children in the scalar cost.  This
         way we make the vectorization more costly when compared to
         the scalar cost.  */
      if (!STMT_VINFO_LIVE_P (stmt_info))
	{
	  FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
	    {
	      imm_use_iterator use_iter;
	      gimple *use_stmt;
	      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
		if (!is_gimple_debug (use_stmt))
		  {
		    stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
		    if (!use_stmt_info
			|| !PURE_SLP_STMT
			      (vect_stmt_to_vectorize (use_stmt_info)))
		      {
			(*life)[i] = true;
			break;
		      }
		  }
	    }
	  if ((*life)[i])
	    continue;
	}

      /* Count scalar stmts only once.  */
      if (gimple_visited_p (orig_stmt))
	continue;
      gimple_set_visited (orig_stmt, true);

      vect_cost_for_stmt kind;
      if (STMT_VINFO_DATA_REF (orig_stmt_info))
	{
	  if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
	    kind = scalar_load;
	  else
	    kind = scalar_store;
	}
      /* No-op conversions are free on both the scalar and vector side.  */
      else if (vect_nop_conversion_p (orig_stmt_info))
	continue;
      /* For single-argument PHIs assume coalescing which means zero cost
	 for the scalar and the vector PHIs.  This avoids artificially
	 favoring the vector path (but may pessimize it in some cases).  */
      else if (is_a <gphi *> (orig_stmt_info->stmt)
	       && gimple_phi_num_args
		    (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
	continue;
      else
	kind = scalar_stmt;
      record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
			SLP_TREE_VECTYPE (node), 0, vect_body);
    }

  auto_vec<bool, 20> subtree_life;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	{
	  /* Do not directly pass LIFE to the recursive call, copy it to
	     confine changes in the callee to the current child/subtree.  */
	  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
	    {
	      /* Map the liveness of our lanes through the permutation
		 onto the child's lanes, for the lanes sourced from
		 child I.  */
	      subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
	      for (unsigned j = 0;
		   j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
		{
		  auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
		  if (perm.first == i)
		    subtree_life[perm.second] = (*life)[j];
		}
	    }
	  else
	    {
	      gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
	      subtree_life.safe_splice (*life);
	    }
	  vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
				   visited);
	  subtree_life.truncate (0);
	}
    }
}
5071
5072 /* Comparator for the loop-index sorted cost vectors. */
5073
5074 static int
5075 li_cost_vec_cmp (const void *a_, const void *b_)
5076 {
5077 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
5078 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
5079 if (a->first < b->first)
5080 return -1;
5081 else if (a->first == b->first)
5082 return 0;
5083 return 1;
5084 }
5085
/* Check if vectorization of the basic block is profitable for the
   subgraph denoted by SLP_INSTANCES.  Returns true if profitable;
   on the way consumes and releases each instance's cost_vec.  */

static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
				    vec<slp_instance> slp_instances)
{
  slp_instance instance;
  int i;
  unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
  unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
      hash_set<slp_tree> visited;
      FOR_EACH_VEC_ELT (slp_instances, i, instance)
	vect_print_slp_graph (MSG_NOTE, vect_location,
			      SLP_INSTANCE_TREE (instance), visited);
    }

  /* Calculate scalar cost and sum the cost for the vector stmts
     previously collected.  */
  stmt_vector_for_cost scalar_costs = vNULL;
  stmt_vector_for_cost vector_costs = vNULL;
  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* LIFE starts out all-false: no lane is known live scalar-wise.  */
      auto_vec<bool, 20> life;
      life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
			      true);
      /* Root stmts (e.g. a CTOR) are scalar stmts not covered by the
	 SLP tree walk; account them here.  */
      if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	record_stmt_cost (&scalar_costs,
			  SLP_INSTANCE_ROOT_STMTS (instance).length (),
			  scalar_stmt,
			  SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
      vect_bb_slp_scalar_cost (bb_vinfo,
			       SLP_INSTANCE_TREE (instance),
			       &life, &scalar_costs, visited);
      vector_costs.safe_splice (instance->cost_vec);
      instance->cost_vec.release ();
    }
  /* Unset visited flag.  */
  stmt_info_for_cost *cost;
  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    gimple_set_visited (cost->stmt_info->stmt, false);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");

  /* When costing non-loop vectorization we need to consider each covered
     loop independently and make sure vectorization is profitable.  For
     now we assume a loop may be not entered or executed an arbitrary
     number of iterations (???  static information can provide more
     precise info here) which means we can simply cost each containing
     loops stmts separately.  */

  /* First produce cost vectors sorted by loop index.  */
  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    li_scalar_costs (scalar_costs.length ());
  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    li_vector_costs (vector_costs.length ());
  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    {
      unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_scalar_costs.quick_push (std::make_pair (l, cost));
    }
  /* Use a random used loop as fallback in case the first vector_costs
     entry does not have a stmt_info associated with it.  */
  unsigned l = li_scalar_costs[0].first;
  FOR_EACH_VEC_ELT (vector_costs, i, cost)
    {
      /* We inherit from the previous COST, invariants, externals and
	 extracts immediately follow the cost for the related stmt.  */
      if (cost->stmt_info)
	l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_vector_costs.quick_push (std::make_pair (l, cost));
    }
  /* li_cost_vec_cmp treats equal loop indices as equal, so entries are
     only grouped per loop, not totally ordered within a group.  */
  li_scalar_costs.qsort (li_cost_vec_cmp);
  li_vector_costs.qsort (li_cost_vec_cmp);

  /* Now cost the portions individually.  */
  unsigned vi = 0;
  unsigned si = 0;
  while (si < li_scalar_costs.length ()
	 && vi < li_vector_costs.length ())
    {
      unsigned sl = li_scalar_costs[si].first;
      unsigned vl = li_vector_costs[vi].first;
      if (sl != vl)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Scalar %d and vector %d loop part do not "
			     "match up, skipping scalar part\n", sl, vl);
	  /* Skip the scalar part, assuming zero cost on the vector side.  */
	  do
	    {
	      si++;
	    }
	  while (si < li_scalar_costs.length ()
		 && li_scalar_costs[si].first == sl);
	  continue;
	}

      /* Feed this loop's scalar stmts into a fresh target cost model.  */
      void *scalar_target_cost_data = init_cost (NULL, true);
      do
	{
	  add_stmt_cost (bb_vinfo, scalar_target_cost_data,
			 li_scalar_costs[si].second);
	  si++;
	}
      while (si < li_scalar_costs.length ()
	     && li_scalar_costs[si].first == sl);
      unsigned dummy;
      finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
      destroy_cost_data (scalar_target_cost_data);

      /* Complete the target-specific vector cost calculation.  */
      void *vect_target_cost_data = init_cost (NULL, false);
      do
	{
	  add_stmt_cost (bb_vinfo, vect_target_cost_data,
			 li_vector_costs[vi].second);
	  vi++;
	}
      while (vi < li_vector_costs.length ()
	     && li_vector_costs[vi].first == vl);
      finish_cost (vect_target_cost_data, &vec_prologue_cost,
		   &vec_inside_cost, &vec_epilogue_cost);
      destroy_cost_data (vect_target_cost_data);

      vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Cost model analysis for part in loop %d:\n", sl);
	  dump_printf (MSG_NOTE, "  Vector cost: %d\n",
		       vec_inside_cost + vec_outside_cost);
	  dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
	}

      /* Vectorization is profitable if its cost is more than the cost of scalar
	 version.  Note that we err on the vector side for equal cost because
	 the cost estimate is otherwise quite pessimistic (constant uses are
	 free on the scalar side but cost a load on the vector side for
	 example).  */
      if (vec_outside_cost + vec_inside_cost > scalar_cost)
	{
	  scalar_costs.release ();
	  vector_costs.release ();
	  return false;
	}
    }
  /* Leftover vector cost without matching scalar cost means the vector
     side does extra work in some loop; treat as unprofitable.  */
  if (vi < li_vector_costs.length ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Excess vector cost for part in loop %d:\n",
			 li_vector_costs[vi].first);
      scalar_costs.release ();
      vector_costs.release ();
      return false;
    }

  scalar_costs.release ();
  vector_costs.release ();
  return true;
}
5256
5257 /* qsort comparator for lane defs. */
5258
5259 static int
5260 vld_cmp (const void *a_, const void *b_)
5261 {
5262 auto *a = (const std::pair<unsigned, tree> *)a_;
5263 auto *b = (const std::pair<unsigned, tree> *)b_;
5264 return a->first - b->first;
5265 }
5266
5267 /* Return true if USE_STMT is a vector lane insert into VEC and set
5268 *THIS_LANE to the lane number that is set. */
5269
5270 static bool
5271 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
5272 {
5273 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
5274 if (!use_ass
5275 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
5276 || (vec
5277 ? gimple_assign_rhs1 (use_ass) != vec
5278 : ((vec = gimple_assign_rhs1 (use_ass)), false))
5279 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
5280 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
5281 || !constant_multiple_p
5282 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
5283 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
5284 this_lane))
5285 return false;
5286 return true;
5287 }
5288
/* Find any vectorizable constructors and add them to the grouped_store
   array.  Besides vector CONSTRUCTORs this also matches chains of
   BIT_INSERT_EXPRs building a full vector lane-by-lane and association
   chains forming a basic-block reduction; those are pushed onto
   bb_vinfo->roots instead.  */

static void
vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
{
  for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
	 !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
	if (!assign)
	  continue;

	tree rhs = gimple_assign_rhs1 (assign);
	enum tree_code code = gimple_assign_rhs_code (assign);
	use_operand_p use_p;
	gimple *use_stmt;
	if (code == CONSTRUCTOR)
	  {
	    /* Only complete, non-uniform vector CONSTRUCTORs of scalar
	       elements are interesting.  */
	    if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
		|| maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
			     CONSTRUCTOR_NELTS (rhs))
		|| VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
		|| uniform_vector_p (rhs))
	      continue;

	    /* All elements have to be SSA names defined in the region.  */
	    unsigned j;
	    tree val;
	    FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
	      if (TREE_CODE (val) != SSA_NAME
		  || !bb_vinfo->lookup_def (val))
		break;
	    if (j != CONSTRUCTOR_NELTS (rhs))
	      continue;

	    stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
	    BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
	  }
	else if (code == BIT_INSERT_EXPR
		 && VECTOR_TYPE_P (TREE_TYPE (rhs))
		 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
		 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
		 && integer_zerop (gimple_assign_rhs3 (assign))
		 && useless_type_conversion_p
		      (TREE_TYPE (TREE_TYPE (rhs)),
		       TREE_TYPE (gimple_assign_rhs2 (assign)))
		 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
	  {
	    /* We start to match on insert to lane zero but since the
	       inserts need not be ordered we'd have to search both
	       the def and the use chains.  */
	    tree vectype = TREE_TYPE (rhs);
	    unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
	    auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
	    /* LANES tracks which lane numbers were already seen.  */
	    auto_sbitmap lanes (nlanes);
	    bitmap_clear (lanes);
	    bitmap_set_bit (lanes, 0);
	    tree def = gimple_assign_lhs (assign);
	    lane_defs.quick_push
		      (std::make_pair (0, gimple_assign_rhs2 (assign)));
	    unsigned lanes_found = 1;
	    /* Start with the use chains, the last stmt will be the root.  */
	    stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
	    vec<stmt_vec_info> roots = vNULL;
	    roots.safe_push (last);
	    do
	      {
		use_operand_p use_p;
		gimple *use_stmt;
		/* Follow the single-use chain of lane inserts forward.  */
		if (!single_imm_use (def, &use_p, &use_stmt))
		  break;
		unsigned this_lane;
		if (!bb_vinfo->lookup_stmt (use_stmt)
		    || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
		    || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
		  break;
		/* A duplicate lane insert ends the match.  */
		if (bitmap_bit_p (lanes, this_lane))
		  break;
		lanes_found++;
		bitmap_set_bit (lanes, this_lane);
		gassign *use_ass = as_a <gassign *> (use_stmt);
		lane_defs.quick_push (std::make_pair
				       (this_lane, gimple_assign_rhs2 (use_ass)));
		last = bb_vinfo->lookup_stmt (use_ass);
		roots.safe_push (last);
		def = gimple_assign_lhs (use_ass);
	      }
	    while (lanes_found < nlanes);
	    /* roots[0] is expected to be the chain-ending root stmt.  */
	    if (roots.length () > 1)
	      std::swap(roots[0], roots[roots.length () - 1]);
	    if (lanes_found < nlanes)
	      {
		/* Now search the def chain.  */
		def = gimple_assign_rhs1 (assign);
		do
		  {
		    if (TREE_CODE (def) != SSA_NAME
			|| !has_single_use (def))
		      break;
		    gimple *def_stmt = SSA_NAME_DEF_STMT (def);
		    unsigned this_lane;
		    if (!bb_vinfo->lookup_stmt (def_stmt)
			|| !vect_slp_is_lane_insert (def_stmt,
						     NULL_TREE, &this_lane)
			|| !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
		      break;
		    if (bitmap_bit_p (lanes, this_lane))
		      break;
		    lanes_found++;
		    bitmap_set_bit (lanes, this_lane);
		    lane_defs.quick_push (std::make_pair
					    (this_lane,
					     gimple_assign_rhs2 (def_stmt)));
		    roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
		    def = gimple_assign_rhs1 (def_stmt);
		  }
		while (lanes_found < nlanes);
	      }
	    if (lanes_found == nlanes)
	      {
		/* Sort lane_defs after the lane index and register the root.  */
		lane_defs.qsort (vld_cmp);
		vec<stmt_vec_info> stmts;
		stmts.create (nlanes);
		for (unsigned i = 0; i < nlanes; ++i)
		  stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
		bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
						     stmts, roots));
	      }
	    else
	      roots.release ();
	  }
	else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
		 && (associative_tree_code (code) || code == MINUS_EXPR)
		 /* ???  The flag_associative_math and TYPE_OVERFLOW_WRAPS
		    checks pessimize a two-element reduction.  PR54400.
		    ???  In-order reduction could be handled if we only
		    traverse one operand chain in vect_slp_linearize_chain.  */
		 && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
		     || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
			 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
		 /* Ops with constants at the tail can be stripped here.  */
		 && TREE_CODE (rhs) == SSA_NAME
		 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
		 /* Should be the chain end.  */
		 && (!single_imm_use (gimple_assign_lhs (assign),
				      &use_p, &use_stmt)
		     || !is_gimple_assign (use_stmt)
		     || (gimple_assign_rhs_code (use_stmt) != code
			 && ((code != PLUS_EXPR && code != MINUS_EXPR)
			     || (gimple_assign_rhs_code (use_stmt)
				 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
	  {
	    /* We start the match at the end of a possible association
	       chain.  */
	    auto_vec<chain_op_t> chain;
	    auto_vec<std::pair<tree_code, gimple *> > worklist;
	    auto_vec<gimple *> chain_stmts;
	    gimple *code_stmt = NULL, *alt_code_stmt = NULL;
	    /* Canonicalize MINUS chains to PLUS for the reduction check.  */
	    if (code == MINUS_EXPR)
	      code = PLUS_EXPR;
	    internal_fn reduc_fn;
	    if (!reduction_fn_for_scalar_code (code, &reduc_fn)
		|| reduc_fn == IFN_LAST)
	      continue;
	    vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
				      /* ??? */
				      code_stmt, alt_code_stmt, &chain_stmts);
	    if (chain.length () > 1)
	      {
		/* Sort the chain according to def_type and operation.  */
		chain.sort (dt_sort_cmp, bb_vinfo);
		/* ???  Now we'd want to strip externals and constants
		   but record those to be handled in the epilogue.  */
		/* ???  For now do not allow mixing ops or externs/constants.  */
		bool invalid = false;
		for (unsigned i = 0; i < chain.length (); ++i)
		  if (chain[i].dt != vect_internal_def
		      || chain[i].code != code)
		    invalid = true;
		if (!invalid)
		  {
		    vec<stmt_vec_info> stmts;
		    stmts.create (chain.length ());
		    for (unsigned i = 0; i < chain.length (); ++i)
		      stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
		    vec<stmt_vec_info> roots;
		    roots.create (chain_stmts.length ());
		    for (unsigned i = 0; i < chain_stmts.length (); ++i)
		      roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
		    bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
							 stmts, roots));
		  }
	      }
	  }
      }
}
5487
/* Walk the grouped store chains and replace entries with their
   pattern variant if any.  Keeps the DR_GROUP_* linkage (first
   element, size, gap, next pointers) intact across the swap.  */

static void
vect_fixup_store_groups_with_patterns (vec_info *vinfo)
{
  stmt_vec_info first_element;
  unsigned i;

  FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    {
      /* We also have CTORs in this array.  */
      if (!STMT_VINFO_GROUPED_ACCESS (first_element))
	continue;
      /* If the group leader was replaced by a pattern stmt, transfer
	 the group bookkeeping to the pattern stmt and make it the new
	 array entry.  */
      if (STMT_VINFO_IN_PATTERN_P (first_element))
	{
	  stmt_vec_info orig = first_element;
	  first_element = STMT_VINFO_RELATED_STMT (first_element);
	  DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
	  DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
	  DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
	  DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
	  vinfo->grouped_stores[i] = first_element;
	}
      /* Walk the rest of the chain, splicing in pattern stmts and
	 re-pointing every element's FIRST_ELEMENT at the (possibly
	 new) leader.  */
      stmt_vec_info prev = first_element;
      while (DR_GROUP_NEXT_ELEMENT (prev))
	{
	  stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
	  if (STMT_VINFO_IN_PATTERN_P (elt))
	    {
	      stmt_vec_info orig = elt;
	      elt = STMT_VINFO_RELATED_STMT (elt);
	      DR_GROUP_NEXT_ELEMENT (prev) = elt;
	      DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
	      DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
	    }
	  DR_GROUP_FIRST_ELEMENT (elt) = first_element;
	  prev = elt;
	}
    }
}
5529
/* Check if the region described by BB_VINFO can be vectorized, returning
   true if so.  When returning false, set FATAL to true if the same failure
   would prevent vectorization at other vector sizes, false if it is still
   worth trying other sizes.  N_STMTS is the number of statements in the
   region.  */

static bool
vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
		       vec<int> *dataref_groups)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_bb");

  slp_instance instance;
  int i;
  poly_uint64 min_vf = 2;

  /* The first group of checks is independent of the vector size.  */
  fatal = true;

  /* Analyze the data references.  */

  if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: unhandled data-ref in basic "
			 "block.\n");
      return false;
    }

  if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: unhandled data access in "
			 "basic block.\n");
      return false;
    }

  /* Detect CTORs, lane-insert chains and BB reductions as extra
     vectorization opportunities/roots.  */
  vect_slp_check_for_constructors (bb_vinfo);

  /* If there are no grouped stores and no constructors in the region
     there is no need to continue with pattern recog as vect_analyze_slp
     will fail anyway.  */
  if (bb_vinfo->grouped_stores.is_empty ()
      && bb_vinfo->roots.is_empty ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: no grouped stores in "
			 "basic block.\n");
      return false;
    }

  /* While the rest of the analysis below depends on it in some way.  */
  fatal = false;

  vect_pattern_recog (bb_vinfo);

  /* Update store groups from pattern processing.  */
  vect_fixup_store_groups_with_patterns (bb_vinfo);

  /* Check the SLP opportunities in the basic block, analyze and build SLP
     trees.  */
  if (!vect_analyze_slp (bb_vinfo, n_stmts))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Failed to SLP the basic block.\n");
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "not vectorized: failed to find SLP opportunities "
			   "in basic block.\n");
	}
      return false;
    }

  /* Optimize permutations.  */
  vect_optimize_slp (bb_vinfo);

  /* Gather the loads reachable from the SLP graph entries.  */
  vect_gather_slp_loads (bb_vinfo);

  vect_record_base_alignments (bb_vinfo);

  /* Analyze and verify the alignment of data references and the
     dependence in the SLP instances.  */
  for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
    {
      vect_location = instance->location ();
      if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
	  || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
	{
	  slp_tree node = SLP_INSTANCE_TREE (instance);
	  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "removing SLP instance operations starting from: %G",
			     stmt_info->stmt);
	  vect_free_slp_instance (instance);
	  /* Note I is deliberately not incremented here; the next
	     instance moved into slot I.  */
	  BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
	  continue;
	}

      /* Mark all the statements that we want to vectorize as pure SLP and
	 relevant.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
      vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
      unsigned j;
      stmt_vec_info root;
      /* Likewise consider instance root stmts as vectorized.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
	STMT_SLP_TYPE (root) = pure_slp;

      i++;
    }
  if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
    return false;

  if (!vect_slp_analyze_operations (bb_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: bad operation in basic block.\n");
      return false;
    }

  /* Split the surviving SLP graph into independently costable pieces.  */
  vect_bb_partition_graph (bb_vinfo);

  return true;
}
5661
/* Subroutine of vect_slp_bb.  Try to vectorize the statements for all
   basic blocks in BBS, returning true on success.
   The region has N_STMTS statements and has the datarefs given by DATAREFS.
   Iterates over the target's advertised vector modes, re-running the
   whole analysis per mode until one vectorizes or all are exhausted.  */

static bool
vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
		 vec<int> *dataref_groups, unsigned int n_stmts)
{
  bb_vec_info bb_vinfo;
  auto_vector_modes vector_modes;

  /* Autodetect first vector size we try.  */
  machine_mode next_vector_mode = VOIDmode;
  targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
  unsigned int mode_i = 0;

  vec_info_shared shared;

  machine_mode autodetected_vector_mode = VOIDmode;
  while (1)
    {
      bool vectorized = false;
      bool fatal = false;
      bb_vinfo = new _bb_vec_info (bbs, &shared);

      /* Datarefs are saved on the first iteration and verified
	 unchanged on re-analysis with another mode.  */
      bool first_time_p = shared.datarefs.is_empty ();
      BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
      if (first_time_p)
	bb_vinfo->shared->save_datarefs ();
      else
	bb_vinfo->shared->check_datarefs ();
      bb_vinfo->vector_mode = next_vector_mode;

      if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "***** Analysis succeeded with vector mode"
			       " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
	      dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
	    }

	  bb_vinfo->shared->check_datarefs ();

	  unsigned i;
	  slp_instance instance;
	  /* Cost and transform each independent subgraph separately;
	     only subgraph leaders have non-empty subgraph_entries.  */
	  FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
	    {
	      if (instance->subgraph_entries.is_empty ())
		continue;

	      vect_location = instance->location ();
	      if (!unlimited_cost_model (NULL)
		  && !vect_bb_vectorization_profitable_p
			(bb_vinfo, instance->subgraph_entries))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "not vectorized: vectorization is not "
				     "profitable.\n");
		  continue;
		}

	      if (!dbg_cnt (vect_slp))
		continue;

	      if (!vectorized && dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Basic block will be vectorized "
				 "using SLP\n");
	      vectorized = true;

	      vect_schedule_slp (bb_vinfo, instance->subgraph_entries);

	      unsigned HOST_WIDE_INT bytes;
	      if (dump_enabled_p ())
		{
		  if (GET_MODE_SIZE
			(bb_vinfo->vector_mode).is_constant (&bytes))
		    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
				     "basic block part vectorized using %wu "
				     "byte vectors\n", bytes);
		  else
		    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
				     "basic block part vectorized using "
				     "variable length vectors\n");
		}
	    }
	}
      else
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "***** Analysis failed with vector mode %s\n",
			     GET_MODE_NAME (bb_vinfo->vector_mode));
	}

      if (mode_i == 0)
	autodetected_vector_mode = bb_vinfo->vector_mode;

      /* Skip modes that would make exactly the same decisions as the
	 mode just analyzed.  */
      if (!fatal)
	while (mode_i < vector_modes.length ()
	       && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "***** The result for vector mode %s would"
			       " be the same\n",
			       GET_MODE_NAME (vector_modes[mode_i]));
	    mode_i += 1;
	  }

      delete bb_vinfo;

      /* Also skip a mode that is just a different view of the
	 autodetected mode (same size/element relationship both ways).  */
      if (mode_i < vector_modes.length ()
	  && VECTOR_MODE_P (autodetected_vector_mode)
	  && (related_vector_mode (vector_modes[mode_i],
				   GET_MODE_INNER (autodetected_vector_mode))
	      == autodetected_vector_mode)
	  && (related_vector_mode (autodetected_vector_mode,
				   GET_MODE_INNER (vector_modes[mode_i]))
	      == vector_modes[mode_i]))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "***** Skipping vector mode %s, which would"
			     " repeat the analysis for %s\n",
			     GET_MODE_NAME (vector_modes[mode_i]),
			     GET_MODE_NAME (autodetected_vector_mode));
	  mode_i += 1;
	}

      if (vectorized
	  || mode_i == vector_modes.length ()
	  || autodetected_vector_mode == VOIDmode
	  /* If vect_slp_analyze_bb_1 signaled that analysis for all
	     vector sizes will fail do not bother iterating.  */
	  || fatal)
	return vectorized;

      /* Try the next biggest vector size.  */
      next_vector_mode = vector_modes[mode_i++];
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Re-trying analysis with vector mode %s\n",
			 GET_MODE_NAME (next_vector_mode));
    }
}
5811
5812
5813 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
5814 true if anything in the basic-block was vectorized. */
5815
5816 static bool
5817 vect_slp_bbs (vec<basic_block> bbs)
5818 {
5819 vec<data_reference_p> datarefs = vNULL;
5820 auto_vec<int> dataref_groups;
5821 int insns = 0;
5822 int current_group = 0;
5823
5824 for (unsigned i = 0; i < bbs.length (); i++)
5825 {
5826 basic_block bb = bbs[i];
5827 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
5828 gsi_next (&gsi))
5829 {
5830 gimple *stmt = gsi_stmt (gsi);
5831 if (is_gimple_debug (stmt))
5832 continue;
5833
5834 insns++;
5835
5836 if (gimple_location (stmt) != UNKNOWN_LOCATION)
5837 vect_location = stmt;
5838
5839 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
5840 &dataref_groups, current_group))
5841 ++current_group;
5842 }
5843 }
5844
5845 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
5846 }
5847
5848 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5849 true if anything in the basic-block was vectorized. */
5850
5851 bool
5852 vect_slp_bb (basic_block bb)
5853 {
5854 auto_vec<basic_block> bbs;
5855 bbs.safe_push (bb);
5856 return vect_slp_bbs (bbs);
5857 }
5858
/* Main entry for the BB vectorizer.  Analyze and transform the blocks
   of FUN, returns true if anything in a basic-block was vectorized.  */

bool
vect_slp_function (function *fun)
{
  bool r = false;
  int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
  unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);

  /* For the moment split the function into pieces to avoid making
     the iteration on the vector mode moot.  Split at points we know
     to not handle well which is CFG merges (SLP discovery doesn't
     handle non-loop-header PHIs) and loop exits.  Since pattern
     recog requires reverse iteration to visit uses before defs
     simply chop RPO into pieces.  */
  auto_vec<basic_block> bbs;
  for (unsigned i = 0; i < n; i++)
    {
      basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
      bool split = false;

      /* Split when a BB is not dominated by the first block.  */
      if (!bbs.is_empty ()
	  && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "splitting region at dominance boundary bb%d\n",
			     bb->index);
	  split = true;
	}
      /* Split when the loop determined by the first block
	 is exited.  This is because we eventually insert
	 invariants at region begin.  */
      else if (!bbs.is_empty ()
	       && bbs[0]->loop_father != bb->loop_father
	       && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "splitting region at loop %d exit at bb%d\n",
			     bbs[0]->loop_father->num, bb->index);
	  split = true;
	}

      if (split && !bbs.is_empty ())
	{
	  /* Vectorize the accumulated region and start a fresh one
	     with BB.  quick_push is OK since truncate retains the
	     capacity from the earlier safe_push calls.  */
	  r |= vect_slp_bbs (bbs);
	  bbs.truncate (0);
	  bbs.quick_push (bb);
	}
      else
	bbs.safe_push (bb);

      /* When we have a stmt ending this block and defining a
	 value we have to insert on edges when inserting after it for
	 a vector containing its definition.  Avoid this for now.  */
      if (gimple *last = last_stmt (bb))
	if (gimple_get_lhs (last)
	    && is_ctrl_altering_stmt (last))
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "splitting region at control altering "
			       "definition %G", last);
	    r |= vect_slp_bbs (bbs);
	    bbs.truncate (0);
	  }
    }

  /* Vectorize whatever region remains after the last split.  */
  if (!bbs.is_empty ())
    r |= vect_slp_bbs (bbs);

  free (rpo);

  return r;
}
5937
/* Build a variable-length vector in which the elements in ELTS are repeated
   to a fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   RESULTS and add any new instructions to SEQ.

   The approach we use is:

   (1) Find a vector mode VM with integer elements of mode IM.

   (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
       ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
       from small vectors to IM.

   (3) Duplicate each ELTS'[I] into a vector of mode VM.

   (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
       correct byte contents.

   (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.

   We try to find the largest IM for which this sequence works, in order
   to cut down on the number of interleaves.  */

void
duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
			  vec<tree> elts, unsigned int nresults,
			  vec<tree> &results)
{
  unsigned int nelts = elts.length ();
  tree element_type = TREE_TYPE (vector_type);

  /* (1) Find a vector mode VM with integer elements of mode IM.  */
  unsigned int nvectors = 1;
  tree new_vector_type;
  tree permutes[2];
  /* The caller is expected to have checked this can succeed.  */
  if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
				       &nvectors, &new_vector_type,
				       permutes))
    gcc_unreachable ();

  /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
  unsigned int partial_nelts = nelts / nvectors;
  tree partial_vector_type = build_vector_type (element_type, partial_nelts);

  /* PIECES is a double buffer: the first NVECTORS slots hold the current
     inputs and the second NVECTORS slots receive outputs; IN_START and
     OUT_START below swap between the two halves.  */
  tree_vector_builder partial_elts;
  auto_vec<tree, 32> pieces (nvectors * 2);
  pieces.quick_grow_cleared (nvectors * 2);
  for (unsigned int i = 0; i < nvectors; ++i)
    {
      /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
	     ELTS' has mode IM.  */
      partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
      for (unsigned int j = 0; j < partial_nelts; ++j)
	partial_elts.quick_push (elts[i * partial_nelts + j]);
      tree t = gimple_build_vector (seq, &partial_elts);
      t = gimple_build (seq, VIEW_CONVERT_EXPR,
			TREE_TYPE (new_vector_type), t);

      /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
      pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
    }

  /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
	 correct byte contents.

     Conceptually, we need to repeat the following operation log2(nvectors)
     times, where hi_start = nvectors / 2:

	out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
	out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);

     However, if each input repeats every N elements and the VF is
     a multiple of N * 2, the HI result is the same as the LO result.
     This will be true for the first N1 iterations of the outer loop,
     followed by N2 iterations for which both the LO and HI results
     are needed.  I.e.:

	N1 + N2 = log2(nvectors)

     Each "N1 iteration" doubles the number of redundant vectors and the
     effect of the process as a whole is to have a sequence of nvectors/2**N1
     vectors that repeats 2**N1 times.  Rather than generate these redundant
     vectors, we halve the number of vectors for each N1 iteration.  */
  unsigned int in_start = 0;
  unsigned int out_start = nvectors;
  unsigned int new_nvectors = nvectors;
  for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
    {
      unsigned int hi_start = new_nvectors / 2;
      unsigned int out_i = 0;
      for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
	{
	  /* Skip creating the HI result when it would be identical to
	     the LO result (the redundancy described above).  */
	  if ((in_i & 1) != 0
	      && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
			     2 * in_repeat))
	    continue;

	  tree output = make_ssa_name (new_vector_type);
	  tree input1 = pieces[in_start + (in_i / 2)];
	  tree input2 = pieces[in_start + (in_i / 2) + hi_start];
	  gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
					       input1, input2,
					       permutes[in_i & 1]);
	  gimple_seq_add_stmt (seq, stmt);
	  pieces[out_start + out_i] = output;
	  out_i += 1;
	}
      /* Flip the double buffer for the next round.  */
      std::swap (in_start, out_start);
      new_nvectors = out_i;
    }

  /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.
     When fewer than NRESULTS distinct vectors were produced the results
     repeat, so reuse earlier entries.  */
  results.reserve (nresults);
  for (unsigned int i = 0; i < nresults; ++i)
    if (i < new_nvectors)
      results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
					pieces[in_start + i]));
    else
      results.quick_push (results[i - new_nvectors]);
}
6057
6058
/* For constant and loop invariant defs in OP_NODE this function creates
   vector defs that will be used in the vectorized stmts and stores them
   to SLP_TREE_VEC_DEFS of OP_NODE.  */

static void
vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
{
  unsigned HOST_WIDE_INT nunits;
  tree vec_cst;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type;
  tree vop;
  int group_size = op_node->ops.length ();
  unsigned int vec_num, i;
  unsigned number_of_copies = 1;
  bool constant_p;
  /* Sequence of stmts needed to build the constant/invariant vectors;
     flushed to an insert location each time a vector is completed.  */
  gimple_seq ctor_seq = NULL;
  auto_vec<tree, 16> permute_results;

  /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
  vector_type = SLP_TREE_VECTYPE (op_node);

  unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
  SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
  auto_vec<tree> voprnds (number_of_vectors);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  /* When using duplicate_and_interleave, we just need one element for
     each scalar statement.  */
  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_copies = nunits * number_of_vectors / group_size;

  number_of_places_left_in_vector = nunits;
  constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  stmt_vec_info insert_after = NULL;
  for (j = 0; j < number_of_copies; j++)
    {
      tree op;
      /* Iterate the scalar operands in reverse so that ELTS is filled
	 from the highest lane down; the finished vectors are reversed
	 again below.  */
      for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
        {
          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          number_of_places_left_in_vector--;
	  tree orig_op = op;
	  /* Convert OP to the vector element type if it differs.  */
	  if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
	    {
	      if (CONSTANT_CLASS_P (op))
		{
		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
		    {
		      /* Can't use VIEW_CONVERT_EXPR for booleans because
			 of possibly different sizes of scalar value and
			 vector element.  */
		      if (integer_zerop (op))
			op = build_int_cst (TREE_TYPE (vector_type), 0);
		      else if (integer_onep (op))
			op = build_all_ones_cst (TREE_TYPE (vector_type));
		      else
			gcc_unreachable ();
		    }
		  else
		    op = fold_unary (VIEW_CONVERT_EXPR,
				     TREE_TYPE (vector_type), op);
		  gcc_assert (op && CONSTANT_CLASS_P (op));
		}
	      else
		{
		  /* Non-constant operand: emit an explicit conversion
		     stmt into CTOR_SEQ.  */
		  tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
		  gimple *init_stmt;
		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
		    {
		      tree true_val
			= build_all_ones_cst (TREE_TYPE (vector_type));
		      tree false_val
			= build_zero_cst (TREE_TYPE (vector_type));
		      gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
		      init_stmt = gimple_build_assign (new_temp, COND_EXPR,
						       op, true_val,
						       false_val);
		    }
		  else
		    {
		      op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
				   op);
		      init_stmt
			= gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
					       op);
		    }
		  gimple_seq_add_stmt (&ctor_seq, init_stmt);
		  op = new_temp;
		}
	    }
	  elts[number_of_places_left_in_vector] = op;
	  if (!CONSTANT_CLASS_P (op))
	    constant_p = false;
	  /* For BB vectorization we have to compute an insert location
	     when a def is inside the analyzed region since we cannot
	     simply insert at the BB start in this case.  */
	  stmt_vec_info opdef;
	  if (TREE_CODE (orig_op) == SSA_NAME
	      && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
	      && is_a <bb_vec_info> (vinfo)
	      && (opdef = vinfo->lookup_def (orig_op)))
	    {
	      if (!insert_after)
		insert_after = opdef;
	      else
		insert_after = get_later_stmt (insert_after, opdef);
	    }

	  /* A full vector has been assembled; materialize it.  */
	  if (number_of_places_left_in_vector == 0)
	    {
	      if (constant_p
		  ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
		  : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
		vec_cst = gimple_build_vector (&ctor_seq, &elts);
	      else
		{
		  /* Variable-length vectors: build all results once and
		     pick them off in subsequent iterations.  */
		  if (permute_results.is_empty ())
		    duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
					      elts, number_of_vectors,
					      permute_results);
		  vec_cst = permute_results[number_of_vectors - j - 1];
		}
	      if (!gimple_seq_empty_p (ctor_seq))
		{
		  if (insert_after)
		    {
		      gimple_stmt_iterator gsi;
		      if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
			{
			  /* PHIs cannot be inserted after; use the start
			     of their block instead.  */
			  gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
			  gsi_insert_seq_before (&gsi, ctor_seq,
						 GSI_CONTINUE_LINKING);
			}
		      else if (!stmt_ends_bb_p (insert_after->stmt))
			{
			  gsi = gsi_for_stmt (insert_after->stmt);
			  gsi_insert_seq_after (&gsi, ctor_seq,
						GSI_CONTINUE_LINKING);
			}
		      else
			{
			  /* When we want to insert after a def where the
			     defining stmt throws then insert on the fallthru
			     edge.  */
			  edge e = find_fallthru_edge
				     (gimple_bb (insert_after->stmt)->succs);
			  basic_block new_bb
			    = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
			  gcc_assert (!new_bb);
			}
		    }
		  else
		    vinfo->insert_seq_on_entry (NULL, ctor_seq);
		  ctor_seq = NULL;
		}
	      voprnds.quick_push (vec_cst);
	      insert_after = NULL;
	      /* Reset state for the next vector.  */
	      number_of_places_left_in_vector = nunits;
	      constant_p = true;
	      elts.new_vector (vector_type, nunits, 1);
	      elts.quick_grow (nunits);
	    }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = voprnds.length ();
  for (j = vec_num; j != 0; j--)
    {
      vop = voprnds[j - 1];
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
    }

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
    for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
	 i++)
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
}
6262
6263 /* Get the Ith vectorized definition from SLP_NODE. */
6264
6265 tree
6266 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
6267 {
6268 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
6269 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
6270 else
6271 return SLP_TREE_VEC_DEFS (slp_node)[i];
6272 }
6273
6274 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
6275
6276 void
6277 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
6278 {
6279 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
6280 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
6281 {
6282 unsigned j;
6283 gimple *vec_def_stmt;
6284 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
6285 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
6286 }
6287 else
6288 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
6289 }
6290
6291 /* Get N vectorized definitions for SLP_NODE. */
6292
6293 void
6294 vect_get_slp_defs (vec_info *,
6295 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
6296 {
6297 if (n == -1U)
6298 n = SLP_TREE_CHILDREN (slp_node).length ();
6299
6300 for (unsigned i = 0; i < n; ++i)
6301 {
6302 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
6303 vec<tree> vec_defs = vNULL;
6304 vect_get_slp_defs (child, &vec_defs);
6305 vec_oprnds->quick_push (vec_defs);
6306 }
6307 }
6308
/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE.  Store the number of vector
   permute instructions in *N_PERMS and the number of vector load
   instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
   that were not needed.  */

bool
vect_transform_slp_perm_load (vec_info *vinfo,
			      slp_tree node, vec<tree> dr_chain,
			      gimple_stmt_iterator *gsi, poly_uint64 vf,
			      bool analyze_only, unsigned *n_perms,
			      unsigned int *n_loads, bool dce_chain)
{
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
  int vec_index = 0;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
  unsigned int mask_element;
  machine_mode mode;

  /* Only grouped accesses have a load permutation to transform.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);

  mode = TYPE_MODE (vectype);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  if (! analyze_only)
    for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
	 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
      SLP_TREE_VEC_STMTS (node).quick_push (NULL);

  /* Generate permutation masks for every NODE. Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4. I.e., we have a
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

  int vect_stmts_counter = 0;
  unsigned int index = 0;
  int first_vec_index = -1;
  int second_vec_index = -1;
  /* NOOP_P tracks whether the mask built so far is the identity, in
     which case no permute stmt needs to be emitted.  */
  bool noop_p = true;
  *n_perms = 0;

  vec_perm_builder mask;
  unsigned int nelts_to_build;
  unsigned int nvectors_per_build;
  unsigned int in_nlanes;
  bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
		      && multiple_p (nunits, group_size));
  if (repeating_p)
    {
      /* A single vector contains a whole number of copies of the node, so:
	 (a) all permutes can use the same mask; and
	 (b) the permutes only need a single vector input.  */
      mask.new_vector (nunits, group_size, 3);
      nelts_to_build = mask.encoded_nelts ();
      nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
      in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
    }
  else
    {
      /* We need to construct a separate mask for each vector statement.  */
      unsigned HOST_WIDE_INT const_nunits, const_vf;
      if (!nunits.is_constant (&const_nunits)
	  || !vf.is_constant (&const_vf))
	return false;
      mask.new_vector (const_nunits, const_nunits, 1);
      nelts_to_build = const_vf * group_size;
      nvectors_per_build = 1;
      in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
    }
  /* Bitmap of input lanes actually referenced, used to compute *N_LOADS.  */
  auto_sbitmap used_in_lanes (in_nlanes);
  bitmap_clear (used_in_lanes);
  /* Bitmap of DR_CHAIN entries used by some permute, for DCE_CHAIN.  */
  auto_bitmap used_defs;

  unsigned int count = mask.encoded_nelts ();
  mask.quick_grow (count);
  vec_perm_indices indices;

  for (unsigned int j = 0; j < nelts_to_build; j++)
    {
      unsigned int iter_num = j / group_size;
      unsigned int stmt_num = j % group_size;
      /* I is the input lane selected by the load permutation.  */
      unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
			+ SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
      bitmap_set_bit (used_in_lanes, i);
      if (repeating_p)
	{
	  first_vec_index = 0;
	  mask_element = i;
	}
      else
	{
	  /* Enforced before the loop when !repeating_p.  */
	  unsigned int const_nunits = nunits.to_constant ();
	  vec_index = i / const_nunits;
	  mask_element = i % const_nunits;
	  if (vec_index == first_vec_index
	      || first_vec_index == -1)
	    {
	      first_vec_index = vec_index;
	    }
	  else if (vec_index == second_vec_index
		   || second_vec_index == -1)
	    {
	      second_vec_index = vec_index;
	      mask_element += const_nunits;
	    }
	  else
	    {
	      /* The mask references a third input vector; VEC_PERM_EXPR
		 only takes two.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "permutation requires at "
				 "least three vectors %G",
				 stmt_info->stmt);
	      gcc_assert (analyze_only);
	      return false;
	    }

	  gcc_assert (mask_element < 2 * const_nunits);
	}

      if (mask_element != index)
	noop_p = false;
      mask[index++] = mask_element;

      /* A full mask has been built; verify the target supports it.  */
      if (index == count && !noop_p)
	{
	  indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
	  if (!can_vec_perm_const_p (mode, indices))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
				   vect_location,
				   "unsupported vect permute { ");
		  for (i = 0; i < count; ++i)
		    {
		      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
		      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
		    }
		  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
		}
	      gcc_assert (analyze_only);
	      return false;
	    }

	  ++*n_perms;
	}

      if (index == count)
	{
	  if (!analyze_only)
	    {
	      tree mask_vec = NULL_TREE;

	      if (! noop_p)
		mask_vec = vect_gen_perm_mask_checked (vectype, indices);

	      if (second_vec_index == -1)
		second_vec_index = first_vec_index;

	      for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
		{
		  /* Generate the permute statement if necessary.  */
		  tree first_vec = dr_chain[first_vec_index + ri];
		  tree second_vec = dr_chain[second_vec_index + ri];
		  gimple *perm_stmt;
		  if (! noop_p)
		    {
		      gassign *stmt = as_a <gassign *> (stmt_info->stmt);
		      tree perm_dest
			= vect_create_destination_var (gimple_assign_lhs (stmt),
						       vectype);
		      perm_dest = make_ssa_name (perm_dest);
		      perm_stmt
			= gimple_build_assign (perm_dest, VEC_PERM_EXPR,
					       first_vec, second_vec,
					       mask_vec);
		      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
						   gsi);
		      if (dce_chain)
			{
			  bitmap_set_bit (used_defs, first_vec_index + ri);
			  bitmap_set_bit (used_defs, second_vec_index + ri);
			}
		    }
		  else
		    {
		      /* If mask was NULL_TREE generate the requested
			 identity transform.  */
		      perm_stmt = SSA_NAME_DEF_STMT (first_vec);
		      if (dce_chain)
			bitmap_set_bit (used_defs, first_vec_index + ri);
		    }

		  /* Store the vector statement in NODE.  */
		  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
		}
	    }

	  /* Reset state for the next mask.  */
	  index = 0;
	  first_vec_index = -1;
	  second_vec_index = -1;
	  noop_p = true;
	}
    }

  if (n_loads)
    {
      if (repeating_p)
	*n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
      else
	{
	  /* Enforced above when !repeating_p.  */
	  unsigned int const_nunits = nunits.to_constant ();
	  /* Count the vectors (groups of CONST_NUNITS lanes) in which at
	     least one lane is actually used.  */
	  *n_loads = 0;
	  bool load_seen = false;
	  for (unsigned i = 0; i < in_nlanes; ++i)
	    {
	      if (i % const_nunits == 0)
		{
		  if (load_seen)
		    *n_loads += 1;
		  load_seen = false;
		}
	      if (bitmap_bit_p (used_in_lanes, i))
		load_seen = true;
	    }
	  if (load_seen)
	    *n_loads += 1;
	}
    }

  /* Remove loads in DR_CHAIN that no generated permute referenced.  */
  if (dce_chain)
    for (unsigned i = 0; i < dr_chain.length (); ++i)
      if (!bitmap_bit_p (used_defs, i))
	{
	  gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
	  gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
	  gsi_remove (&rgsi, true);
	  release_defs (stmt);
	}

  return true;
}
6573
6574 /* Produce the next vector result for SLP permutation NODE by adding a vector
6575 statement at GSI. If MASK_VEC is nonnull, add:
6576
6577 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
6578
6579 otherwise add:
6580
6581 <new SSA name> = FIRST_DEF. */
6582
6583 static void
6584 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6585 slp_tree node, tree first_def, tree second_def,
6586 tree mask_vec)
6587 {
6588 tree vectype = SLP_TREE_VECTYPE (node);
6589
6590 /* ??? We SLP match existing vector element extracts but
6591 allow punning which we need to re-instantiate at uses
6592 but have no good way of explicitly representing. */
6593 if (!types_compatible_p (TREE_TYPE (first_def), vectype))
6594 {
6595 gassign *conv_stmt
6596 = gimple_build_assign (make_ssa_name (vectype),
6597 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
6598 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6599 first_def = gimple_assign_lhs (conv_stmt);
6600 }
6601 gassign *perm_stmt;
6602 tree perm_dest = make_ssa_name (vectype);
6603 if (mask_vec)
6604 {
6605 if (!types_compatible_p (TREE_TYPE (second_def), vectype))
6606 {
6607 gassign *conv_stmt
6608 = gimple_build_assign (make_ssa_name (vectype),
6609 build1 (VIEW_CONVERT_EXPR,
6610 vectype, second_def));
6611 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6612 second_def = gimple_assign_lhs (conv_stmt);
6613 }
6614 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6615 first_def, second_def,
6616 mask_vec);
6617 }
6618 else
6619 /* We need a copy here in case the def was external. */
6620 perm_stmt = gimple_build_assign (perm_dest, first_def);
6621 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
6622 /* Store the vector statement in NODE. */
6623 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
6624 }
6625
6626 /* Vectorize the SLP permutations in NODE as specified
6627 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
6628 child number and lane number.
6629 Interleaving of two two-lane two-child SLP subtrees (not supported):
6630 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
6631 A blend of two four-lane two-child SLP subtrees:
6632 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
6633 Highpart of a four-lane one-child SLP subtree (not supported):
6634 [ { 0, 2 }, { 0, 3 } ]
6635 Where currently only a subset is supported by code generating below. */
6636
6637 static bool
6638 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6639 slp_tree node, stmt_vector_for_cost *cost_vec)
6640 {
6641 tree vectype = SLP_TREE_VECTYPE (node);
6642
6643 /* ??? We currently only support all same vector input and output types
6644 while the SLP IL should really do a concat + select and thus accept
6645 arbitrary mismatches. */
6646 slp_tree child;
6647 unsigned i;
6648 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6649 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
6650 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6651 {
6652 if (!vect_maybe_update_slp_op_vectype (child, vectype)
6653 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
6654 {
6655 if (dump_enabled_p ())
6656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6657 "Unsupported lane permutation\n");
6658 return false;
6659 }
6660 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
6661 repeating_p = false;
6662 }
6663
6664 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
6665 gcc_assert (perm.length () == SLP_TREE_LANES (node));
6666 if (dump_enabled_p ())
6667 {
6668 dump_printf_loc (MSG_NOTE, vect_location,
6669 "vectorizing permutation");
6670 for (unsigned i = 0; i < perm.length (); ++i)
6671 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
6672 if (repeating_p)
6673 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
6674 dump_printf (MSG_NOTE, "\n");
6675 }
6676
6677 /* REPEATING_P is true if every output vector is guaranteed to use the
6678 same permute vector. We can handle that case for both variable-length
6679 and constant-length vectors, but we only handle other cases for
6680 constant-length vectors.
6681
6682 Set:
6683
6684 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
6685 mask vector that we want to build.
6686
6687 - NCOPIES to the number of copies of PERM that we need in order
6688 to build the necessary permute mask vectors.
6689
6690 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
6691 for each permute mask vector. This is only relevant when GSI is
6692 nonnull. */
6693 uint64_t npatterns;
6694 unsigned nelts_per_pattern;
6695 uint64_t ncopies;
6696 unsigned noutputs_per_mask;
6697 if (repeating_p)
6698 {
6699 /* We need a single permute mask vector that has the form:
6700
6701 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
6702
6703 In other words, the original n-element permute in PERM is
6704 "unrolled" to fill a full vector. The stepped vector encoding
6705 that we use for permutes requires 3n elements. */
6706 npatterns = SLP_TREE_LANES (node);
6707 nelts_per_pattern = ncopies = 3;
6708 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6709 }
6710 else
6711 {
6712 /* Calculate every element of every permute mask vector explicitly,
6713 instead of relying on the pattern described above. */
6714 if (!nunits.is_constant (&npatterns))
6715 return false;
6716 nelts_per_pattern = ncopies = 1;
6717 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
6718 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
6719 return false;
6720 noutputs_per_mask = 1;
6721 }
6722 unsigned olanes = ncopies * SLP_TREE_LANES (node);
6723 gcc_assert (repeating_p || multiple_p (olanes, nunits));
6724
6725 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
6726 from the { SLP operand, scalar lane } permutation as recorded in the
6727 SLP node as intermediate step. This part should already work
6728 with SLP children with arbitrary number of lanes. */
6729 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
6730 auto_vec<unsigned> active_lane;
6731 vperm.create (olanes);
6732 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
6733 for (unsigned i = 0; i < ncopies; ++i)
6734 {
6735 for (unsigned pi = 0; pi < perm.length (); ++pi)
6736 {
6737 std::pair<unsigned, unsigned> p = perm[pi];
6738 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
6739 if (repeating_p)
6740 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
6741 else
6742 {
6743 /* We checked above that the vectors are constant-length. */
6744 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
6745 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
6746 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
6747 vperm.quick_push ({{p.first, vi}, vl});
6748 }
6749 }
6750 /* Advance to the next group. */
6751 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
6752 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
6753 }
6754
6755 if (dump_enabled_p ())
6756 {
6757 dump_printf_loc (MSG_NOTE, vect_location, "as");
6758 for (unsigned i = 0; i < vperm.length (); ++i)
6759 {
6760 if (i != 0
6761 && (repeating_p
6762 ? multiple_p (i, npatterns)
6763 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
6764 dump_printf (MSG_NOTE, ",");
6765 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
6766 vperm[i].first.first, vperm[i].first.second,
6767 vperm[i].second);
6768 }
6769 dump_printf (MSG_NOTE, "\n");
6770 }
6771
6772 /* We can only handle two-vector permutes, everything else should
6773 be lowered on the SLP level. The following is closely inspired
6774 by vect_transform_slp_perm_load and is supposed to eventually
6775 replace it.
6776 ??? As intermediate step do code-gen in the SLP tree representation
6777 somehow? */
6778 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
6779 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
6780 unsigned int index = 0;
6781 poly_uint64 mask_element;
6782 vec_perm_builder mask;
6783 mask.new_vector (nunits, npatterns, nelts_per_pattern);
6784 unsigned int count = mask.encoded_nelts ();
6785 mask.quick_grow (count);
6786 vec_perm_indices indices;
6787 unsigned nperms = 0;
6788 for (unsigned i = 0; i < vperm.length (); ++i)
6789 {
6790 mask_element = vperm[i].second;
6791 if (first_vec.first == -1U
6792 || first_vec == vperm[i].first)
6793 first_vec = vperm[i].first;
6794 else if (second_vec.first == -1U
6795 || second_vec == vperm[i].first)
6796 {
6797 second_vec = vperm[i].first;
6798 mask_element += nunits;
6799 }
6800 else
6801 {
6802 if (dump_enabled_p ())
6803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6804 "permutation requires at "
6805 "least three vectors\n");
6806 gcc_assert (!gsi);
6807 return false;
6808 }
6809
6810 mask[index++] = mask_element;
6811
6812 if (index == count)
6813 {
6814 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits);
6815 bool identity_p = indices.series_p (0, 1, 0, 1);
6816 if (!identity_p
6817 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6818 {
6819 if (dump_enabled_p ())
6820 {
6821 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
6822 vect_location,
6823 "unsupported vect permute { ");
6824 for (i = 0; i < count; ++i)
6825 {
6826 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
6827 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
6828 }
6829 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
6830 }
6831 gcc_assert (!gsi);
6832 return false;
6833 }
6834
6835 if (!identity_p)
6836 nperms++;
6837 if (gsi)
6838 {
6839 if (second_vec.first == -1U)
6840 second_vec = first_vec;
6841
6842 slp_tree
6843 first_node = SLP_TREE_CHILDREN (node)[first_vec.first],
6844 second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
6845
6846 tree mask_vec = NULL_TREE;
6847 if (!identity_p)
6848 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
6849
6850 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
6851 {
6852 tree first_def
6853 = vect_get_slp_vect_def (first_node,
6854 first_vec.second + vi);
6855 tree second_def
6856 = vect_get_slp_vect_def (second_node,
6857 second_vec.second + vi);
6858 vect_add_slp_permutation (vinfo, gsi, node, first_def,
6859 second_def, mask_vec);
6860 }
6861 }
6862
6863 index = 0;
6864 first_vec = std::make_pair (-1U, -1U);
6865 second_vec = std::make_pair (-1U, -1U);
6866 }
6867 }
6868
6869 if (!gsi)
6870 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
6871
6872 return true;
6873 }
6874
/* Vectorize SLP NODE: emit the vector statements for NODE at a suitable
   insertion point.  Externals/constants are materialized via
   vect_create_constant_vectors; pre-existing vector defs are left alone.
   VINFO is the enclosing vectorization info, INSTANCE the SLP instance
   NODE belongs to.  */

static void
vect_schedule_slp_node (vec_info *vinfo,
			slp_tree node, slp_instance instance)
{
  gimple_stmt_iterator si;
  int i;
  slp_tree child;

  /* For existing vectors there's nothing to do.  */
  if (SLP_TREE_VEC_DEFS (node).exists ())
    return;

  gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());

  /* Vectorize externals and constants.  */
  if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    {
      /* ??? vectorizable_shift can end up using a scalar operand which is
	 currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
	 node in this case.  */
      if (!SLP_TREE_VECTYPE (node))
	return;

      vect_create_constant_vectors (vinfo, node);
      return;
    }

  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);

  gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
  SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "------>vectorizing SLP node starting from: %G",
		     stmt_info->stmt);

  /* Choose the insertion point SI for the vector stmts of this node.  */
  if (STMT_VINFO_DATA_REF (stmt_info)
      && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
    {
      /* Vectorized loads go before the first scalar load to make it
	 ready early, vectorized stores go before the last scalar
	 stmt which is where all uses are ready.  */
      stmt_vec_info last_stmt_info = NULL;
      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
      else /* DR_IS_WRITE */
	last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
      si = gsi_for_stmt (last_stmt_info->stmt);
    }
  else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
	    || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
	    || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
	   && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
    {
      /* For PHI node vectorization we do not use the insertion iterator.  */
      si = gsi_none ();
    }
  else
    {
      /* Emit other stmts after the children vectorized defs which is
	 earliest possible.  Compute LAST_STMT as the latest def (in the
	 dominance order checked by vect_stmt_dominates_stmt_p) among all
	 children's defs.  */
      gimple *last_stmt = NULL;
      bool seen_vector_def = false;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	  {
	    /* For fold-left reductions we are retaining the scalar
	       reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
	       set so the representation isn't perfect.  Resort to the
	       last scalar def here.  */
	    if (SLP_TREE_VEC_STMTS (child).is_empty ())
	      {
		gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
			    == cycle_phi_info_type);
		gphi *phi = as_a <gphi *>
		  (vect_find_last_scalar_stmt_in_slp (child)->stmt);
		if (!last_stmt
		    || vect_stmt_dominates_stmt_p (last_stmt, phi))
		  last_stmt = phi;
	      }
	    /* We are emitting all vectorized stmts in the same place and
	       the last one is the last.
	       ??? Unless we have a load permutation applied and that
	       figures to re-use an earlier generated load.  */
	    unsigned j;
	    gimple *vstmt;
	    FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
	      if (!last_stmt
		  || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
		last_stmt = vstmt;
	  }
	else if (!SLP_TREE_VECTYPE (child))
	  {
	    /* For externals we use unvectorized at all scalar defs.  */
	    unsigned j;
	    tree def;
	    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
	      if (TREE_CODE (def) == SSA_NAME
		  && !SSA_NAME_IS_DEFAULT_DEF (def))
		{
		  gimple *stmt = SSA_NAME_DEF_STMT (def);
		  if (!last_stmt
		      || vect_stmt_dominates_stmt_p (last_stmt, stmt))
		    last_stmt = stmt;
		}
	  }
	else
	  {
	    /* For externals we have to look at all defs since their
	       insertion place is decided per vector.  But beware
	       of pre-existing vectors where we need to make sure
	       we do not insert before the region boundary.  */
	    if (SLP_TREE_SCALAR_OPS (child).is_empty ()
		&& !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
	      seen_vector_def = true;
	    else
	      {
		unsigned j;
		tree vdef;
		FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
		  if (TREE_CODE (vdef) == SSA_NAME
		      && !SSA_NAME_IS_DEFAULT_DEF (vdef))
		    {
		      gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
		      if (!last_stmt
			  || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
			last_stmt = vstmt;
		    }
	      }
	  }
      /* This can happen when all children are pre-existing vectors or
	 constants.  */
      if (!last_stmt)
	last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
      /* NOTE(review): the second !last_stmt test below implies the
	 stmt_vec_info's ->stmt can be NULL here — confirm against
	 vect_find_first_scalar_stmt_in_slp.  */
      if (!last_stmt)
	{
	  /* Only pre-existing vector defs were seen; insert at the start
	     of the region.  */
	  gcc_assert (seen_vector_def);
	  si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
	}
      else if (is_a <gphi *> (last_stmt))
	/* Cannot insert after a PHI; insert after the labels of its BB.  */
	si = gsi_after_labels (gimple_bb (last_stmt));
      else
	{
	  si = gsi_for_stmt (last_stmt);
	  gsi_next (&si);
	}
    }

  bool done_p = false;

  /* Handle purely internal nodes.  */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    {
      /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
	 be shared with different SLP nodes (but usually it's the same
	 operation apart from the case the stmt is only there for denoting
	 the actual scalar lane defs ...).  So do not call vect_transform_stmt
	 but open-code it here (partly).  */
      bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
      gcc_assert (done);
      done_p = true;
    }
  if (!done_p)
    vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
}
7044
7045 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
7046 For loop vectorization this is done in vectorizable_call, but for SLP
7047 it needs to be deferred until end of vect_schedule_slp, because multiple
7048 SLP instances may refer to the same scalar stmt. */
7049
7050 static void
7051 vect_remove_slp_scalar_calls (vec_info *vinfo,
7052 slp_tree node, hash_set<slp_tree> &visited)
7053 {
7054 gimple *new_stmt;
7055 gimple_stmt_iterator gsi;
7056 int i;
7057 slp_tree child;
7058 tree lhs;
7059 stmt_vec_info stmt_info;
7060
7061 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7062 return;
7063
7064 if (visited.add (node))
7065 return;
7066
7067 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7068 vect_remove_slp_scalar_calls (vinfo, child, visited);
7069
7070 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7071 {
7072 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
7073 if (!stmt || gimple_bb (stmt) == NULL)
7074 continue;
7075 if (is_pattern_stmt_p (stmt_info)
7076 || !PURE_SLP_STMT (stmt_info))
7077 continue;
7078 lhs = gimple_call_lhs (stmt);
7079 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
7080 gsi = gsi_for_stmt (stmt);
7081 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
7082 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
7083 }
7084 }
7085
/* Wrapper for the recursive worker above: removes scalar calls in the
   SLP tree rooted at NODE, using a fresh visited set.  */

static void
vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_remove_slp_scalar_calls (vinfo, node, visited);
}
7092
/* Vectorize the instance root.  For a constructor instance the root stmt
   of INSTANCE is replaced by an assignment from the vectorized defs of
   NODE; for a BB reduction instance the vector defs are reduced to a
   scalar and the root stmt's RHS is rewritten to it.  */

void
vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
{
  gassign *rstmt = NULL;

  if (instance->kind == slp_inst_kind_ctor)
    {
      if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
	{
	  /* A single vector stmt: assign its LHS to the root LHS, via a
	     VIEW_CONVERT_EXPR when the types are not trivially
	     convertible.  The loop breaks after the first element.  */
	  gimple *child_stmt;
	  int j;

	  FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
	    {
	      tree vect_lhs = gimple_get_lhs (child_stmt);
	      tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
	      if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
					      TREE_TYPE (vect_lhs)))
		vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
				   vect_lhs);
	      rstmt = gimple_build_assign (root_lhs, vect_lhs);
	      break;
	    }
	}
      else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
	{
	  /* Multiple vector stmts: collect their LHSs into a CONSTRUCTOR
	     of the root stmt's RHS type.  */
	  int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
	  gimple *child_stmt;
	  int j;
	  vec<constructor_elt, va_gc> *v;
	  vec_alloc (v, nelts);

	  FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
	    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
				    gimple_get_lhs (child_stmt));
	  tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
	  tree rtype
	    = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
	  tree r_constructor = build_constructor (rtype, v);
	  rstmt = gimple_build_assign (lhs, r_constructor);
	}
    }
  else if (instance->kind == slp_inst_kind_bb_reduc)
    {
      /* Largely inspired by reduction chain epilogue handling in
	 vect_create_epilog_for_reduction.  */
      vec<tree> vec_defs = vNULL;
      vect_get_slp_defs (node, &vec_defs);
      enum tree_code reduc_code
	= gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
      /* ??? We actually have to reflect signs somewhere.  */
      if (reduc_code == MINUS_EXPR)
	reduc_code = PLUS_EXPR;
      gimple_seq epilogue = NULL;
      /* We may end up with more than one vector result, reduce them
	 to one vector.  */
      tree vec_def = vec_defs[0];
      for (unsigned i = 1; i < vec_defs.length (); ++i)
	vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
				vec_def, vec_defs[i]);
      vec_defs.release ();
      /* ??? Support other schemes than direct internal fn.  */
      internal_fn reduc_fn;
      if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
	  || reduc_fn == IFN_LAST)
	gcc_unreachable ();
      /* Reduce the single remaining vector to a scalar via the internal
	 reduction function.  */
      tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
				      TREE_TYPE (TREE_TYPE (vec_def)), vec_def);

      /* Insert the epilogue before the root stmt and rewrite its RHS
	 to the reduced scalar.  */
      gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
      gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
      gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
      update_stmt (gsi_stmt (rgsi));
      return;
    }
  else
    gcc_unreachable ();

  gcc_assert (rstmt);

  /* Replace the original root stmt with the newly built assignment.  */
  gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
  gsi_replace (&rgsi, rstmt, true);
}
7178
/* Per-node bookkeeping for the Tarjan-style SCC walk performed by
   vect_schedule_scc.  */

struct slp_scc_info
{
  /* Whether the node is currently on the DFS walk's stack.  */
  bool on_stack;
  /* DFS pre-order number assigned when the node is first visited.  */
  int dfs;
  /* Smallest DFS number reachable from this node; equal to DFS for
     the root of an SCC.  */
  int lowlink;
};
7185
/* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.
   This is Tarjan's algorithm: per-node dfs/lowlink state lives in
   SCC_INFO, MAXDFS is the running DFS counter and STACK the explicit
   node stack.  Acyclic nodes are scheduled in post-order; nodes of a
   cycle are scheduled iteratively as they become ready, with PHIs used
   to break the cycle, and the PHI backedge args are filled in last.  */

static void
vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
		   hash_map<slp_tree, slp_scc_info> &scc_info,
		   int &maxdfs, vec<slp_tree> &stack)
{
  bool existed_p;
  slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
  gcc_assert (!existed_p);
  info->dfs = maxdfs;
  info->lowlink = maxdfs;
  maxdfs++;

  /* Leaf.  */
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    {
      info->on_stack = false;
      vect_schedule_slp_node (vinfo, node, instance);
      return;
    }

  info->on_stack = true;
  stack.safe_push (node);

  unsigned i;
  slp_tree child;
  /* DFS recurse.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (!child)
	continue;
      slp_scc_info *child_info = scc_info.get (child);
      if (!child_info)
	{
	  vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
	  /* Recursion might have re-allocated the node.  */
	  info = scc_info.get (node);
	  child_info = scc_info.get (child);
	  info->lowlink = MIN (info->lowlink, child_info->lowlink);
	}
      else if (child_info->on_stack)
	info->lowlink = MIN (info->lowlink, child_info->dfs);
    }
  /* Not an SCC root: leave the node on the stack for the root to pop.  */
  if (info->lowlink != info->dfs)
    return;

  auto_vec<slp_tree, 4> phis_to_fixup;

  /* Singleton.  */
  if (stack.last () == node)
    {
      stack.pop ();
      info->on_stack = false;
      vect_schedule_slp_node (vinfo, node, instance);
      if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
	  && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
	phis_to_fixup.quick_push (node);
    }
  else
    {
      /* SCC.  */
      int last_idx = stack.length () - 1;
      while (stack[last_idx] != node)
	last_idx--;
      /* We can break the cycle at PHIs who have at least one child
	 code generated.  Then we could re-start the DFS walk until
	 all nodes in the SCC are covered (we might have new entries
	 for only back-reachable nodes).  But it's simpler to just
	 iterate and schedule those that are ready.  */
      unsigned todo = stack.length () - last_idx;
      do
	{
	  for (int idx = stack.length () - 1; idx >= last_idx; --idx)
	    {
	      slp_tree entry = stack[idx];
	      /* Already scheduled entries were NULLed out below.  */
	      if (!entry)
		continue;
	      bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
			  && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
	      /* A non-PHI is ready when none of its children are still
		 on the stack; a PHI is ready as soon as one child is
		 done (or missing) — its remaining args are fixed up
		 after the SCC completes.  */
	      bool ready = !phi;
	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
		  if (!child)
		    {
		      gcc_assert (phi);
		      ready = true;
		      break;
		    }
		  else if (scc_info.get (child)->on_stack)
		    {
		      if (!phi)
			{
			  ready = false;
			  break;
			}
		    }
		  else
		    {
		      if (phi)
			{
			  ready = true;
			  break;
			}
		    }
	      if (ready)
		{
		  vect_schedule_slp_node (vinfo, entry, instance);
		  scc_info.get (entry)->on_stack = false;
		  stack[idx] = NULL;
		  todo--;
		  if (phi)
		    phis_to_fixup.safe_push (entry);
		}
	    }
	}
      while (todo != 0);

      /* Pop the SCC.  */
      stack.truncate (last_idx);
    }

  /* Now fixup the backedge def of the vectorized PHIs in this SCC.  */
  slp_tree phi_node;
  FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
    {
      gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
      edge_iterator ei;
      edge e;
      FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
	{
	  unsigned dest_idx = e->dest_idx;
	  child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
	  if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
	    continue;
	  /* Simply fill all args.  */
	  for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
	    add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
			 vect_get_slp_vect_def (child, i),
			 e, gimple_phi_arg_location (phi, dest_idx));
	}
    }
}
7328
/* Generate vector code for SLP_INSTANCES in the loop/basic block.
   First schedules every instance tree (sharing SCC state so nodes common
   to several instances are visited once), then cleans up the scalar IL:
   removes vectorized scalar calls (loop vectorization only) and the
   original scalar store stmts.  */

void
vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
{
  slp_instance instance;
  unsigned int i;

  /* SCC state shared across all instances; vect_schedule_scc asserts a
     node is entered only once.  */
  hash_map<slp_tree, slp_scc_info> scc_info;
  int maxdfs = 0;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree node = SLP_INSTANCE_TREE (instance);
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Vectorizing SLP tree:\n");
	  /* ??? Dump all?  */
	  if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	    dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
			     SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
	  vect_print_slp_graph (MSG_NOTE, vect_location,
				SLP_INSTANCE_TREE (instance));
	}
      /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
	 have a PHI be the node breaking the cycle.  */
      auto_vec<slp_tree> stack;
      if (!scc_info.get (node))
	vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);

      if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	vectorize_slp_instance_root_stmt (node, instance);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vectorizing stmts using SLP.\n");
    }

  /* Second pass: remove now-dead scalar stmts.  */
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      stmt_vec_info store_info;
      unsigned int j;

      /* Remove scalar call stmts.  Do not do this for basic-block
	 vectorization as not all uses may be vectorized.
	 ??? Why should this be necessary?  DCE should be able to
	 remove the stmts itself.
	 ??? For BB vectorization we can as well remove scalar
	 stmts starting from the SLP tree root if they have no
	 uses.  */
      if (is_a <loop_vec_info> (vinfo))
	vect_remove_slp_scalar_calls (vinfo, root);

      /* Remove vectorized stores original scalar stmts.  */
      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
	{
	  /* Stop at the first non-store; only a leading run of stores is
	     removed.  */
	  if (!STMT_VINFO_DATA_REF (store_info)
	      || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
	    break;

	  store_info = vect_orig_stmt (store_info);
	  /* Free the attached stmt_vec_info and remove the stmt.  */
	  vinfo->remove_stmt (store_info);

	  /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
	     to not crash in vect_free_slp_tree later.  */
	  if (SLP_TREE_REPRESENTATIVE (root) == store_info)
	    SLP_TREE_REPRESENTATIVE (root) = NULL;
	}
    }
}