1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51
52 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
53 slp_tree, stmt_vector_for_cost *);
54
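/* Allocation pool backing the _slp_tree operator new/delete overloads
   below (descriptive note; ownership/lifetime of the pool is managed by
   the callers that drive SLP discovery).  */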
55 object_allocator<_slp_tree> *slp_tree_pool;
56
57 void *
58 _slp_tree::operator new (size_t n)
59 {
60 gcc_assert (n == sizeof (_slp_tree));
61 return slp_tree_pool->allocate_raw ();
62 }
63
64 void
65 _slp_tree::operator delete (void *node, size_t n)
66 {
67 gcc_assert (n == sizeof (_slp_tree));
68 slp_tree_pool->remove_raw (node);
69 }
70
71
72 /* Initialize a SLP node. */
73
74 _slp_tree::_slp_tree ()
75 {
76 SLP_TREE_SCALAR_STMTS (this) = vNULL;
77 SLP_TREE_SCALAR_OPS (this) = vNULL;
78 SLP_TREE_VEC_STMTS (this) = vNULL;
79 SLP_TREE_VEC_DEFS (this) = vNULL;
80 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
81 SLP_TREE_CHILDREN (this) = vNULL;
82 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
83 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
84 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
85 SLP_TREE_CODE (this) = ERROR_MARK;
86 SLP_TREE_VECTYPE (this) = NULL_TREE;
87 SLP_TREE_REPRESENTATIVE (this) = NULL;
88 SLP_TREE_REF_COUNT (this) = 1;
89 this->max_nunits = 1;
90 this->lanes = 0;
91 }
92
93 /* Tear down a SLP node. */
94
95 _slp_tree::~_slp_tree ()
96 {
97 SLP_TREE_CHILDREN (this).release ();
98 SLP_TREE_SCALAR_STMTS (this).release ();
99 SLP_TREE_SCALAR_OPS (this).release ();
100 SLP_TREE_VEC_STMTS (this).release ();
101 SLP_TREE_VEC_DEFS (this).release ();
102 SLP_TREE_LOAD_PERMUTATION (this).release ();
103 SLP_TREE_LANE_PERMUTATION (this).release ();
104 }
105
106 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
107
108 static void
109 vect_free_slp_tree (slp_tree node)
110 {
111 int i;
112 slp_tree child;
113
114 if (--SLP_TREE_REF_COUNT (node) != 0)
115 return;
116
117 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
118 if (child)
119 vect_free_slp_tree (child);
120
121 delete node;
122 }
123
124 /* Return a location suitable for dumps related to the SLP instance. */
125
126 dump_user_location_t
127 _slp_instance::location () const
128 {
129 if (root_stmt)
130 return root_stmt->stmt;
131 else
132 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
133 }
134
135
136 /* Free the memory allocated for the SLP instance. */
137
138 void
139 vect_free_slp_instance (slp_instance instance)
140 {
141 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
142 SLP_INSTANCE_LOADS (instance).release ();
143 instance->subgraph_entries.release ();
144 instance->cost_vec.release ();
145 free (instance);
146 }
147
148
149 /* Create an SLP node for SCALAR_STMTS. */
150
151 static slp_tree
152 vect_create_new_slp_node (slp_tree node,
153 vec<stmt_vec_info> scalar_stmts, unsigned nops)
154 {
155 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
156 SLP_TREE_CHILDREN (node).create (nops);
157 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
158 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
159 SLP_TREE_LANES (node) = scalar_stmts.length ();
160 return node;
161 }
162
163 /* Create an SLP node for SCALAR_STMTS. */
164
165 static slp_tree
166 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
167 {
168 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
169 }
170
171 /* Create an SLP node for OPS. */
172
173 static slp_tree
174 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
175 {
176 SLP_TREE_SCALAR_OPS (node) = ops;
177 SLP_TREE_DEF_TYPE (node) = vect_external_def;
178 SLP_TREE_LANES (node) = ops.length ();
179 return node;
180 }
181
182 /* Create an SLP node for OPS. */
183
184 static slp_tree
185 vect_create_new_slp_node (vec<tree> ops)
186 {
187 return vect_create_new_slp_node (new _slp_tree, ops);
188 }
189
190
191 /* This structure is used in creation of an SLP tree. Each instance
192 corresponds to the same operand in a group of scalar stmts in an SLP
193 node. */
194 typedef struct _slp_oprnd_info
195 {
196 /* Def-stmts for the operands. */
197 vec<stmt_vec_info> def_stmts;
198 /* Operands. */
199 vec<tree> ops;
200 /* Information about the first statement, its vector def-type, type, the
201 operand itself in case it's constant, and an indication if it's a pattern
202 stmt. */
203 tree first_op_type;
204 enum vect_def_type first_dt;
205 bool any_pattern;
206 } *slp_oprnd_info;
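/* Illustrative example (not taken from a particular testcase): for the
   two-stmt group
     a[0] = b[0] + c[0];
     a[1] = b[1] + c[1];
   oprnds_info[0] collects the def-stmts/operands of b[0] and b[1] while
   oprnds_info[1] collects those of c[0] and c[1].  */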
207
208
209 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
210 operand. */
211 static vec<slp_oprnd_info>
212 vect_create_oprnd_info (int nops, int group_size)
213 {
214 int i;
215 slp_oprnd_info oprnd_info;
216 vec<slp_oprnd_info> oprnds_info;
217
218 oprnds_info.create (nops);
219 for (i = 0; i < nops; i++)
220 {
221 oprnd_info = XNEW (struct _slp_oprnd_info);
222 oprnd_info->def_stmts.create (group_size);
223 oprnd_info->ops.create (group_size);
224 oprnd_info->first_dt = vect_uninitialized_def;
225 oprnd_info->first_op_type = NULL_TREE;
226 oprnd_info->any_pattern = false;
227 oprnds_info.quick_push (oprnd_info);
228 }
229
230 return oprnds_info;
231 }
232
233
234 /* Free operands info. */
235
236 static void
237 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
238 {
239 int i;
240 slp_oprnd_info oprnd_info;
241
242 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
243 {
244 oprnd_info->def_stmts.release ();
245 oprnd_info->ops.release ();
246 XDELETE (oprnd_info);
247 }
248
249 oprnds_info.release ();
250 }
251
252
253 /* Return true if STMTS contains a pattern statement. */
254
255 static bool
256 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
257 {
258 stmt_vec_info stmt_info;
259 unsigned int i;
260 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
261 if (is_pattern_stmt_p (stmt_info))
262 return true;
263 return false;
264 }
265
266 /* Return true when all lanes in the external or constant NODE have
267 the same value. */
268
269 static bool
270 vect_slp_tree_uniform_p (slp_tree node)
271 {
272 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
273 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
274
275 /* Pre-existing vectors. */
276 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
277 return false;
278
279 unsigned i;
280 tree op, first = NULL_TREE;
281 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
282 if (!first)
283 first = op;
284 else if (!operand_equal_p (first, op, 0))
285 return false;
286
287 return true;
288 }
289
290 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
291 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
292 of the chain. */
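/* For example (illustrative): in an interleaved group accessing a[0], a[2]
   and a[3], with a gap at a[1], the places returned for the three stmts are
   0, 2 and 3 respectively, since each element's DR_GROUP_GAP gives its
   distance from the previous element in the chain.  */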
293
294 int
295 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
296 stmt_vec_info first_stmt_info)
297 {
298 stmt_vec_info next_stmt_info = first_stmt_info;
299 int result = 0;
300
301 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
302 return -1;
303
304 do
305 {
306 if (next_stmt_info == stmt_info)
307 return result;
308 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
309 if (next_stmt_info)
310 result += DR_GROUP_GAP (next_stmt_info);
311 }
312 while (next_stmt_info);
313
314 return -1;
315 }
316
317 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
318 using the method implemented by duplicate_and_interleave. Return true
319 if so, returning the number of intermediate vectors in *NVECTORS_OUT
320 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
321 (if nonnull). */
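/* Rough illustration (assumed element size, not tied to a particular
   target): for COUNT == 4 and a 32-bit ELT_TYPE the loop below first tries
   to fuse all four elements into a single integer element (NVECTORS == 1),
   then two elements at a time (NVECTORS == 2), and so on, halving the fused
   element size and doubling NVECTORS until a suitable integer mode and the
   required permutations are found or the size can no longer be halved.  */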
322
323 bool
324 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
325 tree elt_type, unsigned int *nvectors_out,
326 tree *vector_type_out,
327 tree *permutes)
328 {
329 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
330 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
331 return false;
332
333 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
334 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
335 unsigned int nvectors = 1;
336 for (;;)
337 {
338 scalar_int_mode int_mode;
339 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
340 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
341 {
342 /* Get the natural vector type for this SLP group size. */
343 tree int_type = build_nonstandard_integer_type
344 (GET_MODE_BITSIZE (int_mode), 1);
345 tree vector_type
346 = get_vectype_for_scalar_type (vinfo, int_type, count);
347 if (vector_type
348 && VECTOR_MODE_P (TYPE_MODE (vector_type))
349 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
350 GET_MODE_SIZE (base_vector_mode)))
351 {
352 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
353 together into elements of type INT_TYPE and using the result
354 to build NVECTORS vectors. */
355 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
356 vec_perm_builder sel1 (nelts, 2, 3);
357 vec_perm_builder sel2 (nelts, 2, 3);
358 poly_int64 half_nelts = exact_div (nelts, 2);
359 for (unsigned int i = 0; i < 3; ++i)
360 {
361 sel1.quick_push (i);
362 sel1.quick_push (i + nelts);
363 sel2.quick_push (half_nelts + i);
364 sel2.quick_push (half_nelts + i + nelts);
365 }
366 vec_perm_indices indices1 (sel1, 2, nelts);
367 vec_perm_indices indices2 (sel2, 2, nelts);
368 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
369 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
370 {
371 if (nvectors_out)
372 *nvectors_out = nvectors;
373 if (vector_type_out)
374 *vector_type_out = vector_type;
375 if (permutes)
376 {
377 permutes[0] = vect_gen_perm_mask_checked (vector_type,
378 indices1);
379 permutes[1] = vect_gen_perm_mask_checked (vector_type,
380 indices2);
381 }
382 return true;
383 }
384 }
385 }
386 if (!multiple_p (elt_bytes, 2, &elt_bytes))
387 return false;
388 nvectors *= 2;
389 }
390 }
391
392 /* Return true if DTA and DTB match. */
393
394 static bool
395 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
396 {
397 return (dta == dtb
398 || ((dta == vect_external_def || dta == vect_constant_def)
399 && (dtb == vect_external_def || dtb == vect_constant_def)));
400 }
401
402 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
403 they are of a valid type and that they match the defs of the first stmt of
404 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
405 by swapping operands of STMTS[STMT_NUM] when possible. A non-zero *SWAP
406 indicates that a swap is required for cond_expr stmts. Specifically, *SWAP
407 is 1 if STMT is a cond and the operands of the comparison need to be swapped;
408 *SWAP is 2 if STMT is a cond and the comparison code needs to be inverted.
409 If any operands are swapped in this function, *SWAP is set to a non-zero
410 value.
411 If there was a fatal error return -1; if the error could be corrected by
412 swapping operands of the parent node of this one, return 1; if everything is
413 ok return 0. */
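/* Illustrative example (not from a particular testcase): for the group
     a[0] = b[0] + c[0];
     a[1] = c[1] + b[1];
   the defs of the second stmt can be swapped by the code below so that
   operand 0 of every lane is the load from b[] and operand 1 the load
   from c[].  */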
414 static int
415 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
416 bool *skip_args,
417 vec<stmt_vec_info> stmts, unsigned stmt_num,
418 vec<slp_oprnd_info> *oprnds_info)
419 {
420 stmt_vec_info stmt_info = stmts[stmt_num];
421 tree oprnd;
422 unsigned int i, number_of_oprnds;
423 enum vect_def_type dt = vect_uninitialized_def;
424 slp_oprnd_info oprnd_info;
425 int first_op_idx = 1;
426 unsigned int commutative_op = -1U;
427 bool first_op_cond = false;
428 bool first = stmt_num == 0;
429
430 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
431 {
432 number_of_oprnds = gimple_call_num_args (stmt);
433 first_op_idx = 3;
434 if (gimple_call_internal_p (stmt))
435 {
436 internal_fn ifn = gimple_call_internal_fn (stmt);
437 commutative_op = first_commutative_argument (ifn);
438
439 /* Masked load, only look at mask. */
440 if (ifn == IFN_MASK_LOAD)
441 {
442 number_of_oprnds = 1;
443 /* Mask operand index. */
444 first_op_idx = 5;
445 }
446 }
447 }
448 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
449 {
450 enum tree_code code = gimple_assign_rhs_code (stmt);
451 number_of_oprnds = gimple_num_ops (stmt) - 1;
452 /* A swap can only be done for cond_expr if asked to, otherwise we
453 could end up with a different comparison code from the first stmt. */
454 if (code == COND_EXPR
455 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
456 {
457 first_op_cond = true;
458 number_of_oprnds++;
459 }
460 else
461 commutative_op = commutative_tree_code (code) ? 0U : -1U;
462 }
463 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
464 number_of_oprnds = gimple_phi_num_args (stmt);
465 else
466 return -1;
467
468 bool swapped = (swap != 0);
469 bool backedge = false;
470 gcc_assert (!swapped || first_op_cond);
471 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
472 for (i = 0; i < number_of_oprnds; i++)
473 {
474 if (first_op_cond)
475 {
476 /* Map indicating how operands of cond_expr should be swapped. */
477 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
478 int *map = maps[swap];
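	      /* Illustrative reading of the table: maps[1] (SWAP == 1) swaps
		 the two operands of the comparison, while maps[2] (SWAP == 2)
		 swaps the two value operands of the COND_EXPR, which has the
		 effect of inverting the comparison code.  */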
479
480 if (i < 2)
481 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
482 first_op_idx), map[i]);
483 else
484 oprnd = gimple_op (stmt_info->stmt, map[i]);
485 }
486 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
487 {
488 oprnd = gimple_phi_arg_def (stmt, i);
489 backedge = dominated_by_p (CDI_DOMINATORS,
490 gimple_phi_arg_edge (stmt, i)->src,
491 gimple_bb (stmt_info->stmt));
492 }
493 else
494 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
495 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
496 oprnd = TREE_OPERAND (oprnd, 0);
497
498 oprnd_info = (*oprnds_info)[i];
499
500 stmt_vec_info def_stmt_info;
501 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
502 {
503 if (dump_enabled_p ())
504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
505 "Build SLP failed: can't analyze def for %T\n",
506 oprnd);
507
508 return -1;
509 }
510
511 if (skip_args[i])
512 {
513 oprnd_info->def_stmts.quick_push (NULL);
514 oprnd_info->ops.quick_push (NULL_TREE);
515 oprnd_info->first_dt = vect_uninitialized_def;
516 continue;
517 }
518
519 if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
520 oprnd_info->any_pattern = true;
521
522 oprnd_info->def_stmts.quick_push (def_stmt_info);
523 oprnd_info->ops.quick_push (oprnd);
524
525 /* If there's an extern def on a backedge, make sure we can
526 code-generate at the region start.
527 ??? This is another case that could be fixed by adjusting
528 how we split the function but at the moment we'd have conflicting
529 goals there. */
530 if (backedge
531 && dts[i] == vect_external_def
532 && is_a <bb_vec_info> (vinfo)
533 && TREE_CODE (oprnd) == SSA_NAME
534 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
535 && !dominated_by_p (CDI_DOMINATORS,
536 as_a <bb_vec_info> (vinfo)->bbs[0],
537 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
538 {
539 if (dump_enabled_p ())
540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
541 "Build SLP failed: extern def %T only defined "
542 "on backedge\n", oprnd);
543 return -1;
544 }
545
546 if (first)
547 {
548 tree type = TREE_TYPE (oprnd);
549 dt = dts[i];
550 if ((dt == vect_constant_def
551 || dt == vect_external_def)
552 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
553 && (TREE_CODE (type) == BOOLEAN_TYPE
554 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
555 type)))
556 {
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
559 "Build SLP failed: invalid type of def "
560 "for variable-length SLP %T\n", oprnd);
561 return -1;
562 }
563
564 /* For the swapping logic below force vect_reduction_def
565 for the reduction op in a SLP reduction group. */
566 if (!STMT_VINFO_DATA_REF (stmt_info)
567 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
568 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
569 && def_stmt_info)
570 dts[i] = dt = vect_reduction_def;
571
572 /* Check the types of the definition. */
573 switch (dt)
574 {
575 case vect_external_def:
576 case vect_constant_def:
577 case vect_internal_def:
578 case vect_reduction_def:
579 case vect_induction_def:
580 case vect_nested_cycle:
581 break;
582
583 default:
584 /* FORNOW: Not supported. */
585 if (dump_enabled_p ())
586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
587 "Build SLP failed: illegal type of def %T\n",
588 oprnd);
589 return -1;
590 }
591
592 oprnd_info->first_dt = dt;
593 oprnd_info->first_op_type = type;
594 }
595 }
596 if (first)
597 return 0;
598
599 /* Now match the operand definition types to that of the first stmt. */
600 for (i = 0; i < number_of_oprnds;)
601 {
602 if (skip_args[i])
603 {
604 ++i;
605 continue;
606 }
607
608 oprnd_info = (*oprnds_info)[i];
609 dt = dts[i];
610 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
611 oprnd = oprnd_info->ops[stmt_num];
612 tree type = TREE_TYPE (oprnd);
613
614 if (!types_compatible_p (oprnd_info->first_op_type, type))
615 {
616 if (dump_enabled_p ())
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
618 "Build SLP failed: different operand types\n");
619 return 1;
620 }
621
622 /* Not the first stmt of the group, check that the def-stmt/s match
623 the def-stmt/s of the first stmt. Allow different definition
624 types for reduction chains: the first stmt must be a
625 vect_reduction_def (a phi node), and the rest must be part of
626 the reduction chain. */
627 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
628 && !(oprnd_info->first_dt == vect_reduction_def
629 && !STMT_VINFO_DATA_REF (stmt_info)
630 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
631 && def_stmt_info
632 && !STMT_VINFO_DATA_REF (def_stmt_info)
633 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
634 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
635 || (!STMT_VINFO_DATA_REF (stmt_info)
636 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
637 && ((!def_stmt_info
638 || STMT_VINFO_DATA_REF (def_stmt_info)
639 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
640 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
641 != (oprnd_info->first_dt != vect_reduction_def))))
642 {
643 /* Try swapping operands if we got a mismatch. For BB
644 vectorization only in case it will clearly improve things. */
645 if (i == commutative_op && !swapped
646 && (!is_a <bb_vec_info> (vinfo)
647 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
648 dts[i+1])
649 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
650 || vect_def_types_match
651 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
652 {
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location,
655 "trying swapped operands\n");
656 std::swap (dts[i], dts[i+1]);
657 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
658 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
659 std::swap ((*oprnds_info)[i]->ops[stmt_num],
660 (*oprnds_info)[i+1]->ops[stmt_num]);
661 swapped = true;
662 continue;
663 }
664
665 if (is_a <bb_vec_info> (vinfo)
666 && !oprnd_info->any_pattern)
667 {
668 /* Now for commutative ops we should see whether we can
669 make the other operand match. */
670 if (dump_enabled_p ())
671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
672 "treating operand as external\n");
673 oprnd_info->first_dt = dt = vect_external_def;
674 }
675 else
676 {
677 if (dump_enabled_p ())
678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
679 "Build SLP failed: different types\n");
680 return 1;
681 }
682 }
683
684 /* Make sure to demote the overall operand to external. */
685 if (dt == vect_external_def)
686 oprnd_info->first_dt = vect_external_def;
687 /* For an SLP reduction chain we want to duplicate the reduction to
688 each of the chain members. That gets us a sane SLP graph (though
689 the stmts are not 100% correct wrt the initial values). */
690 else if ((dt == vect_internal_def
691 || dt == vect_reduction_def)
692 && oprnd_info->first_dt == vect_reduction_def
693 && !STMT_VINFO_DATA_REF (stmt_info)
694 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
695 && !STMT_VINFO_DATA_REF (def_stmt_info)
696 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
697 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
698 {
699 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
700 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
701 }
702
703 ++i;
704 }
705
706 /* Swap operands. */
707 if (swapped)
708 {
709 if (dump_enabled_p ())
710 dump_printf_loc (MSG_NOTE, vect_location,
711 "swapped operands to match def types in %G",
712 stmt_info->stmt);
713 }
714
715 return 0;
716 }
717
718 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
719 Return true if we can, meaning that this choice doesn't conflict with
720 existing SLP nodes that use STMT_INFO. */
721
722 bool
723 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
724 {
725 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
726 if (old_vectype)
727 return useless_type_conversion_p (vectype, old_vectype);
728
729 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
730 {
731 /* We maintain the invariant that if any statement in the group is
732 used, all other members of the group have the same vector type. */
733 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
734 stmt_vec_info member_info = first_info;
735 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
736 if (is_pattern_stmt_p (member_info)
737 && !useless_type_conversion_p (vectype,
738 STMT_VINFO_VECTYPE (member_info)))
739 break;
740
741 if (!member_info)
742 {
743 for (member_info = first_info; member_info;
744 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
745 STMT_VINFO_VECTYPE (member_info) = vectype;
746 return true;
747 }
748 }
749 else if (!is_pattern_stmt_p (stmt_info))
750 {
751 STMT_VINFO_VECTYPE (stmt_info) = vectype;
752 return true;
753 }
754
755 if (dump_enabled_p ())
756 {
757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
758 "Build SLP failed: incompatible vector"
759 " types for: %G", stmt_info->stmt);
760 dump_printf_loc (MSG_NOTE, vect_location,
761 " old vector type: %T\n", old_vectype);
762 dump_printf_loc (MSG_NOTE, vect_location,
763 " new vector type: %T\n", vectype);
764 }
765 return false;
766 }
767
768 /* Return true if call statements CALL1 and CALL2 are similar enough
769 to be combined into the same SLP group. */
770
771 static bool
772 compatible_calls_p (gcall *call1, gcall *call2)
773 {
774 unsigned int nargs = gimple_call_num_args (call1);
775 if (nargs != gimple_call_num_args (call2))
776 return false;
777
778 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
779 return false;
780
781 if (gimple_call_internal_p (call1))
782 {
783 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
784 TREE_TYPE (gimple_call_lhs (call2))))
785 return false;
786 for (unsigned int i = 0; i < nargs; ++i)
787 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
788 TREE_TYPE (gimple_call_arg (call2, i))))
789 return false;
790 }
791 else
792 {
793 if (!operand_equal_p (gimple_call_fn (call1),
794 gimple_call_fn (call2), 0))
795 return false;
796
797 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
798 return false;
799 }
800 return true;
801 }
802
803 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
804 caller's attempt to find the vector type in STMT_INFO with the narrowest
805 element type. Return true if VECTYPE is nonnull and if it is valid
806 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
807 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
808 vect_build_slp_tree. */
809
810 static bool
811 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
812 unsigned int group_size,
813 tree vectype, poly_uint64 *max_nunits)
814 {
815 if (!vectype)
816 {
817 if (dump_enabled_p ())
818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
819 "Build SLP failed: unsupported data-type in %G\n",
820 stmt_info->stmt);
821 /* Fatal mismatch. */
822 return false;
823 }
824
825 /* If populating the vector type requires unrolling then fail
826 before adjusting *max_nunits for basic-block vectorization. */
827 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
828 unsigned HOST_WIDE_INT const_nunits;
829 if (is_a <bb_vec_info> (vinfo)
830 && (!nunits.is_constant (&const_nunits)
831 || const_nunits > group_size))
832 {
833 if (dump_enabled_p ())
834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
835 "Build SLP failed: unrolling required "
836 "in basic block SLP\n");
837 /* Fatal mismatch. */
838 return false;
839 }
840
841 /* In case of multiple types we need to detect the smallest type. */
842 vect_update_max_nunits (max_nunits, vectype);
843 return true;
844 }
845
846 /* Verify whether the scalar stmts STMTS are isomorphic, do not require a
847 data permutation and are of supported types of operation. Return
848 true if they are, otherwise return false and indicate in *MATCHES
849 which stmts are not isomorphic to the first one. If MATCHES[0]
850 is false then this indicates the comparison could not be
851 carried out or the stmts will never be vectorized by SLP.
852
853 Note a COND_EXPR is possibly isomorphic to another one after swapping its
854 operands. Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
855 the first stmt by swapping the two operands of the comparison; set SWAP[i]
856 to 2 if stmt I is isomorphic to the first stmt by inverting the code
857 of the comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
858 to (B1 <= A1 ? X1 : Y1) or be inverted to (A1 < B1) ? Y1 : X1. */
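In addition, a mix of PLUS_EXPR and MINUS_EXPR in otherwise matching
stmts is tolerated, e.g. (illustrative)
  a[0] = b[0] + c[0];
  a[1] = b[1] - c[1];
in which case *TWO_OPERATORS is set and the node is later built as a blend
(VEC_PERM_EXPR) of both operations computed on all lanes.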
859
860 static bool
861 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
862 vec<stmt_vec_info> stmts, unsigned int group_size,
863 poly_uint64 *max_nunits, bool *matches,
864 bool *two_operators, tree *node_vectype)
865 {
866 unsigned int i;
867 stmt_vec_info first_stmt_info = stmts[0];
868 enum tree_code first_stmt_code = ERROR_MARK;
869 enum tree_code alt_stmt_code = ERROR_MARK;
870 enum tree_code rhs_code = ERROR_MARK;
871 enum tree_code first_cond_code = ERROR_MARK;
872 tree lhs;
873 bool need_same_oprnds = false;
874 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
875 optab optab;
876 int icode;
877 machine_mode optab_op2_mode;
878 machine_mode vec_mode;
879 stmt_vec_info first_load = NULL, prev_first_load = NULL;
880 bool first_stmt_load_p = false, load_p = false;
881 bool first_stmt_phi_p = false, phi_p = false;
882
883 /* For every stmt in NODE find its def stmt/s. */
884 stmt_vec_info stmt_info;
885 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
886 {
887 gimple *stmt = stmt_info->stmt;
888 swap[i] = 0;
889 matches[i] = false;
890
891 if (dump_enabled_p ())
892 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
893
894 /* Fail to vectorize statements marked as unvectorizable, that can
895 throw or that have volatile operands. */
896 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
897 || stmt_can_throw_internal (cfun, stmt)
898 || gimple_has_volatile_ops (stmt))
899 {
900 if (dump_enabled_p ())
901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
902 "Build SLP failed: unvectorizable statement %G",
903 stmt);
904 /* ??? For BB vectorization we want to commute operands in a way
905 to shuffle all unvectorizable defs into one operand and have
906 the other still vectorized. The following doesn't reliably
907 work for this, but it's the easiest we can do here. */
908 if (is_a <bb_vec_info> (vinfo) && i != 0)
909 continue;
910 /* Fatal mismatch. */
911 matches[0] = false;
912 return false;
913 }
914
915 lhs = gimple_get_lhs (stmt);
916 if (lhs == NULL_TREE)
917 {
918 if (dump_enabled_p ())
919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
920 "Build SLP failed: not GIMPLE_ASSIGN nor "
921 "GIMPLE_CALL %G", stmt);
922 if (is_a <bb_vec_info> (vinfo) && i != 0)
923 continue;
924 /* Fatal mismatch. */
925 matches[0] = false;
926 return false;
927 }
928
929 tree nunits_vectype;
930 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
931 &nunits_vectype, group_size)
932 || (nunits_vectype
933 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
934 nunits_vectype, max_nunits)))
935 {
936 if (is_a <bb_vec_info> (vinfo) && i != 0)
937 continue;
938 /* Fatal mismatch. */
939 matches[0] = false;
940 return false;
941 }
942
943 gcc_assert (vectype);
944
945 gcall *call_stmt = dyn_cast <gcall *> (stmt);
946 if (call_stmt)
947 {
948 rhs_code = CALL_EXPR;
949
950 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
951 load_p = true;
952 else if ((gimple_call_internal_p (call_stmt)
953 && (!vectorizable_internal_fn_p
954 (gimple_call_internal_fn (call_stmt))))
955 || gimple_call_tail_p (call_stmt)
956 || gimple_call_noreturn_p (call_stmt)
957 || !gimple_call_nothrow_p (call_stmt)
958 || gimple_call_chain (call_stmt))
959 {
960 if (dump_enabled_p ())
961 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
962 "Build SLP failed: unsupported call type %G",
963 call_stmt);
964 if (is_a <bb_vec_info> (vinfo) && i != 0)
965 continue;
966 /* Fatal mismatch. */
967 matches[0] = false;
968 return false;
969 }
970 }
971 else if (gimple_code (stmt) == GIMPLE_PHI)
972 {
973 rhs_code = ERROR_MARK;
974 phi_p = true;
975 }
976 else
977 {
978 rhs_code = gimple_assign_rhs_code (stmt);
979 load_p = gimple_vuse (stmt);
980 }
981
982 /* Check the operation. */
983 if (i == 0)
984 {
985 *node_vectype = vectype;
986 first_stmt_code = rhs_code;
987 first_stmt_load_p = load_p;
988 first_stmt_phi_p = phi_p;
989
990 /* Shift arguments should be equal in all the packed stmts for a
991 vector shift with scalar shift operand. */
992 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
993 || rhs_code == LROTATE_EXPR
994 || rhs_code == RROTATE_EXPR)
995 {
996 vec_mode = TYPE_MODE (vectype);
997
998 /* First see if we have a vector/vector shift. */
999 optab = optab_for_tree_code (rhs_code, vectype,
1000 optab_vector);
1001
1002 if (!optab
1003 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1004 {
1005 /* No vector/vector shift, try for a vector/scalar shift. */
1006 optab = optab_for_tree_code (rhs_code, vectype,
1007 optab_scalar);
1008
1009 if (!optab)
1010 {
1011 if (dump_enabled_p ())
1012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1013 "Build SLP failed: no optab.\n");
1014 if (is_a <bb_vec_info> (vinfo) && i != 0)
1015 continue;
1016 /* Fatal mismatch. */
1017 matches[0] = false;
1018 return false;
1019 }
1020 icode = (int) optab_handler (optab, vec_mode);
1021 if (icode == CODE_FOR_nothing)
1022 {
1023 if (dump_enabled_p ())
1024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1025 "Build SLP failed: "
1026 "op not supported by target.\n");
1027 if (is_a <bb_vec_info> (vinfo) && i != 0)
1028 continue;
1029 /* Fatal mismatch. */
1030 matches[0] = false;
1031 return false;
1032 }
1033 optab_op2_mode = insn_data[icode].operand[2].mode;
1034 if (!VECTOR_MODE_P (optab_op2_mode))
1035 {
1036 need_same_oprnds = true;
1037 first_op1 = gimple_assign_rhs2 (stmt);
1038 }
1039 }
1040 }
1041 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1042 {
1043 need_same_oprnds = true;
1044 first_op1 = gimple_assign_rhs2 (stmt);
1045 }
1046 else if (!load_p
1047 && rhs_code == BIT_FIELD_REF)
1048 {
1049 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1050 if (TREE_CODE (vec) != SSA_NAME
1051 || !types_compatible_p (vectype, TREE_TYPE (vec)))
1052 {
1053 if (is_a <bb_vec_info> (vinfo) && i != 0)
1054 continue;
1055 if (dump_enabled_p ())
1056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1057 "Build SLP failed: "
1058 "BIT_FIELD_REF not supported\n");
1059 /* Fatal mismatch. */
1060 matches[0] = false;
1061 return false;
1062 }
1063 }
1064 else if (call_stmt
1065 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1066 {
1067 need_same_oprnds = true;
1068 first_op1 = gimple_call_arg (call_stmt, 1);
1069 }
1070 }
1071 else
1072 {
1073 if (first_stmt_code != rhs_code
1074 && alt_stmt_code == ERROR_MARK)
1075 alt_stmt_code = rhs_code;
1076 if ((first_stmt_code != rhs_code
1077 && (first_stmt_code != IMAGPART_EXPR
1078 || rhs_code != REALPART_EXPR)
1079 && (first_stmt_code != REALPART_EXPR
1080 || rhs_code != IMAGPART_EXPR)
1081 /* Handle mismatches in plus/minus by computing both
1082 and merging the results. */
1083 && !((first_stmt_code == PLUS_EXPR
1084 || first_stmt_code == MINUS_EXPR)
1085 && (alt_stmt_code == PLUS_EXPR
1086 || alt_stmt_code == MINUS_EXPR)
1087 && rhs_code == alt_stmt_code)
1088 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1089 && (first_stmt_code == ARRAY_REF
1090 || first_stmt_code == BIT_FIELD_REF
1091 || first_stmt_code == INDIRECT_REF
1092 || first_stmt_code == COMPONENT_REF
1093 || first_stmt_code == MEM_REF)))
1094 || first_stmt_load_p != load_p
1095 || first_stmt_phi_p != phi_p)
1096 {
1097 if (dump_enabled_p ())
1098 {
1099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1100 "Build SLP failed: different operation "
1101 "in stmt %G", stmt);
1102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1103 "original stmt %G", first_stmt_info->stmt);
1104 }
1105 /* Mismatch. */
1106 continue;
1107 }
1108
1109 if (need_same_oprnds)
1110 {
1111 tree other_op1 = (call_stmt
1112 ? gimple_call_arg (call_stmt, 1)
1113 : gimple_assign_rhs2 (stmt));
1114 if (!operand_equal_p (first_op1, other_op1, 0))
1115 {
1116 if (dump_enabled_p ())
1117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1118 "Build SLP failed: different shift "
1119 "arguments in %G", stmt);
1120 /* Mismatch. */
1121 continue;
1122 }
1123 }
1124 if (!load_p
1125 && first_stmt_code == BIT_FIELD_REF
1126 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1127 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1128 {
1129 if (dump_enabled_p ())
1130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1131 "Build SLP failed: different BIT_FIELD_REF "
1132 "arguments in %G", stmt);
1133 /* Mismatch. */
1134 continue;
1135 }
1136
1137 if (!load_p && rhs_code == CALL_EXPR)
1138 {
1139 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1140 as_a <gcall *> (stmt)))
1141 {
1142 if (dump_enabled_p ())
1143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1144 "Build SLP failed: different calls in %G",
1145 stmt);
1146 /* Mismatch. */
1147 continue;
1148 }
1149 }
1150
1151 if (phi_p
1152 && (gimple_bb (first_stmt_info->stmt)
1153 != gimple_bb (stmt_info->stmt)))
1154 {
1155 if (dump_enabled_p ())
1156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1157 "Build SLP failed: different BB for PHI "
1158 "in %G", stmt);
1159 /* Mismatch. */
1160 continue;
1161 }
1162
1163 if (!types_compatible_p (vectype, *node_vectype))
1164 {
1165 if (dump_enabled_p ())
1166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1167 "Build SLP failed: different vector type "
1168 "in %G", stmt);
1169 /* Mismatch. */
1170 continue;
1171 }
1172 }
1173
1174 /* Grouped store or load. */
1175 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1176 {
1177 if (REFERENCE_CLASS_P (lhs))
1178 {
1179 /* Store. */
1180 ;
1181 }
1182 else
1183 {
1184 /* Load. */
1185 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1186 if (prev_first_load)
1187 {
1188 /* Check that there are no loads from different interleaving
1189 chains in the same node. */
1190 if (prev_first_load != first_load)
1191 {
1192 if (dump_enabled_p ())
1193 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1194 vect_location,
1195 "Build SLP failed: different "
1196 "interleaving chains in one node %G",
1197 stmt);
1198 /* Mismatch. */
1199 continue;
1200 }
1201 }
1202 else
1203 prev_first_load = first_load;
1204 }
1205 } /* Grouped access. */
1206 else
1207 {
1208 if (load_p)
1209 {
1210 /* Not grouped load. */
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1213 "Build SLP failed: not grouped load %G", stmt);
1214
1215 /* FORNOW: Non-grouped loads are not supported. */
1216 if (is_a <bb_vec_info> (vinfo) && i != 0)
1217 continue;
1218 /* Fatal mismatch. */
1219 matches[0] = false;
1220 return false;
1221 }
1222
1223 /* Not memory operation. */
1224 if (!phi_p
1225 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1226 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1227 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1228 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1229 && rhs_code != VIEW_CONVERT_EXPR
1230 && rhs_code != CALL_EXPR
1231 && rhs_code != BIT_FIELD_REF)
1232 {
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "Build SLP failed: operation unsupported %G",
1236 stmt);
1237 if (is_a <bb_vec_info> (vinfo) && i != 0)
1238 continue;
1239 /* Fatal mismatch. */
1240 matches[0] = false;
1241 return false;
1242 }
1243
1244 if (rhs_code == COND_EXPR)
1245 {
1246 tree cond_expr = gimple_assign_rhs1 (stmt);
1247 enum tree_code cond_code = TREE_CODE (cond_expr);
1248 enum tree_code swap_code = ERROR_MARK;
1249 enum tree_code invert_code = ERROR_MARK;
1250
1251 if (i == 0)
1252 first_cond_code = TREE_CODE (cond_expr);
1253 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1254 {
1255 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1256 swap_code = swap_tree_comparison (cond_code);
1257 invert_code = invert_tree_comparison (cond_code, honor_nans);
1258 }
1259
1260 if (first_cond_code == cond_code)
1261 ;
1262 /* Isomorphic can be achieved by swapping. */
1263 else if (first_cond_code == swap_code)
1264 swap[i] = 1;
1265 /* Isomorphic can be achieved by inverting. */
1266 else if (first_cond_code == invert_code)
1267 swap[i] = 2;
1268 else
1269 {
1270 if (dump_enabled_p ())
1271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1272 "Build SLP failed: different"
1273 " operation %G", stmt);
1274 /* Mismatch. */
1275 continue;
1276 }
1277 }
1278 }
1279
1280 matches[i] = true;
1281 }
1282
1283 for (i = 0; i < group_size; ++i)
1284 if (!matches[i])
1285 return false;
1286
1287 /* If we allowed a two-operation SLP node verify the target can cope
1288 with the permute we are going to use. */
1289 if (alt_stmt_code != ERROR_MARK
1290 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1291 {
1292 *two_operators = true;
1293 }
1294
1295 return true;
1296 }
1297
1298 /* Traits for the hash_map used to record (also failed) SLP builds for a
1299 stmt set. Note we never remove apart from at destruction time so we do
1300 not need a special value for deleted that differs from empty. */
1301 struct bst_traits
1302 {
1303 typedef vec <stmt_vec_info> value_type;
1304 typedef vec <stmt_vec_info> compare_type;
1305 static inline hashval_t hash (value_type);
1306 static inline bool equal (value_type existing, value_type candidate);
1307 static inline bool is_empty (value_type x) { return !x.exists (); }
1308 static inline bool is_deleted (value_type x) { return !x.exists (); }
1309 static const bool empty_zero_p = true;
1310 static inline void mark_empty (value_type &x) { x.release (); }
1311 static inline void mark_deleted (value_type &x) { x.release (); }
1312 static inline void remove (value_type &x) { x.release (); }
1313 };
1314 inline hashval_t
1315 bst_traits::hash (value_type x)
1316 {
1317 inchash::hash h;
1318 for (unsigned i = 0; i < x.length (); ++i)
1319 h.add_int (gimple_uid (x[i]->stmt));
1320 return h.end ();
1321 }
1322 inline bool
1323 bst_traits::equal (value_type existing, value_type candidate)
1324 {
1325 if (existing.length () != candidate.length ())
1326 return false;
1327 for (unsigned i = 0; i < existing.length (); ++i)
1328 if (existing[i] != candidate[i])
1329 return false;
1330 return true;
1331 }
1332
1333 typedef hash_map <vec <gimple *>, slp_tree,
1334 simple_hashmap_traits <bst_traits, slp_tree> >
1335 scalar_stmts_to_slp_tree_map_t;
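/* Illustrative use: when discovery reaches the same vector of scalar stmts
   a second time (for example from two different parents), vect_build_slp_tree
   looks the stmts up in this map and either re-uses the already built node
   (bumping its reference count) or immediately gives up if a NULL entry
   records an earlier failed build.  */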
1336
1337 static slp_tree
1338 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1339 vec<stmt_vec_info> stmts, unsigned int group_size,
1340 poly_uint64 *max_nunits,
1341 bool *matches, unsigned *npermutes, unsigned *tree_size,
1342 scalar_stmts_to_slp_tree_map_t *bst_map);
1343
1344 static slp_tree
1345 vect_build_slp_tree (vec_info *vinfo,
1346 vec<stmt_vec_info> stmts, unsigned int group_size,
1347 poly_uint64 *max_nunits,
1348 bool *matches, unsigned *npermutes, unsigned *tree_size,
1349 scalar_stmts_to_slp_tree_map_t *bst_map)
1350 {
1351 if (slp_tree *leader = bst_map->get (stmts))
1352 {
1353 if (dump_enabled_p ())
1354 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1355 *leader ? "" : "failed ", *leader);
1356 if (*leader)
1357 {
1358 SLP_TREE_REF_COUNT (*leader)++;
1359 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1360 }
1361 return *leader;
1362 }
1363
1364 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1365 so we can pick up backedge destinations during discovery. */
1366 slp_tree res = new _slp_tree;
1367 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1368 SLP_TREE_SCALAR_STMTS (res) = stmts;
1369 bst_map->put (stmts.copy (), res);
1370
1371 poly_uint64 this_max_nunits = 1;
1372 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1373 &this_max_nunits,
1374 matches, npermutes, tree_size, bst_map);
1375 if (!res_)
1376 {
1377 bool existed_p = bst_map->put (stmts, NULL);
1378 gcc_assert (existed_p);
1379 /* Mark the node invalid so we can detect those when still in use
1380 as backedge destinations. */
1381 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1382 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1383 vect_free_slp_tree (res);
1384 }
1385 else
1386 {
1387 gcc_assert (res_ == res);
1388 res->max_nunits = this_max_nunits;
1389 vect_update_max_nunits (max_nunits, this_max_nunits);
1390 /* Keep a reference for the bst_map use. */
1391 SLP_TREE_REF_COUNT (res)++;
1392 }
1393 return res_;
1394 }
1395
1396 /* Recursively build an SLP tree for the stmts in STMTS, filling in NODE.
1397 Return NULL (and record in MATCHES which stmts failed to match) if the
1398 def-stmts are not isomorphic, require data permutation or are of
1399 unsupported types of operation.
1400 Otherwise return NODE with its children, vector type and lane
1401 information filled in. */
1402
1403 static slp_tree
1404 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1405 vec<stmt_vec_info> stmts, unsigned int group_size,
1406 poly_uint64 *max_nunits,
1407 bool *matches, unsigned *npermutes, unsigned *tree_size,
1408 scalar_stmts_to_slp_tree_map_t *bst_map)
1409 {
1410 unsigned nops, i, this_tree_size = 0;
1411 poly_uint64 this_max_nunits = *max_nunits;
1412
1413 matches[0] = false;
1414
1415 stmt_vec_info stmt_info = stmts[0];
1416 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1417 nops = gimple_call_num_args (stmt);
1418 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1419 {
1420 nops = gimple_num_ops (stmt) - 1;
1421 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1422 nops++;
1423 }
1424 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1425 nops = gimple_phi_num_args (phi);
1426 else
1427 return NULL;
1428
1429 /* If the SLP node is a PHI (induction or reduction), terminate
1430 the recursion. */
1431 bool *skip_args = XALLOCAVEC (bool, nops);
1432 memset (skip_args, 0, sizeof (bool) * nops);
1433 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1434 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1435 {
1436 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1437 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1438 group_size);
1439 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1440 max_nunits))
1441 return NULL;
1442
1443 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1444 if (def_type == vect_induction_def)
1445 {
1446 /* Induction PHIs are not cycles but walk the initial
1447 value. Only for inner loops though, for outer loops
1448 we need to pick up the value from the actual PHIs
1449 to more easily support peeling and epilogue vectorization. */
1450 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1451 if (!nested_in_vect_loop_p (loop, stmt_info))
1452 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1453 else
1454 loop = loop->inner;
1455 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1456 }
1457 else if (def_type == vect_reduction_def
1458 || def_type == vect_double_reduction_def
1459 || def_type == vect_nested_cycle)
1460 {
1461 /* Else def types have to match. */
1462 stmt_vec_info other_info;
1463 bool all_same = true;
1464 FOR_EACH_VEC_ELT (stmts, i, other_info)
1465 {
1466 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1467 return NULL;
1468 if (other_info != stmt_info)
1469 all_same = false;
1470 }
1471 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1472 /* Reduction initial values are not explicitly represented. */
1473 if (!nested_in_vect_loop_p (loop, stmt_info))
1474 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1475 /* Reduction chain backedge defs are filled manually.
1476 ??? Need a better way to identify a SLP reduction chain PHI.
1477 Or a better overall way to SLP match those. */
1478 if (all_same && def_type == vect_reduction_def)
1479 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1480 }
1481 else if (def_type != vect_internal_def)
1482 return NULL;
1483 }
1484
1485
1486 bool two_operators = false;
1487 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1488 tree vectype = NULL_TREE;
1489 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1490 &this_max_nunits, matches, &two_operators,
1491 &vectype))
1492 return NULL;
1493
1494 /* If the SLP node is a load, terminate the recursion unless masked. */
1495 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1496 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1497 {
1498 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1499 {
1500 /* Masked load. */
1501 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1502 nops = 1;
1503 }
1504 else
1505 {
1506 *max_nunits = this_max_nunits;
1507 (*tree_size)++;
1508 node = vect_create_new_slp_node (node, stmts, 0);
1509 SLP_TREE_VECTYPE (node) = vectype;
1510 /* And compute the load permutation. Whether it is actually
1511 a permutation depends on the unrolling factor which is
1512 decided later. */
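	  /* For example (illustrative): if the interleaving group is the
	     loads of a[0..3] and this node's scalar stmts load
	     a[2], a[0], a[3], a[1], the computed load permutation is
	     { 2, 0, 3, 1 }.  */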
1513 vec<unsigned> load_permutation;
1514 int j;
1515 stmt_vec_info load_info;
1516 load_permutation.create (group_size);
1517 stmt_vec_info first_stmt_info
1518 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1519 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1520 {
1521 int load_place = vect_get_place_in_interleaving_chain
1522 (load_info, first_stmt_info);
1523 gcc_assert (load_place != -1);
1524 load_permutation.safe_push (load_place);
1525 }
1526 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1527 return node;
1528 }
1529 }
1530 else if (gimple_assign_single_p (stmt_info->stmt)
1531 && !gimple_vuse (stmt_info->stmt)
1532 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1533 {
1534 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1535 the same SSA name vector of a type compatible with vectype. */
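      /* For example (illustrative), with a V4SF vector v the group
	   x = BIT_FIELD_REF <v, 32, 64>;
	   y = BIT_FIELD_REF <v, 32, 96>;
	 yields the lane permutation { (0, 2), (0, 3) }, selecting lanes 2
	 and 3 of the single vector operand.  */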
1536 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1537 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1538 stmt_vec_info estmt_info;
1539 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1540 {
1541 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1542 tree bfref = gimple_assign_rhs1 (estmt);
1543 HOST_WIDE_INT lane;
1544 if (!known_eq (bit_field_size (bfref),
1545 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1546 || !constant_multiple_p (bit_field_offset (bfref),
1547 bit_field_size (bfref), &lane))
1548 {
1549 lperm.release ();
1550 return NULL;
1551 }
1552 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1553 }
1554 slp_tree vnode = vect_create_new_slp_node (vNULL);
1555 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1556 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1557 /* We are always building a permutation node even if it is an identity
1558 permute to shield the rest of the vectorizer from the odd node
1559 representing an actual vector without any scalar ops.
1560 ??? We could hide it completely by making the permute node
1561 external? */
1562 node = vect_create_new_slp_node (node, stmts, 1);
1563 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1564 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1565 SLP_TREE_VECTYPE (node) = vectype;
1566 SLP_TREE_CHILDREN (node).quick_push (vnode);
1567 return node;
1568 }
1569
1570 /* Get at the operands, verifying they are compatible. */
1571 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1572 slp_oprnd_info oprnd_info;
1573 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1574 {
1575 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
1576 stmts, i, &oprnds_info);
1577 if (res != 0)
1578 matches[(res == -1) ? 0 : i] = false;
1579 if (!matches[0])
1580 break;
1581 }
1582 for (i = 0; i < group_size; ++i)
1583 if (!matches[i])
1584 {
1585 vect_free_oprnd_info (oprnds_info);
1586 return NULL;
1587 }
1588 swap = NULL;
1589
1590 auto_vec<slp_tree, 4> children;
1591
1592 stmt_info = stmts[0];
1593
1594 /* Create SLP_TREE nodes for the definition node/s. */
1595 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1596 {
1597 slp_tree child;
1598 unsigned int j;
1599
1600 /* We're skipping certain operands from processing, for example
1601 outer loop reduction initial defs. */
1602 if (skip_args[i])
1603 {
1604 children.safe_push (NULL);
1605 continue;
1606 }
1607
1608 if (oprnd_info->first_dt == vect_uninitialized_def)
1609 {
1610 /* COND_EXPRs end up with one unused operand slot here if the condition
1611 is an SSA name. */
1612 gcc_assert (i == 3 && nops == 4);
1613 continue;
1614 }
1615
1616 if (is_a <bb_vec_info> (vinfo)
1617 && oprnd_info->first_dt == vect_internal_def
1618 && !oprnd_info->any_pattern)
1619 {
1620 /* For BB vectorization, if all defs are the same do not
1621 bother to continue the build along the single-lane
1622 graph but use a splat of the scalar value. */
1623 stmt_vec_info first_def = oprnd_info->def_stmts[0];
1624 for (j = 1; j < group_size; ++j)
1625 if (oprnd_info->def_stmts[j] != first_def)
1626 break;
1627 if (j == group_size
1628 /* But avoid doing this for loads where we may be
1629 able to CSE things, unless the stmt is not
1630 vectorizable. */
1631 && (!STMT_VINFO_VECTORIZABLE (first_def)
1632 || !gimple_vuse (first_def->stmt)))
1633 {
1634 if (dump_enabled_p ())
1635 dump_printf_loc (MSG_NOTE, vect_location,
1636 "Using a splat of the uniform operand\n");
1637 oprnd_info->first_dt = vect_external_def;
1638 }
1639 }
1640
1641 if (oprnd_info->first_dt == vect_external_def
1642 || oprnd_info->first_dt == vect_constant_def)
1643 {
1644 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1645 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1646 oprnd_info->ops = vNULL;
1647 children.safe_push (invnode);
1648 continue;
1649 }
1650
1651 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1652 group_size, &this_max_nunits,
1653 matches, npermutes,
1654 &this_tree_size, bst_map)) != NULL)
1655 {
1656 oprnd_info->def_stmts = vNULL;
1657 children.safe_push (child);
1658 continue;
1659 }
1660
1661 /* If the SLP build for operand zero failed and operands zero
1662 and one can be commuted, try that for the scalar stmts
1663 that failed the match. */
1664 if (i == 0
1665 /* A first scalar stmt mismatch signals a fatal mismatch. */
1666 && matches[0]
1667 /* ??? For COND_EXPRs we can swap the comparison operands
1668 as well as the arms under some constraints. */
1669 && nops == 2
1670 && oprnds_info[1]->first_dt == vect_internal_def
1671 && is_gimple_assign (stmt_info->stmt)
1672 /* Swapping operands for reductions breaks assumptions later on. */
1673 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1674 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
1675 /* Do so only if the number of unsuccessful permutes was no more
1676 than a cut-off, as re-trying the recursive match on
1677 possibly each level of the tree would expose exponential
1678 behavior. */
1679 && *npermutes < 4)
1680 {
1681 /* See whether we can swap the matching or the non-matching
1682 stmt operands. */
1683 bool swap_not_matching = true;
1684 do
1685 {
1686 for (j = 0; j < group_size; ++j)
1687 {
1688 if (matches[j] != !swap_not_matching)
1689 continue;
1690 stmt_vec_info stmt_info = stmts[j];
1691 /* Verify if we can swap operands of this stmt. */
1692 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1693 if (!stmt
1694 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1695 {
1696 if (!swap_not_matching)
1697 goto fail;
1698 swap_not_matching = false;
1699 break;
1700 }
1701 }
1702 }
1703 while (j != group_size);
1704
1705 /* Swap mismatched definition stmts. */
1706 if (dump_enabled_p ())
1707 dump_printf_loc (MSG_NOTE, vect_location,
1708 "Re-trying with swapped operands of stmts ");
1709 for (j = 0; j < group_size; ++j)
1710 if (matches[j] == !swap_not_matching)
1711 {
1712 std::swap (oprnds_info[0]->def_stmts[j],
1713 oprnds_info[1]->def_stmts[j]);
1714 std::swap (oprnds_info[0]->ops[j],
1715 oprnds_info[1]->ops[j]);
1716 if (dump_enabled_p ())
1717 dump_printf (MSG_NOTE, "%d ", j);
1718 }
1719 if (dump_enabled_p ())
1720 dump_printf (MSG_NOTE, "\n");
1721 /* And try again with scratch 'matches' ... */
1722 bool *tem = XALLOCAVEC (bool, group_size);
1723 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1724 group_size, &this_max_nunits,
1725 tem, npermutes,
1726 &this_tree_size, bst_map)) != NULL)
1727 {
1728 oprnd_info->def_stmts = vNULL;
1729 children.safe_push (child);
1730 continue;
1731 }
1732 /* We do not undo the swapping here since it might still be
1733 the better order for the second operand in case we build
1734 the first one from scalars below. */
1735 ++*npermutes;
1736 }
1737 fail:
1738
1739 /* If the SLP build failed and we are analyzing a basic block,
1740 simply treat nodes we fail to build as externally defined
1741 (and thus build vectors from the scalar defs).
1742 The cost model will reject outright expensive cases.
1743 ??? This doesn't treat cases where permutation ultimately
1744 fails (or we don't try permutation below). Ideally we'd
1745 even compute a permutation that will end up with the maximum
1746 SLP tree size... */
1747 if (is_a <bb_vec_info> (vinfo)
1748 /* ??? Rejecting patterns this way doesn't work. We'd have to
1749 do extra work to cancel the pattern so the uses see the
1750 scalar version. */
1751 && !is_pattern_stmt_p (stmt_info)
1752 && !oprnd_info->any_pattern)
1753 {
1754 /* But if there's a leading vector-sized set of matching stmts,
1755 fail here so we can split the group. This matches the condition
1756 vect_analyze_slp_instance uses. */
1757 /* ??? We might want to split here and combine the results to support
1758 multiple vector sizes better. */
1759 for (j = 0; j < group_size; ++j)
1760 if (!matches[j])
1761 break;
1762 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
1763 {
1764 if (dump_enabled_p ())
1765 dump_printf_loc (MSG_NOTE, vect_location,
1766 "Building vector operands from scalars\n");
1767 this_tree_size++;
1768 child = vect_create_new_slp_node (oprnd_info->ops);
1769 children.safe_push (child);
1770 oprnd_info->ops = vNULL;
1771 continue;
1772 }
1773 }
1774
1775 gcc_assert (child == NULL);
1776 FOR_EACH_VEC_ELT (children, j, child)
1777 if (child)
1778 vect_free_slp_tree (child);
1779 vect_free_oprnd_info (oprnds_info);
1780 return NULL;
1781 }
1782
1783 vect_free_oprnd_info (oprnds_info);
1784
1785 /* If all children of this node are built up from uniform scalars,
1786 or building it requires more than one possibly expensive vector
1787 construction, just throw it away so it is built up from scalars
1788 instead. The exception is the SLP node for the vector store. */
1789 if (is_a <bb_vec_info> (vinfo)
1790 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1791 /* ??? Rejecting patterns this way doesn't work. We'd have to
1792 do extra work to cancel the pattern so the uses see the
1793 scalar version. */
1794 && !is_pattern_stmt_p (stmt_info))
1795 {
1796 slp_tree child;
1797 unsigned j;
1798 bool all_uniform_p = true;
1799 unsigned n_vector_builds = 0;
1800 FOR_EACH_VEC_ELT (children, j, child)
1801 {
1802 if (!child)
1803 ;
1804 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1805 all_uniform_p = false;
1806 else if (!vect_slp_tree_uniform_p (child))
1807 {
1808 all_uniform_p = false;
1809 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
1810 n_vector_builds++;
1811 }
1812 }
1813 if (all_uniform_p || n_vector_builds > 1)
1814 {
1815 /* Roll back. */
1816 matches[0] = false;
1817 FOR_EACH_VEC_ELT (children, j, child)
1818 if (child)
1819 vect_free_slp_tree (child);
1820
1821 if (dump_enabled_p ())
1822 dump_printf_loc (MSG_NOTE, vect_location,
1823 "Building parent vector operands from "
1824 "scalars instead\n");
1825 return NULL;
1826 }
1827 }
1828
1829 *tree_size += this_tree_size + 1;
1830 *max_nunits = this_max_nunits;
1831
1832 if (two_operators)
1833 {
1834 /* ??? We'd likely want to either cache in bst_map sth like
1835 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1836 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1837 explicit stmts to put in so the keying on 'stmts' doesn't
1838 work (but we have the same issue with nodes that use 'ops'). */
1839 slp_tree one = new _slp_tree;
1840 slp_tree two = new _slp_tree;
1841 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1842 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1843 SLP_TREE_VECTYPE (one) = vectype;
1844 SLP_TREE_VECTYPE (two) = vectype;
1845 SLP_TREE_CHILDREN (one).safe_splice (children);
1846 SLP_TREE_CHILDREN (two).safe_splice (children);
1847 slp_tree child;
1848 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1849 SLP_TREE_REF_COUNT (child)++;
1850
1851 /* Here we record the original defs since this
1852 node represents the final lane configuration. */
1853 node = vect_create_new_slp_node (node, stmts, 2);
1854 SLP_TREE_VECTYPE (node) = vectype;
1855 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1856 SLP_TREE_CHILDREN (node).quick_push (one);
1857 SLP_TREE_CHILDREN (node).quick_push (two);
1858 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1859 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1860 enum tree_code ocode = ERROR_MARK;
1861 stmt_vec_info ostmt_info;
1862 unsigned j = 0;
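/* Build the lane permutation: lanes whose stmt has rhs code CODE0 are
taken from child ONE (operand 0), the remaining lanes (using the other
code OCODE) are taken from child TWO (operand 1). */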
1863 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1864 {
1865 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1866 if (gimple_assign_rhs_code (ostmt) != code0)
1867 {
1868 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1869 ocode = gimple_assign_rhs_code (ostmt);
1870 j = i;
1871 }
1872 else
1873 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1874 }
1875 SLP_TREE_CODE (one) = code0;
1876 SLP_TREE_CODE (two) = ocode;
1877 SLP_TREE_LANES (one) = stmts.length ();
1878 SLP_TREE_LANES (two) = stmts.length ();
1879 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1880 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1881 return node;
1882 }
1883
1884 node = vect_create_new_slp_node (node, stmts, nops);
1885 SLP_TREE_VECTYPE (node) = vectype;
1886 SLP_TREE_CHILDREN (node).splice (children);
1887 return node;
1888 }
1889
1890 /* Dump a single SLP tree NODE. */
1891
1892 static void
1893 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1894 slp_tree node)
1895 {
1896 unsigned i, j;
1897 slp_tree child;
1898 stmt_vec_info stmt_info;
1899 tree op;
1900
1901 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1902 dump_user_location_t user_loc = loc.get_user_location ();
1903 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1904 SLP_TREE_DEF_TYPE (node) == vect_external_def
1905 ? " (external)"
1906 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1907 ? " (constant)"
1908 : ""), node,
1909 estimated_poly_value (node->max_nunits),
1910 SLP_TREE_REF_COUNT (node));
1911 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
1912 {
1913 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
1914 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
1915 else
1916 dump_printf_loc (metadata, user_loc, "op template: %G",
1917 SLP_TREE_REPRESENTATIVE (node)->stmt);
1918 }
1919 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1920 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1921 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
1922 else
1923 {
1924 dump_printf_loc (metadata, user_loc, "\t{ ");
1925 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1926 dump_printf (metadata, "%T%s ", op,
1927 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
1928 dump_printf (metadata, "}\n");
1929 }
1930 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1931 {
1932 dump_printf_loc (metadata, user_loc, "\tload permutation {");
1933 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
1934 dump_printf (dump_kind, " %u", j);
1935 dump_printf (dump_kind, " }\n");
1936 }
1937 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1938 {
1939 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
1940 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
1941 dump_printf (dump_kind, " %u[%u]",
1942 SLP_TREE_LANE_PERMUTATION (node)[i].first,
1943 SLP_TREE_LANE_PERMUTATION (node)[i].second);
1944 dump_printf (dump_kind, " }\n");
1945 }
1946 if (SLP_TREE_CHILDREN (node).is_empty ())
1947 return;
1948 dump_printf_loc (metadata, user_loc, "\tchildren");
1949 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1950 dump_printf (dump_kind, " %p", (void *)child);
1951 dump_printf (dump_kind, "\n");
1952 }
1953
1954 DEBUG_FUNCTION void
1955 debug (slp_tree node)
1956 {
1957 debug_dump_context ctx;
1958 vect_print_slp_tree (MSG_NOTE,
1959 dump_location_t::from_location_t (UNKNOWN_LOCATION),
1960 node);
1961 }
1962
1963 /* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND. */
1964
1965 static void
1966 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1967 slp_tree node, hash_set<slp_tree> &visited)
1968 {
1969 unsigned i;
1970 slp_tree child;
1971
1972 if (visited.add (node))
1973 return;
1974
1975 vect_print_slp_tree (dump_kind, loc, node);
1976
1977 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1978 if (child)
1979 vect_print_slp_graph (dump_kind, loc, child, visited);
1980 }
1981
1982 static void
1983 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1984 slp_tree entry)
1985 {
1986 hash_set<slp_tree> visited;
1987 vect_print_slp_graph (dump_kind, loc, entry, visited);
1988 }
1989
1990 /* Mark the tree rooted at NODE with PURE_SLP. */
1991
1992 static void
1993 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
1994 {
1995 int i;
1996 stmt_vec_info stmt_info;
1997 slp_tree child;
1998
1999 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2000 return;
2001
2002 if (visited.add (node))
2003 return;
2004
2005 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2006 STMT_SLP_TYPE (stmt_info) = pure_slp;
2007
2008 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2009 if (child)
2010 vect_mark_slp_stmts (child, visited);
2011 }
2012
2013 static void
2014 vect_mark_slp_stmts (slp_tree node)
2015 {
2016 hash_set<slp_tree> visited;
2017 vect_mark_slp_stmts (node, visited);
2018 }
2019
2020 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2021
2022 static void
2023 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2024 {
2025 int i;
2026 stmt_vec_info stmt_info;
2027 slp_tree child;
2028
2029 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2030 return;
2031
2032 if (visited.add (node))
2033 return;
2034
2035 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2036 {
2037 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2038 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2039 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2040 }
2041
2042 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2043 if (child)
2044 vect_mark_slp_stmts_relevant (child, visited);
2045 }
2046
2047 static void
2048 vect_mark_slp_stmts_relevant (slp_tree node)
2049 {
2050 hash_set<slp_tree> visited;
2051 vect_mark_slp_stmts_relevant (node, visited);
2052 }
2053
2054
2055 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2056
2057 static void
2058 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2059 hash_set<slp_tree> &visited)
2060 {
2061 if (!node || visited.add (node))
2062 return;
2063
2064 if (SLP_TREE_CHILDREN (node).length () == 0)
2065 {
2066 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2067 return;
2068 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2069 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2070 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2071 loads.safe_push (node);
2072 }
2073 else
2074 {
2075 unsigned i;
2076 slp_tree child;
2077 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2078 vect_gather_slp_loads (loads, child, visited);
2079 }
2080 }
2081
2082
2083 /* Find the last scalar stmt in NODE. */
2084
2085 stmt_vec_info
2086 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2087 {
2088 stmt_vec_info last = NULL;
2089 stmt_vec_info stmt_vinfo;
2090
2091 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2092 {
2093 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2094 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2095 }
2096
2097 return last;
2098 }
2099
2100 /* Find the first stmt in NODE. */
2101
2102 stmt_vec_info
2103 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2104 {
2105 stmt_vec_info first = NULL;
2106 stmt_vec_info stmt_vinfo;
2107
2108 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2109 {
2110 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2111 if (!first
2112 || get_later_stmt (stmt_vinfo, first) == first)
2113 first = stmt_vinfo;
2114 }
2115
2116 return first;
2117 }
2118
2119 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2120 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2121 (also containing the first GROUP1_SIZE stmts, since stores are
2122 consecutive), the second containing the remainder.
2123 Return the first stmt in the second group. */
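/* For example, splitting a group of 6 stores with GROUP1_SIZE 4 yields
a first group of 4 stmts and a second group of the remaining 2. */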
2124
2125 static stmt_vec_info
2126 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2127 {
2128 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2129 gcc_assert (group1_size > 0);
2130 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2131 gcc_assert (group2_size > 0);
2132 DR_GROUP_SIZE (first_vinfo) = group1_size;
2133
2134 stmt_vec_info stmt_info = first_vinfo;
2135 for (unsigned i = group1_size; i > 1; i--)
2136 {
2137 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2138 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2139 }
2140 /* STMT_INFO is now the last element of the first group. */
2141 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2142 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2143
2144 DR_GROUP_SIZE (group2) = group2_size;
2145 for (stmt_info = group2; stmt_info;
2146 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2147 {
2148 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2149 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2150 }
2151
2152 /* For the second group, the DR_GROUP_GAP is that before the original group,
2153 plus skipping over the first vector. */
2154 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2155
2156 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2157 DR_GROUP_GAP (first_vinfo) += group2_size;
2158
2159 if (dump_enabled_p ())
2160 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2161 group1_size, group2_size);
2162
2163 return group2;
2164 }
2165
2166 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2167 statements and a vector of NUNITS elements. */
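/* For example, NUNITS = 4 and GROUP_SIZE = 6 give a common multiple of 12
and thus an unrolling factor of 12 / 6 = 2. */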
2168
2169 static poly_uint64
2170 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2171 {
2172 return exact_div (common_multiple (nunits, group_size), group_size);
2173 }
2174
2175 static bool
2176 vect_analyze_slp_instance (vec_info *vinfo,
2177 scalar_stmts_to_slp_tree_map_t *bst_map,
2178 stmt_vec_info stmt_info, slp_instance_kind kind,
2179 unsigned max_tree_size);
2180
2181 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2182 of KIND. Return true if successful. */
2183
2184 static bool
2185 vect_build_slp_instance (vec_info *vinfo,
2186 slp_instance_kind kind,
2187 vec<stmt_vec_info> scalar_stmts,
2188 stmt_vec_info root_stmt_info,
2189 unsigned max_tree_size,
2190 scalar_stmts_to_slp_tree_map_t *bst_map,
2191 /* ??? We need stmt_info for group splitting. */
2192 stmt_vec_info stmt_info_)
2193 {
2194 if (dump_enabled_p ())
2195 {
2196 dump_printf_loc (MSG_NOTE, vect_location,
2197 "Starting SLP discovery for\n");
2198 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
2199 dump_printf_loc (MSG_NOTE, vect_location,
2200 " %G", scalar_stmts[i]->stmt);
2201 }
2202
2203 /* Build the tree for the SLP instance. */
2204 unsigned int group_size = scalar_stmts.length ();
2205 bool *matches = XALLOCAVEC (bool, group_size);
2206 unsigned npermutes = 0;
2207 poly_uint64 max_nunits = 1;
2208 unsigned tree_size = 0;
2209 unsigned i;
2210 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2211 &max_nunits, matches, &npermutes,
2212 &tree_size, bst_map);
2213 if (node != NULL)
2214 {
2215 /* Calculate the unrolling factor based on the smallest type. */
2216 poly_uint64 unrolling_factor
2217 = calculate_unrolling_factor (max_nunits, group_size);
2218
2219 if (maybe_ne (unrolling_factor, 1U)
2220 && is_a <bb_vec_info> (vinfo))
2221 {
2222 unsigned HOST_WIDE_INT const_max_nunits;
2223 if (!max_nunits.is_constant (&const_max_nunits)
2224 || const_max_nunits > group_size)
2225 {
2226 if (dump_enabled_p ())
2227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2228 "Build SLP failed: store group "
2229 "size not a multiple of the vector size "
2230 "in basic block SLP\n");
2231 vect_free_slp_tree (node);
2232 return false;
2233 }
2234 /* Fatal mismatch. */
2235 if (dump_enabled_p ())
2236 dump_printf_loc (MSG_NOTE, vect_location,
2237 "SLP discovery succeeded but node needs "
2238 "splitting\n");
2239 memset (matches, true, group_size);
2240 matches[group_size / const_max_nunits * const_max_nunits] = false;
2241 vect_free_slp_tree (node);
2242 }
2243 else
2244 {
2245 /* Create a new SLP instance. */
2246 slp_instance new_instance = XNEW (class _slp_instance);
2247 SLP_INSTANCE_TREE (new_instance) = node;
2248 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2249 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2250 SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
2251 SLP_INSTANCE_KIND (new_instance) = kind;
2252 new_instance->reduc_phis = NULL;
2253 new_instance->cost_vec = vNULL;
2254 new_instance->subgraph_entries = vNULL;
2255
2256 if (dump_enabled_p ())
2257 dump_printf_loc (MSG_NOTE, vect_location,
2258 "SLP size %u vs. limit %u.\n",
2259 tree_size, max_tree_size);
2260
2261 /* Fixup SLP reduction chains. */
2262 if (kind == slp_inst_kind_reduc_chain)
2263 {
2264 /* If this is a reduction chain with a conversion in front
2265 amend the SLP tree with a node for that. */
2266 gimple *scalar_def
2267 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2268 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
2269 {
2270 /* Get at the conversion stmt - we know it's the single use
2271 of the last stmt of the reduction chain. */
2272 use_operand_p use_p;
2273 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
2274 &use_p, &scalar_def);
2275 gcc_assert (r);
2276 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
2277 next_info = vect_stmt_to_vectorize (next_info);
2278 scalar_stmts = vNULL;
2279 scalar_stmts.create (group_size);
2280 for (unsigned i = 0; i < group_size; ++i)
2281 scalar_stmts.quick_push (next_info);
2282 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2283 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2284 SLP_TREE_CHILDREN (conv).quick_push (node);
2285 SLP_INSTANCE_TREE (new_instance) = conv;
2286 /* We also have to fake this conversion stmt as SLP reduction
2287 group so we don't have to mess with too much code
2288 elsewhere. */
2289 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2290 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2291 }
2292 /* Fill the backedge child of the PHI SLP node. The
2293 general matching code cannot find it because the
2294 scalar code does not reflect how we vectorize the
2295 reduction. */
2296 use_operand_p use_p;
2297 imm_use_iterator imm_iter;
2298 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
2299 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
2300 gimple_get_lhs (scalar_def))
2301 /* There are exactly two non-debug uses, the reduction
2302 PHI and the loop-closed PHI node. */
2303 if (!is_gimple_debug (USE_STMT (use_p))
2304 && gimple_bb (USE_STMT (use_p)) == loop->header)
2305 {
2306 auto_vec<stmt_vec_info, 64> phis (group_size);
2307 stmt_vec_info phi_info
2308 = vinfo->lookup_stmt (USE_STMT (use_p));
2309 for (unsigned i = 0; i < group_size; ++i)
2310 phis.quick_push (phi_info);
2311 slp_tree *phi_node = bst_map->get (phis);
2312 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
2313 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
2314 = SLP_INSTANCE_TREE (new_instance);
2315 SLP_INSTANCE_TREE (new_instance)->refcnt++;
2316 }
2317 }
2318
2319 vinfo->slp_instances.safe_push (new_instance);
2320
2321 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2322 the number of scalar stmts in the root in a few places.
2323 Verify that assumption holds. */
2324 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2325 .length () == group_size);
2326
2327 if (dump_enabled_p ())
2328 {
2329 dump_printf_loc (MSG_NOTE, vect_location,
2330 "Final SLP tree for instance %p:\n", new_instance);
2331 vect_print_slp_graph (MSG_NOTE, vect_location,
2332 SLP_INSTANCE_TREE (new_instance));
2333 }
2334
2335 return true;
2336 }
2337 }
2338 else
2339 {
2340 /* Failed to SLP. */
2341 /* Free the allocated memory. */
2342 scalar_stmts.release ();
2343 }
2344
2345 stmt_vec_info stmt_info = stmt_info_;
2346 /* Try to break the group up into pieces. */
2347 if (kind == slp_inst_kind_store)
2348 {
2349 /* ??? We could delay all the actual splitting of store-groups
2350 until after SLP discovery of the original group completed.
2351 Then we can recurse to vect_build_slp_instance directly. */
2352 for (i = 0; i < group_size; i++)
2353 if (!matches[i])
2354 break;
2355
2356 /* For basic block SLP, try to break the group up into multiples of
2357 a vector size. */
2358 if (is_a <bb_vec_info> (vinfo)
2359 && (i > 1 && i < group_size))
2360 {
2361 tree scalar_type
2362 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2363 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2364 1 << floor_log2 (i));
2365 unsigned HOST_WIDE_INT const_nunits;
2366 if (vectype
2367 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2368 {
2369 /* Split into two groups at the first vector boundary. */
2370 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2371 unsigned group1_size = i & ~(const_nunits - 1);
2372
2373 if (dump_enabled_p ())
2374 dump_printf_loc (MSG_NOTE, vect_location,
2375 "Splitting SLP group at stmt %u\n", i);
2376 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2377 group1_size);
2378 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2379 kind, max_tree_size);
2380 /* Split the rest at the failure point and possibly
2381 re-analyze the remaining matching part if it has
2382 at least two lanes. */
2383 if (group1_size < i
2384 && (i + 1 < group_size
2385 || i - group1_size > 1))
2386 {
2387 stmt_vec_info rest2 = rest;
2388 rest = vect_split_slp_store_group (rest, i - group1_size);
2389 if (i - group1_size > 1)
2390 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
2391 kind, max_tree_size);
2392 }
2393 /* Re-analyze the non-matching tail if it has at least
2394 two lanes. */
2395 if (i + 1 < group_size)
2396 res |= vect_analyze_slp_instance (vinfo, bst_map,
2397 rest, kind, max_tree_size);
2398 return res;
2399 }
2400 }
2401
2402 /* For loop vectorization split into arbitrary pieces of size > 1. */
2403 if (is_a <loop_vec_info> (vinfo)
2404 && (i > 1 && i < group_size))
2405 {
2406 unsigned group1_size = i;
2407
2408 if (dump_enabled_p ())
2409 dump_printf_loc (MSG_NOTE, vect_location,
2410 "Splitting SLP group at stmt %u\n", i);
2411
2412 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2413 group1_size);
2414 /* Loop vectorization cannot handle gaps in stores, make sure
2415 the split group appears as strided. */
2416 STMT_VINFO_STRIDED_P (rest) = 1;
2417 DR_GROUP_GAP (rest) = 0;
2418 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2419 DR_GROUP_GAP (stmt_info) = 0;
2420
2421 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2422 kind, max_tree_size);
2423 if (i + 1 < group_size)
2424 res |= vect_analyze_slp_instance (vinfo, bst_map,
2425 rest, kind, max_tree_size);
2426
2427 return res;
2428 }
2429
2430 /* Even though the first vector did not all match, we might be able to SLP
2431 (some) of the remainder. FORNOW ignore this possibility. */
2432 }
2433
2434 /* Failed to SLP. */
2435 if (dump_enabled_p ())
2436 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2437 return false;
2438 }
2439
2440
2441 /* Analyze an SLP instance starting from a group of grouped stores. Call
2442 vect_build_slp_tree to build a tree of packed stmts if possible.
2443 Return FALSE if it's impossible to SLP any stmt in the loop. */
2444
2445 static bool
2446 vect_analyze_slp_instance (vec_info *vinfo,
2447 scalar_stmts_to_slp_tree_map_t *bst_map,
2448 stmt_vec_info stmt_info,
2449 slp_instance_kind kind,
2450 unsigned max_tree_size)
2451 {
2452 unsigned int i;
2453 vec<stmt_vec_info> scalar_stmts;
2454
2455 if (is_a <bb_vec_info> (vinfo))
2456 vect_location = stmt_info->stmt;
2457
2458 stmt_vec_info next_info = stmt_info;
2459 if (kind == slp_inst_kind_store)
2460 {
2461 /* Collect the stores and store them in scalar_stmts. */
2462 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
2463 while (next_info)
2464 {
2465 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2466 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2467 }
2468 }
2469 else if (kind == slp_inst_kind_reduc_chain)
2470 {
2471 /* Collect the reduction stmts and store them in scalar_stmts. */
2472 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
2473 while (next_info)
2474 {
2475 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2476 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2477 }
2478 /* Mark the first element of the reduction chain as reduction to properly
2479 transform the node. In the reduction analysis phase only the last
2480 element of the chain is marked as reduction. */
2481 STMT_VINFO_DEF_TYPE (stmt_info)
2482 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2483 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2484 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2485 }
2486 else if (kind == slp_inst_kind_ctor)
2487 {
2488 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2489 tree val;
2490 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
2491 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2492 {
2493 stmt_vec_info def_info = vinfo->lookup_def (val);
2494 def_info = vect_stmt_to_vectorize (def_info);
2495 scalar_stmts.quick_push (def_info);
2496 }
2497 if (dump_enabled_p ())
2498 dump_printf_loc (MSG_NOTE, vect_location,
2499 "Analyzing vectorizable constructor: %G\n",
2500 stmt_info->stmt);
2501 }
2502 else if (kind == slp_inst_kind_reduc_group)
2503 {
2504 /* Collect reduction statements. */
2505 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2506 scalar_stmts.create (reductions.length ());
2507 for (i = 0; reductions.iterate (i, &next_info); i++)
2508 if (STMT_VINFO_RELEVANT_P (next_info)
2509 || STMT_VINFO_LIVE_P (next_info))
2510 scalar_stmts.quick_push (next_info);
2511 /* If fewer than two were relevant/live there's nothing to SLP. */
2512 if (scalar_stmts.length () < 2)
2513 return false;
2514 }
2515 else
2516 gcc_unreachable ();
2517
2518 /* Build the tree for the SLP instance. */
2519 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
2520 kind == slp_inst_kind_ctor
2521 ? stmt_info : NULL,
2522 max_tree_size, bst_map,
2523 kind == slp_inst_kind_store
2524 ? stmt_info : NULL);
2525
2526 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2527 where we should do store group splitting. */
2528
2529 return res;
2530 }
2531
2532 /* Check if there are stmts in the loop that can be vectorized using SLP.
2533 Build SLP trees of packed scalar stmts if SLP is possible. */
2534
2535 opt_result
2536 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2537 {
2538 unsigned int i;
2539 stmt_vec_info first_element;
2540
2541 DUMP_VECT_SCOPE ("vect_analyze_slp");
2542
2543 scalar_stmts_to_slp_tree_map_t *bst_map
2544 = new scalar_stmts_to_slp_tree_map_t ();
2545
2546 /* Find SLP sequences starting from groups of grouped stores. */
2547 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2548 vect_analyze_slp_instance (vinfo, bst_map, first_element,
2549 STMT_VINFO_GROUPED_ACCESS (first_element)
2550 ? slp_inst_kind_store : slp_inst_kind_ctor,
2551 max_tree_size);
2552
2553 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2554 {
2555 /* Find SLP sequences starting from reduction chains. */
2556 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2557 if (! STMT_VINFO_RELEVANT_P (first_element)
2558 && ! STMT_VINFO_LIVE_P (first_element))
2559 ;
2560 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2561 slp_inst_kind_reduc_chain,
2562 max_tree_size))
2563 {
2564 /* Dissolve reduction chain group. */
2565 stmt_vec_info vinfo = first_element;
2566 stmt_vec_info last = NULL;
2567 while (vinfo)
2568 {
2569 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2570 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2571 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2572 last = vinfo;
2573 vinfo = next;
2574 }
2575 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2576 /* It can still be vectorized as part of an SLP reduction. */
2577 loop_vinfo->reductions.safe_push (last);
2578 }
2579
2580 /* Find SLP sequences starting from groups of reductions. */
2581 if (loop_vinfo->reductions.length () > 1)
2582 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2583 slp_inst_kind_reduc_group, max_tree_size);
2584 }
2585
2586 /* The map keeps a reference on SLP nodes built, release that. */
2587 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2588 it != bst_map->end (); ++it)
2589 if ((*it).second)
2590 vect_free_slp_tree ((*it).second);
2591 delete bst_map;
2592
2593 return opt_result::success ();
2594 }
2595
2596 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2597
2598 static void
2599 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2600 vec<slp_tree> &vertices, vec<int> &leafs)
2601 {
2602 unsigned i;
2603 slp_tree child;
2604
2605 if (visited.add (node))
2606 return;
2607
2608 node->vertex = vertices.length ();
2609 vertices.safe_push (node);
2610
2611 bool leaf = true;
2612 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2613 if (child)
2614 {
2615 leaf = false;
2616 vect_slp_build_vertices (visited, child, vertices, leafs);
2617 }
2618 if (leaf)
2619 leafs.safe_push (node->vertex);
2620 }
2621
2622 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2623
2624 static void
2625 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2626 vec<int> &leafs)
2627 {
2628 hash_set<slp_tree> visited;
2629 unsigned i;
2630 slp_instance instance;
2631 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2632 {
2633 unsigned n_v = vertices.length ();
2634 unsigned n_l = leafs.length ();
2635 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2636 leafs);
2637 /* If we added vertices but no entries to the reverse graph we've
2638 added a cycle that is not backwards-reachable. Push the entry
2639 so it mimics a leaf then. */
2640 if (vertices.length () > n_v
2641 && leafs.length () == n_l)
2642 leafs.safe_push (SLP_INSTANCE_TREE (instance)->vertex);
2643 }
2644 }
2645
2646 /* Apply (reverse) bijective PERM to VEC. */
2647
2648 template <class T>
2649 static void
2650 vect_slp_permute (vec<unsigned> perm,
2651 vec<T> &vec, bool reverse)
2652 {
2653 auto_vec<T, 64> saved;
2654 saved.create (vec.length ());
2655 for (unsigned i = 0; i < vec.length (); ++i)
2656 saved.quick_push (vec[i]);
2657
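/* For example, with PERM = { 2, 0, 1 } the reverse direction stores
element 0 in lane 2 (VEC[PERM[i]] = SAVED[i]) while the forward
direction reads lane 2 into element 0 (VEC[i] = SAVED[PERM[i]]). */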
2658 if (reverse)
2659 {
2660 for (unsigned i = 0; i < vec.length (); ++i)
2661 vec[perm[i]] = saved[i];
2662 for (unsigned i = 0; i < vec.length (); ++i)
2663 gcc_assert (vec[perm[i]] == saved[i]);
2664 }
2665 else
2666 {
2667 for (unsigned i = 0; i < vec.length (); ++i)
2668 vec[i] = saved[perm[i]];
2669 for (unsigned i = 0; i < vec.length (); ++i)
2670 gcc_assert (vec[i] == saved[perm[i]]);
2671 }
2672 }
2673
2674 /* Return whether permutations PERM_A and PERM_B as recorded in the
2675 PERMS vector are equal. */
2676
2677 static bool
2678 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
2679 int perm_a, int perm_b)
2680 {
2681 return (perm_a == perm_b
2682 || (perms[perm_a].length () == perms[perm_b].length ()
2683 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
2684 sizeof (unsigned) * perms[perm_a].length ()) == 0));
2685 }
2686
2687 /* Optimize the SLP graph of VINFO. */
2688
2689 void
2690 vect_optimize_slp (vec_info *vinfo)
2691 {
2692 if (vinfo->slp_instances.is_empty ())
2693 return;
2694
2695 slp_tree node;
2696 unsigned i;
2697 auto_vec<slp_tree> vertices;
2698 auto_vec<int> leafs;
2699 vect_slp_build_vertices (vinfo, vertices, leafs);
2700
2701 struct graph *slpg = new_graph (vertices.length ());
2702 FOR_EACH_VEC_ELT (vertices, i, node)
2703 {
2704 unsigned j;
2705 slp_tree child;
2706 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2707 if (child)
2708 add_edge (slpg, i, child->vertex);
2709 }
2710
2711 /* Compute (reverse) postorder on the inverted graph. */
2712 auto_vec<int> ipo;
2713 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
2714
2715 auto_sbitmap n_visited (vertices.length ());
2716 auto_sbitmap n_materialize (vertices.length ());
2717 auto_vec<int> n_perm (vertices.length ());
2718 auto_vec<vec<unsigned> > perms;
2719
2720 bitmap_clear (n_visited);
2721 bitmap_clear (n_materialize);
2722 n_perm.quick_grow_cleared (vertices.length ());
2723 perms.safe_push (vNULL); /* zero is no permute */
2724
2725 /* Produce initial permutations. */
2726 for (i = 0; i < leafs.length (); ++i)
2727 {
2728 int idx = leafs[i];
2729 slp_tree node = vertices[idx];
2730
2731 /* Handle externals and constants optimistically throughout the
2732 iteration. */
2733 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
2734 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
2735 continue;
2736
2737 /* Leafs do not change across iterations. Note leafs also double
2738 as entries to the reverse graph. */
2739 if (!slpg->vertices[idx].succ)
2740 bitmap_set_bit (n_visited, idx);
2741 /* Loads are the only thing generating permutes. */
2742 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2743 continue;
2744
2745 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2746 node unpermuted, record this permute. */
2747 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
2748 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
2749 continue;
2750 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
2751 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
2752 bool any_permute = false;
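/* Compute the index span [IMIN, IMAX] of the load permutation and
whether it is anything but a run of consecutive indices. */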
2753 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2754 {
2755 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
2756 imin = MIN (imin, idx);
2757 imax = MAX (imax, idx);
2758 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
2759 any_permute = true;
2760 }
2761 /* If there's no permute there is no need to split one out. */
2762 if (!any_permute)
2763 continue;
2764 /* If the span doesn't match we'd disrupt VF computation, avoid
2765 that for now. */
2766 if (imax - imin + 1 != SLP_TREE_LANES (node))
2767 continue;
2768
2769 /* For now only handle true permutes, like
2770 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2771 when permuting constants and invariants keeping the permute
2772 bijective. */
2773 auto_sbitmap load_index (SLP_TREE_LANES (node));
2774 bitmap_clear (load_index);
2775 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2776 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
2777 unsigned j;
2778 for (j = 0; j < SLP_TREE_LANES (node); ++j)
2779 if (!bitmap_bit_p (load_index, j))
2780 break;
2781 if (j != SLP_TREE_LANES (node))
2782 continue;
2783
2784 vec<unsigned> perm = vNULL;
2785 perm.safe_grow (SLP_TREE_LANES (node), true);
2786 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2787 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
2788 perms.safe_push (perm);
2789 n_perm[idx] = perms.length () - 1;
2790 }
2791
2792 /* Propagate permutes along the graph and compute materialization points. */
2793 bool changed;
2794 unsigned iteration = 0;
2795 do
2796 {
2797 changed = false;
2798 ++iteration;
2799
2800 for (i = vertices.length (); i > 0 ; --i)
2801 {
2802 int idx = ipo[i-1];
2803 slp_tree node = vertices[idx];
2804 /* For leafs there's nothing to do - we've seeded permutes
2805 on those above. */
2806 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2807 continue;
2808
2809 bitmap_set_bit (n_visited, idx);
2810
2811 /* We cannot move a permute across a store. */
2812 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2813 && DR_IS_WRITE
2814 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
2815 continue;
2816
2817 int perm = -1;
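/* Meet the permutes of all visited successors (children): equal
permutes propagate, while a mismatch or an unpermuted successor
collapses the result to zero (no permute). */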
2818 for (graph_edge *succ = slpg->vertices[idx].succ;
2819 succ; succ = succ->succ_next)
2820 {
2821 int succ_idx = succ->dest;
2822 /* Handle unvisited nodes optimistically. */
2823 /* ??? But for constants once we want to handle non-bijective
2824 permutes we have to verify the permute, when unifying lanes,
2825 will not unify different constants. For example see
2826 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2827 if (!bitmap_bit_p (n_visited, succ_idx))
2828 continue;
2829 int succ_perm = n_perm[succ_idx];
2830 /* Once we materialize a successor's permutation its output lanes
2831 appear unpermuted to us. */
2832 if (bitmap_bit_p (n_materialize, succ_idx))
2833 succ_perm = 0;
2834 if (perm == -1)
2835 perm = succ_perm;
2836 else if (succ_perm == 0)
2837 {
2838 perm = 0;
2839 break;
2840 }
2841 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
2842 {
2843 perm = 0;
2844 break;
2845 }
2846 }
2847
2848 if (perm == -1)
2849 /* Pick up pre-computed leaf values. */
2850 perm = n_perm[idx];
2851 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
2852 {
2853 if (iteration > 1)
2854 /* Make sure we eventually converge. */
2855 gcc_checking_assert (perm == 0);
2856 n_perm[idx] = perm;
2857 if (perm == 0)
2858 bitmap_clear_bit (n_materialize, idx);
2859 changed = true;
2860 }
2861
2862 if (perm == 0)
2863 continue;
2864
2865 /* Elide pruning at materialization points in the first
2866 iteration so that every node is visited at least once. */
2867 if (iteration == 1)
2868 continue;
2869
2870 /* Decide on permute materialization. Look whether there's
2871 a use (pred) edge that is permuted differently than us.
2872 In that case mark ourselves so the permutation is applied. */
2873 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
2874 for (graph_edge *pred = slpg->vertices[idx].pred;
2875 pred; pred = pred->pred_next)
2876 {
2877 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
2878 int pred_perm = n_perm[pred->src];
2879 if (!vect_slp_perms_eq (perms, perm, pred_perm))
2880 {
2881 all_preds_permuted = false;
2882 break;
2883 }
2884 }
2885 if (!all_preds_permuted)
2886 {
2887 if (!bitmap_bit_p (n_materialize, idx))
2888 changed = true;
2889 bitmap_set_bit (n_materialize, idx);
2890 }
2891 }
2892 }
2893 while (changed || iteration == 1);
2894
2895 /* Materialize. */
2896 for (i = 0; i < vertices.length (); ++i)
2897 {
2898 int perm = n_perm[i];
2899 if (perm <= 0)
2900 continue;
2901
2902 slp_tree node = vertices[i];
2903
2904 /* First permute invariant/external original successors. */
2905 unsigned j;
2906 slp_tree child;
2907 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2908 {
2909 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2910 continue;
2911
2912 /* If the vector is uniform there's nothing to do. */
2913 if (vect_slp_tree_uniform_p (child))
2914 continue;
2915
2916 /* We can end up sharing some externals via two_operator
2917 handling. Be prepared to unshare those. */
2918 if (child->refcnt != 1)
2919 {
2920 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
2921 SLP_TREE_CHILDREN (node)[j] = child
2922 = vect_create_new_slp_node
2923 (SLP_TREE_SCALAR_OPS (child).copy ());
2924 }
2925 vect_slp_permute (perms[perm],
2926 SLP_TREE_SCALAR_OPS (child), true);
2927 }
2928
2929 if (bitmap_bit_p (n_materialize, i))
2930 {
2931 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2932 /* For loads simply drop the permutation, the load permutation
2933 already performs the desired permutation. */
2934 ;
2935 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2936 {
2937 /* If the node is already a permute node we just need to apply
2938 the permutation to the permute node itself. */
2939 if (dump_enabled_p ())
2940 dump_printf_loc (MSG_NOTE, vect_location,
2941 "simplifying permute node %p\n",
2942 node);
2943
2944 vect_slp_permute (perms[perm], SLP_TREE_LANE_PERMUTATION (node),
2945 true);
2946 }
2947 else
2948 {
2949 if (dump_enabled_p ())
2950 dump_printf_loc (MSG_NOTE, vect_location,
2951 "inserting permute node in place of %p\n",
2952 node);
2953
2954 /* Make a copy of NODE and in-place change it to a
2955 VEC_PERM node to permute the lanes of the copy. */
2956 slp_tree copy = new _slp_tree;
2957 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
2958 SLP_TREE_CHILDREN (node) = vNULL;
2959 SLP_TREE_SCALAR_STMTS (copy)
2960 = SLP_TREE_SCALAR_STMTS (node).copy ();
2961 vect_slp_permute (perms[perm],
2962 SLP_TREE_SCALAR_STMTS (copy), true);
2963 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
2964 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
2965 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
2966 SLP_TREE_LANE_PERMUTATION (copy)
2967 = SLP_TREE_LANE_PERMUTATION (node);
2968 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
2969 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
2970 copy->refcnt = 1;
2971 copy->max_nunits = node->max_nunits;
2972 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
2973 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
2974 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
2975
2976 /* Now turn NODE into a VEC_PERM. */
2977 SLP_TREE_CHILDREN (node).safe_push (copy);
2978 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
2979 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2980 SLP_TREE_LANE_PERMUTATION (node)
2981 .quick_push (std::make_pair (0, perms[perm][j]));
2982 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2983 }
2984 }
2985 else
2986 {
2987 /* Apply the reverse permutation to our stmts. */
2988 vect_slp_permute (perms[perm],
2989 SLP_TREE_SCALAR_STMTS (node), true);
2990 /* And to the load permutation, which we can simply
2991 make regular by design. */
2992 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2993 {
2994 /* ??? When we handle non-bijective permutes the idea
2995 is that we can force the load-permutation to be
2996 { min, min + 1, min + 2, ... max }. But then the
2997 scalar defs might no longer match the lane content
2998 which means wrong-code with live lane vectorization.
2999 So we possibly have to have NULL entries for those. */
3000 vect_slp_permute (perms[perm],
3001 SLP_TREE_LOAD_PERMUTATION (node), true);
3002 }
3003 }
3004 }
3005
3006 /* Free the perms vector used for propagation. */
3007 while (!perms.is_empty ())
3008 perms.pop ().release ();
3009 free_graph (slpg);
3010
3011
3012 /* Now elide load permutations that are not necessary. */
3013 for (i = 0; i < leafs.length (); ++i)
3014 {
3015 node = vertices[leafs[i]];
3016 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3017 continue;
3018
3019 /* In basic block vectorization we allow any subchain of an interleaving
3020 chain.
3021 FORNOW: not in loop SLP because of realignment complications. */
3022 if (is_a <bb_vec_info> (vinfo))
3023 {
3024 bool subchain_p = true;
3025 stmt_vec_info next_load_info = NULL;
3026 stmt_vec_info load_info;
3027 unsigned j;
3028 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3029 {
3030 if (j != 0
3031 && (next_load_info != load_info
3032 || DR_GROUP_GAP (load_info) != 1))
3033 {
3034 subchain_p = false;
3035 break;
3036 }
3037 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3038 }
3039 if (subchain_p)
3040 {
3041 SLP_TREE_LOAD_PERMUTATION (node).release ();
3042 continue;
3043 }
3044 }
3045 else
3046 {
3047 stmt_vec_info load_info;
3048 bool this_load_permuted = false;
3049 unsigned j;
3050 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3051 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3052 {
3053 this_load_permuted = true;
3054 break;
3055 }
3056 stmt_vec_info first_stmt_info
3057 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3058 if (!this_load_permuted
3059 /* The load requires permutation when unrolling exposes
3060 a gap either because the group is larger than the SLP
3061 group-size or because there is a gap between the groups. */
3062 && (known_eq (LOOP_VINFO_VECT_FACTOR
3063 (as_a <loop_vec_info> (vinfo)), 1U)
3064 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3065 && DR_GROUP_GAP (first_stmt_info) == 0)))
3066 {
3067 SLP_TREE_LOAD_PERMUTATION (node).release ();
3068 continue;
3069 }
3070 }
3071 }
3072 }
3073
3074 /* Gather loads reachable from the individual SLP graph entries. */
3075
3076 void
3077 vect_gather_slp_loads (vec_info *vinfo)
3078 {
3079 unsigned i;
3080 slp_instance instance;
3081 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
3082 {
3083 hash_set<slp_tree> visited;
3084 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
3085 SLP_INSTANCE_TREE (instance), visited);
3086 }
3087 }
3088
3089
3090 /* For each possible SLP instance decide whether to SLP it and calculate the
3091 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
3092 to SLP at least one instance. */
3093
3094 bool
3095 vect_make_slp_decision (loop_vec_info loop_vinfo)
3096 {
3097 unsigned int i;
3098 poly_uint64 unrolling_factor = 1;
3099 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3100 slp_instance instance;
3101 int decided_to_slp = 0;
3102
3103 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3104
3105 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3106 {
3107 /* FORNOW: SLP if you can. */
3108 /* All unroll factors have the form:
3109
3110 GET_MODE_SIZE (vinfo->vector_mode) * X
3111
3112 for some rational X, so they must have a common multiple. */
3113 unrolling_factor
3114 = force_common_multiple (unrolling_factor,
3115 SLP_INSTANCE_UNROLLING_FACTOR (instance));
3116
3117 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3118 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3119 loop-based vectorization. Such stmts will be marked as HYBRID. */
3120 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3121 decided_to_slp++;
3122 }
3123
3124 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
3125
3126 if (decided_to_slp && dump_enabled_p ())
3127 {
3128 dump_printf_loc (MSG_NOTE, vect_location,
3129 "Decided to SLP %d instances. Unrolling factor ",
3130 decided_to_slp);
3131 dump_dec (MSG_NOTE, unrolling_factor);
3132 dump_printf (MSG_NOTE, "\n");
3133 }
3134
3135 return (decided_to_slp > 0);
3136 }
3137
3138 /* Private data for vect_detect_hybrid_slp. */
3139 struct vdhs_data
3140 {
3141 loop_vec_info loop_vinfo;
3142 vec<stmt_vec_info> *worklist;
3143 };
3144
3145 /* Walker for walk_gimple_op. */
3146
3147 static tree
3148 vect_detect_hybrid_slp (tree *tp, int *, void *data)
3149 {
3150 walk_stmt_info *wi = (walk_stmt_info *)data;
3151 vdhs_data *dat = (vdhs_data *)wi->info;
3152
3153 if (wi->is_lhs)
3154 return NULL_TREE;
3155
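/* *TP is used by a loop_vect (or hybrid) stmt taken from the worklist;
if its definition is currently pure SLP it now also has a non-SLP use
and therefore has to become hybrid. */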
3156 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
3157 if (!def_stmt_info)
3158 return NULL_TREE;
3159 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3160 if (PURE_SLP_STMT (def_stmt_info))
3161 {
3162 if (dump_enabled_p ())
3163 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3164 def_stmt_info->stmt);
3165 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3166 dat->worklist->safe_push (def_stmt_info);
3167 }
3168
3169 return NULL_TREE;
3170 }
3171
3172 /* Check whether STMT_INFO is (indirectly) consumed only by SLP and mark it
3173 pure_slp if so; otherwise push it to WORKLIST. */
3174
3175 static void
3176 maybe_push_to_hybrid_worklist (vec_info *vinfo,
3177 vec<stmt_vec_info> &worklist,
3178 stmt_vec_info stmt_info)
3179 {
3180 if (dump_enabled_p ())
3181 dump_printf_loc (MSG_NOTE, vect_location,
3182 "Processing hybrid candidate : %G", stmt_info->stmt);
3183 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
3184 imm_use_iterator iter2;
3185 ssa_op_iter iter1;
3186 use_operand_p use_p;
3187 def_operand_p def_p;
3188 bool any_def = false;
3189 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
3190 {
3191 any_def = true;
3192 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
3193 {
3194 if (is_gimple_debug (USE_STMT (use_p)))
3195 continue;
3196 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
3197 /* An out-of-loop use means this is a loop_vect sink. */
3198 if (!use_info)
3199 {
3200 if (dump_enabled_p ())
3201 dump_printf_loc (MSG_NOTE, vect_location,
3202 "Found loop_vect sink: %G", stmt_info->stmt);
3203 worklist.safe_push (stmt_info);
3204 return;
3205 }
3206 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
3207 {
3208 if (dump_enabled_p ())
3209 dump_printf_loc (MSG_NOTE, vect_location,
3210 "Found loop_vect use: %G", use_info->stmt);
3211 worklist.safe_push (stmt_info);
3212 return;
3213 }
3214 }
3215 }
3216 /* No def means this is a loop_vect sink. */
3217 if (!any_def)
3218 {
3219 if (dump_enabled_p ())
3220 dump_printf_loc (MSG_NOTE, vect_location,
3221 "Found loop_vect sink: %G", stmt_info->stmt);
3222 worklist.safe_push (stmt_info);
3223 return;
3224 }
3225 if (dump_enabled_p ())
3226 dump_printf_loc (MSG_NOTE, vect_location,
3227 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
3228 STMT_SLP_TYPE (stmt_info) = pure_slp;
3229 }
3230
3231 /* Find stmts that must be both vectorized and SLPed. */
3232
3233 void
3234 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3235 {
3236 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3237
3238 /* All stmts participating in SLP are marked pure_slp, all other
3239 stmts are loop_vect.
3240 First collect all loop_vect stmts into a worklist.
3241 SLP patterns cause not all original scalar stmts to appear in
3242 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
3243 Rectify this here and do a backward walk over the IL only considering
3244 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
3245 mark them as pure_slp. */
3246 auto_vec<stmt_vec_info> worklist;
3247 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
3248 {
3249 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3250 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3251 gsi_next (&gsi))
3252 {
3253 gphi *phi = gsi.phi ();
3254 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3255 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3256 maybe_push_to_hybrid_worklist (loop_vinfo,
3257 worklist, stmt_info);
3258 }
3259 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
3260 gsi_prev (&gsi))
3261 {
3262 gimple *stmt = gsi_stmt (gsi);
3263 if (is_gimple_debug (stmt))
3264 continue;
3265 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3266 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3267 {
3268 for (gimple_stmt_iterator gsi2
3269 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3270 !gsi_end_p (gsi2); gsi_next (&gsi2))
3271 {
3272 stmt_vec_info patt_info
3273 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3274 if (!STMT_SLP_TYPE (patt_info)
3275 && STMT_VINFO_RELEVANT (patt_info))
3276 maybe_push_to_hybrid_worklist (loop_vinfo,
3277 worklist, patt_info);
3278 }
3279 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3280 }
3281 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3282 maybe_push_to_hybrid_worklist (loop_vinfo,
3283 worklist, stmt_info);
3284 }
3285 }
3286
3287 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3288 mark any SLP vectorized stmt as hybrid.
3289 ??? We're visiting def stmts N times (once for each non-SLP and
3290 once for each hybrid-SLP use). */
3291 walk_stmt_info wi;
3292 vdhs_data dat;
3293 dat.worklist = &worklist;
3294 dat.loop_vinfo = loop_vinfo;
3295 memset (&wi, 0, sizeof (wi));
3296 wi.info = (void *)&dat;
3297 while (!worklist.is_empty ())
3298 {
3299 stmt_vec_info stmt_info = worklist.pop ();
3300 /* Since SSA operands are not set up for pattern stmts we need
3301 to use walk_gimple_op. */
3302 wi.is_lhs = 0;
3303 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3304 }
3305 }
3306
3307
3308 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3309
3310 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3311 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs)
3312 {
3313 for (unsigned i = 0; i < bbs.length (); ++i)
3314 {
3315 if (i != 0)
3316 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3317 gsi_next (&si))
3318 {
3319 gphi *phi = si.phi ();
3320 gimple_set_uid (phi, 0);
3321 add_stmt (phi);
3322 }
3323 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3324 !gsi_end_p (gsi); gsi_next (&gsi))
3325 {
3326 gimple *stmt = gsi_stmt (gsi);
3327 gimple_set_uid (stmt, 0);
3328 if (is_gimple_debug (stmt))
3329 continue;
3330 add_stmt (stmt);
3331 }
3332 }
3333 }
3334
3335
3336 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3337 stmts in the basic block. */
3338
3339 _bb_vec_info::~_bb_vec_info ()
3340 {
3341 /* Reset region marker. */
3342 for (unsigned i = 0; i < bbs.length (); ++i)
3343 {
3344 if (i != 0)
3345 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3346 gsi_next (&si))
3347 {
3348 gphi *phi = si.phi ();
3349 gimple_set_uid (phi, -1);
3350 }
3351 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3352 !gsi_end_p (gsi); gsi_next (&gsi))
3353 {
3354 gimple *stmt = gsi_stmt (gsi);
3355 gimple_set_uid (stmt, -1);
3356 }
3357 }
3358 }
3359
3360 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3361 given that child nodes have already been processed, and that
3362 their def types currently match their SLP node's def type. */
3363
3364 static bool
3365 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3366 slp_instance node_instance,
3367 stmt_vector_for_cost *cost_vec)
3368 {
3369 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3370 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3371
3372 /* Calculate the number of vector statements to be created for the
3373 scalar stmts in this node. For SLP reductions it is equal to the
3374 number of vector statements in the children (which has already been
3375 calculated by the recursive call). Otherwise it is the number of
3376 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3377 VF divided by the number of elements in a vector. */
3378 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3379 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3380 {
3381 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3382 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3383 {
3384 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3385 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3386 break;
3387 }
3388 }
3389 else
3390 {
3391 poly_uint64 vf;
3392 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3393 vf = loop_vinfo->vectorization_factor;
3394 else
3395 vf = 1;
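/* For example, a node with 4 lanes in a loop with VF 8 and a V4SI
vector type needs (8 * 4) / 4 == 8 vector stmts. */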
3396 unsigned int group_size = SLP_TREE_LANES (node);
3397 tree vectype = SLP_TREE_VECTYPE (node);
3398 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3399 = vect_get_num_vectors (vf * group_size, vectype);
3400 }
3401
3402 /* Handle purely internal nodes. */
3403 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3404 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3405
3406 if (is_a <bb_vec_info> (vinfo)
3407 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3408 {
3409 if (dump_enabled_p ())
3410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3411 "desired vector type conflicts with earlier one "
3412 "for %G", stmt_info->stmt);
3413 return false;
3414 }
3415
3416 bool dummy;
3417 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3418 node, node_instance, cost_vec);
3419 }
3420
3421 /* Try to build NODE from scalars, returning true on success.
3422 NODE_INSTANCE is the SLP instance that contains NODE. */
3423
3424 static bool
3425 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3426 slp_instance node_instance)
3427 {
3428 stmt_vec_info stmt_info;
3429 unsigned int i;
3430
3431 if (!is_a <bb_vec_info> (vinfo)
3432 || node == SLP_INSTANCE_TREE (node_instance)
3433 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3434 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3435 return false;
3436
3437 if (dump_enabled_p ())
3438 dump_printf_loc (MSG_NOTE, vect_location,
3439 "Building vector operands of %p from scalars instead\n", node);
3440
3441 /* Don't remove and free the child nodes here, since they could be
3442 referenced by other structures. The analysis and scheduling phases
3443 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3444 unsigned int group_size = SLP_TREE_LANES (node);
3445 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3446 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3447 SLP_TREE_LOAD_PERMUTATION (node).release ();
3448 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3449 {
3450 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3451 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3452 }
3453 return true;
3454 }
3455
3456 /* Compute the prologue cost for invariant or constant operands represented
3457 by NODE. */
3458
3459 static void
3460 vect_prologue_cost_for_slp (slp_tree node,
3461 stmt_vector_for_cost *cost_vec)
3462 {
3463 /* There's a special case of an existing vector, that costs nothing. */
3464 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3465 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3466 return;
3467 /* Without looking at the actual initializer a vector of
3468 constants can be implemented as a load from the constant pool.
3469 When all elements are the same we can use a splat. */
3470 tree vectype = SLP_TREE_VECTYPE (node);
3471 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3472 unsigned num_vects_to_check;
3473 unsigned HOST_WIDE_INT const_nunits;
3474 unsigned nelt_limit;
3475 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3476 && ! multiple_p (const_nunits, group_size))
3477 {
3478 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3479 nelt_limit = const_nunits;
3480 }
3481 else
3482 {
3483 /* If either the vector has variable length or the vectors
3484 are composed of repeated whole groups we only need to
3485 cost construction once. All vectors will be the same. */
3486 num_vects_to_check = 1;
3487 nelt_limit = group_size;
3488 }
3489 tree elt = NULL_TREE;
3490 unsigned nelt = 0;
3491 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3492 {
3493 unsigned si = j % group_size;
3494 if (nelt == 0)
3495 elt = SLP_TREE_SCALAR_OPS (node)[si];
3496 /* ??? We're just tracking whether all operands of a single
3497 vector initializer are the same, ideally we'd check if
3498 we emitted the same one already. */
3499 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3500 elt = NULL_TREE;
3501 nelt++;
3502 if (nelt == nelt_limit)
3503 {
3504 record_stmt_cost (cost_vec, 1,
3505 SLP_TREE_DEF_TYPE (node) == vect_external_def
3506 ? (elt ? scalar_to_vec : vec_construct)
3507 : vector_load,
3508 NULL, vectype, 0, vect_prologue);
3509 nelt = 0;
3510 }
3511 }
3512 }
3513
3514 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3515 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
3516
3517 Return true if the operations are supported. */
3518
3519 static bool
3520 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3521 slp_instance node_instance,
3522 hash_set<slp_tree> &visited_set,
3523 vec<slp_tree> &visited_vec,
3524 stmt_vector_for_cost *cost_vec)
3525 {
3526 int i, j;
3527 slp_tree child;
3528
3529 /* Assume we can code-generate all invariants. */
3530 if (!node
3531 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
3532 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
3533 return true;
3534
3535 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
3536 {
3537 if (dump_enabled_p ())
3538 dump_printf_loc (MSG_NOTE, vect_location,
3539 "Failed cyclic SLP reference in %p", node);
3540 return false;
3541 }
3542 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
3543
3544 /* If we already analyzed the exact same set of scalar stmts we're done.
3545 We share the generated vector stmts for those. */
3546 if (visited_set.add (node))
3547 return true;
3548 visited_vec.safe_push (node);
3549
3550 bool res = true;
3551 unsigned visited_rec_start = visited_vec.length ();
3552 unsigned cost_vec_rec_start = cost_vec->length ();
3553 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3554 {
3555 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3556 visited_set, visited_vec,
3557 cost_vec);
3558 if (!res)
3559 break;
3560 }
3561
3562 if (res)
3563 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3564 cost_vec);
3565 /* If analysis failed we have to pop all recursive visited nodes
3566 plus ourselves. */
3567 if (!res)
3568 {
3569 while (visited_vec.length () >= visited_rec_start)
3570 visited_set.remove (visited_vec.pop ());
3571 cost_vec->truncate (cost_vec_rec_start);
3572 }
3573
3574 /* When the node can be vectorized, cost the invariant nodes it references.
3575 This is not done in DFS order to allow the referring node's
3576 vectorizable_* calls to nail down the invariant node's vector type
3577 and possibly unshare it if it needs a different vector type than
3578 other referrers. */
3579 if (res)
3580 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3581 if (child
3582 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
3583 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3584 /* Perform usual caching, note code-generation still
3585 code-gens these nodes multiple times but we expect
3586 to CSE them later. */
3587 && !visited_set.add (child))
3588 {
3589 visited_vec.safe_push (child);
3590 /* ??? After auditing more code paths make a "default"
3591 and push the vector type from NODE to all children
3592 if it is not already set. */
3593 /* Compute the number of vectors to be generated. */
3594 tree vector_type = SLP_TREE_VECTYPE (child);
3595 if (!vector_type)
3596 {
3597 /* For shifts with a scalar argument we don't need
3598 to cost or code-generate anything.
3599 ??? Represent this more explicitly. */
3600 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3601 == shift_vec_info_type)
3602 && j == 1);
3603 continue;
3604 }
3605 unsigned group_size = SLP_TREE_LANES (child);
3606 poly_uint64 vf = 1;
3607 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3608 vf = loop_vinfo->vectorization_factor;
3609 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3610 = vect_get_num_vectors (vf * group_size, vector_type);
3611 /* And cost them. */
3612 vect_prologue_cost_for_slp (child, cost_vec);
3613 }
3614
3615 /* If this node or any of its children can't be vectorized, try pruning
3616 the tree here rather than felling the whole thing. */
3617 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3618 {
3619 /* We'll need to revisit this for invariant costing and number
3620 of vectorized stmt setting. */
3621 res = true;
3622 }
3623
3624 return res;
3625 }
3626
3627
3628 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3629 region and that can be vectorized using vectorizable_live_operation
3630 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
3631 the scalar code computing them to be retained. */
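/* For instance, if lane 2 of NODE feeds a scalar use outside the vectorized
   region, vectorizable_live_operation is asked whether a lane extract
   (e.g. a BIT_FIELD_REF of the vector def) can replace the scalar
   computation; if not, the scalar stmt computing the value is retained. */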
3632
3633 static void
3634 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3635 slp_instance instance,
3636 stmt_vector_for_cost *cost_vec,
3637 hash_set<stmt_vec_info> &svisited,
3638 hash_set<slp_tree> &visited)
3639 {
3640 if (visited.add (node))
3641 return;
3642
3643 unsigned i;
3644 stmt_vec_info stmt_info;
3645 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
3646 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3647 {
3648 if (svisited.contains (stmt_info))
3649 continue;
3650 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3651 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
3652 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
3653 /* Only the pattern root stmt computes the original scalar value. */
3654 continue;
3655 bool mark_visited = true;
3656 gimple *orig_stmt = orig_stmt_info->stmt;
3657 ssa_op_iter op_iter;
3658 def_operand_p def_p;
3659 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3660 {
3661 imm_use_iterator use_iter;
3662 gimple *use_stmt;
3663 stmt_vec_info use_stmt_info;
3664 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3665 if (!is_gimple_debug (use_stmt))
3666 {
3667 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
3668 if (!use_stmt_info
3669 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3670 {
3671 STMT_VINFO_LIVE_P (stmt_info) = true;
3672 if (vectorizable_live_operation (bb_vinfo, stmt_info,
3673 NULL, node, instance, i,
3674 false, cost_vec))
3675 /* ??? So we know we can vectorize the live stmt
3676 from one SLP node. If we cannot do so from all
3677 or none consistently we'd have to record which
3678 SLP node (and lane) we want to use for the live
3679 operation. So make sure we can code-generate
3680 from all nodes. */
3681 mark_visited = false;
3682 else
3683 STMT_VINFO_LIVE_P (stmt_info) = false;
3684 BREAK_FROM_IMM_USE_STMT (use_iter);
3685 }
3686 }
3687 /* We have to verify whether we can insert the lane extract
3688 before all uses. The following is a conservative approximation.
3689 We cannot put this into vectorizable_live_operation because
3690 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3691 doesn't work.
3692 Note that while emitting code for loads at the first load should
3693 make this a non-problem, leaves we construct from scalars are
3694 vectorized after the last scalar def.
3695 ??? If we'd actually compute the insert location during
3696 analysis we could use something less conservative than the last
3697 scalar stmt in the node for the dominance check. */
3698 /* ??? What remains is "live" uses in vector CTORs in the same
3699 SLP graph which is where those uses can end up code-generated
3700 right after their definition instead of close to their original
3701 use. But that would restrict us to code-generate lane-extracts
3702 from the latest stmt in a node. So we compensate for this
3703 during code-generation, simply not replacing uses for those
3704 hopefully rare cases. */
3705 if (STMT_VINFO_LIVE_P (stmt_info))
3706 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3707 if (!is_gimple_debug (use_stmt)
3708 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
3709 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3710 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
3711 {
3712 if (dump_enabled_p ())
3713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3714 "Cannot determine insertion place for "
3715 "lane extract\n");
3716 STMT_VINFO_LIVE_P (stmt_info) = false;
3717 mark_visited = true;
3718 }
3719 }
3720 if (mark_visited)
3721 svisited.add (stmt_info);
3722 }
3723
3724 slp_tree child;
3725 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3726 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3727 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
3728 cost_vec, svisited, visited);
3729 }
3730
3731 /* Analyze statements in SLP instances of VINFO. Return true if the
3732 operations are supported. */
3733
3734 bool
3735 vect_slp_analyze_operations (vec_info *vinfo)
3736 {
3737 slp_instance instance;
3738 int i;
3739
3740 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3741
3742 hash_set<slp_tree> visited;
3743 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
3744 {
3745 auto_vec<slp_tree> visited_vec;
3746 stmt_vector_for_cost cost_vec;
3747 cost_vec.create (2);
3748 if (is_a <bb_vec_info> (vinfo))
3749 vect_location = instance->location ();
3750 if (!vect_slp_analyze_node_operations (vinfo,
3751 SLP_INSTANCE_TREE (instance),
3752 instance, visited, visited_vec,
3753 &cost_vec)
3754 /* Instances with a root stmt require vectorized defs for the
3755 SLP tree root. */
3756 || (SLP_INSTANCE_ROOT_STMT (instance)
3757 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
3758 != vect_internal_def)))
3759 {
3760 slp_tree node = SLP_INSTANCE_TREE (instance);
3761 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3762 if (dump_enabled_p ())
3763 dump_printf_loc (MSG_NOTE, vect_location,
3764 "removing SLP instance operations starting from: %G",
3765 stmt_info->stmt);
3766 vect_free_slp_instance (instance);
3767 vinfo->slp_instances.ordered_remove (i);
3768 cost_vec.release ();
3769 while (!visited_vec.is_empty ())
3770 visited.remove (visited_vec.pop ());
3771 }
3772 else
3773 {
3774 i++;
3775
3776 /* For BB vectorization remember the SLP graph entry
3777 cost for later. */
3778 if (is_a <bb_vec_info> (vinfo))
3779 instance->cost_vec = cost_vec;
3780 else
3781 {
3782 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
3783 cost_vec.release ();
3784 }
3785 }
3786 }
3787
3788 /* Compute vectorizable live stmts. */
3789 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3790 {
3791 hash_set<stmt_vec_info> svisited;
3792 hash_set<slp_tree> visited;
3793 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
3794 {
3795 vect_location = instance->location ();
3796 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
3797 instance, &instance->cost_vec, svisited,
3798 visited);
3799 }
3800 }
3801
3802 return !vinfo->slp_instances.is_empty ();
3803 }
3804
3805 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
3806 closing any leader chain. */
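/* For example (instance names illustrative): with a recorded chain
   A -> B -> C -> C, looking up A returns C and as a side effect rewrites
   the recorded leaders of A and B to C, i.e. union-find style path
   compression. */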
3807
3808 static slp_instance
3809 get_ultimate_leader (slp_instance instance,
3810 hash_map<slp_instance, slp_instance> &instance_leader)
3811 {
3812 auto_vec<slp_instance *, 8> chain;
3813 slp_instance *tem;
3814 while (*(tem = instance_leader.get (instance)) != instance)
3815 {
3816 chain.safe_push (tem);
3817 instance = *tem;
3818 }
3819 while (!chain.is_empty ())
3820 *chain.pop () = instance;
3821 return instance;
3822 }
3823
3824 /* Worker of vect_bb_partition_graph, recurse on NODE. */
3825
3826 static void
3827 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
3828 slp_instance instance, slp_tree node,
3829 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
3830 hash_map<slp_instance, slp_instance> &instance_leader,
3831 hash_set<slp_tree> &visited)
3832 {
3833 stmt_vec_info stmt_info;
3834 unsigned i;
3835
3836 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3837 {
3838 bool existed_p;
3839 slp_instance &stmt_instance
3840 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
3841 if (!existed_p)
3842 ;
3843 else if (stmt_instance != instance)
3844 {
3845 /* If we're running into a previously marked stmt make us the
3846 leader of the current ultimate leader. This keeps the
3847 leader chain acyclic and works even when the current instance
3848 connects two previously independent graph parts. */
3849 slp_instance stmt_leader
3850 = get_ultimate_leader (stmt_instance, instance_leader);
3851 if (stmt_leader != instance)
3852 instance_leader.put (stmt_leader, instance);
3853 }
3854 stmt_instance = instance;
3855 }
3856
3857 if (visited.add (node))
3858 return;
3859
3860 slp_tree child;
3861 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3862 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3863 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
3864 instance_leader, visited);
3865 }
3866
3867 /* Partition the SLP graph into pieces that can be costed independently. */
3868
3869 static void
3870 vect_bb_partition_graph (bb_vec_info bb_vinfo)
3871 {
3872 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3873
3874 /* First walk the SLP graph assigning each involved scalar stmt a
3875 corresponding SLP graph entry and upon visiting a previously
3876 marked stmt, make the stmt's leader the current SLP graph entry. */
3877 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
3878 hash_map<slp_instance, slp_instance> instance_leader;
3879 hash_set<slp_tree> visited;
3880 slp_instance instance;
3881 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3882 {
3883 instance_leader.put (instance, instance);
3884 vect_bb_partition_graph_r (bb_vinfo,
3885 instance, SLP_INSTANCE_TREE (instance),
3886 stmt_to_instance, instance_leader,
3887 visited);
3888 }
3889
3890 /* Then collect entries to each independent subgraph. */
3891 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3892 {
3893 slp_instance leader = get_ultimate_leader (instance, instance_leader);
3894 leader->subgraph_entries.safe_push (instance);
3895 if (dump_enabled_p ()
3896 && leader != instance)
3897 dump_printf_loc (MSG_NOTE, vect_location,
3898 "instance %p is leader of %p\n",
3899 leader, instance);
3900 }
3901 }
3902
3903 /* Compute the scalar cost of the SLP node NODE and its children,
3904 recording it in COST_VEC. Do not account defs that are marked in LIFE
3905 and update LIFE according to uses of NODE. */
3906
3907 static void
3908 vect_bb_slp_scalar_cost (vec_info *vinfo,
3909 slp_tree node, vec<bool, va_heap> *life,
3910 stmt_vector_for_cost *cost_vec,
3911 hash_set<slp_tree> &visited)
3912 {
3913 unsigned i;
3914 stmt_vec_info stmt_info;
3915 slp_tree child;
3916
3917 if (visited.add (node))
3918 return;
3919
3920 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3921 {
3922 ssa_op_iter op_iter;
3923 def_operand_p def_p;
3924
3925 if ((*life)[i])
3926 continue;
3927
3928 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3929 gimple *orig_stmt = orig_stmt_info->stmt;
3930
3931 /* If there is a non-vectorized use of the defs then the scalar
3932 stmt is kept live in which case we do not account it or any
3933 required defs in the SLP children in the scalar cost. This
3934 way we make the vectorization more costly when compared to
3935 the scalar cost. */
3936 if (!STMT_VINFO_LIVE_P (stmt_info))
3937 {
3938 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3939 {
3940 imm_use_iterator use_iter;
3941 gimple *use_stmt;
3942 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3943 if (!is_gimple_debug (use_stmt))
3944 {
3945 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3946 if (!use_stmt_info
3947 || !PURE_SLP_STMT
3948 (vect_stmt_to_vectorize (use_stmt_info)))
3949 {
3950 (*life)[i] = true;
3951 BREAK_FROM_IMM_USE_STMT (use_iter);
3952 }
3953 }
3954 }
3955 if ((*life)[i])
3956 continue;
3957 }
3958
3959 /* Count scalar stmts only once. */
3960 if (gimple_visited_p (orig_stmt))
3961 continue;
3962 gimple_set_visited (orig_stmt, true);
3963
3964 vect_cost_for_stmt kind;
3965 if (STMT_VINFO_DATA_REF (orig_stmt_info))
3966 {
3967 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
3968 kind = scalar_load;
3969 else
3970 kind = scalar_store;
3971 }
3972 else if (vect_nop_conversion_p (orig_stmt_info))
3973 continue;
3974 else
3975 kind = scalar_stmt;
3976 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
3977 SLP_TREE_VECTYPE (node), 0, vect_body);
3978 }
3979
3980 auto_vec<bool, 20> subtree_life;
3981 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3982 {
3983 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3984 {
3985 /* Do not directly pass LIFE to the recursive call, copy it to
3986 confine changes in the callee to the current child/subtree. */
3987 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3988 {
3989 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
3990 for (unsigned j = 0;
3991 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
3992 {
3993 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
3994 if (perm.first == i)
3995 subtree_life[perm.second] = (*life)[j];
3996 }
3997 }
3998 else
3999 {
4000 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
4001 subtree_life.safe_splice (*life);
4002 }
4003 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
4004 visited);
4005 subtree_life.truncate (0);
4006 }
4007 }
4008 }
4009
4010 /* Check if vectorization of the basic block is profitable for the
4011 subgraph denoted by SLP_INSTANCES. */
4012
4013 static bool
4014 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
4015 vec<slp_instance> slp_instances)
4016 {
4017 slp_instance instance;
4018 int i;
4019 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
4020 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
4021
4022 void *vect_target_cost_data = init_cost (NULL);
4023
4024 /* Calculate scalar cost and sum the cost for the vector stmts
4025 previously collected. */
4026 stmt_vector_for_cost scalar_costs;
4027 scalar_costs.create (0);
4028 hash_set<slp_tree> visited;
4029 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4030 {
4031 auto_vec<bool, 20> life;
4032 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
4033 true);
4034 vect_bb_slp_scalar_cost (bb_vinfo,
4035 SLP_INSTANCE_TREE (instance),
4036 &life, &scalar_costs, visited);
4037 add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
4038 instance->cost_vec.release ();
4039 }
4040 /* Unset visited flag. */
4041 stmt_info_for_cost *si;
4042 FOR_EACH_VEC_ELT (scalar_costs, i, si)
4043 gimple_set_visited (si->stmt_info->stmt, false);
4044
4045 void *scalar_target_cost_data = init_cost (NULL);
4046 add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
4047 scalar_costs.release ();
4048 unsigned dummy;
4049 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
4050 destroy_cost_data (scalar_target_cost_data);
4051
4052 /* Complete the target-specific vector cost calculation. */
4053 finish_cost (vect_target_cost_data, &vec_prologue_cost,
4054 &vec_inside_cost, &vec_epilogue_cost);
4055 destroy_cost_data (vect_target_cost_data);
4056
4057 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
4058
4059 if (dump_enabled_p ())
4060 {
4061 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4062 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
4063 vec_inside_cost);
4064 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
4065 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
4066 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
4067 }
4068
4069 /* Vectorization is profitable if its cost is no more than the cost of the
4070 scalar version. Note that we err on the vector side for equal cost because
4071 the cost estimate is otherwise quite pessimistic (constant uses are
4072 free on the scalar side but cost a load on the vector side for
4073 example). */
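/* E.g. (numbers illustrative): vec_outside_cost = 2 and vec_inside_cost = 5
   against scalar_cost = 7 is still considered profitable because of the
   equal-cost tie-break described above. */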
4074 if (vec_outside_cost + vec_inside_cost > scalar_cost)
4075 return false;
4076
4077 return true;
4078 }
4079
4080 /* Find any vectorizable constructors and add them to the grouped_store
4081 array. */
4082
4083 static void
4084 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
4085 {
4086 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
4087 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
4088 !gsi_end_p (gsi); gsi_next (&gsi))
4089 {
4090 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
4091 if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
4092 continue;
4093
4094 tree rhs = gimple_assign_rhs1 (assign);
4095 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
4096 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
4097 CONSTRUCTOR_NELTS (rhs))
4098 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
4099 || uniform_vector_p (rhs))
4100 continue;
4101
4102 unsigned j;
4103 tree val;
4104 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
4105 if (TREE_CODE (val) != SSA_NAME
4106 || !bb_vinfo->lookup_def (val))
4107 break;
4108 if (j != CONSTRUCTOR_NELTS (rhs))
4109 continue;
4110
4111 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
4112 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
4113 }
4114 }
4115
4116 /* Walk the grouped store chains and replace entries with their
4117 pattern variant if any. */
4118
4119 static void
4120 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
4121 {
4122 stmt_vec_info first_element;
4123 unsigned i;
4124
4125 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4126 {
4127 /* We also have CTORs in this array. */
4128 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
4129 continue;
4130 if (STMT_VINFO_IN_PATTERN_P (first_element))
4131 {
4132 stmt_vec_info orig = first_element;
4133 first_element = STMT_VINFO_RELATED_STMT (first_element);
4134 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
4135 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
4136 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
4137 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
4138 vinfo->grouped_stores[i] = first_element;
4139 }
4140 stmt_vec_info prev = first_element;
4141 while (DR_GROUP_NEXT_ELEMENT (prev))
4142 {
4143 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
4144 if (STMT_VINFO_IN_PATTERN_P (elt))
4145 {
4146 stmt_vec_info orig = elt;
4147 elt = STMT_VINFO_RELATED_STMT (elt);
4148 DR_GROUP_NEXT_ELEMENT (prev) = elt;
4149 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
4150 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
4151 }
4152 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
4153 prev = elt;
4154 }
4155 }
4156 }
4157
4158 /* Check if the region described by BB_VINFO can be vectorized, returning
4159 true if so. When returning false, set FATAL to true if the same failure
4160 would prevent vectorization at other vector sizes, false if it is still
4161 worth trying other sizes. N_STMTS is the number of statements in the
4162 region. */
4163
4164 static bool
4165 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
4166 vec<int> *dataref_groups)
4167 {
4168 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4169
4170 slp_instance instance;
4171 int i;
4172 poly_uint64 min_vf = 2;
4173
4174 /* The first group of checks is independent of the vector size. */
4175 fatal = true;
4176
4177 /* Analyze the data references. */
4178
4179 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
4180 {
4181 if (dump_enabled_p ())
4182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4183 "not vectorized: unhandled data-ref in basic "
4184 "block.\n");
4185 return false;
4186 }
4187
4188 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
4189 {
4190 if (dump_enabled_p ())
4191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4192 "not vectorized: unhandled data access in "
4193 "basic block.\n");
4194 return false;
4195 }
4196
4197 vect_slp_check_for_constructors (bb_vinfo);
4198
4199 /* If there are no grouped stores and no constructors in the region
4200 there is no need to continue with pattern recog as vect_analyze_slp
4201 will fail anyway. */
4202 if (bb_vinfo->grouped_stores.is_empty ())
4203 {
4204 if (dump_enabled_p ())
4205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4206 "not vectorized: no grouped stores in "
4207 "basic block.\n");
4208 return false;
4209 }
4210
4211 /* The rest of the analysis below depends on the chosen vector size, so failures are no longer fatal. */
4212 fatal = false;
4213
4214 vect_pattern_recog (bb_vinfo);
4215
4216 /* Update store groups from pattern processing. */
4217 vect_fixup_store_groups_with_patterns (bb_vinfo);
4218
4219 /* Check the SLP opportunities in the basic block, analyze and build SLP
4220 trees. */
4221 if (!vect_analyze_slp (bb_vinfo, n_stmts))
4222 {
4223 if (dump_enabled_p ())
4224 {
4225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4226 "Failed to SLP the basic block.\n");
4227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4228 "not vectorized: failed to find SLP opportunities "
4229 "in basic block.\n");
4230 }
4231 return false;
4232 }
4233
4234 /* Optimize permutations. */
4235 vect_optimize_slp (bb_vinfo);
4236
4237 /* Gather the loads reachable from the SLP graph entries. */
4238 vect_gather_slp_loads (bb_vinfo);
4239
4240 vect_record_base_alignments (bb_vinfo);
4241
4242 /* Analyze and verify the alignment of data references and the
4243 dependence in the SLP instances. */
4244 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
4245 {
4246 vect_location = instance->location ();
4247 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
4248 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
4249 {
4250 slp_tree node = SLP_INSTANCE_TREE (instance);
4251 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4252 if (dump_enabled_p ())
4253 dump_printf_loc (MSG_NOTE, vect_location,
4254 "removing SLP instance operations starting from: %G",
4255 stmt_info->stmt);
4256 vect_free_slp_instance (instance);
4257 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
4258 continue;
4259 }
4260
4261 /* Mark all the statements that we want to vectorize as pure SLP and
4262 relevant. */
4263 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4264 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
4265 if (SLP_INSTANCE_ROOT_STMT (instance))
4266 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
4267
4268 i++;
4269 }
4270 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
4271 return false;
4272
4273 if (!vect_slp_analyze_operations (bb_vinfo))
4274 {
4275 if (dump_enabled_p ())
4276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4277 "not vectorized: bad operation in basic block.\n");
4278 return false;
4279 }
4280
4281 vect_bb_partition_graph (bb_vinfo);
4282
4283 return true;
4284 }
4285
4286 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4287 basic blocks in BBS, returning true on success.
4288 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4289
4290 static bool
4291 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
4292 vec<int> *dataref_groups, unsigned int n_stmts)
4293 {
4294 bb_vec_info bb_vinfo;
4295 auto_vector_modes vector_modes;
4296
4297 /* Autodetect first vector size we try. */
4298 machine_mode next_vector_mode = VOIDmode;
4299 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
4300 unsigned int mode_i = 0;
4301
4302 vec_info_shared shared;
4303
4304 machine_mode autodetected_vector_mode = VOIDmode;
4305 while (1)
4306 {
4307 bool vectorized = false;
4308 bool fatal = false;
4309 bb_vinfo = new _bb_vec_info (bbs, &shared);
4310
4311 bool first_time_p = shared.datarefs.is_empty ();
4312 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
4313 if (first_time_p)
4314 bb_vinfo->shared->save_datarefs ();
4315 else
4316 bb_vinfo->shared->check_datarefs ();
4317 bb_vinfo->vector_mode = next_vector_mode;
4318
4319 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
4320 && dbg_cnt (vect_slp))
4321 {
4322 if (dump_enabled_p ())
4323 {
4324 dump_printf_loc (MSG_NOTE, vect_location,
4325 "***** Analysis succeeded with vector mode"
4326 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4327 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4328 }
4329
4330 bb_vinfo->shared->check_datarefs ();
4331
4332 unsigned i;
4333 slp_instance instance;
4334 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4335 {
4336 if (instance->subgraph_entries.is_empty ())
4337 continue;
4338
4339 vect_location = instance->location ();
4340 if (!unlimited_cost_model (NULL)
4341 && !vect_bb_vectorization_profitable_p
4342 (bb_vinfo, instance->subgraph_entries))
4343 {
4344 if (dump_enabled_p ())
4345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4346 "not vectorized: vectorization is not "
4347 "profitable.\n");
4348 continue;
4349 }
4350
4351 if (!vectorized && dump_enabled_p ())
4352 dump_printf_loc (MSG_NOTE, vect_location,
4353 "Basic block will be vectorized "
4354 "using SLP\n");
4355 vectorized = true;
4356
4357 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4358
4359 unsigned HOST_WIDE_INT bytes;
4360 if (dump_enabled_p ())
4361 {
4362 if (GET_MODE_SIZE
4363 (bb_vinfo->vector_mode).is_constant (&bytes))
4364 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4365 "basic block part vectorized using %wu "
4366 "byte vectors\n", bytes);
4367 else
4368 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4369 "basic block part vectorized using "
4370 "variable length vectors\n");
4371 }
4372 }
4373 }
4374 else
4375 {
4376 if (dump_enabled_p ())
4377 dump_printf_loc (MSG_NOTE, vect_location,
4378 "***** Analysis failed with vector mode %s\n",
4379 GET_MODE_NAME (bb_vinfo->vector_mode));
4380 }
4381
4382 if (mode_i == 0)
4383 autodetected_vector_mode = bb_vinfo->vector_mode;
4384
4385 if (!fatal)
4386 while (mode_i < vector_modes.length ()
4387 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
4388 {
4389 if (dump_enabled_p ())
4390 dump_printf_loc (MSG_NOTE, vect_location,
4391 "***** The result for vector mode %s would"
4392 " be the same\n",
4393 GET_MODE_NAME (vector_modes[mode_i]));
4394 mode_i += 1;
4395 }
4396
4397 delete bb_vinfo;
4398
4399 if (mode_i < vector_modes.length ()
4400 && VECTOR_MODE_P (autodetected_vector_mode)
4401 && (related_vector_mode (vector_modes[mode_i],
4402 GET_MODE_INNER (autodetected_vector_mode))
4403 == autodetected_vector_mode)
4404 && (related_vector_mode (autodetected_vector_mode,
4405 GET_MODE_INNER (vector_modes[mode_i]))
4406 == vector_modes[mode_i]))
4407 {
4408 if (dump_enabled_p ())
4409 dump_printf_loc (MSG_NOTE, vect_location,
4410 "***** Skipping vector mode %s, which would"
4411 " repeat the analysis for %s\n",
4412 GET_MODE_NAME (vector_modes[mode_i]),
4413 GET_MODE_NAME (autodetected_vector_mode));
4414 mode_i += 1;
4415 }
4416
4417 if (vectorized
4418 || mode_i == vector_modes.length ()
4419 || autodetected_vector_mode == VOIDmode
4420 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4421 vector sizes will fail do not bother iterating. */
4422 || fatal)
4423 return vectorized;
4424
4425 /* Try the next biggest vector size. */
4426 next_vector_mode = vector_modes[mode_i++];
4427 if (dump_enabled_p ())
4428 dump_printf_loc (MSG_NOTE, vect_location,
4429 "***** Re-trying analysis with vector mode %s\n",
4430 GET_MODE_NAME (next_vector_mode));
4431 }
4432 }
4433
4434
4435 /* Main entry for the BB vectorizer. Analyze and transform the blocks
4436 in BBS, returning true if anything in them was vectorized. */
4437
4438 static bool
4439 vect_slp_bbs (vec<basic_block> bbs)
4440 {
4441 vec<data_reference_p> datarefs = vNULL;
4442 auto_vec<int> dataref_groups;
4443 int insns = 0;
4444 int current_group = 0;
4445
4446 for (unsigned i = 0; i < bbs.length (); i++)
4447 {
4448 basic_block bb = bbs[i];
4449 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
4450 gsi_next (&gsi))
4451 {
4452 gimple *stmt = gsi_stmt (gsi);
4453 if (is_gimple_debug (stmt))
4454 continue;
4455
4456 insns++;
4457
4458 if (gimple_location (stmt) != UNKNOWN_LOCATION)
4459 vect_location = stmt;
4460
4461 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
4462 &dataref_groups, current_group))
4463 ++current_group;
4464 }
4465 }
4466
4467 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
4468 }
4469
4470 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4471 true if anything in the basic-block was vectorized. */
4472
4473 bool
4474 vect_slp_bb (basic_block bb)
4475 {
4476 auto_vec<basic_block> bbs;
4477 bbs.safe_push (bb);
4478 return vect_slp_bbs (bbs);
4479 }
4480
4481 /* Main entry for the BB vectorizer when invoked on a whole function.
4482 Analyze and transform FUN, returning true if anything was vectorized. */
4483
4484 bool
4485 vect_slp_function (function *fun)
4486 {
4487 bool r = false;
4488 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
4489 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
4490
4491 /* For the moment split the function into pieces to avoid making
4492 the iteration on the vector mode moot. Split at points we know
4493 we do not handle well, which are CFG merges (SLP discovery doesn't
4494 handle non-loop-header PHIs) and loop exits. Since pattern
4495 recog requires reverse iteration to visit uses before defs
4496 simply chop RPO into pieces. */
4497 auto_vec<basic_block> bbs;
4498 for (unsigned i = 0; i < n; i++)
4499 {
4500 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
4501 bool split = false;
4502
4503 /* Split when a BB is not dominated by the first block. */
4504 if (!bbs.is_empty ()
4505 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
4506 {
4507 if (dump_enabled_p ())
4508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4509 "splitting region at dominance boundary bb%d\n",
4510 bb->index);
4511 split = true;
4512 }
4513 /* Split when the loop determined by the first block
4514 is exited. This is because we eventually insert
4515 invariants at region begin. */
4516 else if (!bbs.is_empty ()
4517 && bbs[0]->loop_father != bb->loop_father
4518 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
4519 {
4520 if (dump_enabled_p ())
4521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4522 "splitting region at loop %d exit at bb%d\n",
4523 bbs[0]->loop_father->num, bb->index);
4524 split = true;
4525 }
4526
4527 if (split && !bbs.is_empty ())
4528 {
4529 r |= vect_slp_bbs (bbs);
4530 bbs.truncate (0);
4531 bbs.quick_push (bb);
4532 }
4533 else
4534 bbs.safe_push (bb);
4535
4536 /* When we have a stmt ending this block and defining a
4537 value we have to insert on edges when inserting after it for
4538 a vector containing its definition. Avoid this for now. */
4539 if (gimple *last = last_stmt (bb))
4540 if (gimple_get_lhs (last)
4541 && is_ctrl_altering_stmt (last))
4542 {
4543 if (dump_enabled_p ())
4544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4545 "splitting region at control altering "
4546 "definition %G", last);
4547 r |= vect_slp_bbs (bbs);
4548 bbs.truncate (0);
4549 }
4550 }
4551
4552 if (!bbs.is_empty ())
4553 r |= vect_slp_bbs (bbs);
4554
4555 free (rpo);
4556
4557 return r;
4558 }
4559
4560 /* Build a variable-length vector in which the elements in ELTS are repeated
4561 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4562 RESULTS and add any new instructions to SEQ.
4563
4564 The approach we use is:
4565
4566 (1) Find a vector mode VM with integer elements of mode IM.
4567
4568 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4569 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4570 from small vectors to IM.
4571
4572 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4573
4574 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4575 correct byte contents.
4576
4577 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4578
4579 We try to find the largest IM for which this sequence works, in order
4580 to cut down on the number of interleaves. */
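/* A small worked example (sizes purely illustrative): for ELTS = {a, b, c, d}
   of 16-bit elements, a 64-bit IM gives NELTS' = 1, so the single 64-bit
   value view-converted from {a, b, c, d} is splat into VM and no
   interleaving is needed. With a 32-bit IM we get NELTS' = 2, two splats
   of the values (a,b) and (c,d), and one interleaving VEC_PERM_EXPR
   producing the byte pattern {a, b, c, d, a, b, c, d, ...}. */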
4581
4582 void
4583 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
4584 vec<tree> elts, unsigned int nresults,
4585 vec<tree> &results)
4586 {
4587 unsigned int nelts = elts.length ();
4588 tree element_type = TREE_TYPE (vector_type);
4589
4590 /* (1) Find a vector mode VM with integer elements of mode IM. */
4591 unsigned int nvectors = 1;
4592 tree new_vector_type;
4593 tree permutes[2];
4594 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
4595 &nvectors, &new_vector_type,
4596 permutes))
4597 gcc_unreachable ();
4598
4599 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4600 unsigned int partial_nelts = nelts / nvectors;
4601 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
4602
4603 tree_vector_builder partial_elts;
4604 auto_vec<tree, 32> pieces (nvectors * 2);
4605 pieces.quick_grow (nvectors * 2);
4606 for (unsigned int i = 0; i < nvectors; ++i)
4607 {
4608 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4609 ELTS' has mode IM. */
4610 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
4611 for (unsigned int j = 0; j < partial_nelts; ++j)
4612 partial_elts.quick_push (elts[i * partial_nelts + j]);
4613 tree t = gimple_build_vector (seq, &partial_elts);
4614 t = gimple_build (seq, VIEW_CONVERT_EXPR,
4615 TREE_TYPE (new_vector_type), t);
4616
4617 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4618 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
4619 }
4620
4621 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4622 correct byte contents.
4623
4624 We need to repeat the following operation log2(nvectors) times:
4625
4626 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4627 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4628
4629 However, if each input repeats every N elements and the VF is
4630 a multiple of N * 2, the HI result is the same as the LO. */
4631 unsigned int in_start = 0;
4632 unsigned int out_start = nvectors;
4633 unsigned int hi_start = nvectors / 2;
4634 /* A bound on the number of outputs needed to produce NRESULTS results
4635 in the final iteration. */
4636 unsigned int noutputs_bound = nvectors * nresults;
4637 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
4638 {
4639 noutputs_bound /= 2;
4640 unsigned int limit = MIN (noutputs_bound, nvectors);
4641 for (unsigned int i = 0; i < limit; ++i)
4642 {
4643 if ((i & 1) != 0
4644 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
4645 2 * in_repeat))
4646 {
4647 pieces[out_start + i] = pieces[out_start + i - 1];
4648 continue;
4649 }
4650
4651 tree output = make_ssa_name (new_vector_type);
4652 tree input1 = pieces[in_start + (i / 2)];
4653 tree input2 = pieces[in_start + (i / 2) + hi_start];
4654 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
4655 input1, input2,
4656 permutes[i & 1]);
4657 gimple_seq_add_stmt (seq, stmt);
4658 pieces[out_start + i] = output;
4659 }
4660 std::swap (in_start, out_start);
4661 }
4662
4663 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4664 results.reserve (nresults);
4665 for (unsigned int i = 0; i < nresults; ++i)
4666 if (i < nvectors)
4667 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
4668 pieces[in_start + i]));
4669 else
4670 results.quick_push (results[i - nvectors]);
4671 }
4672
4673
4674 /* For constant and loop invariant defs in OP_NODE this function creates
4675 vector defs that will be used in the vectorized stmts and stores them
4676 to SLP_TREE_VEC_DEFS of OP_NODE. */
4677
4678 static void
4679 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
4680 {
4681 unsigned HOST_WIDE_INT nunits;
4682 tree vec_cst;
4683 unsigned j, number_of_places_left_in_vector;
4684 tree vector_type;
4685 tree vop;
4686 int group_size = op_node->ops.length ();
4687 unsigned int vec_num, i;
4688 unsigned number_of_copies = 1;
4689 bool constant_p;
4690 gimple_seq ctor_seq = NULL;
4691 auto_vec<tree, 16> permute_results;
4692
4693 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
4694 vector_type = SLP_TREE_VECTYPE (op_node);
4695
4696 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
4697 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
4698 auto_vec<tree> voprnds (number_of_vectors);
4699
4700 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4701 created vectors. It is greater than 1 if unrolling is performed.
4702
4703 For example, we have two scalar operands, s1 and s2 (e.g., group of
4704 strided accesses of size two), while NUNITS is four (i.e., four scalars
4705 of this type can be packed in a vector). The output vector will contain
4706 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4707 will be 2).
4708
4709 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4710 containing the operands.
4711
4712 For example, NUNITS is four as before, and the group size is 8
4713 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4714 {s5, s6, s7, s8}. */
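/* For instance (numbers purely illustrative): with NUNITS = 4, GROUP_SIZE = 2
   and NUMBER_OF_VECTORS = 1 the computation below gives
   NUMBER_OF_COPIES = 4 * 1 / 2 = 2, matching the {s1, s2, s1, s2} example
   above. */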
4715
4716 /* When using duplicate_and_interleave, we just need one element for
4717 each scalar statement. */
4718 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4719 nunits = group_size;
4720
4721 number_of_copies = nunits * number_of_vectors / group_size;
4722
4723 number_of_places_left_in_vector = nunits;
4724 constant_p = true;
4725 tree_vector_builder elts (vector_type, nunits, 1);
4726 elts.quick_grow (nunits);
4727 stmt_vec_info insert_after = NULL;
4728 for (j = 0; j < number_of_copies; j++)
4729 {
4730 tree op;
4731 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
4732 {
4733 /* Create 'vect_ = {op0,op1,...,opn}'. */
4734 number_of_places_left_in_vector--;
4735 tree orig_op = op;
4736 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
4737 {
4738 if (CONSTANT_CLASS_P (op))
4739 {
4740 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4741 {
4742 /* Can't use VIEW_CONVERT_EXPR for booleans because
4743 of possibly different sizes of scalar value and
4744 vector element. */
4745 if (integer_zerop (op))
4746 op = build_int_cst (TREE_TYPE (vector_type), 0);
4747 else if (integer_onep (op))
4748 op = build_all_ones_cst (TREE_TYPE (vector_type));
4749 else
4750 gcc_unreachable ();
4751 }
4752 else
4753 op = fold_unary (VIEW_CONVERT_EXPR,
4754 TREE_TYPE (vector_type), op);
4755 gcc_assert (op && CONSTANT_CLASS_P (op));
4756 }
4757 else
4758 {
4759 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
4760 gimple *init_stmt;
4761 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4762 {
4763 tree true_val
4764 = build_all_ones_cst (TREE_TYPE (vector_type));
4765 tree false_val
4766 = build_zero_cst (TREE_TYPE (vector_type));
4767 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
4768 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
4769 op, true_val,
4770 false_val);
4771 }
4772 else
4773 {
4774 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
4775 op);
4776 init_stmt
4777 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
4778 op);
4779 }
4780 gimple_seq_add_stmt (&ctor_seq, init_stmt);
4781 op = new_temp;
4782 }
4783 }
4784 elts[number_of_places_left_in_vector] = op;
4785 if (!CONSTANT_CLASS_P (op))
4786 constant_p = false;
4787 /* For BB vectorization we have to compute an insert location
4788 when a def is inside the analyzed region since we cannot
4789 simply insert at the BB start in this case. */
4790 stmt_vec_info opdef;
4791 if (TREE_CODE (orig_op) == SSA_NAME
4792 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
4793 && is_a <bb_vec_info> (vinfo)
4794 && (opdef = vinfo->lookup_def (orig_op)))
4795 {
4796 if (!insert_after)
4797 insert_after = opdef;
4798 else
4799 insert_after = get_later_stmt (insert_after, opdef);
4800 }
4801
4802 if (number_of_places_left_in_vector == 0)
4803 {
4804 if (constant_p
4805 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4806 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4807 vec_cst = gimple_build_vector (&ctor_seq, &elts);
4808 else
4809 {
4810 if (permute_results.is_empty ())
4811 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
4812 elts, number_of_vectors,
4813 permute_results);
4814 vec_cst = permute_results[number_of_vectors - j - 1];
4815 }
4816 if (!gimple_seq_empty_p (ctor_seq))
4817 {
4818 if (insert_after)
4819 {
4820 gimple_stmt_iterator gsi;
4821 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
4822 {
4823 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
4824 gsi_insert_seq_before (&gsi, ctor_seq,
4825 GSI_CONTINUE_LINKING);
4826 }
4827 else if (!stmt_ends_bb_p (insert_after->stmt))
4828 {
4829 gsi = gsi_for_stmt (insert_after->stmt);
4830 gsi_insert_seq_after (&gsi, ctor_seq,
4831 GSI_CONTINUE_LINKING);
4832 }
4833 else
4834 {
4835 /* When we want to insert after a def where the
4836 defining stmt throws then insert on the fallthru
4837 edge. */
4838 edge e = find_fallthru_edge
4839 (gimple_bb (insert_after->stmt)->succs);
4840 basic_block new_bb
4841 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
4842 gcc_assert (!new_bb);
4843 }
4844 }
4845 else
4846 vinfo->insert_seq_on_entry (NULL, ctor_seq);
4847 ctor_seq = NULL;
4848 }
4849 voprnds.quick_push (vec_cst);
4850 insert_after = NULL;
4851 number_of_places_left_in_vector = nunits;
4852 constant_p = true;
4853 elts.new_vector (vector_type, nunits, 1);
4854 elts.quick_grow (nunits);
4855 }
4856 }
4857 }
4858
4859 /* Since the vectors are created in the reverse order, we should invert
4860 them. */
4861 vec_num = voprnds.length ();
4862 for (j = vec_num; j != 0; j--)
4863 {
4864 vop = voprnds[j - 1];
4865 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4866 }
4867
4868 /* In case the VF is greater than the unrolling factor needed for the SLP
4869 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4870 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4871 to replicate the vectors. */
4872 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
4873 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
4874 i++)
4875 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4876 }
4877
4878 /* Get the Ith vectorized definition from SLP_NODE. */
4879
4880 tree
4881 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
4882 {
4883 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
4884 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
4885 else
4886 return SLP_TREE_VEC_DEFS (slp_node)[i];
4887 }
4888
4889 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
4890
4891 void
4892 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
4893 {
4894 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
4895 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
4896 {
4897 unsigned j;
4898 gimple *vec_def_stmt;
4899 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
4900 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
4901 }
4902 else
4903 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
4904 }
4905
4906 /* Get N vectorized definitions for SLP_NODE. */
4907
4908 void
4909 vect_get_slp_defs (vec_info *,
4910 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
4911 {
4912 if (n == -1U)
4913 n = SLP_TREE_CHILDREN (slp_node).length ();
4914
4915 for (unsigned i = 0; i < n; ++i)
4916 {
4917 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
4918 vec<tree> vec_defs = vNULL;
4919 vect_get_slp_defs (child, &vec_defs);
4920 vec_oprnds->quick_push (vec_defs);
4921 }
4922 }
4923
4924 /* Generate vector permute statements from a list of loads in DR_CHAIN.
4925 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
4926 permute statements for the SLP node NODE. Store the number of vector
4927 permute instructions in *N_PERMS and the number of vector load
4928 instructions in *N_LOADS. */
4929
4930 bool
4931 vect_transform_slp_perm_load (vec_info *vinfo,
4932 slp_tree node, vec<tree> dr_chain,
4933 gimple_stmt_iterator *gsi, poly_uint64 vf,
4934 bool analyze_only, unsigned *n_perms,
4935 unsigned int *n_loads)
4936 {
4937 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4938 int vec_index = 0;
4939 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4940 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
4941 unsigned int mask_element;
4942 machine_mode mode;
4943
4944 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
4945 return false;
4946
4947 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
4948
4949 mode = TYPE_MODE (vectype);
4950 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4951
4952 /* Initialize the vect stmts of NODE to properly insert the generated
4953 stmts later. */
4954 if (! analyze_only)
4955 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
4956 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
4957 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
4958
4959 /* Generate permutation masks for every NODE. Number of masks for each NODE
4960 is equal to GROUP_SIZE.
4961 E.g., we have a group of three nodes with three loads from the same
4962 location in each node, and the vector size is 4. I.e., we have an
4963 a0b0c0a1b1c1... sequence and we need to create the following vectors:
4964 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
4965 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
4966 ...
4967
4968 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
4969 The last mask is illegal since we assume two operands for permute
4970 operation, and the mask element values can't be outside that range.
4971 Hence, the last mask must be converted into {2,5,5,5}.
4972 For the first two permutations we need the first and the second input
4973 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
4974 we need the second and the third vectors: {b1,c1,a2,b2} and
4975 {c2,a3,b3,c3}. */
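/* In terms of the index computation below (non-repeating case, values from
   the example above): with DR_GROUP_SIZE = 3 and the a's load-permutation
   entries all 0, lane j = 9 gives i = (9 / 3) * 3 + 0 = 9, hence
   vec_index = 9 / 4 = 2 and mask_element = 9 % 4 = 1 (biased by nunits when
   it ends up as the second permute input), i.e. the a3 element of the third
   input vector {c2, a3, b3, c3}. */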
4976
4977 int vect_stmts_counter = 0;
4978 unsigned int index = 0;
4979 int first_vec_index = -1;
4980 int second_vec_index = -1;
4981 bool noop_p = true;
4982 *n_perms = 0;
4983
4984 vec_perm_builder mask;
4985 unsigned int nelts_to_build;
4986 unsigned int nvectors_per_build;
4987 unsigned int in_nlanes;
4988 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
4989 && multiple_p (nunits, group_size));
4990 if (repeating_p)
4991 {
4992 /* A single vector contains a whole number of copies of the node, so:
4993 (a) all permutes can use the same mask; and
4994 (b) the permutes only need a single vector input. */
4995 mask.new_vector (nunits, group_size, 3);
4996 nelts_to_build = mask.encoded_nelts ();
4997 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
4998 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
4999 }
5000 else
5001 {
5002 /* We need to construct a separate mask for each vector statement. */
5003 unsigned HOST_WIDE_INT const_nunits, const_vf;
5004 if (!nunits.is_constant (&const_nunits)
5005 || !vf.is_constant (&const_vf))
5006 return false;
5007 mask.new_vector (const_nunits, const_nunits, 1);
5008 nelts_to_build = const_vf * group_size;
5009 nvectors_per_build = 1;
5010 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
5011 }
5012 auto_sbitmap used_in_lanes (in_nlanes);
5013 bitmap_clear (used_in_lanes);
5014
5015 unsigned int count = mask.encoded_nelts ();
5016 mask.quick_grow (count);
5017 vec_perm_indices indices;
5018
5019 for (unsigned int j = 0; j < nelts_to_build; j++)
5020 {
5021 unsigned int iter_num = j / group_size;
5022 unsigned int stmt_num = j % group_size;
5023 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
5024 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
5025 bitmap_set_bit (used_in_lanes, i);
5026 if (repeating_p)
5027 {
5028 first_vec_index = 0;
5029 mask_element = i;
5030 }
5031 else
5032 {
5033 /* Enforced before the loop when !repeating_p. */
5034 unsigned int const_nunits = nunits.to_constant ();
5035 vec_index = i / const_nunits;
5036 mask_element = i % const_nunits;
5037 if (vec_index == first_vec_index
5038 || first_vec_index == -1)
5039 {
5040 first_vec_index = vec_index;
5041 }
5042 else if (vec_index == second_vec_index
5043 || second_vec_index == -1)
5044 {
5045 second_vec_index = vec_index;
5046 mask_element += const_nunits;
5047 }
5048 else
5049 {
5050 if (dump_enabled_p ())
5051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5052 "permutation requires at "
5053 "least three vectors %G",
5054 stmt_info->stmt);
5055 gcc_assert (analyze_only);
5056 return false;
5057 }
5058
5059 gcc_assert (mask_element < 2 * const_nunits);
5060 }
5061
5062 if (mask_element != index)
5063 noop_p = false;
5064 mask[index++] = mask_element;
5065
5066 if (index == count && !noop_p)
5067 {
5068 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
5069 if (!can_vec_perm_const_p (mode, indices))
5070 {
5071 if (dump_enabled_p ())
5072 {
5073 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5074 vect_location,
5075 "unsupported vect permute { ");
5076 for (i = 0; i < count; ++i)
5077 {
5078 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5079 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5080 }
5081 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5082 }
5083 gcc_assert (analyze_only);
5084 return false;
5085 }
5086
5087 ++*n_perms;
5088 }
5089
5090 if (index == count)
5091 {
5092 if (!analyze_only)
5093 {
5094 tree mask_vec = NULL_TREE;
5095
5096 if (! noop_p)
5097 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5098
5099 if (second_vec_index == -1)
5100 second_vec_index = first_vec_index;
5101
5102 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
5103 {
5104 /* Generate the permute statement if necessary. */
5105 tree first_vec = dr_chain[first_vec_index + ri];
5106 tree second_vec = dr_chain[second_vec_index + ri];
5107 gimple *perm_stmt;
5108 if (! noop_p)
5109 {
5110 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5111 tree perm_dest
5112 = vect_create_destination_var (gimple_assign_lhs (stmt),
5113 vectype);
5114 perm_dest = make_ssa_name (perm_dest);
5115 perm_stmt
5116 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5117 first_vec, second_vec,
5118 mask_vec);
5119 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
5120 gsi);
5121 }
5122 else
5123 /* If mask was NULL_TREE generate the requested
5124 identity transform. */
5125 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
5126
5127 /* Store the vector statement in NODE. */
5128 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
5129 }
5130 }
5131
5132 index = 0;
5133 first_vec_index = -1;
5134 second_vec_index = -1;
5135 noop_p = true;
5136 }
5137 }
5138
5139 if (n_loads)
5140 {
5141 if (repeating_p)
5142 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5143 else
5144 {
5145 /* Enforced above when !repeating_p. */
5146 unsigned int const_nunits = nunits.to_constant ();
5147 *n_loads = 0;
5148 bool load_seen = false;
5149 for (unsigned i = 0; i < in_nlanes; ++i)
5150 {
5151 if (i % const_nunits == 0)
5152 {
5153 if (load_seen)
5154 *n_loads += 1;
5155 load_seen = false;
5156 }
5157 if (bitmap_bit_p (used_in_lanes, i))
5158 load_seen = true;
5159 }
5160 if (load_seen)
5161 *n_loads += 1;
5162 }
5163 }
5164
5165 return true;
5166 }
5167
5168
5169 /* Vectorize the SLP permutations in NODE as specified
5170 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5171 child number and lane number.
5172 Interleaving of two two-lane two-child SLP subtrees (not supported):
5173 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5174 A blend of two four-lane two-child SLP subtrees:
5175 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5176 Highpart of a four-lane one-child SLP subtree (not supported):
5177 [ { 0, 2 }, { 0, 3 } ]
5178 Only a subset of these is currently supported by the code generation below. */
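/* As an illustration of the code below (assuming const_nunits == 4 and a
   single vector per child), the blend example above becomes one
   VEC_PERM_EXPR with mask { 0, 5, 2, 7 }: lanes 0 and 2 are taken from the
   first input vector and lanes 1 and 3 from the second, offset by nunits. */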
5179
5180 static bool
5181 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5182 slp_tree node, stmt_vector_for_cost *cost_vec)
5183 {
5184 tree vectype = SLP_TREE_VECTYPE (node);
5185
5186 /* ??? We currently only support identical vector input and output types,
5187 while the SLP IL should really do a concat + select and thus accept
5188 arbitrary mismatches. */
5189 slp_tree child;
5190 unsigned i;
5191 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5192 if (!vect_maybe_update_slp_op_vectype (child, vectype)
5193 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
5194 {
5195 if (dump_enabled_p ())
5196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5197 "Unsupported lane permutation\n");
5198 return false;
5199 }
5200
5201 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
5202 gcc_assert (perm.length () == SLP_TREE_LANES (node));
5203 if (dump_enabled_p ())
5204 {
5205 dump_printf_loc (MSG_NOTE, vect_location,
5206 "vectorizing permutation");
5207 for (unsigned i = 0; i < perm.length (); ++i)
5208 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
5209 dump_printf (MSG_NOTE, "\n");
5210 }
5211
5212 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5213 if (!nunits.is_constant ())
5214 return false;
5215 unsigned HOST_WIDE_INT vf = 1;
5216 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
5217 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
5218 return false;
5219 unsigned olanes = vf * SLP_TREE_LANES (node);
5220 gcc_assert (multiple_p (olanes, nunits));
5221
5222 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
5223 from the { SLP operand, scalar lane } permutation as recorded in the
5224 SLP node as an intermediate step. This part should already work
5225 with SLP children with an arbitrary number of lanes. */
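/* For example, with perm = [ { 0, 0 }, { 1, 1 } ], two two-lane children,
a two-element vector type and VF 2 this yields
[ { { 0, 0 }, 0 }, { { 1, 0 }, 1 }, { { 0, 1 }, 0 }, { { 1, 1 }, 1 } ],
i.e. a blend of the first vectors of both children followed by a blend
of their second vectors.  */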
5226 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
5227 auto_vec<unsigned> active_lane;
5228 vperm.create (olanes);
5229 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
5230 for (unsigned i = 0; i < vf; ++i)
5231 {
5232 for (unsigned pi = 0; pi < perm.length (); ++pi)
5233 {
5234 std::pair<unsigned, unsigned> p = perm[pi];
5235 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
5236 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
5237 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
5238 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
5239 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
5240 }
5241 /* Advance to the next group. */
5242 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
5243 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
5244 }
5245
5246 if (dump_enabled_p ())
5247 {
5248 dump_printf_loc (MSG_NOTE, vect_location, "as");
5249 for (unsigned i = 0; i < vperm.length (); ++i)
5250 {
5251 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
5252 dump_printf (MSG_NOTE, ",");
5253 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
5254 vperm[i].first.first, vperm[i].first.second,
5255 vperm[i].second);
5256 }
5257 dump_printf (MSG_NOTE, "\n");
5258 }
5259
5260 /* We can only handle two-vector permutes; everything else should
5261 be lowered on the SLP level. The following is closely inspired
5262 by vect_transform_slp_perm_load and is supposed to eventually
5263 replace it.
5264 ??? As an intermediate step, do code-gen in the SLP tree
5265 representation somehow? */
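/* The mask built below indexes into the concatenation of the (at most)
two selected input vectors; lanes taken from the second vector are
offset by const_nunits.  E.g. for a four-lane blend the generated
statement would look like
perm_dest = VEC_PERM_EXPR <first_def, second_def, { 0, 5, 2, 7 }>;  */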
5266 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
5267 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
5268 unsigned int const_nunits = nunits.to_constant ();
5269 unsigned int index = 0;
5270 unsigned int mask_element;
5271 vec_perm_builder mask;
5272 mask.new_vector (const_nunits, const_nunits, 1);
5273 unsigned int count = mask.encoded_nelts ();
5274 mask.quick_grow (count);
5275 vec_perm_indices indices;
5276 unsigned nperms = 0;
5277 for (unsigned i = 0; i < vperm.length (); ++i)
5278 {
5279 mask_element = vperm[i].second;
5280 if (first_vec.first == -1U
5281 || first_vec == vperm[i].first)
5282 first_vec = vperm[i].first;
5283 else if (second_vec.first == -1U
5284 || second_vec == vperm[i].first)
5285 {
5286 second_vec = vperm[i].first;
5287 mask_element += const_nunits;
5288 }
5289 else
5290 {
5291 if (dump_enabled_p ())
5292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5293 "permutation requires at "
5294 "least three vectors\n");
5295 gcc_assert (!gsi);
5296 return false;
5297 }
5298
5299 mask[index++] = mask_element;
5300
5301 if (index == count)
5302 {
5303 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
5304 const_nunits);
5305 bool identity_p = indices.series_p (0, 1, 0, 1);
5306 if (!identity_p
5307 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5308 {
5309 if (dump_enabled_p ())
5310 {
5311 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5312 vect_location,
5313 "unsupported vect permute { ");
5314 for (i = 0; i < count; ++i)
5315 {
5316 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5317 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5318 }
5319 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5320 }
5321 gcc_assert (!gsi);
5322 return false;
5323 }
5324
5325 if (!identity_p)
5326 nperms++;
5327 if (gsi)
5328 {
5329 if (second_vec.first == -1U)
5330 second_vec = first_vec;
5331
5332 /* Generate the permute statement if necessary. */
5333 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
5334 tree first_def
5335 = vect_get_slp_vect_def (first_node, first_vec.second);
5336 gassign *perm_stmt;
5337 tree perm_dest = make_ssa_name (vectype);
5338 if (!identity_p)
5339 {
5340 slp_tree second_node
5341 = SLP_TREE_CHILDREN (node)[second_vec.first];
5342 tree second_def
5343 = vect_get_slp_vect_def (second_node, second_vec.second);
5344 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5345 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5346 first_def, second_def,
5347 mask_vec);
5348 }
5349 else
5350 /* We need a copy here in case the def was external. */
5351 perm_stmt = gimple_build_assign (perm_dest, first_def);
5352 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
5353 /* Store the vector statement in NODE. */
5354 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
5355 }
5356
5357 index = 0;
5358 first_vec = std::make_pair (-1U, -1U);
5359 second_vec = std::make_pair (-1U, -1U);
5360 }
5361 }
5362
5363 if (!gsi)
5364 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
5365
5366 return true;
5367 }
5368
5369 /* Vectorize SLP NODE. */
5370
5371 static void
5372 vect_schedule_slp_node (vec_info *vinfo,
5373 slp_tree node, slp_instance instance)
5374 {
5375 gimple_stmt_iterator si;
5376 int i;
5377 slp_tree child;
5378
5379 /* For existing vectors there's nothing to do. */
5380 if (SLP_TREE_VEC_DEFS (node).exists ())
5381 return;
5382
5383 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
5384
5385 /* Vectorize externals and constants. */
5386 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5387 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5388 {
5389 /* ??? vectorizable_shift can end up using a scalar operand which is
5390 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5391 node in this case. */
5392 if (!SLP_TREE_VECTYPE (node))
5393 return;
5394
5395 vect_create_constant_vectors (vinfo, node);
5396 return;
5397 }
5398
5399 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5400
5401 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
5402 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
5403
5404 if (dump_enabled_p ())
5405 dump_printf_loc (MSG_NOTE, vect_location,
5406 "------>vectorizing SLP node starting from: %G",
5407 stmt_info->stmt);
5408
5409 if (STMT_VINFO_DATA_REF (stmt_info)
5410 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5411 {
5412 /* Vectorized loads go before the first scalar load to make it
5413 ready early; vectorized stores go before the last scalar
5414 stmt, which is where all uses are ready. */
5415 stmt_vec_info last_stmt_info = NULL;
5416 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
5417 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
5418 else /* DR_IS_WRITE */
5419 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
5420 si = gsi_for_stmt (last_stmt_info->stmt);
5421 }
5422 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
5423 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
5424 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
5425 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5426 {
5427 /* For PHI node vectorization we do not use the insertion iterator. */
5428 si = gsi_none ();
5429 }
5430 else
5431 {
5432 /* Emit other stmts after the children's vectorized defs, which is
5433 the earliest possible insertion point. */
5434 gimple *last_stmt = NULL;
5435 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5436 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5437 {
5438 /* For fold-left reductions we are retaining the scalar
5439 reduction PHI but we still have SLP_TREE_NUMBER_OF_VEC_STMTS
5440 set, so the representation isn't perfect. Resort to the
5441 last scalar def here. */
5442 if (SLP_TREE_VEC_STMTS (child).is_empty ())
5443 {
5444 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
5445 == cycle_phi_info_type);
5446 gphi *phi = as_a <gphi *>
5447 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
5448 if (!last_stmt
5449 || vect_stmt_dominates_stmt_p (last_stmt, phi))
5450 last_stmt = phi;
5451 }
5452 /* We are emitting all vectorized stmts in the same place, so
5453 the last one in the vector is also last in the IL.
5454 ??? Unless we have a load permutation applied and that
5455 decides to re-use an earlier generated load. */
5456 unsigned j;
5457 gimple *vstmt;
5458 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
5459 if (!last_stmt
5460 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5461 last_stmt = vstmt;
5462 }
5463 else if (!SLP_TREE_VECTYPE (child))
5464 {
5465 /* For externals that are used unvectorized, look at all their scalar defs. */
5466 unsigned j;
5467 tree def;
5468 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
5469 if (TREE_CODE (def) == SSA_NAME
5470 && !SSA_NAME_IS_DEFAULT_DEF (def))
5471 {
5472 gimple *stmt = SSA_NAME_DEF_STMT (def);
5473 if (!last_stmt
5474 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
5475 last_stmt = stmt;
5476 }
5477 }
5478 else
5479 {
5480 /* For externals we have to look at all defs since their
5481 insertion place is decided per vector. But beware
5482 of pre-existing vectors where we need to make sure
5483 we do not insert before the region boundary. */
5484 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
5485 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
5486 last_stmt = gsi_stmt (gsi_after_labels
5487 (as_a <bb_vec_info> (vinfo)->bbs[0]));
5488 else
5489 {
5490 unsigned j;
5491 tree vdef;
5492 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
5493 if (TREE_CODE (vdef) == SSA_NAME
5494 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
5495 {
5496 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
5497 if (!last_stmt
5498 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5499 last_stmt = vstmt;
5500 }
5501 }
5502 }
5503 /* This can happen when all children are pre-existing vectors or
5504 constants. */
5505 if (!last_stmt)
5506 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
5507 if (is_a <gphi *> (last_stmt))
5508 si = gsi_after_labels (gimple_bb (last_stmt));
5509 else
5510 {
5511 si = gsi_for_stmt (last_stmt);
5512 gsi_next (&si);
5513 }
5514 }
5515
5516 bool done_p = false;
5517
5518 /* Handle purely internal nodes. */
5519 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5520 {
5521 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
5522 be shared with different SLP nodes (but usually it's the same
5523 operation apart from the case where the stmt is only there to denote
5524 the actual scalar lane defs ...). So do not call vect_transform_stmt
5525 but open-code it here (partly). */
5526 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
5527 gcc_assert (done);
5528 done_p = true;
5529 }
5530 if (!done_p)
5531 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
5532 }
5533
5534 /* Replace the scalar calls in SLP node NODE with assignments setting their lhs to zero.
5535 For loop vectorization this is done in vectorizable_call, but for SLP
5536 it needs to be deferred until the end of vect_schedule_slp, because multiple
5537 SLP instances may refer to the same scalar stmt. */
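/* For example, a call lhs_1 = foo (x_2); whose uses are all vectorized
(PURE_SLP) is replaced by the trivially dead assignment lhs_1 = 0;.  */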
5538
5539 static void
5540 vect_remove_slp_scalar_calls (vec_info *vinfo,
5541 slp_tree node, hash_set<slp_tree> &visited)
5542 {
5543 gimple *new_stmt;
5544 gimple_stmt_iterator gsi;
5545 int i;
5546 slp_tree child;
5547 tree lhs;
5548 stmt_vec_info stmt_info;
5549
5550 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5551 return;
5552
5553 if (visited.add (node))
5554 return;
5555
5556 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5557 vect_remove_slp_scalar_calls (vinfo, child, visited);
5558
5559 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5560 {
5561 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
5562 if (!stmt || gimple_bb (stmt) == NULL)
5563 continue;
5564 if (is_pattern_stmt_p (stmt_info)
5565 || !PURE_SLP_STMT (stmt_info))
5566 continue;
5567 lhs = gimple_call_lhs (stmt);
5568 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
5569 gsi = gsi_for_stmt (stmt);
5570 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
5571 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
5572 }
5573 }
5574
5575 static void
5576 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
5577 {
5578 hash_set<slp_tree> visited;
5579 vect_remove_slp_scalar_calls (vinfo, node, visited);
5580 }
5581
5582 /* Vectorize the instance root. */
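/* E.g. for a BB vectorization root x_1 = {a, b, c, d} the CONSTRUCTOR is
replaced either by a copy (possibly via a VIEW_CONVERT_EXPR) of the
single vector def or, when multiple vector stmts were generated, by a
new CONSTRUCTOR collecting their lhs.  */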
5583
5584 void
5585 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
5586 {
5587 gassign *rstmt = NULL;
5588
5589 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
5590 {
5591 gimple *child_stmt;
5592 int j;
5593
5594 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5595 {
5596 tree vect_lhs = gimple_get_lhs (child_stmt);
5597 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
5598 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
5599 TREE_TYPE (vect_lhs)))
5600 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
5601 vect_lhs);
5602 rstmt = gimple_build_assign (root_lhs, vect_lhs);
5603 break;
5604 }
5605 }
5606 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
5607 {
5608 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5609 gimple *child_stmt;
5610 int j;
5611 vec<constructor_elt, va_gc> *v;
5612 vec_alloc (v, nelts);
5613
5614 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5615 {
5616 CONSTRUCTOR_APPEND_ELT (v,
5617 NULL_TREE,
5618 gimple_get_lhs (child_stmt));
5619 }
5620 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
5621 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
5622 tree r_constructor = build_constructor (rtype, v);
5623 rstmt = gimple_build_assign (lhs, r_constructor);
5624 }
5625
5626 gcc_assert (rstmt);
5627
5628 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
5629 gsi_replace (&rgsi, rstmt, true);
5630 }
5631
5632 struct slp_scc_info
5633 {
5634 bool on_stack;
5635 int dfs;
5636 int lowlink;
5637 };
5638
5639 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
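/* This follows Tarjan's SCC algorithm: DFS and lowlink numbers are
assigned on the way down, and a node whose lowlink equals its own DFS
number closes an SCC formed by the nodes above it on the stack.  */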
5640
5641 static void
5642 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
5643 hash_map<slp_tree, slp_scc_info> &scc_info,
5644 int &maxdfs, vec<slp_tree> &stack)
5645 {
5646 bool existed_p;
5647 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
5648 gcc_assert (!existed_p);
5649 info->dfs = maxdfs;
5650 info->lowlink = maxdfs;
5651 maxdfs++;
5652
5653 /* Leaf. */
5654 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5655 {
5656 info->on_stack = false;
5657 vect_schedule_slp_node (vinfo, node, instance);
5658 return;
5659 }
5660
5661 info->on_stack = true;
5662 stack.safe_push (node);
5663
5664 unsigned i;
5665 slp_tree child;
5666 /* DFS recurse. */
5667 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5668 {
5669 if (!child)
5670 continue;
5671 slp_scc_info *child_info = scc_info.get (child);
5672 if (!child_info)
5673 {
5674 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
5675 /* Recursion might have re-allocated the node. */
5676 info = scc_info.get (node);
5677 child_info = scc_info.get (child);
5678 info->lowlink = MIN (info->lowlink, child_info->lowlink);
5679 }
5680 else if (child_info->on_stack)
5681 info->lowlink = MIN (info->lowlink, child_info->dfs);
5682 }
5683 if (info->lowlink != info->dfs)
5684 return;
5685
5686 auto_vec<slp_tree, 4> phis_to_fixup;
5687
5688 /* Singleton. */
5689 if (stack.last () == node)
5690 {
5691 stack.pop ();
5692 info->on_stack = false;
5693 vect_schedule_slp_node (vinfo, node, instance);
5694 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
5695 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
5696 phis_to_fixup.quick_push (node);
5697 }
5698 else
5699 {
5700 /* SCC. */
5701 int last_idx = stack.length () - 1;
5702 while (stack[last_idx] != node)
5703 last_idx--;
5704 /* We can break the cycle at PHIs that have at least one
5705 code-generated child. Then we could re-start the DFS walk until
5706 all nodes in the SCC are covered (we might have new entries
5707 for only back-reachable nodes). But it's simpler to just
5708 iterate and schedule those that are ready. */
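/* A non-PHI node is ready once none of its children is still on the
stack; a PHI is ready as soon as one of its children is missing or
already code generated; its remaining backedge defs are filled in
by the fixup loop below.  */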
5709 unsigned todo = stack.length () - last_idx;
5710 do
5711 {
5712 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
5713 {
5714 slp_tree entry = stack[idx];
5715 if (!entry)
5716 continue;
5717 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
5718 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
5719 bool ready = !phi;
5720 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
5721 if (!child)
5722 {
5723 gcc_assert (phi);
5724 ready = true;
5725 break;
5726 }
5727 else if (scc_info.get (child)->on_stack)
5728 {
5729 if (!phi)
5730 {
5731 ready = false;
5732 break;
5733 }
5734 }
5735 else
5736 {
5737 if (phi)
5738 {
5739 ready = true;
5740 break;
5741 }
5742 }
5743 if (ready)
5744 {
5745 vect_schedule_slp_node (vinfo, entry, instance);
5746 scc_info.get (entry)->on_stack = false;
5747 stack[idx] = NULL;
5748 todo--;
5749 if (phi)
5750 phis_to_fixup.safe_push (entry);
5751 }
5752 }
5753 }
5754 while (todo != 0);
5755
5756 /* Pop the SCC. */
5757 stack.truncate (last_idx);
5758 }
5759
5760 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
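/* For each predecessor edge add, to every vectorized copy of the PHI,
the corresponding vector def of the child associated with that edge.  */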
5761 slp_tree phi_node;
5762 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
5763 {
5764 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
5765 edge_iterator ei;
5766 edge e;
5767 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
5768 {
5769 unsigned dest_idx = e->dest_idx;
5770 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
5771 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
5772 continue;
5773 /* Simply fill all args. */
5774 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
5775 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
5776 vect_get_slp_vect_def (child, i),
5777 e, gimple_phi_arg_location (phi, dest_idx));
5778 }
5779 }
5780 }
5781
5782 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
5783
5784 void
5785 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
5786 {
5787 slp_instance instance;
5788 unsigned int i;
5789
5790 hash_map<slp_tree, slp_scc_info> scc_info;
5791 int maxdfs = 0;
5792 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5793 {
5794 slp_tree node = SLP_INSTANCE_TREE (instance);
5795 if (dump_enabled_p ())
5796 {
5797 dump_printf_loc (MSG_NOTE, vect_location,
5798 "Vectorizing SLP tree:\n");
5799 if (SLP_INSTANCE_ROOT_STMT (instance))
5800 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
5801 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
5802 vect_print_slp_graph (MSG_NOTE, vect_location,
5803 SLP_INSTANCE_TREE (instance));
5804 }
5805 /* Schedule the tree of INSTANCE, scheduling SCCs so that a PHI
5806 is the node breaking the cycle. */
5807 auto_vec<slp_tree> stack;
5808 if (!scc_info.get (node))
5809 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
5810
5811 if (SLP_INSTANCE_ROOT_STMT (instance))
5812 vectorize_slp_instance_root_stmt (node, instance);
5813
5814 if (dump_enabled_p ())
5815 dump_printf_loc (MSG_NOTE, vect_location,
5816 "vectorizing stmts using SLP.\n");
5817 }
5818
5819 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5820 {
5821 slp_tree root = SLP_INSTANCE_TREE (instance);
5822 stmt_vec_info store_info;
5823 unsigned int j;
5824
5825 /* Remove scalar call stmts. Do not do this for basic-block
5826 vectorization as not all uses may be vectorized.
5827 ??? Why should this be necessary? DCE should be able to
5828 remove the stmts itself.
5829 ??? For BB vectorization we can as well remove scalar
5830 stmts starting from the SLP tree root if they have no
5831 uses. */
5832 if (is_a <loop_vec_info> (vinfo))
5833 vect_remove_slp_scalar_calls (vinfo, root);
5834
5835 /* Remove the original scalar stmts of vectorized stores. */
5836 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
5837 {
5838 if (!STMT_VINFO_DATA_REF (store_info)
5839 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
5840 break;
5841
5842 store_info = vect_orig_stmt (store_info);
5843 /* Free the attached stmt_vec_info and remove the stmt. */
5844 vinfo->remove_stmt (store_info);
5845 }
5846 }
5847 }