gcc/tree-vect-slp.c
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51
52 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
53 slp_tree, stmt_vector_for_cost *);
54
55 object_allocator<_slp_tree> *slp_tree_pool;
56
57 void *
58 _slp_tree::operator new (size_t n)
59 {
60 gcc_assert (n == sizeof (_slp_tree));
61 return slp_tree_pool->allocate_raw ();
62 }
63
64 void
65 _slp_tree::operator delete (void *node, size_t n)
66 {
67 gcc_assert (n == sizeof (_slp_tree));
68 slp_tree_pool->remove_raw (node);
69 }
70
71
72 /* Initialize a SLP node. */
73
74 _slp_tree::_slp_tree ()
75 {
76 SLP_TREE_SCALAR_STMTS (this) = vNULL;
77 SLP_TREE_SCALAR_OPS (this) = vNULL;
78 SLP_TREE_VEC_STMTS (this) = vNULL;
79 SLP_TREE_VEC_DEFS (this) = vNULL;
80 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
81 SLP_TREE_CHILDREN (this) = vNULL;
82 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
83 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
84 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
85 SLP_TREE_CODE (this) = ERROR_MARK;
86 SLP_TREE_VECTYPE (this) = NULL_TREE;
87 SLP_TREE_REPRESENTATIVE (this) = NULL;
88 SLP_TREE_REF_COUNT (this) = 1;
89 this->max_nunits = 1;
90 this->lanes = 0;
91 }
92
93 /* Tear down a SLP node. */
94
95 _slp_tree::~_slp_tree ()
96 {
97 SLP_TREE_CHILDREN (this).release ();
98 SLP_TREE_SCALAR_STMTS (this).release ();
99 SLP_TREE_SCALAR_OPS (this).release ();
100 SLP_TREE_VEC_STMTS (this).release ();
101 SLP_TREE_VEC_DEFS (this).release ();
102 SLP_TREE_LOAD_PERMUTATION (this).release ();
103 SLP_TREE_LANE_PERMUTATION (this).release ();
104 }
105
106 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
107
108 static void
109 vect_free_slp_tree (slp_tree node)
110 {
111 int i;
112 slp_tree child;
113
114 if (--SLP_TREE_REF_COUNT (node) != 0)
115 return;
116
117 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
118 if (child)
119 vect_free_slp_tree (child);
120
121 delete node;
122 }
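
/* Note (illustrative summary): SLP children can be shared between several
   parents, and re-used across builds via the bst_map cache further below,
   which is why nodes carry SLP_TREE_REF_COUNT; a subtree is only
   physically freed here once its last reference is dropped.  */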
123
124 /* Return a location suitable for dumps related to the SLP instance. */
125
126 dump_user_location_t
127 _slp_instance::location () const
128 {
129 if (root_stmt)
130 return root_stmt->stmt;
131 else
132 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
133 }
134
135
136 /* Free the memory allocated for the SLP instance. */
137
138 void
139 vect_free_slp_instance (slp_instance instance)
140 {
141 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
142 SLP_INSTANCE_LOADS (instance).release ();
143 instance->subgraph_entries.release ();
144 instance->cost_vec.release ();
145 free (instance);
146 }
147
148
149 /* Create an SLP node for SCALAR_STMTS. */
150
151 static slp_tree
152 vect_create_new_slp_node (slp_tree node,
153 vec<stmt_vec_info> scalar_stmts, unsigned nops)
154 {
155 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
156 SLP_TREE_CHILDREN (node).create (nops);
157 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
158 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
159 SLP_TREE_LANES (node) = scalar_stmts.length ();
160 return node;
161 }
162
163 /* Create an SLP node for SCALAR_STMTS. */
164
165 static slp_tree
166 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
167 {
168 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
169 }
170
171 /* Create an SLP node for OPS. */
172
173 static slp_tree
174 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
175 {
176 SLP_TREE_SCALAR_OPS (node) = ops;
177 SLP_TREE_DEF_TYPE (node) = vect_external_def;
178 SLP_TREE_LANES (node) = ops.length ();
179 return node;
180 }
181
182 /* Create an SLP node for OPS. */
183
184 static slp_tree
185 vect_create_new_slp_node (vec<tree> ops)
186 {
187 return vect_create_new_slp_node (new _slp_tree, ops);
188 }
189
190
191 /* This structure is used in creation of an SLP tree. Each instance
192 corresponds to the same operand in a group of scalar stmts in an SLP
193 node. */
194 typedef struct _slp_oprnd_info
195 {
196 /* Def-stmts for the operands. */
197 vec<stmt_vec_info> def_stmts;
198 /* Operands. */
199 vec<tree> ops;
200 /* Information about the first statement, its vector def-type, type, the
201 operand itself in case it's constant, and an indication if it's a pattern
202 stmt. */
203 tree first_op_type;
204 enum vect_def_type first_dt;
205 bool any_pattern;
206 } *slp_oprnd_info;
207
208
209 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
210 operand. */
211 static vec<slp_oprnd_info>
212 vect_create_oprnd_info (int nops, int group_size)
213 {
214 int i;
215 slp_oprnd_info oprnd_info;
216 vec<slp_oprnd_info> oprnds_info;
217
218 oprnds_info.create (nops);
219 for (i = 0; i < nops; i++)
220 {
221 oprnd_info = XNEW (struct _slp_oprnd_info);
222 oprnd_info->def_stmts.create (group_size);
223 oprnd_info->ops.create (group_size);
224 oprnd_info->first_dt = vect_uninitialized_def;
225 oprnd_info->first_op_type = NULL_TREE;
226 oprnd_info->any_pattern = false;
227 oprnds_info.quick_push (oprnd_info);
228 }
229
230 return oprnds_info;
231 }
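
/* For example (a sketch, not taken from any particular testcase): for a
   group of four scalar stmts of the form c[i] = a[i] + b[i], NOPS is 2 and
   the two slp_oprnd_info entries collect the four defs of the a[] loads
   and of the b[] loads respectively, one lane per group member.  */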
232
233
234 /* Free operands info. */
235
236 static void
237 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
238 {
239 int i;
240 slp_oprnd_info oprnd_info;
241
242 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
243 {
244 oprnd_info->def_stmts.release ();
245 oprnd_info->ops.release ();
246 XDELETE (oprnd_info);
247 }
248
249 oprnds_info.release ();
250 }
251
252
253 /* Return true if STMTS contains a pattern statement. */
254
255 static bool
256 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
257 {
258 stmt_vec_info stmt_info;
259 unsigned int i;
260 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
261 if (is_pattern_stmt_p (stmt_info))
262 return true;
263 return false;
264 }
265
266 /* Return true when all lanes in the external or constant NODE have
267 the same value. */
268
269 static bool
270 vect_slp_tree_uniform_p (slp_tree node)
271 {
272 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
273 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
274
275 /* Pre-existing vectors. */
276 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
277 return false;
278
279 unsigned i;
280 tree op, first = NULL_TREE;
281 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
282 if (!first)
283 first = op;
284 else if (!operand_equal_p (first, op, 0))
285 return false;
286
287 return true;
288 }
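
/* E.g. a constant node with scalar ops { 3, 3, 3, 3 } is uniform and can
   later be code-generated as a single splat, while { 3, 3, 3, 4 } is not;
   nodes carrying only pre-existing vector defs (no scalar ops) are never
   considered uniform here.  */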
289
290 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
291 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
292 of the chain. */
293
294 int
295 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
296 stmt_vec_info first_stmt_info)
297 {
298 stmt_vec_info next_stmt_info = first_stmt_info;
299 int result = 0;
300
301 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
302 return -1;
303
304 do
305 {
306 if (next_stmt_info == stmt_info)
307 return result;
308 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
309 if (next_stmt_info)
310 result += DR_GROUP_GAP (next_stmt_info);
311 }
312 while (next_stmt_info);
313
314 return -1;
315 }
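
/* As an illustration, for an interleaving chain built from the accesses
     a[0], a[1], a[3]
   (a[2] is not part of the group) the DR_GROUP_GAP of the third element
   is 2, so the places returned for the three stmts are 0, 1 and 3.  */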
316
317 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
318 using the method implemented by duplicate_and_interleave. Return true
319 if so, returning the number of intermediate vectors in *NVECTORS_OUT
320 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
321 (if nonnull). */
322
323 bool
324 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
325 tree elt_type, unsigned int *nvectors_out,
326 tree *vector_type_out,
327 tree *permutes)
328 {
329 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
330 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
331 return false;
332
333 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
334 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
335 unsigned int nvectors = 1;
336 for (;;)
337 {
338 scalar_int_mode int_mode;
339 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
340 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
341 {
342 /* Get the natural vector type for this SLP group size. */
343 tree int_type = build_nonstandard_integer_type
344 (GET_MODE_BITSIZE (int_mode), 1);
345 tree vector_type
346 = get_vectype_for_scalar_type (vinfo, int_type, count);
347 if (vector_type
348 && VECTOR_MODE_P (TYPE_MODE (vector_type))
349 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
350 GET_MODE_SIZE (base_vector_mode)))
351 {
352 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
353 together into elements of type INT_TYPE and using the result
354 to build NVECTORS vectors. */
355 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
356 vec_perm_builder sel1 (nelts, 2, 3);
357 vec_perm_builder sel2 (nelts, 2, 3);
358 poly_int64 half_nelts = exact_div (nelts, 2);
359 for (unsigned int i = 0; i < 3; ++i)
360 {
361 sel1.quick_push (i);
362 sel1.quick_push (i + nelts);
363 sel2.quick_push (half_nelts + i);
364 sel2.quick_push (half_nelts + i + nelts);
365 }
366 vec_perm_indices indices1 (sel1, 2, nelts);
367 vec_perm_indices indices2 (sel2, 2, nelts);
368 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
369 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
370 {
371 if (nvectors_out)
372 *nvectors_out = nvectors;
373 if (vector_type_out)
374 *vector_type_out = vector_type;
375 if (permutes)
376 {
377 permutes[0] = vect_gen_perm_mask_checked (vector_type,
378 indices1);
379 permutes[1] = vect_gen_perm_mask_checked (vector_type,
380 indices2);
381 }
382 return true;
383 }
384 }
385 }
386 if (!multiple_p (elt_bytes, 2, &elt_bytes))
387 return false;
388 nvectors *= 2;
389 }
390 }
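
/* A rough worked example (assuming a target with 128-bit vectors and the
   required permutes): for COUNT == 4 and ELT_TYPE == int the first
   iteration looks for a 16-byte integer "super-element" so that a single
   vector of such elements covers the whole group; if that mode or its
   permutes are unavailable the loop halves the element size and doubles
   NVECTORS, trying 2 vectors of 8-byte elements, then 4 vectors of the
   original 4-byte elements.  */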
391
392 /* Return true if DTA and DTB match. */
393
394 static bool
395 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
396 {
397 return (dta == dtb
398 || ((dta == vect_external_def || dta == vect_constant_def)
399 && (dtb == vect_external_def || dtb == vect_constant_def)));
400 }
401
402 /* Get the defs for the rhs of STMTS[STMT_NUM] (collect them in
403 OPRNDS_INFO), check that they are of a valid type and that they match
404 the defs of the first stmt of the SLP group (stored in OPRNDS_INFO).
405 This function tries to match stmts by swapping operands of
406 STMTS[STMT_NUM] when possible.  Non-zero SWAP indicates that a swap is
407 required for cond_expr stmts: SWAP is 1 if the stmt is a cond_expr and
408 the operands of its comparison need to be swapped; SWAP is 2 if the
409 stmt is a cond_expr and the code of its comparison needs to be
410 inverted.
411 If there was a fatal error return -1; if the error could be corrected
412 by swapping operands of this node's parent, return 1; if everything is
413 ok return 0. */
414 static int
415 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
416 bool *skip_args,
417 vec<stmt_vec_info> stmts, unsigned stmt_num,
418 vec<slp_oprnd_info> *oprnds_info)
419 {
420 stmt_vec_info stmt_info = stmts[stmt_num];
421 tree oprnd;
422 unsigned int i, number_of_oprnds;
423 enum vect_def_type dt = vect_uninitialized_def;
424 slp_oprnd_info oprnd_info;
425 int first_op_idx = 1;
426 unsigned int commutative_op = -1U;
427 bool first_op_cond = false;
428 bool first = stmt_num == 0;
429
430 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
431 {
432 number_of_oprnds = gimple_call_num_args (stmt);
433 first_op_idx = 3;
434 if (gimple_call_internal_p (stmt))
435 {
436 internal_fn ifn = gimple_call_internal_fn (stmt);
437 commutative_op = first_commutative_argument (ifn);
438
439 /* Masked load, only look at mask. */
440 if (ifn == IFN_MASK_LOAD)
441 {
442 number_of_oprnds = 1;
443 /* Mask operand index. */
444 first_op_idx = 5;
445 }
446 }
447 }
448 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
449 {
450 enum tree_code code = gimple_assign_rhs_code (stmt);
451 number_of_oprnds = gimple_num_ops (stmt) - 1;
452 /* Swapping can only be done for cond_expr if asked to, otherwise we
453 could end up with a different comparison code from the first stmt. */
454 if (code == COND_EXPR
455 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
456 {
457 first_op_cond = true;
458 number_of_oprnds++;
459 }
460 else
461 commutative_op = commutative_tree_code (code) ? 0U : -1U;
462 }
463 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
464 number_of_oprnds = gimple_phi_num_args (stmt);
465 else
466 return -1;
467
468 bool swapped = (swap != 0);
469 bool backedge = false;
470 gcc_assert (!swapped || first_op_cond);
471 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
472 for (i = 0; i < number_of_oprnds; i++)
473 {
474 if (first_op_cond)
475 {
476 /* Map indicating how operands of cond_expr should be swapped. */
477 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
478 int *map = maps[swap];
479
480 if (i < 2)
481 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
482 first_op_idx), map[i]);
483 else
484 oprnd = gimple_op (stmt_info->stmt, map[i]);
485 }
486 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
487 {
488 oprnd = gimple_phi_arg_def (stmt, i);
489 backedge = dominated_by_p (CDI_DOMINATORS,
490 gimple_phi_arg_edge (stmt, i)->src,
491 gimple_bb (stmt_info->stmt));
492 }
493 else
494 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
495 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
496 oprnd = TREE_OPERAND (oprnd, 0);
497
498 oprnd_info = (*oprnds_info)[i];
499
500 stmt_vec_info def_stmt_info;
501 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
502 {
503 if (dump_enabled_p ())
504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
505 "Build SLP failed: can't analyze def for %T\n",
506 oprnd);
507
508 return -1;
509 }
510
511 if (skip_args[i])
512 {
513 oprnd_info->def_stmts.quick_push (NULL);
514 oprnd_info->ops.quick_push (NULL_TREE);
515 oprnd_info->first_dt = vect_uninitialized_def;
516 continue;
517 }
518
519 if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
520 oprnd_info->any_pattern = true;
521
522 oprnd_info->def_stmts.quick_push (def_stmt_info);
523 oprnd_info->ops.quick_push (oprnd);
524
525 /* If there's an extern def on a backedge make sure we can
526 code-generate at the region start.
527 ??? This is another case that could be fixed by adjusting
528 how we split the function but at the moment we'd have conflicting
529 goals there. */
530 if (backedge
531 && dts[i] == vect_external_def
532 && is_a <bb_vec_info> (vinfo)
533 && TREE_CODE (oprnd) == SSA_NAME
534 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
535 && !dominated_by_p (CDI_DOMINATORS,
536 as_a <bb_vec_info> (vinfo)->bbs[0],
537 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
538 {
539 if (dump_enabled_p ())
540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
541 "Build SLP failed: extern def %T only defined "
542 "on backedge\n", oprnd);
543 return -1;
544 }
545
546 if (first)
547 {
548 tree type = TREE_TYPE (oprnd);
549 dt = dts[i];
550 if ((dt == vect_constant_def
551 || dt == vect_external_def)
552 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
553 && (TREE_CODE (type) == BOOLEAN_TYPE
554 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
555 type)))
556 {
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
559 "Build SLP failed: invalid type of def "
560 "for variable-length SLP %T\n", oprnd);
561 return -1;
562 }
563
564 /* For the swapping logic below force vect_reduction_def
565 for the reduction op in a SLP reduction group. */
566 if (!STMT_VINFO_DATA_REF (stmt_info)
567 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
568 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
569 && def_stmt_info)
570 dts[i] = dt = vect_reduction_def;
571
572 /* Check the types of the definition. */
573 switch (dt)
574 {
575 case vect_external_def:
576 case vect_constant_def:
577 case vect_internal_def:
578 case vect_reduction_def:
579 case vect_induction_def:
580 case vect_nested_cycle:
581 break;
582
583 default:
584 /* FORNOW: Not supported. */
585 if (dump_enabled_p ())
586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
587 "Build SLP failed: illegal type of def %T\n",
588 oprnd);
589 return -1;
590 }
591
592 oprnd_info->first_dt = dt;
593 oprnd_info->first_op_type = type;
594 }
595 }
596 if (first)
597 return 0;
598
599 /* Now match the operand definition types to that of the first stmt. */
600 for (i = 0; i < number_of_oprnds;)
601 {
602 if (skip_args[i])
603 {
604 ++i;
605 continue;
606 }
607
608 oprnd_info = (*oprnds_info)[i];
609 dt = dts[i];
610 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
611 oprnd = oprnd_info->ops[stmt_num];
612 tree type = TREE_TYPE (oprnd);
613
614 if (!types_compatible_p (oprnd_info->first_op_type, type))
615 {
616 if (dump_enabled_p ())
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
618 "Build SLP failed: different operand types\n");
619 return 1;
620 }
621
622 /* Not first stmt of the group, check that the def-stmt/s match
623 the def-stmt/s of the first stmt. Allow different definition
624 types for reduction chains: the first stmt must be a
625 vect_reduction_def (a phi node), and the rest
626 must end up in the reduction chain. */
627 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
628 && !(oprnd_info->first_dt == vect_reduction_def
629 && !STMT_VINFO_DATA_REF (stmt_info)
630 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
631 && def_stmt_info
632 && !STMT_VINFO_DATA_REF (def_stmt_info)
633 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
634 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
635 || (!STMT_VINFO_DATA_REF (stmt_info)
636 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
637 && ((!def_stmt_info
638 || STMT_VINFO_DATA_REF (def_stmt_info)
639 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
640 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
641 != (oprnd_info->first_dt != vect_reduction_def))))
642 {
643 /* Try swapping operands if we got a mismatch. For BB
644 vectorization only in case it will clearly improve things. */
645 if (i == commutative_op && !swapped
646 && (!is_a <bb_vec_info> (vinfo)
647 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
648 dts[i+1])
649 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
650 || vect_def_types_match
651 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
652 {
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location,
655 "trying swapped operands\n");
656 std::swap (dts[i], dts[i+1]);
657 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
658 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
659 std::swap ((*oprnds_info)[i]->ops[stmt_num],
660 (*oprnds_info)[i+1]->ops[stmt_num]);
661 swapped = true;
662 continue;
663 }
664
665 if (is_a <bb_vec_info> (vinfo)
666 && !oprnd_info->any_pattern)
667 {
668 /* Now for commutative ops we should see whether we can make
669 the other operand match. */
670 if (dump_enabled_p ())
671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
672 "treating operand as external\n");
673 oprnd_info->first_dt = dt = vect_external_def;
674 }
675 else
676 {
677 if (dump_enabled_p ())
678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
679 "Build SLP failed: different types\n");
680 return 1;
681 }
682 }
683
684 /* Make sure to demote the overall operand to external. */
685 if (dt == vect_external_def)
686 oprnd_info->first_dt = vect_external_def;
687 /* For a SLP reduction chain we want to duplicate the reduction to
688 each of the chain members. That gets us a sane SLP graph (still
689 the stmts are not 100% correct wrt the initial values). */
690 else if ((dt == vect_internal_def
691 || dt == vect_reduction_def)
692 && oprnd_info->first_dt == vect_reduction_def
693 && !STMT_VINFO_DATA_REF (stmt_info)
694 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
695 && !STMT_VINFO_DATA_REF (def_stmt_info)
696 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
697 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
698 {
699 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
700 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
701 }
702
703 ++i;
704 }
705
706 /* Swap operands. */
707 if (swapped)
708 {
709 if (dump_enabled_p ())
710 dump_printf_loc (MSG_NOTE, vect_location,
711 "swapped operands to match def types in %G",
712 stmt_info->stmt);
713 }
714
715 return 0;
716 }
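
/* A small illustration of the commutative-swap logic above: given the
   two-lane group
     x0 = a0 + b0;
     x1 = b1 + a1;
   the defs in operand position 0 (a0, b1) would not match in def type if,
   say, the a's are loads and the b's are external, but since PLUS_EXPR is
   commutative the second lane's operands can be swapped so that both lanes
   agree.  Whether the swap is actually tried also depends on the BB- vs.
   loop-vectorization condition checked above.  */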
717
718 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
719 Return true if we can, meaning that this choice doesn't conflict with
720 existing SLP nodes that use STMT_INFO. */
721
722 bool
723 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
724 {
725 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
726 if (old_vectype)
727 return useless_type_conversion_p (vectype, old_vectype);
728
729 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
730 {
731 /* We maintain the invariant that if any statement in the group is
732 used, all other members of the group have the same vector type. */
733 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
734 stmt_vec_info member_info = first_info;
735 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
736 if (is_pattern_stmt_p (member_info)
737 && !useless_type_conversion_p (vectype,
738 STMT_VINFO_VECTYPE (member_info)))
739 break;
740
741 if (!member_info)
742 {
743 for (member_info = first_info; member_info;
744 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
745 STMT_VINFO_VECTYPE (member_info) = vectype;
746 return true;
747 }
748 }
749 else if (!is_pattern_stmt_p (stmt_info))
750 {
751 STMT_VINFO_VECTYPE (stmt_info) = vectype;
752 return true;
753 }
754
755 if (dump_enabled_p ())
756 {
757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
758 "Build SLP failed: incompatible vector"
759 " types for: %G", stmt_info->stmt);
760 dump_printf_loc (MSG_NOTE, vect_location,
761 " old vector type: %T\n", old_vectype);
762 dump_printf_loc (MSG_NOTE, vect_location,
763 " new vector type: %T\n", vectype);
764 }
765 return false;
766 }
767
768 /* Return true if call statements CALL1 and CALL2 are similar enough
769 to be combined into the same SLP group. */
770
771 static bool
772 compatible_calls_p (gcall *call1, gcall *call2)
773 {
774 unsigned int nargs = gimple_call_num_args (call1);
775 if (nargs != gimple_call_num_args (call2))
776 return false;
777
778 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
779 return false;
780
781 if (gimple_call_internal_p (call1))
782 {
783 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
784 TREE_TYPE (gimple_call_lhs (call2))))
785 return false;
786 for (unsigned int i = 0; i < nargs; ++i)
787 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
788 TREE_TYPE (gimple_call_arg (call2, i))))
789 return false;
790 }
791 else
792 {
793 if (!operand_equal_p (gimple_call_fn (call1),
794 gimple_call_fn (call2), 0))
795 return false;
796
797 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
798 return false;
799 }
800 return true;
801 }
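
/* E.g. the two lanes
     y0 = sqrtf (x0);
     y1 = sqrtf (x1);
   agree on the combined function, the number of arguments and the
   argument/return types and so can form one SLP group, whereas mixing
   sqrtf and sqrt calls would be rejected because their combined
   functions differ.  */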
802
803 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
804 caller's attempt to find the vector type in STMT_INFO with the narrowest
805 element type. Return true if VECTYPE is nonnull and if it is valid
806 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
807 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
808 vect_build_slp_tree. */
809
810 static bool
811 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
812 unsigned int group_size,
813 tree vectype, poly_uint64 *max_nunits)
814 {
815 if (!vectype)
816 {
817 if (dump_enabled_p ())
818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
819 "Build SLP failed: unsupported data-type in %G\n",
820 stmt_info->stmt);
821 /* Fatal mismatch. */
822 return false;
823 }
824
825 /* If populating the vector type requires unrolling then fail
826 before adjusting *max_nunits for basic-block vectorization. */
827 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
828 unsigned HOST_WIDE_INT const_nunits;
829 if (is_a <bb_vec_info> (vinfo)
830 && (!nunits.is_constant (&const_nunits)
831 || const_nunits > group_size))
832 {
833 if (dump_enabled_p ())
834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
835 "Build SLP failed: unrolling required "
836 "in basic block SLP\n");
837 /* Fatal mismatch. */
838 return false;
839 }
840
841 /* In case of multiple types we need to detect the smallest type. */
842 vect_update_max_nunits (max_nunits, vectype);
843 return true;
844 }
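
/* For instance, in basic-block SLP a two-element group cannot use a
   four-lane vector type: filling the remaining lanes would require
   unrolling, which only loop vectorization can do, so the build fails
   here rather than recording such a vector type in *MAX_NUNITS.  */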
845
846 /* Verify whether the scalar stmts STMTS are isomorphic, whether they
847 require a data permutation, or whether they use unsupported kinds of
848 operation.  Return true if they are isomorphic, otherwise return false
849 and indicate in *MATCHES which stmts are not isomorphic to the first
850 one.  If MATCHES[0] is false then this indicates the comparison could
851 not be carried out or the stmts will never be vectorized by SLP.
852 
853 Note COND_EXPR is possibly isomorphic to another one after swapping its
854 operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
855 the first stmt by swapping the two operands of the comparison; set
856 SWAP[i] to 2 if stmt I is isomorphic to the first stmt by inverting the
857 code of the comparison.  Taking A1 >= B1 ? X1 : Y1 as an example, it can
858 be swapped to (B1 <= A1 ? X1 : Y1) or inverted to (A1 < B1) ? Y1 : X1. */
859
860 static bool
861 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
862 vec<stmt_vec_info> stmts, unsigned int group_size,
863 poly_uint64 *max_nunits, bool *matches,
864 bool *two_operators, tree *node_vectype)
865 {
866 unsigned int i;
867 stmt_vec_info first_stmt_info = stmts[0];
868 enum tree_code first_stmt_code = ERROR_MARK;
869 enum tree_code alt_stmt_code = ERROR_MARK;
870 enum tree_code rhs_code = ERROR_MARK;
871 enum tree_code first_cond_code = ERROR_MARK;
872 tree lhs;
873 bool need_same_oprnds = false;
874 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
875 optab optab;
876 int icode;
877 machine_mode optab_op2_mode;
878 machine_mode vec_mode;
879 stmt_vec_info first_load = NULL, prev_first_load = NULL;
880 bool first_stmt_load_p = false, load_p = false;
881 bool first_stmt_phi_p = false, phi_p = false;
882
883 /* For every stmt in NODE find its def stmt/s. */
884 stmt_vec_info stmt_info;
885 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
886 {
887 gimple *stmt = stmt_info->stmt;
888 swap[i] = 0;
889 matches[i] = false;
890
891 if (dump_enabled_p ())
892 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
893
894 /* Fail to vectorize statements marked as unvectorizable, that can
895 throw, or that have volatile operands. */
896 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
897 || stmt_can_throw_internal (cfun, stmt)
898 || gimple_has_volatile_ops (stmt))
899 {
900 if (dump_enabled_p ())
901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
902 "Build SLP failed: unvectorizable statement %G",
903 stmt);
904 /* ??? For BB vectorization we want to commute operands in a way
905 that shuffles all unvectorizable defs into one operand and keeps
906 the other one vectorizable.  The following doesn't reliably work
907 for this though, but it's the easiest we can do here. */
908 if (is_a <bb_vec_info> (vinfo) && i != 0)
909 continue;
910 /* Fatal mismatch. */
911 matches[0] = false;
912 return false;
913 }
914
915 lhs = gimple_get_lhs (stmt);
916 if (lhs == NULL_TREE)
917 {
918 if (dump_enabled_p ())
919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
920 "Build SLP failed: not GIMPLE_ASSIGN nor "
921 "GIMPLE_CALL %G", stmt);
922 if (is_a <bb_vec_info> (vinfo) && i != 0)
923 continue;
924 /* Fatal mismatch. */
925 matches[0] = false;
926 return false;
927 }
928
929 tree nunits_vectype;
930 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
931 &nunits_vectype, group_size)
932 || (nunits_vectype
933 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
934 nunits_vectype, max_nunits)))
935 {
936 if (is_a <bb_vec_info> (vinfo) && i != 0)
937 continue;
938 /* Fatal mismatch. */
939 matches[0] = false;
940 return false;
941 }
942
943 gcc_assert (vectype);
944
945 gcall *call_stmt = dyn_cast <gcall *> (stmt);
946 if (call_stmt)
947 {
948 rhs_code = CALL_EXPR;
949
950 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
951 load_p = true;
952 else if ((gimple_call_internal_p (call_stmt)
953 && (!vectorizable_internal_fn_p
954 (gimple_call_internal_fn (call_stmt))))
955 || gimple_call_tail_p (call_stmt)
956 || gimple_call_noreturn_p (call_stmt)
957 || !gimple_call_nothrow_p (call_stmt)
958 || gimple_call_chain (call_stmt))
959 {
960 if (dump_enabled_p ())
961 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
962 "Build SLP failed: unsupported call type %G",
963 call_stmt);
964 if (is_a <bb_vec_info> (vinfo) && i != 0)
965 continue;
966 /* Fatal mismatch. */
967 matches[0] = false;
968 return false;
969 }
970 }
971 else if (gimple_code (stmt) == GIMPLE_PHI)
972 {
973 rhs_code = ERROR_MARK;
974 phi_p = true;
975 }
976 else
977 {
978 rhs_code = gimple_assign_rhs_code (stmt);
979 load_p = gimple_vuse (stmt);
980 }
981
982 /* Check the operation. */
983 if (i == 0)
984 {
985 *node_vectype = vectype;
986 first_stmt_code = rhs_code;
987 first_stmt_load_p = load_p;
988 first_stmt_phi_p = phi_p;
989
990 /* Shift arguments should be equal in all the packed stmts for a
991 vector shift with scalar shift operand. */
992 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
993 || rhs_code == LROTATE_EXPR
994 || rhs_code == RROTATE_EXPR)
995 {
996 vec_mode = TYPE_MODE (vectype);
997
998 /* First see if we have a vector/vector shift. */
999 optab = optab_for_tree_code (rhs_code, vectype,
1000 optab_vector);
1001
1002 if (!optab
1003 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1004 {
1005 /* No vector/vector shift, try for a vector/scalar shift. */
1006 optab = optab_for_tree_code (rhs_code, vectype,
1007 optab_scalar);
1008
1009 if (!optab)
1010 {
1011 if (dump_enabled_p ())
1012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1013 "Build SLP failed: no optab.\n");
1014 if (is_a <bb_vec_info> (vinfo) && i != 0)
1015 continue;
1016 /* Fatal mismatch. */
1017 matches[0] = false;
1018 return false;
1019 }
1020 icode = (int) optab_handler (optab, vec_mode);
1021 if (icode == CODE_FOR_nothing)
1022 {
1023 if (dump_enabled_p ())
1024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1025 "Build SLP failed: "
1026 "op not supported by target.\n");
1027 if (is_a <bb_vec_info> (vinfo) && i != 0)
1028 continue;
1029 /* Fatal mismatch. */
1030 matches[0] = false;
1031 return false;
1032 }
1033 optab_op2_mode = insn_data[icode].operand[2].mode;
1034 if (!VECTOR_MODE_P (optab_op2_mode))
1035 {
1036 need_same_oprnds = true;
1037 first_op1 = gimple_assign_rhs2 (stmt);
1038 }
1039 }
1040 }
1041 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1042 {
1043 need_same_oprnds = true;
1044 first_op1 = gimple_assign_rhs2 (stmt);
1045 }
1046 else if (!load_p
1047 && rhs_code == BIT_FIELD_REF)
1048 {
1049 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1050 if (TREE_CODE (vec) != SSA_NAME
1051 || !types_compatible_p (vectype, TREE_TYPE (vec)))
1052 {
1053 if (is_a <bb_vec_info> (vinfo) && i != 0)
1054 continue;
1055 if (dump_enabled_p ())
1056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1057 "Build SLP failed: "
1058 "BIT_FIELD_REF not supported\n");
1059 /* Fatal mismatch. */
1060 matches[0] = false;
1061 return false;
1062 }
1063 }
1064 else if (call_stmt
1065 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1066 {
1067 need_same_oprnds = true;
1068 first_op1 = gimple_call_arg (call_stmt, 1);
1069 }
1070 }
1071 else
1072 {
1073 if (first_stmt_code != rhs_code
1074 && alt_stmt_code == ERROR_MARK)
1075 alt_stmt_code = rhs_code;
1076 if ((first_stmt_code != rhs_code
1077 && (first_stmt_code != IMAGPART_EXPR
1078 || rhs_code != REALPART_EXPR)
1079 && (first_stmt_code != REALPART_EXPR
1080 || rhs_code != IMAGPART_EXPR)
1081 /* Handle mismatches in plus/minus by computing both
1082 and merging the results. */
1083 && !((first_stmt_code == PLUS_EXPR
1084 || first_stmt_code == MINUS_EXPR)
1085 && (alt_stmt_code == PLUS_EXPR
1086 || alt_stmt_code == MINUS_EXPR)
1087 && rhs_code == alt_stmt_code)
1088 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1089 && (first_stmt_code == ARRAY_REF
1090 || first_stmt_code == BIT_FIELD_REF
1091 || first_stmt_code == INDIRECT_REF
1092 || first_stmt_code == COMPONENT_REF
1093 || first_stmt_code == MEM_REF)))
1094 || first_stmt_load_p != load_p
1095 || first_stmt_phi_p != phi_p)
1096 {
1097 if (dump_enabled_p ())
1098 {
1099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1100 "Build SLP failed: different operation "
1101 "in stmt %G", stmt);
1102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1103 "original stmt %G", first_stmt_info->stmt);
1104 }
1105 /* Mismatch. */
1106 continue;
1107 }
1108
1109 if (need_same_oprnds)
1110 {
1111 tree other_op1 = (call_stmt
1112 ? gimple_call_arg (call_stmt, 1)
1113 : gimple_assign_rhs2 (stmt));
1114 if (!operand_equal_p (first_op1, other_op1, 0))
1115 {
1116 if (dump_enabled_p ())
1117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1118 "Build SLP failed: different shift "
1119 "arguments in %G", stmt);
1120 /* Mismatch. */
1121 continue;
1122 }
1123 }
1124 if (!load_p
1125 && first_stmt_code == BIT_FIELD_REF
1126 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1127 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1128 {
1129 if (dump_enabled_p ())
1130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1131 "Build SLP failed: different BIT_FIELD_REF "
1132 "arguments in %G", stmt);
1133 /* Mismatch. */
1134 continue;
1135 }
1136
1137 if (!load_p && rhs_code == CALL_EXPR)
1138 {
1139 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1140 as_a <gcall *> (stmt)))
1141 {
1142 if (dump_enabled_p ())
1143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1144 "Build SLP failed: different calls in %G",
1145 stmt);
1146 /* Mismatch. */
1147 continue;
1148 }
1149 }
1150
1151 if (phi_p
1152 && (gimple_bb (first_stmt_info->stmt)
1153 != gimple_bb (stmt_info->stmt)))
1154 {
1155 if (dump_enabled_p ())
1156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1157 "Build SLP failed: different BB for PHI "
1158 "in %G", stmt);
1159 /* Mismatch. */
1160 continue;
1161 }
1162
1163 if (!types_compatible_p (vectype, *node_vectype))
1164 {
1165 if (dump_enabled_p ())
1166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1167 "Build SLP failed: different vector type "
1168 "in %G", stmt);
1169 /* Mismatch. */
1170 continue;
1171 }
1172 }
1173
1174 /* Grouped store or load. */
1175 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1176 {
1177 if (REFERENCE_CLASS_P (lhs))
1178 {
1179 /* Store. */
1180 ;
1181 }
1182 else
1183 {
1184 /* Load. */
1185 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1186 if (prev_first_load)
1187 {
1188 /* Check that there are no loads from different interleaving
1189 chains in the same node. */
1190 if (prev_first_load != first_load)
1191 {
1192 if (dump_enabled_p ())
1193 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1194 vect_location,
1195 "Build SLP failed: different "
1196 "interleaving chains in one node %G",
1197 stmt);
1198 /* Mismatch. */
1199 continue;
1200 }
1201 }
1202 else
1203 prev_first_load = first_load;
1204 }
1205 } /* Grouped access. */
1206 else
1207 {
1208 if (load_p)
1209 {
1210 /* Not grouped load. */
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1213 "Build SLP failed: not grouped load %G", stmt);
1214
1215 /* FORNOW: Not grouped loads are not supported. */
1216 if (is_a <bb_vec_info> (vinfo) && i != 0)
1217 continue;
1218 /* Fatal mismatch. */
1219 matches[0] = false;
1220 return false;
1221 }
1222
1223 /* Not memory operation. */
1224 if (!phi_p
1225 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1226 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1227 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1228 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1229 && rhs_code != VIEW_CONVERT_EXPR
1230 && rhs_code != CALL_EXPR
1231 && rhs_code != BIT_FIELD_REF)
1232 {
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "Build SLP failed: operation unsupported %G",
1236 stmt);
1237 if (is_a <bb_vec_info> (vinfo) && i != 0)
1238 continue;
1239 /* Fatal mismatch. */
1240 matches[0] = false;
1241 return false;
1242 }
1243
1244 if (rhs_code == COND_EXPR)
1245 {
1246 tree cond_expr = gimple_assign_rhs1 (stmt);
1247 enum tree_code cond_code = TREE_CODE (cond_expr);
1248 enum tree_code swap_code = ERROR_MARK;
1249 enum tree_code invert_code = ERROR_MARK;
1250
1251 if (i == 0)
1252 first_cond_code = TREE_CODE (cond_expr);
1253 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1254 {
1255 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1256 swap_code = swap_tree_comparison (cond_code);
1257 invert_code = invert_tree_comparison (cond_code, honor_nans);
1258 }
1259
1260 if (first_cond_code == cond_code)
1261 ;
1262 /* Isomorphic can be achieved by swapping. */
1263 else if (first_cond_code == swap_code)
1264 swap[i] = 1;
1265 /* Isomorphic can be achieved by inverting. */
1266 else if (first_cond_code == invert_code)
1267 swap[i] = 2;
1268 else
1269 {
1270 if (dump_enabled_p ())
1271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1272 "Build SLP failed: different"
1273 " operation %G", stmt);
1274 /* Mismatch. */
1275 continue;
1276 }
1277 }
1278 }
1279
1280 matches[i] = true;
1281 }
1282
1283 for (i = 0; i < group_size; ++i)
1284 if (!matches[i])
1285 return false;
1286
1287 /* If we allowed a two-operation SLP node verify the target can cope
1288 with the permute we are going to use. */
1289 if (alt_stmt_code != ERROR_MARK
1290 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1291 {
1292 *two_operators = true;
1293 }
1294
1295 return true;
1296 }
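
/* A typical group accepted via the two-operator path above is the
   add/sub pattern
     c0 = a0 + b0;
     c1 = a1 - b1;
     c2 = a2 + b2;
     c3 = a3 - b3;
   with PLUS_EXPR as FIRST_STMT_CODE and MINUS_EXPR as ALT_STMT_CODE;
   both operations are later emitted and their lanes blended through a
   VEC_PERM_EXPR node (see the two_operators handling in
   vect_build_slp_tree_2).  */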
1297
1298 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1299 Note we never remove apart from at destruction time so we do not
1300 need a special value for deleted that differs from empty. */
1301 struct bst_traits
1302 {
1303 typedef vec <stmt_vec_info> value_type;
1304 typedef vec <stmt_vec_info> compare_type;
1305 static inline hashval_t hash (value_type);
1306 static inline bool equal (value_type existing, value_type candidate);
1307 static inline bool is_empty (value_type x) { return !x.exists (); }
1308 static inline bool is_deleted (value_type x) { return !x.exists (); }
1309 static const bool empty_zero_p = true;
1310 static inline void mark_empty (value_type &x) { x.release (); }
1311 static inline void mark_deleted (value_type &x) { x.release (); }
1312 static inline void remove (value_type &x) { x.release (); }
1313 };
1314 inline hashval_t
1315 bst_traits::hash (value_type x)
1316 {
1317 inchash::hash h;
1318 for (unsigned i = 0; i < x.length (); ++i)
1319 h.add_int (gimple_uid (x[i]->stmt));
1320 return h.end ();
1321 }
1322 inline bool
1323 bst_traits::equal (value_type existing, value_type candidate)
1324 {
1325 if (existing.length () != candidate.length ())
1326 return false;
1327 for (unsigned i = 0; i < existing.length (); ++i)
1328 if (existing[i] != candidate[i])
1329 return false;
1330 return true;
1331 }
1332
1333 typedef hash_map <vec <gimple *>, slp_tree,
1334 simple_hashmap_traits <bst_traits, slp_tree> >
1335 scalar_stmts_to_slp_tree_map_t;
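
/* The map above implements CSE of SLP nodes during discovery: the key is
   the vector of scalar stmts (hashed via their gimple_uid), the value the
   node built for them, with NULL recording a failed build.  E.g. when two
   store groups share the computation
     t0 = a0 + b0;  t1 = a1 + b1;
   the second lookup re-uses the already built node and merely bumps its
   reference count instead of re-running the recursive build.  */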
1336
1337 static slp_tree
1338 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1339 vec<stmt_vec_info> stmts, unsigned int group_size,
1340 poly_uint64 *max_nunits,
1341 bool *matches, unsigned *npermutes, unsigned *tree_size,
1342 scalar_stmts_to_slp_tree_map_t *bst_map);
1343
1344 static slp_tree
1345 vect_build_slp_tree (vec_info *vinfo,
1346 vec<stmt_vec_info> stmts, unsigned int group_size,
1347 poly_uint64 *max_nunits,
1348 bool *matches, unsigned *npermutes, unsigned *tree_size,
1349 scalar_stmts_to_slp_tree_map_t *bst_map)
1350 {
1351 if (slp_tree *leader = bst_map->get (stmts))
1352 {
1353 if (dump_enabled_p ())
1354 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1355 *leader ? "" : "failed ", *leader);
1356 if (*leader)
1357 {
1358 SLP_TREE_REF_COUNT (*leader)++;
1359 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1360 }
1361 return *leader;
1362 }
1363
1364 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1365 so we can pick up backedge destinations during discovery. */
1366 slp_tree res = new _slp_tree;
1367 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1368 SLP_TREE_SCALAR_STMTS (res) = stmts;
1369 bst_map->put (stmts.copy (), res);
1370
1371 poly_uint64 this_max_nunits = 1;
1372 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1373 &this_max_nunits,
1374 matches, npermutes, tree_size, bst_map);
1375 if (!res_)
1376 {
1377 bool existed_p = bst_map->put (stmts, NULL);
1378 gcc_assert (existed_p);
1379 /* Mark the node invalid so we can detect those when still in use
1380 as backedge destinations. */
1381 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1382 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1383 vect_free_slp_tree (res);
1384 }
1385 else
1386 {
1387 gcc_assert (res_ == res);
1388 res->max_nunits = this_max_nunits;
1389 vect_update_max_nunits (max_nunits, this_max_nunits);
1390 /* Keep a reference for the bst_map use. */
1391 SLP_TREE_REF_COUNT (res)++;
1392 }
1393 return res_;
1394 }
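
/* A sketch of the caching protocol used above: the stub node is entered
   into BST_MAP before recursing, so that a cycle (e.g. a reduction PHI
   whose backedge value is ultimately computed from the PHI itself) finds
   the in-construction node and records it as a graph backedge instead of
   recursing forever; on failure the map entry is overwritten with NULL
   and the stub is marked vect_uninitialized_def so that remaining uses of
   it as a backedge destination can be detected.  */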
1395
1396 /* Recursively build an SLP tree starting from NODE.
1397 Fail (and return NULL) if def-stmts are not isomorphic, require data
1398 permutation or are of unsupported types of operation.  Otherwise
1399 fill in NODE and return it.
1400 *MATCHES indicates which scalar stmts (if any) failed to match the
1401 first one. */
1402
1403 static slp_tree
1404 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1405 vec<stmt_vec_info> stmts, unsigned int group_size,
1406 poly_uint64 *max_nunits,
1407 bool *matches, unsigned *npermutes, unsigned *tree_size,
1408 scalar_stmts_to_slp_tree_map_t *bst_map)
1409 {
1410 unsigned nops, i, this_tree_size = 0;
1411 poly_uint64 this_max_nunits = *max_nunits;
1412
1413 matches[0] = false;
1414
1415 stmt_vec_info stmt_info = stmts[0];
1416 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1417 nops = gimple_call_num_args (stmt);
1418 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1419 {
1420 nops = gimple_num_ops (stmt) - 1;
1421 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1422 nops++;
1423 }
1424 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1425 nops = gimple_phi_num_args (phi);
1426 else
1427 return NULL;
1428
1429 /* If the SLP node is a PHI (induction or reduction), terminate
1430 the recursion. */
1431 bool *skip_args = XALLOCAVEC (bool, nops);
1432 memset (skip_args, 0, sizeof (bool) * nops);
1433 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1434 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1435 {
1436 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1437 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1438 group_size);
1439 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1440 max_nunits))
1441 return NULL;
1442
1443 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1444 if (def_type == vect_induction_def)
1445 {
1446 /* Induction PHIs are not cycles but walk the initial
1447 value.  Only for inner loops though; for outer loops
1448 we need to pick up the value from the actual PHIs
1449 to more easily support peeling and epilogue vectorization. */
1450 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1451 if (!nested_in_vect_loop_p (loop, stmt_info))
1452 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1453 else
1454 loop = loop->inner;
1455 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1456 }
1457 else if (def_type == vect_reduction_def
1458 || def_type == vect_double_reduction_def
1459 || def_type == vect_nested_cycle)
1460 {
1461 /* Else def types have to match. */
1462 stmt_vec_info other_info;
1463 bool all_same = true;
1464 FOR_EACH_VEC_ELT (stmts, i, other_info)
1465 {
1466 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1467 return NULL;
1468 if (other_info != stmt_info)
1469 all_same = false;
1470 }
1471 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1472 /* Reduction initial values are not explicitly represented. */
1473 if (!nested_in_vect_loop_p (loop, stmt_info))
1474 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1475 /* Reduction chain backedge defs are filled manually.
1476 ??? Need a better way to identify a SLP reduction chain PHI.
1477 Or a better overall way to SLP match those. */
1478 if (all_same && def_type == vect_reduction_def)
1479 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1480 }
1481 else if (def_type != vect_internal_def)
1482 return NULL;
1483 }
1484
1485
1486 bool two_operators = false;
1487 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1488 tree vectype = NULL_TREE;
1489 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1490 &this_max_nunits, matches, &two_operators,
1491 &vectype))
1492 return NULL;
1493
1494 /* If the SLP node is a load, terminate the recursion unless masked. */
1495 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1496 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1497 {
1498 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1499 {
1500 /* Masked load. */
1501 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1502 nops = 1;
1503 }
1504 else
1505 {
1506 *max_nunits = this_max_nunits;
1507 (*tree_size)++;
1508 node = vect_create_new_slp_node (node, stmts, 0);
1509 SLP_TREE_VECTYPE (node) = vectype;
1510 /* And compute the load permutation. Whether it is actually
1511 a permutation depends on the unrolling factor which is
1512 decided later. */
1513 vec<unsigned> load_permutation;
1514 int j;
1515 stmt_vec_info load_info;
1516 load_permutation.create (group_size);
1517 stmt_vec_info first_stmt_info
1518 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1519 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1520 {
1521 int load_place = vect_get_place_in_interleaving_chain
1522 (load_info, first_stmt_info);
1523 gcc_assert (load_place != -1);
1524 load_permutation.safe_push (load_place);
1525 }
1526 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1527 return node;
1528 }
1529 }
1530 else if (gimple_assign_single_p (stmt_info->stmt)
1531 && !gimple_vuse (stmt_info->stmt)
1532 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1533 {
1534 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1535 the same SSA name vector of a compatible type to vectype. */
1536 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1537 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1538 stmt_vec_info estmt_info;
1539 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1540 {
1541 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1542 tree bfref = gimple_assign_rhs1 (estmt);
1543 HOST_WIDE_INT lane;
1544 if (!known_eq (bit_field_size (bfref),
1545 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1546 || !constant_multiple_p (bit_field_offset (bfref),
1547 bit_field_size (bfref), &lane))
1548 {
1549 lperm.release ();
1550 return NULL;
1551 }
1552 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1553 }
1554 slp_tree vnode = vect_create_new_slp_node (vNULL);
1555 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1556 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1557 /* We are always building a permutation node even if it is an identity
1558 permute to shield the rest of the vectorizer from the odd node
1559 representing an actual vector without any scalar ops.
1560 ??? We could hide it completely by making the permute node
1561 external? */
1562 node = vect_create_new_slp_node (node, stmts, 1);
1563 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1564 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1565 SLP_TREE_VECTYPE (node) = vectype;
1566 SLP_TREE_CHILDREN (node).quick_push (vnode);
1567 return node;
1568 }
1569
1570 /* Get at the operands, verifying they are compatible. */
1571 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1572 slp_oprnd_info oprnd_info;
1573 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1574 {
1575 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
1576 stmts, i, &oprnds_info);
1577 if (res != 0)
1578 matches[(res == -1) ? 0 : i] = false;
1579 if (!matches[0])
1580 break;
1581 }
1582 for (i = 0; i < group_size; ++i)
1583 if (!matches[i])
1584 {
1585 vect_free_oprnd_info (oprnds_info);
1586 return NULL;
1587 }
1588 swap = NULL;
1589
1590 auto_vec<slp_tree, 4> children;
1591
1592 stmt_info = stmts[0];
1593
1594 /* Create SLP_TREE nodes for the definition node/s. */
1595 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1596 {
1597 slp_tree child;
1598 unsigned int j;
1599
1600 /* We're skipping certain operands from processing, for example
1601 outer loop reduction initial defs. */
1602 if (skip_args[i])
1603 {
1604 children.safe_push (NULL);
1605 continue;
1606 }
1607
1608 if (oprnd_info->first_dt == vect_uninitialized_def)
1609 {
1610 /* COND_EXPRs may have one operand too many here if the condition
1611 is an SSA name. */
1612 gcc_assert (i == 3 && nops == 4);
1613 continue;
1614 }
1615
1616 if (is_a <bb_vec_info> (vinfo)
1617 && oprnd_info->first_dt == vect_internal_def
1618 && !oprnd_info->any_pattern)
1619 {
1620 /* For BB vectorization, if all defs are the same do not
1621 bother to continue the build along the single-lane
1622 graph but use a splat of the scalar value. */
1623 stmt_vec_info first_def = oprnd_info->def_stmts[0];
1624 for (j = 1; j < group_size; ++j)
1625 if (oprnd_info->def_stmts[j] != first_def)
1626 break;
1627 if (j == group_size
1628 /* But avoid doing this for loads where we may be
1629 able to CSE things, unless the stmt is not
1630 vectorizable. */
1631 && (!STMT_VINFO_VECTORIZABLE (first_def)
1632 || !gimple_vuse (first_def->stmt)))
1633 {
1634 if (dump_enabled_p ())
1635 dump_printf_loc (MSG_NOTE, vect_location,
1636 "Using a splat of the uniform operand\n");
1637 oprnd_info->first_dt = vect_external_def;
1638 }
1639 }
1640
1641 if (oprnd_info->first_dt == vect_external_def
1642 || oprnd_info->first_dt == vect_constant_def)
1643 {
1644 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1645 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1646 oprnd_info->ops = vNULL;
1647 children.safe_push (invnode);
1648 continue;
1649 }
1650
1651 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1652 group_size, &this_max_nunits,
1653 matches, npermutes,
1654 &this_tree_size, bst_map)) != NULL)
1655 {
1656 oprnd_info->def_stmts = vNULL;
1657 children.safe_push (child);
1658 continue;
1659 }
1660
1661 /* If the SLP build for operand zero failed and operand zero
1662 and one can be commuted try that for the scalar stmts
1663 that failed the match. */
1664 if (i == 0
1665 /* A first scalar stmt mismatch signals a fatal mismatch. */
1666 && matches[0]
1667 /* ??? For COND_EXPRs we can swap the comparison operands
1668 as well as the arms under some constraints. */
1669 && nops == 2
1670 && oprnds_info[1]->first_dt == vect_internal_def
1671 && is_gimple_assign (stmt_info->stmt)
1672 /* Swapping operands for reductions breaks assumptions later on. */
1673 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1674 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
1675 /* Do so only if the number of unsuccessful permutes was not more
1676 than a cut-off as re-trying the recursive match on
1677 possibly each level of the tree would expose exponential
1678 behavior. */
1679 && *npermutes < 4)
1680 {
1681 /* See whether we can swap the matching or the non-matching
1682 stmt operands. */
1683 bool swap_not_matching = true;
1684 do
1685 {
1686 for (j = 0; j < group_size; ++j)
1687 {
1688 if (matches[j] != !swap_not_matching)
1689 continue;
1690 stmt_vec_info stmt_info = stmts[j];
1691 /* Verify if we can swap operands of this stmt. */
1692 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1693 if (!stmt
1694 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1695 {
1696 if (!swap_not_matching)
1697 goto fail;
1698 swap_not_matching = false;
1699 break;
1700 }
1701 }
1702 }
1703 while (j != group_size);
1704
1705 /* Swap mismatched definition stmts. */
1706 if (dump_enabled_p ())
1707 dump_printf_loc (MSG_NOTE, vect_location,
1708 "Re-trying with swapped operands of stmts ");
1709 for (j = 0; j < group_size; ++j)
1710 if (matches[j] == !swap_not_matching)
1711 {
1712 std::swap (oprnds_info[0]->def_stmts[j],
1713 oprnds_info[1]->def_stmts[j]);
1714 std::swap (oprnds_info[0]->ops[j],
1715 oprnds_info[1]->ops[j]);
1716 if (dump_enabled_p ())
1717 dump_printf (MSG_NOTE, "%d ", j);
1718 }
1719 if (dump_enabled_p ())
1720 dump_printf (MSG_NOTE, "\n");
1721 /* And try again with scratch 'matches' ... */
1722 bool *tem = XALLOCAVEC (bool, group_size);
1723 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1724 group_size, &this_max_nunits,
1725 tem, npermutes,
1726 &this_tree_size, bst_map)) != NULL)
1727 {
1728 oprnd_info->def_stmts = vNULL;
1729 children.safe_push (child);
1730 continue;
1731 }
1732 /* We do not undo the swapping here since it might still be
1733 the better order for the second operand in case we build
1734 the first one from scalars below. */
1735 ++*npermutes;
1736 }
1737 fail:
1738
1739 /* If the SLP build failed and we analyze a basic-block
1740 simply treat nodes we fail to build as externally defined
1741 (and thus build vectors from the scalar defs).
1742 The cost model will reject outright expensive cases.
1743 ??? This doesn't treat cases where permutation ultimately
1744 fails (or we don't try permutation below). Ideally we'd
1745 even compute a permutation that will end up with the maximum
1746 SLP tree size... */
1747 if (is_a <bb_vec_info> (vinfo)
1748 /* ??? Rejecting patterns this way doesn't work. We'd have to
1749 do extra work to cancel the pattern so the uses see the
1750 scalar version. */
1751 && !is_pattern_stmt_p (stmt_info)
1752 && !oprnd_info->any_pattern)
1753 {
1754 /* But if there's a leading vector sized set of matching stmts
1755 fail here so we can split the group. This matches the condition
1756 vect_analyze_slp_instance uses. */
1757 /* ??? We might want to split here and combine the results to support
1758 multiple vector sizes better. */
1759 for (j = 0; j < group_size; ++j)
1760 if (!matches[j])
1761 break;
1762 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
1763 {
1764 if (dump_enabled_p ())
1765 dump_printf_loc (MSG_NOTE, vect_location,
1766 "Building vector operands from scalars\n");
1767 this_tree_size++;
1768 child = vect_create_new_slp_node (oprnd_info->ops);
1769 children.safe_push (child);
1770 oprnd_info->ops = vNULL;
1771 continue;
1772 }
1773 }
1774
1775 gcc_assert (child == NULL);
1776 FOR_EACH_VEC_ELT (children, j, child)
1777 if (child)
1778 vect_free_slp_tree (child);
1779 vect_free_oprnd_info (oprnds_info);
1780 return NULL;
1781 }
1782
1783 vect_free_oprnd_info (oprnds_info);
1784
1785 /* If all children of this node are built up from uniform scalars,
1786 or building them requires more than one possibly expensive vector
1787 construction, then just throw the node away so it is built up from
1788 scalars instead. The exception is the SLP node for the vector store. */
1789 if (is_a <bb_vec_info> (vinfo)
1790 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1791 /* ??? Rejecting patterns this way doesn't work. We'd have to
1792 do extra work to cancel the pattern so the uses see the
1793 scalar version. */
1794 && !is_pattern_stmt_p (stmt_info))
1795 {
1796 slp_tree child;
1797 unsigned j;
1798 bool all_uniform_p = true;
1799 unsigned n_vector_builds = 0;
1800 FOR_EACH_VEC_ELT (children, j, child)
1801 {
1802 if (!child)
1803 ;
1804 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1805 all_uniform_p = false;
1806 else if (!vect_slp_tree_uniform_p (child))
1807 {
1808 all_uniform_p = false;
1809 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
1810 n_vector_builds++;
1811 }
1812 }
1813 if (all_uniform_p || n_vector_builds > 1)
1814 {
1815 /* Roll back. */
1816 matches[0] = false;
1817 FOR_EACH_VEC_ELT (children, j, child)
1818 if (child)
1819 vect_free_slp_tree (child);
1820
1821 if (dump_enabled_p ())
1822 dump_printf_loc (MSG_NOTE, vect_location,
1823 "Building parent vector operands from "
1824 "scalars instead\n");
1825 return NULL;
1826 }
1827 }
1828
1829 *tree_size += this_tree_size + 1;
1830 *max_nunits = this_max_nunits;
1831
1832 if (two_operators)
1833 {
1834 /* ??? We'd likely want to either cache in bst_map sth like
1835 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1836 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1837 explicit stmts to put in so the keying on 'stmts' doesn't
1838 work (but we have the same issue with nodes that use 'ops'). */
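/* Added illustration: for a statement group like { a+b, a-b, a+b, a-b }
node ONE below gets the PLUS code, node TWO the MINUS code, and the
VEC_PERM node records the lane permutation { (0,0), (1,1), (0,2), (1,3) },
i.e. the even lanes come from ONE and the odd lanes from TWO. */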
1839 slp_tree one = new _slp_tree;
1840 slp_tree two = new _slp_tree;
1841 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1842 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1843 SLP_TREE_VECTYPE (one) = vectype;
1844 SLP_TREE_VECTYPE (two) = vectype;
1845 SLP_TREE_CHILDREN (one).safe_splice (children);
1846 SLP_TREE_CHILDREN (two).safe_splice (children);
1847 slp_tree child;
1848 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1849 SLP_TREE_REF_COUNT (child)++;
1850
1851 /* Here we record the original defs since this
1852 node represents the final lane configuration. */
1853 node = vect_create_new_slp_node (node, stmts, 2);
1854 SLP_TREE_VECTYPE (node) = vectype;
1855 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1856 SLP_TREE_CHILDREN (node).quick_push (one);
1857 SLP_TREE_CHILDREN (node).quick_push (two);
1858 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1859 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1860 enum tree_code ocode = ERROR_MARK;
1861 stmt_vec_info ostmt_info;
1862 unsigned j = 0;
1863 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1864 {
1865 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1866 if (gimple_assign_rhs_code (ostmt) != code0)
1867 {
1868 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1869 ocode = gimple_assign_rhs_code (ostmt);
1870 j = i;
1871 }
1872 else
1873 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1874 }
1875 SLP_TREE_CODE (one) = code0;
1876 SLP_TREE_CODE (two) = ocode;
1877 SLP_TREE_LANES (one) = stmts.length ();
1878 SLP_TREE_LANES (two) = stmts.length ();
1879 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1880 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1881 return node;
1882 }
1883
1884 node = vect_create_new_slp_node (node, stmts, nops);
1885 SLP_TREE_VECTYPE (node) = vectype;
1886 SLP_TREE_CHILDREN (node).splice (children);
1887 return node;
1888 }
1889
1890 /* Dump a single SLP tree NODE. */
1891
1892 static void
1893 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1894 slp_tree node)
1895 {
1896 unsigned i, j;
1897 slp_tree child;
1898 stmt_vec_info stmt_info;
1899 tree op;
1900
1901 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1902 dump_user_location_t user_loc = loc.get_user_location ();
1903 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1904 SLP_TREE_DEF_TYPE (node) == vect_external_def
1905 ? " (external)"
1906 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1907 ? " (constant)"
1908 : ""), node,
1909 estimated_poly_value (node->max_nunits),
1910 SLP_TREE_REF_COUNT (node));
1911 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1912 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1913 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
1914 else
1915 {
1916 dump_printf_loc (metadata, user_loc, "\t{ ");
1917 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1918 dump_printf (metadata, "%T%s ", op,
1919 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
1920 dump_printf (metadata, "}\n");
1921 }
1922 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1923 {
1924 dump_printf_loc (metadata, user_loc, "\tload permutation {");
1925 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
1926 dump_printf (dump_kind, " %u", j);
1927 dump_printf (dump_kind, " }\n");
1928 }
1929 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1930 {
1931 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
1932 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
1933 dump_printf (dump_kind, " %u[%u]",
1934 SLP_TREE_LANE_PERMUTATION (node)[i].first,
1935 SLP_TREE_LANE_PERMUTATION (node)[i].second);
1936 dump_printf (dump_kind, " }\n");
1937 }
1938 if (SLP_TREE_CHILDREN (node).is_empty ())
1939 return;
1940 dump_printf_loc (metadata, user_loc, "\tchildren");
1941 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1942 dump_printf (dump_kind, " %p", (void *)child);
1943 dump_printf (dump_kind, "\n");
1944 }
1945
1946 DEBUG_FUNCTION void
1947 debug (slp_tree node)
1948 {
1949 debug_dump_context ctx;
1950 vect_print_slp_tree (MSG_NOTE,
1951 dump_location_t::from_location_t (UNKNOWN_LOCATION),
1952 node);
1953 }
1954
1955 /* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND. */
1956
1957 static void
1958 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1959 slp_tree node, hash_set<slp_tree> &visited)
1960 {
1961 unsigned i;
1962 slp_tree child;
1963
1964 if (visited.add (node))
1965 return;
1966
1967 vect_print_slp_tree (dump_kind, loc, node);
1968
1969 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1970 if (child)
1971 vect_print_slp_graph (dump_kind, loc, child, visited);
1972 }
1973
1974 static void
1975 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1976 slp_tree entry)
1977 {
1978 hash_set<slp_tree> visited;
1979 vect_print_slp_graph (dump_kind, loc, entry, visited);
1980 }
1981
1982 /* Mark the tree rooted at NODE with PURE_SLP. */
1983
1984 static void
1985 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
1986 {
1987 int i;
1988 stmt_vec_info stmt_info;
1989 slp_tree child;
1990
1991 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1992 return;
1993
1994 if (visited.add (node))
1995 return;
1996
1997 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1998 STMT_SLP_TYPE (stmt_info) = pure_slp;
1999
2000 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2001 if (child)
2002 vect_mark_slp_stmts (child, visited);
2003 }
2004
2005 static void
2006 vect_mark_slp_stmts (slp_tree node)
2007 {
2008 hash_set<slp_tree> visited;
2009 vect_mark_slp_stmts (node, visited);
2010 }
2011
2012 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2013
2014 static void
2015 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2016 {
2017 int i;
2018 stmt_vec_info stmt_info;
2019 slp_tree child;
2020
2021 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2022 return;
2023
2024 if (visited.add (node))
2025 return;
2026
2027 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2028 {
2029 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2030 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2031 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2032 }
2033
2034 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2035 if (child)
2036 vect_mark_slp_stmts_relevant (child, visited);
2037 }
2038
2039 static void
2040 vect_mark_slp_stmts_relevant (slp_tree node)
2041 {
2042 hash_set<slp_tree> visited;
2043 vect_mark_slp_stmts_relevant (node, visited);
2044 }
2045
2046
2047 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2048
2049 static void
2050 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2051 hash_set<slp_tree> &visited)
2052 {
2053 if (!node || visited.add (node))
2054 return;
2055
2056 if (SLP_TREE_CHILDREN (node).length () == 0)
2057 {
2058 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2059 return;
2060 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2061 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2062 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2063 loads.safe_push (node);
2064 }
2065 else
2066 {
2067 unsigned i;
2068 slp_tree child;
2069 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2070 vect_gather_slp_loads (loads, child, visited);
2071 }
2072 }
2073
2074 static void
2075 vect_gather_slp_loads (slp_instance inst, slp_tree node)
2076 {
2077 hash_set<slp_tree> visited;
2078 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
2079 }
2080
2081
2082 /* Find the last scalar stmt in NODE. */
2083
2084 stmt_vec_info
2085 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2086 {
2087 stmt_vec_info last = NULL;
2088 stmt_vec_info stmt_vinfo;
2089
2090 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2091 {
2092 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2093 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2094 }
2095
2096 return last;
2097 }
2098
2099 /* Find the first stmt in NODE. */
2100
2101 stmt_vec_info
2102 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2103 {
2104 stmt_vec_info first = NULL;
2105 stmt_vec_info stmt_vinfo;
2106
2107 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2108 {
2109 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2110 if (!first
2111 || get_later_stmt (stmt_vinfo, first) == first)
2112 first = stmt_vinfo;
2113 }
2114
2115 return first;
2116 }
2117
2118 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2119 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2120 (also containing the first GROUP1_SIZE stmts, since stores are
2121 consecutive), the second containing the remainder.
2122 Return the first stmt in the second group. */
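/* A small worked example (added note): with DR_GROUP_SIZE == 8 and
GROUP1_SIZE == 3 the first group keeps stmts 0..2 and the second group
gets stmts 3..7; the gap of the second group becomes the original gap
plus 3, and the gap of the first group grows by 5 so that it skips over
the second group. */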
2123
2124 static stmt_vec_info
2125 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2126 {
2127 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2128 gcc_assert (group1_size > 0);
2129 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2130 gcc_assert (group2_size > 0);
2131 DR_GROUP_SIZE (first_vinfo) = group1_size;
2132
2133 stmt_vec_info stmt_info = first_vinfo;
2134 for (unsigned i = group1_size; i > 1; i--)
2135 {
2136 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2137 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2138 }
2139 /* STMT is now the last element of the first group. */
2140 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2141 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2142
2143 DR_GROUP_SIZE (group2) = group2_size;
2144 for (stmt_info = group2; stmt_info;
2145 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2146 {
2147 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2148 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2149 }
2150
2151 /* For the second group, the DR_GROUP_GAP is that before the original group,
2152 plus skipping over the first group. */
2153 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2154
2155 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2156 DR_GROUP_GAP (first_vinfo) += group2_size;
2157
2158 if (dump_enabled_p ())
2159 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2160 group1_size, group2_size);
2161
2162 return group2;
2163 }
2164
2165 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2166 statements and a vector of NUNITS elements. */
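/* For example (added illustration), with NUNITS == 4 and GROUP_SIZE == 6
the common multiple is 12, giving an unrolling factor of 2. */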
2167
2168 static poly_uint64
2169 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2170 {
2171 return exact_div (common_multiple (nunits, group_size), group_size);
2172 }
2173
2174 enum slp_instance_kind {
2175 slp_inst_kind_store,
2176 slp_inst_kind_reduc_group,
2177 slp_inst_kind_reduc_chain,
2178 slp_inst_kind_ctor
2179 };
2180
2181 static bool
2182 vect_analyze_slp_instance (vec_info *vinfo,
2183 scalar_stmts_to_slp_tree_map_t *bst_map,
2184 stmt_vec_info stmt_info, unsigned max_tree_size);
2185
2186 /* Analyze an SLP instance starting from SCALAR_STMTS which form a group
2187 of kind KIND. Return true if successful. */
2188
2189 static bool
2190 vect_build_slp_instance (vec_info *vinfo,
2191 slp_instance_kind kind,
2192 vec<stmt_vec_info> scalar_stmts,
2193 stmt_vec_info root_stmt_info,
2194 unsigned max_tree_size,
2195 scalar_stmts_to_slp_tree_map_t *bst_map,
2196 /* ??? We need stmt_info for group splitting. */
2197 stmt_vec_info stmt_info_)
2198 {
2199 if (dump_enabled_p ())
2200 {
2201 dump_printf_loc (MSG_NOTE, vect_location,
2202 "Starting SLP discovery for\n");
2203 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
2204 dump_printf_loc (MSG_NOTE, vect_location,
2205 " %G", scalar_stmts[i]->stmt);
2206 }
2207
2208 /* Build the tree for the SLP instance. */
2209 unsigned int group_size = scalar_stmts.length ();
2210 bool *matches = XALLOCAVEC (bool, group_size);
2211 unsigned npermutes = 0;
2212 poly_uint64 max_nunits = 1;
2213 unsigned tree_size = 0;
2214 unsigned i;
2215 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2216 &max_nunits, matches, &npermutes,
2217 &tree_size, bst_map);
2218 if (node != NULL)
2219 {
2220 /* Calculate the unrolling factor based on the smallest type. */
2221 poly_uint64 unrolling_factor
2222 = calculate_unrolling_factor (max_nunits, group_size);
2223
2224 if (maybe_ne (unrolling_factor, 1U)
2225 && is_a <bb_vec_info> (vinfo))
2226 {
2227 unsigned HOST_WIDE_INT const_max_nunits;
2228 if (!max_nunits.is_constant (&const_max_nunits)
2229 || const_max_nunits > group_size)
2230 {
2231 if (dump_enabled_p ())
2232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2233 "Build SLP failed: store group "
2234 "size not a multiple of the vector size "
2235 "in basic block SLP\n");
2236 vect_free_slp_tree (node);
2237 return false;
2238 }
2239 /* Fatal mismatch. */
2240 if (dump_enabled_p ())
2241 dump_printf_loc (MSG_NOTE, vect_location,
2242 "SLP discovery succeeded but node needs "
2243 "splitting\n");
2244 memset (matches, true, group_size);
2245 matches[group_size / const_max_nunits * const_max_nunits] = false;
2246 vect_free_slp_tree (node);
2247 }
2248 else
2249 {
2250 /* Create a new SLP instance. */
2251 slp_instance new_instance = XNEW (class _slp_instance);
2252 SLP_INSTANCE_TREE (new_instance) = node;
2253 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2254 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2255 SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
2256 new_instance->reduc_phis = NULL;
2257 new_instance->cost_vec = vNULL;
2258 new_instance->subgraph_entries = vNULL;
2259
2260 vect_gather_slp_loads (new_instance, node);
2261 if (dump_enabled_p ())
2262 dump_printf_loc (MSG_NOTE, vect_location,
2263 "SLP size %u vs. limit %u.\n",
2264 tree_size, max_tree_size);
2265
2266 /* Fixup SLP reduction chains. */
2267 if (kind == slp_inst_kind_reduc_chain)
2268 {
2269 /* If this is a reduction chain with a conversion in front
2270 amend the SLP tree with a node for that. */
2271 gimple *scalar_def
2272 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2273 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
2274 {
2275 /* Get at the conversion stmt - we know it's the single use
2276 of the last stmt of the reduction chain. */
2277 use_operand_p use_p;
2278 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
2279 &use_p, &scalar_def);
2280 gcc_assert (r);
2281 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
2282 next_info = vect_stmt_to_vectorize (next_info);
2283 scalar_stmts = vNULL;
2284 scalar_stmts.create (group_size);
2285 for (unsigned i = 0; i < group_size; ++i)
2286 scalar_stmts.quick_push (next_info);
2287 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2288 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2289 SLP_TREE_CHILDREN (conv).quick_push (node);
2290 SLP_INSTANCE_TREE (new_instance) = conv;
2291 /* We also have to fake this conversion stmt as SLP reduction
2292 group so we don't have to mess with too much code
2293 elsewhere. */
2294 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2295 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2296 }
2297 /* Fill the backedge child of the PHI SLP node. The
2298 general matching code cannot find it because the
2299 scalar code does not reflect how we vectorize the
2300 reduction. */
2301 use_operand_p use_p;
2302 imm_use_iterator imm_iter;
2303 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
2304 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
2305 gimple_get_lhs (scalar_def))
2306 /* There are exactly two non-debug uses, the reduction
2307 PHI and the loop-closed PHI node. */
2308 if (!is_gimple_debug (USE_STMT (use_p))
2309 && gimple_bb (USE_STMT (use_p)) == loop->header)
2310 {
2311 auto_vec<stmt_vec_info, 64> phis (group_size);
2312 stmt_vec_info phi_info
2313 = vinfo->lookup_stmt (USE_STMT (use_p));
2314 for (unsigned i = 0; i < group_size; ++i)
2315 phis.quick_push (phi_info);
2316 slp_tree *phi_node = bst_map->get (phis);
2317 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
2318 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
2319 = SLP_INSTANCE_TREE (new_instance);
2320 SLP_INSTANCE_TREE (new_instance)->refcnt++;
2321 }
2322 }
2323
2324 vinfo->slp_instances.safe_push (new_instance);
2325
2326 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2327 the number of scalar stmts in the root in a few places.
2328 Verify that assumption holds. */
2329 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2330 .length () == group_size);
2331
2332 if (dump_enabled_p ())
2333 {
2334 dump_printf_loc (MSG_NOTE, vect_location,
2335 "Final SLP tree for instance %p:\n", new_instance);
2336 vect_print_slp_graph (MSG_NOTE, vect_location,
2337 SLP_INSTANCE_TREE (new_instance));
2338 }
2339
2340 return true;
2341 }
2342 }
2343 else
2344 {
2345 /* Failed to SLP. */
2346 /* Free the allocated memory. */
2347 scalar_stmts.release ();
2348 }
2349
2350 stmt_vec_info stmt_info = stmt_info_;
2351 /* Try to break the group up into pieces. */
2352 if (kind == slp_inst_kind_store)
2353 {
2354 /* ??? We could delay all the actual splitting of store-groups
2355 until after SLP discovery of the original group completed.
2356 Then we can recurse to vect_build_slp_instance directly. */
2357 for (i = 0; i < group_size; i++)
2358 if (!matches[i])
2359 break;
2360
2361 /* For basic block SLP, try to break the group up into multiples of
2362 a vector size. */
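/* Added note: assuming a mismatch at i == 6 and a vector of 4 elements,
the group is split after lane 4 since group1_size = 6 & ~(4 - 1) == 4. */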
2363 if (is_a <bb_vec_info> (vinfo)
2364 && (i > 1 && i < group_size))
2365 {
2366 tree scalar_type
2367 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2368 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2369 1 << floor_log2 (i));
2370 unsigned HOST_WIDE_INT const_nunits;
2371 if (vectype
2372 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2373 {
2374 /* Split into two groups at the first vector boundary. */
2375 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2376 unsigned group1_size = i & ~(const_nunits - 1);
2377
2378 if (dump_enabled_p ())
2379 dump_printf_loc (MSG_NOTE, vect_location,
2380 "Splitting SLP group at stmt %u\n", i);
2381 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2382 group1_size);
2383 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2384 max_tree_size);
2385 /* Split the rest at the failure point and possibly
2386 re-analyze the remaining matching part if it has
2387 at least two lanes. */
2388 if (group1_size < i
2389 && (i + 1 < group_size
2390 || i - group1_size > 1))
2391 {
2392 stmt_vec_info rest2 = rest;
2393 rest = vect_split_slp_store_group (rest, i - group1_size);
2394 if (i - group1_size > 1)
2395 res |= vect_analyze_slp_instance (vinfo, bst_map,
2396 rest2, max_tree_size);
2397 }
2398 /* Re-analyze the non-matching tail if it has at least
2399 two lanes. */
2400 if (i + 1 < group_size)
2401 res |= vect_analyze_slp_instance (vinfo, bst_map,
2402 rest, max_tree_size);
2403 return res;
2404 }
2405 }
2406
2407 /* For loop vectorization split into arbitrary pieces of size > 1. */
2408 if (is_a <loop_vec_info> (vinfo)
2409 && (i > 1 && i < group_size))
2410 {
2411 unsigned group1_size = i;
2412
2413 if (dump_enabled_p ())
2414 dump_printf_loc (MSG_NOTE, vect_location,
2415 "Splitting SLP group at stmt %u\n", i);
2416
2417 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2418 group1_size);
2419 /* Loop vectorization cannot handle gaps in stores; make sure
2420 the split group appears as strided. */
2421 STMT_VINFO_STRIDED_P (rest) = 1;
2422 DR_GROUP_GAP (rest) = 0;
2423 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2424 DR_GROUP_GAP (stmt_info) = 0;
2425
2426 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2427 max_tree_size);
2428 if (i + 1 < group_size)
2429 res |= vect_analyze_slp_instance (vinfo, bst_map,
2430 rest, max_tree_size);
2431
2432 return res;
2433 }
2434
2435 /* Even though the first vector did not all match, we might be able to SLP
2436 (some) of the remainder. FORNOW ignore this possibility. */
2437 }
2438
2439 /* Failed to SLP. */
2440 if (dump_enabled_p ())
2441 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2442 return false;
2443 }
2444
2445
2446 /* Analyze an SLP instance starting from a group of grouped stores. Call
2447 vect_build_slp_tree to build a tree of packed stmts if possible.
2448 Return FALSE if it's impossible to SLP any stmt in the loop. */
2449
2450 static bool
2451 vect_analyze_slp_instance (vec_info *vinfo,
2452 scalar_stmts_to_slp_tree_map_t *bst_map,
2453 stmt_vec_info stmt_info, unsigned max_tree_size)
2454 {
2455 unsigned int group_size;
2456 unsigned int i;
2457 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2458 vec<stmt_vec_info> scalar_stmts;
2459 slp_instance_kind kind;
2460
2461 if (is_a <bb_vec_info> (vinfo))
2462 vect_location = stmt_info->stmt;
2463 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2464 {
2465 kind = slp_inst_kind_store;
2466 group_size = DR_GROUP_SIZE (stmt_info);
2467 }
2468 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2469 {
2470 kind = slp_inst_kind_reduc_chain;
2471 gcc_assert (is_a <loop_vec_info> (vinfo));
2472 group_size = REDUC_GROUP_SIZE (stmt_info);
2473 }
2474 else if (is_gimple_assign (stmt_info->stmt)
2475 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2476 {
2477 kind = slp_inst_kind_ctor;
2478 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2479 }
2480 else
2481 {
2482 kind = slp_inst_kind_reduc_group;
2483 gcc_assert (is_a <loop_vec_info> (vinfo));
2484 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2485 }
2486
2487 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2488 scalar_stmts.create (group_size);
2489 stmt_vec_info next_info = stmt_info;
2490 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2491 {
2492 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2493 while (next_info)
2494 {
2495 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2496 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2497 }
2498 }
2499 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2500 {
2501 /* Collect the reduction stmts and store them in
2502 SLP_TREE_SCALAR_STMTS. */
2503 while (next_info)
2504 {
2505 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2506 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2507 }
2508 /* Mark the first element of the reduction chain as reduction to properly
2509 transform the node. In the reduction analysis phase only the last
2510 element of the chain is marked as reduction. */
2511 STMT_VINFO_DEF_TYPE (stmt_info)
2512 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2513 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2514 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2515 }
2516 else if (kind == slp_inst_kind_ctor)
2517 {
2518 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2519 tree val;
2520 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2521 {
2522 if (TREE_CODE (val) == SSA_NAME)
2523 {
2524 gimple* def = SSA_NAME_DEF_STMT (val);
2525 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2526 /* Value is defined in another basic block. */
2527 if (!def_info)
2528 return false;
2529 def_info = vect_stmt_to_vectorize (def_info);
2530 scalar_stmts.safe_push (def_info);
2531 }
2532 else
2533 return false;
2534 }
2535 if (dump_enabled_p ())
2536 dump_printf_loc (MSG_NOTE, vect_location,
2537 "Analyzing vectorizable constructor: %G\n",
2538 stmt_info->stmt);
2539 }
2540 else
2541 {
2542 /* Collect reduction statements. */
2543 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2544 for (i = 0; reductions.iterate (i, &next_info); i++)
2545 if (STMT_VINFO_RELEVANT_P (next_info)
2546 || STMT_VINFO_LIVE_P (next_info))
2547 scalar_stmts.quick_push (next_info);
2548 }
2549
2550 /* Build the tree for the SLP instance. */
2551 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
2552 kind == slp_inst_kind_ctor
2553 ? stmt_info : NULL,
2554 max_tree_size,
2555 bst_map, stmt_info);
2556
2557 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2558 where we should do store group splitting. */
2559
2560 return res;
2561 }
2562
2563 /* Check if there are stmts in the loop that can be vectorized using SLP.
2564 Build SLP trees of packed scalar stmts if SLP is possible. */
2565
2566 opt_result
2567 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2568 {
2569 unsigned int i;
2570 stmt_vec_info first_element;
2571
2572 DUMP_VECT_SCOPE ("vect_analyze_slp");
2573
2574 scalar_stmts_to_slp_tree_map_t *bst_map
2575 = new scalar_stmts_to_slp_tree_map_t ();
2576
2577 /* Find SLP sequences starting from groups of grouped stores. */
2578 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2579 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2580
2581 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2582 {
2583 /* Find SLP sequences starting from reduction chains. */
2584 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2585 if (! STMT_VINFO_RELEVANT_P (first_element)
2586 && ! STMT_VINFO_LIVE_P (first_element))
2587 ;
2588 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2589 max_tree_size))
2590 {
2591 /* Dissolve reduction chain group. */
2592 stmt_vec_info vinfo = first_element;
2593 stmt_vec_info last = NULL;
2594 while (vinfo)
2595 {
2596 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2597 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2598 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2599 last = vinfo;
2600 vinfo = next;
2601 }
2602 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2603 /* It can still be vectorized as part of an SLP reduction. */
2604 loop_vinfo->reductions.safe_push (last);
2605 }
2606
2607 /* Find SLP sequences starting from groups of reductions. */
2608 if (loop_vinfo->reductions.length () > 1)
2609 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2610 max_tree_size);
2611 }
2612
2613 /* The map keeps a reference on SLP nodes built, release that. */
2614 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2615 it != bst_map->end (); ++it)
2616 if ((*it).second)
2617 vect_free_slp_tree ((*it).second);
2618 delete bst_map;
2619
2620 return opt_result::success ();
2621 }
2622
2623 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2624
2625 static void
2626 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2627 vec<slp_tree> &vertices, vec<int> &leafs)
2628 {
2629 unsigned i;
2630 slp_tree child;
2631
2632 if (visited.add (node))
2633 return;
2634
2635 node->vertex = vertices.length ();
2636 vertices.safe_push (node);
2637
2638 bool leaf = true;
2639 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2640 if (child)
2641 {
2642 leaf = false;
2643 vect_slp_build_vertices (visited, child, vertices, leafs);
2644 }
2645 if (leaf)
2646 leafs.safe_push (node->vertex);
2647 }
2648
2649 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2650
2651 static void
2652 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2653 vec<int> &leafs)
2654 {
2655 hash_set<slp_tree> visited;
2656 unsigned i;
2657 slp_instance instance;
2658 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2659 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2660 leafs);
2661 }
2662
2663 /* Apply (reverse) bijective PERM to VEC. */
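/* Added example: with PERM == { 2, 0, 1 } and VEC == { a, b, c } the
forward application yields { c, a, b } and the reverse application
yields { b, c, a }. */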
2664
2665 template <class T>
2666 static void
2667 vect_slp_permute (vec<unsigned> perm,
2668 vec<T> &vec, bool reverse)
2669 {
2670 auto_vec<T, 64> saved;
2671 saved.create (vec.length ());
2672 for (unsigned i = 0; i < vec.length (); ++i)
2673 saved.quick_push (vec[i]);
2674
2675 if (reverse)
2676 {
2677 for (unsigned i = 0; i < vec.length (); ++i)
2678 vec[perm[i]] = saved[i];
2679 for (unsigned i = 0; i < vec.length (); ++i)
2680 gcc_assert (vec[perm[i]] == saved[i]);
2681 }
2682 else
2683 {
2684 for (unsigned i = 0; i < vec.length (); ++i)
2685 vec[i] = saved[perm[i]];
2686 for (unsigned i = 0; i < vec.length (); ++i)
2687 gcc_assert (vec[i] == saved[perm[i]]);
2688 }
2689 }
2690
2691 /* Return whether permutations PERM_A and PERM_B as recorded in the
2692 PERMS vector are equal. */
2693
2694 static bool
2695 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
2696 int perm_a, int perm_b)
2697 {
2698 return (perm_a == perm_b
2699 || (perms[perm_a].length () == perms[perm_b].length ()
2700 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
2701 sizeof (unsigned) * perms[perm_a].length ()) == 0));
2702 }
2703
2704 /* Optimize the SLP graph of VINFO. */
2705
2706 void
2707 vect_optimize_slp (vec_info *vinfo)
2708 {
2709 if (vinfo->slp_instances.is_empty ())
2710 return;
2711
2712 slp_tree node;
2713 unsigned i;
2714 auto_vec<slp_tree> vertices;
2715 auto_vec<int> leafs;
2716 vect_slp_build_vertices (vinfo, vertices, leafs);
2717
2718 struct graph *slpg = new_graph (vertices.length ());
2719 FOR_EACH_VEC_ELT (vertices, i, node)
2720 {
2721 unsigned j;
2722 slp_tree child;
2723 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2724 if (child)
2725 add_edge (slpg, i, child->vertex);
2726 }
2727
2728 /* Compute (reverse) postorder on the inverted graph. */
2729 auto_vec<int> ipo;
2730 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
2731
2732 auto_sbitmap n_visited (vertices.length ());
2733 auto_sbitmap n_materialize (vertices.length ());
2734 auto_vec<int> n_perm (vertices.length ());
2735 auto_vec<vec<unsigned> > perms;
2736
2737 bitmap_clear (n_visited);
2738 bitmap_clear (n_materialize);
2739 n_perm.quick_grow_cleared (vertices.length ());
2740 perms.safe_push (vNULL); /* zero is no permute */
2741
2742 /* Produce initial permutations. */
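/* Added example: a load node with load permutation { 5, 4, 7, 6 } has
imin == 4 and covers its 4 lanes bijectively, so the local permutation
{ 1, 0, 3, 2 } is recorded for it below. */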
2743 for (i = 0; i < leafs.length (); ++i)
2744 {
2745 int idx = leafs[i];
2746 slp_tree node = vertices[idx];
2747
2748 /* Handle externals and constants optimistically throughout the
2749 iteration. */
2750 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
2751 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
2752 continue;
2753
2754 /* Loads are the only thing generating permutes and leafs do not
2755 change across iterations. */
2756 bitmap_set_bit (n_visited, idx);
2757 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2758 continue;
2759
2760 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2761 node unpermuted, record this permute. */
2762 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
2763 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
2764 continue;
2765 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
2766 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
2767 bool any_permute = false;
2768 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2769 {
2770 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
2771 imin = MIN (imin, idx);
2772 imax = MAX (imax, idx);
2773 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
2774 any_permute = true;
2775 }
2776 /* If there's no permute no need to split one out. */
2777 if (!any_permute)
2778 continue;
2779 /* If the span doesn't match we'd disrupt VF computation, avoid
2780 that for now. */
2781 if (imax - imin + 1 != SLP_TREE_LANES (node))
2782 continue;
2783
2784 /* For now only handle true permutes, like
2785 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2786 when permuting constants and invariants keeping the permute
2787 bijective. */
2788 auto_sbitmap load_index (SLP_TREE_LANES (node));
2789 bitmap_clear (load_index);
2790 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2791 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
2792 unsigned j;
2793 for (j = 0; j < SLP_TREE_LANES (node); ++j)
2794 if (!bitmap_bit_p (load_index, j))
2795 break;
2796 if (j != SLP_TREE_LANES (node))
2797 continue;
2798
2799 vec<unsigned> perm = vNULL;
2800 perm.safe_grow (SLP_TREE_LANES (node), true);
2801 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2802 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
2803 perms.safe_push (perm);
2804 n_perm[idx] = perms.length () - 1;
2805 }
2806
2807 /* Propagate permutes along the graph and compute materialization points. */
2808 bool changed;
2809 unsigned iteration = 0;
2810 do
2811 {
2812 changed = false;
2813 ++iteration;
2814
2815 for (i = vertices.length (); i > 0 ; --i)
2816 {
2817 int idx = ipo[i-1];
2818 slp_tree node = vertices[idx];
2819 /* For leafs there's nothing to do - we've seeded permutes
2820 on those above. */
2821 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2822 continue;
2823
2824 bitmap_set_bit (n_visited, idx);
2825
2826 /* We cannot move a permute across a store. */
2827 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2828 && DR_IS_WRITE
2829 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
2830 continue;
2831
2832 int perm = -1;
2833 for (graph_edge *succ = slpg->vertices[idx].succ;
2834 succ; succ = succ->succ_next)
2835 {
2836 int succ_idx = succ->dest;
2837 /* Handle unvisited nodes optimistically. */
2838 /* ??? But for constants once we want to handle non-bijective
2839 permutes we have to verify the permute, when unifying lanes,
2840 will not unify different constants. For example see
2841 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2842 if (!bitmap_bit_p (n_visited, succ_idx))
2843 continue;
2844 int succ_perm = n_perm[succ_idx];
2845 /* Once we materialize the successor's permutation its output lanes
2846 appear unpermuted to us. */
2847 if (bitmap_bit_p (n_materialize, succ_idx))
2848 succ_perm = 0;
2849 if (perm == -1)
2850 perm = succ_perm;
2851 else if (succ_perm == 0)
2852 {
2853 perm = 0;
2854 break;
2855 }
2856 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
2857 {
2858 perm = 0;
2859 break;
2860 }
2861 }
2862
2863 if (perm == -1)
2864 /* Pick up pre-computed leaf values. */
2865 perm = n_perm[idx];
2866 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
2867 {
2868 if (iteration > 1)
2869 /* Make sure we eventually converge. */
2870 gcc_checking_assert (perm == 0);
2871 n_perm[idx] = perm;
2872 if (perm == 0)
2873 bitmap_clear_bit (n_materialize, idx);
2874 changed = true;
2875 }
2876
2877 if (perm == 0)
2878 continue;
2879
2880 /* Elide pruning at materialization points in the first
2881 iteration so that every node has been visited at least once. */
2882 if (iteration == 1)
2883 continue;
2884
2885 /* Decide on permute materialization. Look whether there's
2886 a use (pred) edge that is permuted differently than us.
2887 In that case mark ourselves so the permutation is applied. */
2888 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
2889 for (graph_edge *pred = slpg->vertices[idx].pred;
2890 pred; pred = pred->pred_next)
2891 {
2892 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
2893 int pred_perm = n_perm[pred->src];
2894 if (!vect_slp_perms_eq (perms, perm, pred_perm))
2895 {
2896 all_preds_permuted = false;
2897 break;
2898 }
2899 }
2900 if (!all_preds_permuted)
2901 {
2902 if (!bitmap_bit_p (n_materialize, idx))
2903 changed = true;
2904 bitmap_set_bit (n_materialize, idx);
2905 }
2906 }
2907 }
2908 while (changed || iteration == 1);
2909
2910 /* Materialize. */
2911 for (i = 0; i < vertices.length (); ++i)
2912 {
2913 int perm = n_perm[i];
2914 if (perm <= 0)
2915 continue;
2916
2917 slp_tree node = vertices[i];
2918
2919 /* First permute invariant/external original successors. */
2920 unsigned j;
2921 slp_tree child;
2922 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2923 {
2924 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2925 continue;
2926
2927 /* If the vector is uniform there's nothing to do. */
2928 if (vect_slp_tree_uniform_p (child))
2929 continue;
2930
2931 /* We can end up sharing some externals via two_operator
2932 handling. Be prepared to unshare those. */
2933 if (child->refcnt != 1)
2934 {
2935 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
2936 SLP_TREE_CHILDREN (node)[j] = child
2937 = vect_create_new_slp_node
2938 (SLP_TREE_SCALAR_OPS (child).copy ());
2939 }
2940 vect_slp_permute (perms[perm],
2941 SLP_TREE_SCALAR_OPS (child), true);
2942 }
2943
2944 if (bitmap_bit_p (n_materialize, i))
2945 {
2946 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2947 /* For loads simply drop the permutation; the load permutation
2948 already performs the desired permutation. */
2949 ;
2950 else
2951 {
2952 if (dump_enabled_p ())
2953 dump_printf_loc (MSG_NOTE, vect_location,
2954 "inserting permute node in place of %p\n",
2955 node);
2956
2957 /* Make a copy of NODE and in-place change it to a
2958 VEC_PERM node to permute the lanes of the copy. */
2959 slp_tree copy = new _slp_tree;
2960 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
2961 SLP_TREE_CHILDREN (node) = vNULL;
2962 SLP_TREE_SCALAR_STMTS (copy)
2963 = SLP_TREE_SCALAR_STMTS (node).copy ();
2964 vect_slp_permute (perms[perm],
2965 SLP_TREE_SCALAR_STMTS (copy), true);
2966 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
2967 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
2968 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
2969 SLP_TREE_LANE_PERMUTATION (copy)
2970 = SLP_TREE_LANE_PERMUTATION (node);
2971 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
2972 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
2973 copy->refcnt = 1;
2974 copy->max_nunits = node->max_nunits;
2975 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
2976 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
2977 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
2978
2979 /* Now turn NODE into a VEC_PERM. */
2980 SLP_TREE_CHILDREN (node).safe_push (copy);
2981 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
2982 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2983 SLP_TREE_LANE_PERMUTATION (node)
2984 .quick_push (std::make_pair (0, perms[perm][j]));
2985 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2986 }
2987 }
2988 else
2989 {
2990 /* Apply the reverse permutation to our stmts. */
2991 vect_slp_permute (perms[perm],
2992 SLP_TREE_SCALAR_STMTS (node), true);
2993 /* And to the load permutation, which we can simply
2994 make regular by design. */
2995 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2996 {
2997 /* ??? When we handle non-bijective permutes the idea
2998 is that we can force the load-permutation to be
2999 { min, min + 1, min + 2, ... max }. But then the
3000 scalar defs might no longer match the lane content
3001 which means wrong-code with live lane vectorization.
3002 So we possibly have to have NULL entries for those. */
3003 vect_slp_permute (perms[perm],
3004 SLP_TREE_LOAD_PERMUTATION (node), true);
3005 }
3006 }
3007 }
3008
3009 /* Free the perms vector used for propagation. */
3010 while (!perms.is_empty ())
3011 perms.pop ().release ();
3012 free_graph (slpg);
3013
3014
3015 /* Now elide load permutations that are not necessary. */
3016 for (i = 0; i < leafs.length (); ++i)
3017 {
3018 node = vertices[leafs[i]];
3019 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3020 continue;
3021
3022 /* In basic block vectorization we allow any subchain of an interleaving
3023 chain.
3024 FORNOW: not in loop SLP because of realignment complications. */
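/* Added example: a node whose scalar stmts load a[2] and a[3] out of an
interleaving group a[0] .. a[3] is such a subchain; the loads are
consecutive, so the load permutation can simply be dropped. */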
3025 if (is_a <bb_vec_info> (vinfo))
3026 {
3027 bool subchain_p = true;
3028 stmt_vec_info next_load_info = NULL;
3029 stmt_vec_info load_info;
3030 unsigned j;
3031 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3032 {
3033 if (j != 0
3034 && (next_load_info != load_info
3035 || DR_GROUP_GAP (load_info) != 1))
3036 {
3037 subchain_p = false;
3038 break;
3039 }
3040 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3041 }
3042 if (subchain_p)
3043 {
3044 SLP_TREE_LOAD_PERMUTATION (node).release ();
3045 continue;
3046 }
3047 }
3048 else
3049 {
3050 stmt_vec_info load_info;
3051 bool this_load_permuted = false;
3052 unsigned j;
3053 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3054 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3055 {
3056 this_load_permuted = true;
3057 break;
3058 }
3059 stmt_vec_info first_stmt_info
3060 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3061 if (!this_load_permuted
3062 /* The load requires permutation when unrolling exposes
3063 a gap either because the group is larger than the SLP
3064 group-size or because there is a gap between the groups. */
3065 && (known_eq (LOOP_VINFO_VECT_FACTOR
3066 (as_a <loop_vec_info> (vinfo)), 1U)
3067 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3068 && DR_GROUP_GAP (first_stmt_info) == 0)))
3069 {
3070 SLP_TREE_LOAD_PERMUTATION (node).release ();
3071 continue;
3072 }
3073 }
3074 }
3075 }
3076
3077
3078 /* For each possible SLP instance decide whether to SLP it and calculate the
3079 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
3080 to SLP at least one instance. */
3081
3082 bool
3083 vect_make_slp_decision (loop_vec_info loop_vinfo)
3084 {
3085 unsigned int i;
3086 poly_uint64 unrolling_factor = 1;
3087 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3088 slp_instance instance;
3089 int decided_to_slp = 0;
3090
3091 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3092
3093 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3094 {
3095 /* FORNOW: SLP if you can. */
3096 /* All unroll factors have the form:
3097
3098 GET_MODE_SIZE (vinfo->vector_mode) * X
3099
3100 for some rational X, so they must have a common multiple. */
3101 unrolling_factor
3102 = force_common_multiple (unrolling_factor,
3103 SLP_INSTANCE_UNROLLING_FACTOR (instance));
3104
3105 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3106 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3107 loop-based vectorization. Such stmts will be marked as HYBRID. */
3108 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3109 decided_to_slp++;
3110 }
3111
3112 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
3113
3114 if (decided_to_slp && dump_enabled_p ())
3115 {
3116 dump_printf_loc (MSG_NOTE, vect_location,
3117 "Decided to SLP %d instances. Unrolling factor ",
3118 decided_to_slp);
3119 dump_dec (MSG_NOTE, unrolling_factor);
3120 dump_printf (MSG_NOTE, "\n");
3121 }
3122
3123 return (decided_to_slp > 0);
3124 }
3125
3126 /* Private data for vect_detect_hybrid_slp. */
3127 struct vdhs_data
3128 {
3129 loop_vec_info loop_vinfo;
3130 vec<stmt_vec_info> *worklist;
3131 };
3132
3133 /* Walker for walk_gimple_op. */
3134
3135 static tree
3136 vect_detect_hybrid_slp (tree *tp, int *, void *data)
3137 {
3138 walk_stmt_info *wi = (walk_stmt_info *)data;
3139 vdhs_data *dat = (vdhs_data *)wi->info;
3140
3141 if (wi->is_lhs)
3142 return NULL_TREE;
3143
3144 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
3145 if (!def_stmt_info)
3146 return NULL_TREE;
3147 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3148 if (PURE_SLP_STMT (def_stmt_info))
3149 {
3150 if (dump_enabled_p ())
3151 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3152 def_stmt_info->stmt);
3153 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3154 dat->worklist->safe_push (def_stmt_info);
3155 }
3156
3157 return NULL_TREE;
3158 }
3159
3160 /* Find stmts that must be both vectorized and SLPed. */
3161
3162 void
3163 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3164 {
3165 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3166
3167 /* All stmts participating in SLP are marked pure_slp, all other
3168 stmts are loop_vect.
3169 First collect all loop_vect stmts into a worklist. */
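/* Added note: if the result of a pure_slp statement is also used by one
of these loop_vect statements, the walker below re-marks that statement
as hybrid so it is handled by both SLP and loop vectorization. */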
3170 auto_vec<stmt_vec_info> worklist;
3171 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3172 {
3173 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3174 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3175 gsi_next (&gsi))
3176 {
3177 gphi *phi = gsi.phi ();
3178 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3179 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3180 worklist.safe_push (stmt_info);
3181 }
3182 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3183 gsi_next (&gsi))
3184 {
3185 gimple *stmt = gsi_stmt (gsi);
3186 if (is_gimple_debug (stmt))
3187 continue;
3188 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3189 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3190 {
3191 for (gimple_stmt_iterator gsi2
3192 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3193 !gsi_end_p (gsi2); gsi_next (&gsi2))
3194 {
3195 stmt_vec_info patt_info
3196 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3197 if (!STMT_SLP_TYPE (patt_info)
3198 && STMT_VINFO_RELEVANT (patt_info))
3199 worklist.safe_push (patt_info);
3200 }
3201 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3202 }
3203 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3204 worklist.safe_push (stmt_info);
3205 }
3206 }
3207
3208 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3209 mark any SLP vectorized stmt as hybrid.
3210 ??? We're visiting def stmts N times (once for each non-SLP and
3211 once for each hybrid-SLP use). */
3212 walk_stmt_info wi;
3213 vdhs_data dat;
3214 dat.worklist = &worklist;
3215 dat.loop_vinfo = loop_vinfo;
3216 memset (&wi, 0, sizeof (wi));
3217 wi.info = (void *)&dat;
3218 while (!worklist.is_empty ())
3219 {
3220 stmt_vec_info stmt_info = worklist.pop ();
3221 /* Since SSA operands are not set up for pattern stmts we need
3222 to use walk_gimple_op. */
3223 wi.is_lhs = 0;
3224 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3225 }
3226 }
3227
3228
3229 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3230
3231 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3232 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs)
3233 {
3234 for (unsigned i = 0; i < bbs.length (); ++i)
3235 {
3236 if (i != 0)
3237 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3238 gsi_next (&si))
3239 {
3240 gphi *phi = si.phi ();
3241 gimple_set_uid (phi, 0);
3242 add_stmt (phi);
3243 }
3244 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3245 !gsi_end_p (gsi); gsi_next (&gsi))
3246 {
3247 gimple *stmt = gsi_stmt (gsi);
3248 gimple_set_uid (stmt, 0);
3249 if (is_gimple_debug (stmt))
3250 continue;
3251 add_stmt (stmt);
3252 }
3253 }
3254 }
3255
3256
3257 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3258 stmts in the basic block. */
3259
3260 _bb_vec_info::~_bb_vec_info ()
3261 {
3262 /* Reset region marker. */
3263 for (unsigned i = 0; i < bbs.length (); ++i)
3264 {
3265 if (i != 0)
3266 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3267 gsi_next (&si))
3268 {
3269 gphi *phi = si.phi ();
3270 gimple_set_uid (phi, -1);
3271 }
3272 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3273 !gsi_end_p (gsi); gsi_next (&gsi))
3274 {
3275 gimple *stmt = gsi_stmt (gsi);
3276 gimple_set_uid (stmt, -1);
3277 }
3278 }
3279 }
3280
3281 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3282 given that its child nodes have already been processed, and that
3283 their def types currently match their SLP node's def type. */
3284
3285 static bool
3286 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3287 slp_instance node_instance,
3288 stmt_vector_for_cost *cost_vec)
3289 {
3290 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3291 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3292
3293 /* Calculate the number of vector statements to be created for the
3294 scalar stmts in this node. For SLP reductions it is equal to the
3295 number of vector statements in the children (which has already been
3296 calculated by the recursive call). Otherwise it is the number of
3297 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3298 VF divided by the number of elements in a vector. */
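/* For instance (added note), with VF == 4, two lanes in the node and a
4-element vector type this gives (4 * 2) / 4 == 2 vector statements. */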
3299 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3300 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3301 {
3302 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3303 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3304 {
3305 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3306 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3307 break;
3308 }
3309 }
3310 else
3311 {
3312 poly_uint64 vf;
3313 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3314 vf = loop_vinfo->vectorization_factor;
3315 else
3316 vf = 1;
3317 unsigned int group_size = SLP_TREE_LANES (node);
3318 tree vectype = SLP_TREE_VECTYPE (node);
3319 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3320 = vect_get_num_vectors (vf * group_size, vectype);
3321 }
3322
3323 /* Handle purely internal nodes. */
3324 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3325 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3326
3327 if (is_a <bb_vec_info> (vinfo)
3328 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3329 {
3330 if (dump_enabled_p ())
3331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3332 "desired vector type conflicts with earlier one "
3333 "for %G", stmt_info->stmt);
3334 return false;
3335 }
3336
3337 bool dummy;
3338 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3339 node, node_instance, cost_vec);
3340 }
3341
3342 /* Try to build NODE from scalars, returning true on success.
3343 NODE_INSTANCE is the SLP instance that contains NODE. */
3344
3345 static bool
3346 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3347 slp_instance node_instance)
3348 {
3349 stmt_vec_info stmt_info;
3350 unsigned int i;
3351
3352 if (!is_a <bb_vec_info> (vinfo)
3353 || node == SLP_INSTANCE_TREE (node_instance)
3354 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3355 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3356 return false;
3357
3358 if (dump_enabled_p ())
3359 dump_printf_loc (MSG_NOTE, vect_location,
3360 "Building vector operands of %p from scalars instead\n", node);
3361
3362 /* Don't remove and free the child nodes here, since they could be
3363 referenced by other structures. The analysis and scheduling phases
3364 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3365 unsigned int group_size = SLP_TREE_LANES (node);
3366 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3367 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3368 SLP_TREE_LOAD_PERMUTATION (node).release ();
3369 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3370 {
3371 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3372 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3373 }
3374 return true;
3375 }
3376
3377 /* Compute the prologue cost for invariant or constant operands represented
3378 by NODE. */
3379
3380 static void
3381 vect_prologue_cost_for_slp (slp_tree node,
3382 stmt_vector_for_cost *cost_vec)
3383 {
3384 /* There's a special case of an existing vector, which costs nothing. */
3385 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3386 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3387 return;
3388 /* Without looking at the actual initializer a vector of
3389 constants can be implemented as load from the constant pool.
3390 When all elements are the same we can use a splat. */
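/* Added illustration: an external node with operands { x, x, x, x } is
costed as a single scalar_to_vec (splat), { x, y, z, w } as a
vec_construct, and a constant node as a vector_load from the constant
pool. */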
3391 tree vectype = SLP_TREE_VECTYPE (node);
3392 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3393 unsigned num_vects_to_check;
3394 unsigned HOST_WIDE_INT const_nunits;
3395 unsigned nelt_limit;
3396 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3397 && ! multiple_p (const_nunits, group_size))
3398 {
3399 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3400 nelt_limit = const_nunits;
3401 }
3402 else
3403 {
3404 /* If either the vector has variable length or the vectors
3405 are composed of repeated whole groups we only need to
3406 cost construction once. All vectors will be the same. */
3407 num_vects_to_check = 1;
3408 nelt_limit = group_size;
3409 }
3410 tree elt = NULL_TREE;
3411 unsigned nelt = 0;
3412 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3413 {
3414 unsigned si = j % group_size;
3415 if (nelt == 0)
3416 elt = SLP_TREE_SCALAR_OPS (node)[si];
3417 /* ??? We're just tracking whether all operands of a single
3418 vector initializer are the same, ideally we'd check if
3419 we emitted the same one already. */
3420 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3421 elt = NULL_TREE;
3422 nelt++;
3423 if (nelt == nelt_limit)
3424 {
3425 record_stmt_cost (cost_vec, 1,
3426 SLP_TREE_DEF_TYPE (node) == vect_external_def
3427 ? (elt ? scalar_to_vec : vec_construct)
3428 : vector_load,
3429 NULL, vectype, 0, vect_prologue);
3430 nelt = 0;
3431 }
3432 }
3433 }
3434
3435 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3436 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
3437
3438 Return true if the operations are supported. */
3439
3440 static bool
3441 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3442 slp_instance node_instance,
3443 hash_set<slp_tree> &visited_set,
3444 vec<slp_tree> &visited_vec,
3445 stmt_vector_for_cost *cost_vec)
3446 {
3447 int i, j;
3448 slp_tree child;
3449
3450 /* Assume we can code-generate all invariants. */
3451 if (!node
3452 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
3453 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
3454 return true;
3455
3456 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
3457 {
3458 if (dump_enabled_p ())
3459 dump_printf_loc (MSG_NOTE, vect_location,
3460 "Failed cyclic SLP reference in %p", node);
3461 return false;
3462 }
3463 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
3464
3465 /* If we already analyzed the exact same set of scalar stmts we're done.
3466 We share the generated vector stmts for those. */
3467 if (visited_set.add (node))
3468 return true;
3469 visited_vec.safe_push (node);
3470
3471 bool res = true;
3472 unsigned visited_rec_start = visited_vec.length ();
3473 unsigned cost_vec_rec_start = cost_vec->length ();
3474 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3475 {
3476 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3477 visited_set, visited_vec,
3478 cost_vec);
3479 if (!res)
3480 break;
3481 }
3482
3483 if (res)
3484 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3485 cost_vec);
3486 /* If analysis failed we have to pop all recursive visited nodes
3487 plus ourselves. */
3488 if (!res)
3489 {
3490 while (visited_vec.length () >= visited_rec_start)
3491 visited_set.remove (visited_vec.pop ());
3492 cost_vec->truncate (cost_vec_rec_start);
3493 }
3494
3495 /* When the node can be vectorized, cost the invariant nodes it references.
3496 This is not done in DFS order to allow the referring node's
3497 vectorizable_* calls to nail down the invariant node's vector type
3498 and possibly unshare it if it needs a different vector type than
3499 other referrers. */
3500 if (res)
3501 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3502 if (child
3503 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
3504 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3505 /* Perform usual caching, note code-generation still
3506 code-gens these nodes multiple times but we expect
3507 to CSE them later. */
3508 && !visited_set.add (child))
3509 {
3510 visited_vec.safe_push (child);
3511 /* ??? After auditing more code paths make a "default"
3512 and push the vector type from NODE to all children
3513 if it is not already set. */
3514 /* Compute the number of vectors to be generated. */
3515 tree vector_type = SLP_TREE_VECTYPE (child);
3516 if (!vector_type)
3517 {
3518 /* For shifts with a scalar argument we don't need
3519 to cost or code-generate anything.
3520 ??? Represent this more explicitly. */
3521 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3522 == shift_vec_info_type)
3523 && j == 1);
3524 continue;
3525 }
3526 unsigned group_size = SLP_TREE_LANES (child);
3527 poly_uint64 vf = 1;
3528 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3529 vf = loop_vinfo->vectorization_factor;
3530 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3531 = vect_get_num_vectors (vf * group_size, vector_type);
3532 /* And cost them. */
3533 vect_prologue_cost_for_slp (child, cost_vec);
3534 }
3535
3536 /* If this node or any of its children can't be vectorized, try pruning
3537 the tree here rather than felling the whole thing. */
3538 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3539 {
3540 /* We'll need to revisit this for invariant costing and number
3541 of vectorized stmt setting. */
3542 res = true;
3543 }
3544
3545 return res;
3546 }
3547
3548
3549 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3550 region and that can be vectorized using vectorizable_live_operation
3551 with STMT_VINFO_LIVE_P. Live operations we cannot handle will cause the
3552 scalar code computing them to be retained. */
3553
3554 static void
3555 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3556 slp_instance instance,
3557 stmt_vector_for_cost *cost_vec,
3558 hash_set<stmt_vec_info> &svisited,
3559 hash_set<slp_tree> &visited)
3560 {
3561 if (visited.add (node))
3562 return;
3563
3564 unsigned i;
3565 stmt_vec_info stmt_info;
3566 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
3567 bool all_visited = true;
3568 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3569 {
3570 if (svisited.contains (stmt_info))
3571 continue;
3572 all_visited = false;
3573 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3574 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
3575 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
3576 /* Only the pattern root stmt computes the original scalar value. */
3577 continue;
3578 bool mark_visited = true;
3579 gimple *orig_stmt = orig_stmt_info->stmt;
3580 ssa_op_iter op_iter;
3581 def_operand_p def_p;
3582 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3583 {
3584 imm_use_iterator use_iter;
3585 gimple *use_stmt;
3586 stmt_vec_info use_stmt_info;
3587 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3588 if (!is_gimple_debug (use_stmt))
3589 {
3590 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
3591 if (!use_stmt_info
3592 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3593 {
3594 STMT_VINFO_LIVE_P (stmt_info) = true;
3595 if (vectorizable_live_operation (bb_vinfo, stmt_info,
3596 NULL, node, instance, i,
3597 false, cost_vec))
3598 /* ??? So we know we can vectorize the live stmt
3599 from one SLP node. If we cannot do so consistently
3600 from all or from none, we'd have to record which
3601 SLP node (and lane) we want to use for the live
3602 operation. So make sure we can code-generate
3603 from all nodes. */
3604 mark_visited = false;
3605 else
3606 STMT_VINFO_LIVE_P (stmt_info) = false;
3607 BREAK_FROM_IMM_USE_STMT (use_iter);
3608 }
3609 }
3610 /* We have to verify whether we can insert the lane extract
3611 before all uses. The following is a conservative approximation.
3612 We cannot put this into vectorizable_live_operation because
3613 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3614 doesn't work.
3615 Note that while emitting code for loads at the first load should
3616 make this a non-problem, leafs we construct from scalars are
3617 vectorized after the last scalar def.
3618 ??? If we'd actually compute the insert location during
3619 analysis we could use sth less conservative than the last
3620 scalar stmt in the node for the dominance check. */
3621 /* ??? What remains is "live" uses in vector CTORs in the same
3622 SLP graph which is where those uses can end up code-generated
3623 right after their definition instead of close to their original
3624 use. But that would restrict us to code-generate lane-extracts
3625 from the latest stmt in a node. So we compensate for this
3626 during code-generation, simply not replacing uses for those
3627 hopefully rare cases. */
3628 if (STMT_VINFO_LIVE_P (stmt_info))
3629 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3630 if (!is_gimple_debug (use_stmt)
3631 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
3632 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3633 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
3634 {
3635 if (dump_enabled_p ())
3636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3637 "Cannot determine insertion place for "
3638 "lane extract\n");
3639 STMT_VINFO_LIVE_P (stmt_info) = false;
3640 mark_visited = true;
3641 }
3642 }
3643 if (mark_visited)
3644 svisited.add (stmt_info);
3645 }
3646 if (all_visited)
3647 return;
3648
3649 slp_tree child;
3650 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3651 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3652 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
3653 cost_vec, svisited, visited);
3654 }
3655
3656 /* Analyze statements in SLP instances of VINFO. Return true if the
3657 operations are supported. */
3658
3659 bool
3660 vect_slp_analyze_operations (vec_info *vinfo)
3661 {
3662 slp_instance instance;
3663 int i;
3664
3665 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3666
3667 hash_set<slp_tree> visited;
3668 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
3669 {
3670 auto_vec<slp_tree> visited_vec;
3671 stmt_vector_for_cost cost_vec;
3672 cost_vec.create (2);
3673 if (is_a <bb_vec_info> (vinfo))
3674 vect_location = instance->location ();
3675 if (!vect_slp_analyze_node_operations (vinfo,
3676 SLP_INSTANCE_TREE (instance),
3677 instance, visited, visited_vec,
3678 &cost_vec)
3679 /* Instances with a root stmt require vectorized defs for the
3680 SLP tree root. */
3681 || (SLP_INSTANCE_ROOT_STMT (instance)
3682 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
3683 != vect_internal_def)))
3684 {
3685 slp_tree node = SLP_INSTANCE_TREE (instance);
3686 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3687 if (dump_enabled_p ())
3688 dump_printf_loc (MSG_NOTE, vect_location,
3689 "removing SLP instance operations starting from: %G",
3690 stmt_info->stmt);
3691 vect_free_slp_instance (instance);
3692 vinfo->slp_instances.ordered_remove (i);
3693 cost_vec.release ();
3694 while (!visited_vec.is_empty ())
3695 visited.remove (visited_vec.pop ());
3696 }
3697 else
3698 {
3699 i++;
3700
3701 /* For BB vectorization remember the SLP graph entry
3702 cost for later. */
3703 if (is_a <bb_vec_info> (vinfo))
3704 instance->cost_vec = cost_vec;
3705 else
3706 {
3707 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
3708 cost_vec.release ();
3709 }
3710 }
3711 }
3712
3713 /* Compute vectorizable live stmts. */
3714 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3715 {
3716 hash_set<stmt_vec_info> svisited;
3717 hash_set<slp_tree> visited;
3718 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
3719 {
3720 vect_location = instance->location ();
3721 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
3722 instance, &instance->cost_vec, svisited,
3723 visited);
3724 }
3725 }
3726
3727 return !vinfo->slp_instances.is_empty ();
3728 }
3729
3730 /* Get the SLP instance leader from INSTANCE_LEADER, transitively
3731 closing any leader chain along the way. */
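/* For example, if INSTANCE_LEADER currently maps A -> B, B -> C and
C -> C, calling this with A returns C and rewrites the entries for
A and B to point directly at C (instance names purely illustrative). */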
3732
3733 static slp_instance
3734 get_ultimate_leader (slp_instance instance,
3735 hash_map<slp_instance, slp_instance> &instance_leader)
3736 {
3737 auto_vec<slp_instance *, 8> chain;
3738 slp_instance *tem;
3739 while (*(tem = instance_leader.get (instance)) != instance)
3740 {
3741 chain.safe_push (tem);
3742 instance = *tem;
3743 }
3744 while (!chain.is_empty ())
3745 *chain.pop () = instance;
3746 return instance;
3747 }
3748
3749 /* Worker of vect_bb_partition_graph, recurse on NODE. */
3750
3751 static void
3752 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
3753 slp_instance instance, slp_tree node,
3754 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
3755 hash_map<slp_instance, slp_instance> &instance_leader,
3756 hash_set<slp_tree> &visited)
3757 {
3758 stmt_vec_info stmt_info;
3759 unsigned i;
3760
3761 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3762 {
3763 bool existed_p;
3764 slp_instance &stmt_instance
3765 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
3766 if (!existed_p)
3767 ;
3768 else if (stmt_instance != instance)
3769 {
3770 /* If we're running into a previously marked stmt make us the
3771 leader of the current ultimate leader. This keeps the
3772 leader chain acyclic and works even when the current instance
3773 connects two previously independent graph parts. */
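/* For instance, if instances I1 and I2 were so far independent and the
current instance I3 shares a stmt with each of them, visiting those
stmts chains I1's and I2's ultimate leaders to I3, merging all three
into one subgraph (instance names illustrative). */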
3774 slp_instance stmt_leader
3775 = get_ultimate_leader (stmt_instance, instance_leader);
3776 if (stmt_leader != instance)
3777 instance_leader.put (stmt_leader, instance);
3778 }
3779 stmt_instance = instance;
3780 }
3781
3782 if (visited.add (node))
3783 return;
3784
3785 slp_tree child;
3786 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3787 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3788 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
3789 instance_leader, visited);
3790 }
3791
3792 /* Partition the SLP graph into pieces that can be costed independently. */
3793
3794 static void
3795 vect_bb_partition_graph (bb_vec_info bb_vinfo)
3796 {
3797 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3798
3799 /* First walk the SLP graph assigning each involved scalar stmt a
3800 corresponding SLP graph entry and upon visiting a previously
3801 marked stmt, make the stmt's leader the current SLP graph entry. */
3802 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
3803 hash_map<slp_instance, slp_instance> instance_leader;
3804 hash_set<slp_tree> visited;
3805 slp_instance instance;
3806 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3807 {
3808 instance_leader.put (instance, instance);
3809 vect_bb_partition_graph_r (bb_vinfo,
3810 instance, SLP_INSTANCE_TREE (instance),
3811 stmt_to_instance, instance_leader,
3812 visited);
3813 }
3814
3815 /* Then collect entries to each independent subgraph. */
3816 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3817 {
3818 slp_instance leader = get_ultimate_leader (instance, instance_leader);
3819 leader->subgraph_entries.safe_push (instance);
3820 if (dump_enabled_p ()
3821 && leader != instance)
3822 dump_printf_loc (MSG_NOTE, vect_location,
3823 "instance %p is leader of %p\n",
3824 leader, instance);
3825 }
3826 }
3827
3828 /* Compute the scalar cost of the SLP node NODE and its children
3829 and record it in COST_VEC. Do not account defs that are marked in LIFE
3830 and update LIFE according to uses of NODE. */
3831
3832 static void
3833 vect_bb_slp_scalar_cost (vec_info *vinfo,
3834 slp_tree node, vec<bool, va_heap> *life,
3835 stmt_vector_for_cost *cost_vec,
3836 hash_set<slp_tree> &visited)
3837 {
3838 unsigned i;
3839 stmt_vec_info stmt_info;
3840 slp_tree child;
3841
3842 if (visited.add (node))
3843 return;
3844
3845 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3846 {
3847 ssa_op_iter op_iter;
3848 def_operand_p def_p;
3849
3850 if ((*life)[i])
3851 continue;
3852
3853 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3854 gimple *orig_stmt = orig_stmt_info->stmt;
3855
3856 /* If there is a non-vectorized use of the defs then the scalar
3857 stmt is kept live in which case we do not account it or any
3858 required defs in the SLP children in the scalar cost. This
3859 way we make the vectorization more costly when compared to
3860 the scalar cost. */
3861 if (!STMT_VINFO_LIVE_P (stmt_info))
3862 {
3863 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3864 {
3865 imm_use_iterator use_iter;
3866 gimple *use_stmt;
3867 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3868 if (!is_gimple_debug (use_stmt))
3869 {
3870 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3871 if (!use_stmt_info
3872 || !PURE_SLP_STMT
3873 (vect_stmt_to_vectorize (use_stmt_info)))
3874 {
3875 (*life)[i] = true;
3876 BREAK_FROM_IMM_USE_STMT (use_iter);
3877 }
3878 }
3879 }
3880 if ((*life)[i])
3881 continue;
3882 }
3883
3884 /* Count scalar stmts only once. */
3885 if (gimple_visited_p (orig_stmt))
3886 continue;
3887 gimple_set_visited (orig_stmt, true);
3888
3889 vect_cost_for_stmt kind;
3890 if (STMT_VINFO_DATA_REF (orig_stmt_info))
3891 {
3892 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
3893 kind = scalar_load;
3894 else
3895 kind = scalar_store;
3896 }
3897 else if (vect_nop_conversion_p (orig_stmt_info))
3898 continue;
3899 else
3900 kind = scalar_stmt;
3901 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
3902 SLP_TREE_VECTYPE (node), 0, vect_body);
3903 }
3904
3905 auto_vec<bool, 20> subtree_life;
3906 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3907 {
3908 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3909 {
3910 /* Do not directly pass LIFE to the recursive call, copy it to
3911 confine changes in the callee to the current child/subtree. */
3912 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3913 {
3914 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
3915 for (unsigned j = 0;
3916 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
3917 {
3918 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
3919 if (perm.first == i)
3920 subtree_life[perm.second] = (*life)[j];
3921 }
3922 }
3923 else
3924 {
3925 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
3926 subtree_life.safe_splice (*life);
3927 }
3928 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
3929 visited);
3930 subtree_life.truncate (0);
3931 }
3932 }
3933 }
3934
3935 /* Check if vectorization of the basic block is profitable for the
3936 subgraph denoted by SLP_INSTANCES. */
3937
3938 static bool
3939 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
3940 vec<slp_instance> slp_instances)
3941 {
3942 slp_instance instance;
3943 int i;
3944 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3945 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3946
3947 void *vect_target_cost_data = init_cost (NULL);
3948
3949 /* Calculate scalar cost and sum the cost for the vector stmts
3950 previously collected. */
3951 stmt_vector_for_cost scalar_costs;
3952 scalar_costs.create (0);
3953 hash_set<slp_tree> visited;
3954 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3955 {
3956 auto_vec<bool, 20> life;
3957 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
3958 true);
3959 vect_bb_slp_scalar_cost (bb_vinfo,
3960 SLP_INSTANCE_TREE (instance),
3961 &life, &scalar_costs, visited);
3962 add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
3963 instance->cost_vec.release ();
3964 }
3965 /* Unset visited flag. */
3966 stmt_info_for_cost *si;
3967 FOR_EACH_VEC_ELT (scalar_costs, i, si)
3968 gimple_set_visited (si->stmt_info->stmt, false);
3969
3970 void *scalar_target_cost_data = init_cost (NULL);
3971 add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
3972 scalar_costs.release ();
3973 unsigned dummy;
3974 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
3975 destroy_cost_data (scalar_target_cost_data);
3976
3977 /* Complete the target-specific vector cost calculation. */
3978 finish_cost (vect_target_cost_data, &vec_prologue_cost,
3979 &vec_inside_cost, &vec_epilogue_cost);
3980 destroy_cost_data (vect_target_cost_data);
3981
3982 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3983
3984 if (dump_enabled_p ())
3985 {
3986 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3987 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3988 vec_inside_cost);
3989 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3990 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3991 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3992 }
3993
3994 /* Vectorization is profitable if its cost is less than or equal to the
3995 cost of the scalar version. Note that we err on the vector side for
3996 equal cost because the cost estimate is otherwise quite pessimistic
3997 (constant uses are free on the scalar side but cost a load on the
3998 vector side, for example). */
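/* As an illustrative example, vec_prologue_cost = 2, vec_epilogue_cost = 0
and vec_inside_cost = 6 pass the check below against scalar_cost = 8
(8 <= 8), while scalar_cost = 7 would reject the subgraph
(numbers purely illustrative). */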
3999 if (vec_outside_cost + vec_inside_cost > scalar_cost)
4000 return false;
4001
4002 return true;
4003 }
4004
4005 /* Find any vectorizable constructors and add them to the grouped_store
4006 array. */
4007
4008 static void
4009 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
4010 {
4011 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
4012 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
4013 !gsi_end_p (gsi); gsi_next (&gsi))
4014 {
4015 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
4016 if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
4017 continue;
4018
4019 tree rhs = gimple_assign_rhs1 (assign);
4020 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
4021 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
4022 CONSTRUCTOR_NELTS (rhs))
4023 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
4024 || uniform_vector_p (rhs))
4025 continue;
4026
4027 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
4028 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
4029 }
4030 }
4031
4032 /* Check if the region described by BB_VINFO can be vectorized, returning
4033 true if so. When returning false, set FATAL to true if the same failure
4034 would prevent vectorization at other vector sizes, false if it is still
4035 worth trying other sizes. N_STMTS is the number of statements in the
4036 region. */
4037
4038 static bool
4039 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
4040 vec<int> *dataref_groups)
4041 {
4042 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4043
4044 slp_instance instance;
4045 int i;
4046 poly_uint64 min_vf = 2;
4047
4048 /* The first group of checks is independent of the vector size. */
4049 fatal = true;
4050
4051 /* Analyze the data references. */
4052
4053 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
4054 {
4055 if (dump_enabled_p ())
4056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4057 "not vectorized: unhandled data-ref in basic "
4058 "block.\n");
4059 return false;
4060 }
4061
4062 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
4063 {
4064 if (dump_enabled_p ())
4065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4066 "not vectorized: unhandled data access in "
4067 "basic block.\n");
4068 return false;
4069 }
4070
4071 vect_slp_check_for_constructors (bb_vinfo);
4072
4073 /* If there are no grouped stores and no constructors in the region
4074 there is no need to continue with pattern recog as vect_analyze_slp
4075 will fail anyway. */
4076 if (bb_vinfo->grouped_stores.is_empty ())
4077 {
4078 if (dump_enabled_p ())
4079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4080 "not vectorized: no grouped stores in "
4081 "basic block.\n");
4082 return false;
4083 }
4084
4085 /* The rest of the analysis below depends on the vector size in some way, so failures are no longer fatal. */
4086 fatal = false;
4087
4088 vect_pattern_recog (bb_vinfo);
4089
4090 /* Check the SLP opportunities in the basic block, analyze and build SLP
4091 trees. */
4092 if (!vect_analyze_slp (bb_vinfo, n_stmts))
4093 {
4094 if (dump_enabled_p ())
4095 {
4096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4097 "Failed to SLP the basic block.\n");
4098 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4099 "not vectorized: failed to find SLP opportunities "
4100 "in basic block.\n");
4101 }
4102 return false;
4103 }
4104
4105 /* Optimize permutations. */
4106 vect_optimize_slp (bb_vinfo);
4107
4108 vect_record_base_alignments (bb_vinfo);
4109
4110 /* Analyze and verify the alignment of data references and the
4111 dependence in the SLP instances. */
4112 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
4113 {
4114 vect_location = instance->location ();
4115 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
4116 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
4117 {
4118 slp_tree node = SLP_INSTANCE_TREE (instance);
4119 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4120 if (dump_enabled_p ())
4121 dump_printf_loc (MSG_NOTE, vect_location,
4122 "removing SLP instance operations starting from: %G",
4123 stmt_info->stmt);
4124 vect_free_slp_instance (instance);
4125 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
4126 continue;
4127 }
4128
4129 /* Mark all the statements that we want to vectorize as pure SLP and
4130 relevant. */
4131 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4132 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
4133 if (SLP_INSTANCE_ROOT_STMT (instance))
4134 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
4135
4136 i++;
4137 }
4138 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
4139 return false;
4140
4141 if (!vect_slp_analyze_operations (bb_vinfo))
4142 {
4143 if (dump_enabled_p ())
4144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4145 "not vectorized: bad operation in basic block.\n");
4146 return false;
4147 }
4148
4149 vect_bb_partition_graph (bb_vinfo);
4150
4151 return true;
4152 }
4153
4154 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4155 basic blocks in BBS, returning true on success.
4156 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4157
4158 static bool
4159 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
4160 vec<int> *dataref_groups, unsigned int n_stmts)
4161 {
4162 bb_vec_info bb_vinfo;
4163 auto_vector_modes vector_modes;
4164
4165 /* Autodetect first vector size we try. */
4166 machine_mode next_vector_mode = VOIDmode;
4167 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
4168 unsigned int mode_i = 0;
4169
4170 vec_info_shared shared;
4171
4172 machine_mode autodetected_vector_mode = VOIDmode;
4173 while (1)
4174 {
4175 bool vectorized = false;
4176 bool fatal = false;
4177 bb_vinfo = new _bb_vec_info (bbs, &shared);
4178
4179 bool first_time_p = shared.datarefs.is_empty ();
4180 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
4181 if (first_time_p)
4182 bb_vinfo->shared->save_datarefs ();
4183 else
4184 bb_vinfo->shared->check_datarefs ();
4185 bb_vinfo->vector_mode = next_vector_mode;
4186
4187 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
4188 && dbg_cnt (vect_slp))
4189 {
4190 if (dump_enabled_p ())
4191 {
4192 dump_printf_loc (MSG_NOTE, vect_location,
4193 "***** Analysis succeeded with vector mode"
4194 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4195 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4196 }
4197
4198 bb_vinfo->shared->check_datarefs ();
4199
4200 unsigned i;
4201 slp_instance instance;
4202 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4203 {
4204 if (instance->subgraph_entries.is_empty ())
4205 continue;
4206
4207 vect_location = instance->location ();
4208 if (!unlimited_cost_model (NULL)
4209 && !vect_bb_vectorization_profitable_p
4210 (bb_vinfo, instance->subgraph_entries))
4211 {
4212 if (dump_enabled_p ())
4213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4214 "not vectorized: vectorization is not "
4215 "profitable.\n");
4216 continue;
4217 }
4218
4219 if (!vectorized && dump_enabled_p ())
4220 dump_printf_loc (MSG_NOTE, vect_location,
4221 "Basic block will be vectorized "
4222 "using SLP\n");
4223 vectorized = true;
4224
4225 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4226
4227 unsigned HOST_WIDE_INT bytes;
4228 if (dump_enabled_p ())
4229 {
4230 if (GET_MODE_SIZE
4231 (bb_vinfo->vector_mode).is_constant (&bytes))
4232 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4233 "basic block part vectorized using %wu "
4234 "byte vectors\n", bytes);
4235 else
4236 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4237 "basic block part vectorized using "
4238 "variable length vectors\n");
4239 }
4240 }
4241 }
4242 else
4243 {
4244 if (dump_enabled_p ())
4245 dump_printf_loc (MSG_NOTE, vect_location,
4246 "***** Analysis failed with vector mode %s\n",
4247 GET_MODE_NAME (bb_vinfo->vector_mode));
4248 }
4249
4250 if (mode_i == 0)
4251 autodetected_vector_mode = bb_vinfo->vector_mode;
4252
4253 if (!fatal)
4254 while (mode_i < vector_modes.length ()
4255 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
4256 {
4257 if (dump_enabled_p ())
4258 dump_printf_loc (MSG_NOTE, vect_location,
4259 "***** The result for vector mode %s would"
4260 " be the same\n",
4261 GET_MODE_NAME (vector_modes[mode_i]));
4262 mode_i += 1;
4263 }
4264
4265 delete bb_vinfo;
4266
4267 if (mode_i < vector_modes.length ()
4268 && VECTOR_MODE_P (autodetected_vector_mode)
4269 && (related_vector_mode (vector_modes[mode_i],
4270 GET_MODE_INNER (autodetected_vector_mode))
4271 == autodetected_vector_mode)
4272 && (related_vector_mode (autodetected_vector_mode,
4273 GET_MODE_INNER (vector_modes[mode_i]))
4274 == vector_modes[mode_i]))
4275 {
4276 if (dump_enabled_p ())
4277 dump_printf_loc (MSG_NOTE, vect_location,
4278 "***** Skipping vector mode %s, which would"
4279 " repeat the analysis for %s\n",
4280 GET_MODE_NAME (vector_modes[mode_i]),
4281 GET_MODE_NAME (autodetected_vector_mode));
4282 mode_i += 1;
4283 }
4284
4285 if (vectorized
4286 || mode_i == vector_modes.length ()
4287 || autodetected_vector_mode == VOIDmode
4288 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4289 vector sizes will fail do not bother iterating. */
4290 || fatal)
4291 return vectorized;
4292
4293 /* Try the next vector mode in the list. */
4294 next_vector_mode = vector_modes[mode_i++];
4295 if (dump_enabled_p ())
4296 dump_printf_loc (MSG_NOTE, vect_location,
4297 "***** Re-trying analysis with vector mode %s\n",
4298 GET_MODE_NAME (next_vector_mode));
4299 }
4300 }
4301
4302
4303 /* Main entry for the BB vectorizer. Analyze and transform the basic
4304 blocks in BBS, returning true if anything in them was vectorized. */
4305
4306 static bool
4307 vect_slp_bbs (vec<basic_block> bbs)
4308 {
4309 vec<data_reference_p> datarefs = vNULL;
4310 auto_vec<int> dataref_groups;
4311 int insns = 0;
4312 int current_group = 0;
4313
4314 for (unsigned i = 0; i < bbs.length (); i++)
4315 {
4316 basic_block bb = bbs[i];
4317 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
4318 gsi_next (&gsi))
4319 {
4320 gimple *stmt = gsi_stmt (gsi);
4321 if (is_gimple_debug (stmt))
4322 continue;
4323
4324 insns++;
4325
4326 if (gimple_location (stmt) != UNKNOWN_LOCATION)
4327 vect_location = stmt;
4328
4329 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
4330 &dataref_groups, current_group))
4331 ++current_group;
4332 }
4333 }
4334
4335 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
4336 }
4337
4338 /* Main entry for the BB vectorizer. Analyze and transform BB, returning
4339 true if anything in the basic block was vectorized. */
4340
4341 bool
4342 vect_slp_bb (basic_block bb)
4343 {
4344 auto_vec<basic_block> bbs;
4345 bbs.safe_push (bb);
4346 return vect_slp_bbs (bbs);
4347 }
4348
4349 /* Main entry for the BB vectorizer applied to the function FUN. Analyze and
4350 transform its basic blocks, returning true if anything was vectorized. */
4351
4352 bool
4353 vect_slp_function (function *fun)
4354 {
4355 bool r = false;
4356 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
4357 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
4358
4359 /* For the moment split the function into pieces to avoid making
4360 the iteration on the vector mode moot. Split at points we know
4361 we do not handle well, namely CFG merges (SLP discovery doesn't
4362 handle non-loop-header PHIs) and loop exits. Since pattern
4363 recog requires reverse iteration to visit uses before defs,
4364 simply chop the RPO into pieces. */
4365 auto_vec<basic_block> bbs;
4366 for (unsigned i = 0; i < n; i++)
4367 {
4368 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
4369 bool split = false;
4370
4371 /* Split when a BB is not dominated by the first block. */
4372 if (!bbs.is_empty ()
4373 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
4374 {
4375 if (dump_enabled_p ())
4376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4377 "splitting region at dominance boundary bb%d\n",
4378 bb->index);
4379 split = true;
4380 }
4381 /* Split when the loop determined by the first block
4382 is exited. This is because we eventually insert
4383 invariants at region begin. */
4384 else if (!bbs.is_empty ()
4385 && bbs[0]->loop_father != bb->loop_father
4386 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
4387 {
4388 if (dump_enabled_p ())
4389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4390 "splitting region at loop %d exit at bb%d\n",
4391 bbs[0]->loop_father->num, bb->index);
4392 split = true;
4393 }
4394
4395 if (split && !bbs.is_empty ())
4396 {
4397 r |= vect_slp_bbs (bbs);
4398 bbs.truncate (0);
4399 bbs.quick_push (bb);
4400 }
4401 else
4402 bbs.safe_push (bb);
4403
4404 /* When a stmt ends this block and defines a value, inserting
4405 after it for a vector containing its definition would require
4406 inserting on edges. Avoid this for now. */
4407 if (gimple *last = last_stmt (bb))
4408 if (gimple_get_lhs (last)
4409 && is_ctrl_altering_stmt (last))
4410 {
4411 if (dump_enabled_p ())
4412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4413 "splitting region at control altering "
4414 "definition %G", last);
4415 r |= vect_slp_bbs (bbs);
4416 bbs.truncate (0);
4417 }
4418 }
4419
4420 if (!bbs.is_empty ())
4421 r |= vect_slp_bbs (bbs);
4422
4423 free (rpo);
4424
4425 return r;
4426 }
4427
4428 /* Build a variable-length vector in which the elements in ELTS are repeated
4429 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4430 RESULTS and add any new instructions to SEQ.
4431
4432 The approach we use is:
4433
4434 (1) Find a vector mode VM with integer elements of mode IM.
4435
4436 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4437 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4438 from small vectors to IM.
4439
4440 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4441
4442 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4443 correct byte contents.
4444
4445 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4446
4447 We try to find the largest IM for which this sequence works, in order
4448 to cut down on the number of interleaves. */
4449
4450 void
4451 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
4452 vec<tree> elts, unsigned int nresults,
4453 vec<tree> &results)
4454 {
4455 unsigned int nelts = elts.length ();
4456 tree element_type = TREE_TYPE (vector_type);
4457
4458 /* (1) Find a vector mode VM with integer elements of mode IM. */
4459 unsigned int nvectors = 1;
4460 tree new_vector_type;
4461 tree permutes[2];
4462 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
4463 &nvectors, &new_vector_type,
4464 permutes))
4465 gcc_unreachable ();
4466
4467 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4468 unsigned int partial_nelts = nelts / nvectors;
4469 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
4470
4471 tree_vector_builder partial_elts;
4472 auto_vec<tree, 32> pieces (nvectors * 2);
4473 pieces.quick_grow (nvectors * 2);
4474 for (unsigned int i = 0; i < nvectors; ++i)
4475 {
4476 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4477 ELTS' has mode IM. */
4478 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
4479 for (unsigned int j = 0; j < partial_nelts; ++j)
4480 partial_elts.quick_push (elts[i * partial_nelts + j]);
4481 tree t = gimple_build_vector (seq, &partial_elts);
4482 t = gimple_build (seq, VIEW_CONVERT_EXPR,
4483 TREE_TYPE (new_vector_type), t);
4484
4485 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4486 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
4487 }
4488
4489 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4490 correct byte contents.
4491
4492 We need to repeat the following operation log2(nvectors) times:
4493
4494 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4495 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4496
4497 However, if each input repeats every N elements and the VF is
4498 a multiple of N * 2, the HI result is the same as the LO. */
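/* Illustration (ignoring the noutputs_bound early-out): with
nvectors == 4 the first pass interleaves pieces[0..3] into
pieces[4..7], the second pass interleaves pieces[4..7] back into
pieces[0..3], and IN_START then indexes the final results. */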
4499 unsigned int in_start = 0;
4500 unsigned int out_start = nvectors;
4501 unsigned int hi_start = nvectors / 2;
4502 /* A bound on the number of outputs needed to produce NRESULTS results
4503 in the final iteration. */
4504 unsigned int noutputs_bound = nvectors * nresults;
4505 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
4506 {
4507 noutputs_bound /= 2;
4508 unsigned int limit = MIN (noutputs_bound, nvectors);
4509 for (unsigned int i = 0; i < limit; ++i)
4510 {
4511 if ((i & 1) != 0
4512 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
4513 2 * in_repeat))
4514 {
4515 pieces[out_start + i] = pieces[out_start + i - 1];
4516 continue;
4517 }
4518
4519 tree output = make_ssa_name (new_vector_type);
4520 tree input1 = pieces[in_start + (i / 2)];
4521 tree input2 = pieces[in_start + (i / 2) + hi_start];
4522 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
4523 input1, input2,
4524 permutes[i & 1]);
4525 gimple_seq_add_stmt (seq, stmt);
4526 pieces[out_start + i] = output;
4527 }
4528 std::swap (in_start, out_start);
4529 }
4530
4531 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4532 results.reserve (nresults);
4533 for (unsigned int i = 0; i < nresults; ++i)
4534 if (i < nvectors)
4535 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
4536 pieces[in_start + i]));
4537 else
4538 results.quick_push (results[i - nvectors]);
4539 }
4540
4541
4542 /* For constant and loop invariant defs in OP_NODE this function creates
4543 vector defs that will be used in the vectorized stmts and stores them
4544 to SLP_TREE_VEC_DEFS of OP_NODE. */
4545
4546 static void
4547 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
4548 {
4549 unsigned HOST_WIDE_INT nunits;
4550 tree vec_cst;
4551 unsigned j, number_of_places_left_in_vector;
4552 tree vector_type;
4553 tree vop;
4554 int group_size = op_node->ops.length ();
4555 unsigned int vec_num, i;
4556 unsigned number_of_copies = 1;
4557 bool constant_p;
4558 gimple_seq ctor_seq = NULL;
4559 auto_vec<tree, 16> permute_results;
4560
4561 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
4562 vector_type = SLP_TREE_VECTYPE (op_node);
4563
4564 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
4565 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
4566 auto_vec<tree> voprnds (number_of_vectors);
4567
4568 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4569 created vectors. It is greater than 1 if unrolling is performed.
4570
4571 For example, we have two scalar operands, s1 and s2 (e.g., group of
4572 strided accesses of size two), while NUNITS is four (i.e., four scalars
4573 of this type can be packed in a vector). The output vector will contain
4574 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4575 will be 2).
4576
4577 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4578 containing the operands.
4579
4580 For example, NUNITS is four as before, and the group size is 8
4581 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4582 {s5, s6, s7, s8}. */
4583
4584 /* When using duplicate_and_interleave, we just need one element for
4585 each scalar statement. */
4586 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4587 nunits = group_size;
4588
4589 number_of_copies = nunits * number_of_vectors / group_size;
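/* For the examples above this yields two copies with nunits == 4,
number_of_vectors == 1 and group_size == 2, and a single copy with
nunits == 4, number_of_vectors == 2 and group_size == 8
(number_of_vectors values purely illustrative). */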
4590
4591 number_of_places_left_in_vector = nunits;
4592 constant_p = true;
4593 tree_vector_builder elts (vector_type, nunits, 1);
4594 elts.quick_grow (nunits);
4595 stmt_vec_info insert_after = NULL;
4596 for (j = 0; j < number_of_copies; j++)
4597 {
4598 tree op;
4599 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
4600 {
4601 /* Create 'vect_ = {op0,op1,...,opn}'. */
4602 number_of_places_left_in_vector--;
4603 tree orig_op = op;
4604 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
4605 {
4606 if (CONSTANT_CLASS_P (op))
4607 {
4608 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4609 {
4610 /* Can't use VIEW_CONVERT_EXPR for booleans because
4611 of possibly different sizes of scalar value and
4612 vector element. */
4613 if (integer_zerop (op))
4614 op = build_int_cst (TREE_TYPE (vector_type), 0);
4615 else if (integer_onep (op))
4616 op = build_all_ones_cst (TREE_TYPE (vector_type));
4617 else
4618 gcc_unreachable ();
4619 }
4620 else
4621 op = fold_unary (VIEW_CONVERT_EXPR,
4622 TREE_TYPE (vector_type), op);
4623 gcc_assert (op && CONSTANT_CLASS_P (op));
4624 }
4625 else
4626 {
4627 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
4628 gimple *init_stmt;
4629 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4630 {
4631 tree true_val
4632 = build_all_ones_cst (TREE_TYPE (vector_type));
4633 tree false_val
4634 = build_zero_cst (TREE_TYPE (vector_type));
4635 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
4636 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
4637 op, true_val,
4638 false_val);
4639 }
4640 else
4641 {
4642 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
4643 op);
4644 init_stmt
4645 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
4646 op);
4647 }
4648 gimple_seq_add_stmt (&ctor_seq, init_stmt);
4649 op = new_temp;
4650 }
4651 }
4652 elts[number_of_places_left_in_vector] = op;
4653 if (!CONSTANT_CLASS_P (op))
4654 constant_p = false;
4655 /* For BB vectorization we have to compute an insert location
4656 when a def is inside the analyzed region since we cannot
4657 simply insert at the BB start in this case. */
4658 stmt_vec_info opdef;
4659 if (TREE_CODE (orig_op) == SSA_NAME
4660 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
4661 && is_a <bb_vec_info> (vinfo)
4662 && (opdef = vinfo->lookup_def (orig_op)))
4663 {
4664 if (!insert_after)
4665 insert_after = opdef;
4666 else
4667 insert_after = get_later_stmt (insert_after, opdef);
4668 }
4669
4670 if (number_of_places_left_in_vector == 0)
4671 {
4672 if (constant_p
4673 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4674 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4675 vec_cst = gimple_build_vector (&ctor_seq, &elts);
4676 else
4677 {
4678 if (permute_results.is_empty ())
4679 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
4680 elts, number_of_vectors,
4681 permute_results);
4682 vec_cst = permute_results[number_of_vectors - j - 1];
4683 }
4684 if (!gimple_seq_empty_p (ctor_seq))
4685 {
4686 if (insert_after)
4687 {
4688 gimple_stmt_iterator gsi;
4689 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
4690 {
4691 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
4692 gsi_insert_seq_before (&gsi, ctor_seq,
4693 GSI_CONTINUE_LINKING);
4694 }
4695 else if (!stmt_ends_bb_p (insert_after->stmt))
4696 {
4697 gsi = gsi_for_stmt (insert_after->stmt);
4698 gsi_insert_seq_after (&gsi, ctor_seq,
4699 GSI_CONTINUE_LINKING);
4700 }
4701 else
4702 {
4703 /* When we want to insert after a def where the
4704 defining stmt throws then insert on the fallthru
4705 edge. */
4706 edge e = find_fallthru_edge
4707 (gimple_bb (insert_after->stmt)->succs);
4708 basic_block new_bb
4709 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
4710 gcc_assert (!new_bb);
4711 }
4712 }
4713 else
4714 vinfo->insert_seq_on_entry (NULL, ctor_seq);
4715 ctor_seq = NULL;
4716 }
4717 voprnds.quick_push (vec_cst);
4718 insert_after = NULL;
4719 number_of_places_left_in_vector = nunits;
4720 constant_p = true;
4721 elts.new_vector (vector_type, nunits, 1);
4722 elts.quick_grow (nunits);
4723 }
4724 }
4725 }
4726
4727 /* Since the vectors are created in the reverse order, we should invert
4728 them. */
4729 vec_num = voprnds.length ();
4730 for (j = vec_num; j != 0; j--)
4731 {
4732 vop = voprnds[j - 1];
4733 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4734 }
4735
4736 /* In case that VF is greater than the unrolling factor needed for the SLP
4737 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4738 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4739 to replicate the vectors. */
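/* E.g. with two defs {v0, v1} created above and number_of_vectors == 4
the loop below appends v0 and v1 once more, giving {v0, v1, v0, v1}
(illustrative). */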
4740 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
4741 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
4742 i++)
4743 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4744 }
4745
4746 /* Get the Ith vectorized definition from SLP_NODE. */
4747
4748 tree
4749 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
4750 {
4751 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
4752 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
4753 else
4754 return SLP_TREE_VEC_DEFS (slp_node)[i];
4755 }
4756
4757 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
4758
4759 void
4760 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
4761 {
4762 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
4763 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
4764 {
4765 unsigned j;
4766 gimple *vec_def_stmt;
4767 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
4768 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
4769 }
4770 else
4771 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
4772 }
4773
4774 /* Get N vectorized definitions for SLP_NODE. */
4775
4776 void
4777 vect_get_slp_defs (vec_info *,
4778 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
4779 {
4780 if (n == -1U)
4781 n = SLP_TREE_CHILDREN (slp_node).length ();
4782
4783 for (unsigned i = 0; i < n; ++i)
4784 {
4785 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
4786 vec<tree> vec_defs = vNULL;
4787 vect_get_slp_defs (child, &vec_defs);
4788 vec_oprnds->quick_push (vec_defs);
4789 }
4790 }
4791
4792 /* Generate vector permute statements from a list of loads in DR_CHAIN.
4793 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
4794 permute statements for the SLP node NODE. Store the number of vector
4795 permute instructions in *N_PERMS and the number of vector load
4796 instructions in *N_LOADS. */
4797
4798 bool
4799 vect_transform_slp_perm_load (vec_info *vinfo,
4800 slp_tree node, vec<tree> dr_chain,
4801 gimple_stmt_iterator *gsi, poly_uint64 vf,
4802 bool analyze_only, unsigned *n_perms,
4803 unsigned int *n_loads)
4804 {
4805 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4806 int vec_index = 0;
4807 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4808 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
4809 unsigned int mask_element;
4810 machine_mode mode;
4811
4812 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
4813 return false;
4814
4815 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
4816
4817 mode = TYPE_MODE (vectype);
4818 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4819
4820 /* Initialize the vect stmts of NODE to properly insert the generated
4821 stmts later. */
4822 if (! analyze_only)
4823 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
4824 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
4825 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
4826
4827 /* Generate permutation masks for every NODE. Number of masks for each NODE
4828 is equal to GROUP_SIZE.
4829 E.g., we have a group of three nodes with three loads from the same
4830 location in each node, and the vector size is 4. I.e., we have an
4831 a0b0c0a1b1c1... sequence and we need to create the following vectors:
4832 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
4833 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
4834 ...
4835
4836 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
4837 The last mask is illegal since we assume two operands for permute
4838 operation, and the mask element values can't be outside that range.
4839 Hence, the last mask must be converted into {2,5,5,5}.
4840 For the first two permutations we need the first and the second input
4841 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
4842 we need the second and the third vectors: {b1,c1,a2,b2} and
4843 {c2,a3,b3,c3}. */
4844
4845 int vect_stmts_counter = 0;
4846 unsigned int index = 0;
4847 int first_vec_index = -1;
4848 int second_vec_index = -1;
4849 bool noop_p = true;
4850 *n_perms = 0;
4851
4852 vec_perm_builder mask;
4853 unsigned int nelts_to_build;
4854 unsigned int nvectors_per_build;
4855 unsigned int in_nlanes;
4856 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
4857 && multiple_p (nunits, group_size));
4858 if (repeating_p)
4859 {
4860 /* A single vector contains a whole number of copies of the node, so:
4861 (a) all permutes can use the same mask; and
4862 (b) the permutes only need a single vector input. */
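/* E.g. with DR_GROUP_SIZE == group_size == 2, a load permutation of
{1, 0} and nunits == 8, the mask built below encodes as
{1, 0, 3, 2, 5, 4} and extrapolates to the repeating pattern that
swaps adjacent lanes across the whole vector (values illustrative). */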
4863 mask.new_vector (nunits, group_size, 3);
4864 nelts_to_build = mask.encoded_nelts ();
4865 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
4866 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
4867 }
4868 else
4869 {
4870 /* We need to construct a separate mask for each vector statement. */
4871 unsigned HOST_WIDE_INT const_nunits, const_vf;
4872 if (!nunits.is_constant (&const_nunits)
4873 || !vf.is_constant (&const_vf))
4874 return false;
4875 mask.new_vector (const_nunits, const_nunits, 1);
4876 nelts_to_build = const_vf * group_size;
4877 nvectors_per_build = 1;
4878 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
4879 }
4880 auto_sbitmap used_in_lanes (in_nlanes);
4881 bitmap_clear (used_in_lanes);
4882
4883 unsigned int count = mask.encoded_nelts ();
4884 mask.quick_grow (count);
4885 vec_perm_indices indices;
4886
4887 for (unsigned int j = 0; j < nelts_to_build; j++)
4888 {
4889 unsigned int iter_num = j / group_size;
4890 unsigned int stmt_num = j % group_size;
4891 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
4892 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
4893 bitmap_set_bit (used_in_lanes, i);
4894 if (repeating_p)
4895 {
4896 first_vec_index = 0;
4897 mask_element = i;
4898 }
4899 else
4900 {
4901 /* Enforced before the loop when !repeating_p. */
4902 unsigned int const_nunits = nunits.to_constant ();
4903 vec_index = i / const_nunits;
4904 mask_element = i % const_nunits;
4905 if (vec_index == first_vec_index
4906 || first_vec_index == -1)
4907 {
4908 first_vec_index = vec_index;
4909 }
4910 else if (vec_index == second_vec_index
4911 || second_vec_index == -1)
4912 {
4913 second_vec_index = vec_index;
4914 mask_element += const_nunits;
4915 }
4916 else
4917 {
4918 if (dump_enabled_p ())
4919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4920 "permutation requires at "
4921 "least three vectors %G",
4922 stmt_info->stmt);
4923 gcc_assert (analyze_only);
4924 return false;
4925 }
4926
4927 gcc_assert (mask_element < 2 * const_nunits);
4928 }
4929
4930 if (mask_element != index)
4931 noop_p = false;
4932 mask[index++] = mask_element;
4933
4934 if (index == count && !noop_p)
4935 {
4936 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
4937 if (!can_vec_perm_const_p (mode, indices))
4938 {
4939 if (dump_enabled_p ())
4940 {
4941 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4942 vect_location,
4943 "unsupported vect permute { ");
4944 for (i = 0; i < count; ++i)
4945 {
4946 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4947 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4948 }
4949 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4950 }
4951 gcc_assert (analyze_only);
4952 return false;
4953 }
4954
4955 ++*n_perms;
4956 }
4957
4958 if (index == count)
4959 {
4960 if (!analyze_only)
4961 {
4962 tree mask_vec = NULL_TREE;
4963
4964 if (! noop_p)
4965 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4966
4967 if (second_vec_index == -1)
4968 second_vec_index = first_vec_index;
4969
4970 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
4971 {
4972 /* Generate the permute statement if necessary. */
4973 tree first_vec = dr_chain[first_vec_index + ri];
4974 tree second_vec = dr_chain[second_vec_index + ri];
4975 gimple *perm_stmt;
4976 if (! noop_p)
4977 {
4978 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4979 tree perm_dest
4980 = vect_create_destination_var (gimple_assign_lhs (stmt),
4981 vectype);
4982 perm_dest = make_ssa_name (perm_dest);
4983 perm_stmt
4984 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4985 first_vec, second_vec,
4986 mask_vec);
4987 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
4988 gsi);
4989 }
4990 else
4991 /* If mask was NULL_TREE generate the requested
4992 identity transform. */
4993 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
4994
4995 /* Store the vector statement in NODE. */
4996 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
4997 }
4998 }
4999
5000 index = 0;
5001 first_vec_index = -1;
5002 second_vec_index = -1;
5003 noop_p = true;
5004 }
5005 }
5006
5007 if (n_loads)
5008 {
5009 if (repeating_p)
5010 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5011 else
5012 {
5013 /* Enforced above when !repeating_p. */
5014 unsigned int const_nunits = nunits.to_constant ();
5015 *n_loads = 0;
5016 bool load_seen = false;
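/* Walk the referenced lanes vector by vector; each chunk of
const_nunits lanes corresponds to one vector load of the chain
and is counted iff at least one of its lanes was used. */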
5017 for (unsigned i = 0; i < in_nlanes; ++i)
5018 {
5019 if (i % const_nunits == 0)
5020 {
5021 if (load_seen)
5022 *n_loads += 1;
5023 load_seen = false;
5024 }
5025 if (bitmap_bit_p (used_in_lanes, i))
5026 load_seen = true;
5027 }
5028 if (load_seen)
5029 *n_loads += 1;
5030 }
5031 }
5032
5033 return true;
5034 }
5035
5036
5037 /* Vectorize the SLP permutations in NODE as specified
5038 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5039 child number and lane number.
5040 Interleaving of two two-lane two-child SLP subtrees (not supported):
5041 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5042 A blend of two four-lane two-child SLP subtrees:
5043 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5044 Highpart of a four-lane one-child SLP subtree (not supported):
5045 [ { 0, 2 }, { 0, 3 } ]
5046 Currently only a subset of this is supported by the code generation below. */
5047
5048 static bool
5049 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5050 slp_tree node, stmt_vector_for_cost *cost_vec)
5051 {
5052 tree vectype = SLP_TREE_VECTYPE (node);
5053
5054 /* ??? We currently only support the case of all same vector input and
5055 output types while the SLP IL should really do a concat + select and
5056 thus accept arbitrary mismatches. */
5057 slp_tree child;
5058 unsigned i;
5059 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5060 if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
5061 {
5062 if (dump_enabled_p ())
5063 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5064 "Unsupported lane permutation\n");
5065 return false;
5066 }
5067
5068 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
5069 gcc_assert (perm.length () == SLP_TREE_LANES (node));
5070 if (dump_enabled_p ())
5071 {
5072 dump_printf_loc (MSG_NOTE, vect_location,
5073 "vectorizing permutation");
5074 for (unsigned i = 0; i < perm.length (); ++i)
5075 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
5076 dump_printf (MSG_NOTE, "\n");
5077 }
5078
5079 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5080 if (!nunits.is_constant ())
5081 return false;
5082 unsigned HOST_WIDE_INT vf = 1;
5083 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
5084 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
5085 return false;
5086 unsigned olanes = vf * SLP_TREE_LANES (node);
5087 gcc_assert (multiple_p (olanes, nunits));
5088
5089 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
5090 from the { SLP operand, scalar lane } permutation as recorded in the
5091 SLP node as an intermediate step. This part should already work
5092 with SLP children with arbitrary number of lanes. */
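/* E.g. for the blend example in the function comment, with vf == 1 and
four-lane V4SI children this produces
{ { {0,0}, 0 }, { {1,0}, 1 }, { {0,0}, 2 }, { {1,0}, 3 } }
(types and numbers illustrative). */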
5093 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
5094 auto_vec<unsigned> active_lane;
5095 vperm.create (olanes);
5096 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
5097 for (unsigned i = 0; i < vf; ++i)
5098 {
5099 for (unsigned pi = 0; pi < perm.length (); ++pi)
5100 {
5101 std::pair<unsigned, unsigned> p = perm[pi];
5102 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
5103 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
5104 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
5105 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
5106 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
5107 }
5108 /* Advance to the next group. */
5109 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
5110 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
5111 }
5112
5113 if (dump_enabled_p ())
5114 {
5115 dump_printf_loc (MSG_NOTE, vect_location, "as");
5116 for (unsigned i = 0; i < vperm.length (); ++i)
5117 {
5118 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
5119 dump_printf (MSG_NOTE, ",");
5120 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
5121 vperm[i].first.first, vperm[i].first.second,
5122 vperm[i].second);
5123 }
5124 dump_printf (MSG_NOTE, "\n");
5125 }
5126
5127 /* We can only handle two-vector permutes, everything else should
5128 be lowered on the SLP level. The following is closely inspired
5129 by vect_transform_slp_perm_load and is supposed to eventually
5130 replace it.
5131 ??? As intermediate step do code-gen in the SLP tree representation
5132 somehow? */
5133 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
5134 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
5135 unsigned int const_nunits = nunits.to_constant ();
5136 unsigned int index = 0;
5137 unsigned int mask_element;
5138 vec_perm_builder mask;
5139 mask.new_vector (const_nunits, const_nunits, 1);
5140 unsigned int count = mask.encoded_nelts ();
5141 mask.quick_grow (count);
5142 vec_perm_indices indices;
5143 unsigned nperms = 0;
5144 for (unsigned i = 0; i < vperm.length (); ++i)
5145 {
5146 mask_element = vperm[i].second;
5147 if (first_vec.first == -1U
5148 || first_vec == vperm[i].first)
5149 first_vec = vperm[i].first;
5150 else if (second_vec.first == -1U
5151 || second_vec == vperm[i].first)
5152 {
5153 second_vec = vperm[i].first;
5154 mask_element += const_nunits;
5155 }
5156 else
5157 {
5158 if (dump_enabled_p ())
5159 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5160 "permutation requires at "
5161 "least three vectors");
5162 gcc_assert (!gsi);
5163 return false;
5164 }
5165
5166 mask[index++] = mask_element;
5167
5168 if (index == count)
5169 {
5170 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
5171 const_nunits);
5172 bool identity_p = indices.series_p (0, 1, 0, 1);
5173 if (!identity_p
5174 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5175 {
5176 if (dump_enabled_p ())
5177 {
5178 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5179 vect_location,
5180 "unsupported vect permute { ");
5181 for (i = 0; i < count; ++i)
5182 {
5183 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5184 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5185 }
5186 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5187 }
5188 gcc_assert (!gsi);
5189 return false;
5190 }
5191
5192 if (!identity_p)
5193 nperms++;
5194 if (gsi)
5195 {
5196 if (second_vec.first == -1U)
5197 second_vec = first_vec;
5198
5199 /* Generate the permute statement if necessary. */
5200 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
5201 tree first_def
5202 = vect_get_slp_vect_def (first_node, first_vec.second);
5203 gassign *perm_stmt;
5204 tree perm_dest = make_ssa_name (vectype);
5205 if (!identity_p)
5206 {
5207 slp_tree second_node
5208 = SLP_TREE_CHILDREN (node)[second_vec.first];
5209 tree second_def
5210 = vect_get_slp_vect_def (second_node, second_vec.second);
5211 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5212 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5213 first_def, second_def,
5214 mask_vec);
5215 }
5216 else
5217 /* We need a copy here in case the def was external. */
5218 perm_stmt = gimple_build_assign (perm_dest, first_def);
5219 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
5220 /* Store the vector statement in NODE. */
5221 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
5222 }
5223
5224 index = 0;
5225 first_vec = std::make_pair (-1U, -1U);
5226 second_vec = std::make_pair (-1U, -1U);
5227 }
5228 }
5229
5230 if (!gsi)
5231 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
5232
5233 return true;
5234 }
5235
5236 /* Vectorize SLP NODE. */
5237
5238 static void
5239 vect_schedule_slp_node (vec_info *vinfo,
5240 slp_tree node, slp_instance instance)
5241 {
5242 gimple_stmt_iterator si;
5243 int i;
5244 slp_tree child;
5245
5246 /* For existing vectors there's nothing to do. */
5247 if (SLP_TREE_VEC_DEFS (node).exists ())
5248 return;
5249
5250 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
5251
5252 /* Vectorize externals and constants. */
5253 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5254 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5255 {
5256 /* ??? vectorizable_shift can end up using a scalar operand which is
5257 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5258 node in this case. */
5259 if (!SLP_TREE_VECTYPE (node))
5260 return;
5261
5262 vect_create_constant_vectors (vinfo, node);
5263 return;
5264 }
5265
5266 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5267
5268 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
5269 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
5270
5271 if (dump_enabled_p ())
5272 dump_printf_loc (MSG_NOTE, vect_location,
5273 "------>vectorizing SLP node starting from: %G",
5274 stmt_info->stmt);
5275
5276 if (STMT_VINFO_DATA_REF (stmt_info)
5277 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5278 {
5279 /* Vectorized loads go before the first scalar load to make it
5280 ready early; vectorized stores go before the last scalar
5281 stmt, which is where all uses are ready. */
5282 stmt_vec_info last_stmt_info = NULL;
5283 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
5284 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
5285 else /* DR_IS_WRITE */
5286 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
5287 si = gsi_for_stmt (last_stmt_info->stmt);
5288 }
5289 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
5290 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
5291 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
5292 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5293 {
5294 /* For PHI node vectorization we do not use the insertion iterator. */
5295 si = gsi_none ();
5296 }
5297 else
5298 {
5299 /* Emit other stmts after the children's vectorized defs, which
5300 is the earliest possible place. */
5301 gimple *last_stmt = NULL;
5302 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5303 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5304 {
5305 /* For fold-left reductions we are retaining the scalar
5306 reduction PHI but we still have SLP_TREE_NUMBER_OF_VEC_STMTS
5307 set, so the representation isn't perfect. Resort to the
5308 last scalar def here. */
5309 if (SLP_TREE_VEC_STMTS (child).is_empty ())
5310 {
5311 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
5312 == cycle_phi_info_type);
5313 gphi *phi = as_a <gphi *>
5314 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
5315 if (!last_stmt
5316 || vect_stmt_dominates_stmt_p (last_stmt, phi))
5317 last_stmt = phi;
5318 }
5319 /* We are emitting all vectorized stmts in the same place, so
5320 the stmt emitted last is also last in the IL.
5321 ??? Unless we have a load permutation applied and that
5322 figures to re-use an earlier generated load. */
5323 unsigned j;
5324 gimple *vstmt;
5325 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
5326 if (!last_stmt
5327 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5328 last_stmt = vstmt;
5329 }
5330 else if (!SLP_TREE_VECTYPE (child))
5331 {
5332 /* For externals used unvectorized look at all scalar defs. */
5333 unsigned j;
5334 tree def;
5335 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
5336 if (TREE_CODE (def) == SSA_NAME
5337 && !SSA_NAME_IS_DEFAULT_DEF (def))
5338 {
5339 gimple *stmt = SSA_NAME_DEF_STMT (def);
5340 if (!last_stmt
5341 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
5342 last_stmt = stmt;
5343 }
5344 }
5345 else
5346 {
5347 /* For externals we have to look at all defs since their
5348 insertion place is decided per vector. But beware
5349 of pre-existing vectors where we need to make sure
5350 we do not insert before the region boundary. */
5351 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
5352 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
5353 last_stmt = gsi_stmt (gsi_after_labels
5354 (as_a <bb_vec_info> (vinfo)->bbs[0]));
5355 else
5356 {
5357 unsigned j;
5358 tree vdef;
5359 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
5360 if (TREE_CODE (vdef) == SSA_NAME
5361 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
5362 {
5363 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
5364 if (!last_stmt
5365 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5366 last_stmt = vstmt;
5367 }
5368 }
5369 }
5370 /* This can happen when all children are pre-existing vectors or
5371 constants. */
5372 if (!last_stmt)
5373 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
5374 if (is_a <gphi *> (last_stmt))
5375 si = gsi_after_labels (gimple_bb (last_stmt));
5376 else
5377 {
5378 si = gsi_for_stmt (last_stmt);
5379 gsi_next (&si);
5380 }
5381 }
5382
5383 bool done_p = false;
5384
5385 /* Handle purely internal nodes. */
5386 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5387 {
5388 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
5389 be shared with different SLP nodes (but usually it's the same
5390 operation, apart from the case where the stmt is only there to
5391 denote the actual scalar lane defs ...). So do not call
5392 vect_transform_stmt but open-code it here (partly). */
5393 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
5394 gcc_assert (done);
5395 done_p = true;
5396 }
5397 if (!done_p)
5398 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
5399 }
5400
5401 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
5402 For loop vectorization this is done in vectorizable_call, but for SLP
5403 it needs to be deferred until end of vect_schedule_slp, because multiple
5404 SLP instances may refer to the same scalar stmt. */
5405
5406 static void
5407 vect_remove_slp_scalar_calls (vec_info *vinfo,
5408 slp_tree node, hash_set<slp_tree> &visited)
5409 {
5410 gimple *new_stmt;
5411 gimple_stmt_iterator gsi;
5412 int i;
5413 slp_tree child;
5414 tree lhs;
5415 stmt_vec_info stmt_info;
5416
5417 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5418 return;
5419
5420 if (visited.add (node))
5421 return;
5422
5423 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5424 vect_remove_slp_scalar_calls (vinfo, child, visited);
5425
5426 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5427 {
5428 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
5429 if (!stmt || gimple_bb (stmt) == NULL)
5430 continue;
5431 if (is_pattern_stmt_p (stmt_info)
5432 || !PURE_SLP_STMT (stmt_info))
5433 continue;
5434 lhs = gimple_call_lhs (stmt);
5435 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
5436 gsi = gsi_for_stmt (stmt);
5437 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
5438 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
5439 }
5440 }
5441
5442 static void
5443 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
5444 {
5445 hash_set<slp_tree> visited;
5446 vect_remove_slp_scalar_calls (vinfo, node, visited);
5447 }
5448
5449 /* Vectorize the instance root. */
5450
5451 void
5452 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
5453 {
5454 gassign *rstmt = NULL;
5455
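/* A single vectorized def is assigned to the root stmt's lhs directly
   (via a VIEW_CONVERT_EXPR if the types differ); multiple vector defs
   are re-assembled into a CONSTRUCTOR with the type of the original
   constructor.  */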
5456 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
5457 {
5458 gimple *child_stmt;
5459 int j;
5460
5461 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5462 {
5463 tree vect_lhs = gimple_get_lhs (child_stmt);
5464 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
5465 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
5466 TREE_TYPE (vect_lhs)))
5467 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
5468 vect_lhs);
5469 rstmt = gimple_build_assign (root_lhs, vect_lhs);
5470 break;
5471 }
5472 }
5473 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
5474 {
5475 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5476 gimple *child_stmt;
5477 int j;
5478 vec<constructor_elt, va_gc> *v;
5479 vec_alloc (v, nelts);
5480
5481 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5482 {
5483 CONSTRUCTOR_APPEND_ELT (v,
5484 NULL_TREE,
5485 gimple_get_lhs (child_stmt));
5486 }
5487 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
5488 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
5489 tree r_constructor = build_constructor (rtype, v);
5490 rstmt = gimple_build_assign (lhs, r_constructor);
5491 }
5492
5493 gcc_assert (rstmt);
5494
5495 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
5496 gsi_replace (&rgsi, rstmt, true);
5497 }
5498
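/* Per-node state for the SCC walk in vect_schedule_scc: the DFS
   pre-order number, the lowlink and whether the node is still on the
   walk stack.  */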
5499 struct slp_scc_info
5500 {
5501 bool on_stack;
5502 int dfs;
5503 int lowlink;
5504 };
5505
5506 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
5507
5508 static void
5509 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
5510 hash_map<slp_tree, slp_scc_info> &scc_info,
5511 int &maxdfs, vec<slp_tree> &stack)
5512 {
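/* This is a Tarjan-style DFS: assign pre-order and lowlink numbers,
   schedule leaves immediately, push internal nodes on STACK and
   schedule whole SCCs once their root (lowlink == dfs) is reached.  */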
5513 bool existed_p;
5514 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
5515 gcc_assert (!existed_p);
5516 info->dfs = maxdfs;
5517 info->lowlink = maxdfs;
5518 maxdfs++;
5519
5520 /* Leaf. */
5521 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5522 {
5523 info->on_stack = false;
5524 vect_schedule_slp_node (vinfo, node, instance);
5525 return;
5526 }
5527
5528 info->on_stack = true;
5529 stack.safe_push (node);
5530
5531 unsigned i;
5532 slp_tree child;
5533 /* DFS recurse. */
5534 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5535 {
5536 if (!child)
5537 continue;
5538 slp_scc_info *child_info = scc_info.get (child);
5539 if (!child_info)
5540 {
5541 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
5542 /* Recursion might have re-allocated the node. */
5543 info = scc_info.get (node);
5544 child_info = scc_info.get (child);
5545 info->lowlink = MIN (info->lowlink, child_info->lowlink);
5546 }
5547 else if (child_info->on_stack)
5548 info->lowlink = MIN (info->lowlink, child_info->dfs);
5549 }
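/* If NODE is not the root of an SCC leave it on the stack; it is
   scheduled together with its SCC once the root is reached.  */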
5550 if (info->lowlink != info->dfs)
5551 return;
5552
5553 auto_vec<slp_tree, 4> phis_to_fixup;
5554
5555 /* Singleton. */
5556 if (stack.last () == node)
5557 {
5558 stack.pop ();
5559 info->on_stack = false;
5560 vect_schedule_slp_node (vinfo, node, instance);
5561 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
5562 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
5563 phis_to_fixup.quick_push (node);
5564 }
5565 else
5566 {
5567 /* SCC. */
5568 int last_idx = stack.length () - 1;
5569 while (stack[last_idx] != node)
5570 last_idx--;
5571 /* We can break the cycle at PHIs which have at least one child
5572 code generated. Then we could re-start the DFS walk until
5573 all nodes in the SCC are covered (we might have new entries
5574 for only back-reachable nodes). But it's simpler to just
5575 iterate and schedule those that are ready. */
5576 unsigned todo = stack.length () - last_idx;
5577 do
5578 {
5579 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
5580 {
5581 slp_tree entry = stack[idx];
5582 if (!entry)
5583 continue;
5584 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
5585 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
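/* A PHI is ready once at least one child is code generated (or
   absent); any other node is ready only when none of its children
   remain on the stack.  */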
5586 bool ready = !phi;
5587 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
5588 if (!child)
5589 {
5590 gcc_assert (phi);
5591 ready = true;
5592 break;
5593 }
5594 else if (scc_info.get (child)->on_stack)
5595 {
5596 if (!phi)
5597 {
5598 ready = false;
5599 break;
5600 }
5601 }
5602 else
5603 {
5604 if (phi)
5605 {
5606 ready = true;
5607 break;
5608 }
5609 }
5610 if (ready)
5611 {
5612 vect_schedule_slp_node (vinfo, entry, instance);
5613 scc_info.get (entry)->on_stack = false;
5614 stack[idx] = NULL;
5615 todo--;
5616 if (phi)
5617 phis_to_fixup.safe_push (entry);
5618 }
5619 }
5620 }
5621 while (todo != 0);
5622
5623 /* Pop the SCC. */
5624 stack.truncate (last_idx);
5625 }
5626
5627 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
5628 slp_tree phi_node;
5629 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
5630 {
5631 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
5632 edge_iterator ei;
5633 edge e;
5634 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
5635 {
5636 unsigned dest_idx = e->dest_idx;
5637 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
5638 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
5639 continue;
5640 /* Simply fill all args. */
5641 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
5642 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
5643 vect_get_slp_vect_def (child, i),
5644 e, gimple_phi_arg_location (phi, dest_idx));
5645 }
5646 }
5647 }
5648
5649 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
5650
5651 void
5652 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
5653 {
5654 slp_instance instance;
5655 unsigned int i;
5656
5657 hash_map<slp_tree, slp_scc_info> scc_info;
5658 int maxdfs = 0;
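/* The SCC info and DFS numbering are shared across instances so nodes
   reachable from multiple SLP instances are scheduled only once.  */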
5659 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5660 {
5661 slp_tree node = SLP_INSTANCE_TREE (instance);
5662 if (dump_enabled_p ())
5663 {
5664 dump_printf_loc (MSG_NOTE, vect_location,
5665 "Vectorizing SLP tree:\n");
5666 if (SLP_INSTANCE_ROOT_STMT (instance))
5667 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
5668 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
5669 vect_print_slp_graph (MSG_NOTE, vect_location,
5670 SLP_INSTANCE_TREE (instance));
5671 }
5672 /* Schedule the tree of INSTANCE, scheduling SCCs so that
5673 a PHI is the node breaking the cycle. */
5674 auto_vec<slp_tree> stack;
5675 if (!scc_info.get (node))
5676 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
5677
5678 if (SLP_INSTANCE_ROOT_STMT (instance))
5679 vectorize_slp_instance_root_stmt (node, instance);
5680
5681 if (dump_enabled_p ())
5682 dump_printf_loc (MSG_NOTE, vect_location,
5683 "vectorizing stmts using SLP.\n");
5684 }
5685
5686 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5687 {
5688 slp_tree root = SLP_INSTANCE_TREE (instance);
5689 stmt_vec_info store_info;
5690 unsigned int j;
5691
5692 /* Remove scalar call stmts. Do not do this for basic-block
5693 vectorization as not all uses may be vectorized.
5694 ??? Why should this be necessary? DCE should be able to
5695 remove the stmts itself.
5696 ??? For BB vectorization we can as well remove scalar
5697 stmts starting from the SLP tree root if they have no
5698 uses. */
5699 if (is_a <loop_vec_info> (vinfo))
5700 vect_remove_slp_scalar_calls (vinfo, root);
5701
5702 /* Remove the original scalar stmts of vectorized stores. */
5703 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
5704 {
5705 if (!STMT_VINFO_DATA_REF (store_info)
5706 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
5707 break;
5708
5709 store_info = vect_orig_stmt (store_info);
5710 /* Free the attached stmt_vec_info and remove the stmt. */
5711 vinfo->remove_stmt (store_info);
5712 }
5713 }
5714 }