[thirdparty/gcc.git] gcc/tree-vect-slp.c
Fix SLP vectorization of stores from boolean vectors
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51
52 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
53 slp_tree, stmt_vector_for_cost *);
54
55 object_allocator<_slp_tree> *slp_tree_pool;
56
57 void *
58 _slp_tree::operator new (size_t n)
59 {
60 gcc_assert (n == sizeof (_slp_tree));
61 return slp_tree_pool->allocate_raw ();
62 }
63
64 void
65 _slp_tree::operator delete (void *node, size_t n)
66 {
67 gcc_assert (n == sizeof (_slp_tree));
68 slp_tree_pool->remove_raw (node);
69 }
70
71
72 /* Initialize a SLP node. */
73
74 _slp_tree::_slp_tree ()
75 {
76 SLP_TREE_SCALAR_STMTS (this) = vNULL;
77 SLP_TREE_SCALAR_OPS (this) = vNULL;
78 SLP_TREE_VEC_STMTS (this) = vNULL;
79 SLP_TREE_VEC_DEFS (this) = vNULL;
80 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
81 SLP_TREE_CHILDREN (this) = vNULL;
82 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
83 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
84 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
85 SLP_TREE_CODE (this) = ERROR_MARK;
86 SLP_TREE_VECTYPE (this) = NULL_TREE;
87 SLP_TREE_REPRESENTATIVE (this) = NULL;
88 SLP_TREE_REF_COUNT (this) = 1;
89 this->max_nunits = 1;
90 this->lanes = 0;
91 }
92
93 /* Tear down a SLP node. */
94
95 _slp_tree::~_slp_tree ()
96 {
97 SLP_TREE_CHILDREN (this).release ();
98 SLP_TREE_SCALAR_STMTS (this).release ();
99 SLP_TREE_SCALAR_OPS (this).release ();
100 SLP_TREE_VEC_STMTS (this).release ();
101 SLP_TREE_VEC_DEFS (this).release ();
102 SLP_TREE_LOAD_PERMUTATION (this).release ();
103 SLP_TREE_LANE_PERMUTATION (this).release ();
104 }
105
106 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
107
108 static void
109 vect_free_slp_tree (slp_tree node)
110 {
111 int i;
112 slp_tree child;
113
114 if (--SLP_TREE_REF_COUNT (node) != 0)
115 return;
116
117 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
118 if (child)
119 vect_free_slp_tree (child);
120
121 delete node;
122 }
123
124 /* Return a location suitable for dumps related to the SLP instance. */
125
126 dump_user_location_t
127 _slp_instance::location () const
128 {
129 if (root_stmt)
130 return root_stmt->stmt;
131 else
132 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
133 }
134
135
136 /* Free the memory allocated for the SLP instance. */
137
138 void
139 vect_free_slp_instance (slp_instance instance)
140 {
141 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
142 SLP_INSTANCE_LOADS (instance).release ();
143 instance->subgraph_entries.release ();
144 instance->cost_vec.release ();
145 free (instance);
146 }
147
148
149 /* Create an SLP node for SCALAR_STMTS. */
150
151 static slp_tree
152 vect_create_new_slp_node (slp_tree node,
153 vec<stmt_vec_info> scalar_stmts, unsigned nops)
154 {
155 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
156 SLP_TREE_CHILDREN (node).create (nops);
157 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
158 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
159 SLP_TREE_LANES (node) = scalar_stmts.length ();
160 return node;
161 }
162
163 /* Create an SLP node for SCALAR_STMTS. */
164
165 static slp_tree
166 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
167 {
168 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
169 }
170
171 /* Create an SLP node for OPS. */
172
173 static slp_tree
174 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
175 {
176 SLP_TREE_SCALAR_OPS (node) = ops;
177 SLP_TREE_DEF_TYPE (node) = vect_external_def;
178 SLP_TREE_LANES (node) = ops.length ();
179 return node;
180 }
181
182 /* Create an SLP node for OPS. */
183
184 static slp_tree
185 vect_create_new_slp_node (vec<tree> ops)
186 {
187 return vect_create_new_slp_node (new _slp_tree, ops);
188 }
189
190
191 /* This structure is used in creation of an SLP tree. Each instance
192 corresponds to the same operand in a group of scalar stmts in an SLP
193 node. */
194 typedef struct _slp_oprnd_info
195 {
196 /* Def-stmts for the operands. */
197 vec<stmt_vec_info> def_stmts;
198 /* Operands. */
199 vec<tree> ops;
200 /* Information about the first statement, its vector def-type, type, the
201 operand itself in case it's constant, and an indication if it's a pattern
202 stmt. */
203 tree first_op_type;
204 enum vect_def_type first_dt;
205 bool any_pattern;
206 } *slp_oprnd_info;
207
208
209 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
210 operand. */
211 static vec<slp_oprnd_info>
212 vect_create_oprnd_info (int nops, int group_size)
213 {
214 int i;
215 slp_oprnd_info oprnd_info;
216 vec<slp_oprnd_info> oprnds_info;
217
218 oprnds_info.create (nops);
219 for (i = 0; i < nops; i++)
220 {
221 oprnd_info = XNEW (struct _slp_oprnd_info);
222 oprnd_info->def_stmts.create (group_size);
223 oprnd_info->ops.create (group_size);
224 oprnd_info->first_dt = vect_uninitialized_def;
225 oprnd_info->first_op_type = NULL_TREE;
226 oprnd_info->any_pattern = false;
227 oprnds_info.quick_push (oprnd_info);
228 }
229
230 return oprnds_info;
231 }
232
233
234 /* Free operands info. */
235
236 static void
237 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
238 {
239 int i;
240 slp_oprnd_info oprnd_info;
241
242 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
243 {
244 oprnd_info->def_stmts.release ();
245 oprnd_info->ops.release ();
246 XDELETE (oprnd_info);
247 }
248
249 oprnds_info.release ();
250 }
251
252
253 /* Return true if STMTS contains a pattern statement. */
254
255 static bool
256 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
257 {
258 stmt_vec_info stmt_info;
259 unsigned int i;
260 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
261 if (is_pattern_stmt_p (stmt_info))
262 return true;
263 return false;
264 }
265
266 /* Return true when all lanes in the external or constant NODE have
267 the same value. */
268
269 static bool
270 vect_slp_tree_uniform_p (slp_tree node)
271 {
272 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
273 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
274
275 /* Pre-existing vectors. */
276 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
277 return false;
278
279 unsigned i;
280 tree op, first = NULL_TREE;
281 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
282 if (!first)
283 first = op;
284 else if (!operand_equal_p (first, op, 0))
285 return false;
286
287 return true;
288 }
289
290 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
291 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
292 of the chain. */
293
294 int
295 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
296 stmt_vec_info first_stmt_info)
297 {
298 stmt_vec_info next_stmt_info = first_stmt_info;
299 int result = 0;
300
301 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
302 return -1;
303
304 do
305 {
306 if (next_stmt_info == stmt_info)
307 return result;
308 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
309 if (next_stmt_info)
310 result += DR_GROUP_GAP (next_stmt_info);
311 }
312 while (next_stmt_info);
313
314 return -1;
315 }
316
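/* Editor's note, illustrative only: for a store group accessing
   a[0], a[1] and a[3] (a gap of one element before a[3]), the places
   returned by vect_get_place_in_interleaving_chain for the three stmts
   are 0, 1 and 3 respectively, since each element's DR_GROUP_GAP is
   accumulated while walking the chain. */
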
317 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
318 using the method implemented by duplicate_and_interleave. Return true
319 if so, returning the number of intermediate vectors in *NVECTORS_OUT
320 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
321 (if nonnull). */
322
323 bool
324 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
325 tree elt_type, unsigned int *nvectors_out,
326 tree *vector_type_out,
327 tree *permutes)
328 {
329 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
330 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
331 return false;
332
333 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
334 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
335 unsigned int nvectors = 1;
336 for (;;)
337 {
338 scalar_int_mode int_mode;
339 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
340 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
341 {
342 /* Get the natural vector type for this SLP group size. */
343 tree int_type = build_nonstandard_integer_type
344 (GET_MODE_BITSIZE (int_mode), 1);
345 tree vector_type
346 = get_vectype_for_scalar_type (vinfo, int_type, count);
347 if (vector_type
348 && VECTOR_MODE_P (TYPE_MODE (vector_type))
349 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
350 GET_MODE_SIZE (base_vector_mode)))
351 {
352 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
353 together into elements of type INT_TYPE and using the result
354 to build NVECTORS vectors. */
355 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
356 vec_perm_builder sel1 (nelts, 2, 3);
357 vec_perm_builder sel2 (nelts, 2, 3);
358 poly_int64 half_nelts = exact_div (nelts, 2);
359 for (unsigned int i = 0; i < 3; ++i)
360 {
361 sel1.quick_push (i);
362 sel1.quick_push (i + nelts);
363 sel2.quick_push (half_nelts + i);
364 sel2.quick_push (half_nelts + i + nelts);
365 }
366 vec_perm_indices indices1 (sel1, 2, nelts);
367 vec_perm_indices indices2 (sel2, 2, nelts);
368 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
369 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
370 {
371 if (nvectors_out)
372 *nvectors_out = nvectors;
373 if (vector_type_out)
374 *vector_type_out = vector_type;
375 if (permutes)
376 {
377 permutes[0] = vect_gen_perm_mask_checked (vector_type,
378 indices1);
379 permutes[1] = vect_gen_perm_mask_checked (vector_type,
380 indices2);
381 }
382 return true;
383 }
384 }
385 }
386 if (!multiple_p (elt_bytes, 2, &elt_bytes))
387 return false;
388 nvectors *= 2;
389 }
390 }
391
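/* Editor's sketch of the search above, illustrative only: for COUNT = 4
   elements of a 4-byte type the loop first looks for a 16-byte integer
   mode that fuses all four elements into a single lane of one vector,
   then an 8-byte mode using two vectors, and so on, halving ELT_BYTES
   and doubling NVECTORS until a vector mode of the same size as the
   base vector mode supports the two interleaving permutes. */
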
392 /* Return true if DTA and DTB match. */
393
394 static bool
395 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
396 {
397 return (dta == dtb
398 || ((dta == vect_external_def || dta == vect_constant_def)
399 && (dtb == vect_external_def || dtb == vect_constant_def)));
400 }
401
402 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
403 they are of a valid type and that they match the defs of the first stmt of
404 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
405 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
406 indicates swap is required for cond_expr stmts. Specifically, *SWAP
407 is 1 if STMT is cond and operands of comparison need to be swapped;
408 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
409 If there is any operand swap in this function, *SWAP is set to non-zero
410 value.
411 If there was a fatal error return -1; if the error could be corrected by
412 swapping operands of the parent node of this one, return 1; if everything
413 is ok return 0. */
414 static int
415 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
416 bool *skip_args,
417 vec<stmt_vec_info> stmts, unsigned stmt_num,
418 vec<slp_oprnd_info> *oprnds_info)
419 {
420 stmt_vec_info stmt_info = stmts[stmt_num];
421 tree oprnd;
422 unsigned int i, number_of_oprnds;
423 enum vect_def_type dt = vect_uninitialized_def;
424 slp_oprnd_info oprnd_info;
425 int first_op_idx = 1;
426 unsigned int commutative_op = -1U;
427 bool first_op_cond = false;
428 bool first = stmt_num == 0;
429
430 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
431 {
432 number_of_oprnds = gimple_call_num_args (stmt);
433 first_op_idx = 3;
434 if (gimple_call_internal_p (stmt))
435 {
436 internal_fn ifn = gimple_call_internal_fn (stmt);
437 commutative_op = first_commutative_argument (ifn);
438
439 /* Masked load, only look at mask. */
440 if (ifn == IFN_MASK_LOAD)
441 {
442 number_of_oprnds = 1;
443 /* Mask operand index. */
444 first_op_idx = 5;
445 }
446 }
447 }
448 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
449 {
450 enum tree_code code = gimple_assign_rhs_code (stmt);
451 number_of_oprnds = gimple_num_ops (stmt) - 1;
452 /* Swap can only be done for cond_expr if asked to, otherwise we
453 could end up with a different comparison code from the first stmt. */
454 if (code == COND_EXPR
455 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
456 {
457 first_op_cond = true;
458 number_of_oprnds++;
459 }
460 else
461 commutative_op = commutative_tree_code (code) ? 0U : -1U;
462 }
463 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
464 number_of_oprnds = gimple_phi_num_args (stmt);
465 else
466 return -1;
467
468 bool swapped = (swap != 0);
469 bool backedge = false;
470 gcc_assert (!swapped || first_op_cond);
471 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
472 for (i = 0; i < number_of_oprnds; i++)
473 {
474 if (first_op_cond)
475 {
476 /* Map indicating how operands of cond_expr should be swapped. */
477 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
478 int *map = maps[swap];
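/* Editor's note, not in the original sources: with the SWAP encoding
   from vect_build_slp_tree_1, maps[1] exchanges the two comparison
   operands (A >= B ? X : Y is read as B <= A ? X : Y) while maps[2]
   exchanges the THEN/ELSE arms, which matches a stmt whose comparison
   code is the inverse of the first stmt's. */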
479
480 if (i < 2)
481 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
482 first_op_idx), map[i]);
483 else
484 oprnd = gimple_op (stmt_info->stmt, map[i]);
485 }
486 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
487 {
488 oprnd = gimple_phi_arg_def (stmt, i);
489 backedge = dominated_by_p (CDI_DOMINATORS,
490 gimple_phi_arg_edge (stmt, i)->src,
491 gimple_bb (stmt_info->stmt));
492 }
493 else
494 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
495 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
496 oprnd = TREE_OPERAND (oprnd, 0);
497
498 oprnd_info = (*oprnds_info)[i];
499
500 stmt_vec_info def_stmt_info;
501 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
502 {
503 if (dump_enabled_p ())
504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
505 "Build SLP failed: can't analyze def for %T\n",
506 oprnd);
507
508 return -1;
509 }
510
511 if (skip_args[i])
512 {
513 oprnd_info->def_stmts.quick_push (NULL);
514 oprnd_info->ops.quick_push (NULL_TREE);
515 oprnd_info->first_dt = vect_uninitialized_def;
516 continue;
517 }
518
519 if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
520 oprnd_info->any_pattern = true;
521
522 oprnd_info->def_stmts.quick_push (def_stmt_info);
523 oprnd_info->ops.quick_push (oprnd);
524
525 /* If there's an extern def on a backedge make sure we can
526 code-generate at the region start.
527 ??? This is another case that could be fixed by adjusting
528 how we split the function but at the moment we'd have conflicting
529 goals there. */
530 if (backedge
531 && dts[i] == vect_external_def
532 && is_a <bb_vec_info> (vinfo)
533 && TREE_CODE (oprnd) == SSA_NAME
534 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
535 && !dominated_by_p (CDI_DOMINATORS,
536 as_a <bb_vec_info> (vinfo)->bbs[0],
537 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
538 {
539 if (dump_enabled_p ())
540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
541 "Build SLP failed: extern def %T only defined "
542 "on backedge\n", oprnd);
543 return -1;
544 }
545
546 if (first)
547 {
548 tree type = TREE_TYPE (oprnd);
549 dt = dts[i];
550 if ((dt == vect_constant_def
551 || dt == vect_external_def)
552 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
553 && (TREE_CODE (type) == BOOLEAN_TYPE
554 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
555 type)))
556 {
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
559 "Build SLP failed: invalid type of def "
560 "for variable-length SLP %T\n", oprnd);
561 return -1;
562 }
563
564 /* For the swapping logic below force vect_reduction_def
565 for the reduction op in a SLP reduction group. */
566 if (!STMT_VINFO_DATA_REF (stmt_info)
567 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
568 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
569 && def_stmt_info)
570 dts[i] = dt = vect_reduction_def;
571
572 /* Check the types of the definition. */
573 switch (dt)
574 {
575 case vect_external_def:
576 case vect_constant_def:
577 case vect_internal_def:
578 case vect_reduction_def:
579 case vect_induction_def:
580 case vect_nested_cycle:
581 break;
582
583 default:
584 /* FORNOW: Not supported. */
585 if (dump_enabled_p ())
586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
587 "Build SLP failed: illegal type of def %T\n",
588 oprnd);
589 return -1;
590 }
591
592 oprnd_info->first_dt = dt;
593 oprnd_info->first_op_type = type;
594 }
595 }
596 if (first)
597 return 0;
598
599 /* Now match the operand definition types to that of the first stmt. */
600 for (i = 0; i < number_of_oprnds;)
601 {
602 if (skip_args[i])
603 {
604 ++i;
605 continue;
606 }
607
608 oprnd_info = (*oprnds_info)[i];
609 dt = dts[i];
610 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
611 oprnd = oprnd_info->ops[stmt_num];
612 tree type = TREE_TYPE (oprnd);
613
614 if (!types_compatible_p (oprnd_info->first_op_type, type))
615 {
616 if (dump_enabled_p ())
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
618 "Build SLP failed: different operand types\n");
619 return 1;
620 }
621
622 /* Not first stmt of the group, check that the def-stmt/s match
623 the def-stmt/s of the first stmt. Allow different definition
624 types for reduction chains: the first stmt must be a
625 vect_reduction_def (a phi node), and the rest
626 are part of the reduction chain. */
627 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
628 && !(oprnd_info->first_dt == vect_reduction_def
629 && !STMT_VINFO_DATA_REF (stmt_info)
630 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
631 && def_stmt_info
632 && !STMT_VINFO_DATA_REF (def_stmt_info)
633 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
634 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
635 || (!STMT_VINFO_DATA_REF (stmt_info)
636 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
637 && ((!def_stmt_info
638 || STMT_VINFO_DATA_REF (def_stmt_info)
639 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
640 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
641 != (oprnd_info->first_dt != vect_reduction_def))))
642 {
643 /* Try swapping operands if we got a mismatch. For BB
644 vectorization only in case it will clearly improve things. */
645 if (i == commutative_op && !swapped
646 && (!is_a <bb_vec_info> (vinfo)
647 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
648 dts[i+1])
649 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
650 || vect_def_types_match
651 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
652 {
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location,
655 "trying swapped operands\n");
656 std::swap (dts[i], dts[i+1]);
657 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
658 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
659 std::swap ((*oprnds_info)[i]->ops[stmt_num],
660 (*oprnds_info)[i+1]->ops[stmt_num]);
661 swapped = true;
662 continue;
663 }
664
665 if (is_a <bb_vec_info> (vinfo)
666 && !oprnd_info->any_pattern)
667 {
668 /* Now for commutative ops we should see whether we can
669 make the other operand match. */
670 if (dump_enabled_p ())
671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
672 "treating operand as external\n");
673 oprnd_info->first_dt = dt = vect_external_def;
674 }
675 else
676 {
677 if (dump_enabled_p ())
678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
679 "Build SLP failed: different types\n");
680 return 1;
681 }
682 }
683
684 /* Make sure to demote the overall operand to external. */
685 if (dt == vect_external_def)
686 oprnd_info->first_dt = vect_external_def;
687 /* For a SLP reduction chain we want to duplicate the reduction to
688 each of the chain members. That gets us a sane SLP graph (still
689 the stmts are not 100% correct wrt the initial values). */
690 else if ((dt == vect_internal_def
691 || dt == vect_reduction_def)
692 && oprnd_info->first_dt == vect_reduction_def
693 && !STMT_VINFO_DATA_REF (stmt_info)
694 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
695 && !STMT_VINFO_DATA_REF (def_stmt_info)
696 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
697 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
698 {
699 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
700 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
701 }
702
703 ++i;
704 }
705
706 /* Swap operands. */
707 if (swapped)
708 {
709 if (dump_enabled_p ())
710 dump_printf_loc (MSG_NOTE, vect_location,
711 "swapped operands to match def types in %G",
712 stmt_info->stmt);
713 }
714
715 return 0;
716 }
717
718 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
719 Return true if we can, meaning that this choice doesn't conflict with
720 existing SLP nodes that use STMT_INFO. */
721
722 bool
723 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
724 {
725 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
726 if (old_vectype)
727 return useless_type_conversion_p (vectype, old_vectype);
728
729 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
730 {
731 /* We maintain the invariant that if any statement in the group is
732 used, all other members of the group have the same vector type. */
733 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
734 stmt_vec_info member_info = first_info;
735 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
736 if (is_pattern_stmt_p (member_info)
737 && !useless_type_conversion_p (vectype,
738 STMT_VINFO_VECTYPE (member_info)))
739 break;
740
741 if (!member_info)
742 {
743 for (member_info = first_info; member_info;
744 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
745 STMT_VINFO_VECTYPE (member_info) = vectype;
746 return true;
747 }
748 }
749 else if (!is_pattern_stmt_p (stmt_info))
750 {
751 STMT_VINFO_VECTYPE (stmt_info) = vectype;
752 return true;
753 }
754
755 if (dump_enabled_p ())
756 {
757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
758 "Build SLP failed: incompatible vector"
759 " types for: %G", stmt_info->stmt);
760 dump_printf_loc (MSG_NOTE, vect_location,
761 " old vector type: %T\n", old_vectype);
762 dump_printf_loc (MSG_NOTE, vect_location,
763 " new vector type: %T\n", vectype);
764 }
765 return false;
766 }
767
768 /* Return true if call statements CALL1 and CALL2 are similar enough
769 to be combined into the same SLP group. */
770
771 static bool
772 compatible_calls_p (gcall *call1, gcall *call2)
773 {
774 unsigned int nargs = gimple_call_num_args (call1);
775 if (nargs != gimple_call_num_args (call2))
776 return false;
777
778 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
779 return false;
780
781 if (gimple_call_internal_p (call1))
782 {
783 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
784 TREE_TYPE (gimple_call_lhs (call2))))
785 return false;
786 for (unsigned int i = 0; i < nargs; ++i)
787 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
788 TREE_TYPE (gimple_call_arg (call2, i))))
789 return false;
790 }
791 else
792 {
793 if (!operand_equal_p (gimple_call_fn (call1),
794 gimple_call_fn (call2), 0))
795 return false;
796
797 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
798 return false;
799 }
800 return true;
801 }
802
803 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
804 caller's attempt to find the vector type in STMT_INFO with the narrowest
805 element type. Return true if VECTYPE is nonnull and if it is valid
806 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
807 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
808 vect_build_slp_tree. */
809
810 static bool
811 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
812 unsigned int group_size,
813 tree vectype, poly_uint64 *max_nunits)
814 {
815 if (!vectype)
816 {
817 if (dump_enabled_p ())
818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
819 "Build SLP failed: unsupported data-type in %G\n",
820 stmt_info->stmt);
821 /* Fatal mismatch. */
822 return false;
823 }
824
825 /* If populating the vector type requires unrolling then fail
826 before adjusting *max_nunits for basic-block vectorization. */
827 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
828 unsigned HOST_WIDE_INT const_nunits;
829 if (is_a <bb_vec_info> (vinfo)
830 && (!nunits.is_constant (&const_nunits)
831 || const_nunits > group_size))
832 {
833 if (dump_enabled_p ())
834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
835 "Build SLP failed: unrolling required "
836 "in basic block SLP\n");
837 /* Fatal mismatch. */
838 return false;
839 }
840
841 /* In case of multiple types we need to detect the smallest type. */
842 vect_update_max_nunits (max_nunits, vectype);
843 return true;
844 }
845
846 /* Verify that the scalar stmts STMTS are isomorphic, do not require
847 data permutation and are not of unsupported types of operation.
848 Return true if they are, otherwise return false and indicate in *MATCHES
849 which stmts are not isomorphic to the first one. If MATCHES[0]
850 is false then this indicates the comparison could not be
851 carried out or the stmts will never be vectorized by SLP.
852
853 Note COND_EXPR is possibly isomorphic to another one after swapping its
854 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
855 the first stmt by swapping the two operands of comparison; set SWAP[i]
856 to 2 if stmt I is isomorphic to the first stmt by inverting the code
857 of comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
858 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
859
860 static bool
861 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
862 vec<stmt_vec_info> stmts, unsigned int group_size,
863 poly_uint64 *max_nunits, bool *matches,
864 bool *two_operators, tree *node_vectype)
865 {
866 unsigned int i;
867 stmt_vec_info first_stmt_info = stmts[0];
868 enum tree_code first_stmt_code = ERROR_MARK;
869 enum tree_code alt_stmt_code = ERROR_MARK;
870 enum tree_code rhs_code = ERROR_MARK;
871 enum tree_code first_cond_code = ERROR_MARK;
872 tree lhs;
873 bool need_same_oprnds = false;
874 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
875 optab optab;
876 int icode;
877 machine_mode optab_op2_mode;
878 machine_mode vec_mode;
879 stmt_vec_info first_load = NULL, prev_first_load = NULL;
880 bool first_stmt_load_p = false, load_p = false;
881 bool first_stmt_phi_p = false, phi_p = false;
882
883 /* For every stmt in NODE find its def stmt/s. */
884 stmt_vec_info stmt_info;
885 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
886 {
887 gimple *stmt = stmt_info->stmt;
888 swap[i] = 0;
889 matches[i] = false;
890
891 if (dump_enabled_p ())
892 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
893
894 /* Fail to vectorize statements marked as unvectorizable, that may
895 throw internally or that have volatile operands. */
896 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
897 || stmt_can_throw_internal (cfun, stmt)
898 || gimple_has_volatile_ops (stmt))
899 {
900 if (dump_enabled_p ())
901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
902 "Build SLP failed: unvectorizable statement %G",
903 stmt);
904 /* ??? For BB vectorization we want to commute operands in a way
905 to shuffle all unvectorizable defs into one operand and have
906 the other still vectorized. The following doesn't reliably
907 work for this though but it's the easiest we can do here. */
908 if (is_a <bb_vec_info> (vinfo) && i != 0)
909 continue;
910 /* Fatal mismatch. */
911 matches[0] = false;
912 return false;
913 }
914
915 lhs = gimple_get_lhs (stmt);
916 if (lhs == NULL_TREE)
917 {
918 if (dump_enabled_p ())
919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
920 "Build SLP failed: not GIMPLE_ASSIGN nor "
921 "GIMPLE_CALL %G", stmt);
922 if (is_a <bb_vec_info> (vinfo) && i != 0)
923 continue;
924 /* Fatal mismatch. */
925 matches[0] = false;
926 return false;
927 }
928
929 tree nunits_vectype;
930 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
931 &nunits_vectype, group_size)
932 || (nunits_vectype
933 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
934 nunits_vectype, max_nunits)))
935 {
936 if (is_a <bb_vec_info> (vinfo) && i != 0)
937 continue;
938 /* Fatal mismatch. */
939 matches[0] = false;
940 return false;
941 }
942
943 gcc_assert (vectype);
944
945 gcall *call_stmt = dyn_cast <gcall *> (stmt);
946 if (call_stmt)
947 {
948 rhs_code = CALL_EXPR;
949
950 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
951 load_p = true;
952 else if ((gimple_call_internal_p (call_stmt)
953 && (!vectorizable_internal_fn_p
954 (gimple_call_internal_fn (call_stmt))))
955 || gimple_call_tail_p (call_stmt)
956 || gimple_call_noreturn_p (call_stmt)
957 || !gimple_call_nothrow_p (call_stmt)
958 || gimple_call_chain (call_stmt))
959 {
960 if (dump_enabled_p ())
961 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
962 "Build SLP failed: unsupported call type %G",
963 call_stmt);
964 if (is_a <bb_vec_info> (vinfo) && i != 0)
965 continue;
966 /* Fatal mismatch. */
967 matches[0] = false;
968 return false;
969 }
970 }
971 else if (gimple_code (stmt) == GIMPLE_PHI)
972 {
973 rhs_code = ERROR_MARK;
974 phi_p = true;
975 }
976 else
977 {
978 rhs_code = gimple_assign_rhs_code (stmt);
979 load_p = gimple_vuse (stmt);
980 }
981
982 /* Check the operation. */
983 if (i == 0)
984 {
985 *node_vectype = vectype;
986 first_stmt_code = rhs_code;
987 first_stmt_load_p = load_p;
988 first_stmt_phi_p = phi_p;
989
990 /* Shift arguments should be equal in all the packed stmts for a
991 vector shift with scalar shift operand. */
992 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
993 || rhs_code == LROTATE_EXPR
994 || rhs_code == RROTATE_EXPR)
995 {
996 vec_mode = TYPE_MODE (vectype);
997
998 /* First see if we have a vector/vector shift. */
999 optab = optab_for_tree_code (rhs_code, vectype,
1000 optab_vector);
1001
1002 if (!optab
1003 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1004 {
1005 /* No vector/vector shift, try for a vector/scalar shift. */
1006 optab = optab_for_tree_code (rhs_code, vectype,
1007 optab_scalar);
1008
1009 if (!optab)
1010 {
1011 if (dump_enabled_p ())
1012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1013 "Build SLP failed: no optab.\n");
1014 if (is_a <bb_vec_info> (vinfo) && i != 0)
1015 continue;
1016 /* Fatal mismatch. */
1017 matches[0] = false;
1018 return false;
1019 }
1020 icode = (int) optab_handler (optab, vec_mode);
1021 if (icode == CODE_FOR_nothing)
1022 {
1023 if (dump_enabled_p ())
1024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1025 "Build SLP failed: "
1026 "op not supported by target.\n");
1027 if (is_a <bb_vec_info> (vinfo) && i != 0)
1028 continue;
1029 /* Fatal mismatch. */
1030 matches[0] = false;
1031 return false;
1032 }
1033 optab_op2_mode = insn_data[icode].operand[2].mode;
1034 if (!VECTOR_MODE_P (optab_op2_mode))
1035 {
1036 need_same_oprnds = true;
1037 first_op1 = gimple_assign_rhs2 (stmt);
1038 }
1039 }
1040 }
1041 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1042 {
1043 need_same_oprnds = true;
1044 first_op1 = gimple_assign_rhs2 (stmt);
1045 }
1046 else if (!load_p
1047 && rhs_code == BIT_FIELD_REF)
1048 {
1049 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1050 if (TREE_CODE (vec) != SSA_NAME
1051 || !types_compatible_p (vectype, TREE_TYPE (vec)))
1052 {
1053 if (is_a <bb_vec_info> (vinfo) && i != 0)
1054 continue;
1055 if (dump_enabled_p ())
1056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1057 "Build SLP failed: "
1058 "BIT_FIELD_REF not supported\n");
1059 /* Fatal mismatch. */
1060 matches[0] = false;
1061 return false;
1062 }
1063 }
1064 else if (call_stmt
1065 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1066 {
1067 need_same_oprnds = true;
1068 first_op1 = gimple_call_arg (call_stmt, 1);
1069 }
1070 }
1071 else
1072 {
1073 if (first_stmt_code != rhs_code
1074 && alt_stmt_code == ERROR_MARK)
1075 alt_stmt_code = rhs_code;
1076 if ((first_stmt_code != rhs_code
1077 && (first_stmt_code != IMAGPART_EXPR
1078 || rhs_code != REALPART_EXPR)
1079 && (first_stmt_code != REALPART_EXPR
1080 || rhs_code != IMAGPART_EXPR)
1081 /* Handle mismatches in plus/minus by computing both
1082 and merging the results. */
1083 && !((first_stmt_code == PLUS_EXPR
1084 || first_stmt_code == MINUS_EXPR)
1085 && (alt_stmt_code == PLUS_EXPR
1086 || alt_stmt_code == MINUS_EXPR)
1087 && rhs_code == alt_stmt_code)
1088 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1089 && (first_stmt_code == ARRAY_REF
1090 || first_stmt_code == BIT_FIELD_REF
1091 || first_stmt_code == INDIRECT_REF
1092 || first_stmt_code == COMPONENT_REF
1093 || first_stmt_code == MEM_REF)))
1094 || first_stmt_load_p != load_p
1095 || first_stmt_phi_p != phi_p)
1096 {
1097 if (dump_enabled_p ())
1098 {
1099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1100 "Build SLP failed: different operation "
1101 "in stmt %G", stmt);
1102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1103 "original stmt %G", first_stmt_info->stmt);
1104 }
1105 /* Mismatch. */
1106 continue;
1107 }
1108
1109 if (need_same_oprnds)
1110 {
1111 tree other_op1 = (call_stmt
1112 ? gimple_call_arg (call_stmt, 1)
1113 : gimple_assign_rhs2 (stmt));
1114 if (!operand_equal_p (first_op1, other_op1, 0))
1115 {
1116 if (dump_enabled_p ())
1117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1118 "Build SLP failed: different shift "
1119 "arguments in %G", stmt);
1120 /* Mismatch. */
1121 continue;
1122 }
1123 }
1124 if (!load_p
1125 && first_stmt_code == BIT_FIELD_REF
1126 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1127 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1128 {
1129 if (dump_enabled_p ())
1130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1131 "Build SLP failed: different BIT_FIELD_REF "
1132 "arguments in %G", stmt);
1133 /* Mismatch. */
1134 continue;
1135 }
1136
1137 if (!load_p && rhs_code == CALL_EXPR)
1138 {
1139 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1140 as_a <gcall *> (stmt)))
1141 {
1142 if (dump_enabled_p ())
1143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1144 "Build SLP failed: different calls in %G",
1145 stmt);
1146 /* Mismatch. */
1147 continue;
1148 }
1149 }
1150
1151 if (phi_p
1152 && (gimple_bb (first_stmt_info->stmt)
1153 != gimple_bb (stmt_info->stmt)))
1154 {
1155 if (dump_enabled_p ())
1156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1157 "Build SLP failed: different BB for PHI "
1158 "in %G", stmt);
1159 /* Mismatch. */
1160 continue;
1161 }
1162
1163 if (!types_compatible_p (vectype, *node_vectype))
1164 {
1165 if (dump_enabled_p ())
1166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1167 "Build SLP failed: different vector type "
1168 "in %G", stmt);
1169 /* Mismatch. */
1170 continue;
1171 }
1172 }
1173
1174 /* Grouped store or load. */
1175 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1176 {
1177 if (REFERENCE_CLASS_P (lhs))
1178 {
1179 /* Store. */
1180 ;
1181 }
1182 else
1183 {
1184 /* Load. */
1185 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1186 if (prev_first_load)
1187 {
1188 /* Check that there are no loads from different interleaving
1189 chains in the same node. */
1190 if (prev_first_load != first_load)
1191 {
1192 if (dump_enabled_p ())
1193 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1194 vect_location,
1195 "Build SLP failed: different "
1196 "interleaving chains in one node %G",
1197 stmt);
1198 /* Mismatch. */
1199 continue;
1200 }
1201 }
1202 else
1203 prev_first_load = first_load;
1204 }
1205 } /* Grouped access. */
1206 else
1207 {
1208 if (load_p)
1209 {
1210 /* Not grouped load. */
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1213 "Build SLP failed: not grouped load %G", stmt);
1214
1215 /* FORNOW: Non-grouped loads are not supported. */
1216 if (is_a <bb_vec_info> (vinfo) && i != 0)
1217 continue;
1218 /* Fatal mismatch. */
1219 matches[0] = false;
1220 return false;
1221 }
1222
1223 /* Not a memory operation. */
1224 if (!phi_p
1225 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1226 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1227 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1228 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1229 && rhs_code != VIEW_CONVERT_EXPR
1230 && rhs_code != CALL_EXPR
1231 && rhs_code != BIT_FIELD_REF)
1232 {
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "Build SLP failed: operation unsupported %G",
1236 stmt);
1237 if (is_a <bb_vec_info> (vinfo) && i != 0)
1238 continue;
1239 /* Fatal mismatch. */
1240 matches[0] = false;
1241 return false;
1242 }
1243
1244 if (rhs_code == COND_EXPR)
1245 {
1246 tree cond_expr = gimple_assign_rhs1 (stmt);
1247 enum tree_code cond_code = TREE_CODE (cond_expr);
1248 enum tree_code swap_code = ERROR_MARK;
1249 enum tree_code invert_code = ERROR_MARK;
1250
1251 if (i == 0)
1252 first_cond_code = TREE_CODE (cond_expr);
1253 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1254 {
1255 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1256 swap_code = swap_tree_comparison (cond_code);
1257 invert_code = invert_tree_comparison (cond_code, honor_nans);
1258 }
1259
1260 if (first_cond_code == cond_code)
1261 ;
1262 /* Isomorphism can be achieved by swapping. */
1263 else if (first_cond_code == swap_code)
1264 swap[i] = 1;
1265 /* Isomorphism can be achieved by inverting. */
1266 else if (first_cond_code == invert_code)
1267 swap[i] = 2;
1268 else
1269 {
1270 if (dump_enabled_p ())
1271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1272 "Build SLP failed: different"
1273 " operation %G", stmt);
1274 /* Mismatch. */
1275 continue;
1276 }
1277 }
1278 }
1279
1280 matches[i] = true;
1281 }
1282
1283 for (i = 0; i < group_size; ++i)
1284 if (!matches[i])
1285 return false;
1286
1287 /* If we allowed a two-operation SLP node verify the target can cope
1288 with the permute we are going to use. */
1289 if (alt_stmt_code != ERROR_MARK
1290 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1291 {
1292 *two_operators = true;
1293 }
1294
1295 return true;
1296 }
1297
1298 /* Traits for the hash_map used to record SLP builds (or failures) for a stmt set.
1299 Note we never remove apart from at destruction time so we do not
1300 need a special value for deleted that differs from empty. */
1301 struct bst_traits
1302 {
1303 typedef vec <stmt_vec_info> value_type;
1304 typedef vec <stmt_vec_info> compare_type;
1305 static inline hashval_t hash (value_type);
1306 static inline bool equal (value_type existing, value_type candidate);
1307 static inline bool is_empty (value_type x) { return !x.exists (); }
1308 static inline bool is_deleted (value_type x) { return !x.exists (); }
1309 static const bool empty_zero_p = true;
1310 static inline void mark_empty (value_type &x) { x.release (); }
1311 static inline void mark_deleted (value_type &x) { x.release (); }
1312 static inline void remove (value_type &x) { x.release (); }
1313 };
1314 inline hashval_t
1315 bst_traits::hash (value_type x)
1316 {
1317 inchash::hash h;
1318 for (unsigned i = 0; i < x.length (); ++i)
1319 h.add_int (gimple_uid (x[i]->stmt));
1320 return h.end ();
1321 }
1322 inline bool
1323 bst_traits::equal (value_type existing, value_type candidate)
1324 {
1325 if (existing.length () != candidate.length ())
1326 return false;
1327 for (unsigned i = 0; i < existing.length (); ++i)
1328 if (existing[i] != candidate[i])
1329 return false;
1330 return true;
1331 }
1332
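/* Editor's note: the map below is keyed on the vector of scalar stmts of
   a (sub-)group, so when several SLP parents share the same operand group
   the already built subtree, or the knowledge that building it failed
   (recorded as a NULL entry), is re-used rather than re-discovered. */
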
1333 typedef hash_map <vec <gimple *>, slp_tree,
1334 simple_hashmap_traits <bst_traits, slp_tree> >
1335 scalar_stmts_to_slp_tree_map_t;
1336
1337 static slp_tree
1338 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1339 vec<stmt_vec_info> stmts, unsigned int group_size,
1340 poly_uint64 *max_nunits,
1341 bool *matches, unsigned *npermutes, unsigned *tree_size,
1342 scalar_stmts_to_slp_tree_map_t *bst_map);
1343
1344 static slp_tree
1345 vect_build_slp_tree (vec_info *vinfo,
1346 vec<stmt_vec_info> stmts, unsigned int group_size,
1347 poly_uint64 *max_nunits,
1348 bool *matches, unsigned *npermutes, unsigned *tree_size,
1349 scalar_stmts_to_slp_tree_map_t *bst_map)
1350 {
1351 if (slp_tree *leader = bst_map->get (stmts))
1352 {
1353 if (dump_enabled_p ())
1354 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1355 *leader ? "" : "failed ", *leader);
1356 if (*leader)
1357 {
1358 SLP_TREE_REF_COUNT (*leader)++;
1359 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1360 }
1361 return *leader;
1362 }
1363
1364 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1365 so we can pick up backedge destinations during discovery. */
1366 slp_tree res = new _slp_tree;
1367 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1368 SLP_TREE_SCALAR_STMTS (res) = stmts;
1369 bst_map->put (stmts.copy (), res);
1370
1371 poly_uint64 this_max_nunits = 1;
1372 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1373 &this_max_nunits,
1374 matches, npermutes, tree_size, bst_map);
1375 if (!res_)
1376 {
1377 bool existed_p = bst_map->put (stmts, NULL);
1378 gcc_assert (existed_p);
1379 /* Mark the node invalid so we can detect those when still in use
1380 as backedge destinations. */
1381 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1382 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1383 vect_free_slp_tree (res);
1384 }
1385 else
1386 {
1387 gcc_assert (res_ == res);
1388 res->max_nunits = this_max_nunits;
1389 vect_update_max_nunits (max_nunits, this_max_nunits);
1390 /* Keep a reference for the bst_map use. */
1391 SLP_TREE_REF_COUNT (res)++;
1392 }
1393 return res_;
1394 }
1395
1396 /* Recursively build an SLP tree starting from NODE.
1397 Fail (and return NULL) if def-stmts are not isomorphic, require data
1398 permutation or are of unsupported types of operation. Otherwise
1399 return the built SLP tree; MATCHES indicates which lanes failed to
1400 match the first one. */
1402
1403 static slp_tree
1404 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1405 vec<stmt_vec_info> stmts, unsigned int group_size,
1406 poly_uint64 *max_nunits,
1407 bool *matches, unsigned *npermutes, unsigned *tree_size,
1408 scalar_stmts_to_slp_tree_map_t *bst_map)
1409 {
1410 unsigned nops, i, this_tree_size = 0;
1411 poly_uint64 this_max_nunits = *max_nunits;
1412
1413 matches[0] = false;
1414
1415 stmt_vec_info stmt_info = stmts[0];
1416 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1417 nops = gimple_call_num_args (stmt);
1418 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1419 {
1420 nops = gimple_num_ops (stmt) - 1;
1421 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1422 nops++;
1423 }
1424 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1425 nops = gimple_phi_num_args (phi);
1426 else
1427 return NULL;
1428
1429 /* If the SLP node is a PHI (induction or reduction), terminate
1430 the recursion. */
1431 bool *skip_args = XALLOCAVEC (bool, nops);
1432 memset (skip_args, 0, sizeof (bool) * nops);
1433 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1434 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1435 {
1436 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1437 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1438 group_size);
1439 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1440 max_nunits))
1441 return NULL;
1442
1443 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1444 if (def_type == vect_induction_def)
1445 {
1446 /* Induction PHIs are not cycles but walk the initial
1447 value. Only for inner loops though, for outer loops
1448 we need to pick up the value from the actual PHIs
1449 to more easily support peeling and epilogue vectorization. */
1450 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1451 if (!nested_in_vect_loop_p (loop, stmt_info))
1452 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1453 else
1454 loop = loop->inner;
1455 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1456 }
1457 else if (def_type == vect_reduction_def
1458 || def_type == vect_double_reduction_def
1459 || def_type == vect_nested_cycle)
1460 {
1461 /* Else def types have to match. */
1462 stmt_vec_info other_info;
1463 bool all_same = true;
1464 FOR_EACH_VEC_ELT (stmts, i, other_info)
1465 {
1466 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1467 return NULL;
1468 if (other_info != stmt_info)
1469 all_same = false;
1470 }
1471 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1472 /* Reduction initial values are not explicitly represented. */
1473 if (!nested_in_vect_loop_p (loop, stmt_info))
1474 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1475 /* Reduction chain backedge defs are filled manually.
1476 ??? Need a better way to identify a SLP reduction chain PHI.
1477 Or a better overall way to SLP match those. */
1478 if (all_same && def_type == vect_reduction_def)
1479 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1480 }
1481 else if (def_type != vect_internal_def)
1482 return NULL;
1483 }
1484
1485
1486 bool two_operators = false;
1487 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1488 tree vectype = NULL_TREE;
1489 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1490 &this_max_nunits, matches, &two_operators,
1491 &vectype))
1492 return NULL;
1493
1494 /* If the SLP node is a load, terminate the recursion unless masked. */
1495 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1496 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1497 {
1498 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1499 {
1500 /* Masked load. */
1501 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1502 nops = 1;
1503 }
1504 else
1505 {
1506 *max_nunits = this_max_nunits;
1507 (*tree_size)++;
1508 node = vect_create_new_slp_node (node, stmts, 0);
1509 SLP_TREE_VECTYPE (node) = vectype;
1510 /* And compute the load permutation. Whether it is actually
1511 a permutation depends on the unrolling factor which is
1512 decided later. */
1513 vec<unsigned> load_permutation;
1514 int j;
1515 stmt_vec_info load_info;
1516 load_permutation.create (group_size);
1517 stmt_vec_info first_stmt_info
1518 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1519 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1520 {
1521 int load_place = vect_get_place_in_interleaving_chain
1522 (load_info, first_stmt_info);
1523 gcc_assert (load_place != -1);
1524 load_permutation.safe_push (load_place);
1525 }
1526 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
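/* Editor's note, illustrative only: for scalar loads {a[3], a[0], a[2],
   a[1]} in a group headed by the load of a[0] the recorded lane order is
   {3, 0, 2, 1}; whether that really requires a permute is only known
   once the unrolling factor is decided. */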
1527 return node;
1528 }
1529 }
1530 else if (gimple_assign_single_p (stmt_info->stmt)
1531 && !gimple_vuse (stmt_info->stmt)
1532 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1533 {
1534 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1535 the same SSA name vector of a compatible type to vectype. */
1536 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1537 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1538 stmt_vec_info estmt_info;
1539 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1540 {
1541 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1542 tree bfref = gimple_assign_rhs1 (estmt);
1543 HOST_WIDE_INT lane;
1544 if (!known_eq (bit_field_size (bfref),
1545 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1546 || !constant_multiple_p (bit_field_offset (bfref),
1547 bit_field_size (bfref), &lane))
1548 {
1549 lperm.release ();
1550 return NULL;
1551 }
1552 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1553 }
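/* Editor's note: each scalar stmt handled here has the form
   x = BIT_FIELD_REF <v, elt_size, lane * elt_size>, so LPERM collects
   (operand 0, lane) pairs that select lanes of the single pre-existing
   vector V through the VEC_PERM_EXPR node built below. */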
1554 slp_tree vnode = vect_create_new_slp_node (vNULL);
1555 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1556 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1557 /* We are always building a permutation node even if it is an identity
1558 permute to shield the rest of the vectorizer from the odd node
1559 representing an actual vector without any scalar ops.
1560 ??? We could hide it completely by making the permute node
1561 external? */
1562 node = vect_create_new_slp_node (node, stmts, 1);
1563 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1564 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1565 SLP_TREE_VECTYPE (node) = vectype;
1566 SLP_TREE_CHILDREN (node).quick_push (vnode);
1567 return node;
1568 }
1569
1570 /* Get at the operands, verifying they are compatible. */
1571 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1572 slp_oprnd_info oprnd_info;
1573 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1574 {
1575 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
1576 stmts, i, &oprnds_info);
1577 if (res != 0)
1578 matches[(res == -1) ? 0 : i] = false;
1579 if (!matches[0])
1580 break;
1581 }
1582 for (i = 0; i < group_size; ++i)
1583 if (!matches[i])
1584 {
1585 vect_free_oprnd_info (oprnds_info);
1586 return NULL;
1587 }
1588 swap = NULL;
1589
1590 auto_vec<slp_tree, 4> children;
1591
1592 stmt_info = stmts[0];
1593
1594 /* Create SLP_TREE nodes for the definition node/s. */
1595 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1596 {
1597 slp_tree child;
1598 unsigned int j;
1599
1600 /* We're skipping certain operands from processing, for example
1601 outer loop reduction initial defs. */
1602 if (skip_args[i])
1603 {
1604 children.safe_push (NULL);
1605 continue;
1606 }
1607
1608 if (oprnd_info->first_dt == vect_uninitialized_def)
1609 {
1610 /* COND_EXPR has one operand too many here if the condition
1611 is an SSA name. */
1612 gcc_assert (i == 3 && nops == 4);
1613 continue;
1614 }
1615
1616 if (is_a <bb_vec_info> (vinfo)
1617 && oprnd_info->first_dt == vect_internal_def
1618 && !oprnd_info->any_pattern)
1619 {
1620 /* For BB vectorization, if all defs are the same do not
1621 bother to continue the build along the single-lane
1622 graph but use a splat of the scalar value. */
1623 stmt_vec_info first_def = oprnd_info->def_stmts[0];
1624 for (j = 1; j < group_size; ++j)
1625 if (oprnd_info->def_stmts[j] != first_def)
1626 break;
1627 if (j == group_size
1628 /* But avoid doing this for loads where we may be
1629 able to CSE things, unless the stmt is not
1630 vectorizable. */
1631 && (!STMT_VINFO_VECTORIZABLE (first_def)
1632 || !gimple_vuse (first_def->stmt)))
1633 {
1634 if (dump_enabled_p ())
1635 dump_printf_loc (MSG_NOTE, vect_location,
1636 "Using a splat of the uniform operand\n");
1637 oprnd_info->first_dt = vect_external_def;
1638 }
1639 }
1640
1641 if (oprnd_info->first_dt == vect_external_def
1642 || oprnd_info->first_dt == vect_constant_def)
1643 {
1644 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1645 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1646 oprnd_info->ops = vNULL;
1647 children.safe_push (invnode);
1648 continue;
1649 }
1650
1651 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1652 group_size, &this_max_nunits,
1653 matches, npermutes,
1654 &this_tree_size, bst_map)) != NULL)
1655 {
1656 oprnd_info->def_stmts = vNULL;
1657 children.safe_push (child);
1658 continue;
1659 }
1660
1661 /* If the SLP build for operand zero failed and operand zero
1662 and one can be commuted try that for the scalar stmts
1663 that failed the match. */
1664 if (i == 0
1665 /* A first scalar stmt mismatch signals a fatal mismatch. */
1666 && matches[0]
1667 /* ??? For COND_EXPRs we can swap the comparison operands
1668 as well as the arms under some constraints. */
1669 && nops == 2
1670 && oprnds_info[1]->first_dt == vect_internal_def
1671 && is_gimple_assign (stmt_info->stmt)
1672 /* Swapping operands for reductions breaks assumptions later on. */
1673 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1674 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
1675 /* Do so only if the number of unsuccessful permutes was no more
1676 than a cut-off, as re-trying the recursive match on
1677 possibly each level of the tree would expose exponential
1678 behavior. */
1679 && *npermutes < 4)
1680 {
1681 /* See whether we can swap the matching or the non-matching
1682 stmt operands. */
1683 bool swap_not_matching = true;
1684 do
1685 {
1686 for (j = 0; j < group_size; ++j)
1687 {
1688 if (matches[j] != !swap_not_matching)
1689 continue;
1690 stmt_vec_info stmt_info = stmts[j];
1691 /* Verify if we can swap operands of this stmt. */
1692 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1693 if (!stmt
1694 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1695 {
1696 if (!swap_not_matching)
1697 goto fail;
1698 swap_not_matching = false;
1699 break;
1700 }
1701 }
1702 }
1703 while (j != group_size);
1704
1705 /* Swap mismatched definition stmts. */
1706 if (dump_enabled_p ())
1707 dump_printf_loc (MSG_NOTE, vect_location,
1708 "Re-trying with swapped operands of stmts ");
1709 for (j = 0; j < group_size; ++j)
1710 if (matches[j] == !swap_not_matching)
1711 {
1712 std::swap (oprnds_info[0]->def_stmts[j],
1713 oprnds_info[1]->def_stmts[j]);
1714 std::swap (oprnds_info[0]->ops[j],
1715 oprnds_info[1]->ops[j]);
1716 if (dump_enabled_p ())
1717 dump_printf (MSG_NOTE, "%d ", j);
1718 }
1719 if (dump_enabled_p ())
1720 dump_printf (MSG_NOTE, "\n");
1721 /* And try again with scratch 'matches' ... */
1722 bool *tem = XALLOCAVEC (bool, group_size);
1723 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1724 group_size, &this_max_nunits,
1725 tem, npermutes,
1726 &this_tree_size, bst_map)) != NULL)
1727 {
1728 oprnd_info->def_stmts = vNULL;
1729 children.safe_push (child);
1730 continue;
1731 }
1732 /* We do not undo the swapping here since it might still be
1733 the better order for the second operand in case we build
1734 the first one from scalars below. */
1735 ++*npermutes;
1736 }
1737 fail:
1738
1739 /* If the SLP build failed and we analyze a basic-block
1740 simply treat nodes we fail to build as externally defined
1741 (and thus build vectors from the scalar defs).
1742 The cost model will reject outright expensive cases.
1743 ??? This doesn't treat cases where permutation ultimately
1744 fails (or we don't try permutation below). Ideally we'd
1745 even compute a permutation that will end up with the maximum
1746 SLP tree size... */
1747 if (is_a <bb_vec_info> (vinfo)
1748 /* ??? Rejecting patterns this way doesn't work. We'd have to
1749 do extra work to cancel the pattern so the uses see the
1750 scalar version. */
1751 && !is_pattern_stmt_p (stmt_info)
1752 && !oprnd_info->any_pattern)
1753 {
1754 /* But if there's a leading vector sized set of matching stmts
1755 fail here so we can split the group. This matches the condition
1756 vect_analyze_slp_instance uses. */
1757 /* ??? We might want to split here and combine the results to support
1758 multiple vector sizes better. */
1759 for (j = 0; j < group_size; ++j)
1760 if (!matches[j])
1761 break;
1762 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
1763 {
1764 if (dump_enabled_p ())
1765 dump_printf_loc (MSG_NOTE, vect_location,
1766 "Building vector operands from scalars\n");
1767 this_tree_size++;
1768 child = vect_create_new_slp_node (oprnd_info->ops);
1769 children.safe_push (child);
1770 oprnd_info->ops = vNULL;
1771 continue;
1772 }
1773 }
1774
1775 gcc_assert (child == NULL);
1776 FOR_EACH_VEC_ELT (children, j, child)
1777 if (child)
1778 vect_free_slp_tree (child);
1779 vect_free_oprnd_info (oprnds_info);
1780 return NULL;
1781 }
1782
1783 vect_free_oprnd_info (oprnds_info);
1784
1785 /* If all children of a node are built up from uniform scalars, or if
1786 building the node would require more than one possibly expensive
1787 vector construction, just throw it away so it is built up from
1788 scalars instead. The exception is the SLP node for the vector store. */
1789 if (is_a <bb_vec_info> (vinfo)
1790 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1791 /* ??? Rejecting patterns this way doesn't work. We'd have to
1792 do extra work to cancel the pattern so the uses see the
1793 scalar version. */
1794 && !is_pattern_stmt_p (stmt_info))
1795 {
1796 slp_tree child;
1797 unsigned j;
1798 bool all_uniform_p = true;
1799 unsigned n_vector_builds = 0;
1800 FOR_EACH_VEC_ELT (children, j, child)
1801 {
1802 if (!child)
1803 ;
1804 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1805 all_uniform_p = false;
1806 else if (!vect_slp_tree_uniform_p (child))
1807 {
1808 all_uniform_p = false;
1809 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
1810 n_vector_builds++;
1811 }
1812 }
1813 if (all_uniform_p || n_vector_builds > 1)
1814 {
1815 /* Roll back. */
1816 matches[0] = false;
1817 FOR_EACH_VEC_ELT (children, j, child)
1818 if (child)
1819 vect_free_slp_tree (child);
1820
1821 if (dump_enabled_p ())
1822 dump_printf_loc (MSG_NOTE, vect_location,
1823 "Building parent vector operands from "
1824 "scalars instead\n");
1825 return NULL;
1826 }
1827 }
1828
1829 *tree_size += this_tree_size + 1;
1830 *max_nunits = this_max_nunits;
1831
1832 if (two_operators)
1833 {
1834 /* ??? We'd likely want to either cache in bst_map sth like
1835 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1836 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1837 explicit stmts to put in so the keying on 'stmts' doesn't
1838 work (but we have the same issue with nodes that use 'ops'). */
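/* As an illustration: for the scalar stmts { a0 + b0, a1 - b1, a2 + b2,
   a3 - b3 } the node ONE built below computes the PLUS_EXPR for all lanes,
   TWO computes the MINUS_EXPR for all lanes, and the lane permutation
   recorded on the VEC_PERM node becomes { (0,0), (1,1), (0,2), (1,3) },
   i.e. lane i is taken from ONE for the '+' stmts and from TWO for the
   '-' stmts.  */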
1839 slp_tree one = new _slp_tree;
1840 slp_tree two = new _slp_tree;
1841 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1842 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1843 SLP_TREE_VECTYPE (one) = vectype;
1844 SLP_TREE_VECTYPE (two) = vectype;
1845 SLP_TREE_CHILDREN (one).safe_splice (children);
1846 SLP_TREE_CHILDREN (two).safe_splice (children);
1847 slp_tree child;
1848 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1849 SLP_TREE_REF_COUNT (child)++;
1850
1851 /* Here we record the original defs since this
1852 node represents the final lane configuration. */
1853 node = vect_create_new_slp_node (node, stmts, 2);
1854 SLP_TREE_VECTYPE (node) = vectype;
1855 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1856 SLP_TREE_CHILDREN (node).quick_push (one);
1857 SLP_TREE_CHILDREN (node).quick_push (two);
1858 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1859 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1860 enum tree_code ocode = ERROR_MARK;
1861 stmt_vec_info ostmt_info;
1862 unsigned j = 0;
1863 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1864 {
1865 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1866 if (gimple_assign_rhs_code (ostmt) != code0)
1867 {
1868 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1869 ocode = gimple_assign_rhs_code (ostmt);
1870 j = i;
1871 }
1872 else
1873 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1874 }
1875 SLP_TREE_CODE (one) = code0;
1876 SLP_TREE_CODE (two) = ocode;
1877 SLP_TREE_LANES (one) = stmts.length ();
1878 SLP_TREE_LANES (two) = stmts.length ();
1879 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1880 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1881 return node;
1882 }
1883
1884 node = vect_create_new_slp_node (node, stmts, nops);
1885 SLP_TREE_VECTYPE (node) = vectype;
1886 SLP_TREE_CHILDREN (node).splice (children);
1887 return node;
1888 }
1889
1890 /* Dump a single SLP tree NODE. */
1891
1892 static void
1893 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1894 slp_tree node)
1895 {
1896 unsigned i, j;
1897 slp_tree child;
1898 stmt_vec_info stmt_info;
1899 tree op;
1900
1901 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1902 dump_user_location_t user_loc = loc.get_user_location ();
1903 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1904 SLP_TREE_DEF_TYPE (node) == vect_external_def
1905 ? " (external)"
1906 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1907 ? " (constant)"
1908 : ""), node,
1909 estimated_poly_value (node->max_nunits),
1910 SLP_TREE_REF_COUNT (node));
1911 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1912 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1913 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
1914 else
1915 {
1916 dump_printf_loc (metadata, user_loc, "\t{ ");
1917 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1918 dump_printf (metadata, "%T%s ", op,
1919 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
1920 dump_printf (metadata, "}\n");
1921 }
1922 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1923 {
1924 dump_printf_loc (metadata, user_loc, "\tload permutation {");
1925 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
1926 dump_printf (dump_kind, " %u", j);
1927 dump_printf (dump_kind, " }\n");
1928 }
1929 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1930 {
1931 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
1932 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
1933 dump_printf (dump_kind, " %u[%u]",
1934 SLP_TREE_LANE_PERMUTATION (node)[i].first,
1935 SLP_TREE_LANE_PERMUTATION (node)[i].second);
1936 dump_printf (dump_kind, " }\n");
1937 }
1938 if (SLP_TREE_CHILDREN (node).is_empty ())
1939 return;
1940 dump_printf_loc (metadata, user_loc, "\tchildren");
1941 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1942 dump_printf (dump_kind, " %p", (void *)child);
1943 dump_printf (dump_kind, "\n");
1944 }
1945
1946 DEBUG_FUNCTION void
1947 debug (slp_tree node)
1948 {
1949 debug_dump_context ctx;
1950 vect_print_slp_tree (MSG_NOTE,
1951 dump_location_t::from_location_t (UNKNOWN_LOCATION),
1952 node);
1953 }
1954
1955 /* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND. */
1956
1957 static void
1958 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1959 slp_tree node, hash_set<slp_tree> &visited)
1960 {
1961 unsigned i;
1962 slp_tree child;
1963
1964 if (visited.add (node))
1965 return;
1966
1967 vect_print_slp_tree (dump_kind, loc, node);
1968
1969 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1970 if (child)
1971 vect_print_slp_graph (dump_kind, loc, child, visited);
1972 }
1973
1974 static void
1975 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1976 slp_tree entry)
1977 {
1978 hash_set<slp_tree> visited;
1979 vect_print_slp_graph (dump_kind, loc, entry, visited);
1980 }
1981
1982 /* Mark the tree rooted at NODE with PURE_SLP. */
1983
1984 static void
1985 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
1986 {
1987 int i;
1988 stmt_vec_info stmt_info;
1989 slp_tree child;
1990
1991 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1992 return;
1993
1994 if (visited.add (node))
1995 return;
1996
1997 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1998 STMT_SLP_TYPE (stmt_info) = pure_slp;
1999
2000 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2001 if (child)
2002 vect_mark_slp_stmts (child, visited);
2003 }
2004
2005 static void
2006 vect_mark_slp_stmts (slp_tree node)
2007 {
2008 hash_set<slp_tree> visited;
2009 vect_mark_slp_stmts (node, visited);
2010 }
2011
2012 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2013
2014 static void
2015 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2016 {
2017 int i;
2018 stmt_vec_info stmt_info;
2019 slp_tree child;
2020
2021 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2022 return;
2023
2024 if (visited.add (node))
2025 return;
2026
2027 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2028 {
2029 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2030 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2031 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2032 }
2033
2034 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2035 if (child)
2036 vect_mark_slp_stmts_relevant (child, visited);
2037 }
2038
2039 static void
2040 vect_mark_slp_stmts_relevant (slp_tree node)
2041 {
2042 hash_set<slp_tree> visited;
2043 vect_mark_slp_stmts_relevant (node, visited);
2044 }
2045
2046
2047 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2048
2049 static void
2050 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2051 hash_set<slp_tree> &visited)
2052 {
2053 if (!node || visited.add (node))
2054 return;
2055
2056 if (SLP_TREE_CHILDREN (node).length () == 0)
2057 {
2058 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2059 return;
2060 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2061 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2062 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2063 loads.safe_push (node);
2064 }
2065 else
2066 {
2067 unsigned i;
2068 slp_tree child;
2069 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2070 vect_gather_slp_loads (loads, child, visited);
2071 }
2072 }
2073
2074 static void
2075 vect_gather_slp_loads (slp_instance inst, slp_tree node)
2076 {
2077 hash_set<slp_tree> visited;
2078 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
2079 }
2080
2081
2082 /* Find the last scalar stmt in NODE. */
2083
2084 stmt_vec_info
2085 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2086 {
2087 stmt_vec_info last = NULL;
2088 stmt_vec_info stmt_vinfo;
2089
2090 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2091 {
2092 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2093 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2094 }
2095
2096 return last;
2097 }
2098
2099 /* Find the first stmt in NODE. */
2100
2101 stmt_vec_info
2102 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2103 {
2104 stmt_vec_info first = NULL;
2105 stmt_vec_info stmt_vinfo;
2106
2107 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2108 {
2109 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2110 if (!first
2111 || get_later_stmt (stmt_vinfo, first) == first)
2112 first = stmt_vinfo;
2113 }
2114
2115 return first;
2116 }
2117
2118 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2119 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2120 (also containing the first GROUP1_SIZE stmts, since stores are
2121 consecutive), the second containing the remainder.
2122 Return the first stmt in the second group. */
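   /* For example, splitting an original group of 7 stores (gap 0) with
      GROUP1_SIZE == 4 leaves the first group with DR_GROUP_SIZE 4 and gap 3
      (it has to skip over the second group), and produces a second group of
      size 3 whose gap is 4 (the original gap plus the first group).  */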
2123
2124 static stmt_vec_info
2125 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2126 {
2127 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2128 gcc_assert (group1_size > 0);
2129 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2130 gcc_assert (group2_size > 0);
2131 DR_GROUP_SIZE (first_vinfo) = group1_size;
2132
2133 stmt_vec_info stmt_info = first_vinfo;
2134 for (unsigned i = group1_size; i > 1; i--)
2135 {
2136 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2137 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2138 }
2139 /* STMT is now the last element of the first group. */
2140 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2141 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2142
2143 DR_GROUP_SIZE (group2) = group2_size;
2144 for (stmt_info = group2; stmt_info;
2145 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2146 {
2147 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2148 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2149 }
2150
2151 /* For the second group, the DR_GROUP_GAP is that before the original group,
2152 plus skipping over the first group. */
2153 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2154
2155 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2156 DR_GROUP_GAP (first_vinfo) += group2_size;
2157
2158 if (dump_enabled_p ())
2159 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2160 group1_size, group2_size);
2161
2162 return group2;
2163 }
2164
2165 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2166 statements and a vector of NUNITS elements. */
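   /* For instance, NUNITS == 4 and GROUP_SIZE == 6 give a common multiple of
      12 and thus an unrolling factor of 2: two copies of the 6-stmt group
      exactly fill three 4-lane vectors.  */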
2167
2168 static poly_uint64
2169 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2170 {
2171 return exact_div (common_multiple (nunits, group_size), group_size);
2172 }
2173
2174 static bool
2175 vect_analyze_slp_instance (vec_info *vinfo,
2176 scalar_stmts_to_slp_tree_map_t *bst_map,
2177 stmt_vec_info stmt_info, unsigned max_tree_size);
2178
2179 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2180 of KIND. Return true if successful. */
2181
2182 static bool
2183 vect_build_slp_instance (vec_info *vinfo,
2184 slp_instance_kind kind,
2185 vec<stmt_vec_info> scalar_stmts,
2186 stmt_vec_info root_stmt_info,
2187 unsigned max_tree_size,
2188 scalar_stmts_to_slp_tree_map_t *bst_map,
2189 /* ??? We need stmt_info for group splitting. */
2190 stmt_vec_info stmt_info_)
2191 {
2192 if (dump_enabled_p ())
2193 {
2194 dump_printf_loc (MSG_NOTE, vect_location,
2195 "Starting SLP discovery for\n");
2196 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
2197 dump_printf_loc (MSG_NOTE, vect_location,
2198 " %G", scalar_stmts[i]->stmt);
2199 }
2200
2201 /* Build the tree for the SLP instance. */
2202 unsigned int group_size = scalar_stmts.length ();
2203 bool *matches = XALLOCAVEC (bool, group_size);
2204 unsigned npermutes = 0;
2205 poly_uint64 max_nunits = 1;
2206 unsigned tree_size = 0;
2207 unsigned i;
2208 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2209 &max_nunits, matches, &npermutes,
2210 &tree_size, bst_map);
2211 if (node != NULL)
2212 {
2213 /* Calculate the unrolling factor based on the smallest type. */
2214 poly_uint64 unrolling_factor
2215 = calculate_unrolling_factor (max_nunits, group_size);
2216
2217 if (maybe_ne (unrolling_factor, 1U)
2218 && is_a <bb_vec_info> (vinfo))
2219 {
2220 unsigned HOST_WIDE_INT const_max_nunits;
2221 if (!max_nunits.is_constant (&const_max_nunits)
2222 || const_max_nunits > group_size)
2223 {
2224 if (dump_enabled_p ())
2225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2226 "Build SLP failed: store group "
2227 "size not a multiple of the vector size "
2228 "in basic block SLP\n");
2229 vect_free_slp_tree (node);
2230 return false;
2231 }
2232 /* Fatal mismatch. */
2233 if (dump_enabled_p ())
2234 dump_printf_loc (MSG_NOTE, vect_location,
2235 "SLP discovery succeeded but node needs "
2236 "splitting\n");
2237 memset (matches, true, group_size);
2238 matches[group_size / const_max_nunits * const_max_nunits] = false;
2239 vect_free_slp_tree (node);
2240 }
2241 else
2242 {
2243 /* Create a new SLP instance. */
2244 slp_instance new_instance = XNEW (class _slp_instance);
2245 SLP_INSTANCE_TREE (new_instance) = node;
2246 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2247 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2248 SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
2249 SLP_INSTANCE_KIND (new_instance) = kind;
2250 new_instance->reduc_phis = NULL;
2251 new_instance->cost_vec = vNULL;
2252 new_instance->subgraph_entries = vNULL;
2253
2254 vect_gather_slp_loads (new_instance, node);
2255 if (dump_enabled_p ())
2256 dump_printf_loc (MSG_NOTE, vect_location,
2257 "SLP size %u vs. limit %u.\n",
2258 tree_size, max_tree_size);
2259
2260 /* Fixup SLP reduction chains. */
2261 if (kind == slp_inst_kind_reduc_chain)
2262 {
2263 /* If this is a reduction chain with a conversion in front
2264 amend the SLP tree with a node for that. */
2265 gimple *scalar_def
2266 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2267 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
2268 {
2269 /* Get at the conversion stmt - we know it's the single use
2270 of the last stmt of the reduction chain. */
2271 use_operand_p use_p;
2272 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
2273 &use_p, &scalar_def);
2274 gcc_assert (r);
2275 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
2276 next_info = vect_stmt_to_vectorize (next_info);
2277 scalar_stmts = vNULL;
2278 scalar_stmts.create (group_size);
2279 for (unsigned i = 0; i < group_size; ++i)
2280 scalar_stmts.quick_push (next_info);
2281 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2282 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2283 SLP_TREE_CHILDREN (conv).quick_push (node);
2284 SLP_INSTANCE_TREE (new_instance) = conv;
2285 /* We also have to fake this conversion stmt as SLP reduction
2286 group so we don't have to mess with too much code
2287 elsewhere. */
2288 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2289 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2290 }
2291 /* Fill the backedge child of the PHI SLP node. The
2292 general matching code cannot find it because the
2293 scalar code does not reflect how we vectorize the
2294 reduction. */
2295 use_operand_p use_p;
2296 imm_use_iterator imm_iter;
2297 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
2298 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
2299 gimple_get_lhs (scalar_def))
2300 /* There are exactly two non-debug uses, the reduction
2301 PHI and the loop-closed PHI node. */
2302 if (!is_gimple_debug (USE_STMT (use_p))
2303 && gimple_bb (USE_STMT (use_p)) == loop->header)
2304 {
2305 auto_vec<stmt_vec_info, 64> phis (group_size);
2306 stmt_vec_info phi_info
2307 = vinfo->lookup_stmt (USE_STMT (use_p));
2308 for (unsigned i = 0; i < group_size; ++i)
2309 phis.quick_push (phi_info);
2310 slp_tree *phi_node = bst_map->get (phis);
2311 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
2312 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
2313 = SLP_INSTANCE_TREE (new_instance);
2314 SLP_INSTANCE_TREE (new_instance)->refcnt++;
2315 }
2316 }
2317
2318 vinfo->slp_instances.safe_push (new_instance);
2319
2320 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2321 the number of scalar stmts in the root in a few places.
2322 Verify that assumption holds. */
2323 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2324 .length () == group_size);
2325
2326 if (dump_enabled_p ())
2327 {
2328 dump_printf_loc (MSG_NOTE, vect_location,
2329 "Final SLP tree for instance %p:\n", new_instance);
2330 vect_print_slp_graph (MSG_NOTE, vect_location,
2331 SLP_INSTANCE_TREE (new_instance));
2332 }
2333
2334 return true;
2335 }
2336 }
2337 else
2338 {
2339 /* Failed to SLP. */
2340 /* Free the allocated memory. */
2341 scalar_stmts.release ();
2342 }
2343
2344 stmt_vec_info stmt_info = stmt_info_;
2345 /* Try to break the group up into pieces. */
2346 if (kind == slp_inst_kind_store)
2347 {
2348 /* ??? We could delay all the actual splitting of store-groups
2349 until after SLP discovery of the original group completed.
2350 Then we can recurse to vect_build_slp_instance directly. */
2351 for (i = 0; i < group_size; i++)
2352 if (!matches[i])
2353 break;
2354
2355 /* For basic block SLP, try to break the group up into multiples of
2356 a vector size. */
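      /* For instance, if SLP discovery on a group of 7 stores first fails at
	 stmt 6 and a 4-lane vector type is available, the group is split into
	 { 0..3 }, { 4, 5 } and { 6 }; the first two pieces are re-analyzed
	 below while the single remaining store stays scalar.  */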
2357 if (is_a <bb_vec_info> (vinfo)
2358 && (i > 1 && i < group_size))
2359 {
2360 tree scalar_type
2361 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2362 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2363 1 << floor_log2 (i));
2364 unsigned HOST_WIDE_INT const_nunits;
2365 if (vectype
2366 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2367 {
2368 /* Split into two groups at the first vector boundary. */
2369 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2370 unsigned group1_size = i & ~(const_nunits - 1);
2371
2372 if (dump_enabled_p ())
2373 dump_printf_loc (MSG_NOTE, vect_location,
2374 "Splitting SLP group at stmt %u\n", i);
2375 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2376 group1_size);
2377 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2378 max_tree_size);
2379 /* Split the rest at the failure point and possibly
2380 re-analyze the remaining matching part if it has
2381 at least two lanes. */
2382 if (group1_size < i
2383 && (i + 1 < group_size
2384 || i - group1_size > 1))
2385 {
2386 stmt_vec_info rest2 = rest;
2387 rest = vect_split_slp_store_group (rest, i - group1_size);
2388 if (i - group1_size > 1)
2389 res |= vect_analyze_slp_instance (vinfo, bst_map,
2390 rest2, max_tree_size);
2391 }
2392 /* Re-analyze the non-matching tail if it has at least
2393 two lanes. */
2394 if (i + 1 < group_size)
2395 res |= vect_analyze_slp_instance (vinfo, bst_map,
2396 rest, max_tree_size);
2397 return res;
2398 }
2399 }
2400
2401 /* For loop vectorization split into arbitrary pieces of size > 1. */
2402 if (is_a <loop_vec_info> (vinfo)
2403 && (i > 1 && i < group_size))
2404 {
2405 unsigned group1_size = i;
2406
2407 if (dump_enabled_p ())
2408 dump_printf_loc (MSG_NOTE, vect_location,
2409 "Splitting SLP group at stmt %u\n", i);
2410
2411 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2412 group1_size);
2413 /* Loop vectorization cannot handle gaps in stores, make sure
2414 the split group appears as strided. */
2415 STMT_VINFO_STRIDED_P (rest) = 1;
2416 DR_GROUP_GAP (rest) = 0;
2417 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2418 DR_GROUP_GAP (stmt_info) = 0;
2419
2420 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2421 max_tree_size);
2422 if (i + 1 < group_size)
2423 res |= vect_analyze_slp_instance (vinfo, bst_map,
2424 rest, max_tree_size);
2425
2426 return res;
2427 }
2428
2429 /* Even though the first vector did not all match, we might be able to SLP
2430 (some) of the remainder. FORNOW ignore this possibility. */
2431 }
2432
2433 /* Failed to SLP. */
2434 if (dump_enabled_p ())
2435 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2436 return false;
2437 }
2438
2439
2440 /* Analyze an SLP instance starting from a group of grouped stores. Call
2441 vect_build_slp_tree to build a tree of packed stmts if possible.
2442 Return FALSE if it's impossible to SLP any stmt in the loop. */
2443
2444 static bool
2445 vect_analyze_slp_instance (vec_info *vinfo,
2446 scalar_stmts_to_slp_tree_map_t *bst_map,
2447 stmt_vec_info stmt_info, unsigned max_tree_size)
2448 {
2449 unsigned int group_size;
2450 unsigned int i;
2451 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2452 vec<stmt_vec_info> scalar_stmts;
2453 slp_instance_kind kind;
2454
2455 if (is_a <bb_vec_info> (vinfo))
2456 vect_location = stmt_info->stmt;
2457 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2458 {
2459 kind = slp_inst_kind_store;
2460 group_size = DR_GROUP_SIZE (stmt_info);
2461 }
2462 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2463 {
2464 kind = slp_inst_kind_reduc_chain;
2465 gcc_assert (is_a <loop_vec_info> (vinfo));
2466 group_size = REDUC_GROUP_SIZE (stmt_info);
2467 }
2468 else if (is_gimple_assign (stmt_info->stmt)
2469 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2470 {
2471 kind = slp_inst_kind_ctor;
2472 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2473 }
2474 else
2475 {
2476 kind = slp_inst_kind_reduc_group;
2477 gcc_assert (is_a <loop_vec_info> (vinfo));
2478 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2479 }
2480
2481 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2482 scalar_stmts.create (group_size);
2483 stmt_vec_info next_info = stmt_info;
2484 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2485 {
2486 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2487 while (next_info)
2488 {
2489 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2490 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2491 }
2492 }
2493 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2494 {
2495 /* Collect the reduction stmts and store them in
2496 SLP_TREE_SCALAR_STMTS. */
2497 while (next_info)
2498 {
2499 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2500 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2501 }
2502 /* Mark the first element of the reduction chain as reduction to properly
2503 transform the node. In the reduction analysis phase only the last
2504 element of the chain is marked as reduction. */
2505 STMT_VINFO_DEF_TYPE (stmt_info)
2506 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2507 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2508 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2509 }
2510 else if (kind == slp_inst_kind_ctor)
2511 {
2512 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2513 tree val;
2514 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2515 {
2516 if (TREE_CODE (val) == SSA_NAME)
2517 {
2518 gimple* def = SSA_NAME_DEF_STMT (val);
2519 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2520 /* Value is defined in another basic block. */
2521 if (!def_info)
2522 return false;
2523 def_info = vect_stmt_to_vectorize (def_info);
2524 scalar_stmts.safe_push (def_info);
2525 }
2526 else
2527 return false;
2528 }
2529 if (dump_enabled_p ())
2530 dump_printf_loc (MSG_NOTE, vect_location,
2531 "Analyzing vectorizable constructor: %G\n",
2532 stmt_info->stmt);
2533 }
2534 else
2535 {
2536 /* Collect reduction statements. */
2537 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2538 for (i = 0; reductions.iterate (i, &next_info); i++)
2539 if (STMT_VINFO_RELEVANT_P (next_info)
2540 || STMT_VINFO_LIVE_P (next_info))
2541 scalar_stmts.quick_push (next_info);
2542 }
2543
2544 /* Build the tree for the SLP instance. */
2545 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
2546 kind == slp_inst_kind_ctor
2547 ? stmt_info : NULL,
2548 max_tree_size,
2549 bst_map, stmt_info);
2550
2551 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2552 where we should do store group splitting. */
2553
2554 return res;
2555 }
2556
2557 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2558 trees of packed scalar stmts if SLP is possible. */
2559
2560 opt_result
2561 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2562 {
2563 unsigned int i;
2564 stmt_vec_info first_element;
2565
2566 DUMP_VECT_SCOPE ("vect_analyze_slp");
2567
2568 scalar_stmts_to_slp_tree_map_t *bst_map
2569 = new scalar_stmts_to_slp_tree_map_t ();
2570
2571 /* Find SLP sequences starting from groups of grouped stores. */
2572 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2573 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2574
2575 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2576 {
2577 /* Find SLP sequences starting from reduction chains. */
2578 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2579 if (! STMT_VINFO_RELEVANT_P (first_element)
2580 && ! STMT_VINFO_LIVE_P (first_element))
2581 ;
2582 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2583 max_tree_size))
2584 {
2585 /* Dissolve reduction chain group. */
2586 stmt_vec_info vinfo = first_element;
2587 stmt_vec_info last = NULL;
2588 while (vinfo)
2589 {
2590 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2591 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2592 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2593 last = vinfo;
2594 vinfo = next;
2595 }
2596 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2597 /* It can be still vectorized as part of an SLP reduction. */
2598 loop_vinfo->reductions.safe_push (last);
2599 }
2600
2601 /* Find SLP sequences starting from groups of reductions. */
2602 if (loop_vinfo->reductions.length () > 1)
2603 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2604 max_tree_size);
2605 }
2606
2607 /* The map keeps a reference on SLP nodes built, release that. */
2608 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2609 it != bst_map->end (); ++it)
2610 if ((*it).second)
2611 vect_free_slp_tree ((*it).second);
2612 delete bst_map;
2613
2614 return opt_result::success ();
2615 }
2616
2617 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2618
2619 static void
2620 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2621 vec<slp_tree> &vertices, vec<int> &leafs)
2622 {
2623 unsigned i;
2624 slp_tree child;
2625
2626 if (visited.add (node))
2627 return;
2628
2629 node->vertex = vertices.length ();
2630 vertices.safe_push (node);
2631
2632 bool leaf = true;
2633 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2634 if (child)
2635 {
2636 leaf = false;
2637 vect_slp_build_vertices (visited, child, vertices, leafs);
2638 }
2639 if (leaf)
2640 leafs.safe_push (node->vertex);
2641 }
2642
2643 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2644
2645 static void
2646 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2647 vec<int> &leafs)
2648 {
2649 hash_set<slp_tree> visited;
2650 unsigned i;
2651 slp_instance instance;
2652 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2653 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2654 leafs);
2655 }
2656
2657 /* Apply (reverse) bijective PERM to VEC. */
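/* For example, PERM == { 2, 0, 1 } applied to { a, b, c } yields
   { c, a, b } when REVERSE is false (vec[i] = saved[perm[i]]) and
   { b, c, a } when REVERSE is true (vec[perm[i]] = saved[i]); the two
   directions are inverses of each other.  */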
2658
2659 template <class T>
2660 static void
2661 vect_slp_permute (vec<unsigned> perm,
2662 vec<T> &vec, bool reverse)
2663 {
2664 auto_vec<T, 64> saved;
2665 saved.create (vec.length ());
2666 for (unsigned i = 0; i < vec.length (); ++i)
2667 saved.quick_push (vec[i]);
2668
2669 if (reverse)
2670 {
2671 for (unsigned i = 0; i < vec.length (); ++i)
2672 vec[perm[i]] = saved[i];
2673 for (unsigned i = 0; i < vec.length (); ++i)
2674 gcc_assert (vec[perm[i]] == saved[i]);
2675 }
2676 else
2677 {
2678 for (unsigned i = 0; i < vec.length (); ++i)
2679 vec[i] = saved[perm[i]];
2680 for (unsigned i = 0; i < vec.length (); ++i)
2681 gcc_assert (vec[i] == saved[perm[i]]);
2682 }
2683 }
2684
2685 /* Return whether permutations PERM_A and PERM_B as recorded in the
2686 PERMS vector are equal. */
2687
2688 static bool
2689 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
2690 int perm_a, int perm_b)
2691 {
2692 return (perm_a == perm_b
2693 || (perms[perm_a].length () == perms[perm_b].length ()
2694 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
2695 sizeof (unsigned) * perms[perm_a].length ()) == 0));
2696 }
2697
2698 /* Optimize the SLP graph of VINFO. */
2699
2700 void
2701 vect_optimize_slp (vec_info *vinfo)
2702 {
2703 if (vinfo->slp_instances.is_empty ())
2704 return;
2705
2706 slp_tree node;
2707 unsigned i;
2708 auto_vec<slp_tree> vertices;
2709 auto_vec<int> leafs;
2710 vect_slp_build_vertices (vinfo, vertices, leafs);
2711
2712 struct graph *slpg = new_graph (vertices.length ());
2713 FOR_EACH_VEC_ELT (vertices, i, node)
2714 {
2715 unsigned j;
2716 slp_tree child;
2717 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2718 if (child)
2719 add_edge (slpg, i, child->vertex);
2720 }
2721
2722 /* Compute (reverse) postorder on the inverted graph. */
2723 auto_vec<int> ipo;
2724 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
2725
2726 auto_sbitmap n_visited (vertices.length ());
2727 auto_sbitmap n_materialize (vertices.length ());
2728 auto_vec<int> n_perm (vertices.length ());
2729 auto_vec<vec<unsigned> > perms;
2730
2731 bitmap_clear (n_visited);
2732 bitmap_clear (n_materialize);
2733 n_perm.quick_grow_cleared (vertices.length ());
2734 perms.safe_push (vNULL); /* zero is no permute */
2735
2736 /* Produce initial permutations. */
2737 for (i = 0; i < leafs.length (); ++i)
2738 {
2739 int idx = leafs[i];
2740 slp_tree node = vertices[idx];
2741
2742 /* Handle externals and constants optimistically throughout the
2743 iteration. */
2744 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
2745 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
2746 continue;
2747
2748 /* Loads are the only thing generating permutes and leafs do not
2749 change across iterations. */
2750 bitmap_set_bit (n_visited, idx);
2751 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2752 continue;
2753
2754 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2755 node unpermuted, record this permute. */
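      /* For example, a load node whose load permutation is { 5, 4, 7, 6 }
	 spans lanes 4..7 of its group; the code below records the bijective
	 permute { 1, 0, 3, 2 } for it. Unpermuted loads like { 4, 5, 6, 7 }
	 and loads whose span does not match the number of lanes are skipped.  */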
2756 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
2757 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
2758 continue;
2759 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
2760 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
2761 bool any_permute = false;
2762 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2763 {
2764 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
2765 imin = MIN (imin, idx);
2766 imax = MAX (imax, idx);
2767 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
2768 any_permute = true;
2769 }
2770 /* If there's no permute no need to split one out. */
2771 if (!any_permute)
2772 continue;
2773 /* If the span doesn't match we'd disrupt VF computation, avoid
2774 that for now. */
2775 if (imax - imin + 1 != SLP_TREE_LANES (node))
2776 continue;
2777
2778 /* For now only handle true permutes, like
2779 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2780 when permuting constants and invariants keeping the permute
2781 bijective. */
2782 auto_sbitmap load_index (SLP_TREE_LANES (node));
2783 bitmap_clear (load_index);
2784 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2785 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
2786 unsigned j;
2787 for (j = 0; j < SLP_TREE_LANES (node); ++j)
2788 if (!bitmap_bit_p (load_index, j))
2789 break;
2790 if (j != SLP_TREE_LANES (node))
2791 continue;
2792
2793 vec<unsigned> perm = vNULL;
2794 perm.safe_grow (SLP_TREE_LANES (node), true);
2795 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2796 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
2797 perms.safe_push (perm);
2798 n_perm[idx] = perms.length () - 1;
2799 }
2800
2801 /* Propagate permutes along the graph and compute materialization points. */
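/* Roughly: if all successors (operands) of a node agree on the same
   permute - say both loads feeding an addition were recorded with
   { 1, 0, 3, 2 } - that permute is propagated to the node itself, so the
   lanes can stay swapped throughout the expression. A node whose users
   disagree with its permute (a store, or a user with a different permute)
   becomes a materialization point where the permutation is applied.  */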
2802 bool changed;
2803 unsigned iteration = 0;
2804 do
2805 {
2806 changed = false;
2807 ++iteration;
2808
2809 for (i = vertices.length (); i > 0 ; --i)
2810 {
2811 int idx = ipo[i-1];
2812 slp_tree node = vertices[idx];
2813 /* For leafs there's nothing to do - we've seeded permutes
2814 on those above. */
2815 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2816 continue;
2817
2818 bitmap_set_bit (n_visited, idx);
2819
2820 /* We cannot move a permute across a store. */
2821 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2822 && DR_IS_WRITE
2823 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
2824 continue;
2825
2826 int perm = -1;
2827 for (graph_edge *succ = slpg->vertices[idx].succ;
2828 succ; succ = succ->succ_next)
2829 {
2830 int succ_idx = succ->dest;
2831 /* Handle unvisited nodes optimistically. */
2832 /* ??? But for constants once we want to handle non-bijective
2833 permutes we have to verify the permute, when unifying lanes,
2834 will not unify different constants. For example see
2835 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2836 if (!bitmap_bit_p (n_visited, succ_idx))
2837 continue;
2838 int succ_perm = n_perm[succ_idx];
2839 /* Once we materialize the successor's permutation its output lanes
2840 appear unpermuted to us. */
2841 if (bitmap_bit_p (n_materialize, succ_idx))
2842 succ_perm = 0;
2843 if (perm == -1)
2844 perm = succ_perm;
2845 else if (succ_perm == 0)
2846 {
2847 perm = 0;
2848 break;
2849 }
2850 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
2851 {
2852 perm = 0;
2853 break;
2854 }
2855 }
2856
2857 if (perm == -1)
2858 /* Pick up pre-computed leaf values. */
2859 perm = n_perm[idx];
2860 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
2861 {
2862 if (iteration > 1)
2863 /* Make sure we eventually converge. */
2864 gcc_checking_assert (perm == 0);
2865 n_perm[idx] = perm;
2866 if (perm == 0)
2867 bitmap_clear_bit (n_materialize, idx);
2868 changed = true;
2869 }
2870
2871 if (perm == 0)
2872 continue;
2873
2874 /* Elide pruning at materialization points in the first
2875 iteration so every node is visited at least once. */
2876 if (iteration == 1)
2877 continue;
2878
2879 /* Decide on permute materialization. Look whether there's
2880 a use (pred) edge that is permuted differently than us.
2881 In that case mark ourselves so the permutation is applied. */
2882 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
2883 for (graph_edge *pred = slpg->vertices[idx].pred;
2884 pred; pred = pred->pred_next)
2885 {
2886 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
2887 int pred_perm = n_perm[pred->src];
2888 if (!vect_slp_perms_eq (perms, perm, pred_perm))
2889 {
2890 all_preds_permuted = false;
2891 break;
2892 }
2893 }
2894 if (!all_preds_permuted)
2895 {
2896 if (!bitmap_bit_p (n_materialize, idx))
2897 changed = true;
2898 bitmap_set_bit (n_materialize, idx);
2899 }
2900 }
2901 }
2902 while (changed || iteration == 1);
2903
2904 /* Materialize. */
2905 for (i = 0; i < vertices.length (); ++i)
2906 {
2907 int perm = n_perm[i];
2908 if (perm <= 0)
2909 continue;
2910
2911 slp_tree node = vertices[i];
2912
2913 /* First permute invariant/external original successors. */
2914 unsigned j;
2915 slp_tree child;
2916 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2917 {
2918 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2919 continue;
2920
2921 /* If the vector is uniform there's nothing to do. */
2922 if (vect_slp_tree_uniform_p (child))
2923 continue;
2924
2925 /* We can end up sharing some externals via two_operator
2926 handling. Be prepared to unshare those. */
2927 if (child->refcnt != 1)
2928 {
2929 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
2930 SLP_TREE_CHILDREN (node)[j] = child
2931 = vect_create_new_slp_node
2932 (SLP_TREE_SCALAR_OPS (child).copy ());
2933 }
2934 vect_slp_permute (perms[perm],
2935 SLP_TREE_SCALAR_OPS (child), true);
2936 }
2937
2938 if (bitmap_bit_p (n_materialize, i))
2939 {
2940 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2941 /* For loads simply drop the permutation, the load permutation
2942 already performs the desired permutation. */
2943 ;
2944 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2945 {
2946 /* If the node is already a permute node we just need to apply
2947 the permutation to the permute node itself. */
2948 if (dump_enabled_p ())
2949 dump_printf_loc (MSG_NOTE, vect_location,
2950 "simplifying permute node %p\n",
2951 node);
2952
2953 vect_slp_permute (perms[perm], SLP_TREE_LANE_PERMUTATION (node),
2954 true);
2955 }
2956 else
2957 {
2958 if (dump_enabled_p ())
2959 dump_printf_loc (MSG_NOTE, vect_location,
2960 "inserting permute node in place of %p\n",
2961 node);
2962
2963 /* Make a copy of NODE and in-place change it to a
2964 VEC_PERM node to permute the lanes of the copy. */
2965 slp_tree copy = new _slp_tree;
2966 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
2967 SLP_TREE_CHILDREN (node) = vNULL;
2968 SLP_TREE_SCALAR_STMTS (copy)
2969 = SLP_TREE_SCALAR_STMTS (node).copy ();
2970 vect_slp_permute (perms[perm],
2971 SLP_TREE_SCALAR_STMTS (copy), true);
2972 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
2973 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
2974 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
2975 SLP_TREE_LANE_PERMUTATION (copy)
2976 = SLP_TREE_LANE_PERMUTATION (node);
2977 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
2978 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
2979 copy->refcnt = 1;
2980 copy->max_nunits = node->max_nunits;
2981 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
2982 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
2983 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
2984
2985 /* Now turn NODE into a VEC_PERM. */
2986 SLP_TREE_CHILDREN (node).safe_push (copy);
2987 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
2988 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2989 SLP_TREE_LANE_PERMUTATION (node)
2990 .quick_push (std::make_pair (0, perms[perm][j]));
2991 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2992 }
2993 }
2994 else
2995 {
2996 /* Apply the reverse permutation to our stmts. */
2997 vect_slp_permute (perms[perm],
2998 SLP_TREE_SCALAR_STMTS (node), true);
2999 /* And to the load permutation, which we can simply
3000 make regular by design. */
3001 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3002 {
3003 /* ??? When we handle non-bijective permutes the idea
3004 is that we can force the load-permutation to be
3005 { min, min + 1, min + 2, ... max }. But then the
3006 scalar defs might no longer match the lane content
3007 which means wrong-code with live lane vectorization.
3008 So we possibly have to have NULL entries for those. */
3009 vect_slp_permute (perms[perm],
3010 SLP_TREE_LOAD_PERMUTATION (node), true);
3011 }
3012 }
3013 }
3014
3015 /* Free the perms vector used for propagation. */
3016 while (!perms.is_empty ())
3017 perms.pop ().release ();
3018 free_graph (slpg);
3019
3020
3021 /* Now elide load permutations that are not necessary. */
3022 for (i = 0; i < leafs.length (); ++i)
3023 {
3024 node = vertices[leafs[i]];
3025 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3026 continue;
3027
3028 /* In basic block vectorization we allow any subchain of an interleaving
3029 chain.
3030 FORNOW: not in loop SLP because of realignment complications. */
3031 if (is_a <bb_vec_info> (vinfo))
3032 {
3033 bool subchain_p = true;
3034 stmt_vec_info next_load_info = NULL;
3035 stmt_vec_info load_info;
3036 unsigned j;
3037 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3038 {
3039 if (j != 0
3040 && (next_load_info != load_info
3041 || DR_GROUP_GAP (load_info) != 1))
3042 {
3043 subchain_p = false;
3044 break;
3045 }
3046 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3047 }
3048 if (subchain_p)
3049 {
3050 SLP_TREE_LOAD_PERMUTATION (node).release ();
3051 continue;
3052 }
3053 }
3054 else
3055 {
3056 stmt_vec_info load_info;
3057 bool this_load_permuted = false;
3058 unsigned j;
3059 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3060 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3061 {
3062 this_load_permuted = true;
3063 break;
3064 }
3065 stmt_vec_info first_stmt_info
3066 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3067 if (!this_load_permuted
3068 /* The load requires permutation when unrolling exposes
3069 a gap either because the group is larger than the SLP
3070 group-size or because there is a gap between the groups. */
3071 && (known_eq (LOOP_VINFO_VECT_FACTOR
3072 (as_a <loop_vec_info> (vinfo)), 1U)
3073 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3074 && DR_GROUP_GAP (first_stmt_info) == 0)))
3075 {
3076 SLP_TREE_LOAD_PERMUTATION (node).release ();
3077 continue;
3078 }
3079 }
3080 }
3081 }
3082
3083
3084 /* For each possible SLP instance decide whether to SLP it and calculate overall
3085 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3086 least one instance. */
3087
3088 bool
3089 vect_make_slp_decision (loop_vec_info loop_vinfo)
3090 {
3091 unsigned int i;
3092 poly_uint64 unrolling_factor = 1;
3093 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3094 slp_instance instance;
3095 int decided_to_slp = 0;
3096
3097 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3098
3099 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3100 {
3101 /* FORNOW: SLP if you can. */
3102 /* All unroll factors have the form:
3103
3104 GET_MODE_SIZE (vinfo->vector_mode) * X
3105
3106 for some rational X, so they must have a common multiple. */
3107 unrolling_factor
3108 = force_common_multiple (unrolling_factor,
3109 SLP_INSTANCE_UNROLLING_FACTOR (instance));
3110
3111 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3112 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3113 loop-based vectorization. Such stmts will be marked as HYBRID. */
3114 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3115 decided_to_slp++;
3116 }
3117
3118 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
3119
3120 if (decided_to_slp && dump_enabled_p ())
3121 {
3122 dump_printf_loc (MSG_NOTE, vect_location,
3123 "Decided to SLP %d instances. Unrolling factor ",
3124 decided_to_slp);
3125 dump_dec (MSG_NOTE, unrolling_factor);
3126 dump_printf (MSG_NOTE, "\n");
3127 }
3128
3129 return (decided_to_slp > 0);
3130 }
3131
3132 /* Private data for vect_detect_hybrid_slp. */
3133 struct vdhs_data
3134 {
3135 loop_vec_info loop_vinfo;
3136 vec<stmt_vec_info> *worklist;
3137 };
3138
3139 /* Walker for walk_gimple_op. */
3140
3141 static tree
3142 vect_detect_hybrid_slp (tree *tp, int *, void *data)
3143 {
3144 walk_stmt_info *wi = (walk_stmt_info *)data;
3145 vdhs_data *dat = (vdhs_data *)wi->info;
3146
3147 if (wi->is_lhs)
3148 return NULL_TREE;
3149
3150 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
3151 if (!def_stmt_info)
3152 return NULL_TREE;
3153 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3154 if (PURE_SLP_STMT (def_stmt_info))
3155 {
3156 if (dump_enabled_p ())
3157 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3158 def_stmt_info->stmt);
3159 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3160 dat->worklist->safe_push (def_stmt_info);
3161 }
3162
3163 return NULL_TREE;
3164 }
3165
3166 /* Find stmts that must be both vectorized and SLPed. */
3167
3168 void
3169 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3170 {
3171 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3172
3173 /* All stmts participating in SLP are marked pure_slp, all other
3174 stmts are loop_vect.
3175 First collect all loop_vect stmts into a worklist. */
3176 auto_vec<stmt_vec_info> worklist;
3177 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3178 {
3179 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3180 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3181 gsi_next (&gsi))
3182 {
3183 gphi *phi = gsi.phi ();
3184 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3185 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3186 worklist.safe_push (stmt_info);
3187 }
3188 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3189 gsi_next (&gsi))
3190 {
3191 gimple *stmt = gsi_stmt (gsi);
3192 if (is_gimple_debug (stmt))
3193 continue;
3194 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3195 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3196 {
3197 for (gimple_stmt_iterator gsi2
3198 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3199 !gsi_end_p (gsi2); gsi_next (&gsi2))
3200 {
3201 stmt_vec_info patt_info
3202 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3203 if (!STMT_SLP_TYPE (patt_info)
3204 && STMT_VINFO_RELEVANT (patt_info))
3205 worklist.safe_push (patt_info);
3206 }
3207 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3208 }
3209 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3210 worklist.safe_push (stmt_info);
3211 }
3212 }
3213
3214 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3215 mark any SLP vectorized stmt as hybrid.
3216 ??? We're visiting def stmts N times (once for each non-SLP and
3217 once for each hybrid-SLP use). */
3218 walk_stmt_info wi;
3219 vdhs_data dat;
3220 dat.worklist = &worklist;
3221 dat.loop_vinfo = loop_vinfo;
3222 memset (&wi, 0, sizeof (wi));
3223 wi.info = (void *)&dat;
3224 while (!worklist.is_empty ())
3225 {
3226 stmt_vec_info stmt_info = worklist.pop ();
3227 /* Since SSA operands are not set up for pattern stmts we need
3228 to use walk_gimple_op. */
3229 wi.is_lhs = 0;
3230 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3231 }
3232 }
3233
3234
3235 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3236
3237 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3238 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs)
3239 {
3240 for (unsigned i = 0; i < bbs.length (); ++i)
3241 {
3242 if (i != 0)
3243 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3244 gsi_next (&si))
3245 {
3246 gphi *phi = si.phi ();
3247 gimple_set_uid (phi, 0);
3248 add_stmt (phi);
3249 }
3250 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3251 !gsi_end_p (gsi); gsi_next (&gsi))
3252 {
3253 gimple *stmt = gsi_stmt (gsi);
3254 gimple_set_uid (stmt, 0);
3255 if (is_gimple_debug (stmt))
3256 continue;
3257 add_stmt (stmt);
3258 }
3259 }
3260 }
3261
3262
3263 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3264 stmts in the basic block. */
3265
3266 _bb_vec_info::~_bb_vec_info ()
3267 {
3268 /* Reset region marker. */
3269 for (unsigned i = 0; i < bbs.length (); ++i)
3270 {
3271 if (i != 0)
3272 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3273 gsi_next (&si))
3274 {
3275 gphi *phi = si.phi ();
3276 gimple_set_uid (phi, -1);
3277 }
3278 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3279 !gsi_end_p (gsi); gsi_next (&gsi))
3280 {
3281 gimple *stmt = gsi_stmt (gsi);
3282 gimple_set_uid (stmt, -1);
3283 }
3284 }
3285 }
3286
3287 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3288 given that child nodes have already been processed, and that
3289 their def types currently match their SLP node's def type. */
3290
3291 static bool
3292 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3293 slp_instance node_instance,
3294 stmt_vector_for_cost *cost_vec)
3295 {
3296 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3297 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3298
3299 /* Calculate the number of vector statements to be created for the
3300 scalar stmts in this node. For SLP reductions it is equal to the
3301 number of vector statements in the children (which has already been
3302 calculated by the recursive call). Otherwise it is the number of
3303 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3304 VF divided by the number of elements in a vector. */
3305 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3306 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3307 {
3308 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3309 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3310 {
3311 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3312 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3313 break;
3314 }
3315 }
3316 else
3317 {
3318 poly_uint64 vf;
3319 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3320 vf = loop_vinfo->vectorization_factor;
3321 else
3322 vf = 1;
3323 unsigned int group_size = SLP_TREE_LANES (node);
3324 tree vectype = SLP_TREE_VECTYPE (node);
3325 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3326 = vect_get_num_vectors (vf * group_size, vectype);
3327 }
3328
3329 /* Handle purely internal nodes. */
3330 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3331 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3332
3333 if (is_a <bb_vec_info> (vinfo)
3334 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3335 {
3336 if (dump_enabled_p ())
3337 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3338 "desired vector type conflicts with earlier one "
3339 "for %G", stmt_info->stmt);
3340 return false;
3341 }
3342
3343 bool dummy;
3344 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3345 node, node_instance, cost_vec);
3346 }
3347
3348 /* Try to build NODE from scalars, returning true on success.
3349 NODE_INSTANCE is the SLP instance that contains NODE. */
3350
3351 static bool
3352 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3353 slp_instance node_instance)
3354 {
3355 stmt_vec_info stmt_info;
3356 unsigned int i;
3357
3358 if (!is_a <bb_vec_info> (vinfo)
3359 || node == SLP_INSTANCE_TREE (node_instance)
3360 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3361 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3362 return false;
3363
3364 if (dump_enabled_p ())
3365 dump_printf_loc (MSG_NOTE, vect_location,
3366 "Building vector operands of %p from scalars instead\n", node);
3367
3368 /* Don't remove and free the child nodes here, since they could be
3369 referenced by other structures. The analysis and scheduling phases
3370 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3371 unsigned int group_size = SLP_TREE_LANES (node);
3372 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3373 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3374 SLP_TREE_LOAD_PERMUTATION (node).release ();
3375 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3376 {
3377 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3378 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3379 }
3380 return true;
3381 }
3382
3383 /* Compute the prologue cost for invariant or constant operands represented
3384 by NODE. */
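   /* For example, an external V4SI node with scalar ops { x, x, x, x } is
      costed as a single scalar_to_vec (splat), { a, b, c, d } as one
      vec_construct, and a node of constants as a vector_load; when the
      vector type has variable length, construction is costed only once.  */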
3385
3386 static void
3387 vect_prologue_cost_for_slp (slp_tree node,
3388 stmt_vector_for_cost *cost_vec)
3389 {
3390 /* There's a special case of an existing vector, which costs nothing. */
3391 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3392 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3393 return;
3394 /* Without looking at the actual initializer a vector of
3395 constants can be implemented as load from the constant pool.
3396 When all elements are the same we can use a splat. */
3397 tree vectype = SLP_TREE_VECTYPE (node);
3398 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3399 unsigned num_vects_to_check;
3400 unsigned HOST_WIDE_INT const_nunits;
3401 unsigned nelt_limit;
3402 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3403 && ! multiple_p (const_nunits, group_size))
3404 {
3405 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3406 nelt_limit = const_nunits;
3407 }
3408 else
3409 {
3410 /* If either the vector has variable length or the vectors
3411 are composed of repeated whole groups we only need to
3412 cost construction once. All vectors will be the same. */
3413 num_vects_to_check = 1;
3414 nelt_limit = group_size;
3415 }
3416 tree elt = NULL_TREE;
3417 unsigned nelt = 0;
3418 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3419 {
3420 unsigned si = j % group_size;
3421 if (nelt == 0)
3422 elt = SLP_TREE_SCALAR_OPS (node)[si];
3423 /* ??? We're just tracking whether all operands of a single
3424 vector initializer are the same, ideally we'd check if
3425 we emitted the same one already. */
3426 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3427 elt = NULL_TREE;
3428 nelt++;
3429 if (nelt == nelt_limit)
3430 {
3431 record_stmt_cost (cost_vec, 1,
3432 SLP_TREE_DEF_TYPE (node) == vect_external_def
3433 ? (elt ? scalar_to_vec : vec_construct)
3434 : vector_load,
3435 NULL, vectype, 0, vect_prologue);
3436 nelt = 0;
3437 }
3438 }
3439 }
3440
3441 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3442 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3443
3444 Return true if the operations are supported. */
3445
3446 static bool
3447 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3448 slp_instance node_instance,
3449 hash_set<slp_tree> &visited_set,
3450 vec<slp_tree> &visited_vec,
3451 stmt_vector_for_cost *cost_vec)
3452 {
3453 int i, j;
3454 slp_tree child;
3455
3456 /* Assume we can code-generate all invariants. */
3457 if (!node
3458 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
3459 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
3460 return true;
3461
3462 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
3463 {
3464 if (dump_enabled_p ())
3465 dump_printf_loc (MSG_NOTE, vect_location,
3466 "Failed cyclic SLP reference in %p", node);
3467 return false;
3468 }
3469 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
3470
3471 /* If we already analyzed the exact same set of scalar stmts we're done.
3472 We share the generated vector stmts for those. */
3473 if (visited_set.add (node))
3474 return true;
3475 visited_vec.safe_push (node);
3476
3477 bool res = true;
3478 unsigned visited_rec_start = visited_vec.length ();
3479 unsigned cost_vec_rec_start = cost_vec->length ();
3480 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3481 {
3482 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3483 visited_set, visited_vec,
3484 cost_vec);
3485 if (!res)
3486 break;
3487 }
3488
3489 if (res)
3490 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3491 cost_vec);
3492 /* If analysis failed we have to pop all recursive visited nodes
3493 plus ourselves. */
3494 if (!res)
3495 {
3496 while (visited_vec.length () >= visited_rec_start)
3497 visited_set.remove (visited_vec.pop ());
3498 cost_vec->truncate (cost_vec_rec_start);
3499 }
3500
3501 /* When the node can be vectorized, cost the invariant nodes it references.
3502 This is not done in DFS order to allow the referring node's
3503 vectorizable_* calls to nail down the invariant nodes' vector type
3504 and possibly unshare it if it needs a different vector type than
3505 other referrers. */
3506 if (res)
3507 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3508 if (child
3509 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
3510 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3511 /* Perform usual caching, note code-generation still
3512 code-gens these nodes multiple times but we expect
3513 to CSE them later. */
3514 && !visited_set.add (child))
3515 {
3516 visited_vec.safe_push (child);
3517 /* ??? After auditing more code paths make a "default"
3518 and push the vector type from NODE to all children
3519 if it is not already set. */
3520 /* Compute the number of vectors to be generated. */
3521 tree vector_type = SLP_TREE_VECTYPE (child);
3522 if (!vector_type)
3523 {
3524 /* For shifts with a scalar argument we don't need
3525 to cost or code-generate anything.
3526 ??? Represent this more explicitly. */
3527 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3528 == shift_vec_info_type)
3529 && j == 1);
3530 continue;
3531 }
3532 unsigned group_size = SLP_TREE_LANES (child);
3533 poly_uint64 vf = 1;
3534 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3535 vf = loop_vinfo->vectorization_factor;
3536 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3537 = vect_get_num_vectors (vf * group_size, vector_type);
3538 /* And cost them. */
3539 vect_prologue_cost_for_slp (child, cost_vec);
3540 }
3541
3542 /* If this node or any of its children can't be vectorized, try pruning
3543 the tree here rather than felling the whole thing. */
3544 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3545 {
3546 /* We'll need to revisit this for invariant costing and number
3547 of vectorized stmt setting. */
3548 res = true;
3549 }
3550
3551 return res;
3552 }
3553
3554
3555 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3556 region and that can be vectorized using vectorizable_live_operation
3557 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
3558 the scalar code computing them to be retained. */
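/* Illustrative example (not taken from the sources): if an SLP node
   computes t_1 = a_2 + b_3 and t_1 is also used by a scalar statement
   outside the vectorized region, t_1 is marked STMT_VINFO_LIVE_P and a
   lane extract from the vectorized definition is generated to satisfy
   that scalar use. */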
3559
3560 static void
3561 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3562 slp_instance instance,
3563 stmt_vector_for_cost *cost_vec,
3564 hash_set<stmt_vec_info> &svisited,
3565 hash_set<slp_tree> &visited)
3566 {
3567 if (visited.add (node))
3568 return;
3569
3570 unsigned i;
3571 stmt_vec_info stmt_info;
3572 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
3573 bool all_visited = true;
3574 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3575 {
3576 if (svisited.contains (stmt_info))
3577 continue;
3578 all_visited = false;
3579 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3580 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
3581 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
3582 /* Only the pattern root stmt computes the original scalar value. */
3583 continue;
3584 bool mark_visited = true;
3585 gimple *orig_stmt = orig_stmt_info->stmt;
3586 ssa_op_iter op_iter;
3587 def_operand_p def_p;
3588 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3589 {
3590 imm_use_iterator use_iter;
3591 gimple *use_stmt;
3592 stmt_vec_info use_stmt_info;
3593 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3594 if (!is_gimple_debug (use_stmt))
3595 {
3596 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
3597 if (!use_stmt_info
3598 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3599 {
3600 STMT_VINFO_LIVE_P (stmt_info) = true;
3601 if (vectorizable_live_operation (bb_vinfo, stmt_info,
3602 NULL, node, instance, i,
3603 false, cost_vec))
3604 /* ??? So we know we can vectorize the live stmt
3605 from one SLP node. If we cannot do so from all
3606 or none consistently we'd have to record which
3607 SLP node (and lane) we want to use for the live
3608 operation. So make sure we can code-generate
3609 from all nodes. */
3610 mark_visited = false;
3611 else
3612 STMT_VINFO_LIVE_P (stmt_info) = false;
3613 BREAK_FROM_IMM_USE_STMT (use_iter);
3614 }
3615 }
3616 /* We have to verify whether we can insert the lane extract
3617 before all uses. The following is a conservative approximation.
3618 We cannot put this into vectorizable_live_operation because
3619 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3620 doesn't work.
3621 Note that while the fact that we emit code for loads at the
3622 first load should make this a non-problem, leafs we construct
3623 from scalars are vectorized after the last scalar def.
3624 ??? If we'd actually compute the insert location during
3625 analysis we could use sth less conservative than the last
3626 scalar stmt in the node for the dominance check. */
3627 /* ??? What remains is "live" uses in vector CTORs in the same
3628 SLP graph which is where those uses can end up code-generated
3629 right after their definition instead of close to their original
3630 use. But that would restrict us to code-generate lane-extracts
3631 from the latest stmt in a node. So we compensate for this
3632 during code-generation, simply not replacing uses for those
3633 hopefully rare cases. */
3634 if (STMT_VINFO_LIVE_P (stmt_info))
3635 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3636 if (!is_gimple_debug (use_stmt)
3637 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
3638 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3639 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
3640 {
3641 if (dump_enabled_p ())
3642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3643 "Cannot determine insertion place for "
3644 "lane extract\n");
3645 STMT_VINFO_LIVE_P (stmt_info) = false;
3646 mark_visited = true;
3647 }
3648 }
3649 if (mark_visited)
3650 svisited.add (stmt_info);
3651 }
3652 if (all_visited)
3653 return;
3654
3655 slp_tree child;
3656 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3657 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3658 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
3659 cost_vec, svisited, visited);
3660 }
3661
3662 /* Analyze statements in SLP instances of VINFO. Return true if the
3663 operations are supported. */
3664
3665 bool
3666 vect_slp_analyze_operations (vec_info *vinfo)
3667 {
3668 slp_instance instance;
3669 int i;
3670
3671 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3672
3673 hash_set<slp_tree> visited;
3674 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
3675 {
3676 auto_vec<slp_tree> visited_vec;
3677 stmt_vector_for_cost cost_vec;
3678 cost_vec.create (2);
3679 if (is_a <bb_vec_info> (vinfo))
3680 vect_location = instance->location ();
3681 if (!vect_slp_analyze_node_operations (vinfo,
3682 SLP_INSTANCE_TREE (instance),
3683 instance, visited, visited_vec,
3684 &cost_vec)
3685 /* Instances with a root stmt require vectorized defs for the
3686 SLP tree root. */
3687 || (SLP_INSTANCE_ROOT_STMT (instance)
3688 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
3689 != vect_internal_def)))
3690 {
3691 slp_tree node = SLP_INSTANCE_TREE (instance);
3692 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3693 if (dump_enabled_p ())
3694 dump_printf_loc (MSG_NOTE, vect_location,
3695 "removing SLP instance operations starting from: %G",
3696 stmt_info->stmt);
3697 vect_free_slp_instance (instance);
3698 vinfo->slp_instances.ordered_remove (i);
3699 cost_vec.release ();
3700 while (!visited_vec.is_empty ())
3701 visited.remove (visited_vec.pop ());
3702 }
3703 else
3704 {
3705 i++;
3706
3707 /* For BB vectorization remember the SLP graph entry
3708 cost for later. */
3709 if (is_a <bb_vec_info> (vinfo))
3710 instance->cost_vec = cost_vec;
3711 else
3712 {
3713 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
3714 cost_vec.release ();
3715 }
3716 }
3717 }
3718
3719 /* Compute vectorizable live stmts. */
3720 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3721 {
3722 hash_set<stmt_vec_info> svisited;
3723 hash_set<slp_tree> visited;
3724 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
3725 {
3726 vect_location = instance->location ();
3727 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
3728 instance, &instance->cost_vec, svisited,
3729 visited);
3730 }
3731 }
3732
3733 return !vinfo->slp_instances.is_empty ();
3734 }
3735
3736 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
3737 compressing the chain of leaders along the way. */
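/* Illustrative example (names made up): if INSTANCE_LEADER contains
   A -> B, B -> C and C -> C, then get_ultimate_leader (A, ...) follows
   the chain to C, rewrites the entries for A and B to point directly
   at C, and returns C. */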
3738
3739 static slp_instance
3740 get_ultimate_leader (slp_instance instance,
3741 hash_map<slp_instance, slp_instance> &instance_leader)
3742 {
3743 auto_vec<slp_instance *, 8> chain;
3744 slp_instance *tem;
3745 while (*(tem = instance_leader.get (instance)) != instance)
3746 {
3747 chain.safe_push (tem);
3748 instance = *tem;
3749 }
3750 while (!chain.is_empty ())
3751 *chain.pop () = instance;
3752 return instance;
3753 }
3754
3755 /* Worker of vect_bb_partition_graph, recurse on NODE. */
3756
3757 static void
3758 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
3759 slp_instance instance, slp_tree node,
3760 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
3761 hash_map<slp_instance, slp_instance> &instance_leader,
3762 hash_set<slp_tree> &visited)
3763 {
3764 stmt_vec_info stmt_info;
3765 unsigned i;
3766
3767 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3768 {
3769 bool existed_p;
3770 slp_instance &stmt_instance
3771 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
3772 if (!existed_p)
3773 ;
3774 else if (stmt_instance != instance)
3775 {
3776 /* If we're running into a previously marked stmt make us the
3777 leader of the current ultimate leader. This keeps the
3778 leader chain acyclic and works even when the current instance
3779 connects two previously independent graph parts. */
3780 slp_instance stmt_leader
3781 = get_ultimate_leader (stmt_instance, instance_leader);
3782 if (stmt_leader != instance)
3783 instance_leader.put (stmt_leader, instance);
3784 }
3785 stmt_instance = instance;
3786 }
3787
3788 if (visited.add (node))
3789 return;
3790
3791 slp_tree child;
3792 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3793 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3794 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
3795 instance_leader, visited);
3796 }
3797
3798 /* Partition the SLP graph into pieces that can be costed independently. */
3799
3800 static void
3801 vect_bb_partition_graph (bb_vec_info bb_vinfo)
3802 {
3803 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3804
3805 /* First walk the SLP graph assigning each involved scalar stmt a
3806 corresponding SLP graph entry and upon visiting a previously
3807 marked stmt, make the stmt's leader the current SLP graph entry. */
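/* Illustrative example: if two SLP instances share a scalar statement,
   walking the second instance runs into the already marked stmt and makes
   the first instance's ultimate leader point at the second instance, so
   both instances are collected into the same subgraph below. */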
3808 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
3809 hash_map<slp_instance, slp_instance> instance_leader;
3810 hash_set<slp_tree> visited;
3811 slp_instance instance;
3812 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3813 {
3814 instance_leader.put (instance, instance);
3815 vect_bb_partition_graph_r (bb_vinfo,
3816 instance, SLP_INSTANCE_TREE (instance),
3817 stmt_to_instance, instance_leader,
3818 visited);
3819 }
3820
3821 /* Then collect entries to each independent subgraph. */
3822 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3823 {
3824 slp_instance leader = get_ultimate_leader (instance, instance_leader);
3825 leader->subgraph_entries.safe_push (instance);
3826 if (dump_enabled_p ()
3827 && leader != instance)
3828 dump_printf_loc (MSG_NOTE, vect_location,
3829 "instance %p is leader of %p\n",
3830 leader, instance);
3831 }
3832 }
3833
3834 /* Compute the scalar cost of the SLP node NODE and its children,
3835 recording it in COST_VEC. Do not account defs that are marked in LIFE
3836 and update LIFE according to uses of NODE. */
3837
3838 static void
3839 vect_bb_slp_scalar_cost (vec_info *vinfo,
3840 slp_tree node, vec<bool, va_heap> *life,
3841 stmt_vector_for_cost *cost_vec,
3842 hash_set<slp_tree> &visited)
3843 {
3844 unsigned i;
3845 stmt_vec_info stmt_info;
3846 slp_tree child;
3847
3848 if (visited.add (node))
3849 return;
3850
3851 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3852 {
3853 ssa_op_iter op_iter;
3854 def_operand_p def_p;
3855
3856 if ((*life)[i])
3857 continue;
3858
3859 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3860 gimple *orig_stmt = orig_stmt_info->stmt;
3861
3862 /* If there is a non-vectorized use of the defs then the scalar
3863 stmt is kept live in which case we do not account it or any
3864 required defs in the SLP children in the scalar cost. This
3865 way we make the vectorization more costly when compared to
3866 the scalar cost. */
3867 if (!STMT_VINFO_LIVE_P (stmt_info))
3868 {
3869 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3870 {
3871 imm_use_iterator use_iter;
3872 gimple *use_stmt;
3873 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3874 if (!is_gimple_debug (use_stmt))
3875 {
3876 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3877 if (!use_stmt_info
3878 || !PURE_SLP_STMT
3879 (vect_stmt_to_vectorize (use_stmt_info)))
3880 {
3881 (*life)[i] = true;
3882 BREAK_FROM_IMM_USE_STMT (use_iter);
3883 }
3884 }
3885 }
3886 if ((*life)[i])
3887 continue;
3888 }
3889
3890 /* Count scalar stmts only once. */
3891 if (gimple_visited_p (orig_stmt))
3892 continue;
3893 gimple_set_visited (orig_stmt, true);
3894
3895 vect_cost_for_stmt kind;
3896 if (STMT_VINFO_DATA_REF (orig_stmt_info))
3897 {
3898 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
3899 kind = scalar_load;
3900 else
3901 kind = scalar_store;
3902 }
3903 else if (vect_nop_conversion_p (orig_stmt_info))
3904 continue;
3905 else
3906 kind = scalar_stmt;
3907 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
3908 SLP_TREE_VECTYPE (node), 0, vect_body);
3909 }
3910
3911 auto_vec<bool, 20> subtree_life;
3912 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3913 {
3914 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3915 {
3916 /* Do not directly pass LIFE to the recursive call, copy it to
3917 confine changes in the callee to the current child/subtree. */
3918 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3919 {
3920 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
3921 for (unsigned j = 0;
3922 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
3923 {
3924 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
3925 if (perm.first == i)
3926 subtree_life[perm.second] = (*life)[j];
3927 }
3928 }
3929 else
3930 {
3931 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
3932 subtree_life.safe_splice (*life);
3933 }
3934 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
3935 visited);
3936 subtree_life.truncate (0);
3937 }
3938 }
3939 }
3940
3941 /* Check if vectorization of the basic block is profitable for the
3942 subgraph denoted by SLP_INSTANCES. */
3943
3944 static bool
3945 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
3946 vec<slp_instance> slp_instances)
3947 {
3948 slp_instance instance;
3949 int i;
3950 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3951 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3952
3953 void *vect_target_cost_data = init_cost (NULL);
3954
3955 /* Calculate scalar cost and sum the cost for the vector stmts
3956 previously collected. */
3957 stmt_vector_for_cost scalar_costs;
3958 scalar_costs.create (0);
3959 hash_set<slp_tree> visited;
3960 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3961 {
3962 auto_vec<bool, 20> life;
3963 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
3964 true);
3965 vect_bb_slp_scalar_cost (bb_vinfo,
3966 SLP_INSTANCE_TREE (instance),
3967 &life, &scalar_costs, visited);
3968 add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
3969 instance->cost_vec.release ();
3970 }
3971 /* Unset visited flag. */
3972 stmt_info_for_cost *si;
3973 FOR_EACH_VEC_ELT (scalar_costs, i, si)
3974 gimple_set_visited (si->stmt_info->stmt, false);
3975
3976 void *scalar_target_cost_data = init_cost (NULL);
3977 add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
3978 scalar_costs.release ();
3979 unsigned dummy;
3980 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
3981 destroy_cost_data (scalar_target_cost_data);
3982
3983 /* Complete the target-specific vector cost calculation. */
3984 finish_cost (vect_target_cost_data, &vec_prologue_cost,
3985 &vec_inside_cost, &vec_epilogue_cost);
3986 destroy_cost_data (vect_target_cost_data);
3987
3988 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3989
3990 if (dump_enabled_p ())
3991 {
3992 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3993 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3994 vec_inside_cost);
3995 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3996 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3997 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3998 }
3999
4000 /* Vectorization is profitable if its cost is no more than the cost of the
4001 scalar version. Note that we err on the vector side for equal cost because
4002 the cost estimate is otherwise quite pessimistic (constant uses are
4003 free on the scalar side but cost a load on the vector side for
4004 example). */
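/* Illustrative numbers (made up): with a vector prologue cost of 2, a
   vector inside cost of 6 and a scalar cost of 8 the test is 2 + 6 > 8,
   which is false, so the subgraph is considered profitable. */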
4005 if (vec_outside_cost + vec_inside_cost > scalar_cost)
4006 return false;
4007
4008 return true;
4009 }
4010
4011 /* Find any vectorizable constructors and add them to the grouped_stores
4012 array. */
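/* Illustrative example (not from the sources): an assignment like
     v_1 = {a_2, b_3, c_4, d_5};
   where v_1 has a four-element vector type and a_2 ... d_5 are scalar
   SSA names is a candidate, while a uniform CTOR such as
   {a_2, a_2, a_2, a_2} or a CTOR whose elements are themselves vectors
   is skipped by the checks below. */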
4013
4014 static void
4015 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
4016 {
4017 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
4018 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
4019 !gsi_end_p (gsi); gsi_next (&gsi))
4020 {
4021 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
4022 if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
4023 continue;
4024
4025 tree rhs = gimple_assign_rhs1 (assign);
4026 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
4027 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
4028 CONSTRUCTOR_NELTS (rhs))
4029 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
4030 || uniform_vector_p (rhs))
4031 continue;
4032
4033 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
4034 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
4035 }
4036 }
4037
4038 /* Walk the grouped store chains and replace entries with their
4039 pattern variant if any. */
4040
4041 static void
4042 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
4043 {
4044 stmt_vec_info first_element;
4045 unsigned i;
4046
4047 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4048 {
4049 /* We also have CTORs in this array. */
4050 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
4051 continue;
4052 if (STMT_VINFO_IN_PATTERN_P (first_element))
4053 {
4054 stmt_vec_info orig = first_element;
4055 first_element = STMT_VINFO_RELATED_STMT (first_element);
4056 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
4057 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
4058 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
4059 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
4060 vinfo->grouped_stores[i] = first_element;
4061 }
4062 stmt_vec_info prev = first_element;
4063 while (DR_GROUP_NEXT_ELEMENT (prev))
4064 {
4065 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
4066 if (STMT_VINFO_IN_PATTERN_P (elt))
4067 {
4068 stmt_vec_info orig = elt;
4069 elt = STMT_VINFO_RELATED_STMT (elt);
4070 DR_GROUP_NEXT_ELEMENT (prev) = elt;
4071 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
4072 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
4073 }
4074 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
4075 prev = elt;
4076 }
4077 }
4078 }
4079
4080 /* Check if the region described by BB_VINFO can be vectorized, returning
4081 true if so. When returning false, set FATAL to true if the same failure
4082 would prevent vectorization at other vector sizes, false if it is still
4083 worth trying other sizes. N_STMTS is the number of statements in the
4084 region. */
4085
4086 static bool
4087 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
4088 vec<int> *dataref_groups)
4089 {
4090 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4091
4092 slp_instance instance;
4093 int i;
4094 poly_uint64 min_vf = 2;
4095
4096 /* The first group of checks is independent of the vector size. */
4097 fatal = true;
4098
4099 /* Analyze the data references. */
4100
4101 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
4102 {
4103 if (dump_enabled_p ())
4104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4105 "not vectorized: unhandled data-ref in basic "
4106 "block.\n");
4107 return false;
4108 }
4109
4110 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
4111 {
4112 if (dump_enabled_p ())
4113 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4114 "not vectorized: unhandled data access in "
4115 "basic block.\n");
4116 return false;
4117 }
4118
4119 vect_slp_check_for_constructors (bb_vinfo);
4120
4121 /* If there are no grouped stores and no constructors in the region
4122 there is no need to continue with pattern recog as vect_analyze_slp
4123 will fail anyway. */
4124 if (bb_vinfo->grouped_stores.is_empty ())
4125 {
4126 if (dump_enabled_p ())
4127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4128 "not vectorized: no grouped stores in "
4129 "basic block.\n");
4130 return false;
4131 }
4132
4133 /* The rest of the analysis below depends on the vector size in some way. */
4134 fatal = false;
4135
4136 vect_pattern_recog (bb_vinfo);
4137
4138 /* Update store groups from pattern processing. */
4139 vect_fixup_store_groups_with_patterns (bb_vinfo);
4140
4141 /* Check the SLP opportunities in the basic block, analyze and build SLP
4142 trees. */
4143 if (!vect_analyze_slp (bb_vinfo, n_stmts))
4144 {
4145 if (dump_enabled_p ())
4146 {
4147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4148 "Failed to SLP the basic block.\n");
4149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4150 "not vectorized: failed to find SLP opportunities "
4151 "in basic block.\n");
4152 }
4153 return false;
4154 }
4155
4156 /* Optimize permutations. */
4157 vect_optimize_slp (bb_vinfo);
4158
4159 vect_record_base_alignments (bb_vinfo);
4160
4161 /* Analyze and verify the alignment of data references and the
4162 dependence in the SLP instances. */
4163 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
4164 {
4165 vect_location = instance->location ();
4166 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
4167 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
4168 {
4169 slp_tree node = SLP_INSTANCE_TREE (instance);
4170 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4171 if (dump_enabled_p ())
4172 dump_printf_loc (MSG_NOTE, vect_location,
4173 "removing SLP instance operations starting from: %G",
4174 stmt_info->stmt);
4175 vect_free_slp_instance (instance);
4176 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
4177 continue;
4178 }
4179
4180 /* Mark all the statements that we want to vectorize as pure SLP and
4181 relevant. */
4182 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4183 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
4184 if (SLP_INSTANCE_ROOT_STMT (instance))
4185 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
4186
4187 i++;
4188 }
4189 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
4190 return false;
4191
4192 if (!vect_slp_analyze_operations (bb_vinfo))
4193 {
4194 if (dump_enabled_p ())
4195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4196 "not vectorized: bad operation in basic block.\n");
4197 return false;
4198 }
4199
4200 vect_bb_partition_graph (bb_vinfo);
4201
4202 return true;
4203 }
4204
4205 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4206 basic blocks in BBS, returning true on success.
4207 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4208
4209 static bool
4210 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
4211 vec<int> *dataref_groups, unsigned int n_stmts)
4212 {
4213 bb_vec_info bb_vinfo;
4214 auto_vector_modes vector_modes;
4215
4216 /* Autodetect first vector size we try. */
4217 machine_mode next_vector_mode = VOIDmode;
4218 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
4219 unsigned int mode_i = 0;
4220
4221 vec_info_shared shared;
4222
4223 machine_mode autodetected_vector_mode = VOIDmode;
4224 while (1)
4225 {
4226 bool vectorized = false;
4227 bool fatal = false;
4228 bb_vinfo = new _bb_vec_info (bbs, &shared);
4229
4230 bool first_time_p = shared.datarefs.is_empty ();
4231 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
4232 if (first_time_p)
4233 bb_vinfo->shared->save_datarefs ();
4234 else
4235 bb_vinfo->shared->check_datarefs ();
4236 bb_vinfo->vector_mode = next_vector_mode;
4237
4238 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
4239 && dbg_cnt (vect_slp))
4240 {
4241 if (dump_enabled_p ())
4242 {
4243 dump_printf_loc (MSG_NOTE, vect_location,
4244 "***** Analysis succeeded with vector mode"
4245 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4246 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4247 }
4248
4249 bb_vinfo->shared->check_datarefs ();
4250
4251 unsigned i;
4252 slp_instance instance;
4253 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4254 {
4255 if (instance->subgraph_entries.is_empty ())
4256 continue;
4257
4258 vect_location = instance->location ();
4259 if (!unlimited_cost_model (NULL)
4260 && !vect_bb_vectorization_profitable_p
4261 (bb_vinfo, instance->subgraph_entries))
4262 {
4263 if (dump_enabled_p ())
4264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4265 "not vectorized: vectorization is not "
4266 "profitable.\n");
4267 continue;
4268 }
4269
4270 if (!vectorized && dump_enabled_p ())
4271 dump_printf_loc (MSG_NOTE, vect_location,
4272 "Basic block will be vectorized "
4273 "using SLP\n");
4274 vectorized = true;
4275
4276 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4277
4278 unsigned HOST_WIDE_INT bytes;
4279 if (dump_enabled_p ())
4280 {
4281 if (GET_MODE_SIZE
4282 (bb_vinfo->vector_mode).is_constant (&bytes))
4283 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4284 "basic block part vectorized using %wu "
4285 "byte vectors\n", bytes);
4286 else
4287 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4288 "basic block part vectorized using "
4289 "variable length vectors\n");
4290 }
4291 }
4292 }
4293 else
4294 {
4295 if (dump_enabled_p ())
4296 dump_printf_loc (MSG_NOTE, vect_location,
4297 "***** Analysis failed with vector mode %s\n",
4298 GET_MODE_NAME (bb_vinfo->vector_mode));
4299 }
4300
4301 if (mode_i == 0)
4302 autodetected_vector_mode = bb_vinfo->vector_mode;
4303
4304 if (!fatal)
4305 while (mode_i < vector_modes.length ()
4306 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
4307 {
4308 if (dump_enabled_p ())
4309 dump_printf_loc (MSG_NOTE, vect_location,
4310 "***** The result for vector mode %s would"
4311 " be the same\n",
4312 GET_MODE_NAME (vector_modes[mode_i]));
4313 mode_i += 1;
4314 }
4315
4316 delete bb_vinfo;
4317
4318 if (mode_i < vector_modes.length ()
4319 && VECTOR_MODE_P (autodetected_vector_mode)
4320 && (related_vector_mode (vector_modes[mode_i],
4321 GET_MODE_INNER (autodetected_vector_mode))
4322 == autodetected_vector_mode)
4323 && (related_vector_mode (autodetected_vector_mode,
4324 GET_MODE_INNER (vector_modes[mode_i]))
4325 == vector_modes[mode_i]))
4326 {
4327 if (dump_enabled_p ())
4328 dump_printf_loc (MSG_NOTE, vect_location,
4329 "***** Skipping vector mode %s, which would"
4330 " repeat the analysis for %s\n",
4331 GET_MODE_NAME (vector_modes[mode_i]),
4332 GET_MODE_NAME (autodetected_vector_mode));
4333 mode_i += 1;
4334 }
4335
4336 if (vectorized
4337 || mode_i == vector_modes.length ()
4338 || autodetected_vector_mode == VOIDmode
4339 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4340 vector sizes will fail do not bother iterating. */
4341 || fatal)
4342 return vectorized;
4343
4344 /* Try the next biggest vector size. */
4345 next_vector_mode = vector_modes[mode_i++];
4346 if (dump_enabled_p ())
4347 dump_printf_loc (MSG_NOTE, vect_location,
4348 "***** Re-trying analysis with vector mode %s\n",
4349 GET_MODE_NAME (next_vector_mode));
4350 }
4351 }
4352
4353
4354 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
4355 true if anything in the basic blocks was vectorized. */
4356
4357 static bool
4358 vect_slp_bbs (vec<basic_block> bbs)
4359 {
4360 vec<data_reference_p> datarefs = vNULL;
4361 auto_vec<int> dataref_groups;
4362 int insns = 0;
4363 int current_group = 0;
4364
4365 for (unsigned i = 0; i < bbs.length (); i++)
4366 {
4367 basic_block bb = bbs[i];
4368 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
4369 gsi_next (&gsi))
4370 {
4371 gimple *stmt = gsi_stmt (gsi);
4372 if (is_gimple_debug (stmt))
4373 continue;
4374
4375 insns++;
4376
4377 if (gimple_location (stmt) != UNKNOWN_LOCATION)
4378 vect_location = stmt;
4379
4380 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
4381 &dataref_groups, current_group))
4382 ++current_group;
4383 }
4384 }
4385
4386 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
4387 }
4388
4389 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4390 true if anything in the basic-block was vectorized. */
4391
4392 bool
4393 vect_slp_bb (basic_block bb)
4394 {
4395 auto_vec<basic_block> bbs;
4396 bbs.safe_push (bb);
4397 return vect_slp_bbs (bbs);
4398 }
4399
4400 /* Main entry for the BB vectorizer. Analyze and transform all blocks of FUN,
4401 returns true if anything in the function was vectorized. */
4402
4403 bool
4404 vect_slp_function (function *fun)
4405 {
4406 bool r = false;
4407 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
4408 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
4409
4410 /* For the moment split the function into pieces to avoid making
4411 the iteration on the vector mode moot. Split at points we know
4412 to not handle well, which are CFG merges (SLP discovery doesn't
4413 handle non-loop-header PHIs) and loop exits. Since pattern
4414 recog requires reverse iteration to visit uses before defs we
4415 simply chop the RPO into pieces. */
4416 auto_vec<basic_block> bbs;
4417 for (unsigned i = 0; i < n; i++)
4418 {
4419 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
4420 bool split = false;
4421
4422 /* Split when a BB is not dominated by the first block. */
4423 if (!bbs.is_empty ()
4424 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
4425 {
4426 if (dump_enabled_p ())
4427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4428 "splitting region at dominance boundary bb%d\n",
4429 bb->index);
4430 split = true;
4431 }
4432 /* Split when the loop determined by the first block
4433 is exited. This is because we eventually insert
4434 invariants at region begin. */
4435 else if (!bbs.is_empty ()
4436 && bbs[0]->loop_father != bb->loop_father
4437 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
4438 {
4439 if (dump_enabled_p ())
4440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4441 "splitting region at loop %d exit at bb%d\n",
4442 bbs[0]->loop_father->num, bb->index);
4443 split = true;
4444 }
4445
4446 if (split && !bbs.is_empty ())
4447 {
4448 r |= vect_slp_bbs (bbs);
4449 bbs.truncate (0);
4450 bbs.quick_push (bb);
4451 }
4452 else
4453 bbs.safe_push (bb);
4454
4455 /* When a stmt ending this block also defines a value, inserting
4456 a vector containing its definition after it would require
4457 inserting on edges. Avoid this for now. */
4458 if (gimple *last = last_stmt (bb))
4459 if (gimple_get_lhs (last)
4460 && is_ctrl_altering_stmt (last))
4461 {
4462 if (dump_enabled_p ())
4463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4464 "splitting region at control altering "
4465 "definition %G", last);
4466 r |= vect_slp_bbs (bbs);
4467 bbs.truncate (0);
4468 }
4469 }
4470
4471 if (!bbs.is_empty ())
4472 r |= vect_slp_bbs (bbs);
4473
4474 free (rpo);
4475
4476 return r;
4477 }
4478
4479 /* Build a variable-length vector in which the elements in ELTS are repeated
4480 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4481 RESULTS and add any new instructions to SEQ.
4482
4483 The approach we use is:
4484
4485 (1) Find a vector mode VM with integer elements of mode IM.
4486
4487 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4488 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4489 from small vectors to IM.
4490
4491 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4492
4493 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4494 correct byte contents.
4495
4496 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4497
4498 We try to find the largest IM for which this sequence works, in order
4499 to cut down on the number of interleaves. */
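/* Illustrative sketch (element counts and modes made up): with
   ELTS = {a, b, c, d} of a 32-bit element type and IM chosen as DImode,
   NELTS' is 2. Step (2) view-converts {a, b} and {c, d} to two DImode
   values, step (3) splats each into a VM vector, and a single level of
   lo/hi interleaving VEC_PERM_EXPRs in step (4) yields vectors whose
   byte contents repeat a, b, c, d before the final VIEW_CONVERT_EXPR
   in step (5). */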
4500
4501 void
4502 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
4503 vec<tree> elts, unsigned int nresults,
4504 vec<tree> &results)
4505 {
4506 unsigned int nelts = elts.length ();
4507 tree element_type = TREE_TYPE (vector_type);
4508
4509 /* (1) Find a vector mode VM with integer elements of mode IM. */
4510 unsigned int nvectors = 1;
4511 tree new_vector_type;
4512 tree permutes[2];
4513 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
4514 &nvectors, &new_vector_type,
4515 permutes))
4516 gcc_unreachable ();
4517
4518 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4519 unsigned int partial_nelts = nelts / nvectors;
4520 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
4521
4522 tree_vector_builder partial_elts;
4523 auto_vec<tree, 32> pieces (nvectors * 2);
4524 pieces.quick_grow (nvectors * 2);
4525 for (unsigned int i = 0; i < nvectors; ++i)
4526 {
4527 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4528 ELTS' has mode IM. */
4529 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
4530 for (unsigned int j = 0; j < partial_nelts; ++j)
4531 partial_elts.quick_push (elts[i * partial_nelts + j]);
4532 tree t = gimple_build_vector (seq, &partial_elts);
4533 t = gimple_build (seq, VIEW_CONVERT_EXPR,
4534 TREE_TYPE (new_vector_type), t);
4535
4536 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4537 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
4538 }
4539
4540 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4541 correct byte contents.
4542
4543 We need to repeat the following operation log2(nvectors) times:
4544
4545 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4546 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4547
4548 However, if each input repeats every N elements and the VF is
4549 a multiple of N * 2, the HI result is the same as the LO. */
4550 unsigned int in_start = 0;
4551 unsigned int out_start = nvectors;
4552 unsigned int hi_start = nvectors / 2;
4553 /* A bound on the number of outputs needed to produce NRESULTS results
4554 in the final iteration. */
4555 unsigned int noutputs_bound = nvectors * nresults;
4556 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
4557 {
4558 noutputs_bound /= 2;
4559 unsigned int limit = MIN (noutputs_bound, nvectors);
4560 for (unsigned int i = 0; i < limit; ++i)
4561 {
4562 if ((i & 1) != 0
4563 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
4564 2 * in_repeat))
4565 {
4566 pieces[out_start + i] = pieces[out_start + i - 1];
4567 continue;
4568 }
4569
4570 tree output = make_ssa_name (new_vector_type);
4571 tree input1 = pieces[in_start + (i / 2)];
4572 tree input2 = pieces[in_start + (i / 2) + hi_start];
4573 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
4574 input1, input2,
4575 permutes[i & 1]);
4576 gimple_seq_add_stmt (seq, stmt);
4577 pieces[out_start + i] = output;
4578 }
4579 std::swap (in_start, out_start);
4580 }
4581
4582 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4583 results.reserve (nresults);
4584 for (unsigned int i = 0; i < nresults; ++i)
4585 if (i < nvectors)
4586 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
4587 pieces[in_start + i]));
4588 else
4589 results.quick_push (results[i - nvectors]);
4590 }
4591
4592
4593 /* For constant and loop invariant defs in OP_NODE this function creates
4594 vector defs that will be used in the vectorized stmts and stores them
4595 to SLP_TREE_VEC_DEFS of OP_NODE. */
4596
4597 static void
4598 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
4599 {
4600 unsigned HOST_WIDE_INT nunits;
4601 tree vec_cst;
4602 unsigned j, number_of_places_left_in_vector;
4603 tree vector_type;
4604 tree vop;
4605 int group_size = op_node->ops.length ();
4606 unsigned int vec_num, i;
4607 unsigned number_of_copies = 1;
4608 bool constant_p;
4609 gimple_seq ctor_seq = NULL;
4610 auto_vec<tree, 16> permute_results;
4611
4612 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
4613 vector_type = SLP_TREE_VECTYPE (op_node);
4614
4615 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
4616 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
4617 auto_vec<tree> voprnds (number_of_vectors);
4618
4619 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4620 created vectors. It is greater than 1 if unrolling is performed.
4621
4622 For example, we have two scalar operands, s1 and s2 (e.g., group of
4623 strided accesses of size two), while NUNITS is four (i.e., four scalars
4624 of this type can be packed in a vector). The output vector will contain
4625 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4626 will be 2).
4627
4628 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4629 containing the operands.
4630
4631 For example, NUNITS is four as before, and the group size is 8
4632 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4633 {s5, s6, s7, s8}. */
4634
4635 /* When using duplicate_and_interleave, we just need one element for
4636 each scalar statement. */
4637 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4638 nunits = group_size;
4639
4640 number_of_copies = nunits * number_of_vectors / group_size;
4641
4642 number_of_places_left_in_vector = nunits;
4643 constant_p = true;
4644 tree_vector_builder elts (vector_type, nunits, 1);
4645 elts.quick_grow (nunits);
4646 stmt_vec_info insert_after = NULL;
4647 for (j = 0; j < number_of_copies; j++)
4648 {
4649 tree op;
4650 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
4651 {
4652 /* Create 'vect_ = {op0,op1,...,opn}'. */
4653 number_of_places_left_in_vector--;
4654 tree orig_op = op;
4655 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
4656 {
4657 if (CONSTANT_CLASS_P (op))
4658 {
4659 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4660 {
4661 /* Can't use VIEW_CONVERT_EXPR for booleans because
4662 of possibly different sizes of scalar value and
4663 vector element. */
4664 if (integer_zerop (op))
4665 op = build_int_cst (TREE_TYPE (vector_type), 0);
4666 else if (integer_onep (op))
4667 op = build_all_ones_cst (TREE_TYPE (vector_type));
4668 else
4669 gcc_unreachable ();
4670 }
4671 else
4672 op = fold_unary (VIEW_CONVERT_EXPR,
4673 TREE_TYPE (vector_type), op);
4674 gcc_assert (op && CONSTANT_CLASS_P (op));
4675 }
4676 else
4677 {
4678 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
4679 gimple *init_stmt;
4680 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4681 {
4682 tree true_val
4683 = build_all_ones_cst (TREE_TYPE (vector_type));
4684 tree false_val
4685 = build_zero_cst (TREE_TYPE (vector_type));
4686 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
4687 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
4688 op, true_val,
4689 false_val);
4690 }
4691 else
4692 {
4693 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
4694 op);
4695 init_stmt
4696 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
4697 op);
4698 }
4699 gimple_seq_add_stmt (&ctor_seq, init_stmt);
4700 op = new_temp;
4701 }
4702 }
4703 elts[number_of_places_left_in_vector] = op;
4704 if (!CONSTANT_CLASS_P (op))
4705 constant_p = false;
4706 /* For BB vectorization we have to compute an insert location
4707 when a def is inside the analyzed region since we cannot
4708 simply insert at the BB start in this case. */
4709 stmt_vec_info opdef;
4710 if (TREE_CODE (orig_op) == SSA_NAME
4711 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
4712 && is_a <bb_vec_info> (vinfo)
4713 && (opdef = vinfo->lookup_def (orig_op)))
4714 {
4715 if (!insert_after)
4716 insert_after = opdef;
4717 else
4718 insert_after = get_later_stmt (insert_after, opdef);
4719 }
4720
4721 if (number_of_places_left_in_vector == 0)
4722 {
4723 if (constant_p
4724 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4725 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4726 vec_cst = gimple_build_vector (&ctor_seq, &elts);
4727 else
4728 {
4729 if (permute_results.is_empty ())
4730 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
4731 elts, number_of_vectors,
4732 permute_results);
4733 vec_cst = permute_results[number_of_vectors - j - 1];
4734 }
4735 if (!gimple_seq_empty_p (ctor_seq))
4736 {
4737 if (insert_after)
4738 {
4739 gimple_stmt_iterator gsi;
4740 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
4741 {
4742 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
4743 gsi_insert_seq_before (&gsi, ctor_seq,
4744 GSI_CONTINUE_LINKING);
4745 }
4746 else if (!stmt_ends_bb_p (insert_after->stmt))
4747 {
4748 gsi = gsi_for_stmt (insert_after->stmt);
4749 gsi_insert_seq_after (&gsi, ctor_seq,
4750 GSI_CONTINUE_LINKING);
4751 }
4752 else
4753 {
4754 /* When we want to insert after a def whose
4755 defining stmt throws, insert on the fallthru
4756 edge instead. */
4757 edge e = find_fallthru_edge
4758 (gimple_bb (insert_after->stmt)->succs);
4759 basic_block new_bb
4760 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
4761 gcc_assert (!new_bb);
4762 }
4763 }
4764 else
4765 vinfo->insert_seq_on_entry (NULL, ctor_seq);
4766 ctor_seq = NULL;
4767 }
4768 voprnds.quick_push (vec_cst);
4769 insert_after = NULL;
4770 number_of_places_left_in_vector = nunits;
4771 constant_p = true;
4772 elts.new_vector (vector_type, nunits, 1);
4773 elts.quick_grow (nunits);
4774 }
4775 }
4776 }
4777
4778 /* Since the vectors are created in reverse order, reverse them back
4779 here. */
4780 vec_num = voprnds.length ();
4781 for (j = vec_num; j != 0; j--)
4782 {
4783 vop = voprnds[j - 1];
4784 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4785 }
4786
4787 /* If VF is greater than the unrolling factor needed for the SLP
4788 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4789 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4790 to replicate the vectors. */
4791 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
4792 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
4793 i++)
4794 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4795 }
4796
4797 /* Get the Ith vectorized definition from SLP_NODE. */
4798
4799 tree
4800 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
4801 {
4802 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
4803 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
4804 else
4805 return SLP_TREE_VEC_DEFS (slp_node)[i];
4806 }
4807
4808 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
4809
4810 void
4811 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
4812 {
4813 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
4814 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
4815 {
4816 unsigned j;
4817 gimple *vec_def_stmt;
4818 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
4819 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
4820 }
4821 else
4822 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
4823 }
4824
4825 /* Get N vectorized definitions for SLP_NODE. */
4826
4827 void
4828 vect_get_slp_defs (vec_info *,
4829 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
4830 {
4831 if (n == -1U)
4832 n = SLP_TREE_CHILDREN (slp_node).length ();
4833
4834 for (unsigned i = 0; i < n; ++i)
4835 {
4836 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
4837 vec<tree> vec_defs = vNULL;
4838 vect_get_slp_defs (child, &vec_defs);
4839 vec_oprnds->quick_push (vec_defs);
4840 }
4841 }
4842
4843 /* Generate vector permute statements from a list of loads in DR_CHAIN.
4844 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
4845 permute statements for the SLP node NODE. Store the number of vector
4846 permute instructions in *N_PERMS and the number of vector load
4847 instructions in *N_LOADS. */
4848
4849 bool
4850 vect_transform_slp_perm_load (vec_info *vinfo,
4851 slp_tree node, vec<tree> dr_chain,
4852 gimple_stmt_iterator *gsi, poly_uint64 vf,
4853 bool analyze_only, unsigned *n_perms,
4854 unsigned int *n_loads)
4855 {
4856 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4857 int vec_index = 0;
4858 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4859 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
4860 unsigned int mask_element;
4861 machine_mode mode;
4862
4863 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
4864 return false;
4865
4866 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
4867
4868 mode = TYPE_MODE (vectype);
4869 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4870
4871 /* Initialize the vect stmts of NODE to properly insert the generated
4872 stmts later. */
4873 if (! analyze_only)
4874 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
4875 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
4876 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
4877
4878 /* Generate permutation masks for every NODE. Number of masks for each NODE
4879 is equal to GROUP_SIZE.
4880 E.g., we have a group of three nodes with three loads from the same
4881 location in each node, and the vector size is 4. I.e., we have an
4882 a0b0c0a1b1c1... sequence and we need to create the following vectors:
4883 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
4884 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
4885 ...
4886
4887 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
4888 The last mask is illegal since we assume two operands for permute
4889 operation, and the mask element values can't be outside that range.
4890 Hence, the last mask must be converted into {2,5,5,5}.
4891 For the first two permutations we need the first and the second input
4892 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
4893 we need the second and the third vectors: {b1,c1,a2,b2} and
4894 {c2,a3,b3,c3}. */
4895
4896 int vect_stmts_counter = 0;
4897 unsigned int index = 0;
4898 int first_vec_index = -1;
4899 int second_vec_index = -1;
4900 bool noop_p = true;
4901 *n_perms = 0;
4902
4903 vec_perm_builder mask;
4904 unsigned int nelts_to_build;
4905 unsigned int nvectors_per_build;
4906 unsigned int in_nlanes;
4907 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
4908 && multiple_p (nunits, group_size));
4909 if (repeating_p)
4910 {
4911 /* A single vector contains a whole number of copies of the node, so:
4912 (a) all permutes can use the same mask; and
4913 (b) the permutes only need a single vector input. */
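/* Illustrative example: for a two-element group with load permutation
   { 1, 0 } this builds the single mask { 1, 0, 3, 2, 5, 4, ... } which
   swaps adjacent pairs in every generated vector statement. */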
4914 mask.new_vector (nunits, group_size, 3);
4915 nelts_to_build = mask.encoded_nelts ();
4916 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
4917 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
4918 }
4919 else
4920 {
4921 /* We need to construct a separate mask for each vector statement. */
4922 unsigned HOST_WIDE_INT const_nunits, const_vf;
4923 if (!nunits.is_constant (&const_nunits)
4924 || !vf.is_constant (&const_vf))
4925 return false;
4926 mask.new_vector (const_nunits, const_nunits, 1);
4927 nelts_to_build = const_vf * group_size;
4928 nvectors_per_build = 1;
4929 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
4930 }
4931 auto_sbitmap used_in_lanes (in_nlanes);
4932 bitmap_clear (used_in_lanes);
4933
4934 unsigned int count = mask.encoded_nelts ();
4935 mask.quick_grow (count);
4936 vec_perm_indices indices;
4937
4938 for (unsigned int j = 0; j < nelts_to_build; j++)
4939 {
4940 unsigned int iter_num = j / group_size;
4941 unsigned int stmt_num = j % group_size;
4942 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
4943 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
4944 bitmap_set_bit (used_in_lanes, i);
4945 if (repeating_p)
4946 {
4947 first_vec_index = 0;
4948 mask_element = i;
4949 }
4950 else
4951 {
4952 /* Enforced before the loop when !repeating_p. */
4953 unsigned int const_nunits = nunits.to_constant ();
4954 vec_index = i / const_nunits;
4955 mask_element = i % const_nunits;
4956 if (vec_index == first_vec_index
4957 || first_vec_index == -1)
4958 {
4959 first_vec_index = vec_index;
4960 }
4961 else if (vec_index == second_vec_index
4962 || second_vec_index == -1)
4963 {
4964 second_vec_index = vec_index;
4965 mask_element += const_nunits;
4966 }
4967 else
4968 {
4969 if (dump_enabled_p ())
4970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4971 "permutation requires at "
4972 "least three vectors %G",
4973 stmt_info->stmt);
4974 gcc_assert (analyze_only);
4975 return false;
4976 }
4977
4978 gcc_assert (mask_element < 2 * const_nunits);
4979 }
4980
4981 if (mask_element != index)
4982 noop_p = false;
4983 mask[index++] = mask_element;
4984
4985 if (index == count && !noop_p)
4986 {
4987 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
4988 if (!can_vec_perm_const_p (mode, indices))
4989 {
4990 if (dump_enabled_p ())
4991 {
4992 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4993 vect_location,
4994 "unsupported vect permute { ");
4995 for (i = 0; i < count; ++i)
4996 {
4997 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4998 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4999 }
5000 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5001 }
5002 gcc_assert (analyze_only);
5003 return false;
5004 }
5005
5006 ++*n_perms;
5007 }
5008
5009 if (index == count)
5010 {
5011 if (!analyze_only)
5012 {
5013 tree mask_vec = NULL_TREE;
5014
5015 if (! noop_p)
5016 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5017
5018 if (second_vec_index == -1)
5019 second_vec_index = first_vec_index;
5020
5021 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
5022 {
5023 /* Generate the permute statement if necessary. */
5024 tree first_vec = dr_chain[first_vec_index + ri];
5025 tree second_vec = dr_chain[second_vec_index + ri];
5026 gimple *perm_stmt;
5027 if (! noop_p)
5028 {
5029 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5030 tree perm_dest
5031 = vect_create_destination_var (gimple_assign_lhs (stmt),
5032 vectype);
5033 perm_dest = make_ssa_name (perm_dest);
5034 perm_stmt
5035 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5036 first_vec, second_vec,
5037 mask_vec);
5038 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
5039 gsi);
5040 }
5041 else
5042 /* If mask was NULL_TREE generate the requested
5043 identity transform. */
5044 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
5045
5046 /* Store the vector statement in NODE. */
5047 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
5048 }
5049 }
5050
5051 index = 0;
5052 first_vec_index = -1;
5053 second_vec_index = -1;
5054 noop_p = true;
5055 }
5056 }
5057
5058 if (n_loads)
5059 {
5060 if (repeating_p)
5061 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5062 else
5063 {
5064 /* Enforced above when !repeating_p. */
5065 unsigned int const_nunits = nunits.to_constant ();
5066 *n_loads = 0;
5067 bool load_seen = false;
5068 for (unsigned i = 0; i < in_nlanes; ++i)
5069 {
5070 if (i % const_nunits == 0)
5071 {
5072 if (load_seen)
5073 *n_loads += 1;
5074 load_seen = false;
5075 }
5076 if (bitmap_bit_p (used_in_lanes, i))
5077 load_seen = true;
5078 }
5079 if (load_seen)
5080 *n_loads += 1;
5081 }
5082 }
5083
5084 return true;
5085 }
5086
5087
5088 /* Vectorize the SLP permutations in NODE as specified
5089 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5090 child number and lane number.
5091 Interleaving of two two-lane two-child SLP subtrees (not supported):
5092 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5093 A blend of two four-lane two-child SLP subtrees:
5094 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5095 Highpart of a four-lane one-child SLP subtree (not supported):
5096 [ { 0, 2 }, { 0, 3 } ]
5097 Currently only a subset of these is supported by the code generation below. */
5098
5099 static bool
5100 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5101 slp_tree node, stmt_vector_for_cost *cost_vec)
5102 {
5103 tree vectype = SLP_TREE_VECTYPE (node);
5104
5105 /* ??? We currently require all input and output vector types to be the same,
5106 while the SLP IL should really do a concat + select and thus accept
5107 arbitrary mismatches. */
5108 slp_tree child;
5109 unsigned i;
5110 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5111 if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
5112 {
5113 if (dump_enabled_p ())
5114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5115 "Unsupported lane permutation\n");
5116 return false;
5117 }
5118
5119 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
5120 gcc_assert (perm.length () == SLP_TREE_LANES (node));
5121 if (dump_enabled_p ())
5122 {
5123 dump_printf_loc (MSG_NOTE, vect_location,
5124 "vectorizing permutation");
5125 for (unsigned i = 0; i < perm.length (); ++i)
5126 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
5127 dump_printf (MSG_NOTE, "\n");
5128 }
5129
5130 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5131 if (!nunits.is_constant ())
5132 return false;
5133 unsigned HOST_WIDE_INT vf = 1;
5134 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
5135 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
5136 return false;
5137 unsigned olanes = vf * SLP_TREE_LANES (node);
5138 gcc_assert (multiple_p (olanes, nunits));
5139
5140 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
5141 from the { SLP operand, scalar lane } permutation as recorded in the
5142 SLP node as intermediate step. This part should already work
5143 with SLP children with arbitrary number of lanes. */
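/* Illustrative example: for the blend
   [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ] of two V4SI children and
   VF == 1 this yields the sequence
   { { 0, 0 }, 0 } { { 1, 0 }, 1 } { { 0, 0 }, 2 } { { 1, 0 }, 3 },
   i.e. every lane is taken from vector 0 of the respective child. */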
5144 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
5145 auto_vec<unsigned> active_lane;
5146 vperm.create (olanes);
5147 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
5148 for (unsigned i = 0; i < vf; ++i)
5149 {
5150 for (unsigned pi = 0; pi < perm.length (); ++pi)
5151 {
5152 std::pair<unsigned, unsigned> p = perm[pi];
5153 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
5154 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
5155 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
5156 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
5157 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
5158 }
5159 /* Advance to the next group. */
5160 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
5161 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
5162 }
5163
5164 if (dump_enabled_p ())
5165 {
5166 dump_printf_loc (MSG_NOTE, vect_location, "as");
5167 for (unsigned i = 0; i < vperm.length (); ++i)
5168 {
5169 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
5170 dump_printf (MSG_NOTE, ",");
5171 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
5172 vperm[i].first.first, vperm[i].first.second,
5173 vperm[i].second);
5174 }
5175 dump_printf (MSG_NOTE, "\n");
5176 }
5177
5178 /* We can only handle two-vector permutes, everything else should
5179 be lowered on the SLP level. The following is closely inspired
5180 by vect_transform_slp_perm_load and is supposed to eventually
5181 replace it.
5182 ??? As intermediate step do code-gen in the SLP tree representation
5183 somehow? */
5184 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
5185 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
5186 unsigned int const_nunits = nunits.to_constant ();
5187 unsigned int index = 0;
5188 unsigned int mask_element;
5189 vec_perm_builder mask;
5190 mask.new_vector (const_nunits, const_nunits, 1);
5191 unsigned int count = mask.encoded_nelts ();
5192 mask.quick_grow (count);
5193 vec_perm_indices indices;
5194 unsigned nperms = 0;
5195 for (unsigned i = 0; i < vperm.length (); ++i)
5196 {
5197 mask_element = vperm[i].second;
5198 if (first_vec.first == -1U
5199 || first_vec == vperm[i].first)
5200 first_vec = vperm[i].first;
5201 else if (second_vec.first == -1U
5202 || second_vec == vperm[i].first)
5203 {
5204 second_vec = vperm[i].first;
5205 mask_element += const_nunits;
5206 }
5207 else
5208 {
5209 if (dump_enabled_p ())
5210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5211 "permutation requires at "
5212 "least three vectors\n");
5213 gcc_assert (!gsi);
5214 return false;
5215 }
5216
5217 mask[index++] = mask_element;
5218
5219 if (index == count)
5220 {
5221 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
5222 const_nunits);
5223 bool identity_p = indices.series_p (0, 1, 0, 1);
5224 if (!identity_p
5225 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5226 {
5227 if (dump_enabled_p ())
5228 {
5229 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5230 vect_location,
5231 "unsupported vect permute { ");
5232 for (i = 0; i < count; ++i)
5233 {
5234 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5235 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5236 }
5237 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5238 }
5239 gcc_assert (!gsi);
5240 return false;
5241 }
5242
5243 if (!identity_p)
5244 nperms++;
5245 if (gsi)
5246 {
5247 if (second_vec.first == -1U)
5248 second_vec = first_vec;
5249
5250 /* Generate the permute statement if necessary. */
5251 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
5252 tree first_def
5253 = vect_get_slp_vect_def (first_node, first_vec.second);
5254 gassign *perm_stmt;
5255 tree perm_dest = make_ssa_name (vectype);
5256 if (!identity_p)
5257 {
5258 slp_tree second_node
5259 = SLP_TREE_CHILDREN (node)[second_vec.first];
5260 tree second_def
5261 = vect_get_slp_vect_def (second_node, second_vec.second);
5262 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5263 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5264 first_def, second_def,
5265 mask_vec);
5266 }
5267 else
5268 /* We need a copy here in case the def was external. */
5269 perm_stmt = gimple_build_assign (perm_dest, first_def);
5270 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
5271 /* Store the vector statement in NODE. */
5272 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
5273 }
5274
5275 index = 0;
5276 first_vec = std::make_pair (-1U, -1U);
5277 second_vec = std::make_pair (-1U, -1U);
5278 }
5279 }
5280
5281 if (!gsi)
5282 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
5283
5284 return true;
5285 }
5286
5287 /* Vectorize SLP NODE. */
5288
5289 static void
5290 vect_schedule_slp_node (vec_info *vinfo,
5291 slp_tree node, slp_instance instance)
5292 {
5293 gimple_stmt_iterator si;
5294 int i;
5295 slp_tree child;
5296
5297 /* For existing vectors there's nothing to do. */
5298 if (SLP_TREE_VEC_DEFS (node).exists ())
5299 return;
5300
5301 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
5302
5303 /* Vectorize externals and constants. */
5304 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5305 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5306 {
5307 /* ??? vectorizable_shift can end up using a scalar operand which is
5308 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5309 node in this case. */
5310 if (!SLP_TREE_VECTYPE (node))
5311 return;
5312
5313 vect_create_constant_vectors (vinfo, node);
5314 return;
5315 }
5316
5317 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5318
5319 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
5320 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
5321
5322 if (dump_enabled_p ())
5323 dump_printf_loc (MSG_NOTE, vect_location,
5324 "------>vectorizing SLP node starting from: %G",
5325 stmt_info->stmt);
5326
5327 if (STMT_VINFO_DATA_REF (stmt_info)
5328 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5329 {
5330 /* Vectorized loads go before the first scalar load to make the
5331 result ready early; vectorized stores go before the last scalar
5332 stmt, which is where all uses are ready. */
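/* For instance, for a group of scalar stores a[0] = x0; ... a[3] = x3;
   the insertion point ends up right before the last store a[3] = x3,
   while a contiguous load group from b[0..3] gets its vector load
   placed right before the first scalar load (example for illustration
   only). */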
5333 stmt_vec_info last_stmt_info = NULL;
5334 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
5335 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
5336 else /* DR_IS_WRITE */
5337 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
5338 si = gsi_for_stmt (last_stmt_info->stmt);
5339 }
5340 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
5341 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
5342 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
5343 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5344 {
5345 /* For PHI node vectorization we do not use the insertion iterator. */
5346 si = gsi_none ();
5347 }
5348 else
5349 {
5350 /* Emit other stmts after the children's vectorized defs, which is
5351 the earliest possible insertion point. */
5352 gimple *last_stmt = NULL;
5353 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5354 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5355 {
5356 /* For fold-left reductions we are retaining the scalar
5357 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
5358 set so the representation isn't perfect. Resort to the
5359 last scalar def here. */
5360 if (SLP_TREE_VEC_STMTS (child).is_empty ())
5361 {
5362 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
5363 == cycle_phi_info_type);
5364 gphi *phi = as_a <gphi *>
5365 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
5366 if (!last_stmt
5367 || vect_stmt_dominates_stmt_p (last_stmt, phi))
5368 last_stmt = phi;
5369 }
5370 /* We are emitting all vectorized stmts in the same place, so
5371 the last one pushed is also the last one in the IL.
5372 ??? Unless we have a load permutation applied and that
5373 decides to re-use an earlier generated load. */
5374 unsigned j;
5375 gimple *vstmt;
5376 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
5377 if (!last_stmt
5378 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5379 last_stmt = vstmt;
5380 }
5381 else if (!SLP_TREE_VECTYPE (child))
5382 {
5383 /* For externals used unvectorized, look at all their scalar defs. */
5384 unsigned j;
5385 tree def;
5386 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
5387 if (TREE_CODE (def) == SSA_NAME
5388 && !SSA_NAME_IS_DEFAULT_DEF (def))
5389 {
5390 gimple *stmt = SSA_NAME_DEF_STMT (def);
5391 if (!last_stmt
5392 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
5393 last_stmt = stmt;
5394 }
5395 }
5396 else
5397 {
5398 /* For externals we have to look at all defs since their
5399 insertion place is decided per vector. But beware
5400 of pre-existing vectors where we need to make sure
5401 we do not insert before the region boundary. */
5402 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
5403 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
5404 last_stmt = gsi_stmt (gsi_after_labels
5405 (as_a <bb_vec_info> (vinfo)->bbs[0]));
5406 else
5407 {
5408 unsigned j;
5409 tree vdef;
5410 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
5411 if (TREE_CODE (vdef) == SSA_NAME
5412 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
5413 {
5414 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
5415 if (!last_stmt
5416 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5417 last_stmt = vstmt;
5418 }
5419 }
5420 }
5421 /* This can happen when all children are pre-existing vectors or
5422 constants. */
5423 if (!last_stmt)
5424 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
5425 if (is_a <gphi *> (last_stmt))
5426 si = gsi_after_labels (gimple_bb (last_stmt));
5427 else
5428 {
5429 si = gsi_for_stmt (last_stmt);
5430 gsi_next (&si);
5431 }
5432 }
5433
5434 bool done_p = false;
5435
5436 /* Handle purely internal nodes. */
5437 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5438 {
5439 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
5440 be shared between different SLP nodes (though usually it is the same
5441 operation, except when the stmt is only there to denote the actual
5442 scalar lane defs ...). So do not call vect_transform_stmt
5443 but open-code it here (partly). */
5444 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
5445 gcc_assert (done);
5446 done_p = true;
5447 }
5448 if (!done_p)
5449 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
5450 }
5451
5452 /* Replace the scalar calls in SLP node NODE by setting their lhs to zero.
5453 For loop vectorization this is done in vectorizable_call, but for SLP
5454 it needs to be deferred until the end of vect_schedule_slp, because multiple
5455 SLP instances may refer to the same scalar stmt. */
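/* As an example, a scalar call x_3 = powf (a_1, b_2); whose result has
   been fully vectorized is replaced by x_3 = 0.0; the dead assignment
   can then be cleaned up by later DCE (names are illustrative). */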
5456
5457 static void
5458 vect_remove_slp_scalar_calls (vec_info *vinfo,
5459 slp_tree node, hash_set<slp_tree> &visited)
5460 {
5461 gimple *new_stmt;
5462 gimple_stmt_iterator gsi;
5463 int i;
5464 slp_tree child;
5465 tree lhs;
5466 stmt_vec_info stmt_info;
5467
5468 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5469 return;
5470
5471 if (visited.add (node))
5472 return;
5473
5474 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5475 vect_remove_slp_scalar_calls (vinfo, child, visited);
5476
5477 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5478 {
5479 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
5480 if (!stmt || gimple_bb (stmt) == NULL)
5481 continue;
5482 if (is_pattern_stmt_p (stmt_info)
5483 || !PURE_SLP_STMT (stmt_info))
5484 continue;
5485 lhs = gimple_call_lhs (stmt);
5486 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
5487 gsi = gsi_for_stmt (stmt);
5488 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
5489 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
5490 }
5491 }
5492
5493 static void
5494 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
5495 {
5496 hash_set<slp_tree> visited;
5497 vect_remove_slp_scalar_calls (vinfo, node, visited);
5498 }
5499
5500 /* Vectorize the instance root. */
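/* For a BB SLP instance the root is typically a vector CONSTRUCTOR such
   as v_1 = {a_2, b_3, c_4, d_5}; below it is replaced either by a plain
   copy (possibly via a VIEW_CONVERT_EXPR) of the single vectorized def
   or, when several vector stmts were generated, by a CONSTRUCTOR of
   their defs (names are illustrative). */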
5501
5502 void
5503 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
5504 {
5505 gassign *rstmt = NULL;
5506
5507 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
5508 {
5509 gimple *child_stmt;
5510 int j;
5511
5512 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5513 {
5514 tree vect_lhs = gimple_get_lhs (child_stmt);
5515 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
5516 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
5517 TREE_TYPE (vect_lhs)))
5518 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
5519 vect_lhs);
5520 rstmt = gimple_build_assign (root_lhs, vect_lhs);
5521 break;
5522 }
5523 }
5524 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
5525 {
5526 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5527 gimple *child_stmt;
5528 int j;
5529 vec<constructor_elt, va_gc> *v;
5530 vec_alloc (v, nelts);
5531
5532 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5533 {
5534 CONSTRUCTOR_APPEND_ELT (v,
5535 NULL_TREE,
5536 gimple_get_lhs (child_stmt));
5537 }
5538 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
5539 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
5540 tree r_constructor = build_constructor (rtype, v);
5541 rstmt = gimple_build_assign (lhs, r_constructor);
5542 }
5543
5544 gcc_assert (rstmt);
5545
5546 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
5547 gsi_replace (&rgsi, rstmt, true);
5548 }
5549
5550 struct slp_scc_info
5551 {
5552 bool on_stack;
5553 int dfs;
5554 int lowlink;
5555 };
5556
5557 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
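/* The dfs/lowlink bookkeeping below essentially follows Tarjan's SCC
   algorithm: a node whose lowlink still equals its own dfs number after
   all children have been visited is the root of an SCC made up of the
   nodes that are still on the stack above it. */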
5558
5559 static void
5560 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
5561 hash_map<slp_tree, slp_scc_info> &scc_info,
5562 int &maxdfs, vec<slp_tree> &stack)
5563 {
5564 bool existed_p;
5565 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
5566 gcc_assert (!existed_p);
5567 info->dfs = maxdfs;
5568 info->lowlink = maxdfs;
5569 maxdfs++;
5570
5571 /* Leaf. */
5572 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5573 {
5574 info->on_stack = false;
5575 vect_schedule_slp_node (vinfo, node, instance);
5576 return;
5577 }
5578
5579 info->on_stack = true;
5580 stack.safe_push (node);
5581
5582 unsigned i;
5583 slp_tree child;
5584 /* DFS recurse. */
5585 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5586 {
5587 if (!child)
5588 continue;
5589 slp_scc_info *child_info = scc_info.get (child);
5590 if (!child_info)
5591 {
5592 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
5593 /* The recursion might have re-allocated the hash map entries, so re-fetch INFO. */
5594 info = scc_info.get (node);
5595 child_info = scc_info.get (child);
5596 info->lowlink = MIN (info->lowlink, child_info->lowlink);
5597 }
5598 else if (child_info->on_stack)
5599 info->lowlink = MIN (info->lowlink, child_info->dfs);
5600 }
5601 if (info->lowlink != info->dfs)
5602 return;
5603
5604 auto_vec<slp_tree, 4> phis_to_fixup;
5605
5606 /* Singleton. */
5607 if (stack.last () == node)
5608 {
5609 stack.pop ();
5610 info->on_stack = false;
5611 vect_schedule_slp_node (vinfo, node, instance);
5612 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
5613 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
5614 phis_to_fixup.quick_push (node);
5615 }
5616 else
5617 {
5618 /* SCC. */
5619 int last_idx = stack.length () - 1;
5620 while (stack[last_idx] != node)
5621 last_idx--;
5622 /* We can break the cycle at PHIs which have at least one child
5623 already code generated. Then we could re-start the DFS walk until
5624 all nodes in the SCC are covered (we might have new entries
5625 for only back-reachable nodes). But it's simpler to just
5626 iterate and schedule those that are ready. */
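/* For example, a reduction x_1 = PHI <x_0(preheader), x_3(latch)>;
   together with x_3 = x_1 + a_2; forms a two-node SCC. The PHI counts
   as ready because its preheader child is already code generated, so
   it is scheduled first; that in turn makes the node for x_3 ready,
   and the vectorized PHI's backedge arguments are filled in by the
   fixup loop at the end of this function (names are illustrative). */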
5627 unsigned todo = stack.length () - last_idx;
5628 do
5629 {
5630 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
5631 {
5632 slp_tree entry = stack[idx];
5633 if (!entry)
5634 continue;
5635 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
5636 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
5637 bool ready = !phi;
5638 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
5639 if (!child)
5640 {
5641 gcc_assert (phi);
5642 ready = true;
5643 break;
5644 }
5645 else if (scc_info.get (child)->on_stack)
5646 {
5647 if (!phi)
5648 {
5649 ready = false;
5650 break;
5651 }
5652 }
5653 else
5654 {
5655 if (phi)
5656 {
5657 ready = true;
5658 break;
5659 }
5660 }
5661 if (ready)
5662 {
5663 vect_schedule_slp_node (vinfo, entry, instance);
5664 scc_info.get (entry)->on_stack = false;
5665 stack[idx] = NULL;
5666 todo--;
5667 if (phi)
5668 phis_to_fixup.safe_push (entry);
5669 }
5670 }
5671 }
5672 while (todo != 0);
5673
5674 /* Pop the SCC. */
5675 stack.truncate (last_idx);
5676 }
5677
5678 /* Now fix up the backedge defs of the vectorized PHIs in this SCC. */
5679 slp_tree phi_node;
5680 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
5681 {
5682 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
5683 edge_iterator ei;
5684 edge e;
5685 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
5686 {
5687 unsigned dest_idx = e->dest_idx;
5688 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
5689 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
5690 continue;
5691 /* Simply fill all args. */
5692 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
5693 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
5694 vect_get_slp_vect_def (child, i),
5695 e, gimple_phi_arg_location (phi, dest_idx));
5696 }
5697 }
5698 }
5699
5700 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
5701
5702 void
5703 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
5704 {
5705 slp_instance instance;
5706 unsigned int i;
5707
5708 hash_map<slp_tree, slp_scc_info> scc_info;
5709 int maxdfs = 0;
5710 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5711 {
5712 slp_tree node = SLP_INSTANCE_TREE (instance);
5713 if (dump_enabled_p ())
5714 {
5715 dump_printf_loc (MSG_NOTE, vect_location,
5716 "Vectorizing SLP tree:\n");
5717 if (SLP_INSTANCE_ROOT_STMT (instance))
5718 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
5719 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
5720 vect_print_slp_graph (MSG_NOTE, vect_location,
5721 SLP_INSTANCE_TREE (instance));
5722 }
5723 /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
5724 a PHI is the node breaking the cycle. */
5725 auto_vec<slp_tree> stack;
5726 if (!scc_info.get (node))
5727 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
5728
5729 if (SLP_INSTANCE_ROOT_STMT (instance))
5730 vectorize_slp_instance_root_stmt (node, instance);
5731
5732 if (dump_enabled_p ())
5733 dump_printf_loc (MSG_NOTE, vect_location,
5734 "vectorizing stmts using SLP.\n");
5735 }
5736
5737 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5738 {
5739 slp_tree root = SLP_INSTANCE_TREE (instance);
5740 stmt_vec_info store_info;
5741 unsigned int j;
5742
5743 /* Remove scalar call stmts. Do not do this for basic-block
5744 vectorization as not all uses may be vectorized.
5745 ??? Why should this be necessary? DCE should be able to
5746 remove the stmts itself.
5747 ??? For BB vectorization we can as well remove scalar
5748 stmts starting from the SLP tree root if they have no
5749 uses. */
5750 if (is_a <loop_vec_info> (vinfo))
5751 vect_remove_slp_scalar_calls (vinfo, root);
5752
5753 /* Remove the original scalar stmts of vectorized stores. */
5754 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
5755 {
5756 if (!STMT_VINFO_DATA_REF (store_info)
5757 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
5758 break;
5759
5760 store_info = vect_orig_stmt (store_info);
5761 /* Free the attached stmt_vec_info and remove the stmt. */
5762 vinfo->remove_stmt (store_info);
5763 }
5764 }
5765 }