git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/tree-vect-slp.c
tree-optimization/97650 - fix ICE in vect_get_and_check_slp_defs
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51
52 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
53 slp_tree, stmt_vector_for_cost *);
54
55 object_allocator<_slp_tree> *slp_tree_pool;
56
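/* Descriptive note (not in the original source): _slp_tree nodes are
   allocated from SLP_TREE_POOL via these class-local operators rather
   than from the global heap.  */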
57 void *
58 _slp_tree::operator new (size_t n)
59 {
60 gcc_assert (n == sizeof (_slp_tree));
61 return slp_tree_pool->allocate_raw ();
62 }
63
64 void
65 _slp_tree::operator delete (void *node, size_t n)
66 {
67 gcc_assert (n == sizeof (_slp_tree));
68 slp_tree_pool->remove_raw (node);
69 }
70
71
72 /* Initialize a SLP node. */
73
74 _slp_tree::_slp_tree ()
75 {
76 SLP_TREE_SCALAR_STMTS (this) = vNULL;
77 SLP_TREE_SCALAR_OPS (this) = vNULL;
78 SLP_TREE_VEC_STMTS (this) = vNULL;
79 SLP_TREE_VEC_DEFS (this) = vNULL;
80 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
81 SLP_TREE_CHILDREN (this) = vNULL;
82 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
83 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
84 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
85 SLP_TREE_CODE (this) = ERROR_MARK;
86 SLP_TREE_VECTYPE (this) = NULL_TREE;
87 SLP_TREE_REPRESENTATIVE (this) = NULL;
88 SLP_TREE_REF_COUNT (this) = 1;
89 this->max_nunits = 1;
90 this->lanes = 0;
91 }
92
93 /* Tear down a SLP node. */
94
95 _slp_tree::~_slp_tree ()
96 {
97 SLP_TREE_CHILDREN (this).release ();
98 SLP_TREE_SCALAR_STMTS (this).release ();
99 SLP_TREE_SCALAR_OPS (this).release ();
100 SLP_TREE_VEC_STMTS (this).release ();
101 SLP_TREE_VEC_DEFS (this).release ();
102 SLP_TREE_LOAD_PERMUTATION (this).release ();
103 SLP_TREE_LANE_PERMUTATION (this).release ();
104 }
105
106 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
107
108 static void
109 vect_free_slp_tree (slp_tree node)
110 {
111 int i;
112 slp_tree child;
113
114 if (--SLP_TREE_REF_COUNT (node) != 0)
115 return;
116
117 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
118 if (child)
119 vect_free_slp_tree (child);
120
121 delete node;
122 }
123
  124 /* Return a location suitable for dumps related to the SLP instance.  */
125
126 dump_user_location_t
127 _slp_instance::location () const
128 {
129 if (root_stmt)
130 return root_stmt->stmt;
131 else
132 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
133 }
134
135
136 /* Free the memory allocated for the SLP instance. */
137
138 void
139 vect_free_slp_instance (slp_instance instance)
140 {
141 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
142 SLP_INSTANCE_LOADS (instance).release ();
143 instance->subgraph_entries.release ();
144 instance->cost_vec.release ();
145 free (instance);
146 }
147
148
149 /* Create an SLP node for SCALAR_STMTS. */
150
151 static slp_tree
152 vect_create_new_slp_node (slp_tree node,
153 vec<stmt_vec_info> scalar_stmts, unsigned nops)
154 {
155 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
156 SLP_TREE_CHILDREN (node).create (nops);
157 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
158 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
159 SLP_TREE_LANES (node) = scalar_stmts.length ();
160 return node;
161 }
162
163 /* Create an SLP node for SCALAR_STMTS. */
164
165 static slp_tree
166 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
167 {
168 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
169 }
170
171 /* Create an SLP node for OPS. */
172
173 static slp_tree
174 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
175 {
176 SLP_TREE_SCALAR_OPS (node) = ops;
177 SLP_TREE_DEF_TYPE (node) = vect_external_def;
178 SLP_TREE_LANES (node) = ops.length ();
179 return node;
180 }
181
182 /* Create an SLP node for OPS. */
183
184 static slp_tree
185 vect_create_new_slp_node (vec<tree> ops)
186 {
187 return vect_create_new_slp_node (new _slp_tree, ops);
188 }
189
190
191 /* This structure is used in creation of an SLP tree. Each instance
192 corresponds to the same operand in a group of scalar stmts in an SLP
193 node. */
194 typedef struct _slp_oprnd_info
195 {
196 /* Def-stmts for the operands. */
197 vec<stmt_vec_info> def_stmts;
198 /* Operands. */
199 vec<tree> ops;
  200   /* Information about the first statement: its vector def-type, the type
  201      of the operand, and an indication whether any def-stmt recorded for it
  202      is a pattern stmt.  */
203 tree first_op_type;
204 enum vect_def_type first_dt;
205 bool any_pattern;
206 } *slp_oprnd_info;
207
208
209 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
210 operand. */
211 static vec<slp_oprnd_info>
212 vect_create_oprnd_info (int nops, int group_size)
213 {
214 int i;
215 slp_oprnd_info oprnd_info;
216 vec<slp_oprnd_info> oprnds_info;
217
218 oprnds_info.create (nops);
219 for (i = 0; i < nops; i++)
220 {
221 oprnd_info = XNEW (struct _slp_oprnd_info);
222 oprnd_info->def_stmts.create (group_size);
223 oprnd_info->ops.create (group_size);
224 oprnd_info->first_dt = vect_uninitialized_def;
225 oprnd_info->first_op_type = NULL_TREE;
226 oprnd_info->any_pattern = false;
227 oprnds_info.quick_push (oprnd_info);
228 }
229
230 return oprnds_info;
231 }
232
233
234 /* Free operands info. */
235
236 static void
237 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
238 {
239 int i;
240 slp_oprnd_info oprnd_info;
241
242 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
243 {
244 oprnd_info->def_stmts.release ();
245 oprnd_info->ops.release ();
246 XDELETE (oprnd_info);
247 }
248
249 oprnds_info.release ();
250 }
251
252
253 /* Return true if STMTS contains a pattern statement. */
254
255 static bool
256 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
257 {
258 stmt_vec_info stmt_info;
259 unsigned int i;
260 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
261 if (is_pattern_stmt_p (stmt_info))
262 return true;
263 return false;
264 }
265
266 /* Return true when all lanes in the external or constant NODE have
267 the same value. */
268
269 static bool
270 vect_slp_tree_uniform_p (slp_tree node)
271 {
272 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
273 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
274
  275   /* Pre-existing vectors.  */
276 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
277 return false;
278
279 unsigned i;
280 tree op, first = NULL_TREE;
281 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
282 if (!first)
283 first = op;
284 else if (!operand_equal_p (first, op, 0))
285 return false;
286
287 return true;
288 }
289
290 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
291 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
292 of the chain. */
293
294 int
295 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
296 stmt_vec_info first_stmt_info)
297 {
298 stmt_vec_info next_stmt_info = first_stmt_info;
299 int result = 0;
300
301 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
302 return -1;
303
304 do
305 {
306 if (next_stmt_info == stmt_info)
307 return result;
308 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
309 if (next_stmt_info)
310 result += DR_GROUP_GAP (next_stmt_info);
311 }
312 while (next_stmt_info);
313
314 return -1;
315 }
316
317 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
318 using the method implemented by duplicate_and_interleave. Return true
319 if so, returning the number of intermediate vectors in *NVECTORS_OUT
320 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
321 (if nonnull). */
322
323 bool
324 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
325 tree elt_type, unsigned int *nvectors_out,
326 tree *vector_type_out,
327 tree *permutes)
328 {
329 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
330 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
331 return false;
332
333 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
334 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
335 unsigned int nvectors = 1;
336 for (;;)
337 {
338 scalar_int_mode int_mode;
339 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
340 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
341 {
342 /* Get the natural vector type for this SLP group size. */
343 tree int_type = build_nonstandard_integer_type
344 (GET_MODE_BITSIZE (int_mode), 1);
345 tree vector_type
346 = get_vectype_for_scalar_type (vinfo, int_type, count);
347 if (vector_type
348 && VECTOR_MODE_P (TYPE_MODE (vector_type))
349 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
350 GET_MODE_SIZE (base_vector_mode)))
351 {
352 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
353 together into elements of type INT_TYPE and using the result
354 to build NVECTORS vectors. */
355 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
356 vec_perm_builder sel1 (nelts, 2, 3);
357 vec_perm_builder sel2 (nelts, 2, 3);
358 poly_int64 half_nelts = exact_div (nelts, 2);
359 for (unsigned int i = 0; i < 3; ++i)
360 {
361 sel1.quick_push (i);
362 sel1.quick_push (i + nelts);
363 sel2.quick_push (half_nelts + i);
364 sel2.quick_push (half_nelts + i + nelts);
365 }
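              /* Descriptive note (not in the original source): INDICES1 selects
                 the interleaved low halves of the two input vectors and
                 INDICES2 the interleaved high halves.  */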
366 vec_perm_indices indices1 (sel1, 2, nelts);
367 vec_perm_indices indices2 (sel2, 2, nelts);
368 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
369 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
370 {
371 if (nvectors_out)
372 *nvectors_out = nvectors;
373 if (vector_type_out)
374 *vector_type_out = vector_type;
375 if (permutes)
376 {
377 permutes[0] = vect_gen_perm_mask_checked (vector_type,
378 indices1);
379 permutes[1] = vect_gen_perm_mask_checked (vector_type,
380 indices2);
381 }
382 return true;
383 }
384 }
385 }
386 if (!multiple_p (elt_bytes, 2, &elt_bytes))
387 return false;
388 nvectors *= 2;
389 }
390 }
391
392 /* Return true if DTA and DTB match. */
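/* Descriptive note (not in the original source): constant and external defs
   are treated as interchangeable here since both are code generated as
   invariant operands outside the vectorized stmts.  */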
393
394 static bool
395 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
396 {
397 return (dta == dtb
398 || ((dta == vect_external_def || dta == vect_constant_def)
399 && (dtb == vect_external_def || dtb == vect_constant_def)));
400 }
401
  402 /* Get the defs for the rhs of STMTS[STMT_NUM] (collect them in OPRNDS_INFO),
  403    check that they are of a valid type and that they match the defs of the
  404    first stmt of the SLP group (stored in OPRNDS_INFO).  This function tries
  405    to match stmts by swapping operands of STMTS[STMT_NUM] when possible.
  406    Non-zero SWAP indicates that a swap is required for cond_expr stmts.
  407    Specifically, SWAP is 1 if the stmt is a cond and the operands of the
  408    comparison need to be swapped; SWAP is 2 if the stmt is a cond and the
  409    code of the comparison needs to be inverted.
  410
  411    If there was a fatal error return -1; if the error could be corrected by
  412    swapping operands of the father node of this one, return 1; if everything
  413    is OK return 0.  */
414 static int
415 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
416 bool *skip_args,
417 vec<stmt_vec_info> stmts, unsigned stmt_num,
418 vec<slp_oprnd_info> *oprnds_info)
419 {
420 stmt_vec_info stmt_info = stmts[stmt_num];
421 tree oprnd;
422 unsigned int i, number_of_oprnds;
423 enum vect_def_type dt = vect_uninitialized_def;
424 slp_oprnd_info oprnd_info;
425 int first_op_idx = 1;
426 unsigned int commutative_op = -1U;
427 bool first_op_cond = false;
428 bool first = stmt_num == 0;
429
430 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
431 {
432 number_of_oprnds = gimple_call_num_args (stmt);
433 first_op_idx = 3;
434 if (gimple_call_internal_p (stmt))
435 {
436 internal_fn ifn = gimple_call_internal_fn (stmt);
437 commutative_op = first_commutative_argument (ifn);
438
439 /* Masked load, only look at mask. */
440 if (ifn == IFN_MASK_LOAD)
441 {
442 number_of_oprnds = 1;
443 /* Mask operand index. */
444 first_op_idx = 5;
445 }
446 }
447 }
448 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
449 {
450 enum tree_code code = gimple_assign_rhs_code (stmt);
451 number_of_oprnds = gimple_num_ops (stmt) - 1;
452 /* Swap can only be done for cond_expr if asked to, otherwise we
453 could result in different comparison code to the first stmt. */
454 if (code == COND_EXPR
455 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
456 {
457 first_op_cond = true;
458 number_of_oprnds++;
459 }
460 else
461 commutative_op = commutative_tree_code (code) ? 0U : -1U;
462 }
463 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
464 number_of_oprnds = gimple_phi_num_args (stmt);
465 else
466 return -1;
467
468 bool swapped = (swap != 0);
469 bool backedge = false;
470 gcc_assert (!swapped || first_op_cond);
471 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
472 for (i = 0; i < number_of_oprnds; i++)
473 {
474 if (first_op_cond)
475 {
476 /* Map indicating how operands of cond_expr should be swapped. */
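          /* Descriptive note (not in the original source): maps[0] keeps all
             operands in place, maps[1] swaps the two operands of the comparison
             (SWAP == 1) and maps[2] swaps the then/else arms of the COND_EXPR
             (SWAP == 2, inverted comparison code).  */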
477 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
478 int *map = maps[swap];
479
480 if (i < 2)
481 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
482 first_op_idx), map[i]);
483 else
484 oprnd = gimple_op (stmt_info->stmt, map[i]);
485 }
486 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
487 {
488 oprnd = gimple_phi_arg_def (stmt, i);
489 backedge = dominated_by_p (CDI_DOMINATORS,
490 gimple_phi_arg_edge (stmt, i)->src,
491 gimple_bb (stmt_info->stmt));
492 }
493 else
494 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
495 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
496 oprnd = TREE_OPERAND (oprnd, 0);
497
498 oprnd_info = (*oprnds_info)[i];
499
500 stmt_vec_info def_stmt_info;
501 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
502 {
503 if (dump_enabled_p ())
504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
505 "Build SLP failed: can't analyze def for %T\n",
506 oprnd);
507
508 return -1;
509 }
510
511 if (skip_args[i])
512 {
513 oprnd_info->def_stmts.quick_push (NULL);
514 oprnd_info->ops.quick_push (NULL_TREE);
515 oprnd_info->first_dt = vect_uninitialized_def;
516 continue;
517 }
518
519 if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
520 oprnd_info->any_pattern = true;
521
522 oprnd_info->def_stmts.quick_push (def_stmt_info);
523 oprnd_info->ops.quick_push (oprnd);
524
  525       /* If there's an extern def on a backedge make sure we can
526 code-generate at the region start.
527 ??? This is another case that could be fixed by adjusting
528 how we split the function but at the moment we'd have conflicting
529 goals there. */
530 if (backedge
531 && dts[i] == vect_external_def
532 && is_a <bb_vec_info> (vinfo)
533 && TREE_CODE (oprnd) == SSA_NAME
534 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
535 && !dominated_by_p (CDI_DOMINATORS,
536 as_a <bb_vec_info> (vinfo)->bbs[0],
537 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
538 {
539 if (dump_enabled_p ())
540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
541 "Build SLP failed: extern def %T only defined "
542 "on backedge\n", oprnd);
543 return -1;
544 }
545
546 if (first)
547 {
548 tree type = TREE_TYPE (oprnd);
549 dt = dts[i];
550 if ((dt == vect_constant_def
551 || dt == vect_external_def)
552 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
553 && (TREE_CODE (type) == BOOLEAN_TYPE
554 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
555 type)))
556 {
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
559 "Build SLP failed: invalid type of def "
560 "for variable-length SLP %T\n", oprnd);
561 return -1;
562 }
563
564 /* For the swapping logic below force vect_reduction_def
565 for the reduction op in a SLP reduction group. */
566 if (!STMT_VINFO_DATA_REF (stmt_info)
567 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
568 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
569 && def_stmt_info)
570 dts[i] = dt = vect_reduction_def;
571
572 /* Check the types of the definition. */
573 switch (dt)
574 {
575 case vect_external_def:
576 case vect_constant_def:
577 case vect_internal_def:
578 case vect_reduction_def:
579 case vect_induction_def:
580 case vect_nested_cycle:
581 break;
582
583 default:
584 /* FORNOW: Not supported. */
585 if (dump_enabled_p ())
586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
587 "Build SLP failed: illegal type of def %T\n",
588 oprnd);
589 return -1;
590 }
591
592 oprnd_info->first_dt = dt;
593 oprnd_info->first_op_type = type;
594 }
595 }
596 if (first)
597 return 0;
598
599 /* Now match the operand definition types to that of the first stmt. */
600 for (i = 0; i < number_of_oprnds;)
601 {
602 if (skip_args[i])
603 {
604 ++i;
605 continue;
606 }
607
608 oprnd_info = (*oprnds_info)[i];
609 dt = dts[i];
610 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
611 oprnd = oprnd_info->ops[stmt_num];
612 tree type = TREE_TYPE (oprnd);
613
614 if (!types_compatible_p (oprnd_info->first_op_type, type))
615 {
616 if (dump_enabled_p ())
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
618 "Build SLP failed: different operand types\n");
619 return 1;
620 }
621
622 /* Not first stmt of the group, check that the def-stmt/s match
623 the def-stmt/s of the first stmt. Allow different definition
624 types for reduction chains: the first stmt must be a
625 vect_reduction_def (a phi node), and the rest
626 end in the reduction chain. */
627 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
628 && !(oprnd_info->first_dt == vect_reduction_def
629 && !STMT_VINFO_DATA_REF (stmt_info)
630 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
631 && def_stmt_info
632 && !STMT_VINFO_DATA_REF (def_stmt_info)
633 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
634 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
635 || (!STMT_VINFO_DATA_REF (stmt_info)
636 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
637 && ((!def_stmt_info
638 || STMT_VINFO_DATA_REF (def_stmt_info)
639 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
640 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
641 != (oprnd_info->first_dt != vect_reduction_def))))
642 {
643 /* Try swapping operands if we got a mismatch. For BB
644 vectorization only in case it will clearly improve things. */
645 if (i == commutative_op && !swapped
646 && (!is_a <bb_vec_info> (vinfo)
647 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
648 dts[i+1])
649 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
650 || vect_def_types_match
651 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
652 {
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location,
655 "trying swapped operands\n");
656 std::swap (dts[i], dts[i+1]);
657 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
658 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
659 std::swap ((*oprnds_info)[i]->ops[stmt_num],
660 (*oprnds_info)[i+1]->ops[stmt_num]);
661 swapped = true;
662 continue;
663 }
664
665 if (is_a <bb_vec_info> (vinfo)
666 && !oprnd_info->any_pattern)
667 {
668 /* Now for commutative ops we should see whether we can
669 make the other operand matching. */
670 if (dump_enabled_p ())
671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
672 "treating operand as external\n");
673 oprnd_info->first_dt = dt = vect_external_def;
674 }
675 else
676 {
677 if (dump_enabled_p ())
678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
679 "Build SLP failed: different types\n");
680 return 1;
681 }
682 }
683
684 /* Make sure to demote the overall operand to external. */
685 if (dt == vect_external_def)
686 oprnd_info->first_dt = vect_external_def;
687 /* For a SLP reduction chain we want to duplicate the reduction to
688 each of the chain members. That gets us a sane SLP graph (still
689 the stmts are not 100% correct wrt the initial values). */
690 else if ((dt == vect_internal_def
691 || dt == vect_reduction_def)
692 && oprnd_info->first_dt == vect_reduction_def
693 && !STMT_VINFO_DATA_REF (stmt_info)
694 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
695 && !STMT_VINFO_DATA_REF (def_stmt_info)
696 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
697 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
698 {
699 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
700 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
701 }
702
703 ++i;
704 }
705
706 /* Swap operands. */
707 if (swapped)
708 {
709 if (dump_enabled_p ())
710 dump_printf_loc (MSG_NOTE, vect_location,
711 "swapped operands to match def types in %G",
712 stmt_info->stmt);
713 }
714
715 return 0;
716 }
717
718 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
719 Return true if we can, meaning that this choice doesn't conflict with
720 existing SLP nodes that use STMT_INFO. */
721
722 bool
723 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
724 {
725 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
726 if (old_vectype)
727 return useless_type_conversion_p (vectype, old_vectype);
728
729 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
730 {
731 /* We maintain the invariant that if any statement in the group is
732 used, all other members of the group have the same vector type. */
733 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
734 stmt_vec_info member_info = first_info;
735 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
736 if (is_pattern_stmt_p (member_info)
737 && !useless_type_conversion_p (vectype,
738 STMT_VINFO_VECTYPE (member_info)))
739 break;
740
741 if (!member_info)
742 {
743 for (member_info = first_info; member_info;
744 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
745 STMT_VINFO_VECTYPE (member_info) = vectype;
746 return true;
747 }
748 }
749 else if (!is_pattern_stmt_p (stmt_info))
750 {
751 STMT_VINFO_VECTYPE (stmt_info) = vectype;
752 return true;
753 }
754
755 if (dump_enabled_p ())
756 {
757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
758 "Build SLP failed: incompatible vector"
759 " types for: %G", stmt_info->stmt);
760 dump_printf_loc (MSG_NOTE, vect_location,
761 " old vector type: %T\n", old_vectype);
762 dump_printf_loc (MSG_NOTE, vect_location,
763 " new vector type: %T\n", vectype);
764 }
765 return false;
766 }
767
768 /* Return true if call statements CALL1 and CALL2 are similar enough
769 to be combined into the same SLP group. */
770
771 static bool
772 compatible_calls_p (gcall *call1, gcall *call2)
773 {
774 unsigned int nargs = gimple_call_num_args (call1);
775 if (nargs != gimple_call_num_args (call2))
776 return false;
777
778 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
779 return false;
780
781 if (gimple_call_internal_p (call1))
782 {
783 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
784 TREE_TYPE (gimple_call_lhs (call2))))
785 return false;
786 for (unsigned int i = 0; i < nargs; ++i)
787 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
788 TREE_TYPE (gimple_call_arg (call2, i))))
789 return false;
790 }
791 else
792 {
793 if (!operand_equal_p (gimple_call_fn (call1),
794 gimple_call_fn (call2), 0))
795 return false;
796
797 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
798 return false;
799 }
800 return true;
801 }
802
803 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
804 caller's attempt to find the vector type in STMT_INFO with the narrowest
805 element type. Return true if VECTYPE is nonnull and if it is valid
806 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
807 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
808 vect_build_slp_tree. */
809
810 static bool
811 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
812 unsigned int group_size,
813 tree vectype, poly_uint64 *max_nunits)
814 {
815 if (!vectype)
816 {
817 if (dump_enabled_p ())
818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
819 "Build SLP failed: unsupported data-type in %G\n",
820 stmt_info->stmt);
821 /* Fatal mismatch. */
822 return false;
823 }
824
825 /* If populating the vector type requires unrolling then fail
826 before adjusting *max_nunits for basic-block vectorization. */
827 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
828 unsigned HOST_WIDE_INT const_nunits;
829 if (is_a <bb_vec_info> (vinfo)
830 && (!nunits.is_constant (&const_nunits)
831 || const_nunits > group_size))
832 {
833 if (dump_enabled_p ())
834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
835 "Build SLP failed: unrolling required "
836 "in basic block SLP\n");
837 /* Fatal mismatch. */
838 return false;
839 }
840
841 /* In case of multiple types we need to detect the smallest type. */
842 vect_update_max_nunits (max_nunits, vectype);
843 return true;
844 }
845
  846 /* Verify if the scalar stmts STMTS are isomorphic, require data
  847    permutation or are of unsupported types of operation.  Return
  848    true if they are isomorphic, otherwise return false and indicate
  849    in *MATCHES which stmts are not isomorphic to the first one.  If
  850    MATCHES[0] is false then this indicates the comparison could not
  851    be carried out or the stmts will never be vectorized by SLP.
852
853 Note COND_EXPR is possibly isomorphic to another one after swapping its
854 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
855 the first stmt by swapping the two operands of comparison; set SWAP[i]
  856    to 2 if stmt I is isomorphic to the first stmt by inverting the code
  857    of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
  858    to (B1 <= A1 ? X1 : Y1) or be inverted to (A1 < B1) ? Y1 : X1.  */
859
860 static bool
861 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
862 vec<stmt_vec_info> stmts, unsigned int group_size,
863 poly_uint64 *max_nunits, bool *matches,
864 bool *two_operators, tree *node_vectype)
865 {
866 unsigned int i;
867 stmt_vec_info first_stmt_info = stmts[0];
868 enum tree_code first_stmt_code = ERROR_MARK;
869 enum tree_code alt_stmt_code = ERROR_MARK;
870 enum tree_code rhs_code = ERROR_MARK;
871 enum tree_code first_cond_code = ERROR_MARK;
872 tree lhs;
873 bool need_same_oprnds = false;
874 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
875 optab optab;
876 int icode;
877 machine_mode optab_op2_mode;
878 machine_mode vec_mode;
879 stmt_vec_info first_load = NULL, prev_first_load = NULL;
880 bool first_stmt_load_p = false, load_p = false;
881 bool first_stmt_phi_p = false, phi_p = false;
882
883 /* For every stmt in NODE find its def stmt/s. */
884 stmt_vec_info stmt_info;
885 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
886 {
887 gimple *stmt = stmt_info->stmt;
888 swap[i] = 0;
889 matches[i] = false;
890
891 if (dump_enabled_p ())
892 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
893
  894       /* Fail to vectorize statements marked as unvectorizable, statements
  895          that can throw internally or that have volatile operands.  */
896 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
897 || stmt_can_throw_internal (cfun, stmt)
898 || gimple_has_volatile_ops (stmt))
899 {
900 if (dump_enabled_p ())
901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
902 "Build SLP failed: unvectorizable statement %G",
903 stmt);
904 /* ??? For BB vectorization we want to commutate operands in a way
905 to shuffle all unvectorizable defs into one operand and have
906 the other still vectorized. The following doesn't reliably
907 work for this though but it's the easiest we can do here. */
908 if (is_a <bb_vec_info> (vinfo) && i != 0)
909 continue;
910 /* Fatal mismatch. */
911 matches[0] = false;
912 return false;
913 }
914
915 lhs = gimple_get_lhs (stmt);
916 if (lhs == NULL_TREE)
917 {
918 if (dump_enabled_p ())
919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
920 "Build SLP failed: not GIMPLE_ASSIGN nor "
921 "GIMPLE_CALL %G", stmt);
922 if (is_a <bb_vec_info> (vinfo) && i != 0)
923 continue;
924 /* Fatal mismatch. */
925 matches[0] = false;
926 return false;
927 }
928
929 tree nunits_vectype;
930 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
931 &nunits_vectype, group_size)
932 || (nunits_vectype
933 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
934 nunits_vectype, max_nunits)))
935 {
936 if (is_a <bb_vec_info> (vinfo) && i != 0)
937 continue;
938 /* Fatal mismatch. */
939 matches[0] = false;
940 return false;
941 }
942
943 gcc_assert (vectype);
944
945 gcall *call_stmt = dyn_cast <gcall *> (stmt);
946 if (call_stmt)
947 {
948 rhs_code = CALL_EXPR;
949
950 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
951 load_p = true;
952 else if ((gimple_call_internal_p (call_stmt)
953 && (!vectorizable_internal_fn_p
954 (gimple_call_internal_fn (call_stmt))))
955 || gimple_call_tail_p (call_stmt)
956 || gimple_call_noreturn_p (call_stmt)
957 || !gimple_call_nothrow_p (call_stmt)
958 || gimple_call_chain (call_stmt))
959 {
960 if (dump_enabled_p ())
961 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
962 "Build SLP failed: unsupported call type %G",
963 call_stmt);
964 if (is_a <bb_vec_info> (vinfo) && i != 0)
965 continue;
966 /* Fatal mismatch. */
967 matches[0] = false;
968 return false;
969 }
970 }
971 else if (gimple_code (stmt) == GIMPLE_PHI)
972 {
973 rhs_code = ERROR_MARK;
974 phi_p = true;
975 }
976 else
977 {
978 rhs_code = gimple_assign_rhs_code (stmt);
979 load_p = gimple_vuse (stmt);
980 }
981
982 /* Check the operation. */
983 if (i == 0)
984 {
985 *node_vectype = vectype;
986 first_stmt_code = rhs_code;
987 first_stmt_load_p = load_p;
988 first_stmt_phi_p = phi_p;
989
990 /* Shift arguments should be equal in all the packed stmts for a
991 vector shift with scalar shift operand. */
992 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
993 || rhs_code == LROTATE_EXPR
994 || rhs_code == RROTATE_EXPR)
995 {
996 vec_mode = TYPE_MODE (vectype);
997
998 /* First see if we have a vector/vector shift. */
999 optab = optab_for_tree_code (rhs_code, vectype,
1000 optab_vector);
1001
1002 if (!optab
1003 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1004 {
1005 /* No vector/vector shift, try for a vector/scalar shift. */
1006 optab = optab_for_tree_code (rhs_code, vectype,
1007 optab_scalar);
1008
1009 if (!optab)
1010 {
1011 if (dump_enabled_p ())
1012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1013 "Build SLP failed: no optab.\n");
1014 if (is_a <bb_vec_info> (vinfo) && i != 0)
1015 continue;
1016 /* Fatal mismatch. */
1017 matches[0] = false;
1018 return false;
1019 }
1020 icode = (int) optab_handler (optab, vec_mode);
1021 if (icode == CODE_FOR_nothing)
1022 {
1023 if (dump_enabled_p ())
1024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1025 "Build SLP failed: "
1026 "op not supported by target.\n");
1027 if (is_a <bb_vec_info> (vinfo) && i != 0)
1028 continue;
1029 /* Fatal mismatch. */
1030 matches[0] = false;
1031 return false;
1032 }
1033 optab_op2_mode = insn_data[icode].operand[2].mode;
1034 if (!VECTOR_MODE_P (optab_op2_mode))
1035 {
1036 need_same_oprnds = true;
1037 first_op1 = gimple_assign_rhs2 (stmt);
1038 }
1039 }
1040 }
1041 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1042 {
1043 need_same_oprnds = true;
1044 first_op1 = gimple_assign_rhs2 (stmt);
1045 }
1046 else if (!load_p
1047 && rhs_code == BIT_FIELD_REF)
1048 {
1049 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1050 if (TREE_CODE (vec) != SSA_NAME
1051 || !types_compatible_p (vectype, TREE_TYPE (vec)))
1052 {
1053 if (is_a <bb_vec_info> (vinfo) && i != 0)
1054 continue;
1055 if (dump_enabled_p ())
1056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1057 "Build SLP failed: "
1058 "BIT_FIELD_REF not supported\n");
1059 /* Fatal mismatch. */
1060 matches[0] = false;
1061 return false;
1062 }
1063 }
1064 else if (call_stmt
1065 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1066 {
1067 need_same_oprnds = true;
1068 first_op1 = gimple_call_arg (call_stmt, 1);
1069 }
1070 }
1071 else
1072 {
1073 if (first_stmt_code != rhs_code
1074 && alt_stmt_code == ERROR_MARK)
1075 alt_stmt_code = rhs_code;
1076 if ((first_stmt_code != rhs_code
1077 && (first_stmt_code != IMAGPART_EXPR
1078 || rhs_code != REALPART_EXPR)
1079 && (first_stmt_code != REALPART_EXPR
1080 || rhs_code != IMAGPART_EXPR)
1081 /* Handle mismatches in plus/minus by computing both
1082 and merging the results. */
1083 && !((first_stmt_code == PLUS_EXPR
1084 || first_stmt_code == MINUS_EXPR)
1085 && (alt_stmt_code == PLUS_EXPR
1086 || alt_stmt_code == MINUS_EXPR)
1087 && rhs_code == alt_stmt_code)
1088 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1089 && (first_stmt_code == ARRAY_REF
1090 || first_stmt_code == BIT_FIELD_REF
1091 || first_stmt_code == INDIRECT_REF
1092 || first_stmt_code == COMPONENT_REF
1093 || first_stmt_code == MEM_REF)))
1094 || first_stmt_load_p != load_p
1095 || first_stmt_phi_p != phi_p)
1096 {
1097 if (dump_enabled_p ())
1098 {
1099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1100 "Build SLP failed: different operation "
1101 "in stmt %G", stmt);
1102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1103 "original stmt %G", first_stmt_info->stmt);
1104 }
1105 /* Mismatch. */
1106 continue;
1107 }
1108
1109 if (need_same_oprnds)
1110 {
1111 tree other_op1 = (call_stmt
1112 ? gimple_call_arg (call_stmt, 1)
1113 : gimple_assign_rhs2 (stmt));
1114 if (!operand_equal_p (first_op1, other_op1, 0))
1115 {
1116 if (dump_enabled_p ())
1117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1118 "Build SLP failed: different shift "
1119 "arguments in %G", stmt);
1120 /* Mismatch. */
1121 continue;
1122 }
1123 }
1124 if (!load_p
1125 && first_stmt_code == BIT_FIELD_REF
1126 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1127 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1128 {
1129 if (dump_enabled_p ())
1130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1131 "Build SLP failed: different BIT_FIELD_REF "
1132 "arguments in %G", stmt);
1133 /* Mismatch. */
1134 continue;
1135 }
1136
1137 if (!load_p && rhs_code == CALL_EXPR)
1138 {
1139 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1140 as_a <gcall *> (stmt)))
1141 {
1142 if (dump_enabled_p ())
1143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1144 "Build SLP failed: different calls in %G",
1145 stmt);
1146 /* Mismatch. */
1147 continue;
1148 }
1149 }
1150
1151 if (phi_p
1152 && (gimple_bb (first_stmt_info->stmt)
1153 != gimple_bb (stmt_info->stmt)))
1154 {
1155 if (dump_enabled_p ())
1156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1157 "Build SLP failed: different BB for PHI "
1158 "in %G", stmt);
1159 /* Mismatch. */
1160 continue;
1161 }
1162
1163 if (!types_compatible_p (vectype, *node_vectype))
1164 {
1165 if (dump_enabled_p ())
1166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1167 "Build SLP failed: different vector type "
1168 "in %G", stmt);
1169 /* Mismatch. */
1170 continue;
1171 }
1172 }
1173
1174 /* Grouped store or load. */
1175 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1176 {
1177 if (REFERENCE_CLASS_P (lhs))
1178 {
1179 /* Store. */
1180 ;
1181 }
1182 else
1183 {
1184 /* Load. */
1185 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1186 if (prev_first_load)
1187 {
1188 /* Check that there are no loads from different interleaving
1189 chains in the same node. */
1190 if (prev_first_load != first_load)
1191 {
1192 if (dump_enabled_p ())
1193 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1194 vect_location,
1195 "Build SLP failed: different "
1196 "interleaving chains in one node %G",
1197 stmt);
1198 /* Mismatch. */
1199 continue;
1200 }
1201 }
1202 else
1203 prev_first_load = first_load;
1204 }
1205 } /* Grouped access. */
1206 else
1207 {
1208 if (load_p)
1209 {
1210 /* Not grouped load. */
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1213 "Build SLP failed: not grouped load %G", stmt);
1214
1215 /* FORNOW: Not grouped loads are not supported. */
1216 if (is_a <bb_vec_info> (vinfo) && i != 0)
1217 continue;
1218 /* Fatal mismatch. */
1219 matches[0] = false;
1220 return false;
1221 }
1222
1223 /* Not memory operation. */
1224 if (!phi_p
1225 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1226 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1227 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1228 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1229 && rhs_code != VIEW_CONVERT_EXPR
1230 && rhs_code != CALL_EXPR
1231 && rhs_code != BIT_FIELD_REF)
1232 {
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "Build SLP failed: operation unsupported %G",
1236 stmt);
1237 if (is_a <bb_vec_info> (vinfo) && i != 0)
1238 continue;
1239 /* Fatal mismatch. */
1240 matches[0] = false;
1241 return false;
1242 }
1243
1244 if (rhs_code == COND_EXPR)
1245 {
1246 tree cond_expr = gimple_assign_rhs1 (stmt);
1247 enum tree_code cond_code = TREE_CODE (cond_expr);
1248 enum tree_code swap_code = ERROR_MARK;
1249 enum tree_code invert_code = ERROR_MARK;
1250
1251 if (i == 0)
1252 first_cond_code = TREE_CODE (cond_expr);
1253 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1254 {
1255 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1256 swap_code = swap_tree_comparison (cond_code);
1257 invert_code = invert_tree_comparison (cond_code, honor_nans);
1258 }
1259
1260 if (first_cond_code == cond_code)
1261 ;
1262 /* Isomorphic can be achieved by swapping. */
1263 else if (first_cond_code == swap_code)
1264 swap[i] = 1;
1265 /* Isomorphic can be achieved by inverting. */
1266 else if (first_cond_code == invert_code)
1267 swap[i] = 2;
1268 else
1269 {
1270 if (dump_enabled_p ())
1271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1272 "Build SLP failed: different"
1273 " operation %G", stmt);
1274 /* Mismatch. */
1275 continue;
1276 }
1277 }
1278 }
1279
1280 matches[i] = true;
1281 }
1282
1283 for (i = 0; i < group_size; ++i)
1284 if (!matches[i])
1285 return false;
1286
1287 /* If we allowed a two-operation SLP node verify the target can cope
1288 with the permute we are going to use. */
1289 if (alt_stmt_code != ERROR_MARK
1290 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1291 {
1292 *two_operators = true;
1293 }
1294
1295 return true;
1296 }
1297
1298 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1299 Note we never remove apart from at destruction time so we do not
1300 need a special value for deleted that differs from empty. */
1301 struct bst_traits
1302 {
1303 typedef vec <stmt_vec_info> value_type;
1304 typedef vec <stmt_vec_info> compare_type;
1305 static inline hashval_t hash (value_type);
1306 static inline bool equal (value_type existing, value_type candidate);
1307 static inline bool is_empty (value_type x) { return !x.exists (); }
1308 static inline bool is_deleted (value_type x) { return !x.exists (); }
1309 static const bool empty_zero_p = true;
1310 static inline void mark_empty (value_type &x) { x.release (); }
1311 static inline void mark_deleted (value_type &x) { x.release (); }
1312 static inline void remove (value_type &x) { x.release (); }
1313 };
1314 inline hashval_t
1315 bst_traits::hash (value_type x)
1316 {
1317 inchash::hash h;
1318 for (unsigned i = 0; i < x.length (); ++i)
1319 h.add_int (gimple_uid (x[i]->stmt));
1320 return h.end ();
1321 }
1322 inline bool
1323 bst_traits::equal (value_type existing, value_type candidate)
1324 {
1325 if (existing.length () != candidate.length ())
1326 return false;
1327 for (unsigned i = 0; i < existing.length (); ++i)
1328 if (existing[i] != candidate[i])
1329 return false;
1330 return true;
1331 }
1332
1333 typedef hash_map <vec <gimple *>, slp_tree,
1334 simple_hashmap_traits <bst_traits, slp_tree> >
1335 scalar_stmts_to_slp_tree_map_t;
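/* Descriptive note (not in the original source): this map caches the SLP
   node built for a particular scalar stmt sequence (or NULL if discovery
   failed for it) so that re-discovery of the same stmt set can re-use the
   node or fail immediately.  */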
1336
1337 static slp_tree
1338 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1339 vec<stmt_vec_info> stmts, unsigned int group_size,
1340 poly_uint64 *max_nunits,
1341 bool *matches, unsigned *npermutes, unsigned *tree_size,
1342 scalar_stmts_to_slp_tree_map_t *bst_map);
1343
1344 static slp_tree
1345 vect_build_slp_tree (vec_info *vinfo,
1346 vec<stmt_vec_info> stmts, unsigned int group_size,
1347 poly_uint64 *max_nunits,
1348 bool *matches, unsigned *npermutes, unsigned *tree_size,
1349 scalar_stmts_to_slp_tree_map_t *bst_map)
1350 {
1351 if (slp_tree *leader = bst_map->get (stmts))
1352 {
1353 if (dump_enabled_p ())
1354 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1355 *leader ? "" : "failed ", *leader);
1356 if (*leader)
1357 {
1358 SLP_TREE_REF_COUNT (*leader)++;
1359 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1360 }
1361 return *leader;
1362 }
1363
1364 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1365 so we can pick up backedge destinations during discovery. */
1366 slp_tree res = new _slp_tree;
1367 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1368 SLP_TREE_SCALAR_STMTS (res) = stmts;
1369 bst_map->put (stmts.copy (), res);
1370
1371 poly_uint64 this_max_nunits = 1;
1372 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1373 &this_max_nunits,
1374 matches, npermutes, tree_size, bst_map);
1375 if (!res_)
1376 {
1377 bool existed_p = bst_map->put (stmts, NULL);
1378 gcc_assert (existed_p);
1379 /* Mark the node invalid so we can detect those when still in use
1380 as backedge destinations. */
1381 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1382 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1383 vect_free_slp_tree (res);
1384 }
1385 else
1386 {
1387 gcc_assert (res_ == res);
1388 res->max_nunits = this_max_nunits;
1389 vect_update_max_nunits (max_nunits, this_max_nunits);
1390 /* Keep a reference for the bst_map use. */
1391 SLP_TREE_REF_COUNT (res)++;
1392 }
1393 return res_;
1394 }
1395
 1396 /* Recursively build an SLP tree starting from NODE.
 1397    Fail (and return NULL) if def-stmts are not
 1398    isomorphic, require data permutation or are of unsupported types of
 1399    operation.  Otherwise, return NODE.
 1400    MATCHES records for each lane whether it matched the first stmt,
 1401    indicating where a mismatch was found.  */
1402
1403 static slp_tree
1404 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1405 vec<stmt_vec_info> stmts, unsigned int group_size,
1406 poly_uint64 *max_nunits,
1407 bool *matches, unsigned *npermutes, unsigned *tree_size,
1408 scalar_stmts_to_slp_tree_map_t *bst_map)
1409 {
1410 unsigned nops, i, this_tree_size = 0;
1411 poly_uint64 this_max_nunits = *max_nunits;
1412
1413 matches[0] = false;
1414
1415 stmt_vec_info stmt_info = stmts[0];
1416 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1417 nops = gimple_call_num_args (stmt);
1418 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1419 {
1420 nops = gimple_num_ops (stmt) - 1;
1421 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1422 nops++;
1423 }
1424 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1425 nops = gimple_phi_num_args (phi);
1426 else
1427 return NULL;
1428
1429 /* If the SLP node is a PHI (induction or reduction), terminate
1430 the recursion. */
1431 bool *skip_args = XALLOCAVEC (bool, nops);
1432 memset (skip_args, 0, nops);
1433 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1434 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1435 {
1436 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1437 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1438 group_size);
1439 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1440 max_nunits))
1441 return NULL;
1442
1443 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1444 /* Induction from different IVs is not supported. */
1445 if (def_type == vect_induction_def)
1446 {
1447 stmt_vec_info other_info;
1448 FOR_EACH_VEC_ELT (stmts, i, other_info)
1449 if (stmt_info != other_info)
1450 return NULL;
1451
1452 /* Induction PHIs are leafs. */
1453 (*tree_size)++;
1454 node = vect_create_new_slp_node (node, stmts, nops);
1455 SLP_TREE_VECTYPE (node) = vectype;
1456 SLP_TREE_CHILDREN (node).quick_grow_cleared (nops);
1457 return node;
1458 }
1459 else if (def_type == vect_reduction_def
1460 || def_type == vect_double_reduction_def
1461 || def_type == vect_nested_cycle)
1462 {
1463 /* Else def types have to match. */
1464 stmt_vec_info other_info;
1465 bool all_same = true;
1466 FOR_EACH_VEC_ELT (stmts, i, other_info)
1467 {
1468 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1469 return NULL;
1470 if (other_info != stmt_info)
1471 all_same = false;
1472 }
1473 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 1474           /* Reduction initial values are not explicitly represented.  */
1475 if (!nested_in_vect_loop_p (loop, stmt_info))
1476 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1477 /* Reduction chain backedge defs are filled manually.
1478 ??? Need a better way to identify a SLP reduction chain PHI.
1479 Or a better overall way to SLP match those. */
1480 if (all_same && def_type == vect_reduction_def)
1481 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1482 }
1483 else if (def_type != vect_internal_def)
1484 return NULL;
1485 }
1486
1487
1488 bool two_operators = false;
1489 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1490 tree vectype = NULL_TREE;
1491 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1492 &this_max_nunits, matches, &two_operators,
1493 &vectype))
1494 return NULL;
1495
1496 /* If the SLP node is a load, terminate the recursion unless masked. */
1497 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1498 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1499 {
1500 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1501 {
1502 /* Masked load. */
1503 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1504 nops = 1;
1505 }
1506 else
1507 {
1508 *max_nunits = this_max_nunits;
1509 (*tree_size)++;
1510 node = vect_create_new_slp_node (node, stmts, 0);
1511 SLP_TREE_VECTYPE (node) = vectype;
1512 /* And compute the load permutation. Whether it is actually
1513 a permutation depends on the unrolling factor which is
1514 decided later. */
1515 vec<unsigned> load_permutation;
1516 int j;
1517 stmt_vec_info load_info;
1518 load_permutation.create (group_size);
1519 stmt_vec_info first_stmt_info
1520 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1521 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1522 {
1523 int load_place = vect_get_place_in_interleaving_chain
1524 (load_info, first_stmt_info);
1525 gcc_assert (load_place != -1);
1526 load_permutation.safe_push (load_place);
1527 }
1528 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1529 return node;
1530 }
1531 }
1532 else if (gimple_assign_single_p (stmt_info->stmt)
1533 && !gimple_vuse (stmt_info->stmt)
1534 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1535 {
 1536       /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
 1537          the same SSA name vector of a type compatible with vectype.  */
1538 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1539 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1540 stmt_vec_info estmt_info;
1541 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1542 {
1543 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1544 tree bfref = gimple_assign_rhs1 (estmt);
1545 HOST_WIDE_INT lane;
1546 if (!known_eq (bit_field_size (bfref),
1547 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1548 || !constant_multiple_p (bit_field_offset (bfref),
1549 bit_field_size (bfref), &lane))
1550 {
1551 lperm.release ();
1552 return NULL;
1553 }
1554 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1555 }
1556 slp_tree vnode = vect_create_new_slp_node (vNULL);
1557 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1558 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1559 /* We are always building a permutation node even if it is an identity
1560 permute to shield the rest of the vectorizer from the odd node
1561 representing an actual vector without any scalar ops.
 1562          ??? We could hide it completely by making the permute node
1563 external? */
1564 node = vect_create_new_slp_node (node, stmts, 1);
1565 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1566 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1567 SLP_TREE_VECTYPE (node) = vectype;
1568 SLP_TREE_CHILDREN (node).quick_push (vnode);
1569 return node;
1570 }
1571
1572 /* Get at the operands, verifying they are compatible. */
1573 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1574 slp_oprnd_info oprnd_info;
1575 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1576 {
1577 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
1578 stmts, i, &oprnds_info);
1579 if (res != 0)
1580 matches[(res == -1) ? 0 : i] = false;
1581 if (!matches[0])
1582 break;
1583 }
1584 for (i = 0; i < group_size; ++i)
1585 if (!matches[i])
1586 {
1587 vect_free_oprnd_info (oprnds_info);
1588 return NULL;
1589 }
1590 swap = NULL;
1591
1592 auto_vec<slp_tree, 4> children;
1593
1594 stmt_info = stmts[0];
1595
1596 /* Create SLP_TREE nodes for the definition node/s. */
1597 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1598 {
1599 slp_tree child;
1600 unsigned int j;
1601
1602 /* We're skipping certain operands from processing, for example
1603 outer loop reduction initial defs. */
1604 if (skip_args[i])
1605 {
1606 children.safe_push (NULL);
1607 continue;
1608 }
1609
1610 if (oprnd_info->first_dt == vect_uninitialized_def)
1611 {
 1612           /* COND_EXPRs end up with one operand too many here when the
 1613              condition is an SSA name.  */
1614 gcc_assert (i == 3 && nops == 4);
1615 continue;
1616 }
1617
1618 if (is_a <bb_vec_info> (vinfo)
1619 && oprnd_info->first_dt == vect_internal_def
1620 && !oprnd_info->any_pattern)
1621 {
1622 /* For BB vectorization, if all defs are the same do not
1623 bother to continue the build along the single-lane
1624 graph but use a splat of the scalar value. */
1625 stmt_vec_info first_def = oprnd_info->def_stmts[0];
1626 for (j = 1; j < group_size; ++j)
1627 if (oprnd_info->def_stmts[j] != first_def)
1628 break;
1629 if (j == group_size
1630 /* But avoid doing this for loads where we may be
1631 able to CSE things, unless the stmt is not
1632 vectorizable. */
1633 && (!STMT_VINFO_VECTORIZABLE (first_def)
1634 || !gimple_vuse (first_def->stmt)))
1635 {
1636 if (dump_enabled_p ())
1637 dump_printf_loc (MSG_NOTE, vect_location,
1638 "Using a splat of the uniform operand\n");
1639 oprnd_info->first_dt = vect_external_def;
1640 }
1641 }
1642
1643 if (oprnd_info->first_dt == vect_external_def
1644 || oprnd_info->first_dt == vect_constant_def)
1645 {
1646 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1647 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1648 oprnd_info->ops = vNULL;
1649 children.safe_push (invnode);
1650 continue;
1651 }
1652
1653 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1654 group_size, &this_max_nunits,
1655 matches, npermutes,
1656 &this_tree_size, bst_map)) != NULL)
1657 {
1658 oprnd_info->def_stmts = vNULL;
1659 children.safe_push (child);
1660 continue;
1661 }
1662
1663 /* If the SLP build for operand zero failed and operand zero
1664 and one can be commutated try that for the scalar stmts
1665 that failed the match. */
1666 if (i == 0
1667 /* A first scalar stmt mismatch signals a fatal mismatch. */
1668 && matches[0]
1669 /* ??? For COND_EXPRs we can swap the comparison operands
1670 as well as the arms under some constraints. */
1671 && nops == 2
1672 && oprnds_info[1]->first_dt == vect_internal_def
1673 && is_gimple_assign (stmt_info->stmt)
1674 /* Swapping operands for reductions breaks assumptions later on. */
1675 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1676 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
 1677           /* Do so only if the number of unsuccessful permutes was not more
 1678              than a cut-off, as re-trying the recursive match on
 1679              possibly each level of the tree would expose exponential
 1680              behavior.  */
1681 && *npermutes < 4)
1682 {
1683 /* See whether we can swap the matching or the non-matching
1684 stmt operands. */
1685 bool swap_not_matching = true;
1686 do
1687 {
1688 for (j = 0; j < group_size; ++j)
1689 {
1690 if (matches[j] != !swap_not_matching)
1691 continue;
1692 stmt_vec_info stmt_info = stmts[j];
1693 /* Verify if we can swap operands of this stmt. */
1694 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1695 if (!stmt
1696 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1697 {
1698 if (!swap_not_matching)
1699 goto fail;
1700 swap_not_matching = false;
1701 break;
1702 }
1703 }
1704 }
1705 while (j != group_size);
1706
1707 /* Swap mismatched definition stmts. */
1708 if (dump_enabled_p ())
1709 dump_printf_loc (MSG_NOTE, vect_location,
1710 "Re-trying with swapped operands of stmts ");
1711 for (j = 0; j < group_size; ++j)
1712 if (matches[j] == !swap_not_matching)
1713 {
1714 std::swap (oprnds_info[0]->def_stmts[j],
1715 oprnds_info[1]->def_stmts[j]);
1716 std::swap (oprnds_info[0]->ops[j],
1717 oprnds_info[1]->ops[j]);
1718 if (dump_enabled_p ())
1719 dump_printf (MSG_NOTE, "%d ", j);
1720 }
1721 if (dump_enabled_p ())
1722 dump_printf (MSG_NOTE, "\n");
1723 /* And try again with scratch 'matches' ... */
1724 bool *tem = XALLOCAVEC (bool, group_size);
1725 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1726 group_size, &this_max_nunits,
1727 tem, npermutes,
1728 &this_tree_size, bst_map)) != NULL)
1729 {
1730 oprnd_info->def_stmts = vNULL;
1731 children.safe_push (child);
1732 continue;
1733 }
1734 /* We do not undo the swapping here since it might still be
1735 the better order for the second operand in case we build
1736 the first one from scalars below. */
1737 ++*npermutes;
1738 }
1739 fail:
1740
 1741       /* If the SLP build failed and we analyze a basic block,
 1742          simply treat nodes we fail to build as externally defined
 1743          (and thus build vectors from the scalar defs).
 1744          The cost model will reject outright expensive cases.
 1745          ??? This doesn't treat cases where permutation ultimately
1746 fails (or we don't try permutation below). Ideally we'd
1747 even compute a permutation that will end up with the maximum
1748 SLP tree size... */
1749 if (is_a <bb_vec_info> (vinfo)
1750 /* ??? Rejecting patterns this way doesn't work. We'd have to
1751 do extra work to cancel the pattern so the uses see the
1752 scalar version. */
1753 && !is_pattern_stmt_p (stmt_info)
1754 && !oprnd_info->any_pattern)
1755 {
1756 /* But if there's a leading vector sized set of matching stmts
1757 fail here so we can split the group. This matches the condition
1758 vect_analyze_slp_instance uses. */
1759 /* ??? We might want to split here and combine the results to support
1760 multiple vector sizes better. */
1761 for (j = 0; j < group_size; ++j)
1762 if (!matches[j])
1763 break;
1764 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
1765 {
1766 if (dump_enabled_p ())
1767 dump_printf_loc (MSG_NOTE, vect_location,
1768 "Building vector operands from scalars\n");
1769 this_tree_size++;
1770 child = vect_create_new_slp_node (oprnd_info->ops);
1771 children.safe_push (child);
1772 oprnd_info->ops = vNULL;
1773 continue;
1774 }
1775 }
1776
1777 gcc_assert (child == NULL);
1778 FOR_EACH_VEC_ELT (children, j, child)
1779 if (child)
1780 vect_free_slp_tree (child);
1781 vect_free_oprnd_info (oprnds_info);
1782 return NULL;
1783 }
1784
1785 vect_free_oprnd_info (oprnds_info);
1786
 1787   /* If all children of this node are built up from uniform scalars, or
 1788      building them requires more than one possibly expensive vector
 1789      construction, then just throw the node away, causing it to be built
 1790      up from scalars instead.  The exception is the SLP node for the vector store.  */
1791 if (is_a <bb_vec_info> (vinfo)
1792 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1793 /* ??? Rejecting patterns this way doesn't work. We'd have to
1794 do extra work to cancel the pattern so the uses see the
1795 scalar version. */
1796 && !is_pattern_stmt_p (stmt_info))
1797 {
1798 slp_tree child;
1799 unsigned j;
1800 bool all_uniform_p = true;
1801 unsigned n_vector_builds = 0;
1802 FOR_EACH_VEC_ELT (children, j, child)
1803 {
1804 if (!child)
1805 ;
1806 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1807 all_uniform_p = false;
1808 else if (!vect_slp_tree_uniform_p (child))
1809 {
1810 all_uniform_p = false;
1811 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
1812 n_vector_builds++;
1813 }
1814 }
1815 if (all_uniform_p || n_vector_builds > 1)
1816 {
1817 /* Roll back. */
1818 matches[0] = false;
1819 FOR_EACH_VEC_ELT (children, j, child)
1820 if (child)
1821 vect_free_slp_tree (child);
1822
1823 if (dump_enabled_p ())
1824 dump_printf_loc (MSG_NOTE, vect_location,
1825 "Building parent vector operands from "
1826 "scalars instead\n");
1827 return NULL;
1828 }
1829 }
1830
1831 *tree_size += this_tree_size + 1;
1832 *max_nunits = this_max_nunits;
1833
1834 if (two_operators)
1835 {
1836 /* ??? We'd likely want to either cache in bst_map something like
1837 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1838 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1839 explicit stmts to put in so the keying on 'stmts' doesn't
1840 work (but we have the same issue with nodes that use 'ops'). */
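/* Build two internal nodes sharing the same children, one per operation
   code, and turn NODE itself into a VEC_PERM_EXPR node whose lane
   permutation selects each lane from the node with the matching code.  */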
1841 slp_tree one = new _slp_tree;
1842 slp_tree two = new _slp_tree;
1843 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1844 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1845 SLP_TREE_VECTYPE (one) = vectype;
1846 SLP_TREE_VECTYPE (two) = vectype;
1847 SLP_TREE_CHILDREN (one).safe_splice (children);
1848 SLP_TREE_CHILDREN (two).safe_splice (children);
1849 slp_tree child;
1850 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1851 SLP_TREE_REF_COUNT (child)++;
1852
1853 /* Here we record the original defs since this
1854 node represents the final lane configuration. */
1855 node = vect_create_new_slp_node (node, stmts, 2);
1856 SLP_TREE_VECTYPE (node) = vectype;
1857 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1858 SLP_TREE_CHILDREN (node).quick_push (one);
1859 SLP_TREE_CHILDREN (node).quick_push (two);
1860 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1861 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1862 enum tree_code ocode = ERROR_MARK;
1863 stmt_vec_info ostmt_info;
1864 unsigned j = 0;
1865 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1866 {
1867 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1868 if (gimple_assign_rhs_code (ostmt) != code0)
1869 {
1870 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1871 ocode = gimple_assign_rhs_code (ostmt);
1872 j = i;
1873 }
1874 else
1875 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1876 }
1877 SLP_TREE_CODE (one) = code0;
1878 SLP_TREE_CODE (two) = ocode;
1879 SLP_TREE_LANES (one) = stmts.length ();
1880 SLP_TREE_LANES (two) = stmts.length ();
1881 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1882 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1883 return node;
1884 }
1885
1886 node = vect_create_new_slp_node (node, stmts, nops);
1887 SLP_TREE_VECTYPE (node) = vectype;
1888 SLP_TREE_CHILDREN (node).splice (children);
1889 return node;
1890 }
1891
1892 /* Dump a single SLP tree NODE. */
1893
1894 static void
1895 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1896 slp_tree node)
1897 {
1898 unsigned i, j;
1899 slp_tree child;
1900 stmt_vec_info stmt_info;
1901 tree op;
1902
1903 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1904 dump_user_location_t user_loc = loc.get_user_location ();
1905 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1906 SLP_TREE_DEF_TYPE (node) == vect_external_def
1907 ? " (external)"
1908 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1909 ? " (constant)"
1910 : ""), node,
1911 estimated_poly_value (node->max_nunits),
1912 SLP_TREE_REF_COUNT (node));
1913 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1914 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1915 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
1916 else
1917 {
1918 dump_printf_loc (metadata, user_loc, "\t{ ");
1919 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1920 dump_printf (metadata, "%T%s ", op,
1921 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
1922 dump_printf (metadata, "}\n");
1923 }
1924 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1925 {
1926 dump_printf_loc (metadata, user_loc, "\tload permutation {");
1927 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
1928 dump_printf (dump_kind, " %u", j);
1929 dump_printf (dump_kind, " }\n");
1930 }
1931 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1932 {
1933 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
1934 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
1935 dump_printf (dump_kind, " %u[%u]",
1936 SLP_TREE_LANE_PERMUTATION (node)[i].first,
1937 SLP_TREE_LANE_PERMUTATION (node)[i].second);
1938 dump_printf (dump_kind, " }\n");
1939 }
1940 if (SLP_TREE_CHILDREN (node).is_empty ())
1941 return;
1942 dump_printf_loc (metadata, user_loc, "\tchildren");
1943 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1944 dump_printf (dump_kind, " %p", (void *)child);
1945 dump_printf (dump_kind, "\n");
1946 }
1947
1948 DEBUG_FUNCTION void
1949 debug (slp_tree node)
1950 {
1951 debug_dump_context ctx;
1952 vect_print_slp_tree (MSG_NOTE,
1953 dump_location_t::from_location_t (UNKNOWN_LOCATION),
1954 node);
1955 }
1956
1957 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1958
1959 static void
1960 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1961 slp_tree node, hash_set<slp_tree> &visited)
1962 {
1963 unsigned i;
1964 slp_tree child;
1965
1966 if (visited.add (node))
1967 return;
1968
1969 vect_print_slp_tree (dump_kind, loc, node);
1970
1971 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1972 if (child)
1973 vect_print_slp_graph (dump_kind, loc, child, visited);
1974 }
1975
1976 static void
1977 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1978 slp_tree entry)
1979 {
1980 hash_set<slp_tree> visited;
1981 vect_print_slp_graph (dump_kind, loc, entry, visited);
1982 }
1983
1984 /* Mark the tree rooted at NODE with PURE_SLP. */
1985
1986 static void
1987 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
1988 {
1989 int i;
1990 stmt_vec_info stmt_info;
1991 slp_tree child;
1992
1993 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1994 return;
1995
1996 if (visited.add (node))
1997 return;
1998
1999 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2000 STMT_SLP_TYPE (stmt_info) = pure_slp;
2001
2002 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2003 if (child)
2004 vect_mark_slp_stmts (child, visited);
2005 }
2006
2007 static void
2008 vect_mark_slp_stmts (slp_tree node)
2009 {
2010 hash_set<slp_tree> visited;
2011 vect_mark_slp_stmts (node, visited);
2012 }
2013
2014 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2015
2016 static void
2017 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2018 {
2019 int i;
2020 stmt_vec_info stmt_info;
2021 slp_tree child;
2022
2023 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2024 return;
2025
2026 if (visited.add (node))
2027 return;
2028
2029 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2030 {
2031 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2032 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2033 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2034 }
2035
2036 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2037 if (child)
2038 vect_mark_slp_stmts_relevant (child, visited);
2039 }
2040
2041 static void
2042 vect_mark_slp_stmts_relevant (slp_tree node)
2043 {
2044 hash_set<slp_tree> visited;
2045 vect_mark_slp_stmts_relevant (node, visited);
2046 }
2047
2048
2049 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2050
2051 static void
2052 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2053 hash_set<slp_tree> &visited)
2054 {
2055 if (!node || visited.add (node))
2056 return;
2057
2058 if (SLP_TREE_CHILDREN (node).length () == 0)
2059 {
2060 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2061 return;
2062 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2063 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2064 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2065 loads.safe_push (node);
2066 }
2067 else
2068 {
2069 unsigned i;
2070 slp_tree child;
2071 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2072 vect_gather_slp_loads (loads, child, visited);
2073 }
2074 }
2075
2076 static void
2077 vect_gather_slp_loads (slp_instance inst, slp_tree node)
2078 {
2079 hash_set<slp_tree> visited;
2080 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
2081 }
2082
2083
2084 /* Find the last scalar stmt in NODE. */
2085
2086 stmt_vec_info
2087 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2088 {
2089 stmt_vec_info last = NULL;
2090 stmt_vec_info stmt_vinfo;
2091
2092 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2093 {
2094 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2095 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2096 }
2097
2098 return last;
2099 }
2100
2101 /* Find the first stmt in NODE. */
2102
2103 stmt_vec_info
2104 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2105 {
2106 stmt_vec_info first = NULL;
2107 stmt_vec_info stmt_vinfo;
2108
2109 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2110 {
2111 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2112 if (!first
2113 || get_later_stmt (stmt_vinfo, first) == first)
2114 first = stmt_vinfo;
2115 }
2116
2117 return first;
2118 }
2119
2120 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2121 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2122 (also containing the first GROUP1_SIZE stmts, since stores are
2123 consecutive), the second containing the remainder.
2124 Return the first stmt in the second group. */
2125
2126 static stmt_vec_info
2127 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2128 {
2129 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2130 gcc_assert (group1_size > 0);
2131 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2132 gcc_assert (group2_size > 0);
2133 DR_GROUP_SIZE (first_vinfo) = group1_size;
2134
2135 stmt_vec_info stmt_info = first_vinfo;
2136 for (unsigned i = group1_size; i > 1; i--)
2137 {
2138 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2139 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2140 }
2141 /* STMT is now the last element of the first group. */
2142 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2143 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2144
2145 DR_GROUP_SIZE (group2) = group2_size;
2146 for (stmt_info = group2; stmt_info;
2147 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2148 {
2149 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2150 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2151 }
2152
2153 /* For the second group, the DR_GROUP_GAP is that before the original group,
2154 plus skipping over the first vector. */
2155 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2156
2157 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2158 DR_GROUP_GAP (first_vinfo) += group2_size;
2159
2160 if (dump_enabled_p ())
2161 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2162 group1_size, group2_size);
2163
2164 return group2;
2165 }
2166
2167 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2168 statements and a vector of NUNITS elements. */
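/* For example, NUNITS = 4 and GROUP_SIZE = 6 yield a common multiple of 12
   and thus an unrolling factor of 2.  */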
2169
2170 static poly_uint64
2171 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2172 {
2173 return exact_div (common_multiple (nunits, group_size), group_size);
2174 }
2175
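/* The kinds of SLP instances we can build: from a group of stores, from a
   group of reductions, from a reduction chain or from a vector CONSTRUCTOR.  */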
2176 enum slp_instance_kind {
2177 slp_inst_kind_store,
2178 slp_inst_kind_reduc_group,
2179 slp_inst_kind_reduc_chain,
2180 slp_inst_kind_ctor
2181 };
2182
2183 static bool
2184 vect_analyze_slp_instance (vec_info *vinfo,
2185 scalar_stmts_to_slp_tree_map_t *bst_map,
2186 stmt_vec_info stmt_info, unsigned max_tree_size);
2187
2188 /* Analyze an SLP instance starting from SCALAR_STMTS which form a group
2189 of kind KIND. Return true if successful. */
2190
2191 static bool
2192 vect_build_slp_instance (vec_info *vinfo,
2193 slp_instance_kind kind,
2194 vec<stmt_vec_info> scalar_stmts,
2195 stmt_vec_info root_stmt_info,
2196 unsigned max_tree_size,
2197 scalar_stmts_to_slp_tree_map_t *bst_map,
2198 /* ??? We need stmt_info for group splitting. */
2199 stmt_vec_info stmt_info_)
2200 {
2201 if (dump_enabled_p ())
2202 {
2203 dump_printf_loc (MSG_NOTE, vect_location,
2204 "Starting SLP discovery for\n");
2205 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
2206 dump_printf_loc (MSG_NOTE, vect_location,
2207 " %G", scalar_stmts[i]->stmt);
2208 }
2209
2210 /* Build the tree for the SLP instance. */
2211 unsigned int group_size = scalar_stmts.length ();
2212 bool *matches = XALLOCAVEC (bool, group_size);
2213 unsigned npermutes = 0;
2214 poly_uint64 max_nunits = 1;
2215 unsigned tree_size = 0;
2216 unsigned i;
2217 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2218 &max_nunits, matches, &npermutes,
2219 &tree_size, bst_map);
2220 if (node != NULL)
2221 {
2222 /* Calculate the unrolling factor based on the smallest type. */
2223 poly_uint64 unrolling_factor
2224 = calculate_unrolling_factor (max_nunits, group_size);
2225
2226 if (maybe_ne (unrolling_factor, 1U)
2227 && is_a <bb_vec_info> (vinfo))
2228 {
2229 unsigned HOST_WIDE_INT const_max_nunits;
2230 if (!max_nunits.is_constant (&const_max_nunits)
2231 || const_max_nunits > group_size)
2232 {
2233 if (dump_enabled_p ())
2234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2235 "Build SLP failed: store group "
2236 "size not a multiple of the vector size "
2237 "in basic block SLP\n");
2238 vect_free_slp_tree (node);
2239 return false;
2240 }
2241 /* Fatal mismatch. */
2242 if (dump_enabled_p ())
2243 dump_printf_loc (MSG_NOTE, vect_location,
2244 "SLP discovery succeeded but node needs "
2245 "splitting\n");
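/* Pretend all lanes up to the largest vector-sized prefix matched and the
   next lane did not, so the group gets split at a vector boundary below.  */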
2246 memset (matches, true, group_size);
2247 matches[group_size / const_max_nunits * const_max_nunits] = false;
2248 vect_free_slp_tree (node);
2249 }
2250 else
2251 {
2252 /* Create a new SLP instance. */
2253 slp_instance new_instance = XNEW (class _slp_instance);
2254 SLP_INSTANCE_TREE (new_instance) = node;
2255 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2256 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2257 SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
2258 new_instance->reduc_phis = NULL;
2259 new_instance->cost_vec = vNULL;
2260 new_instance->subgraph_entries = vNULL;
2261
2262 vect_gather_slp_loads (new_instance, node);
2263 if (dump_enabled_p ())
2264 dump_printf_loc (MSG_NOTE, vect_location,
2265 "SLP size %u vs. limit %u.\n",
2266 tree_size, max_tree_size);
2267
2268 /* Check whether any load is possibly permuted. */
2269 slp_tree load_node;
2270 bool loads_permuted = false;
2271 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2272 {
2273 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2274 continue;
2275 unsigned j;
2276 stmt_vec_info load_info;
2277 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2278 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2279 {
2280 loads_permuted = true;
2281 break;
2282 }
2283 }
2284
2285 /* If the loads and stores can be handled with load/store-lane
2286 instructions, do not generate this SLP instance. */
2287 if (is_a <loop_vec_info> (vinfo)
2288 && loads_permuted
2289 && kind == slp_inst_kind_store
2290 && vect_store_lanes_supported
2291 (STMT_VINFO_VECTYPE (scalar_stmts[0]), group_size, false))
2292 {
2293 slp_tree load_node;
2294 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2295 {
2296 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2297 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2298 /* Use SLP for strided accesses (or if we can't load-lanes). */
2299 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2300 || ! vect_load_lanes_supported
2301 (STMT_VINFO_VECTYPE (stmt_vinfo),
2302 DR_GROUP_SIZE (stmt_vinfo), false))
2303 break;
2304 }
2305 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2306 {
2307 if (dump_enabled_p ())
2308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2309 "Built SLP cancelled: can use "
2310 "load/store-lanes\n");
2311 vect_free_slp_instance (new_instance);
2312 return false;
2313 }
2314 }
2315
2316 /* Fixup SLP reduction chains. */
2317 if (kind == slp_inst_kind_reduc_chain)
2318 {
2319 /* If this is a reduction chain with a conversion in front
2320 amend the SLP tree with a node for that. */
2321 gimple *scalar_def
2322 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2323 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
2324 {
2325 /* Get at the conversion stmt - we know it's the single use
2326 of the last stmt of the reduction chain. */
2327 use_operand_p use_p;
2328 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
2329 &use_p, &scalar_def);
2330 gcc_assert (r);
2331 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
2332 next_info = vect_stmt_to_vectorize (next_info);
2333 scalar_stmts = vNULL;
2334 scalar_stmts.create (group_size);
2335 for (unsigned i = 0; i < group_size; ++i)
2336 scalar_stmts.quick_push (next_info);
2337 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2338 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2339 SLP_TREE_CHILDREN (conv).quick_push (node);
2340 SLP_INSTANCE_TREE (new_instance) = conv;
2341 /* We also have to fake this conversion stmt as SLP reduction
2342 group so we don't have to mess with too much code
2343 elsewhere. */
2344 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2345 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2346 }
2347 /* Fill the backedge child of the PHI SLP node. The
2348 general matching code cannot find it because the
2349 scalar code does not reflect how we vectorize the
2350 reduction. */
2351 use_operand_p use_p;
2352 imm_use_iterator imm_iter;
2353 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
2354 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
2355 gimple_get_lhs (scalar_def))
2356 /* There are exactly two non-debug uses, the reduction
2357 PHI and the loop-closed PHI node. */
2358 if (!is_gimple_debug (USE_STMT (use_p))
2359 && gimple_bb (USE_STMT (use_p)) == loop->header)
2360 {
2361 auto_vec<stmt_vec_info, 64> phis (group_size);
2362 stmt_vec_info phi_info
2363 = vinfo->lookup_stmt (USE_STMT (use_p));
2364 for (unsigned i = 0; i < group_size; ++i)
2365 phis.quick_push (phi_info);
2366 slp_tree *phi_node = bst_map->get (phis);
2367 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
2368 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
2369 = SLP_INSTANCE_TREE (new_instance);
2370 SLP_INSTANCE_TREE (new_instance)->refcnt++;
2371 }
2372 }
2373
2374 vinfo->slp_instances.safe_push (new_instance);
2375
2376 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2377 the number of scalar stmts in the root in a few places.
2378 Verify that assumption holds. */
2379 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2380 .length () == group_size);
2381
2382 if (dump_enabled_p ())
2383 {
2384 dump_printf_loc (MSG_NOTE, vect_location,
2385 "Final SLP tree for instance %p:\n", new_instance);
2386 vect_print_slp_graph (MSG_NOTE, vect_location,
2387 SLP_INSTANCE_TREE (new_instance));
2388 }
2389
2390 return true;
2391 }
2392 }
2393 else
2394 {
2395 /* Failed to SLP. */
2396 /* Free the allocated memory. */
2397 scalar_stmts.release ();
2398 }
2399
2400 stmt_vec_info stmt_info = stmt_info_;
2401 /* Try to break the group up into pieces. */
2402 if (kind == slp_inst_kind_store)
2403 {
2404 /* ??? We could delay all the actual splitting of store-groups
2405 until after SLP discovery of the original group completed.
2406 Then we can recurse to vect_build_slp_instance directly. */
2407 for (i = 0; i < group_size; i++)
2408 if (!matches[i])
2409 break;
2410
2411 /* For basic block SLP, try to break the group up into multiples of
2412 a vector size. */
2413 if (is_a <bb_vec_info> (vinfo)
2414 && (i > 1 && i < group_size))
2415 {
2416 tree scalar_type
2417 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2418 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2419 1 << floor_log2 (i));
2420 unsigned HOST_WIDE_INT const_nunits;
2421 if (vectype
2422 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2423 {
2424 /* Split into two groups at the first vector boundary. */
2425 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2426 unsigned group1_size = i & ~(const_nunits - 1);
2427
2428 if (dump_enabled_p ())
2429 dump_printf_loc (MSG_NOTE, vect_location,
2430 "Splitting SLP group at stmt %u\n", i);
2431 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2432 group1_size);
2433 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2434 max_tree_size);
2435 /* Split the rest at the failure point and possibly
2436 re-analyze the remaining matching part if it has
2437 at least two lanes. */
2438 if (group1_size < i
2439 && (i + 1 < group_size
2440 || i - group1_size > 1))
2441 {
2442 stmt_vec_info rest2 = rest;
2443 rest = vect_split_slp_store_group (rest, i - group1_size);
2444 if (i - group1_size > 1)
2445 res |= vect_analyze_slp_instance (vinfo, bst_map,
2446 rest2, max_tree_size);
2447 }
2448 /* Re-analyze the non-matching tail if it has at least
2449 two lanes. */
2450 if (i + 1 < group_size)
2451 res |= vect_analyze_slp_instance (vinfo, bst_map,
2452 rest, max_tree_size);
2453 return res;
2454 }
2455 }
2456
2457 /* For loop vectorization split into arbitrary pieces of size > 1. */
2458 if (is_a <loop_vec_info> (vinfo)
2459 && (i > 1 && i < group_size))
2460 {
2461 unsigned group1_size = i;
2462
2463 if (dump_enabled_p ())
2464 dump_printf_loc (MSG_NOTE, vect_location,
2465 "Splitting SLP group at stmt %u\n", i);
2466
2467 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2468 group1_size);
2469 /* Loop vectorization cannot handle gaps in stores, make sure
2470 the split group appears as strided. */
2471 STMT_VINFO_STRIDED_P (rest) = 1;
2472 DR_GROUP_GAP (rest) = 0;
2473 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2474 DR_GROUP_GAP (stmt_info) = 0;
2475
2476 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2477 max_tree_size);
2478 if (i + 1 < group_size)
2479 res |= vect_analyze_slp_instance (vinfo, bst_map,
2480 rest, max_tree_size);
2481
2482 return res;
2483 }
2484
2485 /* Even though the first vector did not all match, we might be able to SLP
2486 (some) of the remainder. FORNOW ignore this possibility. */
2487 }
2488
2489 /* Failed to SLP. */
2490 if (dump_enabled_p ())
2491 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2492 return false;
2493 }
2494
2495
2496 /* Analyze an SLP instance starting from a group of grouped stores. Call
2497 vect_build_slp_tree to build a tree of packed stmts if possible.
2498 Return FALSE if it's impossible to SLP any stmt in the loop. */
2499
2500 static bool
2501 vect_analyze_slp_instance (vec_info *vinfo,
2502 scalar_stmts_to_slp_tree_map_t *bst_map,
2503 stmt_vec_info stmt_info, unsigned max_tree_size)
2504 {
2505 unsigned int group_size;
2506 unsigned int i;
2507 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2508 vec<stmt_vec_info> scalar_stmts;
2509 slp_instance_kind kind;
2510
2511 if (is_a <bb_vec_info> (vinfo))
2512 vect_location = stmt_info->stmt;
2513 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2514 {
2515 kind = slp_inst_kind_store;
2516 group_size = DR_GROUP_SIZE (stmt_info);
2517 }
2518 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2519 {
2520 kind = slp_inst_kind_reduc_chain;
2521 gcc_assert (is_a <loop_vec_info> (vinfo));
2522 group_size = REDUC_GROUP_SIZE (stmt_info);
2523 }
2524 else if (is_gimple_assign (stmt_info->stmt)
2525 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2526 {
2527 kind = slp_inst_kind_ctor;
2528 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2529 }
2530 else
2531 {
2532 kind = slp_inst_kind_reduc_group;
2533 gcc_assert (is_a <loop_vec_info> (vinfo));
2534 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2535 }
2536
2537 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2538 scalar_stmts.create (group_size);
2539 stmt_vec_info next_info = stmt_info;
2540 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2541 {
2542 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2543 while (next_info)
2544 {
2545 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2546 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2547 }
2548 }
2549 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2550 {
2551 /* Collect the reduction stmts and store them in
2552 SLP_TREE_SCALAR_STMTS. */
2553 while (next_info)
2554 {
2555 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2556 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2557 }
2558 /* Mark the first element of the reduction chain as reduction to properly
2559 transform the node. In the reduction analysis phase only the last
2560 element of the chain is marked as reduction. */
2561 STMT_VINFO_DEF_TYPE (stmt_info)
2562 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2563 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2564 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2565 }
2566 else if (kind == slp_inst_kind_ctor)
2567 {
2568 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2569 tree val;
2570 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2571 {
2572 if (TREE_CODE (val) == SSA_NAME)
2573 {
2574 gimple* def = SSA_NAME_DEF_STMT (val);
2575 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2576 /* Value is defined in another basic block. */
2577 if (!def_info)
2578 return false;
2579 def_info = vect_stmt_to_vectorize (def_info);
2580 scalar_stmts.safe_push (def_info);
2581 }
2582 else
2583 return false;
2584 }
2585 if (dump_enabled_p ())
2586 dump_printf_loc (MSG_NOTE, vect_location,
2587 "Analyzing vectorizable constructor: %G\n",
2588 stmt_info->stmt);
2589 }
2590 else
2591 {
2592 /* Collect reduction statements. */
2593 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2594 for (i = 0; reductions.iterate (i, &next_info); i++)
2595 scalar_stmts.safe_push (next_info);
2596 }
2597
2598 /* Build the tree for the SLP instance. */
2599 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
2600 kind == slp_inst_kind_ctor
2601 ? stmt_info : NULL,
2602 max_tree_size,
2603 bst_map, stmt_info);
2604
2605 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2606 where we should do store group splitting. */
2607
2608 return res;
2609 }
2610
2611 /* Check if there are stmts in the loop that can be vectorized using SLP. Build
2612 SLP trees of packed scalar stmts if SLP is possible. */
2613
2614 opt_result
2615 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2616 {
2617 unsigned int i;
2618 stmt_vec_info first_element;
2619
2620 DUMP_VECT_SCOPE ("vect_analyze_slp");
2621
2622 scalar_stmts_to_slp_tree_map_t *bst_map
2623 = new scalar_stmts_to_slp_tree_map_t ();
2624
2625 /* Find SLP sequences starting from groups of grouped stores. */
2626 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2627 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2628
2629 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2630 {
2631 if (loop_vinfo->reduction_chains.length () > 0)
2632 {
2633 /* Find SLP sequences starting from reduction chains. */
2634 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2635 if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2636 max_tree_size))
2637 {
2638 /* Dissolve reduction chain group. */
2639 stmt_vec_info vinfo = first_element;
2640 stmt_vec_info last = NULL;
2641 while (vinfo)
2642 {
2643 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2644 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2645 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2646 last = vinfo;
2647 vinfo = next;
2648 }
2649 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2650 /* It can be still vectorized as part of an SLP reduction. */
2651 loop_vinfo->reductions.safe_push (last);
2652 }
2653 }
2654
2655 /* Find SLP sequences starting from groups of reductions. */
2656 if (loop_vinfo->reductions.length () > 1)
2657 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2658 max_tree_size);
2659 }
2660
2661 /* The map keeps a reference on SLP nodes built, release that. */
2662 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2663 it != bst_map->end (); ++it)
2664 if ((*it).second)
2665 vect_free_slp_tree ((*it).second);
2666 delete bst_map;
2667
2668 return opt_result::success ();
2669 }
2670
2671 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2672
2673 static void
2674 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2675 vec<slp_tree> &vertices, vec<int> &leafs)
2676 {
2677 unsigned i;
2678 slp_tree child;
2679
2680 if (visited.add (node))
2681 return;
2682
2683 node->vertex = vertices.length ();
2684 vertices.safe_push (node);
2685
2686 bool leaf = true;
2687 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2688 if (child)
2689 {
2690 leaf = false;
2691 vect_slp_build_vertices (visited, child, vertices, leafs);
2692 }
2693 if (leaf)
2694 leafs.safe_push (node->vertex);
2695 }
2696
2697 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2698
2699 static void
2700 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2701 vec<int> &leafs)
2702 {
2703 hash_set<slp_tree> visited;
2704 unsigned i;
2705 slp_instance instance;
2706 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2707 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2708 leafs);
2709 }
2710
2711 /* Apply the bijective permutation PERM to VEC, or its inverse when REVERSE. */
2712
2713 template <class T>
2714 static void
2715 vect_slp_permute (vec<unsigned> perm,
2716 vec<T> &vec, bool reverse)
2717 {
2718 auto_vec<T, 64> saved;
2719 saved.create (vec.length ());
2720 for (unsigned i = 0; i < vec.length (); ++i)
2721 saved.quick_push (vec[i]);
2722
2723 if (reverse)
2724 {
2725 for (unsigned i = 0; i < vec.length (); ++i)
2726 vec[perm[i]] = saved[i];
2727 for (unsigned i = 0; i < vec.length (); ++i)
2728 gcc_assert (vec[perm[i]] == saved[i]);
2729 }
2730 else
2731 {
2732 for (unsigned i = 0; i < vec.length (); ++i)
2733 vec[i] = saved[perm[i]];
2734 for (unsigned i = 0; i < vec.length (); ++i)
2735 gcc_assert (vec[i] == saved[perm[i]]);
2736 }
2737 }
2738
2739 /* Return whether permutations PERM_A and PERM_B as recorded in the
2740 PERMS vector are equal. */
2741
2742 static bool
2743 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
2744 int perm_a, int perm_b)
2745 {
2746 return (perm_a == perm_b
2747 || (perms[perm_a].length () == perms[perm_b].length ()
2748 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
2749 sizeof (unsigned) * perms[perm_a].length ()) == 0));
2750 }
2751
2752 /* Optimize the SLP graph of VINFO. */
2753
2754 void
2755 vect_optimize_slp (vec_info *vinfo)
2756 {
2757 if (vinfo->slp_instances.is_empty ())
2758 return;
2759
2760 slp_tree node;
2761 unsigned i;
2762 auto_vec<slp_tree> vertices;
2763 auto_vec<int> leafs;
2764 vect_slp_build_vertices (vinfo, vertices, leafs);
2765
2766 struct graph *slpg = new_graph (vertices.length ());
2767 FOR_EACH_VEC_ELT (vertices, i, node)
2768 {
2769 unsigned j;
2770 slp_tree child;
2771 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2772 if (child)
2773 add_edge (slpg, i, child->vertex);
2774 }
2775
2776 /* Compute (reverse) postorder on the inverted graph. */
2777 auto_vec<int> ipo;
2778 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
2779
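/* Per-vertex propagation state: n_perm[i] is an index into PERMS (zero
   meaning no permutation) for vertex i, and n_materialize marks vertices
   where that permutation has to be materialized rather than propagated
   further.  */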
2780 auto_sbitmap n_visited (vertices.length ());
2781 auto_sbitmap n_materialize (vertices.length ());
2782 auto_vec<int> n_perm (vertices.length ());
2783 auto_vec<vec<unsigned> > perms;
2784
2785 bitmap_clear (n_visited);
2786 bitmap_clear (n_materialize);
2787 n_perm.quick_grow_cleared (vertices.length ());
2788 perms.safe_push (vNULL); /* zero is no permute */
2789
2790 /* Produce initial permutations. */
2791 for (i = 0; i < leafs.length (); ++i)
2792 {
2793 int idx = leafs[i];
2794 slp_tree node = vertices[idx];
2795
2796 /* Handle externals and constants optimistically throughout the
2797 iteration. */
2798 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
2799 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
2800 continue;
2801
2802 /* Loads are the only thing generating permutes and leafs do not
2803 change across iterations. */
2804 bitmap_set_bit (n_visited, idx);
2805 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2806 continue;
2807
2808 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2809 node unpermuted, record this permute. */
2810 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
2811 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
2812 continue;
2813 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
2814 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
2815 bool any_permute = false;
2816 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2817 {
2818 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
2819 imin = MIN (imin, idx);
2820 imax = MAX (imax, idx);
2821 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
2822 any_permute = true;
2823 }
2824 /* If there's no permute no need to split one out. */
2825 if (!any_permute)
2826 continue;
2827 /* If the span doesn't match we'd disrupt VF computation; avoid
2828 that for now. */
2829 if (imax - imin + 1 != SLP_TREE_LANES (node))
2830 continue;
2831
2832 /* For now only handle true permutes, like
2833 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2834 when permuting constants and invariants, keeping the permute
2835 bijective. */
2836 auto_sbitmap load_index (SLP_TREE_LANES (node));
2837 bitmap_clear (load_index);
2838 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2839 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
2840 unsigned j;
2841 for (j = 0; j < SLP_TREE_LANES (node); ++j)
2842 if (!bitmap_bit_p (load_index, j))
2843 break;
2844 if (j != SLP_TREE_LANES (node))
2845 continue;
2846
2847 vec<unsigned> perm = vNULL;
2848 perm.safe_grow (SLP_TREE_LANES (node), true);
2849 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2850 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
2851 perms.safe_push (perm);
2852 n_perm[idx] = perms.length () - 1;
2853 }
2854
2855 /* Propagate permutes along the graph and compute materialization points. */
2856 bool changed;
2857 unsigned iteration = 0;
2858 do
2859 {
2860 changed = false;
2861 ++iteration;
2862
2863 for (i = vertices.length (); i > 0 ; --i)
2864 {
2865 int idx = ipo[i-1];
2866 slp_tree node = vertices[idx];
2867 /* For leafs there's nothing to do - we've seeded permutes
2868 on those above. */
2869 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2870 continue;
2871
2872 bitmap_set_bit (n_visited, idx);
2873
2874 /* We cannot move a permute across a store. */
2875 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2876 && DR_IS_WRITE
2877 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
2878 continue;
2879
2880 int perm = -1;
2881 for (graph_edge *succ = slpg->vertices[idx].succ;
2882 succ; succ = succ->succ_next)
2883 {
2884 int succ_idx = succ->dest;
2885 /* Handle unvisited nodes optimistically. */
2886 /* ??? But for constants once we want to handle non-bijective
2887 permutes we have to verify the permute, when unifying lanes,
2888 will not unify different constants. For example see
2889 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2890 if (!bitmap_bit_p (n_visited, succ_idx))
2891 continue;
2892 int succ_perm = n_perm[succ_idx];
2893 /* Once we materialize the successor's permutation its output lanes
2894 appear unpermuted to us. */
2895 if (bitmap_bit_p (n_materialize, succ_idx))
2896 succ_perm = 0;
2897 if (perm == -1)
2898 perm = succ_perm;
2899 else if (succ_perm == 0)
2900 {
2901 perm = 0;
2902 break;
2903 }
2904 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
2905 {
2906 perm = 0;
2907 break;
2908 }
2909 }
2910
2911 if (perm == -1)
2912 /* Pick up pre-computed leaf values. */
2913 perm = n_perm[idx];
2914 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
2915 {
2916 if (iteration > 1)
2917 /* Make sure we eventually converge. */
2918 gcc_checking_assert (perm == 0);
2919 n_perm[idx] = perm;
2920 if (perm == 0)
2921 bitmap_clear_bit (n_materialize, idx);
2922 changed = true;
2923 }
2924
2925 if (perm == 0)
2926 continue;
2927
2928 /* Elide pruning at materialization points in the first
2929 iteration so every node gets visited at least once. */
2930 if (iteration == 1)
2931 continue;
2932
2933 /* Decide on permute materialization. Look whether there's
2934 a use (pred) edge that is permuted differently than us.
2935 In that case mark ourselves so the permutation is applied. */
2936 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
2937 for (graph_edge *pred = slpg->vertices[idx].pred;
2938 pred; pred = pred->pred_next)
2939 {
2940 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
2941 int pred_perm = n_perm[pred->src];
2942 if (!vect_slp_perms_eq (perms, perm, pred_perm))
2943 {
2944 all_preds_permuted = false;
2945 break;
2946 }
2947 }
2948 if (!all_preds_permuted)
2949 {
2950 if (!bitmap_bit_p (n_materialize, idx))
2951 changed = true;
2952 bitmap_set_bit (n_materialize, idx);
2953 }
2954 }
2955 }
2956 while (changed || iteration == 1);
2957
2958 /* Materialize. */
2959 for (i = 0; i < vertices.length (); ++i)
2960 {
2961 int perm = n_perm[i];
2962 if (perm <= 0)
2963 continue;
2964
2965 slp_tree node = vertices[i];
2966
2967 /* First permute invariant/external original successors. */
2968 unsigned j;
2969 slp_tree child;
2970 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2971 {
2972 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2973 continue;
2974
2975 /* If the vector is uniform there's nothing to do. */
2976 if (vect_slp_tree_uniform_p (child))
2977 continue;
2978
2979 /* We can end up sharing some externals via two_operator
2980 handling. Be prepared to unshare those. */
2981 if (child->refcnt != 1)
2982 {
2983 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
2984 SLP_TREE_CHILDREN (node)[j] = child
2985 = vect_create_new_slp_node
2986 (SLP_TREE_SCALAR_OPS (child).copy ());
2987 }
2988 vect_slp_permute (perms[perm],
2989 SLP_TREE_SCALAR_OPS (child), true);
2990 }
2991
2992 if (bitmap_bit_p (n_materialize, i))
2993 {
2994 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2995 /* For loads simply drop the permutation; the load permutation
2996 already performs the desired permutation. */
2997 ;
2998 else
2999 {
3000 if (dump_enabled_p ())
3001 dump_printf_loc (MSG_NOTE, vect_location,
3002 "inserting permute node in place of %p\n",
3003 node);
3004
3005 /* Make a copy of NODE and in-place change it to a
3006 VEC_PERM node to permute the lanes of the copy. */
3007 slp_tree copy = new _slp_tree;
3008 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3009 SLP_TREE_CHILDREN (node) = vNULL;
3010 SLP_TREE_SCALAR_STMTS (copy)
3011 = SLP_TREE_SCALAR_STMTS (node).copy ();
3012 vect_slp_permute (perms[perm],
3013 SLP_TREE_SCALAR_STMTS (copy), true);
3014 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3015 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3016 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3017 SLP_TREE_LANE_PERMUTATION (copy)
3018 = SLP_TREE_LANE_PERMUTATION (node);
3019 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3020 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3021 copy->refcnt = 1;
3022 copy->max_nunits = node->max_nunits;
3023 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3024 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3025 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3026
3027 /* Now turn NODE into a VEC_PERM. */
3028 SLP_TREE_CHILDREN (node).safe_push (copy);
3029 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3030 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3031 SLP_TREE_LANE_PERMUTATION (node)
3032 .quick_push (std::make_pair (0, perms[perm][j]));
3033 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3034 }
3035 }
3036 else
3037 {
3038 /* Apply the reverse permutation to our stmts. */
3039 vect_slp_permute (perms[perm],
3040 SLP_TREE_SCALAR_STMTS (node), true);
3041 /* And to the load permutation, which we can simply
3042 make regular by design. */
3043 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3044 {
3045 /* ??? When we handle non-bijective permutes the idea
3046 is that we can force the load-permutation to be
3047 { min, min + 1, min + 2, ... max }. But then the
3048 scalar defs might no longer match the lane content
3049 which means wrong-code with live lane vectorization.
3050 So we possibly have to have NULL entries for those. */
3051 vect_slp_permute (perms[perm],
3052 SLP_TREE_LOAD_PERMUTATION (node), true);
3053 }
3054 }
3055 }
3056
3057 /* Free the perms vector used for propagation. */
3058 while (!perms.is_empty ())
3059 perms.pop ().release ();
3060 free_graph (slpg);
3061
3062
3063 /* Now elide load permutations that are not necessary. */
3064 for (i = 0; i < leafs.length (); ++i)
3065 {
3066 node = vertices[leafs[i]];
3067 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3068 continue;
3069
3070 /* In basic block vectorization we allow any subchain of an interleaving
3071 chain.
3072 FORNOW: not in loop SLP because of realignment complications. */
3073 if (is_a <bb_vec_info> (vinfo))
3074 {
3075 bool subchain_p = true;
3076 stmt_vec_info next_load_info = NULL;
3077 stmt_vec_info load_info;
3078 unsigned j;
3079 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3080 {
3081 if (j != 0
3082 && (next_load_info != load_info
3083 || DR_GROUP_GAP (load_info) != 1))
3084 {
3085 subchain_p = false;
3086 break;
3087 }
3088 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3089 }
3090 if (subchain_p)
3091 {
3092 SLP_TREE_LOAD_PERMUTATION (node).release ();
3093 continue;
3094 }
3095 }
3096 else
3097 {
3098 stmt_vec_info load_info;
3099 bool this_load_permuted = false;
3100 unsigned j;
3101 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3102 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3103 {
3104 this_load_permuted = true;
3105 break;
3106 }
3107 stmt_vec_info first_stmt_info
3108 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3109 if (!this_load_permuted
3110 /* The load requires permutation when unrolling exposes
3111 a gap either because the group is larger than the SLP
3112 group-size or because there is a gap between the groups. */
3113 && (known_eq (LOOP_VINFO_VECT_FACTOR
3114 (as_a <loop_vec_info> (vinfo)), 1U)
3115 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3116 && DR_GROUP_GAP (first_stmt_info) == 0)))
3117 {
3118 SLP_TREE_LOAD_PERMUTATION (node).release ();
3119 continue;
3120 }
3121 }
3122 }
3123 }
3124
3125
3126 /* For each possible SLP instance decide whether to SLP it and calculate overall
3127 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3128 least one instance. */
3129
3130 bool
3131 vect_make_slp_decision (loop_vec_info loop_vinfo)
3132 {
3133 unsigned int i;
3134 poly_uint64 unrolling_factor = 1;
3135 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3136 slp_instance instance;
3137 int decided_to_slp = 0;
3138
3139 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3140
3141 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3142 {
3143 /* FORNOW: SLP if you can. */
3144 /* All unroll factors have the form:
3145
3146 GET_MODE_SIZE (vinfo->vector_mode) * X
3147
3148 for some rational X, so they must have a common multiple. */
3149 unrolling_factor
3150 = force_common_multiple (unrolling_factor,
3151 SLP_INSTANCE_UNROLLING_FACTOR (instance));
3152
3153 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3154 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3155 loop-based vectorization. Such stmts will be marked as HYBRID. */
3156 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3157 decided_to_slp++;
3158 }
3159
3160 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
3161
3162 if (decided_to_slp && dump_enabled_p ())
3163 {
3164 dump_printf_loc (MSG_NOTE, vect_location,
3165 "Decided to SLP %d instances. Unrolling factor ",
3166 decided_to_slp);
3167 dump_dec (MSG_NOTE, unrolling_factor);
3168 dump_printf (MSG_NOTE, "\n");
3169 }
3170
3171 return (decided_to_slp > 0);
3172 }
3173
3174 /* Private data for vect_detect_hybrid_slp. */
3175 struct vdhs_data
3176 {
3177 loop_vec_info loop_vinfo;
3178 vec<stmt_vec_info> *worklist;
3179 };
3180
3181 /* Walker for walk_gimple_op. */
3182
3183 static tree
3184 vect_detect_hybrid_slp (tree *tp, int *, void *data)
3185 {
3186 walk_stmt_info *wi = (walk_stmt_info *)data;
3187 vdhs_data *dat = (vdhs_data *)wi->info;
3188
3189 if (wi->is_lhs)
3190 return NULL_TREE;
3191
3192 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
3193 if (!def_stmt_info)
3194 return NULL_TREE;
3195 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3196 if (PURE_SLP_STMT (def_stmt_info))
3197 {
3198 if (dump_enabled_p ())
3199 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3200 def_stmt_info->stmt);
3201 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3202 dat->worklist->safe_push (def_stmt_info);
3203 }
3204
3205 return NULL_TREE;
3206 }
3207
3208 /* Find stmts that must be both vectorized and SLPed. */
3209
3210 void
3211 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3212 {
3213 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3214
3215 /* All stmts participating in SLP are marked pure_slp, all other
3216 stmts are loop_vect.
3217 First collect all loop_vect stmts into a worklist. */
3218 auto_vec<stmt_vec_info> worklist;
3219 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3220 {
3221 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3222 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3223 gsi_next (&gsi))
3224 {
3225 gphi *phi = gsi.phi ();
3226 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3227 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3228 worklist.safe_push (stmt_info);
3229 }
3230 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3231 gsi_next (&gsi))
3232 {
3233 gimple *stmt = gsi_stmt (gsi);
3234 if (is_gimple_debug (stmt))
3235 continue;
3236 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3237 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3238 {
3239 for (gimple_stmt_iterator gsi2
3240 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3241 !gsi_end_p (gsi2); gsi_next (&gsi2))
3242 {
3243 stmt_vec_info patt_info
3244 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3245 if (!STMT_SLP_TYPE (patt_info)
3246 && STMT_VINFO_RELEVANT (patt_info))
3247 worklist.safe_push (patt_info);
3248 }
3249 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3250 }
3251 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3252 worklist.safe_push (stmt_info);
3253 }
3254 }
3255
3256 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3257 mark any SLP vectorized stmt as hybrid.
3258 ??? We're visiting def stmts N times (once for each non-SLP and
3259 once for each hybrid-SLP use). */
3260 walk_stmt_info wi;
3261 vdhs_data dat;
3262 dat.worklist = &worklist;
3263 dat.loop_vinfo = loop_vinfo;
3264 memset (&wi, 0, sizeof (wi));
3265 wi.info = (void *)&dat;
3266 while (!worklist.is_empty ())
3267 {
3268 stmt_vec_info stmt_info = worklist.pop ();
3269 /* Since SSA operands are not set up for pattern stmts we need
3270 to use walk_gimple_op. */
3271 wi.is_lhs = 0;
3272 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3273 }
3274 }
3275
3276
3277 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3278
3279 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3280 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs)
3281 {
3282 for (unsigned i = 0; i < bbs.length (); ++i)
3283 {
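/* PHIs of the first (region entry) block are not added as region
   statements; only PHIs of the later blocks are.  */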
3284 if (i != 0)
3285 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3286 gsi_next (&si))
3287 {
3288 gphi *phi = si.phi ();
3289 gimple_set_uid (phi, 0);
3290 add_stmt (phi);
3291 }
3292 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3293 !gsi_end_p (gsi); gsi_next (&gsi))
3294 {
3295 gimple *stmt = gsi_stmt (gsi);
3296 gimple_set_uid (stmt, 0);
3297 if (is_gimple_debug (stmt))
3298 continue;
3299 add_stmt (stmt);
3300 }
3301 }
3302 }
3303
3304
3305 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3306 stmts in the basic blocks. */
3307
3308 _bb_vec_info::~_bb_vec_info ()
3309 {
3310 /* Reset region marker. */
3311 for (unsigned i = 0; i < bbs.length (); ++i)
3312 {
3313 if (i != 0)
3314 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3315 gsi_next (&si))
3316 {
3317 gphi *phi = si.phi ();
3318 gimple_set_uid (phi, -1);
3319 }
3320 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3321 !gsi_end_p (gsi); gsi_next (&gsi))
3322 {
3323 gimple *stmt = gsi_stmt (gsi);
3324 gimple_set_uid (stmt, -1);
3325 }
3326 }
3327 }
3328
3329 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3330 given that child nodes have already been processed, and that
3331 their def types currently match their SLP node's def type. */
3332
3333 static bool
3334 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3335 slp_instance node_instance,
3336 stmt_vector_for_cost *cost_vec)
3337 {
3338 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3339 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3340
3341 /* Calculate the number of vector statements to be created for the
3342 scalar stmts in this node. For SLP reductions it is equal to the
3343 number of vector statements in the children (which has already been
3344 calculated by the recursive call). Otherwise it is the number of
3345 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3346 VF divided by the number of elements in a vector. */
3347 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3348 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3349 {
3350 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3351 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3352 {
3353 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3354 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3355 break;
3356 }
3357 }
3358 else
3359 {
3360 poly_uint64 vf;
3361 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3362 vf = loop_vinfo->vectorization_factor;
3363 else
3364 vf = 1;
3365 unsigned int group_size = SLP_TREE_LANES (node);
3366 tree vectype = SLP_TREE_VECTYPE (node);
3367 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3368 = vect_get_num_vectors (vf * group_size, vectype);
3369 }
3370
3371 /* Handle purely internal nodes. */
3372 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3373 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3374
3375 if (is_a <bb_vec_info> (vinfo)
3376 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3377 {
3378 if (dump_enabled_p ())
3379 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3380 "desired vector type conflicts with earlier one "
3381 "for %G", stmt_info->stmt);
3382 return false;
3383 }
3384
3385 bool dummy;
3386 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3387 node, node_instance, cost_vec);
3388 }
3389
3390 /* Try to build NODE from scalars, returning true on success.
3391 NODE_INSTANCE is the SLP instance that contains NODE. */
3392
3393 static bool
3394 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3395 slp_instance node_instance)
3396 {
3397 stmt_vec_info stmt_info;
3398 unsigned int i;
3399
3400 if (!is_a <bb_vec_info> (vinfo)
3401 || node == SLP_INSTANCE_TREE (node_instance)
3402 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3403 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3404 return false;
3405
3406 if (dump_enabled_p ())
3407 dump_printf_loc (MSG_NOTE, vect_location,
3408 "Building vector operands of %p from scalars instead\n", node);
3409
3410 /* Don't remove and free the child nodes here, since they could be
3411 referenced by other structures. The analysis and scheduling phases
3412 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3413 unsigned int group_size = SLP_TREE_LANES (node);
3414 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3415 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3416 SLP_TREE_LOAD_PERMUTATION (node).release ();
3417 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3418 {
3419 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3420 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3421 }
3422 return true;
3423 }
3424
3425 /* Compute the prologue cost for invariant or constant operands represented
3426 by NODE. */
3427
3428 static void
3429 vect_prologue_cost_for_slp (slp_tree node,
3430 stmt_vector_for_cost *cost_vec)
3431 {
3432 /* There's a special case of an existing vector, which costs nothing. */
3433 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3434 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3435 return;
3436 /* Without looking at the actual initializer a vector of
3437 constants can be implemented as load from the constant pool.
3438 When all elements are the same we can use a splat. */
3439 tree vectype = SLP_TREE_VECTYPE (node);
3440 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3441 unsigned num_vects_to_check;
3442 unsigned HOST_WIDE_INT const_nunits;
3443 unsigned nelt_limit;
3444 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3445 && ! multiple_p (const_nunits, group_size))
3446 {
3447 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3448 nelt_limit = const_nunits;
3449 }
3450 else
3451 {
3452 /* If either the vector has variable length or the vectors
3453 are composed of repeated whole groups we only need to
3454 cost construction once. All vectors will be the same. */
3455 num_vects_to_check = 1;
3456 nelt_limit = group_size;
3457 }
3458 tree elt = NULL_TREE;
3459 unsigned nelt = 0;
3460 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3461 {
3462 unsigned si = j % group_size;
3463 if (nelt == 0)
3464 elt = SLP_TREE_SCALAR_OPS (node)[si];
3465 /* ??? We're just tracking whether all operands of a single
3466 vector initializer are the same, ideally we'd check if
3467 we emitted the same one already. */
3468 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3469 elt = NULL_TREE;
3470 nelt++;
3471 if (nelt == nelt_limit)
3472 {
3473 record_stmt_cost (cost_vec, 1,
3474 SLP_TREE_DEF_TYPE (node) == vect_external_def
3475 ? (elt ? scalar_to_vec : vec_construct)
3476 : vector_load,
3477 NULL, vectype, 0, vect_prologue);
3478 nelt = 0;
3479 }
3480 }
3481 }
3482
3483 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3484 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3485
3486 Return true if the operations are supported. */
3487
3488 static bool
3489 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3490 slp_instance node_instance,
3491 hash_set<slp_tree> &visited_set,
3492 vec<slp_tree> &visited_vec,
3493 stmt_vector_for_cost *cost_vec)
3494 {
3495 int i, j;
3496 slp_tree child;
3497
3498 /* Assume we can code-generate all invariants. */
3499 if (!node
3500 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
3501 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
3502 return true;
3503
3504 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
3505 {
3506 if (dump_enabled_p ())
3507 dump_printf_loc (MSG_NOTE, vect_location,
3508 "Failed cyclic SLP reference in %p", node);
3509 return false;
3510 }
3511 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
3512
3513 /* If we already analyzed the exact same set of scalar stmts we're done.
3514 We share the generated vector stmts for those. */
3515 if (visited_set.add (node))
3516 return true;
3517 visited_vec.safe_push (node);
3518
3519 bool res = true;
3520 unsigned visited_rec_start = visited_vec.length ();
3521 unsigned cost_vec_rec_start = cost_vec->length ();
3522 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3523 {
3524 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3525 visited_set, visited_vec,
3526 cost_vec);
3527 if (!res)
3528 break;
3529 }
3530
3531 if (res)
3532 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3533 cost_vec);
3534 /* If analysis failed we have to pop all recursively visited nodes
3535 plus ourselves. */
3536 if (!res)
3537 {
3538 while (visited_vec.length () >= visited_rec_start)
3539 visited_set.remove (visited_vec.pop ());
3540 cost_vec->truncate (cost_vec_rec_start);
3541 }
3542
3543 /* When the node can be vectorized, cost the invariant nodes it references.
3544 This is not done in DFS order to allow the referring node's
3545 vectorizable_* calls to nail down the invariant nodes' vector type
3546 and possibly unshare it if it needs a different vector type than
3547 other referrers. */
3548 if (res)
3549 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3550 if (child
3551 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
3552 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3553 /* Perform usual caching; note that code-generation still
3554 code-gens these nodes multiple times but we expect
3555 to CSE them later. */
3556 && !visited_set.add (child))
3557 {
3558 visited_vec.safe_push (child);
3559 /* ??? After auditing more code paths make a "default"
3560 and push the vector type from NODE to all children
3561 if it is not already set. */
3562 /* Compute the number of vectors to be generated. */
3563 tree vector_type = SLP_TREE_VECTYPE (child);
3564 if (!vector_type)
3565 {
3566 /* For shifts with a scalar argument we don't need
3567 to cost or code-generate anything.
3568 ??? Represent this more explicitly. */
3569 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3570 == shift_vec_info_type)
3571 && j == 1);
3572 continue;
3573 }
3574 unsigned group_size = SLP_TREE_LANES (child);
3575 poly_uint64 vf = 1;
3576 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3577 vf = loop_vinfo->vectorization_factor;
3578 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3579 = vect_get_num_vectors (vf * group_size, vector_type);
3580 /* And cost them. */
3581 vect_prologue_cost_for_slp (child, cost_vec);
3582 }
3583
3584 /* If this node or any of its children can't be vectorized, try pruning
3585 the tree here rather than felling the whole thing. */
3586 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3587 {
3588 /* We'll need to revisit this for invariant costing and for setting
3589 the number of vectorized stmts. */
3590 res = true;
3591 }
3592
3593 return res;
3594 }
3595
3596
3597 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3598 region and that can be vectorized using vectorizable_live_operation
3599 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
3600 the scalar code computing them to be retained. */
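/* A purely illustrative example: if lane 1 of a vectorized SLP node also
   feeds a scalar stmt outside the SLP region, the scalar def is marked
   STMT_VINFO_LIVE_P and later materialized via a lane extract
   (BIT_FIELD_REF) from the vector def; if that extract cannot be
   code-generated the original scalar computation is simply kept.  */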
3601
3602 static void
3603 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3604 slp_instance instance,
3605 stmt_vector_for_cost *cost_vec,
3606 hash_set<stmt_vec_info> &svisited,
3607 hash_set<slp_tree> &visited)
3608 {
3609 if (visited.add (node))
3610 return;
3611
3612 unsigned i;
3613 stmt_vec_info stmt_info;
3614 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
3615 bool all_visited = true;
3616 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3617 {
3618 if (svisited.contains (stmt_info))
3619 continue;
3620 all_visited = false;
3621 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3622 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
3623 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
3624 /* Only the pattern root stmt computes the original scalar value. */
3625 continue;
3626 bool mark_visited = true;
3627 gimple *orig_stmt = orig_stmt_info->stmt;
3628 ssa_op_iter op_iter;
3629 def_operand_p def_p;
3630 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3631 {
3632 imm_use_iterator use_iter;
3633 gimple *use_stmt;
3634 stmt_vec_info use_stmt_info;
3635 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3636 if (!is_gimple_debug (use_stmt))
3637 {
3638 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
3639 if (!use_stmt_info
3640 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3641 {
3642 STMT_VINFO_LIVE_P (stmt_info) = true;
3643 if (vectorizable_live_operation (bb_vinfo, stmt_info,
3644 NULL, node, instance, i,
3645 false, cost_vec))
3646 /* ??? So we know we can vectorize the live stmt
3647 from one SLP node. If we cannot do so from all
3648 or none consistently we'd have to record which
3649 SLP node (and lane) we want to use for the live
3650 operation. So make sure we can code-generate
3651 from all nodes. */
3652 mark_visited = false;
3653 else
3654 STMT_VINFO_LIVE_P (stmt_info) = false;
3655 BREAK_FROM_IMM_USE_STMT (use_iter);
3656 }
3657 }
3658 /* We have to verify whether we can insert the lane extract
3659 before all uses. The following is a conservative approximation.
3660 We cannot put this into vectorizable_live_operation because
3661 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3662 doesn't work.
3663 Note that while the fact that we emit code for loads at the
3664 first load should make this a non-problem, leaves we construct
3665 from scalars are vectorized after the last scalar def.
3666 ??? If we'd actually compute the insert location during
3667 analysis we could use sth less conservative than the last
3668 scalar stmt in the node for the dominance check. */
3669 /* ??? What remains is "live" uses in vector CTORs in the same
3670 SLP graph which is where those uses can end up code-generated
3671 right after their definition instead of close to their original
3672 use. But that would restrict us to code-generate lane-extracts
3673 from the latest stmt in a node. So we compensate for this
3674 during code-generation, simply not replacing uses for those
3675 hopefully rare cases. */
3676 if (STMT_VINFO_LIVE_P (stmt_info))
3677 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3678 if (!is_gimple_debug (use_stmt)
3679 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
3680 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3681 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
3682 {
3683 if (dump_enabled_p ())
3684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3685 "Cannot determine insertion place for "
3686 "lane extract\n");
3687 STMT_VINFO_LIVE_P (stmt_info) = false;
3688 mark_visited = true;
3689 }
3690 }
3691 if (mark_visited)
3692 svisited.add (stmt_info);
3693 }
3694 if (all_visited)
3695 return;
3696
3697 slp_tree child;
3698 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3699 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3700 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
3701 cost_vec, svisited, visited);
3702 }
3703
3704 /* Analyze statements in SLP instances of VINFO. Return true if the
3705 operations are supported. */
3706
3707 bool
3708 vect_slp_analyze_operations (vec_info *vinfo)
3709 {
3710 slp_instance instance;
3711 int i;
3712
3713 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3714
3715 hash_set<slp_tree> visited;
3716 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
3717 {
3718 auto_vec<slp_tree> visited_vec;
3719 stmt_vector_for_cost cost_vec;
3720 cost_vec.create (2);
3721 if (is_a <bb_vec_info> (vinfo))
3722 vect_location = instance->location ();
3723 if (!vect_slp_analyze_node_operations (vinfo,
3724 SLP_INSTANCE_TREE (instance),
3725 instance, visited, visited_vec,
3726 &cost_vec)
3727 /* Instances with a root stmt require vectorized defs for the
3728 SLP tree root. */
3729 || (SLP_INSTANCE_ROOT_STMT (instance)
3730 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
3731 != vect_internal_def)))
3732 {
3733 slp_tree node = SLP_INSTANCE_TREE (instance);
3734 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3735 if (dump_enabled_p ())
3736 dump_printf_loc (MSG_NOTE, vect_location,
3737 "removing SLP instance operations starting from: %G",
3738 stmt_info->stmt);
3739 vect_free_slp_instance (instance);
3740 vinfo->slp_instances.ordered_remove (i);
3741 cost_vec.release ();
3742 while (!visited_vec.is_empty ())
3743 visited.remove (visited_vec.pop ());
3744 }
3745 else
3746 {
3747 i++;
3748
3749 /* For BB vectorization remember the SLP graph entry
3750 cost for later. */
3751 if (is_a <bb_vec_info> (vinfo))
3752 instance->cost_vec = cost_vec;
3753 else
3754 {
3755 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
3756 cost_vec.release ();
3757 }
3758 }
3759 }
3760
3761 /* Compute vectorizable live stmts. */
3762 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3763 {
3764 hash_set<stmt_vec_info> svisited;
3765 hash_set<slp_tree> visited;
3766 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
3767 {
3768 vect_location = instance->location ();
3769 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
3770 instance, &instance->cost_vec, svisited,
3771 visited);
3772 }
3773 }
3774
3775 return !vinfo->slp_instances.is_empty ();
3776 }
3777
3778 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
3779 closing any chain of leaders encountered on the way. */
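/* For example (illustrative only): with INSTANCE_LEADER containing the
   entries { A -> B, B -> C, C -> C }, a query for A returns C and also
   rewrites the A and B entries to point directly at C, compressing the
   path for future lookups.  */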
3780
3781 static slp_instance
3782 get_ultimate_leader (slp_instance instance,
3783 hash_map<slp_instance, slp_instance> &instance_leader)
3784 {
3785 auto_vec<slp_instance *, 8> chain;
3786 slp_instance *tem;
3787 while (*(tem = instance_leader.get (instance)) != instance)
3788 {
3789 chain.safe_push (tem);
3790 instance = *tem;
3791 }
3792 while (!chain.is_empty ())
3793 *chain.pop () = instance;
3794 return instance;
3795 }
3796
3797 /* Worker of vect_bb_partition_graph, recurse on NODE. */
3798
3799 static void
3800 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
3801 slp_instance instance, slp_tree node,
3802 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
3803 hash_map<slp_instance, slp_instance> &instance_leader,
3804 hash_set<slp_tree> &visited)
3805 {
3806 stmt_vec_info stmt_info;
3807 unsigned i;
3808
3809 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3810 {
3811 bool existed_p;
3812 slp_instance &stmt_instance
3813 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
3814 if (!existed_p)
3815 ;
3816 else if (stmt_instance != instance)
3817 {
3818 /* If we're running into a previously marked stmt make us the
3819 leader of the current ultimate leader. This keeps the
3820 leader chain acyclic and works even when the current instance
3821 connects two previously independent graph parts. */
3822 slp_instance stmt_leader
3823 = get_ultimate_leader (stmt_instance, instance_leader);
3824 if (stmt_leader != instance)
3825 instance_leader.put (stmt_leader, instance);
3826 }
3827 stmt_instance = instance;
3828 }
3829
3830 if (visited.add (node))
3831 return;
3832
3833 slp_tree child;
3834 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3835 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3836 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
3837 instance_leader, visited);
3838 }
3839
3840 /* Partition the SLP graph into pieces that can be costed independently. */
3841
3842 static void
3843 vect_bb_partition_graph (bb_vec_info bb_vinfo)
3844 {
3845 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3846
3847 /* First walk the SLP graph assigning each involved scalar stmt a
3848 corresponding SLP graph entry and upon visiting a previously
3849 marked stmt, make the stmt's leader the current SLP graph entry. */
3850 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
3851 hash_map<slp_instance, slp_instance> instance_leader;
3852 hash_set<slp_tree> visited;
3853 slp_instance instance;
3854 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3855 {
3856 instance_leader.put (instance, instance);
3857 vect_bb_partition_graph_r (bb_vinfo,
3858 instance, SLP_INSTANCE_TREE (instance),
3859 stmt_to_instance, instance_leader,
3860 visited);
3861 }
3862
3863 /* Then collect entries to each independent subgraph. */
3864 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3865 {
3866 slp_instance leader = get_ultimate_leader (instance, instance_leader);
3867 leader->subgraph_entries.safe_push (instance);
3868 if (dump_enabled_p ()
3869 && leader != instance)
3870 dump_printf_loc (MSG_NOTE, vect_location,
3871 "instance %p is leader of %p\n",
3872 leader, instance);
3873 }
3874 }
3875
3876 /* Compute the scalar cost of the SLP node NODE and its children
3877 and record it in COST_VEC. Do not account defs that are marked in LIFE and
3878 update LIFE according to uses of NODE. */
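/* As an illustrative example: if the def of lane 0 also has a use outside
   the SLP graph, the scalar stmt has to be kept anyway, so (*life)[0] is
   set and neither it nor the defs it requires in the SLP children are
   counted towards the scalar cost that vectorization would save.  */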
3879
3880 static void
3881 vect_bb_slp_scalar_cost (vec_info *vinfo,
3882 slp_tree node, vec<bool, va_heap> *life,
3883 stmt_vector_for_cost *cost_vec,
3884 hash_set<slp_tree> &visited)
3885 {
3886 unsigned i;
3887 stmt_vec_info stmt_info;
3888 slp_tree child;
3889
3890 if (visited.add (node))
3891 return;
3892
3893 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3894 {
3895 ssa_op_iter op_iter;
3896 def_operand_p def_p;
3897
3898 if ((*life)[i])
3899 continue;
3900
3901 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3902 gimple *orig_stmt = orig_stmt_info->stmt;
3903
3904 /* If there is a non-vectorized use of the defs then the scalar
3905 stmt is kept live in which case we do not account it or any
3906 required defs in the SLP children in the scalar cost. This
3907 way we make the vectorization more costly when compared to
3908 the scalar cost. */
3909 if (!STMT_VINFO_LIVE_P (stmt_info))
3910 {
3911 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3912 {
3913 imm_use_iterator use_iter;
3914 gimple *use_stmt;
3915 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3916 if (!is_gimple_debug (use_stmt))
3917 {
3918 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3919 if (!use_stmt_info
3920 || !PURE_SLP_STMT
3921 (vect_stmt_to_vectorize (use_stmt_info)))
3922 {
3923 (*life)[i] = true;
3924 BREAK_FROM_IMM_USE_STMT (use_iter);
3925 }
3926 }
3927 }
3928 if ((*life)[i])
3929 continue;
3930 }
3931
3932 /* Count scalar stmts only once. */
3933 if (gimple_visited_p (orig_stmt))
3934 continue;
3935 gimple_set_visited (orig_stmt, true);
3936
3937 vect_cost_for_stmt kind;
3938 if (STMT_VINFO_DATA_REF (orig_stmt_info))
3939 {
3940 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
3941 kind = scalar_load;
3942 else
3943 kind = scalar_store;
3944 }
3945 else if (vect_nop_conversion_p (orig_stmt_info))
3946 continue;
3947 else
3948 kind = scalar_stmt;
3949 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
3950 SLP_TREE_VECTYPE (node), 0, vect_body);
3951 }
3952
3953 auto_vec<bool, 20> subtree_life;
3954 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3955 {
3956 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3957 {
3958 /* Do not directly pass LIFE to the recursive call, copy it to
3959 confine changes in the callee to the current child/subtree. */
3960 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3961 {
3962 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
3963 for (unsigned j = 0;
3964 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
3965 {
3966 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
3967 if (perm.first == i)
3968 subtree_life[perm.second] = (*life)[j];
3969 }
3970 }
3971 else
3972 {
3973 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
3974 subtree_life.safe_splice (*life);
3975 }
3976 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
3977 visited);
3978 subtree_life.truncate (0);
3979 }
3980 }
3981 }
3982
3983 /* Check if vectorization of the basic block is profitable for the
3984 subgraph denoted by SLP_INSTANCES. */
3985
3986 static bool
3987 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
3988 vec<slp_instance> slp_instances)
3989 {
3990 slp_instance instance;
3991 int i;
3992 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3993 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3994
3995 void *vect_target_cost_data = init_cost (NULL);
3996
3997 /* Calculate scalar cost and sum the cost for the vector stmts
3998 previously collected. */
3999 stmt_vector_for_cost scalar_costs;
4000 scalar_costs.create (0);
4001 hash_set<slp_tree> visited;
4002 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4003 {
4004 auto_vec<bool, 20> life;
4005 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
4006 true);
4007 vect_bb_slp_scalar_cost (bb_vinfo,
4008 SLP_INSTANCE_TREE (instance),
4009 &life, &scalar_costs, visited);
4010 add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
4011 instance->cost_vec.release ();
4012 }
4013 /* Unset visited flag. */
4014 stmt_info_for_cost *si;
4015 FOR_EACH_VEC_ELT (scalar_costs, i, si)
4016 gimple_set_visited (si->stmt_info->stmt, false);
4017
4018 void *scalar_target_cost_data = init_cost (NULL);
4019 add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
4020 scalar_costs.release ();
4021 unsigned dummy;
4022 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
4023 destroy_cost_data (scalar_target_cost_data);
4024
4025 /* Complete the target-specific vector cost calculation. */
4026 finish_cost (vect_target_cost_data, &vec_prologue_cost,
4027 &vec_inside_cost, &vec_epilogue_cost);
4028 destroy_cost_data (vect_target_cost_data);
4029
4030 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
4031
4032 if (dump_enabled_p ())
4033 {
4034 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4035 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
4036 vec_inside_cost);
4037 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
4038 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
4039 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
4040 }
4041
4042 /* Vectorization is profitable if its cost does not exceed the cost of the
4043 scalar version. Note that we err on the vector side for equal cost because
4044 the cost estimate is otherwise quite pessimistic (constant uses are
4045 free on the scalar side but cost a load on the vector side for
4046 example). */
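/* For illustration, with made-up numbers: given vec_inside_cost 4,
   vec_prologue_cost 1, vec_epilogue_cost 0 and scalar_cost 5, we have
   4 + 1 + 0 == 5 which is not greater than 5, so we vectorize.  */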
4047 if (vec_outside_cost + vec_inside_cost > scalar_cost)
4048 return false;
4049
4050 return true;
4051 }
4052
4053 /* Find any vectorizable constructors and add them to the grouped_stores
4054 array. */
4055
4056 static void
4057 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
4058 {
4059 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
4060 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
4061 !gsi_end_p (gsi); gsi_next (&gsi))
4062 {
4063 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
4064 if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
4065 continue;
4066
4067 tree rhs = gimple_assign_rhs1 (assign);
4068 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
4069 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
4070 CONSTRUCTOR_NELTS (rhs))
4071 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
4072 || uniform_vector_p (rhs))
4073 continue;
4074
4075 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
4076 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
4077 }
4078 }
4079
4080 /* Check if the region described by BB_VINFO can be vectorized, returning
4081 true if so. When returning false, set FATAL to true if the same failure
4082 would prevent vectorization at other vector sizes, false if it is still
4083 worth trying other sizes. N_STMTS is the number of statements in the
4084 region. */
4085
4086 static bool
4087 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
4088 vec<int> *dataref_groups)
4089 {
4090 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4091
4092 slp_instance instance;
4093 int i;
4094 poly_uint64 min_vf = 2;
4095
4096 /* The first group of checks is independent of the vector size. */
4097 fatal = true;
4098
4099 /* Analyze the data references. */
4100
4101 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
4102 {
4103 if (dump_enabled_p ())
4104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4105 "not vectorized: unhandled data-ref in basic "
4106 "block.\n");
4107 return false;
4108 }
4109
4110 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
4111 {
4112 if (dump_enabled_p ())
4113 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4114 "not vectorized: unhandled data access in "
4115 "basic block.\n");
4116 return false;
4117 }
4118
4119 vect_slp_check_for_constructors (bb_vinfo);
4120
4121 /* If there are no grouped stores and no constructors in the region
4122 there is no need to continue with pattern recog as vect_analyze_slp
4123 will fail anyway. */
4124 if (bb_vinfo->grouped_stores.is_empty ())
4125 {
4126 if (dump_enabled_p ())
4127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4128 "not vectorized: no grouped stores in "
4129 "basic block.\n");
4130 return false;
4131 }
4132
4133 /* The rest of the analysis below depends on the vector size in some way. */
4134 fatal = false;
4135
4136 vect_pattern_recog (bb_vinfo);
4137
4138 /* Check the SLP opportunities in the basic block, analyze and build SLP
4139 trees. */
4140 if (!vect_analyze_slp (bb_vinfo, n_stmts))
4141 {
4142 if (dump_enabled_p ())
4143 {
4144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4145 "Failed to SLP the basic block.\n");
4146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4147 "not vectorized: failed to find SLP opportunities "
4148 "in basic block.\n");
4149 }
4150 return false;
4151 }
4152
4153 /* Optimize permutations. */
4154 vect_optimize_slp (bb_vinfo);
4155
4156 vect_record_base_alignments (bb_vinfo);
4157
4158 /* Analyze and verify the alignment of data references and the
4159 dependence in the SLP instances. */
4160 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
4161 {
4162 vect_location = instance->location ();
4163 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
4164 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
4165 {
4166 slp_tree node = SLP_INSTANCE_TREE (instance);
4167 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4168 if (dump_enabled_p ())
4169 dump_printf_loc (MSG_NOTE, vect_location,
4170 "removing SLP instance operations starting from: %G",
4171 stmt_info->stmt);
4172 vect_free_slp_instance (instance);
4173 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
4174 continue;
4175 }
4176
4177 /* Mark all the statements that we want to vectorize as pure SLP and
4178 relevant. */
4179 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4180 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
4181 if (SLP_INSTANCE_ROOT_STMT (instance))
4182 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
4183
4184 i++;
4185 }
4186 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
4187 return false;
4188
4189 if (!vect_slp_analyze_operations (bb_vinfo))
4190 {
4191 if (dump_enabled_p ())
4192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4193 "not vectorized: bad operation in basic block.\n");
4194 return false;
4195 }
4196
4197 vect_bb_partition_graph (bb_vinfo);
4198
4199 return true;
4200 }
4201
4202 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4203 basic blocks in BBS, returning true on success.
4204 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4205
4206 static bool
4207 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
4208 vec<int> *dataref_groups, unsigned int n_stmts)
4209 {
4210 bb_vec_info bb_vinfo;
4211 auto_vector_modes vector_modes;
4212
4213 /* Autodetect first vector size we try. */
4214 machine_mode next_vector_mode = VOIDmode;
4215 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
4216 unsigned int mode_i = 0;
4217
4218 vec_info_shared shared;
4219
4220 machine_mode autodetected_vector_mode = VOIDmode;
4221 while (1)
4222 {
4223 bool vectorized = false;
4224 bool fatal = false;
4225 bb_vinfo = new _bb_vec_info (bbs, &shared);
4226
4227 bool first_time_p = shared.datarefs.is_empty ();
4228 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
4229 if (first_time_p)
4230 bb_vinfo->shared->save_datarefs ();
4231 else
4232 bb_vinfo->shared->check_datarefs ();
4233 bb_vinfo->vector_mode = next_vector_mode;
4234
4235 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
4236 && dbg_cnt (vect_slp))
4237 {
4238 if (dump_enabled_p ())
4239 {
4240 dump_printf_loc (MSG_NOTE, vect_location,
4241 "***** Analysis succeeded with vector mode"
4242 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4243 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4244 }
4245
4246 bb_vinfo->shared->check_datarefs ();
4247
4248 unsigned i;
4249 slp_instance instance;
4250 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4251 {
4252 if (instance->subgraph_entries.is_empty ())
4253 continue;
4254
4255 vect_location = instance->location ();
4256 if (!unlimited_cost_model (NULL)
4257 && !vect_bb_vectorization_profitable_p
4258 (bb_vinfo, instance->subgraph_entries))
4259 {
4260 if (dump_enabled_p ())
4261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4262 "not vectorized: vectorization is not "
4263 "profitable.\n");
4264 continue;
4265 }
4266
4267 if (!vectorized && dump_enabled_p ())
4268 dump_printf_loc (MSG_NOTE, vect_location,
4269 "Basic block will be vectorized "
4270 "using SLP\n");
4271 vectorized = true;
4272
4273 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4274
4275 unsigned HOST_WIDE_INT bytes;
4276 if (dump_enabled_p ())
4277 {
4278 if (GET_MODE_SIZE
4279 (bb_vinfo->vector_mode).is_constant (&bytes))
4280 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4281 "basic block part vectorized using %wu "
4282 "byte vectors\n", bytes);
4283 else
4284 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4285 "basic block part vectorized using "
4286 "variable length vectors\n");
4287 }
4288 }
4289 }
4290 else
4291 {
4292 if (dump_enabled_p ())
4293 dump_printf_loc (MSG_NOTE, vect_location,
4294 "***** Analysis failed with vector mode %s\n",
4295 GET_MODE_NAME (bb_vinfo->vector_mode));
4296 }
4297
4298 if (mode_i == 0)
4299 autodetected_vector_mode = bb_vinfo->vector_mode;
4300
4301 if (!fatal)
4302 while (mode_i < vector_modes.length ()
4303 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
4304 {
4305 if (dump_enabled_p ())
4306 dump_printf_loc (MSG_NOTE, vect_location,
4307 "***** The result for vector mode %s would"
4308 " be the same\n",
4309 GET_MODE_NAME (vector_modes[mode_i]));
4310 mode_i += 1;
4311 }
4312
4313 delete bb_vinfo;
4314
4315 if (mode_i < vector_modes.length ()
4316 && VECTOR_MODE_P (autodetected_vector_mode)
4317 && (related_vector_mode (vector_modes[mode_i],
4318 GET_MODE_INNER (autodetected_vector_mode))
4319 == autodetected_vector_mode)
4320 && (related_vector_mode (autodetected_vector_mode,
4321 GET_MODE_INNER (vector_modes[mode_i]))
4322 == vector_modes[mode_i]))
4323 {
4324 if (dump_enabled_p ())
4325 dump_printf_loc (MSG_NOTE, vect_location,
4326 "***** Skipping vector mode %s, which would"
4327 " repeat the analysis for %s\n",
4328 GET_MODE_NAME (vector_modes[mode_i]),
4329 GET_MODE_NAME (autodetected_vector_mode));
4330 mode_i += 1;
4331 }
4332
4333 if (vectorized
4334 || mode_i == vector_modes.length ()
4335 || autodetected_vector_mode == VOIDmode
4336 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4337 vector sizes will fail do not bother iterating. */
4338 || fatal)
4339 return vectorized;
4340
4341 /* Try the next biggest vector size. */
4342 next_vector_mode = vector_modes[mode_i++];
4343 if (dump_enabled_p ())
4344 dump_printf_loc (MSG_NOTE, vect_location,
4345 "***** Re-trying analysis with vector mode %s\n",
4346 GET_MODE_NAME (next_vector_mode));
4347 }
4348 }
4349
4350
4351 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
4352 true if anything in any of the basic blocks was vectorized. */
4353
4354 static bool
4355 vect_slp_bbs (vec<basic_block> bbs)
4356 {
4357 vec<data_reference_p> datarefs = vNULL;
4358 auto_vec<int> dataref_groups;
4359 int insns = 0;
4360 int current_group = 0;
4361
4362 for (unsigned i = 0; i < bbs.length (); i++)
4363 {
4364 basic_block bb = bbs[i];
4365 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
4366 gsi_next (&gsi))
4367 {
4368 gimple *stmt = gsi_stmt (gsi);
4369 if (is_gimple_debug (stmt))
4370 continue;
4371
4372 insns++;
4373
4374 if (gimple_location (stmt) != UNKNOWN_LOCATION)
4375 vect_location = stmt;
4376
4377 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
4378 &dataref_groups, current_group))
4379 ++current_group;
4380 }
4381 }
4382
4383 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
4384 }
4385
4386 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4387 true if anything in the basic-block was vectorized. */
4388
4389 bool
4390 vect_slp_bb (basic_block bb)
4391 {
4392 auto_vec<basic_block> bbs;
4393 bbs.safe_push (bb);
4394 return vect_slp_bbs (bbs);
4395 }
4396
4397 /* Main entry for the BB vectorizer. Analyze and transform all blocks
4398 of FUN, returning true if anything was vectorized. */
4399
4400 bool
4401 vect_slp_function (function *fun)
4402 {
4403 bool r = false;
4404 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
4405 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
4406
4407 /* For the moment split the function into pieces to avoid making
4408 the iteration on the vector mode moot. Split at points we know
4409 we do not handle well, namely CFG merges (SLP discovery doesn't
4410 handle non-loop-header PHIs) and loop exits. Since pattern
4411 recog requires reverse iteration to visit uses before defs,
4412 simply chop the RPO into pieces. */
4413 auto_vec<basic_block> bbs;
4414 for (unsigned i = 0; i < n; i++)
4415 {
4416 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
4417 bool split = false;
4418
4419 /* Split when a BB is not dominated by the first block. */
4420 if (!bbs.is_empty ()
4421 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
4422 {
4423 if (dump_enabled_p ())
4424 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4425 "splitting region at dominance boundary bb%d\n",
4426 bb->index);
4427 split = true;
4428 }
4429 /* Split when the loop determined by the first block
4430 is exited. This is because we eventually insert
4431 invariants at the beginning of the region. */
4432 else if (!bbs.is_empty ()
4433 && bbs[0]->loop_father != bb->loop_father
4434 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
4435 {
4436 if (dump_enabled_p ())
4437 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4438 "splitting region at loop %d exit at bb%d\n",
4439 bbs[0]->loop_father->num, bb->index);
4440 split = true;
4441 }
4442
4443 if (split && !bbs.is_empty ())
4444 {
4445 r |= vect_slp_bbs (bbs);
4446 bbs.truncate (0);
4447 bbs.quick_push (bb);
4448 }
4449 else
4450 bbs.safe_push (bb);
4451
4452 /* When this block ends with a stmt that defines a value, inserting
4453 a vector containing that definition after it would require inserting
4454 on edges. Avoid this for now. */
4455 if (gimple *last = last_stmt (bb))
4456 if (gimple_get_lhs (last)
4457 && is_ctrl_altering_stmt (last))
4458 {
4459 if (dump_enabled_p ())
4460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4461 "splitting region at control altering "
4462 "definition %G", last);
4463 r |= vect_slp_bbs (bbs);
4464 bbs.truncate (0);
4465 }
4466 }
4467
4468 if (!bbs.is_empty ())
4469 r |= vect_slp_bbs (bbs);
4470
4471 free (rpo);
4472
4473 return r;
4474 }
4475
4476 /* Build a variable-length vector in which the elements in ELTS are repeated
4477 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4478 RESULTS and add any new instructions to SEQ.
4479
4480 The approach we use is:
4481
4482 (1) Find a vector mode VM with integer elements of mode IM.
4483
4484 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4485 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4486 from small vectors to IM.
4487
4488 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4489
4490 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4491 correct byte contents.
4492
4493 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4494
4495 We try to find the largest IM for which this sequence works, in order
4496 to cut down on the number of interleaves. */
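/* A sketch of the idea with made-up sizes: suppose ELTS is { a, b, c, d }
   of a 16-bit type and we pick IM as a 32-bit integer mode, so NELTS' = 2.
   We view { a, b } and { c, d } as two IM values, splat each into its own
   VM vector (giving the element patterns abab... and cdcd...), and one
   interleaving VEC_PERM_EXPR of the two splats yields the repeating
   pattern abcdabcd..., which is finally VIEW_CONVERTed to VECTOR_TYPE.  */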
4497
4498 void
4499 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
4500 vec<tree> elts, unsigned int nresults,
4501 vec<tree> &results)
4502 {
4503 unsigned int nelts = elts.length ();
4504 tree element_type = TREE_TYPE (vector_type);
4505
4506 /* (1) Find a vector mode VM with integer elements of mode IM. */
4507 unsigned int nvectors = 1;
4508 tree new_vector_type;
4509 tree permutes[2];
4510 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
4511 &nvectors, &new_vector_type,
4512 permutes))
4513 gcc_unreachable ();
4514
4515 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4516 unsigned int partial_nelts = nelts / nvectors;
4517 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
4518
4519 tree_vector_builder partial_elts;
4520 auto_vec<tree, 32> pieces (nvectors * 2);
4521 pieces.quick_grow (nvectors * 2);
4522 for (unsigned int i = 0; i < nvectors; ++i)
4523 {
4524 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4525 ELTS' has mode IM. */
4526 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
4527 for (unsigned int j = 0; j < partial_nelts; ++j)
4528 partial_elts.quick_push (elts[i * partial_nelts + j]);
4529 tree t = gimple_build_vector (seq, &partial_elts);
4530 t = gimple_build (seq, VIEW_CONVERT_EXPR,
4531 TREE_TYPE (new_vector_type), t);
4532
4533 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4534 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
4535 }
4536
4537 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4538 correct byte contents.
4539
4540 We need to repeat the following operation log2(nvectors) times:
4541
4542 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4543 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4544
4545 However, if each input repeats every N elements and the VF is
4546 a multiple of N * 2, the HI result is the same as the LO. */
4547 unsigned int in_start = 0;
4548 unsigned int out_start = nvectors;
4549 unsigned int hi_start = nvectors / 2;
4550 /* A bound on the number of outputs needed to produce NRESULTS results
4551 in the final iteration. */
4552 unsigned int noutputs_bound = nvectors * nresults;
4553 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
4554 {
4555 noutputs_bound /= 2;
4556 unsigned int limit = MIN (noutputs_bound, nvectors);
4557 for (unsigned int i = 0; i < limit; ++i)
4558 {
4559 if ((i & 1) != 0
4560 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
4561 2 * in_repeat))
4562 {
4563 pieces[out_start + i] = pieces[out_start + i - 1];
4564 continue;
4565 }
4566
4567 tree output = make_ssa_name (new_vector_type);
4568 tree input1 = pieces[in_start + (i / 2)];
4569 tree input2 = pieces[in_start + (i / 2) + hi_start];
4570 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
4571 input1, input2,
4572 permutes[i & 1]);
4573 gimple_seq_add_stmt (seq, stmt);
4574 pieces[out_start + i] = output;
4575 }
4576 std::swap (in_start, out_start);
4577 }
4578
4579 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4580 results.reserve (nresults);
4581 for (unsigned int i = 0; i < nresults; ++i)
4582 if (i < nvectors)
4583 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
4584 pieces[in_start + i]));
4585 else
4586 results.quick_push (results[i - nvectors]);
4587 }
4588
4589
4590 /* For constant and loop invariant defs in OP_NODE this function creates
4591 vector defs that will be used in the vectorized stmts and stores them
4592 to SLP_TREE_VEC_DEFS of OP_NODE. */
4593
4594 static void
4595 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
4596 {
4597 unsigned HOST_WIDE_INT nunits;
4598 tree vec_cst;
4599 unsigned j, number_of_places_left_in_vector;
4600 tree vector_type;
4601 tree vop;
4602 int group_size = op_node->ops.length ();
4603 unsigned int vec_num, i;
4604 unsigned number_of_copies = 1;
4605 bool constant_p;
4606 gimple_seq ctor_seq = NULL;
4607 auto_vec<tree, 16> permute_results;
4608
4609 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
4610 vector_type = SLP_TREE_VECTYPE (op_node);
4611
4612 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
4613 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
4614 auto_vec<tree> voprnds (number_of_vectors);
4615
4616 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4617 created vectors. It is greater than 1 if unrolling is performed.
4618
4619 For example, we have two scalar operands, s1 and s2 (e.g., group of
4620 strided accesses of size two), while NUNITS is four (i.e., four scalars
4621 of this type can be packed in a vector). The output vector will contain
4622 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4623 will be 2).
4624
4625 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4626 containing the operands.
4627
4628 For example, NUNITS is four as before, and the group size is 8
4629 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4630 {s5, s6, s7, s8}. */
4631
4632 /* When using duplicate_and_interleave, we just need one element for
4633 each scalar statement. */
4634 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4635 nunits = group_size;
4636
4637 number_of_copies = nunits * number_of_vectors / group_size;
4638
4639 number_of_places_left_in_vector = nunits;
4640 constant_p = true;
4641 tree_vector_builder elts (vector_type, nunits, 1);
4642 elts.quick_grow (nunits);
4643 stmt_vec_info insert_after = NULL;
4644 for (j = 0; j < number_of_copies; j++)
4645 {
4646 tree op;
4647 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
4648 {
4649 /* Create 'vect_ = {op0,op1,...,opn}'. */
4650 number_of_places_left_in_vector--;
4651 tree orig_op = op;
4652 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
4653 {
4654 if (CONSTANT_CLASS_P (op))
4655 {
4656 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4657 {
4658 /* Can't use VIEW_CONVERT_EXPR for booleans because
4659 of possibly different sizes of scalar value and
4660 vector element. */
4661 if (integer_zerop (op))
4662 op = build_int_cst (TREE_TYPE (vector_type), 0);
4663 else if (integer_onep (op))
4664 op = build_all_ones_cst (TREE_TYPE (vector_type));
4665 else
4666 gcc_unreachable ();
4667 }
4668 else
4669 op = fold_unary (VIEW_CONVERT_EXPR,
4670 TREE_TYPE (vector_type), op);
4671 gcc_assert (op && CONSTANT_CLASS_P (op));
4672 }
4673 else
4674 {
4675 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
4676 gimple *init_stmt;
4677 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4678 {
4679 tree true_val
4680 = build_all_ones_cst (TREE_TYPE (vector_type));
4681 tree false_val
4682 = build_zero_cst (TREE_TYPE (vector_type));
4683 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
4684 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
4685 op, true_val,
4686 false_val);
4687 }
4688 else
4689 {
4690 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
4691 op);
4692 init_stmt
4693 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
4694 op);
4695 }
4696 gimple_seq_add_stmt (&ctor_seq, init_stmt);
4697 op = new_temp;
4698 }
4699 }
4700 elts[number_of_places_left_in_vector] = op;
4701 if (!CONSTANT_CLASS_P (op))
4702 constant_p = false;
4703 /* For BB vectorization we have to compute an insert location
4704 when a def is inside the analyzed region since we cannot
4705 simply insert at the BB start in this case. */
4706 stmt_vec_info opdef;
4707 if (TREE_CODE (orig_op) == SSA_NAME
4708 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
4709 && is_a <bb_vec_info> (vinfo)
4710 && (opdef = vinfo->lookup_def (orig_op)))
4711 {
4712 if (!insert_after)
4713 insert_after = opdef;
4714 else
4715 insert_after = get_later_stmt (insert_after, opdef);
4716 }
4717
4718 if (number_of_places_left_in_vector == 0)
4719 {
4720 if (constant_p
4721 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4722 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4723 vec_cst = gimple_build_vector (&ctor_seq, &elts);
4724 else
4725 {
4726 if (permute_results.is_empty ())
4727 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
4728 elts, number_of_vectors,
4729 permute_results);
4730 vec_cst = permute_results[number_of_vectors - j - 1];
4731 }
4732 if (!gimple_seq_empty_p (ctor_seq))
4733 {
4734 if (insert_after)
4735 {
4736 gimple_stmt_iterator gsi;
4737 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
4738 {
4739 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
4740 gsi_insert_seq_before (&gsi, ctor_seq,
4741 GSI_CONTINUE_LINKING);
4742 }
4743 else if (!stmt_ends_bb_p (insert_after->stmt))
4744 {
4745 gsi = gsi_for_stmt (insert_after->stmt);
4746 gsi_insert_seq_after (&gsi, ctor_seq,
4747 GSI_CONTINUE_LINKING);
4748 }
4749 else
4750 {
4751 /* When we want to insert after a def whose
4752 defining stmt throws, insert on the fallthru
4753 edge. */
4754 edge e = find_fallthru_edge
4755 (gimple_bb (insert_after->stmt)->succs);
4756 basic_block new_bb
4757 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
4758 gcc_assert (!new_bb);
4759 }
4760 }
4761 else
4762 vinfo->insert_seq_on_entry (NULL, ctor_seq);
4763 ctor_seq = NULL;
4764 }
4765 voprnds.quick_push (vec_cst);
4766 insert_after = NULL;
4767 number_of_places_left_in_vector = nunits;
4768 constant_p = true;
4769 elts.new_vector (vector_type, nunits, 1);
4770 elts.quick_grow (nunits);
4771 }
4772 }
4773 }
4774
4775 /* Since the vectors are created in the reverse order, we should invert
4776 them. */
4777 vec_num = voprnds.length ();
4778 for (j = vec_num; j != 0; j--)
4779 {
4780 vop = voprnds[j - 1];
4781 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4782 }
4783
4784 /* In case that VF is greater than the unrolling factor needed for the SLP
4785 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4786 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4787 to replicate the vectors. */
4788 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
4789 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
4790 i++)
4791 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4792 }
4793
4794 /* Get the Ith vectorized definition from SLP_NODE. */
4795
4796 tree
4797 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
4798 {
4799 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
4800 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
4801 else
4802 return SLP_TREE_VEC_DEFS (slp_node)[i];
4803 }
4804
4805 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
4806
4807 void
4808 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
4809 {
4810 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
4811 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
4812 {
4813 unsigned j;
4814 gimple *vec_def_stmt;
4815 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
4816 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
4817 }
4818 else
4819 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
4820 }
4821
4822 /* Get N vectorized definitions for SLP_NODE. */
4823
4824 void
4825 vect_get_slp_defs (vec_info *,
4826 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
4827 {
4828 if (n == -1U)
4829 n = SLP_TREE_CHILDREN (slp_node).length ();
4830
4831 for (unsigned i = 0; i < n; ++i)
4832 {
4833 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
4834 vec<tree> vec_defs = vNULL;
4835 vect_get_slp_defs (child, &vec_defs);
4836 vec_oprnds->quick_push (vec_defs);
4837 }
4838 }
4839
4840 /* Generate vector permute statements from a list of loads in DR_CHAIN.
4841 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
4842 permute statements for the SLP node NODE. Store the number of vector
4843 permute instructions in *N_PERMS and the number of vector load
4844 instructions in *N_LOADS. */
4845
4846 bool
4847 vect_transform_slp_perm_load (vec_info *vinfo,
4848 slp_tree node, vec<tree> dr_chain,
4849 gimple_stmt_iterator *gsi, poly_uint64 vf,
4850 bool analyze_only, unsigned *n_perms,
4851 unsigned int *n_loads)
4852 {
4853 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4854 int vec_index = 0;
4855 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4856 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
4857 unsigned int mask_element;
4858 machine_mode mode;
4859
4860 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
4861 return false;
4862
4863 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
4864
4865 mode = TYPE_MODE (vectype);
4866 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4867
4868 /* Initialize the vect stmts of NODE to properly insert the generated
4869 stmts later. */
4870 if (! analyze_only)
4871 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
4872 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
4873 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
4874
4875 /* Generate permutation masks for every NODE. Number of masks for each NODE
4876 is equal to GROUP_SIZE.
4877 E.g., we have a group of three nodes with three loads from the same
4878 location in each node, and the vector size is 4. I.e., we have an
4879 a0b0c0a1b1c1... sequence and we need to create the following vectors:
4880 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
4881 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
4882 ...
4883
4884 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
4885 The last mask is illegal since we assume two operands for permute
4886 operation, and the mask element values can't be outside that range.
4887 Hence, the last mask must be converted into {2,5,5,5}.
4888 For the first two permutations we need the first and the second input
4889 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
4890 we need the second and the third vectors: {b1,c1,a2,b2} and
4891 {c2,a3,b3,c3}. */
4892
4893 int vect_stmts_counter = 0;
4894 unsigned int index = 0;
4895 int first_vec_index = -1;
4896 int second_vec_index = -1;
4897 bool noop_p = true;
4898 *n_perms = 0;
4899
4900 vec_perm_builder mask;
4901 unsigned int nelts_to_build;
4902 unsigned int nvectors_per_build;
4903 unsigned int in_nlanes;
4904 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
4905 && multiple_p (nunits, group_size));
4906 if (repeating_p)
4907 {
4908 /* A single vector contains a whole number of copies of the node, so:
4909 (a) all permutes can use the same mask; and
4910 (b) the permutes only need a single vector input. */
4911 mask.new_vector (nunits, group_size, 3);
4912 nelts_to_build = mask.encoded_nelts ();
4913 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
4914 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
4915 }
4916 else
4917 {
4918 /* We need to construct a separate mask for each vector statement. */
4919 unsigned HOST_WIDE_INT const_nunits, const_vf;
4920 if (!nunits.is_constant (&const_nunits)
4921 || !vf.is_constant (&const_vf))
4922 return false;
4923 mask.new_vector (const_nunits, const_nunits, 1);
4924 nelts_to_build = const_vf * group_size;
4925 nvectors_per_build = 1;
4926 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
4927 }
4928 auto_sbitmap used_in_lanes (in_nlanes);
4929 bitmap_clear (used_in_lanes);
4930
4931 unsigned int count = mask.encoded_nelts ();
4932 mask.quick_grow (count);
4933 vec_perm_indices indices;
4934
4935 for (unsigned int j = 0; j < nelts_to_build; j++)
4936 {
4937 unsigned int iter_num = j / group_size;
4938 unsigned int stmt_num = j % group_size;
4939 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
4940 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
4941 bitmap_set_bit (used_in_lanes, i);
4942 if (repeating_p)
4943 {
4944 first_vec_index = 0;
4945 mask_element = i;
4946 }
4947 else
4948 {
4949 /* Enforced before the loop when !repeating_p. */
4950 unsigned int const_nunits = nunits.to_constant ();
4951 vec_index = i / const_nunits;
4952 mask_element = i % const_nunits;
4953 if (vec_index == first_vec_index
4954 || first_vec_index == -1)
4955 {
4956 first_vec_index = vec_index;
4957 }
4958 else if (vec_index == second_vec_index
4959 || second_vec_index == -1)
4960 {
4961 second_vec_index = vec_index;
4962 mask_element += const_nunits;
4963 }
4964 else
4965 {
4966 if (dump_enabled_p ())
4967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4968 "permutation requires at "
4969 "least three vectors %G",
4970 stmt_info->stmt);
4971 gcc_assert (analyze_only);
4972 return false;
4973 }
4974
4975 gcc_assert (mask_element < 2 * const_nunits);
4976 }
4977
4978 if (mask_element != index)
4979 noop_p = false;
4980 mask[index++] = mask_element;
4981
4982 if (index == count && !noop_p)
4983 {
4984 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
4985 if (!can_vec_perm_const_p (mode, indices))
4986 {
4987 if (dump_enabled_p ())
4988 {
4989 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4990 vect_location,
4991 "unsupported vect permute { ");
4992 for (i = 0; i < count; ++i)
4993 {
4994 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4995 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4996 }
4997 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4998 }
4999 gcc_assert (analyze_only);
5000 return false;
5001 }
5002
5003 ++*n_perms;
5004 }
5005
5006 if (index == count)
5007 {
5008 if (!analyze_only)
5009 {
5010 tree mask_vec = NULL_TREE;
5011
5012 if (! noop_p)
5013 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5014
5015 if (second_vec_index == -1)
5016 second_vec_index = first_vec_index;
5017
5018 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
5019 {
5020 /* Generate the permute statement if necessary. */
5021 tree first_vec = dr_chain[first_vec_index + ri];
5022 tree second_vec = dr_chain[second_vec_index + ri];
5023 gimple *perm_stmt;
5024 if (! noop_p)
5025 {
5026 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5027 tree perm_dest
5028 = vect_create_destination_var (gimple_assign_lhs (stmt),
5029 vectype);
5030 perm_dest = make_ssa_name (perm_dest);
5031 perm_stmt
5032 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5033 first_vec, second_vec,
5034 mask_vec);
5035 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
5036 gsi);
5037 }
5038 else
5039 /* If mask was NULL_TREE generate the requested
5040 identity transform. */
5041 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
5042
5043 /* Store the vector statement in NODE. */
5044 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
5045 }
5046 }
5047
5048 index = 0;
5049 first_vec_index = -1;
5050 second_vec_index = -1;
5051 noop_p = true;
5052 }
5053 }
5054
5055 if (n_loads)
5056 {
5057 if (repeating_p)
5058 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5059 else
5060 {
5061 /* Enforced above when !repeating_p. */
5062 unsigned int const_nunits = nunits.to_constant ();
5063 *n_loads = 0;
5064 bool load_seen = false;
5065 for (unsigned i = 0; i < in_nlanes; ++i)
5066 {
5067 if (i % const_nunits == 0)
5068 {
5069 if (load_seen)
5070 *n_loads += 1;
5071 load_seen = false;
5072 }
5073 if (bitmap_bit_p (used_in_lanes, i))
5074 load_seen = true;
5075 }
5076 if (load_seen)
5077 *n_loads += 1;
5078 }
5079 }
5080
5081 return true;
5082 }
5083
5084
5085 /* Vectorize the SLP permutations in NODE as specified
5086 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5087 child number and lane number.
5088 Interleaving of two two-lane two-child SLP subtrees (not supported):
5089 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5090 A blend of two four-lane two-child SLP subtrees:
5091 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5092 Highpart of a four-lane one-child SLP subtree (not supported):
5093 [ { 0, 2 }, { 0, 3 } ]
5094 Currently only a subset of these is supported by the code generation below. */
5095
5096 static bool
5097 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5098 slp_tree node, stmt_vector_for_cost *cost_vec)
5099 {
5100 tree vectype = SLP_TREE_VECTYPE (node);
5101
5102 /* ??? We currently only support the case where all vector input and output
5103 types are the same, while the SLP IL should really do a concat + select
5104 and thus accept arbitrary mismatches. */
5105 slp_tree child;
5106 unsigned i;
5107 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5108 if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
5109 {
5110 if (dump_enabled_p ())
5111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5112 "Unsupported lane permutation\n");
5113 return false;
5114 }
5115
5116 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
5117 gcc_assert (perm.length () == SLP_TREE_LANES (node));
5118 if (dump_enabled_p ())
5119 {
5120 dump_printf_loc (MSG_NOTE, vect_location,
5121 "vectorizing permutation");
5122 for (unsigned i = 0; i < perm.length (); ++i)
5123 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
5124 dump_printf (MSG_NOTE, "\n");
5125 }
5126
5127 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5128 if (!nunits.is_constant ())
5129 return false;
5130 unsigned HOST_WIDE_INT vf = 1;
5131 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
5132 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
5133 return false;
5134 unsigned olanes = vf * SLP_TREE_LANES (node);
5135 gcc_assert (multiple_p (olanes, nunits));
5136
5137 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
5138 from the { SLP operand, scalar lane } permutation as recorded in the
5139 SLP node as an intermediate step. This part should already work
5140 with SLP children with an arbitrary number of lanes. */
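/* An illustrative example: the blend [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   of two four-lane children, with four-lane vectors and VF 1, becomes
   [ { { 0, 0 }, 0 }, { { 1, 0 }, 1 }, { { 0, 0 }, 2 }, { { 1, 0 }, 3 } ],
   i.e. every lane is selected from vector 0 of the respective child.  */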
5141 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
5142 auto_vec<unsigned> active_lane;
5143 vperm.create (olanes);
5144 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
5145 for (unsigned i = 0; i < vf; ++i)
5146 {
5147 for (unsigned pi = 0; pi < perm.length (); ++pi)
5148 {
5149 std::pair<unsigned, unsigned> p = perm[pi];
5150 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
5151 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
5152 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
5153 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
5154 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
5155 }
5156 /* Advance to the next group. */
5157 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
5158 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
5159 }
5160
5161 if (dump_enabled_p ())
5162 {
5163 dump_printf_loc (MSG_NOTE, vect_location, "as");
5164 for (unsigned i = 0; i < vperm.length (); ++i)
5165 {
5166 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
5167 dump_printf (MSG_NOTE, ",");
5168 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
5169 vperm[i].first.first, vperm[i].first.second,
5170 vperm[i].second);
5171 }
5172 dump_printf (MSG_NOTE, "\n");
5173 }
5174
5175 /* We can only handle two-vector permutes, everything else should
5176 be lowered on the SLP level. The following is closely inspired
5177 by vect_transform_slp_perm_load and is supposed to eventually
5178 replace it.
5179 ??? As an intermediate step do code-gen in the SLP tree representation
5180 somehow? */
5181 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
5182 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
5183 unsigned int const_nunits = nunits.to_constant ();
5184 unsigned int index = 0;
5185 unsigned int mask_element;
5186 vec_perm_builder mask;
5187 mask.new_vector (const_nunits, const_nunits, 1);
5188 unsigned int count = mask.encoded_nelts ();
5189 mask.quick_grow (count);
5190 vec_perm_indices indices;
5191 unsigned nperms = 0;
5192 for (unsigned i = 0; i < vperm.length (); ++i)
5193 {
5194 mask_element = vperm[i].second;
5195 if (first_vec.first == -1U
5196 || first_vec == vperm[i].first)
5197 first_vec = vperm[i].first;
5198 else if (second_vec.first == -1U
5199 || second_vec == vperm[i].first)
5200 {
5201 second_vec = vperm[i].first;
5202 mask_element += const_nunits;
5203 }
5204 else
5205 {
5206 if (dump_enabled_p ())
5207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5208 "permutation requires at "
5209 "least three vectors");
5210 gcc_assert (!gsi);
5211 return false;
5212 }
5213
5214 mask[index++] = mask_element;
5215
5216 if (index == count)
5217 {
5218 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
5219 const_nunits);
5220 bool identity_p = indices.series_p (0, 1, 0, 1);
5221 if (!identity_p
5222 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5223 {
5224 if (dump_enabled_p ())
5225 {
5226 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5227 vect_location,
5228 "unsupported vect permute { ");
5229 for (i = 0; i < count; ++i)
5230 {
5231 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5232 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5233 }
5234 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5235 }
5236 gcc_assert (!gsi);
5237 return false;
5238 }
5239
5240 if (!identity_p)
5241 nperms++;
5242 if (gsi)
5243 {
5244 if (second_vec.first == -1U)
5245 second_vec = first_vec;
5246
5247 /* Generate the permute statement if necessary. */
5248 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
5249 tree first_def
5250 = vect_get_slp_vect_def (first_node, first_vec.second);
5251 gassign *perm_stmt;
5252 tree perm_dest = make_ssa_name (vectype);
5253 if (!identity_p)
5254 {
5255 slp_tree second_node
5256 = SLP_TREE_CHILDREN (node)[second_vec.first];
5257 tree second_def
5258 = vect_get_slp_vect_def (second_node, second_vec.second);
5259 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5260 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5261 first_def, second_def,
5262 mask_vec);
5263 }
5264 else
5265 /* We need a copy here in case the def was external. */
5266 perm_stmt = gimple_build_assign (perm_dest, first_def);
5267 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
5268 /* Store the vector statement in NODE. */
5269 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
5270 }
5271
5272 index = 0;
5273 first_vec = std::make_pair (-1U, -1U);
5274 second_vec = std::make_pair (-1U, -1U);
5275 }
5276 }
5277
5278 if (!gsi)
5279 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
5280
5281 return true;
5282 }
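The mask construction above can be viewed in isolation: lanes taken from the first distinct { operand, vector } source keep their element index, lanes from a second distinct source are offset by the number of vector elements, and a third distinct source makes the permutation unsupported at this level. The following is a minimal standalone sketch of that encoding (not GCC code; the pair-of-unsigned source id and all names are illustrative):

#include <cstdio>
#include <utility>
#include <vector>

typedef std::pair<unsigned, unsigned> source_id;  /* { operand, vector }  */

/* Build a VEC_PERM_EXPR-style mask for one output vector of NUNITS lanes.
   Returns false if more than two distinct sources would be needed.  */

static bool
build_two_input_mask (const std::vector<std::pair<source_id, unsigned> > &vperm,
                      unsigned nunits, std::vector<unsigned> &mask)
{
  source_id first (-1U, -1U), second (-1U, -1U);
  for (unsigned i = 0; i < nunits; ++i)
    {
      unsigned elt = vperm[i].second;
      if (first.first == -1U || first == vperm[i].first)
        first = vperm[i].first;
      else if (second.first == -1U || second == vperm[i].first)
        {
          second = vperm[i].first;
          elt += nunits;        /* Select from the second input.  */
        }
      else
        return false;           /* Needs at least three vectors.  */
      mask.push_back (elt);
    }
  return true;
}

int
main ()
{
  /* Interleaving lanes 0 and 1 of operand 0, vector 0 with operand 1,
     vector 0: for nunits == 4 the expected mask is { 0, 4, 1, 5 }.  */
  std::vector<std::pair<source_id, unsigned> > vperm
    = { { { 0, 0 }, 0 }, { { 1, 0 }, 0 }, { { 0, 0 }, 1 }, { { 1, 0 }, 1 } };
  std::vector<unsigned> mask;
  if (build_two_input_mask (vperm, 4, mask))
    {
      for (unsigned i = 0; i < mask.size (); ++i)
        printf ("%u ", mask[i]);
      printf ("\n");
    }
  return 0;
}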
5283
5284 /* Vectorize SLP NODE. */
5285
5286 static void
5287 vect_schedule_slp_node (vec_info *vinfo,
5288 slp_tree node, slp_instance instance)
5289 {
5290 gimple_stmt_iterator si;
5291 int i;
5292 slp_tree child;
5293
5294 /* For existing vectors there's nothing to do. */
5295 if (SLP_TREE_VEC_DEFS (node).exists ())
5296 return;
5297
5298 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
5299
5300 /* Vectorize externals and constants. */
5301 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5302 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5303 {
5304 /* ??? vectorizable_shift can end up using a scalar operand which is
5305 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5306 node in this case. */
5307 if (!SLP_TREE_VECTYPE (node))
5308 return;
5309
5310 vect_create_constant_vectors (vinfo, node);
5311 return;
5312 }
5313
5314 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5315
5316 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
5317 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
5318
5319 if (dump_enabled_p ())
5320 dump_printf_loc (MSG_NOTE, vect_location,
5321 "------>vectorizing SLP node starting from: %G",
5322 stmt_info->stmt);
5323
5324 if (STMT_VINFO_DATA_REF (stmt_info)
5325 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5326 {
5327 /* Vectorized loads go before the first scalar load to make it
5328 ready early; vectorized stores go before the last scalar
5329 stmt, which is where all uses are ready. */
5330 stmt_vec_info last_stmt_info = NULL;
5331 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
5332 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
5333 else /* DR_IS_WRITE */
5334 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
5335 si = gsi_for_stmt (last_stmt_info->stmt);
5336 }
5337 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
5338 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
5339 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
5340 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5341 {
5342 /* For PHI node vectorization we do not use the insertion iterator. */
5343 si = gsi_none ();
5344 }
5345 else
5346 {
5347 /* Emit other stmts after the children's vectorized defs, which is
5348 the earliest point possible. */
5349 gimple *last_stmt = NULL;
5350 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5351 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5352 {
5353 /* For fold-left reductions we are retaining the scalar
5354 reduction PHI but we still have SLP_TREE_NUMBER_OF_VEC_STMTS
5355 set, so the representation isn't perfect. Resort to the
5356 last scalar def here. */
5357 if (SLP_TREE_VEC_STMTS (child).is_empty ())
5358 {
5359 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
5360 == cycle_phi_info_type);
5361 gphi *phi = as_a <gphi *>
5362 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
5363 if (!last_stmt
5364 || vect_stmt_dominates_stmt_p (last_stmt, phi))
5365 last_stmt = phi;
5366 }
5367 /* We are emitting all vectorized stmts in the same place and
5368 the last one emitted is the latest.
5369 ??? Unless we have a load permutation applied and that
5370 figures to re-use an earlier generated load. */
5371 unsigned j;
5372 gimple *vstmt;
5373 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
5374 if (!last_stmt
5375 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5376 last_stmt = vstmt;
5377 }
5378 else if (!SLP_TREE_VECTYPE (child))
5379 {
5380 /* For externals used unvectorized we look at all scalar defs. */
5381 unsigned j;
5382 tree def;
5383 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
5384 if (TREE_CODE (def) == SSA_NAME
5385 && !SSA_NAME_IS_DEFAULT_DEF (def))
5386 {
5387 gimple *stmt = SSA_NAME_DEF_STMT (def);
5388 if (!last_stmt
5389 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
5390 last_stmt = stmt;
5391 }
5392 }
5393 else
5394 {
5395 /* For externals we have to look at all defs since their
5396 insertion place is decided per vector. But beware
5397 of pre-existing vectors where we need to make sure
5398 we do not insert before the region boundary. */
5399 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
5400 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
5401 last_stmt = gsi_stmt (gsi_after_labels
5402 (as_a <bb_vec_info> (vinfo)->bbs[0]));
5403 else
5404 {
5405 unsigned j;
5406 tree vdef;
5407 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
5408 if (TREE_CODE (vdef) == SSA_NAME
5409 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
5410 {
5411 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
5412 if (!last_stmt
5413 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5414 last_stmt = vstmt;
5415 }
5416 }
5417 }
5418 /* This can happen when all children are pre-existing vectors or
5419 constants. */
5420 if (!last_stmt)
5421 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
5422 if (is_a <gphi *> (last_stmt))
5423 si = gsi_after_labels (gimple_bb (last_stmt));
5424 else
5425 {
5426 si = gsi_for_stmt (last_stmt);
5427 gsi_next (&si);
5428 }
5429 }
5430
5431 bool done_p = false;
5432
5433 /* Handle purely internal nodes. */
5434 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5435 {
5436 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
5437 be shared with different SLP nodes (but usually it's the same
5438 operation apart from the case where the stmt is only there for denoting
5439 the actual scalar lane defs ...). So do not call vect_transform_stmt
5440 but open-code it here (partly). */
5441 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
5442 gcc_assert (done);
5443 done_p = true;
5444 }
5445 if (!done_p)
5446 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
5447 }
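The insertion-point logic above distinguishes loads (emitted before the first scalar load of the group), stores (emitted before the last scalar store, where all uses are ready) and everything else (emitted after the latest vectorized definition of any child). A minimal standalone sketch of that last rule follows (not GCC code; plain integers stand in for statement positions and dominance, and all names are illustrative):

#include <algorithm>
#include <cstdio>
#include <vector>

int
main ()
{
  /* Positions of the already emitted vectorized defs of three children;
     a real implementation compares gimple statements via dominance.  */
  std::vector<std::vector<int> > child_def_points;
  child_def_points.push_back (std::vector<int> ());
  child_def_points[0].push_back (3); child_def_points[0].push_back (7);
  child_def_points.push_back (std::vector<int> (1, 5));
  child_def_points.push_back (std::vector<int> ());
  child_def_points[2].push_back (2); child_def_points[2].push_back (4);

  int last = -1;
  for (unsigned i = 0; i < child_def_points.size (); ++i)
    for (unsigned j = 0; j < child_def_points[i].size (); ++j)
      last = std::max (last, child_def_points[i][j]);

  /* Emit the node's vector stmts right after the latest child def, the
     earliest point at which all operands are available.  If no child
     produced a def (all constants or pre-existing vectors), the code
     above falls back to the node's first scalar stmt instead.  */
  printf ("emit after point %d\n", last);
  return 0;
}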
5448
5449 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
5450 For loop vectorization this is done in vectorizable_call, but for SLP
5451 it needs to be deferred until the end of vect_schedule_slp, because multiple
5452 SLP instances may refer to the same scalar stmt. */
5453
5454 static void
5455 vect_remove_slp_scalar_calls (vec_info *vinfo,
5456 slp_tree node, hash_set<slp_tree> &visited)
5457 {
5458 gimple *new_stmt;
5459 gimple_stmt_iterator gsi;
5460 int i;
5461 slp_tree child;
5462 tree lhs;
5463 stmt_vec_info stmt_info;
5464
5465 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5466 return;
5467
5468 if (visited.add (node))
5469 return;
5470
5471 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5472 vect_remove_slp_scalar_calls (vinfo, child, visited);
5473
5474 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5475 {
5476 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
5477 if (!stmt || gimple_bb (stmt) == NULL)
5478 continue;
5479 if (is_pattern_stmt_p (stmt_info)
5480 || !PURE_SLP_STMT (stmt_info))
5481 continue;
5482 lhs = gimple_call_lhs (stmt);
5483 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
5484 gsi = gsi_for_stmt (stmt);
5485 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
5486 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
5487 }
5488 }
5489
5490 static void
5491 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
5492 {
5493 hash_set<slp_tree> visited;
5494 vect_remove_slp_scalar_calls (vinfo, node, visited);
5495 }
5496
5497 /* Vectorize the instance root. */
5498
5499 void
5500 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
5501 {
5502 gassign *rstmt = NULL;
5503
5504 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
5505 {
5506 gimple *child_stmt;
5507 int j;
5508
5509 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5510 {
5511 tree vect_lhs = gimple_get_lhs (child_stmt);
5512 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
5513 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
5514 TREE_TYPE (vect_lhs)))
5515 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
5516 vect_lhs);
5517 rstmt = gimple_build_assign (root_lhs, vect_lhs);
5518 break;
5519 }
5520 }
5521 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
5522 {
5523 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5524 gimple *child_stmt;
5525 int j;
5526 vec<constructor_elt, va_gc> *v;
5527 vec_alloc (v, nelts);
5528
5529 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5530 {
5531 CONSTRUCTOR_APPEND_ELT (v,
5532 NULL_TREE,
5533 gimple_get_lhs (child_stmt));
5534 }
5535 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
5536 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
5537 tree r_constructor = build_constructor (rtype, v);
5538 rstmt = gimple_build_assign (lhs, r_constructor);
5539 }
5540
5541 gcc_assert (rstmt);
5542
5543 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
5544 gsi_replace (&rgsi, rstmt, true);
5545 }
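For basic-block SLP the instance root handled here is typically a CONSTRUCTOR that assembles scalar lane values into a vector. A minimal source-level sketch of such a root follows (not GCC code; it uses the GNU vector_size extension and illustrative names, and the rewrite described in the comment is what the function above performs on the IL):

#include <cstdio>

typedef int v4si __attribute__ ((vector_size (16)));

int a[4] = { 1, 2, 3, 4 };
int b[4] = { 10, 20, 30, 40 };

int
main ()
{
  /* The four scalar additions form one SLP instance; the CONSTRUCTOR
     initializing X is the instance root.  After vectorization the lanes
     are computed by a single vector addition and the root is rewritten
     to use that vector def directly (or a CONSTRUCTOR of several vector
     defs when more than one vector statement was generated).  */
  v4si x = { a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3] };
  printf ("%d %d %d %d\n", x[0], x[1], x[2], x[3]);
  return 0;
}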
5546
5547 struct slp_scc_info
5548 {
5549 bool on_stack;
5550 int dfs;
5551 int lowlink;
5552 };
5553
5554 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
5555
5556 static void
5557 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
5558 hash_map<slp_tree, slp_scc_info> &scc_info,
5559 int &maxdfs, vec<slp_tree> &stack)
5560 {
5561 bool existed_p;
5562 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
5563 gcc_assert (!existed_p);
5564 info->dfs = maxdfs;
5565 info->lowlink = maxdfs;
5566 maxdfs++;
5567
5568 /* Leaf. */
5569 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5570 {
5571 info->on_stack = false;
5572 vect_schedule_slp_node (vinfo, node, instance);
5573 return;
5574 }
5575
5576 info->on_stack = true;
5577 stack.safe_push (node);
5578
5579 unsigned i;
5580 slp_tree child;
5581 /* DFS recurse. */
5582 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5583 {
5584 if (!child)
5585 continue;
5586 slp_scc_info *child_info = scc_info.get (child);
5587 if (!child_info)
5588 {
5589 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
5590 /* Recursion might have re-allocated the hash map entries. */
5591 info = scc_info.get (node);
5592 child_info = scc_info.get (child);
5593 info->lowlink = MIN (info->lowlink, child_info->lowlink);
5594 }
5595 else if (child_info->on_stack)
5596 info->lowlink = MIN (info->lowlink, child_info->dfs);
5597 }
5598 if (info->lowlink != info->dfs)
5599 return;
5600
5601 auto_vec<slp_tree, 4> phis_to_fixup;
5602
5603 /* Singleton. */
5604 if (stack.last () == node)
5605 {
5606 stack.pop ();
5607 info->on_stack = false;
5608 vect_schedule_slp_node (vinfo, node, instance);
5609 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
5610 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
5611 phis_to_fixup.quick_push (node);
5612 }
5613 else
5614 {
5615 /* SCC. */
5616 int last_idx = stack.length () - 1;
5617 while (stack[last_idx] != node)
5618 last_idx--;
5619 /* We can break the cycle at PHIs which have at least one child
5620 already code generated. Then we could re-start the DFS walk until
5621 all nodes in the SCC are covered (we might have new entries
5622 for only back-reachable nodes). But it's simpler to just
5623 iterate and schedule those that are ready. */
5624 unsigned todo = stack.length () - last_idx;
5625 do
5626 {
5627 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
5628 {
5629 slp_tree entry = stack[idx];
5630 if (!entry)
5631 continue;
5632 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
5633 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
5634 bool ready = !phi;
5635 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
5636 if (!child)
5637 {
5638 gcc_assert (phi);
5639 ready = true;
5640 break;
5641 }
5642 else if (scc_info.get (child)->on_stack)
5643 {
5644 if (!phi)
5645 {
5646 ready = false;
5647 break;
5648 }
5649 }
5650 else
5651 {
5652 if (phi)
5653 {
5654 ready = true;
5655 break;
5656 }
5657 }
5658 if (ready)
5659 {
5660 vect_schedule_slp_node (vinfo, entry, instance);
5661 scc_info.get (entry)->on_stack = false;
5662 stack[idx] = NULL;
5663 todo--;
5664 if (phi)
5665 phis_to_fixup.safe_push (entry);
5666 }
5667 }
5668 }
5669 while (todo != 0);
5670
5671 /* Pop the SCC. */
5672 stack.truncate (last_idx);
5673 }
5674
5675 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
5676 slp_tree phi_node;
5677 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
5678 {
5679 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
5680 edge_iterator ei;
5681 edge e;
5682 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
5683 {
5684 unsigned dest_idx = e->dest_idx;
5685 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
5686 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
5687 continue;
5688 /* Simply fill all args. */
5689 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
5690 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
5691 vect_get_slp_vect_def (child, i),
5692 e, gimple_phi_arg_location (phi, dest_idx));
5693 }
5694 }
5695 }
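The walk above is Tarjan's SCC algorithm adapted to scheduling: DFS and lowlink numbers identify when a complete strongly-connected component sits on the stack, singleton components are scheduled immediately, and cycle members are scheduled iteratively with PHIs acting as the cycle breakers. A minimal standalone sketch of the detection part follows (not GCC code; the toy dependence graph and all names are illustrative, and it merely prints each component as it is completed instead of scheduling it):

#include <algorithm>
#include <cstdio>
#include <vector>

struct scc_info { bool on_stack; int dfs; int lowlink; };

static std::vector<std::vector<int> > succs;  /* children of each node  */
static std::vector<scc_info> info;
static std::vector<int> sccstack;
static int maxdfs;

static void
walk (int node)
{
  info[node].dfs = info[node].lowlink = maxdfs++;
  info[node].on_stack = true;
  sccstack.push_back (node);

  for (unsigned i = 0; i < succs[node].size (); ++i)
    {
      int child = succs[node][i];
      if (info[child].dfs == -1)
        {
          walk (child);
          info[node].lowlink = std::min (info[node].lowlink,
                                         info[child].lowlink);
        }
      else if (info[child].on_stack)
        info[node].lowlink = std::min (info[node].lowlink, info[child].dfs);
    }

  if (info[node].lowlink != info[node].dfs)
    return;

  /* Pop one complete SCC; this is the point where the vectorizer schedules
     its members, PHIs first so they break the cycle.  */
  printf ("SCC:");
  int n;
  do
    {
      n = sccstack.back ();
      sccstack.pop_back ();
      info[n].on_stack = false;
      printf (" %d", n);
    }
  while (n != node);
  printf ("\n");
}

int
main ()
{
  /* Node 0 depends on 1 and 2; nodes 1 and 2 form a cycle, like a
     reduction PHI and its update statement.  */
  succs.resize (3);
  succs[0].push_back (1); succs[0].push_back (2);
  succs[1].push_back (2);
  succs[2].push_back (1);
  scc_info init = { false, -1, -1 };
  info.assign (succs.size (), init);
  walk (0);
  return 0;
}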
5696
5697 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
5698
5699 void
5700 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
5701 {
5702 slp_instance instance;
5703 unsigned int i;
5704
5705 hash_map<slp_tree, slp_scc_info> scc_info;
5706 int maxdfs = 0;
5707 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5708 {
5709 slp_tree node = SLP_INSTANCE_TREE (instance);
5710 if (dump_enabled_p ())
5711 {
5712 dump_printf_loc (MSG_NOTE, vect_location,
5713 "Vectorizing SLP tree:\n");
5714 if (SLP_INSTANCE_ROOT_STMT (instance))
5715 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
5716 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
5717 vect_print_slp_graph (MSG_NOTE, vect_location,
5718 SLP_INSTANCE_TREE (instance));
5719 }
5720 /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
5721 makes a PHI the node breaking the cycle. */
5722 auto_vec<slp_tree> stack;
5723 if (!scc_info.get (node))
5724 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
5725
5726 if (SLP_INSTANCE_ROOT_STMT (instance))
5727 vectorize_slp_instance_root_stmt (node, instance);
5728
5729 if (dump_enabled_p ())
5730 dump_printf_loc (MSG_NOTE, vect_location,
5731 "vectorizing stmts using SLP.\n");
5732 }
5733
5734 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5735 {
5736 slp_tree root = SLP_INSTANCE_TREE (instance);
5737 stmt_vec_info store_info;
5738 unsigned int j;
5739
5740 /* Remove scalar call stmts. Do not do this for basic-block
5741 vectorization as not all uses may be vectorized.
5742 ??? Why should this be necessary? DCE should be able to
5743 remove the stmts itself.
5744 ??? For BB vectorization we can as well remove scalar
5745 stmts starting from the SLP tree root if they have no
5746 uses. */
5747 if (is_a <loop_vec_info> (vinfo))
5748 vect_remove_slp_scalar_calls (vinfo, root);
5749
5750 /* Remove the vectorized stores' original scalar stmts. */
5751 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
5752 {
5753 if (!STMT_VINFO_DATA_REF (store_info)
5754 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
5755 break;
5756
5757 store_info = vect_orig_stmt (store_info);
5758 /* Free the attached stmt_vec_info and remove the stmt. */
5759 vinfo->remove_stmt (store_info);
5760 }
5761 }
5762 }
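The cleanup loop above removes the original scalar statements of a vectorized store group; for an instance whose root is not a store group the first scalar stmt already fails the data-ref/write test and nothing is removed. A minimal standalone sketch of that stopping rule follows (not GCC code; the stmt record and all names are illustrative, and the third entry is a non-store only to show the early exit):

#include <cstdio>
#include <vector>

struct scalar_stmt
{
  bool has_dataref;   /* stands in for STMT_VINFO_DATA_REF  */
  bool is_write;      /* stands in for DR_IS_WRITE  */
  bool removed;
};

int
main ()
{
  std::vector<scalar_stmt> stmts;
  scalar_stmt store = { true, true, false };
  scalar_stmt other = { true, false, false };
  stmts.push_back (store);
  stmts.push_back (store);
  stmts.push_back (other);

  for (unsigned i = 0; i < stmts.size (); ++i)
    {
      if (!stmts[i].has_dataref || !stmts[i].is_write)
        break;
      /* A real implementation first maps back to the original
         (pre-pattern) stmt, as vect_orig_stmt does above, before
         removing it from the IL.  */
      stmts[i].removed = true;
    }

  for (unsigned i = 0; i < stmts.size (); ++i)
    printf ("stmt %u %s\n", i, stmts[i].removed ? "removed" : "kept");
  return 0;
}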