gcc/tree-vect-slp.c
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
52
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 slp_tree, stmt_vector_for_cost *);
55
56 static object_allocator<_slp_tree> *slp_tree_pool;
57 static slp_tree slp_first_node;
58
59 void
60 vect_slp_init (void)
61 {
62 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
63 }
64
65 void
66 vect_slp_fini (void)
67 {
68 while (slp_first_node)
69 delete slp_first_node;
70 delete slp_tree_pool;
71 slp_tree_pool = NULL;
72 }
73
74 void *
75 _slp_tree::operator new (size_t n)
76 {
77 gcc_assert (n == sizeof (_slp_tree));
78 return slp_tree_pool->allocate_raw ();
79 }
80
81 void
82 _slp_tree::operator delete (void *node, size_t n)
83 {
84 gcc_assert (n == sizeof (_slp_tree));
85 slp_tree_pool->remove_raw (node);
86 }
87
88
89 /* Initialize a SLP node. */
90
91 _slp_tree::_slp_tree ()
92 {
93 this->prev_node = NULL;
94 if (slp_first_node)
95 slp_first_node->prev_node = this;
96 this->next_node = slp_first_node;
97 slp_first_node = this;
98 SLP_TREE_SCALAR_STMTS (this) = vNULL;
99 SLP_TREE_SCALAR_OPS (this) = vNULL;
100 SLP_TREE_VEC_STMTS (this) = vNULL;
101 SLP_TREE_VEC_DEFS (this) = vNULL;
102 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
103 SLP_TREE_CHILDREN (this) = vNULL;
104 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
105 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
106 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
107 SLP_TREE_CODE (this) = ERROR_MARK;
108 SLP_TREE_VECTYPE (this) = NULL_TREE;
109 SLP_TREE_REPRESENTATIVE (this) = NULL;
110 SLP_TREE_REF_COUNT (this) = 1;
111 this->max_nunits = 1;
112 this->lanes = 0;
113 }
114
115 /* Tear down a SLP node. */
116
117 _slp_tree::~_slp_tree ()
118 {
119 if (this->prev_node)
120 this->prev_node->next_node = this->next_node;
121 else
122 slp_first_node = this->next_node;
123 if (this->next_node)
124 this->next_node->prev_node = this->prev_node;
125 SLP_TREE_CHILDREN (this).release ();
126 SLP_TREE_SCALAR_STMTS (this).release ();
127 SLP_TREE_SCALAR_OPS (this).release ();
128 SLP_TREE_VEC_STMTS (this).release ();
129 SLP_TREE_VEC_DEFS (this).release ();
130 SLP_TREE_LOAD_PERMUTATION (this).release ();
131 SLP_TREE_LANE_PERMUTATION (this).release ();
132 }
133
134 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
135
136 void
137 vect_free_slp_tree (slp_tree node)
138 {
139 int i;
140 slp_tree child;
141
142 if (--SLP_TREE_REF_COUNT (node) != 0)
143 return;
144
145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
146 if (child)
147 vect_free_slp_tree (child);
148
149 /* If the node defines any SLP only patterns then those patterns are no
150 longer valid and should be removed. */
151 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
152 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
153 {
154 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
155 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
156 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
157 }
158
159 delete node;
160 }
161
162 /* Return a location suitable for dumps related to the SLP instance. */
163
164 dump_user_location_t
165 _slp_instance::location () const
166 {
167 if (root_stmt)
168 return root_stmt->stmt;
169 else
170 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
171 }
172
173
174 /* Free the memory allocated for the SLP instance. */
175
176 void
177 vect_free_slp_instance (slp_instance instance)
178 {
179 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
180 SLP_INSTANCE_LOADS (instance).release ();
181 instance->subgraph_entries.release ();
182 instance->cost_vec.release ();
183 free (instance);
184 }
185
186
187 /* Create an SLP node with CODE and room for NOPS children. */
188
189 slp_tree
190 vect_create_new_slp_node (unsigned nops, tree_code code)
191 {
192 slp_tree node = new _slp_tree;
193 SLP_TREE_SCALAR_STMTS (node) = vNULL;
194 SLP_TREE_CHILDREN (node).create (nops);
195 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
196 SLP_TREE_CODE (node) = code;
197 return node;
198 }
199 /* Create an SLP node for SCALAR_STMTS. */
200
201 static slp_tree
202 vect_create_new_slp_node (slp_tree node,
203 vec<stmt_vec_info> scalar_stmts, unsigned nops)
204 {
205 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
206 SLP_TREE_CHILDREN (node).create (nops);
207 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
208 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
209 SLP_TREE_LANES (node) = scalar_stmts.length ();
210 return node;
211 }
212
213 /* Create an SLP node for SCALAR_STMTS. */
214
215 static slp_tree
216 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
217 {
218 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
219 }
220
221 /* Create an SLP node for OPS. */
222
223 static slp_tree
224 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
225 {
226 SLP_TREE_SCALAR_OPS (node) = ops;
227 SLP_TREE_DEF_TYPE (node) = vect_external_def;
228 SLP_TREE_LANES (node) = ops.length ();
229 return node;
230 }
231
232 /* Create an SLP node for OPS. */
233
234 static slp_tree
235 vect_create_new_slp_node (vec<tree> ops)
236 {
237 return vect_create_new_slp_node (new _slp_tree, ops);
238 }
239
240
241 /* This structure is used in creation of an SLP tree. Each instance
242 corresponds to the same operand in a group of scalar stmts in an SLP
243 node. */
244 typedef struct _slp_oprnd_info
245 {
246 /* Def-stmts for the operands. */
247 vec<stmt_vec_info> def_stmts;
248 /* Operands. */
249 vec<tree> ops;
250 /* Information about the first statement, its vector def-type, type, the
251 operand itself in case it's constant, and an indication if it's a pattern
252 stmt. */
253 tree first_op_type;
254 enum vect_def_type first_dt;
255 bool any_pattern;
256 } *slp_oprnd_info;
257
258
259 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
260 operand. */
261 static vec<slp_oprnd_info>
262 vect_create_oprnd_info (int nops, int group_size)
263 {
264 int i;
265 slp_oprnd_info oprnd_info;
266 vec<slp_oprnd_info> oprnds_info;
267
268 oprnds_info.create (nops);
269 for (i = 0; i < nops; i++)
270 {
271 oprnd_info = XNEW (struct _slp_oprnd_info);
272 oprnd_info->def_stmts.create (group_size);
273 oprnd_info->ops.create (group_size);
274 oprnd_info->first_dt = vect_uninitialized_def;
275 oprnd_info->first_op_type = NULL_TREE;
276 oprnd_info->any_pattern = false;
277 oprnds_info.quick_push (oprnd_info);
278 }
279
280 return oprnds_info;
281 }
282
283
284 /* Free operands info. */
285
286 static void
287 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
288 {
289 int i;
290 slp_oprnd_info oprnd_info;
291
292 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
293 {
294 oprnd_info->def_stmts.release ();
295 oprnd_info->ops.release ();
296 XDELETE (oprnd_info);
297 }
298
299 oprnds_info.release ();
300 }
301
302
303 /* Return true if STMTS contains a pattern statement. */
304
305 static bool
306 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
307 {
308 stmt_vec_info stmt_info;
309 unsigned int i;
310 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
311 if (is_pattern_stmt_p (stmt_info))
312 return true;
313 return false;
314 }
315
316 /* Return true when all lanes in the external or constant NODE have
317 the same value. */
318
319 static bool
320 vect_slp_tree_uniform_p (slp_tree node)
321 {
322 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
323 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
324
325 /* Pre-existing vectors. */
326 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
327 return false;
328
329 unsigned i;
330 tree op, first = NULL_TREE;
331 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
332 if (!first)
333 first = op;
334 else if (!operand_equal_p (first, op, 0))
335 return false;
336
337 return true;
338 }
339
340 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
341 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
342 of the chain. */
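/* Illustrative example (not from the original sources): for a group
   loading a[0], a[2] and a[3], the second and third members carry
   DR_GROUP_GAP values of 2 and 1, so the places returned for the three
   statements are 0, 2 and 3 respectively. */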
343
344 int
345 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
346 stmt_vec_info first_stmt_info)
347 {
348 stmt_vec_info next_stmt_info = first_stmt_info;
349 int result = 0;
350
351 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
352 return -1;
353
354 do
355 {
356 if (next_stmt_info == stmt_info)
357 return result;
358 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
359 if (next_stmt_info)
360 result += DR_GROUP_GAP (next_stmt_info);
361 }
362 while (next_stmt_info);
363
364 return -1;
365 }
366
367 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
368 using the method implemented by duplicate_and_interleave. Return true
369 if so, returning the number of intermediate vectors in *NVECTORS_OUT
370 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
371 (if nonnull). */
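/* Illustrative sketch of the strategy (assuming target support): with
   COUNT == 4 and a 16-bit ELT_TYPE the first attempt fuses all four
   elements into one 64-bit integer, giving a single intermediate vector;
   if the required interleaving permutes are not supported, the next
   attempt fuses pairs into 32-bit integers and builds two vectors, and
   so on until the fused element size can no longer be halved. */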
372
373 bool
374 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
375 tree elt_type, unsigned int *nvectors_out,
376 tree *vector_type_out,
377 tree *permutes)
378 {
379 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
380 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
381 return false;
382
383 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
384 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
385 unsigned int nvectors = 1;
386 for (;;)
387 {
388 scalar_int_mode int_mode;
389 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
390 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
391 {
392 /* Get the natural vector type for this SLP group size. */
393 tree int_type = build_nonstandard_integer_type
394 (GET_MODE_BITSIZE (int_mode), 1);
395 tree vector_type
396 = get_vectype_for_scalar_type (vinfo, int_type, count);
397 if (vector_type
398 && VECTOR_MODE_P (TYPE_MODE (vector_type))
399 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
400 GET_MODE_SIZE (base_vector_mode)))
401 {
402 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
403 together into elements of type INT_TYPE and using the result
404 to build NVECTORS vectors. */
405 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
406 vec_perm_builder sel1 (nelts, 2, 3);
407 vec_perm_builder sel2 (nelts, 2, 3);
408 poly_int64 half_nelts = exact_div (nelts, 2);
409 for (unsigned int i = 0; i < 3; ++i)
410 {
411 sel1.quick_push (i);
412 sel1.quick_push (i + nelts);
413 sel2.quick_push (half_nelts + i);
414 sel2.quick_push (half_nelts + i + nelts);
415 }
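/* For a constant nelts of 8, for example, these encodings expand to
   sel1 = { 0, 8, 1, 9, 2, 10, 3, 11 } and
   sel2 = { 4, 12, 5, 13, 6, 14, 7, 15 }, i.e. the usual interleave-low
   and interleave-high permutes of the two input vectors. */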
416 vec_perm_indices indices1 (sel1, 2, nelts);
417 vec_perm_indices indices2 (sel2, 2, nelts);
418 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
419 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
420 {
421 if (nvectors_out)
422 *nvectors_out = nvectors;
423 if (vector_type_out)
424 *vector_type_out = vector_type;
425 if (permutes)
426 {
427 permutes[0] = vect_gen_perm_mask_checked (vector_type,
428 indices1);
429 permutes[1] = vect_gen_perm_mask_checked (vector_type,
430 indices2);
431 }
432 return true;
433 }
434 }
435 }
436 if (!multiple_p (elt_bytes, 2, &elt_bytes))
437 return false;
438 nvectors *= 2;
439 }
440 }
441
442 /* Return true if DTA and DTB match. */
443
444 static bool
445 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
446 {
447 return (dta == dtb
448 || ((dta == vect_external_def || dta == vect_constant_def)
449 && (dtb == vect_external_def || dtb == vect_constant_def)));
450 }
451
452 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
453 they are of a valid type and that they match the defs of the first stmt of
454 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
455 by swapping operands of STMTS[STMT_NUM] when possible. A non-zero SWAP
456 indicates that a swap is required for cond_expr stmts. Specifically, SWAP
457 is 1 if STMT is a cond and the operands of the comparison need to be
458 swapped; SWAP is 2 if STMT is a cond and the comparison code needs to be
459 inverted. Commutative operands may additionally be swapped here in order
460 to match the defs of the first stmt.
461 If there was a fatal error return -1; if the error could be corrected by
462 swapping operands of the parent node of this one, return 1; if everything
463 is ok return 0. */
464 static int
465 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
466 bool *skip_args,
467 vec<stmt_vec_info> stmts, unsigned stmt_num,
468 vec<slp_oprnd_info> *oprnds_info)
469 {
470 stmt_vec_info stmt_info = stmts[stmt_num];
471 tree oprnd;
472 unsigned int i, number_of_oprnds;
473 enum vect_def_type dt = vect_uninitialized_def;
474 slp_oprnd_info oprnd_info;
475 int first_op_idx = 1;
476 unsigned int commutative_op = -1U;
477 bool first_op_cond = false;
478 bool first = stmt_num == 0;
479
480 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
481 {
482 number_of_oprnds = gimple_call_num_args (stmt);
483 first_op_idx = 3;
484 if (gimple_call_internal_p (stmt))
485 {
486 internal_fn ifn = gimple_call_internal_fn (stmt);
487 commutative_op = first_commutative_argument (ifn);
488
489 /* Masked load, only look at mask. */
490 if (ifn == IFN_MASK_LOAD)
491 {
492 number_of_oprnds = 1;
493 /* Mask operand index. */
494 first_op_idx = 5;
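/* Call arguments start at gimple operand 3, and IFN_MASK_LOAD is
   (pointer, alignment, mask), so the mask is operand 5 (assuming
   that argument layout). */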
495 }
496 }
497 }
498 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
499 {
500 enum tree_code code = gimple_assign_rhs_code (stmt);
501 number_of_oprnds = gimple_num_ops (stmt) - 1;
502 /* Swap can only be done for cond_expr if asked to, otherwise we
503 could result in different comparison code to the first stmt. */
504 if (code == COND_EXPR
505 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
506 {
507 first_op_cond = true;
508 number_of_oprnds++;
509 }
510 else
511 commutative_op = commutative_tree_code (code) ? 0U : -1U;
512 }
513 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
514 number_of_oprnds = gimple_phi_num_args (stmt);
515 else
516 return -1;
517
518 bool swapped = (swap != 0);
519 bool backedge = false;
520 gcc_assert (!swapped || first_op_cond);
521 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
522 for (i = 0; i < number_of_oprnds; i++)
523 {
524 if (first_op_cond)
525 {
526 /* Map indicating how operands of cond_expr should be swapped. */
527 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
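/* Entries 0-1 index the operands of the embedded comparison, entries
   2-3 are the gimple operand numbers of the then/else arms; so the
   second map swaps the comparison operands and the third swaps the
   arms, which pairs with inverting the comparison code. */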
528 int *map = maps[swap];
529
530 if (i < 2)
531 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
532 first_op_idx), map[i]);
533 else
534 oprnd = gimple_op (stmt_info->stmt, map[i]);
535 }
536 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
537 {
538 oprnd = gimple_phi_arg_def (stmt, i);
539 backedge = dominated_by_p (CDI_DOMINATORS,
540 gimple_phi_arg_edge (stmt, i)->src,
541 gimple_bb (stmt_info->stmt));
542 }
543 else
544 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
545 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
546 oprnd = TREE_OPERAND (oprnd, 0);
547
548 oprnd_info = (*oprnds_info)[i];
549
550 stmt_vec_info def_stmt_info;
551 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
552 {
553 if (dump_enabled_p ())
554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
555 "Build SLP failed: can't analyze def for %T\n",
556 oprnd);
557
558 return -1;
559 }
560
561 if (skip_args[i])
562 {
563 oprnd_info->def_stmts.quick_push (NULL);
564 oprnd_info->ops.quick_push (NULL_TREE);
565 oprnd_info->first_dt = vect_uninitialized_def;
566 continue;
567 }
568
569 oprnd_info->def_stmts.quick_push (def_stmt_info);
570 oprnd_info->ops.quick_push (oprnd);
571
572 if (def_stmt_info
573 && is_pattern_stmt_p (def_stmt_info))
574 {
575 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
576 != def_stmt_info)
577 oprnd_info->any_pattern = true;
578 else
579 /* If we promote this to external use the original stmt def. */
580 oprnd_info->ops.last ()
581 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
582 }
583
584 /* If there's an extern def on a backedge make sure we can
585 code-generate at the region start.
586 ??? This is another case that could be fixed by adjusting
587 how we split the function but at the moment we'd have conflicting
588 goals there. */
589 if (backedge
590 && dts[i] == vect_external_def
591 && is_a <bb_vec_info> (vinfo)
592 && TREE_CODE (oprnd) == SSA_NAME
593 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
594 && !dominated_by_p (CDI_DOMINATORS,
595 as_a <bb_vec_info> (vinfo)->bbs[0],
596 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
597 {
598 if (dump_enabled_p ())
599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
600 "Build SLP failed: extern def %T only defined "
601 "on backedge\n", oprnd);
602 return -1;
603 }
604
605 if (first)
606 {
607 tree type = TREE_TYPE (oprnd);
608 dt = dts[i];
609 if ((dt == vect_constant_def
610 || dt == vect_external_def)
611 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
612 && (TREE_CODE (type) == BOOLEAN_TYPE
613 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
614 type)))
615 {
616 if (dump_enabled_p ())
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
618 "Build SLP failed: invalid type of def "
619 "for variable-length SLP %T\n", oprnd);
620 return -1;
621 }
622
623 /* For the swapping logic below force vect_reduction_def
624 for the reduction op in a SLP reduction group. */
625 if (!STMT_VINFO_DATA_REF (stmt_info)
626 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
627 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
628 && def_stmt_info)
629 dts[i] = dt = vect_reduction_def;
630
631 /* Check the types of the definition. */
632 switch (dt)
633 {
634 case vect_external_def:
635 case vect_constant_def:
636 case vect_internal_def:
637 case vect_reduction_def:
638 case vect_induction_def:
639 case vect_nested_cycle:
640 break;
641
642 default:
643 /* FORNOW: Not supported. */
644 if (dump_enabled_p ())
645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
646 "Build SLP failed: illegal type of def %T\n",
647 oprnd);
648 return -1;
649 }
650
651 oprnd_info->first_dt = dt;
652 oprnd_info->first_op_type = type;
653 }
654 }
655 if (first)
656 return 0;
657
658 /* Now match the operand definition types to that of the first stmt. */
659 for (i = 0; i < number_of_oprnds;)
660 {
661 if (skip_args[i])
662 {
663 ++i;
664 continue;
665 }
666
667 oprnd_info = (*oprnds_info)[i];
668 dt = dts[i];
669 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
670 oprnd = oprnd_info->ops[stmt_num];
671 tree type = TREE_TYPE (oprnd);
672
673 if (!types_compatible_p (oprnd_info->first_op_type, type))
674 {
675 if (dump_enabled_p ())
676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
677 "Build SLP failed: different operand types\n");
678 return 1;
679 }
680
681 /* Not first stmt of the group, check that the def-stmt/s match
682 the def-stmt/s of the first stmt. Allow different definition
683 types for reduction chains: the first stmt must be a
684 vect_reduction_def (a phi node), and the rest
685 end in the reduction chain. */
686 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
687 && !(oprnd_info->first_dt == vect_reduction_def
688 && !STMT_VINFO_DATA_REF (stmt_info)
689 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
690 && def_stmt_info
691 && !STMT_VINFO_DATA_REF (def_stmt_info)
692 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
693 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
694 || (!STMT_VINFO_DATA_REF (stmt_info)
695 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
696 && ((!def_stmt_info
697 || STMT_VINFO_DATA_REF (def_stmt_info)
698 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
699 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
700 != (oprnd_info->first_dt != vect_reduction_def))))
701 {
702 /* Try swapping operands if we got a mismatch. For BB
703 vectorization only in case it will clearly improve things. */
704 if (i == commutative_op && !swapped
705 && (!is_a <bb_vec_info> (vinfo)
706 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
707 dts[i+1])
708 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
709 || vect_def_types_match
710 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
711 {
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_NOTE, vect_location,
714 "trying swapped operands\n");
715 std::swap (dts[i], dts[i+1]);
716 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
717 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
718 std::swap ((*oprnds_info)[i]->ops[stmt_num],
719 (*oprnds_info)[i+1]->ops[stmt_num]);
720 swapped = true;
721 continue;
722 }
723
724 if (is_a <bb_vec_info> (vinfo)
725 && !oprnd_info->any_pattern)
726 {
727 /* Now for commutative ops we should see whether we can
728 make the other operand match. */
729 if (dump_enabled_p ())
730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
731 "treating operand as external\n");
732 oprnd_info->first_dt = dt = vect_external_def;
733 }
734 else
735 {
736 if (dump_enabled_p ())
737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
738 "Build SLP failed: different types\n");
739 return 1;
740 }
741 }
742
743 /* Make sure to demote the overall operand to external. */
744 if (dt == vect_external_def)
745 oprnd_info->first_dt = vect_external_def;
746 /* For a SLP reduction chain we want to duplicate the reduction to
747 each of the chain members. That gets us a sane SLP graph (still
748 the stmts are not 100% correct wrt the initial values). */
749 else if ((dt == vect_internal_def
750 || dt == vect_reduction_def)
751 && oprnd_info->first_dt == vect_reduction_def
752 && !STMT_VINFO_DATA_REF (stmt_info)
753 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
754 && !STMT_VINFO_DATA_REF (def_stmt_info)
755 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
756 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
757 {
758 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
759 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
760 }
761
762 ++i;
763 }
764
765 /* Swap operands. */
766 if (swapped)
767 {
768 if (dump_enabled_p ())
769 dump_printf_loc (MSG_NOTE, vect_location,
770 "swapped operands to match def types in %G",
771 stmt_info->stmt);
772 }
773
774 return 0;
775 }
776
777 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
778 Return true if we can, meaning that this choice doesn't conflict with
779 existing SLP nodes that use STMT_INFO. */
780
781 bool
782 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
783 {
784 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
785 if (old_vectype)
786 return useless_type_conversion_p (vectype, old_vectype);
787
788 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
789 {
790 /* We maintain the invariant that if any statement in the group is
791 used, all other members of the group have the same vector type. */
792 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
793 stmt_vec_info member_info = first_info;
794 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
795 if (is_pattern_stmt_p (member_info)
796 && !useless_type_conversion_p (vectype,
797 STMT_VINFO_VECTYPE (member_info)))
798 break;
799
800 if (!member_info)
801 {
802 for (member_info = first_info; member_info;
803 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
804 STMT_VINFO_VECTYPE (member_info) = vectype;
805 return true;
806 }
807 }
808 else if (!is_pattern_stmt_p (stmt_info))
809 {
810 STMT_VINFO_VECTYPE (stmt_info) = vectype;
811 return true;
812 }
813
814 if (dump_enabled_p ())
815 {
816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
817 "Build SLP failed: incompatible vector"
818 " types for: %G", stmt_info->stmt);
819 dump_printf_loc (MSG_NOTE, vect_location,
820 " old vector type: %T\n", old_vectype);
821 dump_printf_loc (MSG_NOTE, vect_location,
822 " new vector type: %T\n", vectype);
823 }
824 return false;
825 }
826
827 /* Return true if call statements CALL1 and CALL2 are similar enough
828 to be combined into the same SLP group. */
829
830 static bool
831 compatible_calls_p (gcall *call1, gcall *call2)
832 {
833 unsigned int nargs = gimple_call_num_args (call1);
834 if (nargs != gimple_call_num_args (call2))
835 return false;
836
837 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
838 return false;
839
840 if (gimple_call_internal_p (call1))
841 {
842 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
843 TREE_TYPE (gimple_call_lhs (call2))))
844 return false;
845 for (unsigned int i = 0; i < nargs; ++i)
846 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
847 TREE_TYPE (gimple_call_arg (call2, i))))
848 return false;
849 }
850 else
851 {
852 if (!operand_equal_p (gimple_call_fn (call1),
853 gimple_call_fn (call2), 0))
854 return false;
855
856 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
857 return false;
858 }
859 return true;
860 }
861
862 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
863 caller's attempt to find the vector type in STMT_INFO with the narrowest
864 element type. Return true if VECTYPE is nonnull and if it is valid
865 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
866 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
867 vect_build_slp_tree. */
868
869 static bool
870 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
871 unsigned int group_size,
872 tree vectype, poly_uint64 *max_nunits)
873 {
874 if (!vectype)
875 {
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
878 "Build SLP failed: unsupported data-type in %G\n",
879 stmt_info->stmt);
880 /* Fatal mismatch. */
881 return false;
882 }
883
884 /* If populating the vector type requires unrolling then fail
885 before adjusting *max_nunits for basic-block vectorization. */
886 if (is_a <bb_vec_info> (vinfo)
887 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
888 {
889 if (dump_enabled_p ())
890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
891 "Build SLP failed: unrolling required "
892 "in basic block SLP\n");
893 /* Fatal mismatch. */
894 return false;
895 }
896
897 /* In case of multiple types we need to detect the smallest type. */
898 vect_update_max_nunits (max_nunits, vectype);
899 return true;
900 }
901
902 /* Verify whether the scalar stmts STMTS are isomorphic, do not require
903 data permutation and are not of unsupported types of operation.
904 Return true if they are, otherwise return false and indicate in
905 *MATCHES which stmts are not isomorphic to the first one. If MATCHES[0]
906 is false then this indicates the comparison could not be
907 carried out or the stmts will never be vectorized by SLP.
908
909 Note a COND_EXPR is possibly isomorphic to another one after swapping its
910 operands. Set SWAP[i] to 1 if stmt I is a COND_EXPR isomorphic to
911 the first stmt by swapping the two operands of its comparison; set SWAP[i]
912 to 2 if stmt I is isomorphic to the first stmt by inverting the code
913 of the comparison. Taking A1 >= B1 ? X1 : Y1 as an example, it can be
914 swapped to (B1 <= A1 ? X1 : Y1) or inverted to (A1 < B1) ? Y1 : X1. */
915
916 static bool
917 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
918 vec<stmt_vec_info> stmts, unsigned int group_size,
919 poly_uint64 *max_nunits, bool *matches,
920 bool *two_operators, tree *node_vectype)
921 {
922 unsigned int i;
923 stmt_vec_info first_stmt_info = stmts[0];
924 enum tree_code first_stmt_code = ERROR_MARK;
925 enum tree_code alt_stmt_code = ERROR_MARK;
926 enum tree_code rhs_code = ERROR_MARK;
927 enum tree_code first_cond_code = ERROR_MARK;
928 tree lhs;
929 bool need_same_oprnds = false;
930 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
931 optab optab;
932 int icode;
933 machine_mode optab_op2_mode;
934 machine_mode vec_mode;
935 stmt_vec_info first_load = NULL, prev_first_load = NULL;
936 bool first_stmt_load_p = false, load_p = false;
937 bool first_stmt_phi_p = false, phi_p = false;
938 bool maybe_soft_fail = false;
939 tree soft_fail_nunits_vectype = NULL_TREE;
940
941 /* For every stmt in NODE find its def stmt/s. */
942 stmt_vec_info stmt_info;
943 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
944 {
945 gimple *stmt = stmt_info->stmt;
946 swap[i] = 0;
947 matches[i] = false;
948
949 if (dump_enabled_p ())
950 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
951
952 /* Fail to vectorize statements marked as unvectorizable, statements
953 that can throw, or statements with volatile operands. */
954 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
955 || stmt_can_throw_internal (cfun, stmt)
956 || gimple_has_volatile_ops (stmt))
957 {
958 if (dump_enabled_p ())
959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
960 "Build SLP failed: unvectorizable statement %G",
961 stmt);
962 /* ??? For BB vectorization we want to commutate operands in a way
963 to shuffle all unvectorizable defs into one operand and have
964 the other still vectorized. The following doesn't reliably
965 work for this, but it's the easiest we can do here. */
966 if (is_a <bb_vec_info> (vinfo) && i != 0)
967 continue;
968 /* Fatal mismatch. */
969 matches[0] = false;
970 return false;
971 }
972
973 lhs = gimple_get_lhs (stmt);
974 if (lhs == NULL_TREE)
975 {
976 if (dump_enabled_p ())
977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
978 "Build SLP failed: not GIMPLE_ASSIGN nor "
979 "GIMPLE_CALL %G", stmt);
980 if (is_a <bb_vec_info> (vinfo) && i != 0)
981 continue;
982 /* Fatal mismatch. */
983 matches[0] = false;
984 return false;
985 }
986
987 tree nunits_vectype;
988 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
989 &nunits_vectype, group_size))
990 {
991 if (is_a <bb_vec_info> (vinfo) && i != 0)
992 continue;
993 /* Fatal mismatch. */
994 matches[0] = false;
995 return false;
996 }
997 /* Record nunits required but continue analysis, producing matches[]
998 as if nunits was not an issue. This allows splitting of groups
999 to happen. */
1000 if (nunits_vectype
1001 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1002 nunits_vectype, max_nunits))
1003 {
1004 gcc_assert (is_a <bb_vec_info> (vinfo));
1005 maybe_soft_fail = true;
1006 soft_fail_nunits_vectype = nunits_vectype;
1007 }
1008
1009 gcc_assert (vectype);
1010
1011 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1012 if (call_stmt)
1013 {
1014 rhs_code = CALL_EXPR;
1015
1016 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
1017 load_p = true;
1018 else if ((gimple_call_internal_p (call_stmt)
1019 && (!vectorizable_internal_fn_p
1020 (gimple_call_internal_fn (call_stmt))))
1021 || gimple_call_tail_p (call_stmt)
1022 || gimple_call_noreturn_p (call_stmt)
1023 || !gimple_call_nothrow_p (call_stmt)
1024 || gimple_call_chain (call_stmt))
1025 {
1026 if (dump_enabled_p ())
1027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1028 "Build SLP failed: unsupported call type %G",
1029 call_stmt);
1030 if (is_a <bb_vec_info> (vinfo) && i != 0)
1031 continue;
1032 /* Fatal mismatch. */
1033 matches[0] = false;
1034 return false;
1035 }
1036 }
1037 else if (gimple_code (stmt) == GIMPLE_PHI)
1038 {
1039 rhs_code = ERROR_MARK;
1040 phi_p = true;
1041 }
1042 else
1043 {
1044 rhs_code = gimple_assign_rhs_code (stmt);
1045 load_p = gimple_vuse (stmt);
1046 }
1047
1048 /* Check the operation. */
1049 if (i == 0)
1050 {
1051 *node_vectype = vectype;
1052 first_stmt_code = rhs_code;
1053 first_stmt_load_p = load_p;
1054 first_stmt_phi_p = phi_p;
1055
1056 /* Shift arguments should be equal in all the packed stmts for a
1057 vector shift with scalar shift operand. */
1058 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1059 || rhs_code == LROTATE_EXPR
1060 || rhs_code == RROTATE_EXPR)
1061 {
1062 vec_mode = TYPE_MODE (vectype);
1063
1064 /* First see if we have a vector/vector shift. */
1065 optab = optab_for_tree_code (rhs_code, vectype,
1066 optab_vector);
1067
1068 if (!optab
1069 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1070 {
1071 /* No vector/vector shift, try for a vector/scalar shift. */
1072 optab = optab_for_tree_code (rhs_code, vectype,
1073 optab_scalar);
1074
1075 if (!optab)
1076 {
1077 if (dump_enabled_p ())
1078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1079 "Build SLP failed: no optab.\n");
1080 if (is_a <bb_vec_info> (vinfo) && i != 0)
1081 continue;
1082 /* Fatal mismatch. */
1083 matches[0] = false;
1084 return false;
1085 }
1086 icode = (int) optab_handler (optab, vec_mode);
1087 if (icode == CODE_FOR_nothing)
1088 {
1089 if (dump_enabled_p ())
1090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1091 "Build SLP failed: "
1092 "op not supported by target.\n");
1093 if (is_a <bb_vec_info> (vinfo) && i != 0)
1094 continue;
1095 /* Fatal mismatch. */
1096 matches[0] = false;
1097 return false;
1098 }
1099 optab_op2_mode = insn_data[icode].operand[2].mode;
1100 if (!VECTOR_MODE_P (optab_op2_mode))
1101 {
1102 need_same_oprnds = true;
1103 first_op1 = gimple_assign_rhs2 (stmt);
1104 }
1105 }
1106 }
1107 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1108 {
1109 need_same_oprnds = true;
1110 first_op1 = gimple_assign_rhs2 (stmt);
1111 }
1112 else if (!load_p
1113 && rhs_code == BIT_FIELD_REF)
1114 {
1115 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1116 if (!is_a <bb_vec_info> (vinfo)
1117 || TREE_CODE (vec) != SSA_NAME
1118 || !operand_equal_p (TYPE_SIZE (vectype),
1119 TYPE_SIZE (TREE_TYPE (vec))))
1120 {
1121 if (dump_enabled_p ())
1122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1123 "Build SLP failed: "
1124 "BIT_FIELD_REF not supported\n");
1125 /* Fatal mismatch. */
1126 matches[0] = false;
1127 return false;
1128 }
1129 }
1130 else if (call_stmt
1131 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1132 {
1133 need_same_oprnds = true;
1134 first_op1 = gimple_call_arg (call_stmt, 1);
1135 }
1136 }
1137 else
1138 {
1139 if (first_stmt_code != rhs_code
1140 && alt_stmt_code == ERROR_MARK)
1141 alt_stmt_code = rhs_code;
1142 if ((first_stmt_code != rhs_code
1143 && (first_stmt_code != IMAGPART_EXPR
1144 || rhs_code != REALPART_EXPR)
1145 && (first_stmt_code != REALPART_EXPR
1146 || rhs_code != IMAGPART_EXPR)
1147 /* Handle mismatches in plus/minus by computing both
1148 and merging the results. */
1149 && !((first_stmt_code == PLUS_EXPR
1150 || first_stmt_code == MINUS_EXPR)
1151 && (alt_stmt_code == PLUS_EXPR
1152 || alt_stmt_code == MINUS_EXPR)
1153 && rhs_code == alt_stmt_code)
1154 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1155 && (first_stmt_code == ARRAY_REF
1156 || first_stmt_code == BIT_FIELD_REF
1157 || first_stmt_code == INDIRECT_REF
1158 || first_stmt_code == COMPONENT_REF
1159 || first_stmt_code == MEM_REF)))
1160 || first_stmt_load_p != load_p
1161 || first_stmt_phi_p != phi_p)
1162 {
1163 if (dump_enabled_p ())
1164 {
1165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1166 "Build SLP failed: different operation "
1167 "in stmt %G", stmt);
1168 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1169 "original stmt %G", first_stmt_info->stmt);
1170 }
1171 /* Mismatch. */
1172 continue;
1173 }
1174
1175 if (need_same_oprnds)
1176 {
1177 tree other_op1 = (call_stmt
1178 ? gimple_call_arg (call_stmt, 1)
1179 : gimple_assign_rhs2 (stmt));
1180 if (!operand_equal_p (first_op1, other_op1, 0))
1181 {
1182 if (dump_enabled_p ())
1183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1184 "Build SLP failed: different shift "
1185 "arguments in %G", stmt);
1186 /* Mismatch. */
1187 continue;
1188 }
1189 }
1190 if (!load_p
1191 && first_stmt_code == BIT_FIELD_REF
1192 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1193 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1194 {
1195 if (dump_enabled_p ())
1196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1197 "Build SLP failed: different BIT_FIELD_REF "
1198 "arguments in %G", stmt);
1199 /* Mismatch. */
1200 continue;
1201 }
1202
1203 if (!load_p && rhs_code == CALL_EXPR)
1204 {
1205 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1206 as_a <gcall *> (stmt)))
1207 {
1208 if (dump_enabled_p ())
1209 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1210 "Build SLP failed: different calls in %G",
1211 stmt);
1212 /* Mismatch. */
1213 continue;
1214 }
1215 }
1216
1217 if (phi_p
1218 && (gimple_bb (first_stmt_info->stmt)
1219 != gimple_bb (stmt_info->stmt)))
1220 {
1221 if (dump_enabled_p ())
1222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1223 "Build SLP failed: different BB for PHI "
1224 "in %G", stmt);
1225 /* Mismatch. */
1226 continue;
1227 }
1228
1229 if (!types_compatible_p (vectype, *node_vectype))
1230 {
1231 if (dump_enabled_p ())
1232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1233 "Build SLP failed: different vector type "
1234 "in %G", stmt);
1235 /* Mismatch. */
1236 continue;
1237 }
1238 }
1239
1240 /* Grouped store or load. */
1241 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1242 {
1243 if (REFERENCE_CLASS_P (lhs))
1244 {
1245 /* Store. */
1246 ;
1247 }
1248 else
1249 {
1250 /* Load. */
1251 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1252 if (prev_first_load)
1253 {
1254 /* Check that there are no loads from different interleaving
1255 chains in the same node. */
1256 if (prev_first_load != first_load)
1257 {
1258 if (dump_enabled_p ())
1259 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1260 vect_location,
1261 "Build SLP failed: different "
1262 "interleaving chains in one node %G",
1263 stmt);
1264 /* Mismatch. */
1265 continue;
1266 }
1267 }
1268 else
1269 prev_first_load = first_load;
1270 }
1271 } /* Grouped access. */
1272 else
1273 {
1274 if (load_p)
1275 {
1276 /* Non-grouped load. */
1277 if (dump_enabled_p ())
1278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1279 "Build SLP failed: not grouped load %G", stmt);
1280
1281 /* FORNOW: non-grouped loads are not supported. */
1282 if (is_a <bb_vec_info> (vinfo) && i != 0)
1283 continue;
1284 /* Fatal mismatch. */
1285 matches[0] = false;
1286 return false;
1287 }
1288
1289 /* Not a memory operation. */
1290 if (!phi_p
1291 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1292 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1293 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1294 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1295 && rhs_code != VIEW_CONVERT_EXPR
1296 && rhs_code != CALL_EXPR
1297 && rhs_code != BIT_FIELD_REF)
1298 {
1299 if (dump_enabled_p ())
1300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1301 "Build SLP failed: operation unsupported %G",
1302 stmt);
1303 if (is_a <bb_vec_info> (vinfo) && i != 0)
1304 continue;
1305 /* Fatal mismatch. */
1306 matches[0] = false;
1307 return false;
1308 }
1309
1310 if (rhs_code == COND_EXPR)
1311 {
1312 tree cond_expr = gimple_assign_rhs1 (stmt);
1313 enum tree_code cond_code = TREE_CODE (cond_expr);
1314 enum tree_code swap_code = ERROR_MARK;
1315 enum tree_code invert_code = ERROR_MARK;
1316
1317 if (i == 0)
1318 first_cond_code = TREE_CODE (cond_expr);
1319 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1320 {
1321 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1322 swap_code = swap_tree_comparison (cond_code);
1323 invert_code = invert_tree_comparison (cond_code, honor_nans);
1324 }
1325
1326 if (first_cond_code == cond_code)
1327 ;
1328 /* Isomorphic can be achieved by swapping. */
1329 else if (first_cond_code == swap_code)
1330 swap[i] = 1;
1331 /* Isomorphic can be achieved by inverting. */
1332 else if (first_cond_code == invert_code)
1333 swap[i] = 2;
1334 else
1335 {
1336 if (dump_enabled_p ())
1337 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1338 "Build SLP failed: different"
1339 " operation %G", stmt);
1340 /* Mismatch. */
1341 continue;
1342 }
1343 }
1344 }
1345
1346 matches[i] = true;
1347 }
1348
1349 for (i = 0; i < group_size; ++i)
1350 if (!matches[i])
1351 return false;
1352
1353 /* If we allowed a two-operation SLP node verify the target can cope
1354 with the permute we are going to use. */
1355 if (alt_stmt_code != ERROR_MARK
1356 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1357 {
1358 *two_operators = true;
1359 }
1360
1361 if (maybe_soft_fail)
1362 {
1363 unsigned HOST_WIDE_INT const_nunits;
1364 if (!TYPE_VECTOR_SUBPARTS
1365 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1366 || const_nunits > group_size)
1367 matches[0] = false;
1368 else
1369 {
1370 /* With a constant number of vector elements simulate a mismatch at
1371 the point where we need to split. */
1372 unsigned tail = group_size & (const_nunits - 1);
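/* For example, with group_size == 7 and const_nunits == 4 (vector
   element counts are powers of two) tail is 3, so the last three
   lanes are flagged as mismatches and the group is split after one
   vector's worth of lanes. */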
1373 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1374 }
1375 return false;
1376 }
1377
1378 return true;
1379 }
1380
1381 /* Traits for the hash_map used to record SLP builds for a stmt set.
1382 Note we never remove apart from at destruction time so we do not
1383 need a special value for deleted that differs from empty. */
1384 struct bst_traits
1385 {
1386 typedef vec <stmt_vec_info> value_type;
1387 typedef vec <stmt_vec_info> compare_type;
1388 static inline hashval_t hash (value_type);
1389 static inline bool equal (value_type existing, value_type candidate);
1390 static inline bool is_empty (value_type x) { return !x.exists (); }
1391 static inline bool is_deleted (value_type x) { return !x.exists (); }
1392 static const bool empty_zero_p = true;
1393 static inline void mark_empty (value_type &x) { x.release (); }
1394 static inline void mark_deleted (value_type &x) { x.release (); }
1395 static inline void remove (value_type &x) { x.release (); }
1396 };
1397 inline hashval_t
1398 bst_traits::hash (value_type x)
1399 {
1400 inchash::hash h;
1401 for (unsigned i = 0; i < x.length (); ++i)
1402 h.add_int (gimple_uid (x[i]->stmt));
1403 return h.end ();
1404 }
1405 inline bool
1406 bst_traits::equal (value_type existing, value_type candidate)
1407 {
1408 if (existing.length () != candidate.length ())
1409 return false;
1410 for (unsigned i = 0; i < existing.length (); ++i)
1411 if (existing[i] != candidate[i])
1412 return false;
1413 return true;
1414 }
1415
1416 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1417 simple_hashmap_traits <bst_traits, slp_tree> >
1418 scalar_stmts_to_slp_tree_map_t;
1419
1420 static slp_tree
1421 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1422 vec<stmt_vec_info> stmts, unsigned int group_size,
1423 poly_uint64 *max_nunits,
1424 bool *matches, unsigned *limit, unsigned *tree_size,
1425 scalar_stmts_to_slp_tree_map_t *bst_map);
1426
1427 static slp_tree
1428 vect_build_slp_tree (vec_info *vinfo,
1429 vec<stmt_vec_info> stmts, unsigned int group_size,
1430 poly_uint64 *max_nunits,
1431 bool *matches, unsigned *limit, unsigned *tree_size,
1432 scalar_stmts_to_slp_tree_map_t *bst_map)
1433 {
1434 if (slp_tree *leader = bst_map->get (stmts))
1435 {
1436 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1438 *leader ? "" : "failed ", *leader);
1439 if (*leader)
1440 {
1441 SLP_TREE_REF_COUNT (*leader)++;
1442 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1443 stmts.release ();
1444 }
1445 return *leader;
1446 }
1447
1448 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1449 so we can pick up backedge destinations during discovery. */
1450 slp_tree res = new _slp_tree;
1451 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1452 SLP_TREE_SCALAR_STMTS (res) = stmts;
1453 bst_map->put (stmts.copy (), res);
1454
1455 if (*limit == 0)
1456 {
1457 if (dump_enabled_p ())
1458 dump_printf_loc (MSG_NOTE, vect_location,
1459 "SLP discovery limit exceeded\n");
1460 bool existed_p = bst_map->put (stmts, NULL);
1461 gcc_assert (existed_p);
1462 /* Mark the node invalid so we can detect those when still in use
1463 as backedge destinations. */
1464 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1465 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1466 vect_free_slp_tree (res);
1467 memset (matches, 0, sizeof (bool) * group_size);
1468 return NULL;
1469 }
1470 --*limit;
1471
1472 poly_uint64 this_max_nunits = 1;
1473 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1474 &this_max_nunits,
1475 matches, limit, tree_size, bst_map);
1476 if (!res_)
1477 {
1478 bool existed_p = bst_map->put (stmts, NULL);
1479 gcc_assert (existed_p);
1480 /* Mark the node invalid so we can detect those when still in use
1481 as backedge destinations. */
1482 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1483 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1484 vect_free_slp_tree (res);
1485 }
1486 else
1487 {
1488 gcc_assert (res_ == res);
1489 res->max_nunits = this_max_nunits;
1490 vect_update_max_nunits (max_nunits, this_max_nunits);
1491 /* Keep a reference for the bst_map use. */
1492 SLP_TREE_REF_COUNT (res)++;
1493 }
1494 return res_;
1495 }
1496
1497 /* Recursively build an SLP tree starting from NODE.
1498 Fail (and return NULL) if def-stmts are not isomorphic, require data
1499 permutation or are of unsupported types of operation. Otherwise
1500 return the built SLP node; MATCHES indicates which lanes of the
1501 group matched, up to the point where a mismatch, if any, was
1502 found. */
1503
1504 static slp_tree
1505 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1506 vec<stmt_vec_info> stmts, unsigned int group_size,
1507 poly_uint64 *max_nunits,
1508 bool *matches, unsigned *limit, unsigned *tree_size,
1509 scalar_stmts_to_slp_tree_map_t *bst_map)
1510 {
1511 unsigned nops, i, this_tree_size = 0;
1512 poly_uint64 this_max_nunits = *max_nunits;
1513
1514 matches[0] = false;
1515
1516 stmt_vec_info stmt_info = stmts[0];
1517 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1518 nops = gimple_call_num_args (stmt);
1519 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1520 {
1521 nops = gimple_num_ops (stmt) - 1;
1522 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1523 nops++;
1524 }
1525 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1526 nops = gimple_phi_num_args (phi);
1527 else
1528 return NULL;
1529
1530 /* If the SLP node is a PHI (induction or reduction), terminate
1531 the recursion. */
1532 bool *skip_args = XALLOCAVEC (bool, nops);
1533 memset (skip_args, 0, sizeof (bool) * nops);
1534 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1535 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1536 {
1537 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1538 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1539 group_size);
1540 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1541 max_nunits))
1542 return NULL;
1543
1544 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1545 if (def_type == vect_induction_def)
1546 {
1547 /* Induction PHIs are not cycles but walk the initial
1548 value. Only for inner loops though, for outer loops
1549 we need to pick up the value from the actual PHIs
1550 to more easily support peeling and epilogue vectorization. */
1551 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1552 if (!nested_in_vect_loop_p (loop, stmt_info))
1553 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1554 else
1555 loop = loop->inner;
1556 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1557 }
1558 else if (def_type == vect_reduction_def
1559 || def_type == vect_double_reduction_def
1560 || def_type == vect_nested_cycle)
1561 {
1562 /* Else def types have to match. */
1563 stmt_vec_info other_info;
1564 bool all_same = true;
1565 FOR_EACH_VEC_ELT (stmts, i, other_info)
1566 {
1567 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1568 return NULL;
1569 if (other_info != stmt_info)
1570 all_same = false;
1571 }
1572 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1573 /* Reduction initial values are not explicitly represented. */
1574 if (!nested_in_vect_loop_p (loop, stmt_info))
1575 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1576 /* Reduction chain backedge defs are filled manually.
1577 ??? Need a better way to identify a SLP reduction chain PHI.
1578 Or a better overall way to SLP match those. */
1579 if (all_same && def_type == vect_reduction_def)
1580 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1581 }
1582 else if (def_type != vect_internal_def)
1583 return NULL;
1584 }
1585
1586
1587 bool two_operators = false;
1588 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1589 tree vectype = NULL_TREE;
1590 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1591 &this_max_nunits, matches, &two_operators,
1592 &vectype))
1593 return NULL;
1594
1595 /* If the SLP node is a load, terminate the recursion unless masked. */
1596 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1597 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1598 {
1599 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1600 {
1601 /* Masked load. */
1602 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1603 nops = 1;
1604 }
1605 else
1606 {
1607 *max_nunits = this_max_nunits;
1608 (*tree_size)++;
1609 node = vect_create_new_slp_node (node, stmts, 0);
1610 SLP_TREE_VECTYPE (node) = vectype;
1611 /* And compute the load permutation. Whether it is actually
1612 a permutation depends on the unrolling factor which is
1613 decided later. */
1614 vec<unsigned> load_permutation;
1615 int j;
1616 stmt_vec_info load_info;
1617 load_permutation.create (group_size);
1618 stmt_vec_info first_stmt_info
1619 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1620 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1621 {
1622 int load_place = vect_get_place_in_interleaving_chain
1623 (load_info, first_stmt_info);
1624 gcc_assert (load_place != -1);
1625 load_permutation.safe_push (load_place);
1626 }
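/* Illustrative example: if the node's lanes are the members a[2],
   a[3], a[0], a[1] of a four-element interleaving chain, the load
   permutation recorded below is { 2, 3, 0, 1 }. */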
1627 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1628 return node;
1629 }
1630 }
1631 else if (gimple_assign_single_p (stmt_info->stmt)
1632 && !gimple_vuse (stmt_info->stmt)
1633 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1634 {
1635 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1636 the same SSA name vector of a type compatible with vectype. */
1637 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1638 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1639 stmt_vec_info estmt_info;
1640 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1641 {
1642 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1643 tree bfref = gimple_assign_rhs1 (estmt);
1644 HOST_WIDE_INT lane;
1645 if (!known_eq (bit_field_size (bfref),
1646 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1647 || !constant_multiple_p (bit_field_offset (bfref),
1648 bit_field_size (bfref), &lane))
1649 {
1650 lperm.release ();
1651 return NULL;
1652 }
1653 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1654 }
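/* Each lane permutation entry pairs input 0 with the extracted lane;
   e.g. a BIT_FIELD_REF taking 32 bits at bit offset 64 from a 128-bit
   VEC maps to lane 2 (illustrative values). */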
1655 slp_tree vnode = vect_create_new_slp_node (vNULL);
1656 /* ??? We record vectype here but we hide eventually necessary
1657 punning and instead rely on code generation to materialize
1658 VIEW_CONVERT_EXPRs as necessary. We instead should make
1659 this explicit somehow. */
1660 SLP_TREE_VECTYPE (vnode) = vectype;
1661 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1662 /* We are always building a permutation node even if it is an identity
1663 permute to shield the rest of the vectorizer from the odd node
1664 representing an actual vector without any scalar ops.
1665 ??? We could hide it completely by making the permute node
1666 external? */
1667 node = vect_create_new_slp_node (node, stmts, 1);
1668 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1669 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1670 SLP_TREE_VECTYPE (node) = vectype;
1671 SLP_TREE_CHILDREN (node).quick_push (vnode);
1672 return node;
1673 }
1674
1675 /* Get at the operands, verifying they are compatible. */
1676 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1677 slp_oprnd_info oprnd_info;
1678 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1679 {
1680 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
1681 stmts, i, &oprnds_info);
1682 if (res != 0)
1683 matches[(res == -1) ? 0 : i] = false;
1684 if (!matches[0])
1685 break;
1686 }
1687 for (i = 0; i < group_size; ++i)
1688 if (!matches[i])
1689 {
1690 vect_free_oprnd_info (oprnds_info);
1691 return NULL;
1692 }
1693 swap = NULL;
1694
1695 auto_vec<slp_tree, 4> children;
1696
1697 stmt_info = stmts[0];
1698
1699 /* Create SLP_TREE nodes for the definition node/s. */
1700 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1701 {
1702 slp_tree child;
1703 unsigned int j;
1704
1705 /* We're skipping certain operands from processing, for example
1706 outer loop reduction initial defs. */
1707 if (skip_args[i])
1708 {
1709 children.safe_push (NULL);
1710 continue;
1711 }
1712
1713 if (oprnd_info->first_dt == vect_uninitialized_def)
1714 {
1715 /* A COND_EXPR may have one operand too many if the condition
1716 is an SSA name. */
1717 gcc_assert (i == 3 && nops == 4);
1718 continue;
1719 }
1720
1721 if (is_a <bb_vec_info> (vinfo)
1722 && oprnd_info->first_dt == vect_internal_def
1723 && !oprnd_info->any_pattern)
1724 {
1725 /* For BB vectorization, if all defs are the same do not
1726 bother to continue the build along the single-lane
1727 graph but use a splat of the scalar value. */
1728 stmt_vec_info first_def = oprnd_info->def_stmts[0];
1729 for (j = 1; j < group_size; ++j)
1730 if (oprnd_info->def_stmts[j] != first_def)
1731 break;
1732 if (j == group_size
1733 /* But avoid doing this for loads where we may be
1734 able to CSE things, unless the stmt is not
1735 vectorizable. */
1736 && (!STMT_VINFO_VECTORIZABLE (first_def)
1737 || !gimple_vuse (first_def->stmt)))
1738 {
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_NOTE, vect_location,
1741 "Using a splat of the uniform operand\n");
1742 oprnd_info->first_dt = vect_external_def;
1743 }
1744 }
1745
1746 if (oprnd_info->first_dt == vect_external_def
1747 || oprnd_info->first_dt == vect_constant_def)
1748 {
1749 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1750 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1751 oprnd_info->ops = vNULL;
1752 children.safe_push (invnode);
1753 continue;
1754 }
1755
1756 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1757 group_size, &this_max_nunits,
1758 matches, limit,
1759 &this_tree_size, bst_map)) != NULL)
1760 {
1761 oprnd_info->def_stmts = vNULL;
1762 children.safe_push (child);
1763 continue;
1764 }
1765
1766 /* If the SLP build for operand zero failed and operand zero
1767 and one can be commutated try that for the scalar stmts
1768 that failed the match. */
1769 if (i == 0
1770 /* A first scalar stmt mismatch signals a fatal mismatch. */
1771 && matches[0]
1772 /* ??? For COND_EXPRs we can swap the comparison operands
1773 as well as the arms under some constraints. */
1774 && nops == 2
1775 && oprnds_info[1]->first_dt == vect_internal_def
1776 && is_gimple_assign (stmt_info->stmt)
1777 /* Swapping operands for reductions breaks assumptions later on. */
1778 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1779 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1780 {
1781 /* See whether we can swap the matching or the non-matching
1782 stmt operands. */
1783 bool swap_not_matching = true;
1784 do
1785 {
1786 for (j = 0; j < group_size; ++j)
1787 {
1788 if (matches[j] != !swap_not_matching)
1789 continue;
1790 stmt_vec_info stmt_info = stmts[j];
1791 /* Verify if we can swap operands of this stmt. */
1792 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1793 if (!stmt
1794 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1795 {
1796 if (!swap_not_matching)
1797 goto fail;
1798 swap_not_matching = false;
1799 break;
1800 }
1801 }
1802 }
1803 while (j != group_size);
1804
1805 /* Swap mismatched definition stmts. */
1806 if (dump_enabled_p ())
1807 dump_printf_loc (MSG_NOTE, vect_location,
1808 "Re-trying with swapped operands of stmts ");
1809 for (j = 0; j < group_size; ++j)
1810 if (matches[j] == !swap_not_matching)
1811 {
1812 std::swap (oprnds_info[0]->def_stmts[j],
1813 oprnds_info[1]->def_stmts[j]);
1814 std::swap (oprnds_info[0]->ops[j],
1815 oprnds_info[1]->ops[j]);
1816 if (dump_enabled_p ())
1817 dump_printf (MSG_NOTE, "%d ", j);
1818 }
1819 if (dump_enabled_p ())
1820 dump_printf (MSG_NOTE, "\n");
1821 /* And try again with scratch 'matches' ... */
1822 bool *tem = XALLOCAVEC (bool, group_size);
1823 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1824 group_size, &this_max_nunits,
1825 tem, limit,
1826 &this_tree_size, bst_map)) != NULL)
1827 {
1828 oprnd_info->def_stmts = vNULL;
1829 children.safe_push (child);
1830 continue;
1831 }
1832 }
1833 fail:
1834
1835 /* If the SLP build failed and we analyze a basic-block
1836 simply treat nodes we fail to build as externally defined
1837 (and thus build vectors from the scalar defs).
1838 The cost model will reject outright expensive cases.
1839 ??? This doesn't treat cases where permutation ultimately
1840 fails (or we don't try permutation below). Ideally we'd
1841 even compute a permutation that will end up with the maximum
1842 SLP tree size... */
1843 if (is_a <bb_vec_info> (vinfo)
1844 /* ??? Rejecting patterns this way doesn't work. We'd have to
1845 do extra work to cancel the pattern so the uses see the
1846 scalar version. */
1847 && !is_pattern_stmt_p (stmt_info)
1848 && !oprnd_info->any_pattern)
1849 {
1850 /* But if there's a leading vector sized set of matching stmts
1851 fail here so we can split the group. This matches the condition
1852 vect_analyze_slp_instance uses. */
1853 /* ??? We might want to split here and combine the results to support
1854 multiple vector sizes better. */
1855 for (j = 0; j < group_size; ++j)
1856 if (!matches[j])
1857 break;
1858 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
1859 {
1860 if (dump_enabled_p ())
1861 dump_printf_loc (MSG_NOTE, vect_location,
1862 "Building vector operands from scalars\n");
1863 this_tree_size++;
1864 child = vect_create_new_slp_node (oprnd_info->ops);
1865 children.safe_push (child);
1866 oprnd_info->ops = vNULL;
1867 continue;
1868 }
1869 }
1870
1871 gcc_assert (child == NULL);
1872 FOR_EACH_VEC_ELT (children, j, child)
1873 if (child)
1874 vect_free_slp_tree (child);
1875 vect_free_oprnd_info (oprnds_info);
1876 return NULL;
1877 }
1878
1879 vect_free_oprnd_info (oprnds_info);
1880
1881 /* If all children of the node are built up from uniform scalars, or
1882 building its operands requires more than one possibly expensive vector
1883 construction, throw the node away so it is built up from scalars instead.
1884 The exception is the SLP node for the vector store. */
1885 if (is_a <bb_vec_info> (vinfo)
1886 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1887 /* ??? Rejecting patterns this way doesn't work. We'd have to
1888 do extra work to cancel the pattern so the uses see the
1889 scalar version. */
1890 && !is_pattern_stmt_p (stmt_info))
1891 {
1892 slp_tree child;
1893 unsigned j;
1894 bool all_uniform_p = true;
1895 unsigned n_vector_builds = 0;
1896 FOR_EACH_VEC_ELT (children, j, child)
1897 {
1898 if (!child)
1899 ;
1900 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1901 all_uniform_p = false;
1902 else if (!vect_slp_tree_uniform_p (child))
1903 {
1904 all_uniform_p = false;
1905 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
1906 n_vector_builds++;
1907 }
1908 }
1909 if (all_uniform_p
1910 || n_vector_builds > 1
1911 || (n_vector_builds == children.length ()
1912 && is_a <gphi *> (stmt_info->stmt)))
1913 {
1914 /* Roll back. */
1915 matches[0] = false;
1916 FOR_EACH_VEC_ELT (children, j, child)
1917 if (child)
1918 vect_free_slp_tree (child);
1919
1920 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_NOTE, vect_location,
1922 "Building parent vector operands from "
1923 "scalars instead\n");
1924 return NULL;
1925 }
1926 }
1927
1928 *tree_size += this_tree_size + 1;
1929 *max_nunits = this_max_nunits;
1930
1931 if (two_operators)
1932 {
1933 /* ??? We'd likely want to either cache in bst_map sth like
1934 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1935 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1936 explicit stmts to put in so the keying on 'stmts' doesn't
1937 work (but we have the same issue with nodes that use 'ops'). */
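/* Illustration (not from the sources): for a two-operator group like
     { x0 = a0 + b0, x1 = a1 - b1, x2 = a2 + b2, x3 = a3 - b3 }
   node ONE gets code PLUS_EXPR and node TWO gets MINUS_EXPR, both over
   all four lanes, and the loop below records the lane permutation
   { (0,0) (1,1) (0,2) (1,3) }, selecting the '+' result for the even
   lanes and the '-' result for the odd lanes.  */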
1938 slp_tree one = new _slp_tree;
1939 slp_tree two = new _slp_tree;
1940 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1941 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1942 SLP_TREE_VECTYPE (one) = vectype;
1943 SLP_TREE_VECTYPE (two) = vectype;
1944 SLP_TREE_CHILDREN (one).safe_splice (children);
1945 SLP_TREE_CHILDREN (two).safe_splice (children);
1946 slp_tree child;
1947 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1948 SLP_TREE_REF_COUNT (child)++;
1949
1950 /* Here we record the original defs since this
1951 node represents the final lane configuration. */
1952 node = vect_create_new_slp_node (node, stmts, 2);
1953 SLP_TREE_VECTYPE (node) = vectype;
1954 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1955 SLP_TREE_CHILDREN (node).quick_push (one);
1956 SLP_TREE_CHILDREN (node).quick_push (two);
1957 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1958 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1959 enum tree_code ocode = ERROR_MARK;
1960 stmt_vec_info ostmt_info;
1961 unsigned j = 0;
1962 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1963 {
1964 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1965 if (gimple_assign_rhs_code (ostmt) != code0)
1966 {
1967 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1968 ocode = gimple_assign_rhs_code (ostmt);
1969 j = i;
1970 }
1971 else
1972 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1973 }
1974 SLP_TREE_CODE (one) = code0;
1975 SLP_TREE_CODE (two) = ocode;
1976 SLP_TREE_LANES (one) = stmts.length ();
1977 SLP_TREE_LANES (two) = stmts.length ();
1978 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1979 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1980 return node;
1981 }
1982
1983 node = vect_create_new_slp_node (node, stmts, nops);
1984 SLP_TREE_VECTYPE (node) = vectype;
1985 SLP_TREE_CHILDREN (node).splice (children);
1986 return node;
1987 }
1988
1989 /* Dump a single SLP tree NODE. */
1990
1991 static void
1992 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1993 slp_tree node)
1994 {
1995 unsigned i, j;
1996 slp_tree child;
1997 stmt_vec_info stmt_info;
1998 tree op;
1999
2000 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2001 dump_user_location_t user_loc = loc.get_user_location ();
2002 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
2003 SLP_TREE_DEF_TYPE (node) == vect_external_def
2004 ? " (external)"
2005 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2006 ? " (constant)"
2007 : ""), node,
2008 estimated_poly_value (node->max_nunits),
2009 SLP_TREE_REF_COUNT (node));
2010 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2011 {
2012 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2013 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2014 else
2015 dump_printf_loc (metadata, user_loc, "op template: %G",
2016 SLP_TREE_REPRESENTATIVE (node)->stmt);
2017 }
2018 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2019 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2020 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2021 else
2022 {
2023 dump_printf_loc (metadata, user_loc, "\t{ ");
2024 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2025 dump_printf (metadata, "%T%s ", op,
2026 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2027 dump_printf (metadata, "}\n");
2028 }
2029 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2030 {
2031 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2032 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2033 dump_printf (dump_kind, " %u", j);
2034 dump_printf (dump_kind, " }\n");
2035 }
2036 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2037 {
2038 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2039 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2040 dump_printf (dump_kind, " %u[%u]",
2041 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2042 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2043 dump_printf (dump_kind, " }\n");
2044 }
2045 if (SLP_TREE_CHILDREN (node).is_empty ())
2046 return;
2047 dump_printf_loc (metadata, user_loc, "\tchildren");
2048 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2049 dump_printf (dump_kind, " %p", (void *)child);
2050 dump_printf (dump_kind, "\n");
2051 }
2052
2053 DEBUG_FUNCTION void
2054 debug (slp_tree node)
2055 {
2056 debug_dump_context ctx;
2057 vect_print_slp_tree (MSG_NOTE,
2058 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2059 node);
2060 }
2061
2062 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2063
2064 static void
2065 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2066 slp_tree node, hash_set<slp_tree> &visited)
2067 {
2068 unsigned i;
2069 slp_tree child;
2070
2071 if (visited.add (node))
2072 return;
2073
2074 vect_print_slp_tree (dump_kind, loc, node);
2075
2076 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2077 if (child)
2078 vect_print_slp_graph (dump_kind, loc, child, visited);
2079 }
2080
2081 static void
2082 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2083 slp_tree entry)
2084 {
2085 hash_set<slp_tree> visited;
2086 vect_print_slp_graph (dump_kind, loc, entry, visited);
2087 }
2088
2089 /* Mark the tree rooted at NODE with PURE_SLP. */
2090
2091 static void
2092 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2093 {
2094 int i;
2095 stmt_vec_info stmt_info;
2096 slp_tree child;
2097
2098 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2099 return;
2100
2101 if (visited.add (node))
2102 return;
2103
2104 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2105 STMT_SLP_TYPE (stmt_info) = pure_slp;
2106
2107 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2108 if (child)
2109 vect_mark_slp_stmts (child, visited);
2110 }
2111
2112 static void
2113 vect_mark_slp_stmts (slp_tree node)
2114 {
2115 hash_set<slp_tree> visited;
2116 vect_mark_slp_stmts (node, visited);
2117 }
2118
2119 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2120
2121 static void
2122 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2123 {
2124 int i;
2125 stmt_vec_info stmt_info;
2126 slp_tree child;
2127
2128 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2129 return;
2130
2131 if (visited.add (node))
2132 return;
2133
2134 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2135 {
2136 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2137 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2138 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2139 }
2140
2141 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2142 if (child)
2143 vect_mark_slp_stmts_relevant (child, visited);
2144 }
2145
2146 static void
2147 vect_mark_slp_stmts_relevant (slp_tree node)
2148 {
2149 hash_set<slp_tree> visited;
2150 vect_mark_slp_stmts_relevant (node, visited);
2151 }
2152
2153
2154 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2155
2156 static void
2157 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2158 hash_set<slp_tree> &visited)
2159 {
2160 if (!node || visited.add (node))
2161 return;
2162
2163 if (SLP_TREE_CHILDREN (node).length () == 0)
2164 {
2165 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2166 return;
2167 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2168 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2169 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2170 loads.safe_push (node);
2171 }
2172 else
2173 {
2174 unsigned i;
2175 slp_tree child;
2176 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2177 vect_gather_slp_loads (loads, child, visited);
2178 }
2179 }
2180
2181
2182 /* Find the last scalar stmt in NODE. */
2183
2184 stmt_vec_info
2185 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2186 {
2187 stmt_vec_info last = NULL;
2188 stmt_vec_info stmt_vinfo;
2189
2190 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2191 {
2192 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2193 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2194 }
2195
2196 return last;
2197 }
2198
2199 /* Find the first stmt in NODE. */
2200
2201 stmt_vec_info
2202 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2203 {
2204 stmt_vec_info first = NULL;
2205 stmt_vec_info stmt_vinfo;
2206
2207 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2208 {
2209 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2210 if (!first
2211 || get_later_stmt (stmt_vinfo, first) == first)
2212 first = stmt_vinfo;
2213 }
2214
2215 return first;
2216 }
2217
2218 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2219 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2220 (also containing the first GROUP1_SIZE stmts, since stores are
2221 consecutive), the second containing the remainder.
2222 Return the first stmt in the second group. */
2223
2224 static stmt_vec_info
2225 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2226 {
2227 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2228 gcc_assert (group1_size > 0);
2229 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2230 gcc_assert (group2_size > 0);
2231 DR_GROUP_SIZE (first_vinfo) = group1_size;
2232
2233 stmt_vec_info stmt_info = first_vinfo;
2234 for (unsigned i = group1_size; i > 1; i--)
2235 {
2236 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2237 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2238 }
2239 /* STMT is now the last element of the first group. */
2240 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2241 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2242
2243 DR_GROUP_SIZE (group2) = group2_size;
2244 for (stmt_info = group2; stmt_info;
2245 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2246 {
2247 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2248 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2249 }
2250
2251 /* For the second group, the DR_GROUP_GAP is that before the original group,
2252 plus skipping over the first group. */
2253 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2254
2255 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2256 DR_GROUP_GAP (first_vinfo) += group2_size;
2257
2258 if (dump_enabled_p ())
2259 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2260 group1_size, group2_size);
2261
2262 return group2;
2263 }
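/* Worked example (illustration only): splitting a store group of size 7
   with GROUP1_SIZE == 4 leaves the first group with DR_GROUP_SIZE 4 and
   creates a second group with DR_GROUP_SIZE 3.  The second group's
   DR_GROUP_GAP becomes the original group's gap plus 4 (skipping over
   the first group), and the first group's DR_GROUP_GAP grows by 3 so it
   skips over the second group.  */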
2264
2265 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2266 statements and a vector of NUNITS elements. */
2267
2268 static poly_uint64
2269 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2270 {
2271 return exact_div (common_multiple (nunits, group_size), group_size);
2272 }
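/* Worked example (illustration only): with NUNITS == 4 and
   GROUP_SIZE == 6 the common multiple is 12 and the unrolling factor
   is 12 / 6 == 2; with GROUP_SIZE == 8 it is 8 / 8 == 1 since the
   group already fills whole vectors.  */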
2273
2274 /* Helper that checks to see if a node is a load node. */
2275
2276 static inline bool
2277 vect_is_slp_load_node (slp_tree root)
2278 {
2279 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2280 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2281 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2282 }
2283
2284
2285 /* Helper function of optimize_load_redistribution that performs the operation
2286 recursively. */
2287
2288 static slp_tree
2289 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2290 vec_info *vinfo, unsigned int group_size,
2291 hash_map<slp_tree, slp_tree> *load_map,
2292 slp_tree root)
2293 {
2294 if (slp_tree *leader = load_map->get (root))
2295 return *leader;
2296
2297 slp_tree node;
2298 unsigned i;
2299
2300 /* For now, we don't know anything about externals so do not do anything. */
2301 if (SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2302 return NULL;
2303 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2304 {
2305 /* First convert this node into a load node and add it to the leaves
2306 list, flattening the permute from a lane permutation into a load
2307 permutation. If it's unneeded it will be elided later. */
2308 vec<stmt_vec_info> stmts;
2309 stmts.create (SLP_TREE_LANES (root));
2310 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2311 for (unsigned j = 0; j < lane_perm.length (); j++)
2312 {
2313 std::pair<unsigned, unsigned> perm = lane_perm[j];
2314 node = SLP_TREE_CHILDREN (root)[perm.first];
2315
2316 if (!vect_is_slp_load_node (node)
2317 || SLP_TREE_CHILDREN (node).exists ())
2318 {
2319 stmts.release ();
2320 goto next;
2321 }
2322
2323 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2324 }
2325
2326 if (dump_enabled_p ())
2327 dump_printf_loc (MSG_NOTE, vect_location,
2328 "converting stmts on permute node %p\n", root);
2329
2330 bool *matches = XALLOCAVEC (bool, group_size);
2331 poly_uint64 max_nunits = 1;
2332 unsigned tree_size = 0, limit = 1;
2333 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2334 matches, &limit, &tree_size, bst_map);
2335 if (!node)
2336 stmts.release ();
2337
2338 load_map->put (root, node);
2339 return node;
2340 }
2341
2342 next:
2343 load_map->put (root, NULL);
2344
2345 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2346 {
2347 slp_tree value
2348 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2349 node);
2350 if (value)
2351 {
2352 SLP_TREE_REF_COUNT (value)++;
2353 SLP_TREE_CHILDREN (root)[i] = value;
2354 /* ??? We know the original leafs of the replaced nodes will
2355 be referenced by bst_map, only the permutes created by
2356 pattern matching are not. */
2357 if (SLP_TREE_REF_COUNT (node) == 1)
2358 load_map->remove (node);
2359 vect_free_slp_tree (node);
2360 }
2361 }
2362
2363 return NULL;
2364 }
2365
2366 /* Temporary workaround for loads not being CSEd during SLP build. This
2367 function will traverse the SLP tree rooted in ROOT and find
2368 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2369 same DR such that the final operation is equal to a permuted load. Such
2370 NODES are then directly converted into LOADS themselves. The nodes are
2371 CSEd using BST_MAP. */
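/* Illustration (not from the sources): a VEC_PERM node with lane
   permutation { (0,0) (1,0) (0,1) (1,1) } whose two children are load
   nodes reading { a[0], a[1] } and { a[2], a[3] } from the same DR is
   rebuilt (via vect_build_slp_tree on the gathered scalar stmts
   { a[0], a[2], a[1], a[3] }) as a single, possibly permuted, load
   node.  */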
2372
2373 static void
2374 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2375 vec_info *vinfo, unsigned int group_size,
2376 hash_map<slp_tree, slp_tree> *load_map,
2377 slp_tree root)
2378 {
2379 slp_tree node;
2380 unsigned i;
2381
2382 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2383 {
2384 slp_tree value
2385 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2386 node);
2387 if (value)
2388 {
2389 SLP_TREE_REF_COUNT (value)++;
2390 SLP_TREE_CHILDREN (root)[i] = value;
2391 /* ??? We know the original leafs of the replaced nodes will
2392 be referenced by bst_map, only the permutes created by
2393 pattern matching are not. */
2394 if (SLP_TREE_REF_COUNT (node) == 1)
2395 load_map->remove (node);
2396 vect_free_slp_tree (node);
2397 }
2398 }
2399 }
2400
2401 /* Helper function of vect_match_slp_patterns.
2402
2403 Attempts to match patterns against the slp tree rooted in REF_NODE using
2404 VINFO. Patterns are matched in post-order traversal.
2405
2406 If any pattern matches, the node referenced by REF_NODE is updated in
2407 place and true is returned; otherwise false is returned. */
2408
2409 static bool
2410 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2411 slp_tree_to_load_perm_map_t *perm_cache,
2412 hash_set<slp_tree> *visited)
2413 {
2414 unsigned i;
2415 slp_tree node = *ref_node;
2416 bool found_p = false;
2417 if (!node || visited->add (node))
2418 return false;
2419
2420 slp_tree child;
2421 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2422 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2423 vinfo, perm_cache, visited);
2424
2425 for (unsigned x = 0; x < num__slp_patterns; x++)
2426 {
2427 vect_pattern *pattern = slp_patterns[x] (perm_cache, ref_node);
2428 if (pattern)
2429 {
2430 pattern->build (vinfo);
2431 delete pattern;
2432 found_p = true;
2433 }
2434 }
2435
2436 return found_p;
2437 }
2438
2439 /* Applies pattern matching to the SLP tree of INSTANCE using
2440 vec_info VINFO.
2441 
2442 Returns true if any pattern matched. Patterns are tried in order and
2443 multiple patterns may match. */
2444
2445 static bool
2446 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2447 hash_set<slp_tree> *visited,
2448 slp_tree_to_load_perm_map_t *perm_cache)
2449 {
2450 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2451 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2452
2453 if (dump_enabled_p ())
2454 dump_printf_loc (MSG_NOTE, vect_location,
2455 "Analyzing SLP tree %p for patterns\n",
2456 SLP_INSTANCE_TREE (instance));
2457
2458 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
2459 }
2460
2461 /* Analyze an SLP instance starting from a group of grouped stores. Call
2462 vect_build_slp_tree to build a tree of packed stmts if possible.
2463 Return FALSE if it's impossible to SLP any stmt in the loop. */
2464
2465 static bool
2466 vect_analyze_slp_instance (vec_info *vinfo,
2467 scalar_stmts_to_slp_tree_map_t *bst_map,
2468 stmt_vec_info stmt_info, slp_instance_kind kind,
2469 unsigned max_tree_size, unsigned *limit);
2470
2471 /* Analyze an SLP instance starting from SCALAR_STMTS, which form a group
2472 of kind KIND. Return true if successful. */
2473
2474 static bool
2475 vect_build_slp_instance (vec_info *vinfo,
2476 slp_instance_kind kind,
2477 vec<stmt_vec_info> &scalar_stmts,
2478 stmt_vec_info root_stmt_info,
2479 unsigned max_tree_size, unsigned *limit,
2480 scalar_stmts_to_slp_tree_map_t *bst_map,
2481 /* ??? We need stmt_info for group splitting. */
2482 stmt_vec_info stmt_info_)
2483 {
2484 if (dump_enabled_p ())
2485 {
2486 dump_printf_loc (MSG_NOTE, vect_location,
2487 "Starting SLP discovery for\n");
2488 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
2489 dump_printf_loc (MSG_NOTE, vect_location,
2490 " %G", scalar_stmts[i]->stmt);
2491 }
2492
2493 /* Build the tree for the SLP instance. */
2494 unsigned int group_size = scalar_stmts.length ();
2495 bool *matches = XALLOCAVEC (bool, group_size);
2496 poly_uint64 max_nunits = 1;
2497 unsigned tree_size = 0;
2498 unsigned i;
2499 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2500 &max_nunits, matches, limit,
2501 &tree_size, bst_map);
2502 if (node != NULL)
2503 {
2504 /* Calculate the unrolling factor based on the smallest type. */
2505 poly_uint64 unrolling_factor
2506 = calculate_unrolling_factor (max_nunits, group_size);
2507
2508 if (maybe_ne (unrolling_factor, 1U)
2509 && is_a <bb_vec_info> (vinfo))
2510 {
2511 unsigned HOST_WIDE_INT const_max_nunits;
2512 if (!max_nunits.is_constant (&const_max_nunits)
2513 || const_max_nunits > group_size)
2514 {
2515 if (dump_enabled_p ())
2516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2517 "Build SLP failed: store group "
2518 "size not a multiple of the vector size "
2519 "in basic block SLP\n");
2520 vect_free_slp_tree (node);
2521 return false;
2522 }
2523 /* Fatal mismatch. */
2524 if (dump_enabled_p ())
2525 dump_printf_loc (MSG_NOTE, vect_location,
2526 "SLP discovery succeeded but node needs "
2527 "splitting\n");
2528 memset (matches, true, group_size);
2529 matches[group_size / const_max_nunits * const_max_nunits] = false;
2530 vect_free_slp_tree (node);
2531 }
2532 else
2533 {
2534 /* Create a new SLP instance. */
2535 slp_instance new_instance = XNEW (class _slp_instance);
2536 SLP_INSTANCE_TREE (new_instance) = node;
2537 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2538 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2539 SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
2540 SLP_INSTANCE_KIND (new_instance) = kind;
2541 new_instance->reduc_phis = NULL;
2542 new_instance->cost_vec = vNULL;
2543 new_instance->subgraph_entries = vNULL;
2544
2545 if (dump_enabled_p ())
2546 dump_printf_loc (MSG_NOTE, vect_location,
2547 "SLP size %u vs. limit %u.\n",
2548 tree_size, max_tree_size);
2549
2550 /* Fixup SLP reduction chains. */
2551 if (kind == slp_inst_kind_reduc_chain)
2552 {
2553 /* If this is a reduction chain with a conversion in front
2554 amend the SLP tree with a node for that. */
2555 gimple *scalar_def
2556 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2557 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
2558 {
2559 /* Get at the conversion stmt - we know it's the single use
2560 of the last stmt of the reduction chain. */
2561 use_operand_p use_p;
2562 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
2563 &use_p, &scalar_def);
2564 gcc_assert (r);
2565 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
2566 next_info = vect_stmt_to_vectorize (next_info);
2567 scalar_stmts = vNULL;
2568 scalar_stmts.create (group_size);
2569 for (unsigned i = 0; i < group_size; ++i)
2570 scalar_stmts.quick_push (next_info);
2571 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2572 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2573 SLP_TREE_CHILDREN (conv).quick_push (node);
2574 SLP_INSTANCE_TREE (new_instance) = conv;
2575 /* We also have to fake this conversion stmt as SLP reduction
2576 group so we don't have to mess with too much code
2577 elsewhere. */
2578 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2579 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2580 }
2581 /* Fill the backedge child of the PHI SLP node. The
2582 general matching code cannot find it because the
2583 scalar code does not reflect how we vectorize the
2584 reduction. */
2585 use_operand_p use_p;
2586 imm_use_iterator imm_iter;
2587 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
2588 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
2589 gimple_get_lhs (scalar_def))
2590 /* There are exactly two non-debug uses, the reduction
2591 PHI and the loop-closed PHI node. */
2592 if (!is_gimple_debug (USE_STMT (use_p))
2593 && gimple_bb (USE_STMT (use_p)) == loop->header)
2594 {
2595 auto_vec<stmt_vec_info, 64> phis (group_size);
2596 stmt_vec_info phi_info
2597 = vinfo->lookup_stmt (USE_STMT (use_p));
2598 for (unsigned i = 0; i < group_size; ++i)
2599 phis.quick_push (phi_info);
2600 slp_tree *phi_node = bst_map->get (phis);
2601 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
2602 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
2603 = SLP_INSTANCE_TREE (new_instance);
2604 SLP_INSTANCE_TREE (new_instance)->refcnt++;
2605 }
2606 }
2607
2608 vinfo->slp_instances.safe_push (new_instance);
2609
2610 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2611 the number of scalar stmts in the root in a few places.
2612 Verify that assumption holds. */
2613 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2614 .length () == group_size);
2615
2616 if (dump_enabled_p ())
2617 {
2618 dump_printf_loc (MSG_NOTE, vect_location,
2619 "Final SLP tree for instance %p:\n", new_instance);
2620 vect_print_slp_graph (MSG_NOTE, vect_location,
2621 SLP_INSTANCE_TREE (new_instance));
2622 }
2623
2624 return true;
2625 }
2626 }
2627 else
2628 {
2629 /* Failed to SLP. */
2630 /* Free the allocated memory. */
2631 scalar_stmts.release ();
2632 }
2633
2634 stmt_vec_info stmt_info = stmt_info_;
2635 /* Try to break the group up into pieces. */
2636 if (kind == slp_inst_kind_store)
2637 {
2638 /* ??? We could delay all the actual splitting of store-groups
2639 until after SLP discovery of the original group completed.
2640 Then we can recurse to vect_build_slp_instance directly. */
2641 for (i = 0; i < group_size; i++)
2642 if (!matches[i])
2643 break;
2644
2645 /* For basic block SLP, try to break the group up into multiples of
2646 a vector size. */
2647 if (is_a <bb_vec_info> (vinfo)
2648 && (i > 1 && i < group_size))
2649 {
2650 tree scalar_type
2651 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2652 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2653 1 << floor_log2 (i));
2654 unsigned HOST_WIDE_INT const_nunits;
2655 if (vectype
2656 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2657 {
2658 /* Split into two groups at the first vector boundary. */
2659 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2660 unsigned group1_size = i & ~(const_nunits - 1);
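/* Example (illustration only): if the first mismatch is at i == 6 and
   const_nunits == 4, group1_size becomes 6 & ~3 == 4, i.e. we split
   off one full vector's worth of stores first.  */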
2661
2662 if (dump_enabled_p ())
2663 dump_printf_loc (MSG_NOTE, vect_location,
2664 "Splitting SLP group at stmt %u\n", i);
2665 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2666 group1_size);
2667 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2668 kind, max_tree_size,
2669 limit);
2670 /* Split the rest at the failure point and possibly
2671 re-analyze the remaining matching part if it has
2672 at least two lanes. */
2673 if (group1_size < i
2674 && (i + 1 < group_size
2675 || i - group1_size > 1))
2676 {
2677 stmt_vec_info rest2 = rest;
2678 rest = vect_split_slp_store_group (rest, i - group1_size);
2679 if (i - group1_size > 1)
2680 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
2681 kind, max_tree_size,
2682 limit);
2683 }
2684 /* Re-analyze the non-matching tail if it has at least
2685 two lanes. */
2686 if (i + 1 < group_size)
2687 res |= vect_analyze_slp_instance (vinfo, bst_map,
2688 rest, kind, max_tree_size,
2689 limit);
2690 return res;
2691 }
2692 }
2693
2694 /* For loop vectorization split into arbitrary pieces of size > 1. */
2695 if (is_a <loop_vec_info> (vinfo)
2696 && (i > 1 && i < group_size))
2697 {
2698 unsigned group1_size = i;
2699
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_NOTE, vect_location,
2702 "Splitting SLP group at stmt %u\n", i);
2703
2704 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2705 group1_size);
2706 /* Loop vectorization cannot handle gaps in stores, make sure
2707 the split group appears as strided. */
2708 STMT_VINFO_STRIDED_P (rest) = 1;
2709 DR_GROUP_GAP (rest) = 0;
2710 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2711 DR_GROUP_GAP (stmt_info) = 0;
2712
2713 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2714 kind, max_tree_size, limit);
2715 if (i + 1 < group_size)
2716 res |= vect_analyze_slp_instance (vinfo, bst_map,
2717 rest, kind, max_tree_size, limit);
2718
2719 return res;
2720 }
2721
2722 /* Even though the first vector did not all match, we might be able to SLP
2723 (some) of the remainder. FORNOW ignore this possibility. */
2724 }
2725
2726 /* Failed to SLP. */
2727 if (dump_enabled_p ())
2728 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2729 return false;
2730 }
2731
2732
2733 /* Analyze an SLP instance starting from a group of grouped stores. Call
2734 vect_build_slp_tree to build a tree of packed stmts if possible.
2735 Return FALSE if it's impossible to SLP any stmt in the loop. */
2736
2737 static bool
2738 vect_analyze_slp_instance (vec_info *vinfo,
2739 scalar_stmts_to_slp_tree_map_t *bst_map,
2740 stmt_vec_info stmt_info,
2741 slp_instance_kind kind,
2742 unsigned max_tree_size, unsigned *limit)
2743 {
2744 unsigned int i;
2745 vec<stmt_vec_info> scalar_stmts;
2746
2747 if (is_a <bb_vec_info> (vinfo))
2748 vect_location = stmt_info->stmt;
2749
2750 stmt_vec_info next_info = stmt_info;
2751 if (kind == slp_inst_kind_store)
2752 {
2753 /* Collect the stores and store them in scalar_stmts. */
2754 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
2755 while (next_info)
2756 {
2757 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2758 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2759 }
2760 }
2761 else if (kind == slp_inst_kind_reduc_chain)
2762 {
2763 /* Collect the reduction stmts and store them in scalar_stmts. */
2764 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
2765 while (next_info)
2766 {
2767 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2768 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2769 }
2770 /* Mark the first element of the reduction chain as reduction to properly
2771 transform the node. In the reduction analysis phase only the last
2772 element of the chain is marked as reduction. */
2773 STMT_VINFO_DEF_TYPE (stmt_info)
2774 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2775 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2776 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2777 }
2778 else if (kind == slp_inst_kind_ctor)
2779 {
2780 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2781 tree val;
2782 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
2783 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2784 {
2785 stmt_vec_info def_info = vinfo->lookup_def (val);
2786 def_info = vect_stmt_to_vectorize (def_info);
2787 scalar_stmts.quick_push (def_info);
2788 }
2789 if (dump_enabled_p ())
2790 dump_printf_loc (MSG_NOTE, vect_location,
2791 "Analyzing vectorizable constructor: %G\n",
2792 stmt_info->stmt);
2793 }
2794 else if (kind == slp_inst_kind_reduc_group)
2795 {
2796 /* Collect reduction statements. */
2797 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2798 scalar_stmts.create (reductions.length ());
2799 for (i = 0; reductions.iterate (i, &next_info); i++)
2800 if (STMT_VINFO_RELEVANT_P (next_info)
2801 || STMT_VINFO_LIVE_P (next_info))
2802 scalar_stmts.quick_push (next_info);
2803 /* If fewer than two were relevant/live there's nothing to SLP. */
2804 if (scalar_stmts.length () < 2)
2805 return false;
2806 }
2807 else
2808 gcc_unreachable ();
2809
2810 /* Build the tree for the SLP instance. */
2811 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
2812 kind == slp_inst_kind_ctor
2813 ? stmt_info : NULL,
2814 max_tree_size, limit, bst_map,
2815 kind == slp_inst_kind_store
2816 ? stmt_info : NULL);
2817
2818 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2819 where we should do store group splitting. */
2820
2821 return res;
2822 }
2823
2824 /* Check if there are stmts in the loop that can be vectorized using SLP.
2825 Build SLP trees of packed scalar stmts if SLP is possible. */
2826
2827 opt_result
2828 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2829 {
2830 unsigned int i;
2831 stmt_vec_info first_element;
2832 slp_instance instance;
2833
2834 DUMP_VECT_SCOPE ("vect_analyze_slp");
2835
2836 unsigned limit = max_tree_size;
2837
2838 scalar_stmts_to_slp_tree_map_t *bst_map
2839 = new scalar_stmts_to_slp_tree_map_t ();
2840
2841 /* Find SLP sequences starting from groups of grouped stores. */
2842 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2843 vect_analyze_slp_instance (vinfo, bst_map, first_element,
2844 STMT_VINFO_GROUPED_ACCESS (first_element)
2845 ? slp_inst_kind_store : slp_inst_kind_ctor,
2846 max_tree_size, &limit);
2847
2848 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
2849 {
2850 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
2851 {
2852 vect_location = bb_vinfo->roots[i].root->stmt;
2853 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
2854 bb_vinfo->roots[i].stmts,
2855 bb_vinfo->roots[i].root,
2856 max_tree_size, &limit, bst_map, NULL))
2857 bb_vinfo->roots[i].stmts = vNULL;
2858 }
2859 }
2860
2861 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2862 {
2863 /* Find SLP sequences starting from reduction chains. */
2864 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2865 if (! STMT_VINFO_RELEVANT_P (first_element)
2866 && ! STMT_VINFO_LIVE_P (first_element))
2867 ;
2868 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2869 slp_inst_kind_reduc_chain,
2870 max_tree_size, &limit))
2871 {
2872 /* Dissolve reduction chain group. */
2873 stmt_vec_info vinfo = first_element;
2874 stmt_vec_info last = NULL;
2875 while (vinfo)
2876 {
2877 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2878 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2879 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2880 last = vinfo;
2881 vinfo = next;
2882 }
2883 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2884 /* It can be still vectorized as part of an SLP reduction. */
2885 loop_vinfo->reductions.safe_push (last);
2886 }
2887
2888 /* Find SLP sequences starting from groups of reductions. */
2889 if (loop_vinfo->reductions.length () > 1)
2890 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2891 slp_inst_kind_reduc_group, max_tree_size,
2892 &limit);
2893 }
2894
2895 hash_set<slp_tree> visited_patterns;
2896 slp_tree_to_load_perm_map_t perm_cache;
2897
2898 /* See if any patterns can be found in the SLP tree. */
2899 bool pattern_found = false;
2900 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
2901 pattern_found |= vect_match_slp_patterns (instance, vinfo,
2902 &visited_patterns, &perm_cache);
2903
2904 /* If any were found optimize permutations of loads. */
2905 if (pattern_found)
2906 {
2907 hash_map<slp_tree, slp_tree> load_map;
2908 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
2909 {
2910 slp_tree root = SLP_INSTANCE_TREE (instance);
2911 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
2912 &load_map, root);
2913 }
2914 }
2915
2916
2917
2918 /* The map keeps a reference on SLP nodes built, release that. */
2919 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2920 it != bst_map->end (); ++it)
2921 if ((*it).second)
2922 vect_free_slp_tree ((*it).second);
2923 delete bst_map;
2924
2925 if (pattern_found && dump_enabled_p ())
2926 {
2927 dump_printf_loc (MSG_NOTE, vect_location,
2928 "Pattern matched SLP tree\n");
2929 hash_set<slp_tree> visited;
2930 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
2931 vect_print_slp_graph (MSG_NOTE, vect_location,
2932 SLP_INSTANCE_TREE (instance), visited);
2933 }
2934
2935 return opt_result::success ();
2936 }
2937
2938 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2939
2940 static void
2941 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2942 vec<slp_tree> &vertices, vec<int> &leafs)
2943 {
2944 unsigned i;
2945 slp_tree child;
2946
2947 if (visited.add (node))
2948 return;
2949
2950 node->vertex = vertices.length ();
2951 vertices.safe_push (node);
2952
2953 bool leaf = true;
2954 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2955 if (child)
2956 {
2957 leaf = false;
2958 vect_slp_build_vertices (visited, child, vertices, leafs);
2959 }
2960 if (leaf)
2961 leafs.safe_push (node->vertex);
2962 }
2963
2964 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2965
2966 static void
2967 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2968 vec<int> &leafs)
2969 {
2970 hash_set<slp_tree> visited;
2971 unsigned i;
2972 slp_instance instance;
2973 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2974 {
2975 unsigned n_v = vertices.length ();
2976 unsigned n_l = leafs.length ();
2977 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2978 leafs);
2979 /* If we added vertices but no entries to the reverse graph we've
2980 added a cycle that is not backwards-reachable. Push the entry
2981 then so it mimics a leaf. */
2982 if (vertices.length () > n_v
2983 && leafs.length () == n_l)
2984 leafs.safe_push (SLP_INSTANCE_TREE (instance)->vertex);
2985 }
2986 }
2987
2988 /* Apply (reverse) bijective PERM to VEC. */
2989
2990 template <class T>
2991 static void
2992 vect_slp_permute (vec<unsigned> perm,
2993 vec<T> &vec, bool reverse)
2994 {
2995 auto_vec<T, 64> saved;
2996 saved.create (vec.length ());
2997 for (unsigned i = 0; i < vec.length (); ++i)
2998 saved.quick_push (vec[i]);
2999
3000 if (reverse)
3001 {
3002 for (unsigned i = 0; i < vec.length (); ++i)
3003 vec[perm[i]] = saved[i];
3004 for (unsigned i = 0; i < vec.length (); ++i)
3005 gcc_assert (vec[perm[i]] == saved[i]);
3006 }
3007 else
3008 {
3009 for (unsigned i = 0; i < vec.length (); ++i)
3010 vec[i] = saved[perm[i]];
3011 for (unsigned i = 0; i < vec.length (); ++i)
3012 gcc_assert (vec[i] == saved[perm[i]]);
3013 }
3014 }
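/* Worked example (illustration only): with PERM == { 2, 0, 1 } and
   VEC == { a, b, c }, the forward application (vec[i] = saved[perm[i]])
   yields { c, a, b } while the reverse application
   (vec[perm[i]] = saved[i]) yields { b, c, a }.

   Stand-alone sketch of the same operation using std::vector; this is
   an illustration only, not part of GCC, and apply_perm is a
   hypothetical name.  */
#if 0
#include <vector>

template <class T>
static void
apply_perm (const std::vector<unsigned> &perm, std::vector<T> &vec,
	    bool reverse)
{
  std::vector<T> saved (vec);	/* Keep the original contents.  */
  for (unsigned i = 0; i < vec.size (); ++i)
    {
      if (reverse)
	vec[perm[i]] = saved[i];	/* Undo a forward application.  */
      else
	vec[i] = saved[perm[i]];	/* Apply the permutation.  */
    }
}
#endif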
3015
3016 /* Return whether permutations PERM_A and PERM_B as recorded in the
3017 PERMS vector are equal. */
3018
3019 static bool
3020 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
3021 int perm_a, int perm_b)
3022 {
3023 return (perm_a == perm_b
3024 || (perms[perm_a].length () == perms[perm_b].length ()
3025 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
3026 sizeof (unsigned) * perms[perm_a].length ()) == 0));
3027 }
3028
3029 /* Optimize the SLP graph of VINFO. */
3030
3031 void
3032 vect_optimize_slp (vec_info *vinfo)
3033 {
3034 if (vinfo->slp_instances.is_empty ())
3035 return;
3036
3037 slp_tree node;
3038 unsigned i;
3039 auto_vec<slp_tree> vertices;
3040 auto_vec<int> leafs;
3041 vect_slp_build_vertices (vinfo, vertices, leafs);
3042
3043 struct graph *slpg = new_graph (vertices.length ());
3044 FOR_EACH_VEC_ELT (vertices, i, node)
3045 {
3046 unsigned j;
3047 slp_tree child;
3048 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3049 if (child)
3050 add_edge (slpg, i, child->vertex);
3051 }
3052
3053 /* Compute (reverse) postorder on the inverted graph. */
3054 auto_vec<int> ipo;
3055 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3056
3057 auto_sbitmap n_visited (vertices.length ());
3058 auto_sbitmap n_materialize (vertices.length ());
3059 auto_vec<int> n_perm (vertices.length ());
3060 auto_vec<vec<unsigned> > perms;
3061
3062 bitmap_clear (n_visited);
3063 bitmap_clear (n_materialize);
3064 n_perm.quick_grow_cleared (vertices.length ());
3065 perms.safe_push (vNULL); /* zero is no permute */
3066
3067 /* Produce initial permutations. */
3068 for (i = 0; i < leafs.length (); ++i)
3069 {
3070 int idx = leafs[i];
3071 slp_tree node = vertices[idx];
3072
3073 /* Handle externals and constants optimistically throughout the
3074 iteration. */
3075 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3076 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3077 continue;
3078
3079 /* Leafs do not change across iterations. Note leafs also double
3080 as entries to the reverse graph. */
3081 if (!slpg->vertices[idx].succ)
3082 bitmap_set_bit (n_visited, idx);
3083 /* Loads are the only thing generating permutes. */
3084 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3085 continue;
3086
3087 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3088 node unpermuted, record this permute. */
3089 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3090 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3091 continue;
3092 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3093 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3094 bool any_permute = false;
3095 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3096 {
3097 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3098 imin = MIN (imin, idx);
3099 imax = MAX (imax, idx);
3100 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3101 any_permute = true;
3102 }
3103 /* If there's no permute no need to split one out. */
3104 if (!any_permute)
3105 continue;
3106 /* If the span doesn't match we'd disrupt VF computation, avoid
3107 that for now. */
3108 if (imax - imin + 1 != SLP_TREE_LANES (node))
3109 continue;
3110
3111 /* For now only handle true permutes, like
3112 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3113 when permuting constants and invariants keeping the permute
3114 bijective. */
3115 auto_sbitmap load_index (SLP_TREE_LANES (node));
3116 bitmap_clear (load_index);
3117 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3118 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3119 unsigned j;
3120 for (j = 0; j < SLP_TREE_LANES (node); ++j)
3121 if (!bitmap_bit_p (load_index, j))
3122 break;
3123 if (j != SLP_TREE_LANES (node))
3124 continue;
3125
3126 vec<unsigned> perm = vNULL;
3127 perm.safe_grow (SLP_TREE_LANES (node), true);
3128 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3129 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
3130 perms.safe_push (perm);
3131 n_perm[idx] = perms.length () - 1;
3132 }
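/* Illustration (not from the sources): a load node with load
   permutation { 5, 4, 7, 6 } out of a group of size 8 has imin == 4
   and imax == 7, so its span of 4 matches its 4 lanes; the permutation
   is bijective and the seed permutation recorded relative to imin is
   { 1, 0, 3, 2 }.  */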
3133
3134 /* Propagate permutes along the graph and compute materialization points. */
3135 bool changed;
3136 unsigned iteration = 0;
3137 do
3138 {
3139 changed = false;
3140 ++iteration;
3141
3142 for (i = vertices.length (); i > 0 ; --i)
3143 {
3144 int idx = ipo[i-1];
3145 slp_tree node = vertices[idx];
3146 /* For leafs there's nothing to do - we've seeded permutes
3147 on those above. */
3148 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3149 continue;
3150
3151 bitmap_set_bit (n_visited, idx);
3152
3153 /* We cannot move a permute across a store. */
3154 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
3155 && DR_IS_WRITE
3156 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
3157 continue;
3158
3159 int perm = -1;
3160 for (graph_edge *succ = slpg->vertices[idx].succ;
3161 succ; succ = succ->succ_next)
3162 {
3163 int succ_idx = succ->dest;
3164 /* Handle unvisited nodes optimistically. */
3165 /* ??? But for constants once we want to handle non-bijective
3166 permutes we have to verify the permute, when unifying lanes,
3167 will not unify different constants. For example see
3168 gcc.dg/vect/bb-slp-14.c for a case that would break. */
3169 if (!bitmap_bit_p (n_visited, succ_idx))
3170 continue;
3171 int succ_perm = n_perm[succ_idx];
3172 /* Once we materialize the successor's permutation its output lanes
3173 appear unpermuted to us. */
3174 if (bitmap_bit_p (n_materialize, succ_idx))
3175 succ_perm = 0;
3176 if (perm == -1)
3177 perm = succ_perm;
3178 else if (succ_perm == 0)
3179 {
3180 perm = 0;
3181 break;
3182 }
3183 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
3184 {
3185 perm = 0;
3186 break;
3187 }
3188 }
3189
3190 if (perm == -1)
3191 /* Pick up pre-computed leaf values. */
3192 perm = n_perm[idx];
3193 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
3194 {
3195 if (iteration > 1)
3196 /* Make sure we eventually converge. */
3197 gcc_checking_assert (perm == 0);
3198 n_perm[idx] = perm;
3199 if (perm == 0)
3200 bitmap_clear_bit (n_materialize, idx);
3201 changed = true;
3202 }
3203
3204 if (perm == 0)
3205 continue;
3206
3207 /* Elide pruning at materialization points in the first
3208 iteration so that every node is visited at least once. */
3209 if (iteration == 1)
3210 continue;
3211
3212 /* Decide on permute materialization. Look whether there's
3213 a use (pred) edge that is permuted differently than us.
3214 In that case mark ourselves so the permutation is applied.
3215 For VEC_PERM_EXPRs the permutation doesn't carry along
3216 from children to parents so force materialization at the
3217 point of the VEC_PERM_EXPR. In principle VEC_PERM_EXPRs
3218 are a source of an arbitrary permutation again, similar
3219 to constants/externals - that's something we do not yet
3220 optimally handle. */
3221 bool all_preds_permuted = (SLP_TREE_CODE (node) != VEC_PERM_EXPR
3222 && slpg->vertices[idx].pred != NULL);
3223 if (all_preds_permuted)
3224 for (graph_edge *pred = slpg->vertices[idx].pred;
3225 pred; pred = pred->pred_next)
3226 {
3227 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
3228 int pred_perm = n_perm[pred->src];
3229 if (!vect_slp_perms_eq (perms, perm, pred_perm))
3230 {
3231 all_preds_permuted = false;
3232 break;
3233 }
3234 }
3235 if (!all_preds_permuted)
3236 {
3237 if (!bitmap_bit_p (n_materialize, idx))
3238 changed = true;
3239 bitmap_set_bit (n_materialize, idx);
3240 }
3241 }
3242 }
3243 while (changed || iteration == 1);
3244
3245 /* Materialize. */
3246 for (i = 0; i < vertices.length (); ++i)
3247 {
3248 int perm = n_perm[i];
3249 if (perm <= 0)
3250 continue;
3251
3252 slp_tree node = vertices[i];
3253
3254 /* First permute invariant/external original successors. */
3255 unsigned j;
3256 slp_tree child;
3257 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3258 {
3259 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3260 continue;
3261
3262 /* If the vector is uniform there's nothing to do. */
3263 if (vect_slp_tree_uniform_p (child))
3264 continue;
3265
3266 /* We can end up sharing some externals via two_operator
3267 handling. Be prepared to unshare those. */
3268 if (child->refcnt != 1)
3269 {
3270 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3271 SLP_TREE_CHILDREN (node)[j] = child
3272 = vect_create_new_slp_node
3273 (SLP_TREE_SCALAR_OPS (child).copy ());
3274 }
3275 vect_slp_permute (perms[perm],
3276 SLP_TREE_SCALAR_OPS (child), true);
3277 }
3278
3279 if (bitmap_bit_p (n_materialize, i))
3280 {
3281 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3282 /* For loads simply drop the permutation; the load permutation
3283 already performs the desired permutation. */
3284 ;
3285 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3286 {
3287 /* If the node is already a permute node we can apply
3288 the permutation to the lane selection, effectively
3289 materializing it on the incoming vectors. */
3290 if (dump_enabled_p ())
3291 dump_printf_loc (MSG_NOTE, vect_location,
3292 "simplifying permute node %p\n",
3293 node);
3294
3295 for (unsigned k = 0;
3296 k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3297 SLP_TREE_LANE_PERMUTATION (node)[k].second
3298 = perms[perm][SLP_TREE_LANE_PERMUTATION (node)[k].second];
3299 }
3300 else
3301 {
3302 if (dump_enabled_p ())
3303 dump_printf_loc (MSG_NOTE, vect_location,
3304 "inserting permute node in place of %p\n",
3305 node);
3306
3307 /* Make a copy of NODE and in-place change it to a
3308 VEC_PERM node to permute the lanes of the copy. */
3309 slp_tree copy = new _slp_tree;
3310 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3311 SLP_TREE_CHILDREN (node) = vNULL;
3312 SLP_TREE_SCALAR_STMTS (copy)
3313 = SLP_TREE_SCALAR_STMTS (node).copy ();
3314 vect_slp_permute (perms[perm],
3315 SLP_TREE_SCALAR_STMTS (copy), true);
3316 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3317 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3318 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3319 SLP_TREE_LANE_PERMUTATION (copy)
3320 = SLP_TREE_LANE_PERMUTATION (node);
3321 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3322 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3323 copy->refcnt = 1;
3324 copy->max_nunits = node->max_nunits;
3325 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3326 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3327 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3328
3329 /* Now turn NODE into a VEC_PERM. */
3330 SLP_TREE_CHILDREN (node).safe_push (copy);
3331 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3332 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3333 SLP_TREE_LANE_PERMUTATION (node)
3334 .quick_push (std::make_pair (0, perms[perm][j]));
3335 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3336 }
3337 }
3338 else
3339 {
3340 /* Apply the reverse permutation to our stmts. */
3341 vect_slp_permute (perms[perm],
3342 SLP_TREE_SCALAR_STMTS (node), true);
3343 /* And to the load permutation, which we can simply
3344 make regular by design. */
3345 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3346 {
3347 /* ??? When we handle non-bijective permutes the idea
3348 is that we can force the load-permutation to be
3349 { min, min + 1, min + 2, ... max }. But then the
3350 scalar defs might no longer match the lane content
3351 which means wrong-code with live lane vectorization.
3352 So we possibly have to have NULL entries for those. */
3353 vect_slp_permute (perms[perm],
3354 SLP_TREE_LOAD_PERMUTATION (node), true);
3355 }
3356 }
3357 }
3358
3359 /* Free the perms vector used for propagation. */
3360 while (!perms.is_empty ())
3361 perms.pop ().release ();
3362 free_graph (slpg);
3363
3364
3365 /* Now elide load permutations that are not necessary. */
3366 for (i = 0; i < leafs.length (); ++i)
3367 {
3368 node = vertices[leafs[i]];
3369 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3370 continue;
3371
3372 /* In basic block vectorization we allow any subchain of an interleaving
3373 chain.
3374 FORNOW: not in loop SLP because of realignment complications. */
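/* Illustration (not from the sources): a BB SLP load node whose scalar
   stmts are { a[2], a[3] } out of the interleaving group a[0]..a[3]
   forms a subchain (consecutive elements, gap 1), so its load
   permutation { 2, 3 } is simply dropped below.  In the loop case a
   permutation like { 1, 0 } is kept and handled as an explicit permute
   later.  */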
3375 if (is_a <bb_vec_info> (vinfo))
3376 {
3377 bool subchain_p = true;
3378 stmt_vec_info next_load_info = NULL;
3379 stmt_vec_info load_info;
3380 unsigned j;
3381 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3382 {
3383 if (j != 0
3384 && (next_load_info != load_info
3385 || DR_GROUP_GAP (load_info) != 1))
3386 {
3387 subchain_p = false;
3388 break;
3389 }
3390 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3391 }
3392 if (subchain_p)
3393 {
3394 SLP_TREE_LOAD_PERMUTATION (node).release ();
3395 continue;
3396 }
3397 }
3398 else
3399 {
3400 stmt_vec_info load_info;
3401 bool this_load_permuted = false;
3402 unsigned j;
3403 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3404 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3405 {
3406 this_load_permuted = true;
3407 break;
3408 }
3409 stmt_vec_info first_stmt_info
3410 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3411 if (!this_load_permuted
3412 /* The load requires permutation when unrolling exposes
3413 a gap either because the group is larger than the SLP
3414 group-size or because there is a gap between the groups. */
3415 && (known_eq (LOOP_VINFO_VECT_FACTOR
3416 (as_a <loop_vec_info> (vinfo)), 1U)
3417 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3418 && DR_GROUP_GAP (first_stmt_info) == 0)))
3419 {
3420 SLP_TREE_LOAD_PERMUTATION (node).release ();
3421 continue;
3422 }
3423 }
3424 }
3425 }
3426
3427 /* Gather loads reachable from the individual SLP graph entries. */
3428
3429 void
3430 vect_gather_slp_loads (vec_info *vinfo)
3431 {
3432 unsigned i;
3433 slp_instance instance;
3434 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
3435 {
3436 hash_set<slp_tree> visited;
3437 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
3438 SLP_INSTANCE_TREE (instance), visited);
3439 }
3440 }
3441
3442
3443 /* For each possible SLP instance decide whether to SLP it and calculate overall
3444 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3445 least one instance. */
3446
3447 bool
3448 vect_make_slp_decision (loop_vec_info loop_vinfo)
3449 {
3450 unsigned int i;
3451 poly_uint64 unrolling_factor = 1;
3452 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3453 slp_instance instance;
3454 int decided_to_slp = 0;
3455
3456 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3457
3458 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3459 {
3460 /* FORNOW: SLP if you can. */
3461 /* All unroll factors have the form:
3462
3463 GET_MODE_SIZE (vinfo->vector_mode) * X
3464
3465 for some rational X, so they must have a common multiple. */
3466 unrolling_factor
3467 = force_common_multiple (unrolling_factor,
3468 SLP_INSTANCE_UNROLLING_FACTOR (instance));
3469
3470 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3471 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3472 loop-based vectorization. Such stmts will be marked as HYBRID. */
3473 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3474 decided_to_slp++;
3475 }
3476
3477 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
3478
3479 if (decided_to_slp && dump_enabled_p ())
3480 {
3481 dump_printf_loc (MSG_NOTE, vect_location,
3482 "Decided to SLP %d instances. Unrolling factor ",
3483 decided_to_slp);
3484 dump_dec (MSG_NOTE, unrolling_factor);
3485 dump_printf (MSG_NOTE, "\n");
3486 }
3487
3488 return (decided_to_slp > 0);
3489 }
3490
3491 /* Private data for vect_detect_hybrid_slp. */
3492 struct vdhs_data
3493 {
3494 loop_vec_info loop_vinfo;
3495 vec<stmt_vec_info> *worklist;
3496 };
3497
3498 /* Walker for walk_gimple_op. */
3499
3500 static tree
3501 vect_detect_hybrid_slp (tree *tp, int *, void *data)
3502 {
3503 walk_stmt_info *wi = (walk_stmt_info *)data;
3504 vdhs_data *dat = (vdhs_data *)wi->info;
3505
3506 if (wi->is_lhs)
3507 return NULL_TREE;
3508
3509 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
3510 if (!def_stmt_info)
3511 return NULL_TREE;
3512 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3513 if (PURE_SLP_STMT (def_stmt_info))
3514 {
3515 if (dump_enabled_p ())
3516 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3517 def_stmt_info->stmt);
3518 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3519 dat->worklist->safe_push (def_stmt_info);
3520 }
3521
3522 return NULL_TREE;
3523 }
3524
3525 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
3526 if so; otherwise push it to WORKLIST. */
3527
3528 static void
3529 maybe_push_to_hybrid_worklist (vec_info *vinfo,
3530 vec<stmt_vec_info> &worklist,
3531 stmt_vec_info stmt_info)
3532 {
3533 if (dump_enabled_p ())
3534 dump_printf_loc (MSG_NOTE, vect_location,
3535 "Processing hybrid candidate: %G", stmt_info->stmt);
3536 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
3537 imm_use_iterator iter2;
3538 ssa_op_iter iter1;
3539 use_operand_p use_p;
3540 def_operand_p def_p;
3541 bool any_def = false;
3542 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
3543 {
3544 any_def = true;
3545 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
3546 {
3547 if (is_gimple_debug (USE_STMT (use_p)))
3548 continue;
3549 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
3550 /* An out-of-loop use means this is a loop_vect sink. */
3551 if (!use_info)
3552 {
3553 if (dump_enabled_p ())
3554 dump_printf_loc (MSG_NOTE, vect_location,
3555 "Found loop_vect sink: %G", stmt_info->stmt);
3556 worklist.safe_push (stmt_info);
3557 return;
3558 }
3559 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
3560 {
3561 if (dump_enabled_p ())
3562 dump_printf_loc (MSG_NOTE, vect_location,
3563 "Found loop_vect use: %G", use_info->stmt);
3564 worklist.safe_push (stmt_info);
3565 return;
3566 }
3567 }
3568 }
3569 /* No def means this is a loop_vect sink. */
3570 if (!any_def)
3571 {
3572 if (dump_enabled_p ())
3573 dump_printf_loc (MSG_NOTE, vect_location,
3574 "Found loop_vect sink: %G", stmt_info->stmt);
3575 worklist.safe_push (stmt_info);
3576 return;
3577 }
3578 if (dump_enabled_p ())
3579 dump_printf_loc (MSG_NOTE, vect_location,
3580 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
3581 STMT_SLP_TYPE (stmt_info) = pure_slp;
3582 }
3583
3584 /* Find stmts that must be both vectorized and SLPed. */
3585
3586 void
3587 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3588 {
3589 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3590
3591 /* All stmts participating in SLP are marked pure_slp, all other
3592 stmts are loop_vect.
3593 First collect all loop_vect stmts into a worklist.
3594 SLP patterns cause not all original scalar stmts to appear in
3595 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
3596 Rectify this here by doing a backward walk over the IL, considering
3597 stmts as loop_vect only when they are used by a loop_vect stmt and
3598 otherwise marking them as pure_slp. */
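  /* An illustrative example (hypothetical GIMPLE, SSA names invented):

       a_1 = b_2 * c_3;       <-- part of an SLP instance, pure_slp
       sum_4 = sum_5 + a_1;   <-- not covered by SLP, stays loop_vect

     The add is collected into the worklist below and the walk over its
     operands then marks the pure_slp definition of a_1 as hybrid.  */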
3599 auto_vec<stmt_vec_info> worklist;
3600 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
3601 {
3602 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3603 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3604 gsi_next (&gsi))
3605 {
3606 gphi *phi = gsi.phi ();
3607 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3608 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3609 maybe_push_to_hybrid_worklist (loop_vinfo,
3610 worklist, stmt_info);
3611 }
3612 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
3613 gsi_prev (&gsi))
3614 {
3615 gimple *stmt = gsi_stmt (gsi);
3616 if (is_gimple_debug (stmt))
3617 continue;
3618 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3619 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3620 {
3621 for (gimple_stmt_iterator gsi2
3622 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3623 !gsi_end_p (gsi2); gsi_next (&gsi2))
3624 {
3625 stmt_vec_info patt_info
3626 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3627 if (!STMT_SLP_TYPE (patt_info)
3628 && STMT_VINFO_RELEVANT (patt_info))
3629 maybe_push_to_hybrid_worklist (loop_vinfo,
3630 worklist, patt_info);
3631 }
3632 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3633 }
3634 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3635 maybe_push_to_hybrid_worklist (loop_vinfo,
3636 worklist, stmt_info);
3637 }
3638 }
3639
3640 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3641 mark any SLP vectorized stmt as hybrid.
3642 ??? We're visiting def stmts N times (once for each non-SLP and
3643 once for each hybrid-SLP use). */
3644 walk_stmt_info wi;
3645 vdhs_data dat;
3646 dat.worklist = &worklist;
3647 dat.loop_vinfo = loop_vinfo;
3648 memset (&wi, 0, sizeof (wi));
3649 wi.info = (void *)&dat;
3650 while (!worklist.is_empty ())
3651 {
3652 stmt_vec_info stmt_info = worklist.pop ();
3653 /* Since SSA operands are not set up for pattern stmts we need
3654 to use walk_gimple_op. */
3655 wi.is_lhs = 0;
3656 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3657 }
3658 }
3659
3660
3661 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3662
3663 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3664 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs), roots (vNULL)
3665 {
3666 for (unsigned i = 0; i < bbs.length (); ++i)
3667 {
3668 if (i != 0)
3669 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3670 gsi_next (&si))
3671 {
3672 gphi *phi = si.phi ();
3673 gimple_set_uid (phi, 0);
3674 add_stmt (phi);
3675 }
3676 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3677 !gsi_end_p (gsi); gsi_next (&gsi))
3678 {
3679 gimple *stmt = gsi_stmt (gsi);
3680 gimple_set_uid (stmt, 0);
3681 if (is_gimple_debug (stmt))
3682 continue;
3683 add_stmt (stmt);
3684 }
3685 }
3686 }
3687
3688
3689 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3690 stmts in the basic blocks. */
3691
3692 _bb_vec_info::~_bb_vec_info ()
3693 {
3694 /* Reset region marker. */
3695 for (unsigned i = 0; i < bbs.length (); ++i)
3696 {
3697 if (i != 0)
3698 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3699 gsi_next (&si))
3700 {
3701 gphi *phi = si.phi ();
3702 gimple_set_uid (phi, -1);
3703 }
3704 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3705 !gsi_end_p (gsi); gsi_next (&gsi))
3706 {
3707 gimple *stmt = gsi_stmt (gsi);
3708 gimple_set_uid (stmt, -1);
3709 }
3710 }
3711
3712 for (unsigned i = 0; i < roots.length (); ++i)
3713 roots[i].stmts.release ();
3714 roots.release ();
3715 }
3716
3717 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3718 given that child nodes have already been processed, and that
3719 their def types currently match their SLP node's def type. */
3720
3721 static bool
3722 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3723 slp_instance node_instance,
3724 stmt_vector_for_cost *cost_vec)
3725 {
3726 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3727 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3728
3729 /* Calculate the number of vector statements to be created for the
3730 scalar stmts in this node. For SLP reductions it is equal to the
3731 number of vector statements in the children (which has already been
3732 calculated by the recursive call). Otherwise it is the number of
3733 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3734 VF divided by the number of elements in a vector. */
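  /* For example (illustrative arithmetic only): with an SLP group size
     of 4, a vectorization factor of 2 and V8SI vectors this yields
     4 * 2 / 8 = 1 vector statement.  */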
3735 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3736 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3737 {
3738 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3739 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3740 {
3741 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3742 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3743 break;
3744 }
3745 }
3746 else
3747 {
3748 poly_uint64 vf;
3749 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3750 vf = loop_vinfo->vectorization_factor;
3751 else
3752 vf = 1;
3753 unsigned int group_size = SLP_TREE_LANES (node);
3754 tree vectype = SLP_TREE_VECTYPE (node);
3755 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3756 = vect_get_num_vectors (vf * group_size, vectype);
3757 }
3758
3759 /* Handle purely internal nodes. */
3760 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3761 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3762
3763 if (is_a <bb_vec_info> (vinfo)
3764 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3765 {
3766 if (dump_enabled_p ())
3767 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3768 "desired vector type conflicts with earlier one "
3769 "for %G", stmt_info->stmt);
3770 return false;
3771 }
3772
3773 bool dummy;
3774 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3775 node, node_instance, cost_vec);
3776 }
3777
3778 /* Try to build NODE from scalars, returning true on success.
3779 NODE_INSTANCE is the SLP instance that contains NODE. */
3780
3781 static bool
3782 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3783 slp_instance node_instance)
3784 {
3785 stmt_vec_info stmt_info;
3786 unsigned int i;
3787
3788 if (!is_a <bb_vec_info> (vinfo)
3789 || node == SLP_INSTANCE_TREE (node_instance)
3790 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3791 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3792 return false;
3793
3794 if (dump_enabled_p ())
3795 dump_printf_loc (MSG_NOTE, vect_location,
3796 "Building vector operands of %p from scalars instead\n", node);
3797
3798 /* Don't remove and free the child nodes here, since they could be
3799 referenced by other structures. The analysis and scheduling phases
3800 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3801 unsigned int group_size = SLP_TREE_LANES (node);
3802 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3803 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3804 SLP_TREE_LOAD_PERMUTATION (node).release ();
3805 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3806 {
3807 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3808 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3809 }
3810 return true;
3811 }
3812
3813 /* Compute the prologue cost for invariant or constant operands represented
3814 by NODE. */
3815
3816 static void
3817 vect_prologue_cost_for_slp (slp_tree node,
3818 stmt_vector_for_cost *cost_vec)
3819 {
3820 /* There's a special case of an existing vector, that costs nothing. */
3821 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3822 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3823 return;
3824 /* Without looking at the actual initializer a vector of
3825 constants can be implemented as load from the constant pool.
3826 When all elements are the same we can use a splat. */
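  /* For example (illustrative, SSA names invented): a constant operand
     vector { 1, 2, 3, 4 } is costed as a vector_load from the constant
     pool, an external { x_1, x_1, x_1, x_1 } as a scalar_to_vec splat
     and an external { x_1, y_2, z_3, w_4 } as a vec_construct.  */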
3827 tree vectype = SLP_TREE_VECTYPE (node);
3828 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3829 unsigned num_vects_to_check;
3830 unsigned HOST_WIDE_INT const_nunits;
3831 unsigned nelt_limit;
3832 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3833 && ! multiple_p (const_nunits, group_size))
3834 {
3835 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3836 nelt_limit = const_nunits;
3837 }
3838 else
3839 {
3840 /* If either the vector has variable length or the vectors
3841 are composed of repeated whole groups we only need to
3842 cost construction once. All vectors will be the same. */
3843 num_vects_to_check = 1;
3844 nelt_limit = group_size;
3845 }
3846 tree elt = NULL_TREE;
3847 unsigned nelt = 0;
3848 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3849 {
3850 unsigned si = j % group_size;
3851 if (nelt == 0)
3852 elt = SLP_TREE_SCALAR_OPS (node)[si];
3853 /* ??? We're just tracking whether all operands of a single
3854 vector initializer are the same, ideally we'd check if
3855 we emitted the same one already. */
3856 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3857 elt = NULL_TREE;
3858 nelt++;
3859 if (nelt == nelt_limit)
3860 {
3861 record_stmt_cost (cost_vec, 1,
3862 SLP_TREE_DEF_TYPE (node) == vect_external_def
3863 ? (elt ? scalar_to_vec : vec_construct)
3864 : vector_load,
3865 NULL, vectype, 0, vect_prologue);
3866 nelt = 0;
3867 }
3868 }
3869 }
3870
3871 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3872 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3873
3874 Return true if the operations are supported. */
3875
3876 static bool
3877 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3878 slp_instance node_instance,
3879 hash_set<slp_tree> &visited_set,
3880 vec<slp_tree> &visited_vec,
3881 stmt_vector_for_cost *cost_vec)
3882 {
3883 int i, j;
3884 slp_tree child;
3885
3886 /* Assume we can code-generate all invariants. */
3887 if (!node
3888 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
3889 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
3890 return true;
3891
3892 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
3893 {
3894 if (dump_enabled_p ())
3895 dump_printf_loc (MSG_NOTE, vect_location,
3896 "Failed cyclic SLP reference in %p", node);
3897 return false;
3898 }
3899 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
3900
3901 /* If we already analyzed the exact same set of scalar stmts we're done.
3902 We share the generated vector stmts for those. */
3903 if (visited_set.add (node))
3904 return true;
3905 visited_vec.safe_push (node);
3906
3907 bool res = true;
3908 unsigned visited_rec_start = visited_vec.length ();
3909 unsigned cost_vec_rec_start = cost_vec->length ();
3910 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3911 {
3912 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3913 visited_set, visited_vec,
3914 cost_vec);
3915 if (!res)
3916 break;
3917 }
3918
3919 if (res)
3920 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3921 cost_vec);
3922 /* If analysis failed we have to pop all recursive visited nodes
3923 plus ourselves. */
3924 if (!res)
3925 {
3926 while (visited_vec.length () >= visited_rec_start)
3927 visited_set.remove (visited_vec.pop ());
3928 cost_vec->truncate (cost_vec_rec_start);
3929 }
3930
3931 /* When the node can be vectorized, cost the invariant nodes it
3932 references. This is not done in DFS order to allow the referring
3933 node's vectorizable_* calls to nail down the invariant node's vector
3934 type and possibly unshare it if it needs a different vector type
3935 than other referrers. */
3936 if (res)
3937 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3938 if (child
3939 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
3940 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3941 /* Perform usual caching, note code-generation still
3942 code-gens these nodes multiple times but we expect
3943 to CSE them later. */
3944 && !visited_set.add (child))
3945 {
3946 visited_vec.safe_push (child);
3947 /* ??? After auditing more code paths make a "default"
3948 and push the vector type from NODE to all children
3949 if it is not already set. */
3950 /* Compute the number of vectors to be generated. */
3951 tree vector_type = SLP_TREE_VECTYPE (child);
3952 if (!vector_type)
3953 {
3954 /* For shifts with a scalar argument we don't need
3955 to cost or code-generate anything.
3956 ??? Represent this more explicitly. */
3957 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3958 == shift_vec_info_type)
3959 && j == 1);
3960 continue;
3961 }
3962 unsigned group_size = SLP_TREE_LANES (child);
3963 poly_uint64 vf = 1;
3964 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3965 vf = loop_vinfo->vectorization_factor;
3966 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3967 = vect_get_num_vectors (vf * group_size, vector_type);
3968 /* And cost them. */
3969 vect_prologue_cost_for_slp (child, cost_vec);
3970 }
3971
3972 /* If this node or any of its children can't be vectorized, try pruning
3973 the tree here rather than felling the whole thing. */
3974 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3975 {
3976 /* We'll need to revisit this for invariant costing and number
3977 of vectorized stmt setting. */
3978 res = true;
3979 }
3980
3981 return res;
3982 }
3983
3984
3985 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3986 region and that can be vectorized using vectorizable_live_operation
3987 with STMT_VINFO_LIVE_P. Live operations that cannot be handled cause
3988 the scalar code computing them to be retained. */
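/* For example (illustrative, SSA names invented): if one lane of an SLP
   node computes t_2 and t_2 is also used by a stmt that is not part of
   the SLP graph, t_2's definition is marked STMT_VINFO_LIVE_P and a lane
   extract from the vectorized result is costed to serve that use.  */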
3989
3990 static void
3991 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3992 slp_instance instance,
3993 stmt_vector_for_cost *cost_vec,
3994 hash_set<stmt_vec_info> &svisited,
3995 hash_set<slp_tree> &visited)
3996 {
3997 if (visited.add (node))
3998 return;
3999
4000 unsigned i;
4001 stmt_vec_info stmt_info;
4002 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
4003 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4004 {
4005 if (svisited.contains (stmt_info))
4006 continue;
4007 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4008 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
4009 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
4010 /* Only the pattern root stmt computes the original scalar value. */
4011 continue;
4012 bool mark_visited = true;
4013 gimple *orig_stmt = orig_stmt_info->stmt;
4014 ssa_op_iter op_iter;
4015 def_operand_p def_p;
4016 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4017 {
4018 imm_use_iterator use_iter;
4019 gimple *use_stmt;
4020 stmt_vec_info use_stmt_info;
4021 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4022 if (!is_gimple_debug (use_stmt))
4023 {
4024 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
4025 if (!use_stmt_info
4026 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4027 {
4028 STMT_VINFO_LIVE_P (stmt_info) = true;
4029 if (vectorizable_live_operation (bb_vinfo, stmt_info,
4030 NULL, node, instance, i,
4031 false, cost_vec))
4032 /* ??? So we know we can vectorize the live stmt
4033 from one SLP node. If we cannot do so from all
4034 or none consistently we'd have to record which
4035 SLP node (and lane) we want to use for the live
4036 operation. So make sure we can code-generate
4037 from all nodes. */
4038 mark_visited = false;
4039 else
4040 STMT_VINFO_LIVE_P (stmt_info) = false;
4041 break;
4042 }
4043 }
4044 /* We have to verify whether we can insert the lane extract
4045 before all uses. The following is a conservative approximation.
4046 We cannot put this into vectorizable_live_operation because
4047 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4048 doesn't work.
4049 Note that while the fact that we emit code for loads at the
4050 first load should make this a non-problem, leafs we construct
4051 from scalars are vectorized after the last scalar def.
4052 ??? If we'd actually compute the insert location during
4053 analysis we could use something less conservative than the
4054 last scalar stmt in the node for the dominance check. */
4055 /* ??? What remains is "live" uses in vector CTORs in the same
4056 SLP graph which is where those uses can end up code-generated
4057 right after their definition instead of close to their original
4058 use. But that would restrict us to code-generate lane-extracts
4059 from the latest stmt in a node. So we compensate for this
4060 during code-generation, simply not replacing uses for those
4061 hopefully rare cases. */
4062 if (STMT_VINFO_LIVE_P (stmt_info))
4063 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4064 if (!is_gimple_debug (use_stmt)
4065 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
4066 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4067 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
4068 {
4069 if (dump_enabled_p ())
4070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4071 "Cannot determine insertion place for "
4072 "lane extract\n");
4073 STMT_VINFO_LIVE_P (stmt_info) = false;
4074 mark_visited = true;
4075 }
4076 }
4077 if (mark_visited)
4078 svisited.add (stmt_info);
4079 }
4080
4081 slp_tree child;
4082 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4083 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4084 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
4085 cost_vec, svisited, visited);
4086 }
4087
4088 /* Analyze statements in SLP instances of VINFO. Return true if the
4089 operations are supported. */
4090
4091 bool
4092 vect_slp_analyze_operations (vec_info *vinfo)
4093 {
4094 slp_instance instance;
4095 int i;
4096
4097 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4098
4099 hash_set<slp_tree> visited;
4100 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4101 {
4102 auto_vec<slp_tree> visited_vec;
4103 stmt_vector_for_cost cost_vec;
4104 cost_vec.create (2);
4105 if (is_a <bb_vec_info> (vinfo))
4106 vect_location = instance->location ();
4107 if (!vect_slp_analyze_node_operations (vinfo,
4108 SLP_INSTANCE_TREE (instance),
4109 instance, visited, visited_vec,
4110 &cost_vec)
4111 /* Instances with a root stmt require vectorized defs for the
4112 SLP tree root. */
4113 || (SLP_INSTANCE_ROOT_STMT (instance)
4114 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
4115 != vect_internal_def)))
4116 {
4117 slp_tree node = SLP_INSTANCE_TREE (instance);
4118 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4119 if (dump_enabled_p ())
4120 dump_printf_loc (MSG_NOTE, vect_location,
4121 "removing SLP instance operations starting from: %G",
4122 stmt_info->stmt);
4123 vect_free_slp_instance (instance);
4124 vinfo->slp_instances.ordered_remove (i);
4125 cost_vec.release ();
4126 while (!visited_vec.is_empty ())
4127 visited.remove (visited_vec.pop ());
4128 }
4129 else
4130 {
4131 i++;
4132
4133 /* For BB vectorization remember the SLP graph entry
4134 cost for later. */
4135 if (is_a <bb_vec_info> (vinfo))
4136 instance->cost_vec = cost_vec;
4137 else
4138 {
4139 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
4140 cost_vec.release ();
4141 }
4142 }
4143 }
4144
4145 /* Compute vectorizable live stmts. */
4146 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
4147 {
4148 hash_set<stmt_vec_info> svisited;
4149 hash_set<slp_tree> visited;
4150 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4151 {
4152 vect_location = instance->location ();
4153 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
4154 instance, &instance->cost_vec, svisited,
4155 visited);
4156 }
4157 }
4158
4159 return !vinfo->slp_instances.is_empty ();
4160 }
4161
4162 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
4163 transitively compressing any chain of leaders along the way. */
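/* For example (illustrative): given the chain I1 -> I2 -> I3 -> I3 in
   INSTANCE_LEADER, a lookup for I1 returns I3 and rewrites the entries
   for I1 and I2 to point directly at I3, similar to union-find with
   path compression.  */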
4164
4165 static slp_instance
4166 get_ultimate_leader (slp_instance instance,
4167 hash_map<slp_instance, slp_instance> &instance_leader)
4168 {
4169 auto_vec<slp_instance *, 8> chain;
4170 slp_instance *tem;
4171 while (*(tem = instance_leader.get (instance)) != instance)
4172 {
4173 chain.safe_push (tem);
4174 instance = *tem;
4175 }
4176 while (!chain.is_empty ())
4177 *chain.pop () = instance;
4178 return instance;
4179 }
4180
4181 /* Worker of vect_bb_partition_graph, recurse on NODE. */
4182
4183 static void
4184 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
4185 slp_instance instance, slp_tree node,
4186 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
4187 hash_map<slp_instance, slp_instance> &instance_leader,
4188 hash_set<slp_tree> &visited)
4189 {
4190 stmt_vec_info stmt_info;
4191 unsigned i;
4192
4193 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4194 {
4195 bool existed_p;
4196 slp_instance &stmt_instance
4197 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
4198 if (!existed_p)
4199 ;
4200 else if (stmt_instance != instance)
4201 {
4202 /* If we're running into a previously marked stmt make us the
4203 leader of the current ultimate leader. This keeps the
4204 leader chain acyclic and works even when the current instance
4205 connects two previously independent graph parts. */
4206 slp_instance stmt_leader
4207 = get_ultimate_leader (stmt_instance, instance_leader);
4208 if (stmt_leader != instance)
4209 instance_leader.put (stmt_leader, instance);
4210 }
4211 stmt_instance = instance;
4212 }
4213
4214 if (visited.add (node))
4215 return;
4216
4217 slp_tree child;
4218 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4219 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4220 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
4221 instance_leader, visited);
4222 }
4223
4224 /* Partition the SLP graph into pieces that can be costed independently. */
4225
4226 static void
4227 vect_bb_partition_graph (bb_vec_info bb_vinfo)
4228 {
4229 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
4230
4231 /* First walk the SLP graph assigning each involved scalar stmt a
4232 corresponding SLP graph entry and upon visiting a previously
4233 marked stmt, make the stmt's leader the current SLP graph entry. */
4234 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
4235 hash_map<slp_instance, slp_instance> instance_leader;
4236 hash_set<slp_tree> visited;
4237 slp_instance instance;
4238 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4239 {
4240 instance_leader.put (instance, instance);
4241 vect_bb_partition_graph_r (bb_vinfo,
4242 instance, SLP_INSTANCE_TREE (instance),
4243 stmt_to_instance, instance_leader,
4244 visited);
4245 }
4246
4247 /* Then collect entries to each independent subgraph. */
4248 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4249 {
4250 slp_instance leader = get_ultimate_leader (instance, instance_leader);
4251 leader->subgraph_entries.safe_push (instance);
4252 if (dump_enabled_p ()
4253 && leader != instance)
4254 dump_printf_loc (MSG_NOTE, vect_location,
4255 "instance %p is leader of %p\n",
4256 leader, instance);
4257 }
4258 }
4259
4260 /* Compute the scalar cost of the SLP node NODE and its children,
4261 recording it in COST_VEC. Do not account defs that are marked in
4262 LIFE and update LIFE according to uses of NODE. */
4263
4264 static void
4265 vect_bb_slp_scalar_cost (vec_info *vinfo,
4266 slp_tree node, vec<bool, va_heap> *life,
4267 stmt_vector_for_cost *cost_vec,
4268 hash_set<slp_tree> &visited)
4269 {
4270 unsigned i;
4271 stmt_vec_info stmt_info;
4272 slp_tree child;
4273
4274 if (visited.add (node))
4275 return;
4276
4277 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4278 {
4279 ssa_op_iter op_iter;
4280 def_operand_p def_p;
4281
4282 if ((*life)[i])
4283 continue;
4284
4285 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4286 gimple *orig_stmt = orig_stmt_info->stmt;
4287
4288 /* If there is a non-vectorized use of the defs then the scalar
4289 stmt is kept live in which case we do not account it or any
4290 required defs in the SLP children in the scalar cost. This
4291 way we make the vectorization more costly when compared to
4292 the scalar cost. */
4293 if (!STMT_VINFO_LIVE_P (stmt_info))
4294 {
4295 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4296 {
4297 imm_use_iterator use_iter;
4298 gimple *use_stmt;
4299 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4300 if (!is_gimple_debug (use_stmt))
4301 {
4302 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
4303 if (!use_stmt_info
4304 || !PURE_SLP_STMT
4305 (vect_stmt_to_vectorize (use_stmt_info)))
4306 {
4307 (*life)[i] = true;
4308 break;
4309 }
4310 }
4311 }
4312 if ((*life)[i])
4313 continue;
4314 }
4315
4316 /* Count scalar stmts only once. */
4317 if (gimple_visited_p (orig_stmt))
4318 continue;
4319 gimple_set_visited (orig_stmt, true);
4320
4321 vect_cost_for_stmt kind;
4322 if (STMT_VINFO_DATA_REF (orig_stmt_info))
4323 {
4324 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
4325 kind = scalar_load;
4326 else
4327 kind = scalar_store;
4328 }
4329 else if (vect_nop_conversion_p (orig_stmt_info))
4330 continue;
4331 /* For single-argument PHIs assume coalescing which means zero cost
4332 for the scalar and the vector PHIs. This avoids artificially
4333 favoring the vector path (but may pessimize it in some cases). */
4334 else if (is_a <gphi *> (orig_stmt_info->stmt)
4335 && gimple_phi_num_args
4336 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
4337 continue;
4338 else
4339 kind = scalar_stmt;
4340 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
4341 SLP_TREE_VECTYPE (node), 0, vect_body);
4342 }
4343
4344 auto_vec<bool, 20> subtree_life;
4345 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4346 {
4347 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4348 {
4349 /* Do not directly pass LIFE to the recursive call, copy it to
4350 confine changes in the callee to the current child/subtree. */
4351 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4352 {
4353 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
4354 for (unsigned j = 0;
4355 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
4356 {
4357 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
4358 if (perm.first == i)
4359 subtree_life[perm.second] = (*life)[j];
4360 }
4361 }
4362 else
4363 {
4364 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
4365 subtree_life.safe_splice (*life);
4366 }
4367 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
4368 visited);
4369 subtree_life.truncate (0);
4370 }
4371 }
4372 }
4373
4374 /* Comparator for the loop-index sorted cost vectors. */
4375
4376 static int
4377 li_cost_vec_cmp (const void *a_, const void *b_)
4378 {
4379 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
4380 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
4381 if (a->first < b->first)
4382 return -1;
4383 else if (a->first == b->first)
4384 return 0;
4385 return 1;
4386 }
4387
4388 /* Check if vectorization of the basic block is profitable for the
4389 subgraph denoted by SLP_INSTANCES. */
4390
4391 static bool
4392 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
4393 vec<slp_instance> slp_instances)
4394 {
4395 slp_instance instance;
4396 int i;
4397 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
4398 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
4399
4400 if (dump_enabled_p ())
4401 {
4402 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
4403 hash_set<slp_tree> visited;
4404 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4405 vect_print_slp_graph (MSG_NOTE, vect_location,
4406 SLP_INSTANCE_TREE (instance), visited);
4407 }
4408
4409 /* Calculate scalar cost and sum the cost for the vector stmts
4410 previously collected. */
4411 stmt_vector_for_cost scalar_costs = vNULL;
4412 stmt_vector_for_cost vector_costs = vNULL;
4413 hash_set<slp_tree> visited;
4414 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4415 {
4416 auto_vec<bool, 20> life;
4417 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
4418 true);
4419 if (SLP_INSTANCE_ROOT_STMT (instance))
4420 record_stmt_cost (&scalar_costs, 1, scalar_stmt,
4421 SLP_INSTANCE_ROOT_STMT (instance), 0, vect_body);
4422 vect_bb_slp_scalar_cost (bb_vinfo,
4423 SLP_INSTANCE_TREE (instance),
4424 &life, &scalar_costs, visited);
4425 vector_costs.safe_splice (instance->cost_vec);
4426 instance->cost_vec.release ();
4427 }
4428 /* Unset visited flag. */
4429 stmt_info_for_cost *cost;
4430 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
4431 gimple_set_visited (cost->stmt_info->stmt, false);
4432
4433 if (dump_enabled_p ())
4434 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4435
4436 /* When costing non-loop vectorization we need to consider each covered
4437 loop independently and make sure vectorization is profitable. For
4438 now we assume a loop may not be entered or may execute an arbitrary
4439 number of iterations (??? static information can provide more
4440 precise info here) which means we can simply cost each containing
4441 loop's stmts separately. */
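  /* For example (illustrative): if the subgraph covers stmts in two
     loops, the scalar and vector costs are summed and compared once per
     loop below; the subgraph is rejected if the vector cost exceeds the
     scalar cost for either loop.  */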
4442
4443 /* First produce cost vectors sorted by loop index. */
4444 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
4445 li_scalar_costs (scalar_costs.length ());
4446 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
4447 li_vector_costs (vector_costs.length ());
4448 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
4449 {
4450 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
4451 li_scalar_costs.quick_push (std::make_pair (l, cost));
4452 }
4453 /* Use an arbitrary covered loop as fallback in case the first
4454 vector_costs entry does not have a stmt_info associated with it. */
4455 unsigned l = li_scalar_costs[0].first;
4456 FOR_EACH_VEC_ELT (vector_costs, i, cost)
4457 {
4458 /* We inherit the loop from the previous COST; invariants, externals
4459 and extracts immediately follow the cost for the related stmt. */
4460 if (cost->stmt_info)
4461 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
4462 li_vector_costs.quick_push (std::make_pair (l, cost));
4463 }
4464 li_scalar_costs.qsort (li_cost_vec_cmp);
4465 li_vector_costs.qsort (li_cost_vec_cmp);
4466
4467 /* Now cost the portions individually. */
4468 unsigned vi = 0;
4469 unsigned si = 0;
4470 while (si < li_scalar_costs.length ()
4471 && vi < li_vector_costs.length ())
4472 {
4473 unsigned sl = li_scalar_costs[si].first;
4474 unsigned vl = li_vector_costs[vi].first;
4475 if (sl != vl)
4476 {
4477 if (dump_enabled_p ())
4478 dump_printf_loc (MSG_NOTE, vect_location,
4479 "Scalar %d and vector %d loop part do not "
4480 "match up, skipping scalar part\n", sl, vl);
4481 /* Skip the scalar part, assuming zero cost on the vector side. */
4482 do
4483 {
4484 si++;
4485 }
4486 while (si < li_scalar_costs.length ()
4487 && li_scalar_costs[si].first == sl);
4488 continue;
4489 }
4490
4491 void *scalar_target_cost_data = init_cost (NULL);
4492 do
4493 {
4494 add_stmt_cost (bb_vinfo, scalar_target_cost_data,
4495 li_scalar_costs[si].second);
4496 si++;
4497 }
4498 while (si < li_scalar_costs.length ()
4499 && li_scalar_costs[si].first == sl);
4500 unsigned dummy;
4501 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
4502 destroy_cost_data (scalar_target_cost_data);
4503
4504 /* Complete the target-specific vector cost calculation. */
4505 void *vect_target_cost_data = init_cost (NULL);
4506 do
4507 {
4508 add_stmt_cost (bb_vinfo, vect_target_cost_data,
4509 li_vector_costs[vi].second);
4510 vi++;
4511 }
4512 while (vi < li_vector_costs.length ()
4513 && li_vector_costs[vi].first == vl);
4514 finish_cost (vect_target_cost_data, &vec_prologue_cost,
4515 &vec_inside_cost, &vec_epilogue_cost);
4516 destroy_cost_data (vect_target_cost_data);
4517
4518 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
4519
4520 if (dump_enabled_p ())
4521 {
4522 dump_printf_loc (MSG_NOTE, vect_location,
4523 "Cost model analysis for part in loop %d:\n", sl);
4524 dump_printf (MSG_NOTE, " Vector cost: %d\n",
4525 vec_inside_cost + vec_outside_cost);
4526 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
4527 }
4528
4529 /* Vectorization is profitable if its cost does not exceed the cost of
4530 the scalar version. Note that we err on the vector side for equal cost
4531 because the cost estimate is otherwise quite pessimistic (constant uses
4532 are free on the scalar side but cost a load on the vector side, for
4533 example). */
4534 if (vec_outside_cost + vec_inside_cost > scalar_cost)
4535 {
4536 scalar_costs.release ();
4537 vector_costs.release ();
4538 return false;
4539 }
4540 }
4541 if (vi < li_vector_costs.length ())
4542 {
4543 if (dump_enabled_p ())
4544 dump_printf_loc (MSG_NOTE, vect_location,
4545 "Excess vector cost for part in loop %d:\n",
4546 li_vector_costs[vi].first);
4547 scalar_costs.release ();
4548 vector_costs.release ();
4549 return false;
4550 }
4551
4552 scalar_costs.release ();
4553 vector_costs.release ();
4554 return true;
4555 }
4556
4557 /* qsort comparator for lane defs. */
4558
4559 static int
4560 vld_cmp (const void *a_, const void *b_)
4561 {
4562 auto *a = (const std::pair<unsigned, tree> *)a_;
4563 auto *b = (const std::pair<unsigned, tree> *)b_;
4564 return a->first - b->first;
4565 }
4566
4567 /* Return true if USE_STMT is a vector lane insert into VEC and set
4568 *THIS_LANE to the lane number that is set. */
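/* For example (hypothetical GIMPLE, SSA names invented):

     vec_3 = BIT_INSERT_EXPR <vec_2, s_1, 64>;

   with a V4SI destination is a lane insert of s_1 into lane 64 / 32 = 2
   of vec_2.  */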
4569
4570 static bool
4571 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
4572 {
4573 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
4574 if (!use_ass
4575 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
4576 || (vec
4577 ? gimple_assign_rhs1 (use_ass) != vec
4578 : ((vec = gimple_assign_rhs1 (use_ass)), false))
4579 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
4580 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
4581 || !constant_multiple_p
4582 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
4583 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
4584 this_lane))
4585 return false;
4586 return true;
4587 }
4588
4589 /* Find any vectorizable constructors and add them to the grouped_stores
4590 array. */
4591
4592 static void
4593 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
4594 {
4595 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
4596 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
4597 !gsi_end_p (gsi); gsi_next (&gsi))
4598 {
4599 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
4600 if (!assign)
4601 continue;
4602
4603 tree rhs = gimple_assign_rhs1 (assign);
4604 if (gimple_assign_rhs_code (assign) == CONSTRUCTOR)
4605 {
4606 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
4607 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
4608 CONSTRUCTOR_NELTS (rhs))
4609 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
4610 || uniform_vector_p (rhs))
4611 continue;
4612
4613 unsigned j;
4614 tree val;
4615 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
4616 if (TREE_CODE (val) != SSA_NAME
4617 || !bb_vinfo->lookup_def (val))
4618 break;
4619 if (j != CONSTRUCTOR_NELTS (rhs))
4620 continue;
4621
4622 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
4623 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
4624 }
4625 else if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR
4626 && VECTOR_TYPE_P (TREE_TYPE (rhs))
4627 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
4628 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
4629 && integer_zerop (gimple_assign_rhs3 (assign))
4630 && useless_type_conversion_p
4631 (TREE_TYPE (TREE_TYPE (rhs)),
4632 TREE_TYPE (gimple_assign_rhs2 (assign)))
4633 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
4634 {
4635 /* We start matching on an insert to lane zero but since the
4636 inserts need not be ordered we have to search both
4637 the def and the use chains. */
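          /* For example (hypothetical GIMPLE, SSA names invented), a chain

               v_1 = BIT_INSERT_EXPR <v_0, a_1, 0>;
               v_2 = BIT_INSERT_EXPR <v_1, b_2, 32>;
               v_3 = BIT_INSERT_EXPR <v_2, c_3, 64>;
               v_4 = BIT_INSERT_EXPR <v_3, d_4, 96>;

             building a V4SI value is recognized here, with the last insert
             recorded as the root and { a_1, b_2, c_3, d_4 } as the lane
             definitions.  */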
4638 tree vectype = TREE_TYPE (rhs);
4639 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4640 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
4641 auto_sbitmap lanes (nlanes);
4642 bitmap_clear (lanes);
4643 bitmap_set_bit (lanes, 0);
4644 tree def = gimple_assign_lhs (assign);
4645 lane_defs.quick_push
4646 (std::make_pair (0, gimple_assign_rhs2 (assign)));
4647 unsigned lanes_found = 1;
4648 /* Start with the use chains, the last stmt will be the root. */
4649 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
4650 do
4651 {
4652 use_operand_p use_p;
4653 gimple *use_stmt;
4654 if (!single_imm_use (def, &use_p, &use_stmt))
4655 break;
4656 unsigned this_lane;
4657 if (!bb_vinfo->lookup_stmt (use_stmt)
4658 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
4659 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
4660 break;
4661 if (bitmap_bit_p (lanes, this_lane))
4662 break;
4663 lanes_found++;
4664 bitmap_set_bit (lanes, this_lane);
4665 gassign *use_ass = as_a <gassign *> (use_stmt);
4666 lane_defs.quick_push (std::make_pair
4667 (this_lane, gimple_assign_rhs2 (use_ass)));
4668 last = bb_vinfo->lookup_stmt (use_ass);
4669 def = gimple_assign_lhs (use_ass);
4670 }
4671 while (lanes_found < nlanes);
4672 if (lanes_found < nlanes)
4673 {
4674 /* Now search the def chain. */
4675 def = gimple_assign_rhs1 (assign);
4676 do
4677 {
4678 if (TREE_CODE (def) != SSA_NAME
4679 || !has_single_use (def))
4680 break;
4681 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
4682 unsigned this_lane;
4683 if (!bb_vinfo->lookup_stmt (def_stmt)
4684 || !vect_slp_is_lane_insert (def_stmt,
4685 NULL_TREE, &this_lane)
4686 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
4687 break;
4688 if (bitmap_bit_p (lanes, this_lane))
4689 break;
4690 lanes_found++;
4691 bitmap_set_bit (lanes, this_lane);
4692 lane_defs.quick_push (std::make_pair
4693 (this_lane,
4694 gimple_assign_rhs2 (def_stmt)));
4695 def = gimple_assign_rhs1 (def_stmt);
4696 }
4697 while (lanes_found < nlanes);
4698 }
4699 if (lanes_found == nlanes)
4700 {
4701 /* Sort lane_defs by the lane index and register the root. */
4702 lane_defs.qsort (vld_cmp);
4703 vec<stmt_vec_info> stmts;
4704 stmts.create (nlanes);
4705 for (unsigned i = 0; i < nlanes; ++i)
4706 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
4707 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
4708 stmts, last));
4709 }
4710 }
4711 }
4712 }
4713
4714 /* Walk the grouped store chains and replace entries with their
4715 pattern variant if any. */
4716
4717 static void
4718 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
4719 {
4720 stmt_vec_info first_element;
4721 unsigned i;
4722
4723 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4724 {
4725 /* We also have CTORs in this array. */
4726 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
4727 continue;
4728 if (STMT_VINFO_IN_PATTERN_P (first_element))
4729 {
4730 stmt_vec_info orig = first_element;
4731 first_element = STMT_VINFO_RELATED_STMT (first_element);
4732 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
4733 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
4734 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
4735 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
4736 vinfo->grouped_stores[i] = first_element;
4737 }
4738 stmt_vec_info prev = first_element;
4739 while (DR_GROUP_NEXT_ELEMENT (prev))
4740 {
4741 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
4742 if (STMT_VINFO_IN_PATTERN_P (elt))
4743 {
4744 stmt_vec_info orig = elt;
4745 elt = STMT_VINFO_RELATED_STMT (elt);
4746 DR_GROUP_NEXT_ELEMENT (prev) = elt;
4747 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
4748 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
4749 }
4750 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
4751 prev = elt;
4752 }
4753 }
4754 }
4755
4756 /* Check if the region described by BB_VINFO can be vectorized, returning
4757 true if so. When returning false, set FATAL to true if the same failure
4758 would prevent vectorization at other vector sizes, false if it is still
4759 worth trying other sizes. N_STMTS is the number of statements in the
4760 region. */
4761
4762 static bool
4763 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
4764 vec<int> *dataref_groups)
4765 {
4766 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4767
4768 slp_instance instance;
4769 int i;
4770 poly_uint64 min_vf = 2;
4771
4772 /* The first group of checks is independent of the vector size. */
4773 fatal = true;
4774
4775 /* Analyze the data references. */
4776
4777 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
4778 {
4779 if (dump_enabled_p ())
4780 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4781 "not vectorized: unhandled data-ref in basic "
4782 "block.\n");
4783 return false;
4784 }
4785
4786 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
4787 {
4788 if (dump_enabled_p ())
4789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4790 "not vectorized: unhandled data access in "
4791 "basic block.\n");
4792 return false;
4793 }
4794
4795 vect_slp_check_for_constructors (bb_vinfo);
4796
4797 /* If there are no grouped stores and no constructors in the region
4798 there is no need to continue with pattern recog as vect_analyze_slp
4799 will fail anyway. */
4800 if (bb_vinfo->grouped_stores.is_empty ()
4801 && bb_vinfo->roots.is_empty ())
4802 {
4803 if (dump_enabled_p ())
4804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4805 "not vectorized: no grouped stores in "
4806 "basic block.\n");
4807 return false;
4808 }
4809
4810 /* The rest of the analysis below depends on the vector size in some way. */
4811 fatal = false;
4812
4813 vect_pattern_recog (bb_vinfo);
4814
4815 /* Update store groups from pattern processing. */
4816 vect_fixup_store_groups_with_patterns (bb_vinfo);
4817
4818 /* Check the SLP opportunities in the basic block, analyze and build SLP
4819 trees. */
4820 if (!vect_analyze_slp (bb_vinfo, n_stmts))
4821 {
4822 if (dump_enabled_p ())
4823 {
4824 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4825 "Failed to SLP the basic block.\n");
4826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4827 "not vectorized: failed to find SLP opportunities "
4828 "in basic block.\n");
4829 }
4830 return false;
4831 }
4832
4833 /* Optimize permutations. */
4834 vect_optimize_slp (bb_vinfo);
4835
4836 /* Gather the loads reachable from the SLP graph entries. */
4837 vect_gather_slp_loads (bb_vinfo);
4838
4839 vect_record_base_alignments (bb_vinfo);
4840
4841 /* Analyze and verify the alignment of data references and the
4842 dependence in the SLP instances. */
4843 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
4844 {
4845 vect_location = instance->location ();
4846 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
4847 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
4848 {
4849 slp_tree node = SLP_INSTANCE_TREE (instance);
4850 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4851 if (dump_enabled_p ())
4852 dump_printf_loc (MSG_NOTE, vect_location,
4853 "removing SLP instance operations starting from: %G",
4854 stmt_info->stmt);
4855 vect_free_slp_instance (instance);
4856 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
4857 continue;
4858 }
4859
4860 /* Mark all the statements that we want to vectorize as pure SLP and
4861 relevant. */
4862 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4863 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
4864 if (stmt_vec_info root = SLP_INSTANCE_ROOT_STMT (instance))
4865 {
4866 STMT_SLP_TYPE (root) = pure_slp;
4867 if (is_gimple_assign (root->stmt)
4868 && gimple_assign_rhs_code (root->stmt) == BIT_INSERT_EXPR)
4869 {
4870 /* ??? We should probably record the whole vector of
4871 root stmts so we do not have to back-track here... */
4872 for (unsigned n = SLP_TREE_LANES (SLP_INSTANCE_TREE (instance));
4873 n != 1; --n)
4874 {
4875 root = bb_vinfo->lookup_def (gimple_assign_rhs1 (root->stmt));
4876 STMT_SLP_TYPE (root) = pure_slp;
4877 }
4878 }
4879 }
4880
4881 i++;
4882 }
4883 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
4884 return false;
4885
4886 if (!vect_slp_analyze_operations (bb_vinfo))
4887 {
4888 if (dump_enabled_p ())
4889 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4890 "not vectorized: bad operation in basic block.\n");
4891 return false;
4892 }
4893
4894 vect_bb_partition_graph (bb_vinfo);
4895
4896 return true;
4897 }
4898
4899 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4900 basic blocks in BBS, returning true on success.
4901 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4902
4903 static bool
4904 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
4905 vec<int> *dataref_groups, unsigned int n_stmts)
4906 {
4907 bb_vec_info bb_vinfo;
4908 auto_vector_modes vector_modes;
4909
4910 /* Autodetect first vector size we try. */
4911 machine_mode next_vector_mode = VOIDmode;
4912 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
4913 unsigned int mode_i = 0;
4914
4915 vec_info_shared shared;
4916
4917 machine_mode autodetected_vector_mode = VOIDmode;
4918 while (1)
4919 {
4920 bool vectorized = false;
4921 bool fatal = false;
4922 bb_vinfo = new _bb_vec_info (bbs, &shared);
4923
4924 bool first_time_p = shared.datarefs.is_empty ();
4925 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
4926 if (first_time_p)
4927 bb_vinfo->shared->save_datarefs ();
4928 else
4929 bb_vinfo->shared->check_datarefs ();
4930 bb_vinfo->vector_mode = next_vector_mode;
4931
4932 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
4933 {
4934 if (dump_enabled_p ())
4935 {
4936 dump_printf_loc (MSG_NOTE, vect_location,
4937 "***** Analysis succeeded with vector mode"
4938 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4939 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4940 }
4941
4942 bb_vinfo->shared->check_datarefs ();
4943
4944 unsigned i;
4945 slp_instance instance;
4946 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4947 {
4948 if (instance->subgraph_entries.is_empty ())
4949 continue;
4950
4951 vect_location = instance->location ();
4952 if (!unlimited_cost_model (NULL)
4953 && !vect_bb_vectorization_profitable_p
4954 (bb_vinfo, instance->subgraph_entries))
4955 {
4956 if (dump_enabled_p ())
4957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4958 "not vectorized: vectorization is not "
4959 "profitable.\n");
4960 continue;
4961 }
4962
4963 if (!dbg_cnt (vect_slp))
4964 continue;
4965
4966 if (!vectorized && dump_enabled_p ())
4967 dump_printf_loc (MSG_NOTE, vect_location,
4968 "Basic block will be vectorized "
4969 "using SLP\n");
4970 vectorized = true;
4971
4972 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4973
4974 unsigned HOST_WIDE_INT bytes;
4975 if (dump_enabled_p ())
4976 {
4977 if (GET_MODE_SIZE
4978 (bb_vinfo->vector_mode).is_constant (&bytes))
4979 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4980 "basic block part vectorized using %wu "
4981 "byte vectors\n", bytes);
4982 else
4983 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4984 "basic block part vectorized using "
4985 "variable length vectors\n");
4986 }
4987 }
4988 }
4989 else
4990 {
4991 if (dump_enabled_p ())
4992 dump_printf_loc (MSG_NOTE, vect_location,
4993 "***** Analysis failed with vector mode %s\n",
4994 GET_MODE_NAME (bb_vinfo->vector_mode));
4995 }
4996
4997 if (mode_i == 0)
4998 autodetected_vector_mode = bb_vinfo->vector_mode;
4999
5000 if (!fatal)
5001 while (mode_i < vector_modes.length ()
5002 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
5003 {
5004 if (dump_enabled_p ())
5005 dump_printf_loc (MSG_NOTE, vect_location,
5006 "***** The result for vector mode %s would"
5007 " be the same\n",
5008 GET_MODE_NAME (vector_modes[mode_i]));
5009 mode_i += 1;
5010 }
5011
5012 delete bb_vinfo;
5013
5014 if (mode_i < vector_modes.length ()
5015 && VECTOR_MODE_P (autodetected_vector_mode)
5016 && (related_vector_mode (vector_modes[mode_i],
5017 GET_MODE_INNER (autodetected_vector_mode))
5018 == autodetected_vector_mode)
5019 && (related_vector_mode (autodetected_vector_mode,
5020 GET_MODE_INNER (vector_modes[mode_i]))
5021 == vector_modes[mode_i]))
5022 {
5023 if (dump_enabled_p ())
5024 dump_printf_loc (MSG_NOTE, vect_location,
5025 "***** Skipping vector mode %s, which would"
5026 " repeat the analysis for %s\n",
5027 GET_MODE_NAME (vector_modes[mode_i]),
5028 GET_MODE_NAME (autodetected_vector_mode));
5029 mode_i += 1;
5030 }
5031
5032 if (vectorized
5033 || mode_i == vector_modes.length ()
5034 || autodetected_vector_mode == VOIDmode
5035 /* If vect_slp_analyze_bb_1 signaled that analysis for all
5036 vector sizes will fail do not bother iterating. */
5037 || fatal)
5038 return vectorized;
5039
5040 /* Try the next biggest vector size. */
5041 next_vector_mode = vector_modes[mode_i++];
5042 if (dump_enabled_p ())
5043 dump_printf_loc (MSG_NOTE, vect_location,
5044 "***** Re-trying analysis with vector mode %s\n",
5045 GET_MODE_NAME (next_vector_mode));
5046 }
5047 }
5048
5049
5050 /* Main entry for the BB vectorizer. Analyze and transform BBS, returning
5051 true if anything in the basic blocks was vectorized. */
5052
5053 static bool
5054 vect_slp_bbs (vec<basic_block> bbs)
5055 {
5056 vec<data_reference_p> datarefs = vNULL;
5057 auto_vec<int> dataref_groups;
5058 int insns = 0;
5059 int current_group = 0;
5060
5061 for (unsigned i = 0; i < bbs.length (); i++)
5062 {
5063 basic_block bb = bbs[i];
5064 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
5065 gsi_next (&gsi))
5066 {
5067 gimple *stmt = gsi_stmt (gsi);
5068 if (is_gimple_debug (stmt))
5069 continue;
5070
5071 insns++;
5072
5073 if (gimple_location (stmt) != UNKNOWN_LOCATION)
5074 vect_location = stmt;
5075
5076 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
5077 &dataref_groups, current_group))
5078 ++current_group;
5079 }
5080 }
5081
5082 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
5083 }
5084
5085 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5086 true if anything in the basic-block was vectorized. */
5087
5088 bool
5089 vect_slp_bb (basic_block bb)
5090 {
5091 auto_vec<basic_block> bbs;
5092 bbs.safe_push (bb);
5093 return vect_slp_bbs (bbs);
5094 }
5095
5096 /* Main entry for the BB vectorizer. Analyze and transform the basic
5097 blocks of FUN, returning true if anything was vectorized. */
5098
5099 bool
5100 vect_slp_function (function *fun)
5101 {
5102 bool r = false;
5103 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
5104 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
5105
5106 /* For the moment split the function into pieces to avoid making
5107 the iteration on the vector mode moot. Split at points we know
5108 we do not handle well, which are CFG merges (SLP discovery doesn't
5109 handle non-loop-header PHIs) and loop exits. Since pattern
5110 recog requires reverse iteration to visit uses before defs,
5111 simply chop the RPO into pieces. */
5112 auto_vec<basic_block> bbs;
5113 for (unsigned i = 0; i < n; i++)
5114 {
5115 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
5116 bool split = false;
5117
5118 /* Split when a BB is not dominated by the first block. */
5119 if (!bbs.is_empty ()
5120 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
5121 {
5122 if (dump_enabled_p ())
5123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5124 "splitting region at dominance boundary bb%d\n",
5125 bb->index);
5126 split = true;
5127 }
5128 /* Split when the loop determined by the first block
5129 is exited. This is because we eventually insert
5130 invariants at the region's beginning. */
5131 else if (!bbs.is_empty ()
5132 && bbs[0]->loop_father != bb->loop_father
5133 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
5134 {
5135 if (dump_enabled_p ())
5136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5137 "splitting region at loop %d exit at bb%d\n",
5138 bbs[0]->loop_father->num, bb->index);
5139 split = true;
5140 }
5141
5142 if (split && !bbs.is_empty ())
5143 {
5144 r |= vect_slp_bbs (bbs);
5145 bbs.truncate (0);
5146 bbs.quick_push (bb);
5147 }
5148 else
5149 bbs.safe_push (bb);
5150
5151 /* When a stmt ends this block and defines a value, inserting
5152 after it (for a vector containing its definition) would require
5153 inserting on edges. Avoid this for now. */
5154 if (gimple *last = last_stmt (bb))
5155 if (gimple_get_lhs (last)
5156 && is_ctrl_altering_stmt (last))
5157 {
5158 if (dump_enabled_p ())
5159 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5160 "splitting region at control altering "
5161 "definition %G", last);
5162 r |= vect_slp_bbs (bbs);
5163 bbs.truncate (0);
5164 }
5165 }
5166
5167 if (!bbs.is_empty ())
5168 r |= vect_slp_bbs (bbs);
5169
5170 free (rpo);
5171
5172 return r;
5173 }
5174
5175 /* Build a variable-length vector in which the elements in ELTS are repeated
5176 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
5177 RESULTS and add any new instructions to SEQ.
5178
5179 The approach we use is:
5180
5181 (1) Find a vector mode VM with integer elements of mode IM.
5182
5183 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5184 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
5185 from small vectors to IM.
5186
5187 (3) Duplicate each ELTS'[I] into a vector of mode VM.
5188
5189 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
5190 correct byte contents.
5191
5192 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
5193
5194 We try to find the largest IM for which this sequence works, in order
5195 to cut down on the number of interleaves. */
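/* For example (illustrative; assumes an SVE-like target with VNx2DI
   vectors): repeating four SImode elements { a, b, c, d } uses IM = DI
   and VM = VNx2DI.  The pairs { a, b } and { c, d } are view-converted
   to DImode values d0 and d1, each is duplicated into a VNx2DI and one
   interleaving VEC_PERM_EXPR produces { d0, d1, d0, d1, ... }, whose
   byte contents repeat a, b, c, d; finally the result is view-converted
   to the required vector type.  */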
5196
5197 void
5198 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
5199 vec<tree> elts, unsigned int nresults,
5200 vec<tree> &results)
5201 {
5202 unsigned int nelts = elts.length ();
5203 tree element_type = TREE_TYPE (vector_type);
5204
5205 /* (1) Find a vector mode VM with integer elements of mode IM. */
5206 unsigned int nvectors = 1;
5207 tree new_vector_type;
5208 tree permutes[2];
5209 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
5210 &nvectors, &new_vector_type,
5211 permutes))
5212 gcc_unreachable ();
5213
5214 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
5215 unsigned int partial_nelts = nelts / nvectors;
5216 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
5217
5218 tree_vector_builder partial_elts;
5219 auto_vec<tree, 32> pieces (nvectors * 2);
5220 pieces.quick_grow_cleared (nvectors * 2);
5221 for (unsigned int i = 0; i < nvectors; ++i)
5222 {
5223 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5224 ELTS' has mode IM. */
5225 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
5226 for (unsigned int j = 0; j < partial_nelts; ++j)
5227 partial_elts.quick_push (elts[i * partial_nelts + j]);
5228 tree t = gimple_build_vector (seq, &partial_elts);
5229 t = gimple_build (seq, VIEW_CONVERT_EXPR,
5230 TREE_TYPE (new_vector_type), t);
5231
5232 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
5233 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
5234 }
5235
5236 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
5237 correct byte contents.
5238
5239 Conceptually, we need to repeat the following operation log2(nvectors)
5240 times, where hi_start = nvectors / 2:
5241
5242 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
5243 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
5244
5245 However, if each input repeats every N elements and the VF is
5246 a multiple of N * 2, the HI result is the same as the LO result.
5247 This will be true for the first N1 iterations of the outer loop,
5248 followed by N2 iterations for which both the LO and HI results
5249 are needed. I.e.:
5250
5251 N1 + N2 = log2(nvectors)
5252
5253 Each "N1 iteration" doubles the number of redundant vectors and the
5254 effect of the process as a whole is to have a sequence of nvectors/2**N1
5255 vectors that repeats 2**N1 times. Rather than generate these redundant
5256 vectors, we halve the number of vectors for each N1 iteration. */
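/* A hedged numeric illustration (assumed values): with NVECTORS == 4 and
inputs that repeat every element, a VF that is a multiple of both 2 and 4
means the odd outputs are skipped in both outer iterations, so N1 == 2,
N2 == 0 and NEW_NVECTORS ends up as 1 -- a single vector whose contents
repeat 4 times stands in for all four results. */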
5257 unsigned int in_start = 0;
5258 unsigned int out_start = nvectors;
5259 unsigned int new_nvectors = nvectors;
5260 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
5261 {
5262 unsigned int hi_start = new_nvectors / 2;
5263 unsigned int out_i = 0;
5264 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
5265 {
5266 if ((in_i & 1) != 0
5267 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
5268 2 * in_repeat))
5269 continue;
5270
5271 tree output = make_ssa_name (new_vector_type);
5272 tree input1 = pieces[in_start + (in_i / 2)];
5273 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
5274 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
5275 input1, input2,
5276 permutes[in_i & 1]);
5277 gimple_seq_add_stmt (seq, stmt);
5278 pieces[out_start + out_i] = output;
5279 out_i += 1;
5280 }
5281 std::swap (in_start, out_start);
5282 new_nvectors = out_i;
5283 }
5284
5285 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
5286 results.reserve (nresults);
5287 for (unsigned int i = 0; i < nresults; ++i)
5288 if (i < new_nvectors)
5289 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
5290 pieces[in_start + i]));
5291 else
5292 results.quick_push (results[i - new_nvectors]);
5293 }
5294
5295
5296 /* For constant and loop invariant defs in OP_NODE this function creates
5297 vector defs that will be used in the vectorized stmts and stores them
5298 to SLP_TREE_VEC_DEFS of OP_NODE. */
5299
5300 static void
5301 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
5302 {
5303 unsigned HOST_WIDE_INT nunits;
5304 tree vec_cst;
5305 unsigned j, number_of_places_left_in_vector;
5306 tree vector_type;
5307 tree vop;
5308 int group_size = op_node->ops.length ();
5309 unsigned int vec_num, i;
5310 unsigned number_of_copies = 1;
5311 bool constant_p;
5312 gimple_seq ctor_seq = NULL;
5313 auto_vec<tree, 16> permute_results;
5314
5315 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
5316 vector_type = SLP_TREE_VECTYPE (op_node);
5317
5318 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
5319 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
5320 auto_vec<tree> voprnds (number_of_vectors);
5321
5322 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5323 created vectors. It is greater than 1 if unrolling is performed.
5324
5325 For example, we have two scalar operands, s1 and s2 (e.g., group of
5326 strided accesses of size two), while NUNITS is four (i.e., four scalars
5327 of this type can be packed in a vector). The output vector will contain
5328 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5329 will be 2).
5330
5331 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
5332 containing the operands.
5333
5334 For example, NUNITS is four as before, and the group size is 8
5335 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5336 {s5, s6, s7, s8}. */
5337
5338 /* When using duplicate_and_interleave, we just need one element for
5339 each scalar statement. */
5340 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5341 nunits = group_size;
5342
5343 number_of_copies = nunits * number_of_vectors / group_size;
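/* A hedged sanity check of the examples above: GROUP_SIZE == 2, NUNITS == 4
and NUMBER_OF_VECTORS == 1 give NUMBER_OF_COPIES == 4 * 1 / 2 == 2
(the {s1, s2, s1, s2} case), while GROUP_SIZE == 8, NUNITS == 4 and
NUMBER_OF_VECTORS == 2 give a single copy spread over two vectors. */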
5344
5345 number_of_places_left_in_vector = nunits;
5346 constant_p = true;
5347 tree_vector_builder elts (vector_type, nunits, 1);
5348 elts.quick_grow (nunits);
5349 stmt_vec_info insert_after = NULL;
5350 for (j = 0; j < number_of_copies; j++)
5351 {
5352 tree op;
5353 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
5354 {
5355 /* Create 'vect_ = {op0,op1,...,opn}'. */
5356 number_of_places_left_in_vector--;
5357 tree orig_op = op;
5358 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
5359 {
5360 if (CONSTANT_CLASS_P (op))
5361 {
5362 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
5363 {
5364 /* Can't use VIEW_CONVERT_EXPR for booleans because
5365 of possibly different sizes of scalar value and
5366 vector element. */
5367 if (integer_zerop (op))
5368 op = build_int_cst (TREE_TYPE (vector_type), 0);
5369 else if (integer_onep (op))
5370 op = build_all_ones_cst (TREE_TYPE (vector_type));
5371 else
5372 gcc_unreachable ();
5373 }
5374 else
5375 op = fold_unary (VIEW_CONVERT_EXPR,
5376 TREE_TYPE (vector_type), op);
5377 gcc_assert (op && CONSTANT_CLASS_P (op));
5378 }
5379 else
5380 {
5381 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
5382 gimple *init_stmt;
5383 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
5384 {
5385 tree true_val
5386 = build_all_ones_cst (TREE_TYPE (vector_type));
5387 tree false_val
5388 = build_zero_cst (TREE_TYPE (vector_type));
5389 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
5390 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
5391 op, true_val,
5392 false_val);
5393 }
5394 else
5395 {
5396 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
5397 op);
5398 init_stmt
5399 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
5400 op);
5401 }
5402 gimple_seq_add_stmt (&ctor_seq, init_stmt);
5403 op = new_temp;
5404 }
5405 }
5406 elts[number_of_places_left_in_vector] = op;
5407 if (!CONSTANT_CLASS_P (op))
5408 constant_p = false;
5409 /* For BB vectorization we have to compute an insert location
5410 when a def is inside the analyzed region since we cannot
5411 simply insert at the BB start in this case. */
5412 stmt_vec_info opdef;
5413 if (TREE_CODE (orig_op) == SSA_NAME
5414 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
5415 && is_a <bb_vec_info> (vinfo)
5416 && (opdef = vinfo->lookup_def (orig_op)))
5417 {
5418 if (!insert_after)
5419 insert_after = opdef;
5420 else
5421 insert_after = get_later_stmt (insert_after, opdef);
5422 }
5423
5424 if (number_of_places_left_in_vector == 0)
5425 {
5426 if (constant_p
5427 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5428 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5429 vec_cst = gimple_build_vector (&ctor_seq, &elts);
5430 else
5431 {
5432 if (permute_results.is_empty ())
5433 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
5434 elts, number_of_vectors,
5435 permute_results);
5436 vec_cst = permute_results[number_of_vectors - j - 1];
5437 }
5438 if (!gimple_seq_empty_p (ctor_seq))
5439 {
5440 if (insert_after)
5441 {
5442 gimple_stmt_iterator gsi;
5443 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
5444 {
5445 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
5446 gsi_insert_seq_before (&gsi, ctor_seq,
5447 GSI_CONTINUE_LINKING);
5448 }
5449 else if (!stmt_ends_bb_p (insert_after->stmt))
5450 {
5451 gsi = gsi_for_stmt (insert_after->stmt);
5452 gsi_insert_seq_after (&gsi, ctor_seq,
5453 GSI_CONTINUE_LINKING);
5454 }
5455 else
5456 {
5457 /* When we want to insert after a def whose
5458 defining stmt throws, insert on the fallthru
5459 edge instead. */
5460 edge e = find_fallthru_edge
5461 (gimple_bb (insert_after->stmt)->succs);
5462 basic_block new_bb
5463 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
5464 gcc_assert (!new_bb);
5465 }
5466 }
5467 else
5468 vinfo->insert_seq_on_entry (NULL, ctor_seq);
5469 ctor_seq = NULL;
5470 }
5471 voprnds.quick_push (vec_cst);
5472 insert_after = NULL;
5473 number_of_places_left_in_vector = nunits;
5474 constant_p = true;
5475 elts.new_vector (vector_type, nunits, 1);
5476 elts.quick_grow (nunits);
5477 }
5478 }
5479 }
5480
5481 /* Since the vectors are created in the reverse order, we should invert
5482 them. */
5483 vec_num = voprnds.length ();
5484 for (j = vec_num; j != 0; j--)
5485 {
5486 vop = voprnds[j - 1];
5487 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5488 }
5489
5490 /* If VF is greater than the unrolling factor needed for the SLP
5491 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
5492 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
5493 to replicate the vectors. */
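/* A hedged example (assumed counts): if NUMBER_OF_VECTORS == 4 but only
two distinct vectors v0 and v1 were created above, the loop below
re-pushes them to yield the defs { v0, v1, v0, v1 }. */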
5494 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
5495 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
5496 i++)
5497 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5498 }
5499
5500 /* Get the Ith vectorized definition from SLP_NODE. */
5501
5502 tree
5503 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
5504 {
5505 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
5506 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
5507 else
5508 return SLP_TREE_VEC_DEFS (slp_node)[i];
5509 }
5510
5511 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
5512
5513 void
5514 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
5515 {
5516 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
5517 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
5518 {
5519 unsigned j;
5520 gimple *vec_def_stmt;
5521 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
5522 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
5523 }
5524 else
5525 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
5526 }
5527
5528 /* Get N vectorized definitions for SLP_NODE. */
5529
5530 void
5531 vect_get_slp_defs (vec_info *,
5532 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
5533 {
5534 if (n == -1U)
5535 n = SLP_TREE_CHILDREN (slp_node).length ();
5536
5537 for (unsigned i = 0; i < n; ++i)
5538 {
5539 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
5540 vec<tree> vec_defs = vNULL;
5541 vect_get_slp_defs (child, &vec_defs);
5542 vec_oprnds->quick_push (vec_defs);
5543 }
5544 }
5545
5546 /* Generate vector permute statements from a list of loads in DR_CHAIN.
5547 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
5548 permute statements for the SLP node NODE. Store the number of vector
5549 permute instructions in *N_PERMS and the number of vector load
5550 instructions in *N_LOADS. */
5551
5552 bool
5553 vect_transform_slp_perm_load (vec_info *vinfo,
5554 slp_tree node, vec<tree> dr_chain,
5555 gimple_stmt_iterator *gsi, poly_uint64 vf,
5556 bool analyze_only, unsigned *n_perms,
5557 unsigned int *n_loads)
5558 {
5559 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5560 int vec_index = 0;
5561 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5562 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
5563 unsigned int mask_element;
5564 machine_mode mode;
5565
5566 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
5567 return false;
5568
5569 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
5570
5571 mode = TYPE_MODE (vectype);
5572 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5573
5574 /* Initialize the vect stmts of NODE to properly insert the generated
5575 stmts later. */
5576 if (! analyze_only)
5577 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
5578 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
5579 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
5580
5581 /* Generate permutation masks for every NODE. Number of masks for each NODE
5582 is equal to GROUP_SIZE.
5583 E.g., we have a group of three nodes with three loads from the same
5584 location in each node, and the vector size is 4. I.e., we have a
5585 a0b0c0a1b1c1... sequence and we need to create the following vectors:
5586 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
5587 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
5588 ...
5589
5590 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
5591 The last mask is illegal since we assume two operands for permute
5592 operation, and the mask element values can't be outside that range.
5593 Hence, the last mask must be converted into {2,5,5,5}.
5594 For the first two permutations we need the first and the second input
5595 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
5596 we need the second and the third vectors: {b1,c1,a2,b2} and
5597 {c2,a3,b3,c3}. */
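/* A hedged note on the conversion above: with four lanes per vector,
element index 6 lives in vector 1 at lane 6 % 4 == 2 and index 9 lives in
vector 2 at lane 9 % 4 == 1; selecting vectors 1 and 2 as the two permute
operands turns { 6, 9, 9, 9 } into { 2, 5, 5, 5 }, since lanes taken from
the second operand are offset by the vector length (1 + 4 == 5). */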
5598
5599 int vect_stmts_counter = 0;
5600 unsigned int index = 0;
5601 int first_vec_index = -1;
5602 int second_vec_index = -1;
5603 bool noop_p = true;
5604 *n_perms = 0;
5605
5606 vec_perm_builder mask;
5607 unsigned int nelts_to_build;
5608 unsigned int nvectors_per_build;
5609 unsigned int in_nlanes;
5610 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
5611 && multiple_p (nunits, group_size));
5612 if (repeating_p)
5613 {
5614 /* A single vector contains a whole number of copies of the node, so:
5615 (a) all permutes can use the same mask; and
5616 (b) the permutes only need a single vector input. */
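/* A hedged example (assumed numbers): for a two-element group with
load permutation { 1, 0 } and DR_GROUP_SIZE == 2, the loop below builds
the encoded mask { 1, 0, 3, 2, 5, 4 } (three patterns of two elements),
i.e. a mask swapping adjacent pairs across the whole vector, and
FIRST_VEC_INDEX stays 0 throughout. */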
5617 mask.new_vector (nunits, group_size, 3);
5618 nelts_to_build = mask.encoded_nelts ();
5619 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
5620 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
5621 }
5622 else
5623 {
5624 /* We need to construct a separate mask for each vector statement. */
5625 unsigned HOST_WIDE_INT const_nunits, const_vf;
5626 if (!nunits.is_constant (&const_nunits)
5627 || !vf.is_constant (&const_vf))
5628 return false;
5629 mask.new_vector (const_nunits, const_nunits, 1);
5630 nelts_to_build = const_vf * group_size;
5631 nvectors_per_build = 1;
5632 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
5633 }
5634 auto_sbitmap used_in_lanes (in_nlanes);
5635 bitmap_clear (used_in_lanes);
5636
5637 unsigned int count = mask.encoded_nelts ();
5638 mask.quick_grow (count);
5639 vec_perm_indices indices;
5640
5641 for (unsigned int j = 0; j < nelts_to_build; j++)
5642 {
5643 unsigned int iter_num = j / group_size;
5644 unsigned int stmt_num = j % group_size;
5645 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
5646 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
5647 bitmap_set_bit (used_in_lanes, i);
5648 if (repeating_p)
5649 {
5650 first_vec_index = 0;
5651 mask_element = i;
5652 }
5653 else
5654 {
5655 /* Enforced before the loop when !repeating_p. */
5656 unsigned int const_nunits = nunits.to_constant ();
5657 vec_index = i / const_nunits;
5658 mask_element = i % const_nunits;
5659 if (vec_index == first_vec_index
5660 || first_vec_index == -1)
5661 {
5662 first_vec_index = vec_index;
5663 }
5664 else if (vec_index == second_vec_index
5665 || second_vec_index == -1)
5666 {
5667 second_vec_index = vec_index;
5668 mask_element += const_nunits;
5669 }
5670 else
5671 {
5672 if (dump_enabled_p ())
5673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5674 "permutation requires at "
5675 "least three vectors %G",
5676 stmt_info->stmt);
5677 gcc_assert (analyze_only);
5678 return false;
5679 }
5680
5681 gcc_assert (mask_element < 2 * const_nunits);
5682 }
5683
5684 if (mask_element != index)
5685 noop_p = false;
5686 mask[index++] = mask_element;
5687
5688 if (index == count && !noop_p)
5689 {
5690 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
5691 if (!can_vec_perm_const_p (mode, indices))
5692 {
5693 if (dump_enabled_p ())
5694 {
5695 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5696 vect_location,
5697 "unsupported vect permute { ");
5698 for (i = 0; i < count; ++i)
5699 {
5700 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5701 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5702 }
5703 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5704 }
5705 gcc_assert (analyze_only);
5706 return false;
5707 }
5708
5709 ++*n_perms;
5710 }
5711
5712 if (index == count)
5713 {
5714 if (!analyze_only)
5715 {
5716 tree mask_vec = NULL_TREE;
5717
5718 if (! noop_p)
5719 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5720
5721 if (second_vec_index == -1)
5722 second_vec_index = first_vec_index;
5723
5724 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
5725 {
5726 /* Generate the permute statement if necessary. */
5727 tree first_vec = dr_chain[first_vec_index + ri];
5728 tree second_vec = dr_chain[second_vec_index + ri];
5729 gimple *perm_stmt;
5730 if (! noop_p)
5731 {
5732 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5733 tree perm_dest
5734 = vect_create_destination_var (gimple_assign_lhs (stmt),
5735 vectype);
5736 perm_dest = make_ssa_name (perm_dest);
5737 perm_stmt
5738 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5739 first_vec, second_vec,
5740 mask_vec);
5741 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
5742 gsi);
5743 }
5744 else
5745 /* If mask was NULL_TREE generate the requested
5746 identity transform. */
5747 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
5748
5749 /* Store the vector statement in NODE. */
5750 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
5751 }
5752 }
5753
5754 index = 0;
5755 first_vec_index = -1;
5756 second_vec_index = -1;
5757 noop_p = true;
5758 }
5759 }
5760
5761 if (n_loads)
5762 {
5763 if (repeating_p)
5764 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5765 else
5766 {
5767 /* Enforced above when !repeating_p. */
5768 unsigned int const_nunits = nunits.to_constant ();
5769 *n_loads = 0;
5770 bool load_seen = false;
5771 for (unsigned i = 0; i < in_nlanes; ++i)
5772 {
5773 if (i % const_nunits == 0)
5774 {
5775 if (load_seen)
5776 *n_loads += 1;
5777 load_seen = false;
5778 }
5779 if (bitmap_bit_p (used_in_lanes, i))
5780 load_seen = true;
5781 }
5782 if (load_seen)
5783 *n_loads += 1;
5784 }
5785 }
5786
5787 return true;
5788 }
5789
5790
5791 /* Vectorize the SLP permutations in NODE as specified
5792 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5793 child number and lane number.
5794 Interleaving of two two-lane two-child SLP subtrees (not supported):
5795 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5796 A blend of two four-lane two-child SLP subtrees:
5797 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5798 Highpart of a four-lane one-child SLP subtree (not supported):
5799 [ { 0, 2 }, { 0, 3 } ]
5800 Currently only a subset is supported by the code generation below. */
5801
5802 static bool
5803 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5804 slp_tree node, stmt_vector_for_cost *cost_vec)
5805 {
5806 tree vectype = SLP_TREE_VECTYPE (node);
5807
5808 /* ??? We currently only support the case of all inputs and the output
5809 having the same vector type, while the SLP IL should really do a
5810 concat + select and thus accept arbitrary mismatches. */
5811 slp_tree child;
5812 unsigned i;
5813 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5814 if (!vect_maybe_update_slp_op_vectype (child, vectype)
5815 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
5816 {
5817 if (dump_enabled_p ())
5818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5819 "Unsupported lane permutation\n");
5820 return false;
5821 }
5822
5823 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
5824 gcc_assert (perm.length () == SLP_TREE_LANES (node));
5825 if (dump_enabled_p ())
5826 {
5827 dump_printf_loc (MSG_NOTE, vect_location,
5828 "vectorizing permutation");
5829 for (unsigned i = 0; i < perm.length (); ++i)
5830 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
5831 dump_printf (MSG_NOTE, "\n");
5832 }
5833
5834 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5835 if (!nunits.is_constant ())
5836 return false;
5837 unsigned HOST_WIDE_INT vf = 1;
5838 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
5839 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
5840 return false;
5841 unsigned olanes = vf * SLP_TREE_LANES (node);
5842 gcc_assert (multiple_p (olanes, nunits));
5843
5844 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
5845 from the { SLP operand, scalar lane } permutation as recorded in the
5846 SLP node as an intermediate step. This part should already work
5847 with SLP children with an arbitrary number of lanes. */
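/* A hedged walk-through (assumed V4SI operands, VF == 1): for the blend
[ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ] from the comment above, this
yields VPERM = [ {{0,0},0}, {{1,0},1}, {{0,0},2}, {{1,0},3} ], and the
mask loop further below selects operand 0's vector 0 and operand 1's
vector 0 as the two inputs with mask { 0, 5, 2, 7 }, i.e. a single
VEC_PERM_EXPR blending the two vectors. */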
5848 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
5849 auto_vec<unsigned> active_lane;
5850 vperm.create (olanes);
5851 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
5852 for (unsigned i = 0; i < vf; ++i)
5853 {
5854 for (unsigned pi = 0; pi < perm.length (); ++pi)
5855 {
5856 std::pair<unsigned, unsigned> p = perm[pi];
5857 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
5858 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
5859 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
5860 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
5861 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
5862 }
5863 /* Advance to the next group. */
5864 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
5865 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
5866 }
5867
5868 if (dump_enabled_p ())
5869 {
5870 dump_printf_loc (MSG_NOTE, vect_location, "as");
5871 for (unsigned i = 0; i < vperm.length (); ++i)
5872 {
5873 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
5874 dump_printf (MSG_NOTE, ",");
5875 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
5876 vperm[i].first.first, vperm[i].first.second,
5877 vperm[i].second);
5878 }
5879 dump_printf (MSG_NOTE, "\n");
5880 }
5881
5882 /* We can only handle two-vector permutes; everything else should
5883 be lowered on the SLP level. The following is closely inspired
5884 by vect_transform_slp_perm_load and is supposed to eventually
5885 replace it.
5886 ??? As an intermediate step, do code-gen in the SLP tree
5887 representation somehow? */
5888 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
5889 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
5890 unsigned int const_nunits = nunits.to_constant ();
5891 unsigned int index = 0;
5892 unsigned int mask_element;
5893 vec_perm_builder mask;
5894 mask.new_vector (const_nunits, const_nunits, 1);
5895 unsigned int count = mask.encoded_nelts ();
5896 mask.quick_grow (count);
5897 vec_perm_indices indices;
5898 unsigned nperms = 0;
5899 for (unsigned i = 0; i < vperm.length (); ++i)
5900 {
5901 mask_element = vperm[i].second;
5902 if (first_vec.first == -1U
5903 || first_vec == vperm[i].first)
5904 first_vec = vperm[i].first;
5905 else if (second_vec.first == -1U
5906 || second_vec == vperm[i].first)
5907 {
5908 second_vec = vperm[i].first;
5909 mask_element += const_nunits;
5910 }
5911 else
5912 {
5913 if (dump_enabled_p ())
5914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5915 "permutation requires at "
5916 "least three vectors\n");
5917 gcc_assert (!gsi);
5918 return false;
5919 }
5920
5921 mask[index++] = mask_element;
5922
5923 if (index == count)
5924 {
5925 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
5926 const_nunits);
5927 bool identity_p = indices.series_p (0, 1, 0, 1);
5928 if (!identity_p
5929 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5930 {
5931 if (dump_enabled_p ())
5932 {
5933 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5934 vect_location,
5935 "unsupported vect permute { ");
5936 for (i = 0; i < count; ++i)
5937 {
5938 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5939 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5940 }
5941 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5942 }
5943 gcc_assert (!gsi);
5944 return false;
5945 }
5946
5947 if (!identity_p)
5948 nperms++;
5949 if (gsi)
5950 {
5951 if (second_vec.first == -1U)
5952 second_vec = first_vec;
5953
5954 /* Generate the permute statement if necessary. */
5955 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
5956 tree first_def
5957 = vect_get_slp_vect_def (first_node, first_vec.second);
5958 /* ??? We SLP match existing vector element extracts but
5959 allow punning which we need to re-instantiate at uses
5960 and have no good way of representing explicitly. */
5961 if (!types_compatible_p (TREE_TYPE (first_def), vectype))
5962 {
5963 gassign *conv_stmt;
5964 conv_stmt = gimple_build_assign (make_ssa_name (vectype),
5965 build1 (VIEW_CONVERT_EXPR,
5966 vectype, first_def));
5967 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
5968 first_def = gimple_assign_lhs (conv_stmt);
5969 }
5970 gassign *perm_stmt;
5971 tree perm_dest = make_ssa_name (vectype);
5972 if (!identity_p)
5973 {
5974 slp_tree second_node
5975 = SLP_TREE_CHILDREN (node)[second_vec.first];
5976 tree second_def
5977 = vect_get_slp_vect_def (second_node, second_vec.second);
5978 if (!types_compatible_p (TREE_TYPE (second_def), vectype))
5979 {
5980 gassign *conv_stmt;
5981 conv_stmt = gimple_build_assign (make_ssa_name (vectype),
5982 build1
5983 (VIEW_CONVERT_EXPR,
5984 vectype, second_def));
5985 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
5986 second_def = gimple_assign_lhs (conv_stmt);
5987 }
5988 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5989 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5990 first_def, second_def,
5991 mask_vec);
5992 }
5993 else
5994 /* We need a copy here in case the def was external. */
5995 perm_stmt = gimple_build_assign (perm_dest, first_def);
5996 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
5997 /* Store the vector statement in NODE. */
5998 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
5999 }
6000
6001 index = 0;
6002 first_vec = std::make_pair (-1U, -1U);
6003 second_vec = std::make_pair (-1U, -1U);
6004 }
6005 }
6006
6007 if (!gsi)
6008 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
6009
6010 return true;
6011 }
6012
6013 /* Vectorize SLP NODE. */
6014
6015 static void
6016 vect_schedule_slp_node (vec_info *vinfo,
6017 slp_tree node, slp_instance instance)
6018 {
6019 gimple_stmt_iterator si;
6020 int i;
6021 slp_tree child;
6022
6023 /* For existing vectors there's nothing to do. */
6024 if (SLP_TREE_VEC_DEFS (node).exists ())
6025 return;
6026
6027 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
6028
6029 /* Vectorize externals and constants. */
6030 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
6031 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6032 {
6033 /* ??? vectorizable_shift can end up using a scalar operand which is
6034 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
6035 node in this case. */
6036 if (!SLP_TREE_VECTYPE (node))
6037 return;
6038
6039 vect_create_constant_vectors (vinfo, node);
6040 return;
6041 }
6042
6043 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6044
6045 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
6046 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6047
6048 if (dump_enabled_p ())
6049 dump_printf_loc (MSG_NOTE, vect_location,
6050 "------>vectorizing SLP node starting from: %G",
6051 stmt_info->stmt);
6052
6053 if (STMT_VINFO_DATA_REF (stmt_info)
6054 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
6055 {
6056 /* Vectorized loads go before the first scalar load to make it
6057 ready early; vectorized stores go before the last scalar
6058 stmt, which is where all uses are ready. */
6059 stmt_vec_info last_stmt_info = NULL;
6060 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
6061 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
6062 else /* DR_IS_WRITE */
6063 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
6064 si = gsi_for_stmt (last_stmt_info->stmt);
6065 }
6066 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
6067 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
6068 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
6069 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
6070 {
6071 /* For PHI node vectorization we do not use the insertion iterator. */
6072 si = gsi_none ();
6073 }
6074 else
6075 {
6076 /* Emit other stmts after the children's vectorized defs, which is
6077 the earliest place possible. */
6078 gimple *last_stmt = NULL;
6079 bool seen_vector_def = false;
6080 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6081 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6082 {
6083 /* For fold-left reductions we are retaining the scalar
6084 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
6085 set so the representation isn't perfect. Resort to the
6086 last scalar def here. */
6087 if (SLP_TREE_VEC_STMTS (child).is_empty ())
6088 {
6089 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
6090 == cycle_phi_info_type);
6091 gphi *phi = as_a <gphi *>
6092 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
6093 if (!last_stmt
6094 || vect_stmt_dominates_stmt_p (last_stmt, phi))
6095 last_stmt = phi;
6096 }
6097 /* We are emitting all vectorized stmts in the same place, so
6098 the last one emitted is the latest.
6099 ??? Unless we have a load permutation applied and that
6100 happens to re-use an earlier generated load. */
6101 unsigned j;
6102 gimple *vstmt;
6103 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
6104 if (!last_stmt
6105 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
6106 last_stmt = vstmt;
6107 }
6108 else if (!SLP_TREE_VECTYPE (child))
6109 {
6110 /* For externals that are used unvectorized, look at all scalar defs. */
6111 unsigned j;
6112 tree def;
6113 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
6114 if (TREE_CODE (def) == SSA_NAME
6115 && !SSA_NAME_IS_DEFAULT_DEF (def))
6116 {
6117 gimple *stmt = SSA_NAME_DEF_STMT (def);
6118 if (!last_stmt
6119 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
6120 last_stmt = stmt;
6121 }
6122 }
6123 else
6124 {
6125 /* For externals we have to look at all defs since their
6126 insertion place is decided per vector. But beware
6127 of pre-existing vectors where we need to make sure
6128 we do not insert before the region boundary. */
6129 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
6130 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
6131 seen_vector_def = true;
6132 else
6133 {
6134 unsigned j;
6135 tree vdef;
6136 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
6137 if (TREE_CODE (vdef) == SSA_NAME
6138 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
6139 {
6140 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
6141 if (!last_stmt
6142 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
6143 last_stmt = vstmt;
6144 }
6145 }
6146 }
6147 /* This can happen when all children are pre-existing vectors or
6148 constants. */
6149 if (!last_stmt)
6150 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
6151 if (!last_stmt)
6152 {
6153 gcc_assert (seen_vector_def);
6154 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
6155 }
6156 else if (is_a <gphi *> (last_stmt))
6157 si = gsi_after_labels (gimple_bb (last_stmt));
6158 else
6159 {
6160 si = gsi_for_stmt (last_stmt);
6161 gsi_next (&si);
6162 }
6163 }
6164
6165 bool done_p = false;
6166
6167 /* Handle purely internal nodes. */
6168 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6169 {
6170 /* ??? The transform kind is stored to STMT_VINFO_TYPE which might
6171 be shared with different SLP nodes (but usually it's the same
6172 operation apart from the case where the stmt is only there to denote
6173 the actual scalar lane defs ...). So do not call vect_transform_stmt
6174 but open-code it here (partly). */
6175 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
6176 gcc_assert (done);
6177 done_p = true;
6178 }
6179 if (!done_p)
6180 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
6181 }
6182
6183 /* Replace scalar calls from SLP node NODE with assignments setting their
6184 lhs to zero. For loop vectorization this is done in vectorizable_call,
6185 but for SLP it needs to be deferred until the end of vect_schedule_slp,
6186 because multiple SLP instances may refer to the same scalar stmt. */
6187
6188 static void
6189 vect_remove_slp_scalar_calls (vec_info *vinfo,
6190 slp_tree node, hash_set<slp_tree> &visited)
6191 {
6192 gimple *new_stmt;
6193 gimple_stmt_iterator gsi;
6194 int i;
6195 slp_tree child;
6196 tree lhs;
6197 stmt_vec_info stmt_info;
6198
6199 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
6200 return;
6201
6202 if (visited.add (node))
6203 return;
6204
6205 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6206 vect_remove_slp_scalar_calls (vinfo, child, visited);
6207
6208 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6209 {
6210 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
6211 if (!stmt || gimple_bb (stmt) == NULL)
6212 continue;
6213 if (is_pattern_stmt_p (stmt_info)
6214 || !PURE_SLP_STMT (stmt_info))
6215 continue;
6216 lhs = gimple_call_lhs (stmt);
6217 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
6218 gsi = gsi_for_stmt (stmt);
6219 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
6220 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
6221 }
6222 }
6223
6224 static void
6225 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
6226 {
6227 hash_set<slp_tree> visited;
6228 vect_remove_slp_scalar_calls (vinfo, node, visited);
6229 }
6230
6231 /* Vectorize the instance root. */
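/* A hedged example: for a BB SLP instance rooted at a CONSTRUCTOR
assignment x_1 = { a, b, c, d } that was vectorized into a single vector
def v, the root is replaced by x_1 = v (wrapped in a VIEW_CONVERT_EXPR if
the types differ); with two vector defs v0 and v1 it becomes
x_1 = { v0, v1 }. */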
6232
6233 void
6234 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
6235 {
6236 gassign *rstmt = NULL;
6237
6238 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
6239 {
6240 gimple *child_stmt;
6241 int j;
6242
6243 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
6244 {
6245 tree vect_lhs = gimple_get_lhs (child_stmt);
6246 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
6247 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
6248 TREE_TYPE (vect_lhs)))
6249 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
6250 vect_lhs);
6251 rstmt = gimple_build_assign (root_lhs, vect_lhs);
6252 break;
6253 }
6254 }
6255 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
6256 {
6257 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6258 gimple *child_stmt;
6259 int j;
6260 vec<constructor_elt, va_gc> *v;
6261 vec_alloc (v, nelts);
6262
6263 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
6264 {
6265 CONSTRUCTOR_APPEND_ELT (v,
6266 NULL_TREE,
6267 gimple_get_lhs (child_stmt));
6268 }
6269 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
6270 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
6271 tree r_constructor = build_constructor (rtype, v);
6272 rstmt = gimple_build_assign (lhs, r_constructor);
6273 }
6274
6275 gcc_assert (rstmt);
6276
6277 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
6278 gsi_replace (&rgsi, rstmt, true);
6279 }
6280
6281 struct slp_scc_info
6282 {
6283 bool on_stack;
6284 int dfs;
6285 int lowlink;
6286 };
6287
6288 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
6289
6290 static void
6291 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
6292 hash_map<slp_tree, slp_scc_info> &scc_info,
6293 int &maxdfs, vec<slp_tree> &stack)
6294 {
6295 bool existed_p;
6296 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
6297 gcc_assert (!existed_p);
6298 info->dfs = maxdfs;
6299 info->lowlink = maxdfs;
6300 maxdfs++;
6301
6302 /* Leaf. */
6303 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
6304 {
6305 info->on_stack = false;
6306 vect_schedule_slp_node (vinfo, node, instance);
6307 return;
6308 }
6309
6310 info->on_stack = true;
6311 stack.safe_push (node);
6312
6313 unsigned i;
6314 slp_tree child;
6315 /* DFS recurse. */
6316 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6317 {
6318 if (!child)
6319 continue;
6320 slp_scc_info *child_info = scc_info.get (child);
6321 if (!child_info)
6322 {
6323 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
6324 /* Recursion might have re-allocated the hash map entries, so re-fetch them. */
6325 info = scc_info.get (node);
6326 child_info = scc_info.get (child);
6327 info->lowlink = MIN (info->lowlink, child_info->lowlink);
6328 }
6329 else if (child_info->on_stack)
6330 info->lowlink = MIN (info->lowlink, child_info->dfs);
6331 }
6332 if (info->lowlink != info->dfs)
6333 return;
6334
6335 auto_vec<slp_tree, 4> phis_to_fixup;
6336
6337 /* Singleton. */
6338 if (stack.last () == node)
6339 {
6340 stack.pop ();
6341 info->on_stack = false;
6342 vect_schedule_slp_node (vinfo, node, instance);
6343 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6344 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
6345 phis_to_fixup.quick_push (node);
6346 }
6347 else
6348 {
6349 /* SCC. */
6350 int last_idx = stack.length () - 1;
6351 while (stack[last_idx] != node)
6352 last_idx--;
6353 /* We can break the cycle at PHIs that have at least one
6354 code-generated child. Then we could re-start the DFS walk until
6355 all nodes in the SCC are covered (we might have new entries
6356 for only back-reachable nodes). But it's simpler to just
6357 iterate and schedule those that are ready. */
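/* A hedged illustration (assumed reduction cycle): for an SCC made of a
reduction PHI node and the addition feeding its backedge, the PHI is
ready first (PHIs may be scheduled with a child still on the stack), the
addition becomes ready once the PHI has left the stack, and the PHI's
backedge argument is filled in by the fixup loop further below. */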
6358 unsigned todo = stack.length () - last_idx;
6359 do
6360 {
6361 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
6362 {
6363 slp_tree entry = stack[idx];
6364 if (!entry)
6365 continue;
6366 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
6367 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
6368 bool ready = !phi;
6369 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
6370 if (!child)
6371 {
6372 gcc_assert (phi);
6373 ready = true;
6374 break;
6375 }
6376 else if (scc_info.get (child)->on_stack)
6377 {
6378 if (!phi)
6379 {
6380 ready = false;
6381 break;
6382 }
6383 }
6384 else
6385 {
6386 if (phi)
6387 {
6388 ready = true;
6389 break;
6390 }
6391 }
6392 if (ready)
6393 {
6394 vect_schedule_slp_node (vinfo, entry, instance);
6395 scc_info.get (entry)->on_stack = false;
6396 stack[idx] = NULL;
6397 todo--;
6398 if (phi)
6399 phis_to_fixup.safe_push (entry);
6400 }
6401 }
6402 }
6403 while (todo != 0);
6404
6405 /* Pop the SCC. */
6406 stack.truncate (last_idx);
6407 }
6408
6409 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
6410 slp_tree phi_node;
6411 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
6412 {
6413 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
6414 edge_iterator ei;
6415 edge e;
6416 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
6417 {
6418 unsigned dest_idx = e->dest_idx;
6419 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
6420 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
6421 continue;
6422 /* Simply fill all args. */
6423 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
6424 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
6425 vect_get_slp_vect_def (child, i),
6426 e, gimple_phi_arg_location (phi, dest_idx));
6427 }
6428 }
6429 }
6430
6431 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
6432
6433 void
6434 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
6435 {
6436 slp_instance instance;
6437 unsigned int i;
6438
6439 hash_map<slp_tree, slp_scc_info> scc_info;
6440 int maxdfs = 0;
6441 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6442 {
6443 slp_tree node = SLP_INSTANCE_TREE (instance);
6444 if (dump_enabled_p ())
6445 {
6446 dump_printf_loc (MSG_NOTE, vect_location,
6447 "Vectorizing SLP tree:\n");
6448 if (SLP_INSTANCE_ROOT_STMT (instance))
6449 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
6450 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
6451 vect_print_slp_graph (MSG_NOTE, vect_location,
6452 SLP_INSTANCE_TREE (instance));
6453 }
6454 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
6455 have a PHI be the node breaking the cycle. */
6456 auto_vec<slp_tree> stack;
6457 if (!scc_info.get (node))
6458 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
6459
6460 if (SLP_INSTANCE_ROOT_STMT (instance))
6461 vectorize_slp_instance_root_stmt (node, instance);
6462
6463 if (dump_enabled_p ())
6464 dump_printf_loc (MSG_NOTE, vect_location,
6465 "vectorizing stmts using SLP.\n");
6466 }
6467
6468 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6469 {
6470 slp_tree root = SLP_INSTANCE_TREE (instance);
6471 stmt_vec_info store_info;
6472 unsigned int j;
6473
6474 /* Remove scalar call stmts. Do not do this for basic-block
6475 vectorization as not all uses may be vectorized.
6476 ??? Why should this be necessary? DCE should be able to
6477 remove the stmts itself.
6478 ??? For BB vectorization we can as well remove scalar
6479 stmts starting from the SLP tree root if they have no
6480 uses. */
6481 if (is_a <loop_vec_info> (vinfo))
6482 vect_remove_slp_scalar_calls (vinfo, root);
6483
6484 /* Remove the original scalar stmts of vectorized stores. */
6485 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
6486 {
6487 if (!STMT_VINFO_DATA_REF (store_info)
6488 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
6489 break;
6490
6491 store_info = vect_orig_stmt (store_info);
6492 /* Free the attached stmt_vec_info and remove the stmt. */
6493 vinfo->remove_stmt (store_info);
6494
6495 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
6496 to not crash in vect_free_slp_tree later. */
6497 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
6498 SLP_TREE_REPRESENTATIVE (root) = NULL;
6499 }
6500 }
6501 }