1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
52
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 slp_tree, stmt_vector_for_cost *);
55
56 static object_allocator<_slp_tree> *slp_tree_pool;
57 static slp_tree slp_first_node;
58
59 void
60 vect_slp_init (void)
61 {
62 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
63 }
64
65 void
66 vect_slp_fini (void)
67 {
68 while (slp_first_node)
69 delete slp_first_node;
70 delete slp_tree_pool;
71 slp_tree_pool = NULL;
72 }
73
74 void *
75 _slp_tree::operator new (size_t n)
76 {
77 gcc_assert (n == sizeof (_slp_tree));
78 return slp_tree_pool->allocate_raw ();
79 }
80
81 void
82 _slp_tree::operator delete (void *node, size_t n)
83 {
84 gcc_assert (n == sizeof (_slp_tree));
85 slp_tree_pool->remove_raw (node);
86 }
87
88
89 /* Initialize a SLP node. */
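/* Note that every node constructed here is also chained into the global
   slp_first_node list via the prev_node/next_node links; vect_slp_fini
   walks that list to delete any nodes still live when SLP analysis is
   torn down. */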
90
91 _slp_tree::_slp_tree ()
92 {
93 this->prev_node = NULL;
94 if (slp_first_node)
95 slp_first_node->prev_node = this;
96 this->next_node = slp_first_node;
97 slp_first_node = this;
98 SLP_TREE_SCALAR_STMTS (this) = vNULL;
99 SLP_TREE_SCALAR_OPS (this) = vNULL;
100 SLP_TREE_VEC_STMTS (this) = vNULL;
101 SLP_TREE_VEC_DEFS (this) = vNULL;
102 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
103 SLP_TREE_CHILDREN (this) = vNULL;
104 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
105 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
106 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
107 SLP_TREE_CODE (this) = ERROR_MARK;
108 SLP_TREE_VECTYPE (this) = NULL_TREE;
109 SLP_TREE_REPRESENTATIVE (this) = NULL;
110 SLP_TREE_REF_COUNT (this) = 1;
111 this->max_nunits = 1;
112 this->lanes = 0;
113 }
114
115 /* Tear down a SLP node. */
116
117 _slp_tree::~_slp_tree ()
118 {
119 if (this->prev_node)
120 this->prev_node->next_node = this->next_node;
121 else
122 slp_first_node = this->next_node;
123 if (this->next_node)
124 this->next_node->prev_node = this->prev_node;
125 SLP_TREE_CHILDREN (this).release ();
126 SLP_TREE_SCALAR_STMTS (this).release ();
127 SLP_TREE_SCALAR_OPS (this).release ();
128 SLP_TREE_VEC_STMTS (this).release ();
129 SLP_TREE_VEC_DEFS (this).release ();
130 SLP_TREE_LOAD_PERMUTATION (this).release ();
131 SLP_TREE_LANE_PERMUTATION (this).release ();
132 }
133
134 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
135
136 void
137 vect_free_slp_tree (slp_tree node)
138 {
139 int i;
140 slp_tree child;
141
142 if (--SLP_TREE_REF_COUNT (node) != 0)
143 return;
144
145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
146 if (child)
147 vect_free_slp_tree (child);
148
149 /* If the node defines any SLP-only patterns then those patterns are no
150 longer valid and should be removed. */
151 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
152 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
153 {
154 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
155 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
156 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
157 }
158
159 delete node;
160 }
161
162 /* Return a location suitable for dumps related to the SLP instance. */
163
164 dump_user_location_t
165 _slp_instance::location () const
166 {
167 if (root_stmt)
168 return root_stmt->stmt;
169 else
170 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
171 }
172
173
174 /* Free the memory allocated for the SLP instance. */
175
176 void
177 vect_free_slp_instance (slp_instance instance)
178 {
179 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
180 SLP_INSTANCE_LOADS (instance).release ();
181 instance->subgraph_entries.release ();
182 instance->cost_vec.release ();
183 free (instance);
184 }
185
186
187 /* Create an empty SLP node with NOPS child slots and code CODE. */
188
189 slp_tree
190 vect_create_new_slp_node (unsigned nops, tree_code code)
191 {
192 slp_tree node = new _slp_tree;
193 SLP_TREE_SCALAR_STMTS (node) = vNULL;
194 SLP_TREE_CHILDREN (node).create (nops);
195 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
196 SLP_TREE_CODE (node) = code;
197 return node;
198 }
199 /* Create an SLP node for SCALAR_STMTS. */
200
201 static slp_tree
202 vect_create_new_slp_node (slp_tree node,
203 vec<stmt_vec_info> scalar_stmts, unsigned nops)
204 {
205 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
206 SLP_TREE_CHILDREN (node).create (nops);
207 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
208 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
209 SLP_TREE_LANES (node) = scalar_stmts.length ();
210 return node;
211 }
212
213 /* Create an SLP node for SCALAR_STMTS. */
214
215 static slp_tree
216 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
217 {
218 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
219 }
220
221 /* Create an SLP node for OPS. */
222
223 static slp_tree
224 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
225 {
226 SLP_TREE_SCALAR_OPS (node) = ops;
227 SLP_TREE_DEF_TYPE (node) = vect_external_def;
228 SLP_TREE_LANES (node) = ops.length ();
229 return node;
230 }
231
232 /* Create an SLP node for OPS. */
233
234 static slp_tree
235 vect_create_new_slp_node (vec<tree> ops)
236 {
237 return vect_create_new_slp_node (new _slp_tree, ops);
238 }
239
240
241 /* This structure is used in creation of an SLP tree. Each instance
242 corresponds to the same operand in a group of scalar stmts in an SLP
243 node. */
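/* Illustrative example: for the two-lane SLP group
     a0 = b0 + c0;
     a1 = b1 + c1;
   two slp_oprnd_info instances are created; the first collects the defs
   of { b0, b1 } and the second the defs of { c0, c1 }. */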
244 typedef struct _slp_oprnd_info
245 {
246 /* Def-stmts for the operands. */
247 vec<stmt_vec_info> def_stmts;
248 /* Operands. */
249 vec<tree> ops;
250 /* Information about the first statement, its vector def-type, type, the
251 operand itself in case it's constant, and an indication if it's a pattern
252 stmt. */
253 tree first_op_type;
254 enum vect_def_type first_dt;
255 bool any_pattern;
256 } *slp_oprnd_info;
257
258
259 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
260 operand. */
261 static vec<slp_oprnd_info>
262 vect_create_oprnd_info (int nops, int group_size)
263 {
264 int i;
265 slp_oprnd_info oprnd_info;
266 vec<slp_oprnd_info> oprnds_info;
267
268 oprnds_info.create (nops);
269 for (i = 0; i < nops; i++)
270 {
271 oprnd_info = XNEW (struct _slp_oprnd_info);
272 oprnd_info->def_stmts.create (group_size);
273 oprnd_info->ops.create (group_size);
274 oprnd_info->first_dt = vect_uninitialized_def;
275 oprnd_info->first_op_type = NULL_TREE;
276 oprnd_info->any_pattern = false;
277 oprnds_info.quick_push (oprnd_info);
278 }
279
280 return oprnds_info;
281 }
282
283
284 /* Free operands info. */
285
286 static void
287 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
288 {
289 int i;
290 slp_oprnd_info oprnd_info;
291
292 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
293 {
294 oprnd_info->def_stmts.release ();
295 oprnd_info->ops.release ();
296 XDELETE (oprnd_info);
297 }
298
299 oprnds_info.release ();
300 }
301
302
303 /* Return true if STMTS contains a pattern statement. */
304
305 static bool
306 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
307 {
308 stmt_vec_info stmt_info;
309 unsigned int i;
310 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
311 if (is_pattern_stmt_p (stmt_info))
312 return true;
313 return false;
314 }
315
316 /* Return true when all lanes in the external or constant NODE have
317 the same value. */
318
319 static bool
320 vect_slp_tree_uniform_p (slp_tree node)
321 {
322 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
323 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
324
325 /* Pre-existing vectors. */
326 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
327 return false;
328
329 unsigned i;
330 tree op, first = NULL_TREE;
331 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
332 if (!first)
333 first = op;
334 else if (!operand_equal_p (first, op, 0))
335 return false;
336
337 return true;
338 }
339
340 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
341 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
342 of the chain. */
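/* Illustrative example: for an interleaving chain over a[0], a[1] and a[3]
   (where DR_GROUP_GAP of the a[3] access is 2) the places returned for the
   three statements are 0, 1 and 3; a data-ref outside the chain yields -1. */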
343
344 int
345 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
346 stmt_vec_info first_stmt_info)
347 {
348 stmt_vec_info next_stmt_info = first_stmt_info;
349 int result = 0;
350
351 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
352 return -1;
353
354 do
355 {
356 if (next_stmt_info == stmt_info)
357 return result;
358 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
359 if (next_stmt_info)
360 result += DR_GROUP_GAP (next_stmt_info);
361 }
362 while (next_stmt_info);
363
364 return -1;
365 }
366
367 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
368 using the method implemented by duplicate_and_interleave. Return true
369 if so, returning the number of intermediate vectors in *NVECTORS_OUT
370 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
371 (if nonnull). */
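/* Rough illustration (the exact outcome depends on the target): for
   COUNT == 2 and a 32-bit ELT_TYPE with 128-bit vectors, consecutive pairs
   of elements can be fused into 64-bit integer lanes, so a single
   intermediate vector per input suffices (*NVECTORS_OUT == 1) and the two
   permutes returned interleave the low and the high halves respectively. */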
372
373 bool
374 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
375 tree elt_type, unsigned int *nvectors_out,
376 tree *vector_type_out,
377 tree *permutes)
378 {
379 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
380 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
381 return false;
382
383 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
384 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
385 unsigned int nvectors = 1;
386 for (;;)
387 {
388 scalar_int_mode int_mode;
389 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
390 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
391 {
392 /* Get the natural vector type for this SLP group size. */
393 tree int_type = build_nonstandard_integer_type
394 (GET_MODE_BITSIZE (int_mode), 1);
395 tree vector_type
396 = get_vectype_for_scalar_type (vinfo, int_type, count);
397 if (vector_type
398 && VECTOR_MODE_P (TYPE_MODE (vector_type))
399 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
400 GET_MODE_SIZE (base_vector_mode)))
401 {
402 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
403 together into elements of type INT_TYPE and using the result
404 to build NVECTORS vectors. */
405 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
406 vec_perm_builder sel1 (nelts, 2, 3);
407 vec_perm_builder sel2 (nelts, 2, 3);
408 poly_int64 half_nelts = exact_div (nelts, 2);
409 for (unsigned int i = 0; i < 3; ++i)
410 {
411 sel1.quick_push (i);
412 sel1.quick_push (i + nelts);
413 sel2.quick_push (half_nelts + i);
414 sel2.quick_push (half_nelts + i + nelts);
415 }
416 vec_perm_indices indices1 (sel1, 2, nelts);
417 vec_perm_indices indices2 (sel2, 2, nelts);
418 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
419 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
420 {
421 if (nvectors_out)
422 *nvectors_out = nvectors;
423 if (vector_type_out)
424 *vector_type_out = vector_type;
425 if (permutes)
426 {
427 permutes[0] = vect_gen_perm_mask_checked (vector_type,
428 indices1);
429 permutes[1] = vect_gen_perm_mask_checked (vector_type,
430 indices2);
431 }
432 return true;
433 }
434 }
435 }
436 if (!multiple_p (elt_bytes, 2, &elt_bytes))
437 return false;
438 nvectors *= 2;
439 }
440 }
441
442 /* Return true if DTA and DTB match. */
443
444 static bool
445 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
446 {
447 return (dta == dtb
448 || ((dta == vect_external_def || dta == vect_constant_def)
449 && (dtb == vect_external_def || dtb == vect_constant_def)));
450 }
451
452 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
453 they are of a valid type and that they match the defs of the first stmt of
454 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
455 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
456 indicates swap is required for cond_expr stmts. Specifically, *SWAP
457 is 1 if STMT is cond and operands of comparison need to be swapped;
458 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
459 If there is any operand swap in this function, *SWAP is set to a non-zero
460 value.
461 If there was a fatal error return -1; if the error could be corrected by
462 swapping operands of the father node of this one, return 1; if everything is
463 ok return 0. */
464 static int
465 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
466 bool *skip_args,
467 vec<stmt_vec_info> stmts, unsigned stmt_num,
468 vec<slp_oprnd_info> *oprnds_info)
469 {
470 stmt_vec_info stmt_info = stmts[stmt_num];
471 tree oprnd;
472 unsigned int i, number_of_oprnds;
473 enum vect_def_type dt = vect_uninitialized_def;
474 slp_oprnd_info oprnd_info;
475 int first_op_idx = 1;
476 unsigned int commutative_op = -1U;
477 bool first_op_cond = false;
478 bool first = stmt_num == 0;
479
480 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
481 {
482 number_of_oprnds = gimple_call_num_args (stmt);
483 first_op_idx = 3;
484 if (gimple_call_internal_p (stmt))
485 {
486 internal_fn ifn = gimple_call_internal_fn (stmt);
487 commutative_op = first_commutative_argument (ifn);
488
489 /* Masked load, only look at mask. */
490 if (ifn == IFN_MASK_LOAD)
491 {
492 number_of_oprnds = 1;
493 /* Mask operand index. */
494 first_op_idx = 5;
495 }
496 }
497 }
498 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
499 {
500 enum tree_code code = gimple_assign_rhs_code (stmt);
501 number_of_oprnds = gimple_num_ops (stmt) - 1;
502 /* Swap can only be done for cond_expr if asked to, otherwise we
503 could end up with a different comparison code from the first stmt. */
504 if (code == COND_EXPR
505 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
506 {
507 first_op_cond = true;
508 number_of_oprnds++;
509 }
510 else
511 commutative_op = commutative_tree_code (code) ? 0U : -1U;
512 }
513 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
514 number_of_oprnds = gimple_phi_num_args (stmt);
515 else
516 return -1;
517
518 bool swapped = (swap != 0);
519 bool backedge = false;
520 gcc_assert (!swapped || first_op_cond);
521 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
522 for (i = 0; i < number_of_oprnds; i++)
523 {
524 if (first_op_cond)
525 {
526 /* Map indicating how operands of cond_expr should be swapped. */
527 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
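/* Illustration: with swap == 1 the two comparison operands are exchanged
   (map {1, 0, 2, 3}); with swap == 2 the then/else arms are exchanged
   (map {0, 1, 3, 2}) so that a comparison with inverted code lines up
   with the first stmt; swap == 0 is the identity. */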
528 int *map = maps[swap];
529
530 if (i < 2)
531 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
532 first_op_idx), map[i]);
533 else
534 oprnd = gimple_op (stmt_info->stmt, map[i]);
535 }
536 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
537 {
538 oprnd = gimple_phi_arg_def (stmt, i);
539 backedge = dominated_by_p (CDI_DOMINATORS,
540 gimple_phi_arg_edge (stmt, i)->src,
541 gimple_bb (stmt_info->stmt));
542 }
543 else
544 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
545 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
546 oprnd = TREE_OPERAND (oprnd, 0);
547
548 oprnd_info = (*oprnds_info)[i];
549
550 stmt_vec_info def_stmt_info;
551 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
552 {
553 if (dump_enabled_p ())
554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
555 "Build SLP failed: can't analyze def for %T\n",
556 oprnd);
557
558 return -1;
559 }
560
561 if (skip_args[i])
562 {
563 oprnd_info->def_stmts.quick_push (NULL);
564 oprnd_info->ops.quick_push (NULL_TREE);
565 oprnd_info->first_dt = vect_uninitialized_def;
566 continue;
567 }
568
569 oprnd_info->def_stmts.quick_push (def_stmt_info);
570 oprnd_info->ops.quick_push (oprnd);
571
572 if (def_stmt_info
573 && is_pattern_stmt_p (def_stmt_info))
574 {
575 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
576 != def_stmt_info)
577 oprnd_info->any_pattern = true;
578 else
579 /* If we promote this to external, use the original stmt def. */
580 oprnd_info->ops.last ()
581 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
582 }
583
584 /* If there's an extern def on a backedge make sure we can
585 code-generate at the region start.
586 ??? This is another case that could be fixed by adjusting
587 how we split the function but at the moment we'd have conflicting
588 goals there. */
589 if (backedge
590 && dts[i] == vect_external_def
591 && is_a <bb_vec_info> (vinfo)
592 && TREE_CODE (oprnd) == SSA_NAME
593 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
594 && !dominated_by_p (CDI_DOMINATORS,
595 as_a <bb_vec_info> (vinfo)->bbs[0],
596 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
597 {
598 if (dump_enabled_p ())
599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
600 "Build SLP failed: extern def %T only defined "
601 "on backedge\n", oprnd);
602 return -1;
603 }
604
605 if (first)
606 {
607 tree type = TREE_TYPE (oprnd);
608 dt = dts[i];
609 if ((dt == vect_constant_def
610 || dt == vect_external_def)
611 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
612 && (TREE_CODE (type) == BOOLEAN_TYPE
613 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
614 type)))
615 {
616 if (dump_enabled_p ())
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
618 "Build SLP failed: invalid type of def "
619 "for variable-length SLP %T\n", oprnd);
620 return -1;
621 }
622
623 /* For the swapping logic below force vect_reduction_def
624 for the reduction op in a SLP reduction group. */
625 if (!STMT_VINFO_DATA_REF (stmt_info)
626 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
627 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
628 && def_stmt_info)
629 dts[i] = dt = vect_reduction_def;
630
631 /* Check the types of the definition. */
632 switch (dt)
633 {
634 case vect_external_def:
635 case vect_constant_def:
636 case vect_internal_def:
637 case vect_reduction_def:
638 case vect_induction_def:
639 case vect_nested_cycle:
640 break;
641
642 default:
643 /* FORNOW: Not supported. */
644 if (dump_enabled_p ())
645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
646 "Build SLP failed: illegal type of def %T\n",
647 oprnd);
648 return -1;
649 }
650
651 oprnd_info->first_dt = dt;
652 oprnd_info->first_op_type = type;
653 }
654 }
655 if (first)
656 return 0;
657
658 /* Now match the operand definition types to that of the first stmt. */
659 for (i = 0; i < number_of_oprnds;)
660 {
661 if (skip_args[i])
662 {
663 ++i;
664 continue;
665 }
666
667 oprnd_info = (*oprnds_info)[i];
668 dt = dts[i];
669 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
670 oprnd = oprnd_info->ops[stmt_num];
671 tree type = TREE_TYPE (oprnd);
672
673 if (!types_compatible_p (oprnd_info->first_op_type, type))
674 {
675 if (dump_enabled_p ())
676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
677 "Build SLP failed: different operand types\n");
678 return 1;
679 }
680
681 /* Not first stmt of the group, check that the def-stmt/s match
682 the def-stmt/s of the first stmt. Allow different definition
683 types for reduction chains: the first stmt must be a
684 vect_reduction_def (a phi node), and the rest
685 end in the reduction chain. */
686 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
687 && !(oprnd_info->first_dt == vect_reduction_def
688 && !STMT_VINFO_DATA_REF (stmt_info)
689 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
690 && def_stmt_info
691 && !STMT_VINFO_DATA_REF (def_stmt_info)
692 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
693 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
694 || (!STMT_VINFO_DATA_REF (stmt_info)
695 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
696 && ((!def_stmt_info
697 || STMT_VINFO_DATA_REF (def_stmt_info)
698 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
699 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
700 != (oprnd_info->first_dt != vect_reduction_def))))
701 {
702 /* Try swapping operands if we got a mismatch. For BB
703 vectorization only in case it will clearly improve things. */
704 if (i == commutative_op && !swapped
705 && (!is_a <bb_vec_info> (vinfo)
706 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
707 dts[i+1])
708 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
709 || vect_def_types_match
710 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
711 {
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_NOTE, vect_location,
714 "trying swapped operands\n");
715 std::swap (dts[i], dts[i+1]);
716 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
717 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
718 std::swap ((*oprnds_info)[i]->ops[stmt_num],
719 (*oprnds_info)[i+1]->ops[stmt_num]);
720 swapped = true;
721 continue;
722 }
723
724 if (is_a <bb_vec_info> (vinfo)
725 && !oprnd_info->any_pattern)
726 {
727 /* Now for commutative ops we should see whether we can
728 make the other operand match. */
729 if (dump_enabled_p ())
730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
731 "treating operand as external\n");
732 oprnd_info->first_dt = dt = vect_external_def;
733 }
734 else
735 {
736 if (dump_enabled_p ())
737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
738 "Build SLP failed: different types\n");
739 return 1;
740 }
741 }
742
743 /* Make sure to demote the overall operand to external. */
744 if (dt == vect_external_def)
745 oprnd_info->first_dt = vect_external_def;
746 /* For an SLP reduction chain we want to duplicate the reduction to
747 each of the chain members. That gets us a sane SLP graph (still
748 the stmts are not 100% correct wrt the initial values). */
749 else if ((dt == vect_internal_def
750 || dt == vect_reduction_def)
751 && oprnd_info->first_dt == vect_reduction_def
752 && !STMT_VINFO_DATA_REF (stmt_info)
753 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
754 && !STMT_VINFO_DATA_REF (def_stmt_info)
755 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
756 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
757 {
758 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
759 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
760 }
761
762 ++i;
763 }
764
765 /* Swap operands. */
766 if (swapped)
767 {
768 if (dump_enabled_p ())
769 dump_printf_loc (MSG_NOTE, vect_location,
770 "swapped operands to match def types in %G",
771 stmt_info->stmt);
772 }
773
774 return 0;
775 }
776
777 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
778 Return true if we can, meaning that this choice doesn't conflict with
779 existing SLP nodes that use STMT_INFO. */
780
781 bool
782 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
783 {
784 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
785 if (old_vectype)
786 return useless_type_conversion_p (vectype, old_vectype);
787
788 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
789 {
790 /* We maintain the invariant that if any statement in the group is
791 used, all other members of the group have the same vector type. */
792 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
793 stmt_vec_info member_info = first_info;
794 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
795 if (is_pattern_stmt_p (member_info)
796 && !useless_type_conversion_p (vectype,
797 STMT_VINFO_VECTYPE (member_info)))
798 break;
799
800 if (!member_info)
801 {
802 for (member_info = first_info; member_info;
803 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
804 STMT_VINFO_VECTYPE (member_info) = vectype;
805 return true;
806 }
807 }
808 else if (!is_pattern_stmt_p (stmt_info))
809 {
810 STMT_VINFO_VECTYPE (stmt_info) = vectype;
811 return true;
812 }
813
814 if (dump_enabled_p ())
815 {
816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
817 "Build SLP failed: incompatible vector"
818 " types for: %G", stmt_info->stmt);
819 dump_printf_loc (MSG_NOTE, vect_location,
820 " old vector type: %T\n", old_vectype);
821 dump_printf_loc (MSG_NOTE, vect_location,
822 " new vector type: %T\n", vectype);
823 }
824 return false;
825 }
826
827 /* Return true if call statements CALL1 and CALL2 are similar enough
828 to be combined into the same SLP group. */
829
830 static bool
831 compatible_calls_p (gcall *call1, gcall *call2)
832 {
833 unsigned int nargs = gimple_call_num_args (call1);
834 if (nargs != gimple_call_num_args (call2))
835 return false;
836
837 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
838 return false;
839
840 if (gimple_call_internal_p (call1))
841 {
842 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
843 TREE_TYPE (gimple_call_lhs (call2))))
844 return false;
845 for (unsigned int i = 0; i < nargs; ++i)
846 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
847 TREE_TYPE (gimple_call_arg (call2, i))))
848 return false;
849 }
850 else
851 {
852 if (!operand_equal_p (gimple_call_fn (call1),
853 gimple_call_fn (call2), 0))
854 return false;
855
856 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
857 return false;
858 }
859 return true;
860 }
861
862 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
863 caller's attempt to find the vector type in STMT_INFO with the narrowest
864 element type. Return true if VECTYPE is nonnull and if it is valid
865 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
866 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
867 vect_build_slp_tree. */
868
869 static bool
870 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
871 unsigned int group_size,
872 tree vectype, poly_uint64 *max_nunits)
873 {
874 if (!vectype)
875 {
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
878 "Build SLP failed: unsupported data-type in %G\n",
879 stmt_info->stmt);
880 /* Fatal mismatch. */
881 return false;
882 }
883
884 /* If populating the vector type requires unrolling then fail
885 before adjusting *max_nunits for basic-block vectorization. */
886 if (is_a <bb_vec_info> (vinfo)
887 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
888 {
889 if (dump_enabled_p ())
890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
891 "Build SLP failed: unrolling required "
892 "in basic block SLP\n");
893 /* Fatal mismatch. */
894 return false;
895 }
896
897 /* In case of multiple types we need to detect the smallest type. */
898 vect_update_max_nunits (max_nunits, vectype);
899 return true;
900 }
901
902 /* Verify that the scalar stmts STMTS are isomorphic, do not require data
903 permutation and are of supported types of operation. Return
904 true if they are, otherwise return false and indicate in *MATCHES
905 which stmts are not isomorphic to the first one. If MATCHES[0]
906 is false then this indicates the comparison could not be
907 carried out or the stmts will never be vectorized by SLP.
908
909 Note COND_EXPR is possibly isomorphic to another one after swapping its
910 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
911 the first stmt by swapping the two operands of comparison; set SWAP[i]
912 to 2 if stmt I is isomorphic to the first stmt by inverting the code
913 of comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
914 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
915
916 static bool
917 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
918 vec<stmt_vec_info> stmts, unsigned int group_size,
919 poly_uint64 *max_nunits, bool *matches,
920 bool *two_operators, tree *node_vectype)
921 {
922 unsigned int i;
923 stmt_vec_info first_stmt_info = stmts[0];
924 enum tree_code first_stmt_code = ERROR_MARK;
925 enum tree_code alt_stmt_code = ERROR_MARK;
926 enum tree_code rhs_code = ERROR_MARK;
927 enum tree_code first_cond_code = ERROR_MARK;
928 tree lhs;
929 bool need_same_oprnds = false;
930 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
931 optab optab;
932 int icode;
933 machine_mode optab_op2_mode;
934 machine_mode vec_mode;
935 stmt_vec_info first_load = NULL, prev_first_load = NULL;
936 bool first_stmt_load_p = false, load_p = false;
937 bool first_stmt_phi_p = false, phi_p = false;
938 bool maybe_soft_fail = false;
939 tree soft_fail_nunits_vectype = NULL_TREE;
940
941 /* For every stmt in NODE find its def stmt/s. */
942 stmt_vec_info stmt_info;
943 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
944 {
945 gimple *stmt = stmt_info->stmt;
946 swap[i] = 0;
947 matches[i] = false;
948
949 if (dump_enabled_p ())
950 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
951
952 /* Fail to vectorize statements marked as unvectorizable, statements
953 that can throw or that have volatile operands. */
954 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
955 || stmt_can_throw_internal (cfun, stmt)
956 || gimple_has_volatile_ops (stmt))
957 {
958 if (dump_enabled_p ())
959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
960 "Build SLP failed: unvectorizable statement %G",
961 stmt);
962 /* ??? For BB vectorization we want to commutate operands in a way
963 to shuffle all unvectorizable defs into one operand and have
964 the other still vectorized. The following doesn't reliably
965 achieve this, but it's the easiest we can do here. */
966 if (is_a <bb_vec_info> (vinfo) && i != 0)
967 continue;
968 /* Fatal mismatch. */
969 matches[0] = false;
970 return false;
971 }
972
973 lhs = gimple_get_lhs (stmt);
974 if (lhs == NULL_TREE)
975 {
976 if (dump_enabled_p ())
977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
978 "Build SLP failed: not GIMPLE_ASSIGN nor "
979 "GIMPLE_CALL %G", stmt);
980 if (is_a <bb_vec_info> (vinfo) && i != 0)
981 continue;
982 /* Fatal mismatch. */
983 matches[0] = false;
984 return false;
985 }
986
987 tree nunits_vectype;
988 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
989 &nunits_vectype, group_size))
990 {
991 if (is_a <bb_vec_info> (vinfo) && i != 0)
992 continue;
993 /* Fatal mismatch. */
994 matches[0] = false;
995 return false;
996 }
997 /* Record nunits required but continue analysis, producing matches[]
998 as if nunits was not an issue. This allows splitting of groups
999 to happen. */
1000 if (nunits_vectype
1001 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1002 nunits_vectype, max_nunits))
1003 {
1004 gcc_assert (is_a <bb_vec_info> (vinfo));
1005 maybe_soft_fail = true;
1006 soft_fail_nunits_vectype = nunits_vectype;
1007 }
1008
1009 gcc_assert (vectype);
1010
1011 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1012 if (call_stmt)
1013 {
1014 rhs_code = CALL_EXPR;
1015
1016 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
1017 load_p = true;
1018 else if ((gimple_call_internal_p (call_stmt)
1019 && (!vectorizable_internal_fn_p
1020 (gimple_call_internal_fn (call_stmt))))
1021 || gimple_call_tail_p (call_stmt)
1022 || gimple_call_noreturn_p (call_stmt)
1023 || !gimple_call_nothrow_p (call_stmt)
1024 || gimple_call_chain (call_stmt))
1025 {
1026 if (dump_enabled_p ())
1027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1028 "Build SLP failed: unsupported call type %G",
1029 call_stmt);
1030 if (is_a <bb_vec_info> (vinfo) && i != 0)
1031 continue;
1032 /* Fatal mismatch. */
1033 matches[0] = false;
1034 return false;
1035 }
1036 }
1037 else if (gimple_code (stmt) == GIMPLE_PHI)
1038 {
1039 rhs_code = ERROR_MARK;
1040 phi_p = true;
1041 }
1042 else
1043 {
1044 rhs_code = gimple_assign_rhs_code (stmt);
1045 load_p = gimple_vuse (stmt);
1046 }
1047
1048 /* Check the operation. */
1049 if (i == 0)
1050 {
1051 *node_vectype = vectype;
1052 first_stmt_code = rhs_code;
1053 first_stmt_load_p = load_p;
1054 first_stmt_phi_p = phi_p;
1055
1056 /* Shift arguments should be equal in all the packed stmts for a
1057 vector shift with scalar shift operand. */
1058 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1059 || rhs_code == LROTATE_EXPR
1060 || rhs_code == RROTATE_EXPR)
1061 {
1062 vec_mode = TYPE_MODE (vectype);
1063
1064 /* First see if we have a vector/vector shift. */
1065 optab = optab_for_tree_code (rhs_code, vectype,
1066 optab_vector);
1067
1068 if (!optab
1069 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1070 {
1071 /* No vector/vector shift, try for a vector/scalar shift. */
1072 optab = optab_for_tree_code (rhs_code, vectype,
1073 optab_scalar);
1074
1075 if (!optab)
1076 {
1077 if (dump_enabled_p ())
1078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1079 "Build SLP failed: no optab.\n");
1080 if (is_a <bb_vec_info> (vinfo) && i != 0)
1081 continue;
1082 /* Fatal mismatch. */
1083 matches[0] = false;
1084 return false;
1085 }
1086 icode = (int) optab_handler (optab, vec_mode);
1087 if (icode == CODE_FOR_nothing)
1088 {
1089 if (dump_enabled_p ())
1090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1091 "Build SLP failed: "
1092 "op not supported by target.\n");
1093 if (is_a <bb_vec_info> (vinfo) && i != 0)
1094 continue;
1095 /* Fatal mismatch. */
1096 matches[0] = false;
1097 return false;
1098 }
1099 optab_op2_mode = insn_data[icode].operand[2].mode;
1100 if (!VECTOR_MODE_P (optab_op2_mode))
1101 {
1102 need_same_oprnds = true;
1103 first_op1 = gimple_assign_rhs2 (stmt);
1104 }
1105 }
1106 }
1107 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1108 {
1109 need_same_oprnds = true;
1110 first_op1 = gimple_assign_rhs2 (stmt);
1111 }
1112 else if (!load_p
1113 && rhs_code == BIT_FIELD_REF)
1114 {
1115 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1116 if (!is_a <bb_vec_info> (vinfo)
1117 || TREE_CODE (vec) != SSA_NAME
1118 || !operand_equal_p (TYPE_SIZE (vectype),
1119 TYPE_SIZE (TREE_TYPE (vec))))
1120 {
1121 if (dump_enabled_p ())
1122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1123 "Build SLP failed: "
1124 "BIT_FIELD_REF not supported\n");
1125 /* Fatal mismatch. */
1126 matches[0] = false;
1127 return false;
1128 }
1129 }
1130 else if (call_stmt
1131 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1132 {
1133 need_same_oprnds = true;
1134 first_op1 = gimple_call_arg (call_stmt, 1);
1135 }
1136 }
1137 else
1138 {
1139 if (first_stmt_code != rhs_code
1140 && alt_stmt_code == ERROR_MARK)
1141 alt_stmt_code = rhs_code;
1142 if ((first_stmt_code != rhs_code
1143 && (first_stmt_code != IMAGPART_EXPR
1144 || rhs_code != REALPART_EXPR)
1145 && (first_stmt_code != REALPART_EXPR
1146 || rhs_code != IMAGPART_EXPR)
1147 /* Handle mismatches in plus/minus by computing both
1148 and merging the results. */
1149 && !((first_stmt_code == PLUS_EXPR
1150 || first_stmt_code == MINUS_EXPR)
1151 && (alt_stmt_code == PLUS_EXPR
1152 || alt_stmt_code == MINUS_EXPR)
1153 && rhs_code == alt_stmt_code)
1154 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1155 && (first_stmt_code == ARRAY_REF
1156 || first_stmt_code == BIT_FIELD_REF
1157 || first_stmt_code == INDIRECT_REF
1158 || first_stmt_code == COMPONENT_REF
1159 || first_stmt_code == MEM_REF)))
1160 || first_stmt_load_p != load_p
1161 || first_stmt_phi_p != phi_p)
1162 {
1163 if (dump_enabled_p ())
1164 {
1165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1166 "Build SLP failed: different operation "
1167 "in stmt %G", stmt);
1168 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1169 "original stmt %G", first_stmt_info->stmt);
1170 }
1171 /* Mismatch. */
1172 continue;
1173 }
1174
1175 if (need_same_oprnds)
1176 {
1177 tree other_op1 = (call_stmt
1178 ? gimple_call_arg (call_stmt, 1)
1179 : gimple_assign_rhs2 (stmt));
1180 if (!operand_equal_p (first_op1, other_op1, 0))
1181 {
1182 if (dump_enabled_p ())
1183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1184 "Build SLP failed: different shift "
1185 "arguments in %G", stmt);
1186 /* Mismatch. */
1187 continue;
1188 }
1189 }
1190 if (!load_p
1191 && first_stmt_code == BIT_FIELD_REF
1192 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1193 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1194 {
1195 if (dump_enabled_p ())
1196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1197 "Build SLP failed: different BIT_FIELD_REF "
1198 "arguments in %G", stmt);
1199 /* Mismatch. */
1200 continue;
1201 }
1202
1203 if (!load_p && rhs_code == CALL_EXPR)
1204 {
1205 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1206 as_a <gcall *> (stmt)))
1207 {
1208 if (dump_enabled_p ())
1209 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1210 "Build SLP failed: different calls in %G",
1211 stmt);
1212 /* Mismatch. */
1213 continue;
1214 }
1215 }
1216
1217 if (phi_p
1218 && (gimple_bb (first_stmt_info->stmt)
1219 != gimple_bb (stmt_info->stmt)))
1220 {
1221 if (dump_enabled_p ())
1222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1223 "Build SLP failed: different BB for PHI "
1224 "in %G", stmt);
1225 /* Mismatch. */
1226 continue;
1227 }
1228
1229 if (!types_compatible_p (vectype, *node_vectype))
1230 {
1231 if (dump_enabled_p ())
1232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1233 "Build SLP failed: different vector type "
1234 "in %G", stmt);
1235 /* Mismatch. */
1236 continue;
1237 }
1238 }
1239
1240 /* Grouped store or load. */
1241 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1242 {
1243 if (REFERENCE_CLASS_P (lhs))
1244 {
1245 /* Store. */
1246 ;
1247 }
1248 else
1249 {
1250 /* Load. */
1251 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1252 if (prev_first_load)
1253 {
1254 /* Check that there are no loads from different interleaving
1255 chains in the same node. */
1256 if (prev_first_load != first_load)
1257 {
1258 if (dump_enabled_p ())
1259 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1260 vect_location,
1261 "Build SLP failed: different "
1262 "interleaving chains in one node %G",
1263 stmt);
1264 /* Mismatch. */
1265 continue;
1266 }
1267 }
1268 else
1269 prev_first_load = first_load;
1270 }
1271 } /* Grouped access. */
1272 else
1273 {
1274 if (load_p)
1275 {
1276 /* Not grouped load. */
1277 if (dump_enabled_p ())
1278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1279 "Build SLP failed: not grouped load %G", stmt);
1280
1281 /* FORNOW: Not grouped loads are not supported. */
1282 if (is_a <bb_vec_info> (vinfo) && i != 0)
1283 continue;
1284 /* Fatal mismatch. */
1285 matches[0] = false;
1286 return false;
1287 }
1288
1289 /* Not memory operation. */
1290 if (!phi_p
1291 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1292 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1293 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1294 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1295 && rhs_code != VIEW_CONVERT_EXPR
1296 && rhs_code != CALL_EXPR
1297 && rhs_code != BIT_FIELD_REF)
1298 {
1299 if (dump_enabled_p ())
1300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1301 "Build SLP failed: operation unsupported %G",
1302 stmt);
1303 if (is_a <bb_vec_info> (vinfo) && i != 0)
1304 continue;
1305 /* Fatal mismatch. */
1306 matches[0] = false;
1307 return false;
1308 }
1309
1310 if (rhs_code == COND_EXPR)
1311 {
1312 tree cond_expr = gimple_assign_rhs1 (stmt);
1313 enum tree_code cond_code = TREE_CODE (cond_expr);
1314 enum tree_code swap_code = ERROR_MARK;
1315 enum tree_code invert_code = ERROR_MARK;
1316
1317 if (i == 0)
1318 first_cond_code = TREE_CODE (cond_expr);
1319 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1320 {
1321 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1322 swap_code = swap_tree_comparison (cond_code);
1323 invert_code = invert_tree_comparison (cond_code, honor_nans);
1324 }
1325
1326 if (first_cond_code == cond_code)
1327 ;
1328 /* Isomorphic can be achieved by swapping. */
1329 else if (first_cond_code == swap_code)
1330 swap[i] = 1;
1331 /* Isomorphic can be achieved by inverting. */
1332 else if (first_cond_code == invert_code)
1333 swap[i] = 2;
1334 else
1335 {
1336 if (dump_enabled_p ())
1337 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1338 "Build SLP failed: different"
1339 " operation %G", stmt);
1340 /* Mismatch. */
1341 continue;
1342 }
1343 }
1344 }
1345
1346 matches[i] = true;
1347 }
1348
1349 for (i = 0; i < group_size; ++i)
1350 if (!matches[i])
1351 return false;
1352
1353 /* If we allowed a two-operation SLP node verify the target can cope
1354 with the permute we are going to use. */
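/* For example (illustrative), the lanes
     { a0 + b0, a1 - b1, a2 + b2, a3 - b3 }
   are accepted as a single two-operator node mixing PLUS_EXPR and
   MINUS_EXPR; both operations are code-generated and their results are
   later combined with a lane permute. */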
1355 if (alt_stmt_code != ERROR_MARK
1356 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1357 {
1358 *two_operators = true;
1359 }
1360
1361 if (maybe_soft_fail)
1362 {
1363 unsigned HOST_WIDE_INT const_nunits;
1364 if (!TYPE_VECTOR_SUBPARTS
1365 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1366 || const_nunits > group_size)
1367 matches[0] = false;
1368 else
1369 {
1370 /* With constant vector elements simulate a mismatch at the
1371 point we need to split. */
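/* E.g. (illustrative) with group_size == 6 and const_nunits == 4 this
   clears matches[4] and matches[5], so the caller splits the group after
   the first full vector. */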
1372 unsigned tail = group_size & (const_nunits - 1);
1373 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1374 }
1375 return false;
1376 }
1377
1378 return true;
1379 }
1380
1381 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1382 Note we never remove apart from at destruction time so we do not
1383 need a special value for deleted that differs from empty. */
1384 struct bst_traits
1385 {
1386 typedef vec <stmt_vec_info> value_type;
1387 typedef vec <stmt_vec_info> compare_type;
1388 static inline hashval_t hash (value_type);
1389 static inline bool equal (value_type existing, value_type candidate);
1390 static inline bool is_empty (value_type x) { return !x.exists (); }
1391 static inline bool is_deleted (value_type x) { return !x.exists (); }
1392 static const bool empty_zero_p = true;
1393 static inline void mark_empty (value_type &x) { x.release (); }
1394 static inline void mark_deleted (value_type &x) { x.release (); }
1395 static inline void remove (value_type &x) { x.release (); }
1396 };
1397 inline hashval_t
1398 bst_traits::hash (value_type x)
1399 {
1400 inchash::hash h;
1401 for (unsigned i = 0; i < x.length (); ++i)
1402 h.add_int (gimple_uid (x[i]->stmt));
1403 return h.end ();
1404 }
1405 inline bool
1406 bst_traits::equal (value_type existing, value_type candidate)
1407 {
1408 if (existing.length () != candidate.length ())
1409 return false;
1410 for (unsigned i = 0; i < existing.length (); ++i)
1411 if (existing[i] != candidate[i])
1412 return false;
1413 return true;
1414 }
1415
1416 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1417 simple_hashmap_traits <bst_traits, slp_tree> >
1418 scalar_stmts_to_slp_tree_map_t;
1419
1420 static slp_tree
1421 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1422 vec<stmt_vec_info> stmts, unsigned int group_size,
1423 poly_uint64 *max_nunits,
1424 bool *matches, unsigned *limit, unsigned *tree_size,
1425 scalar_stmts_to_slp_tree_map_t *bst_map);
1426
1427 static slp_tree
1428 vect_build_slp_tree (vec_info *vinfo,
1429 vec<stmt_vec_info> stmts, unsigned int group_size,
1430 poly_uint64 *max_nunits,
1431 bool *matches, unsigned *limit, unsigned *tree_size,
1432 scalar_stmts_to_slp_tree_map_t *bst_map)
1433 {
1434 if (slp_tree *leader = bst_map->get (stmts))
1435 {
1436 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1438 *leader ? "" : "failed ", *leader);
1439 if (*leader)
1440 {
1441 SLP_TREE_REF_COUNT (*leader)++;
1442 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1443 stmts.release ();
1444 }
1445 return *leader;
1446 }
1447
1448 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1449 so we can pick up backedge destinations during discovery. */
1450 slp_tree res = new _slp_tree;
1451 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1452 SLP_TREE_SCALAR_STMTS (res) = stmts;
1453 bst_map->put (stmts.copy (), res);
1454
1455 if (*limit == 0)
1456 {
1457 if (dump_enabled_p ())
1458 dump_printf_loc (MSG_NOTE, vect_location,
1459 "SLP discovery limit exceeded\n");
1460 bool existed_p = bst_map->put (stmts, NULL);
1461 gcc_assert (existed_p);
1462 /* Mark the node invalid so we can detect those when still in use
1463 as backedge destinations. */
1464 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1465 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1466 vect_free_slp_tree (res);
1467 memset (matches, 0, sizeof (bool) * group_size);
1468 return NULL;
1469 }
1470 --*limit;
1471
1472 poly_uint64 this_max_nunits = 1;
1473 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1474 &this_max_nunits,
1475 matches, limit, tree_size, bst_map);
1476 if (!res_)
1477 {
1478 bool existed_p = bst_map->put (stmts, NULL);
1479 gcc_assert (existed_p);
1480 /* Mark the node invalid so we can detect those when still in use
1481 as backedge destinations. */
1482 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1483 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1484 vect_free_slp_tree (res);
1485 }
1486 else
1487 {
1488 gcc_assert (res_ == res);
1489 res->max_nunits = this_max_nunits;
1490 vect_update_max_nunits (max_nunits, this_max_nunits);
1491 /* Keep a reference for the bst_map use. */
1492 SLP_TREE_REF_COUNT (res)++;
1493 }
1494 return res_;
1495 }
1496
1497 /* Recursively build an SLP tree starting from NODE.
1498 Fail (and return NULL) if def-stmts are not isomorphic, require data
1499 permutation or are of unsupported types of operation. Otherwise,
1500 return the built SLP tree; MATCHES indicates which scalar stmts (if any)
1501 failed to match the first one. */
1503
1504 static slp_tree
1505 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1506 vec<stmt_vec_info> stmts, unsigned int group_size,
1507 poly_uint64 *max_nunits,
1508 bool *matches, unsigned *limit, unsigned *tree_size,
1509 scalar_stmts_to_slp_tree_map_t *bst_map)
1510 {
1511 unsigned nops, i, this_tree_size = 0;
1512 poly_uint64 this_max_nunits = *max_nunits;
1513
1514 matches[0] = false;
1515
1516 stmt_vec_info stmt_info = stmts[0];
1517 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1518 nops = gimple_call_num_args (stmt);
1519 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1520 {
1521 nops = gimple_num_ops (stmt) - 1;
1522 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1523 nops++;
1524 }
1525 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1526 nops = gimple_phi_num_args (phi);
1527 else
1528 return NULL;
1529
1530 /* If the SLP node is a PHI (induction or reduction), terminate
1531 the recursion. */
1532 bool *skip_args = XALLOCAVEC (bool, nops);
1533 memset (skip_args, 0, sizeof (bool) * nops);
1534 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1535 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1536 {
1537 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1538 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1539 group_size);
1540 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1541 max_nunits))
1542 return NULL;
1543
1544 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1545 if (def_type == vect_induction_def)
1546 {
1547 /* Induction PHIs are not cycles but walk the initial
1548 value. Only for inner loops though, for outer loops
1549 we need to pick up the value from the actual PHIs
1550 to more easily support peeling and epilogue vectorization. */
1551 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1552 if (!nested_in_vect_loop_p (loop, stmt_info))
1553 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1554 else
1555 loop = loop->inner;
1556 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1557 }
1558 else if (def_type == vect_reduction_def
1559 || def_type == vect_double_reduction_def
1560 || def_type == vect_nested_cycle)
1561 {
1562 /* Else def types have to match. */
1563 stmt_vec_info other_info;
1564 bool all_same = true;
1565 FOR_EACH_VEC_ELT (stmts, i, other_info)
1566 {
1567 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1568 return NULL;
1569 if (other_info != stmt_info)
1570 all_same = false;
1571 }
1572 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1573 /* Reduction initial values are not explicitly represented. */
1574 if (!nested_in_vect_loop_p (loop, stmt_info))
1575 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1576 /* Reduction chain backedge defs are filled manually.
1577 ??? Need a better way to identify a SLP reduction chain PHI.
1578 Or a better overall way to SLP match those. */
1579 if (all_same && def_type == vect_reduction_def)
1580 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1581 }
1582 else if (def_type != vect_internal_def)
1583 return NULL;
1584 }
1585
1586
1587 bool two_operators = false;
1588 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1589 tree vectype = NULL_TREE;
1590 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1591 &this_max_nunits, matches, &two_operators,
1592 &vectype))
1593 return NULL;
1594
1595 /* If the SLP node is a load, terminate the recursion unless masked. */
1596 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1597 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1598 {
1599 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1600 {
1601 /* Masked load. */
1602 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1603 nops = 1;
1604 }
1605 else
1606 {
1607 *max_nunits = this_max_nunits;
1608 (*tree_size)++;
1609 node = vect_create_new_slp_node (node, stmts, 0);
1610 SLP_TREE_VECTYPE (node) = vectype;
1611 /* And compute the load permutation. Whether it is actually
1612 a permutation depends on the unrolling factor which is
1613 decided later. */
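/* E.g. (illustrative) for the SLP group { a[2], a[0], a[3], a[1] } whose
   interleaving chain starts at a[0], the recorded load permutation is
   { 2, 0, 3, 1 }. */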
1614 vec<unsigned> load_permutation;
1615 int j;
1616 stmt_vec_info load_info;
1617 load_permutation.create (group_size);
1618 stmt_vec_info first_stmt_info
1619 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1620 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1621 {
1622 int load_place = vect_get_place_in_interleaving_chain
1623 (load_info, first_stmt_info);
1624 gcc_assert (load_place != -1);
1625 load_permutation.safe_push (load_place);
1626 }
1627 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1628 return node;
1629 }
1630 }
1631 else if (gimple_assign_single_p (stmt_info->stmt)
1632 && !gimple_vuse (stmt_info->stmt)
1633 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1634 {
1635 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1636 the same SSA name vector of a type compatible with VECTYPE. */
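/* Illustrative example: for a V4SI vector v, the two-lane group
     { BIT_FIELD_REF <v, 32, 64>, BIT_FIELD_REF <v, 32, 0> }
   records the (vector, lane) pairs { (0, 2), (0, 0) } in lperm, all
   selecting lanes from the single vector child built below. */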
1637 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1638 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1639 stmt_vec_info estmt_info;
1640 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1641 {
1642 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1643 tree bfref = gimple_assign_rhs1 (estmt);
1644 HOST_WIDE_INT lane;
1645 if (!known_eq (bit_field_size (bfref),
1646 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1647 || !constant_multiple_p (bit_field_offset (bfref),
1648 bit_field_size (bfref), &lane))
1649 {
1650 lperm.release ();
1651 return NULL;
1652 }
1653 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1654 }
1655 slp_tree vnode = vect_create_new_slp_node (vNULL);
1656 /* ??? We record vectype here but hide any eventually necessary
1657 punning, instead relying on code generation to materialize
1658 VIEW_CONVERT_EXPRs as necessary. We should instead make
1659 this explicit somehow. */
1660 SLP_TREE_VECTYPE (vnode) = vectype;
1661 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1662 /* We are always building a permutation node even if it is an identity
1663 permute to shield the rest of the vectorizer from the odd node
1664 representing an actual vector without any scalar ops.
1665 ??? We could hide it completely by making the permute node
1666 external? */
1667 node = vect_create_new_slp_node (node, stmts, 1);
1668 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1669 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1670 SLP_TREE_VECTYPE (node) = vectype;
1671 SLP_TREE_CHILDREN (node).quick_push (vnode);
1672 return node;
1673 }
1674
1675 /* Get at the operands, verifying they are compatible. */
1676 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1677 slp_oprnd_info oprnd_info;
1678 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1679 {
1680 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
1681 stmts, i, &oprnds_info);
1682 if (res != 0)
1683 matches[(res == -1) ? 0 : i] = false;
1684 if (!matches[0])
1685 break;
1686 }
1687 for (i = 0; i < group_size; ++i)
1688 if (!matches[i])
1689 {
1690 vect_free_oprnd_info (oprnds_info);
1691 return NULL;
1692 }
1693 swap = NULL;
1694
1695 auto_vec<slp_tree, 4> children;
1696
1697 stmt_info = stmts[0];
1698
1699 /* Create SLP_TREE nodes for the definition node/s. */
1700 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1701 {
1702 slp_tree child;
1703 unsigned int j;
1704
1705 /* We're skipping certain operands from processing, for example
1706 outer loop reduction initial defs. */
1707 if (skip_args[i])
1708 {
1709 children.safe_push (NULL);
1710 continue;
1711 }
1712
1713 if (oprnd_info->first_dt == vect_uninitialized_def)
1714 {
1715 /* A COND_EXPR may have one operand too many here if the condition
1716 is an SSA name. */
1717 gcc_assert (i == 3 && nops == 4);
1718 continue;
1719 }
1720
1721 if (is_a <bb_vec_info> (vinfo)
1722 && oprnd_info->first_dt == vect_internal_def
1723 && !oprnd_info->any_pattern)
1724 {
1725 /* For BB vectorization, if all defs are the same do not
1726 bother to continue the build along the single-lane
1727 graph but use a splat of the scalar value. */
1728 stmt_vec_info first_def = oprnd_info->def_stmts[0];
1729 for (j = 1; j < group_size; ++j)
1730 if (oprnd_info->def_stmts[j] != first_def)
1731 break;
1732 if (j == group_size
1733 /* But avoid doing this for loads where we may be
1734 able to CSE things, unless the stmt is not
1735 vectorizable. */
1736 && (!STMT_VINFO_VECTORIZABLE (first_def)
1737 || !gimple_vuse (first_def->stmt)))
1738 {
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_NOTE, vect_location,
1741 "Using a splat of the uniform operand\n");
1742 oprnd_info->first_dt = vect_external_def;
1743 }
1744 }
1745
1746 if (oprnd_info->first_dt == vect_external_def
1747 || oprnd_info->first_dt == vect_constant_def)
1748 {
1749 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1750 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1751 oprnd_info->ops = vNULL;
1752 children.safe_push (invnode);
1753 continue;
1754 }
1755
1756 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1757 group_size, &this_max_nunits,
1758 matches, limit,
1759 &this_tree_size, bst_map)) != NULL)
1760 {
1761 oprnd_info->def_stmts = vNULL;
1762 children.safe_push (child);
1763 continue;
1764 }
1765
1766 /* If the SLP build for operand zero failed and operand zero
1767 and one can be commutated try that for the scalar stmts
1768 that failed the match. */
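/* E.g. (illustrative): for the group { x = a * b, y = d * c } discovery of
   operand zero may fail for the second lane; since the multiplication is
   commutative we retry with c and d swapped for that lane. */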
1769 if (i == 0
1770 /* A first scalar stmt mismatch signals a fatal mismatch. */
1771 && matches[0]
1772 /* ??? For COND_EXPRs we can swap the comparison operands
1773 as well as the arms under some constraints. */
1774 && nops == 2
1775 && oprnds_info[1]->first_dt == vect_internal_def
1776 && is_gimple_assign (stmt_info->stmt)
1777 /* Swapping operands for reductions breaks assumptions later on. */
1778 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1779 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1780 {
1781 /* See whether we can swap the matching or the non-matching
1782 stmt operands. */
1783 bool swap_not_matching = true;
1784 do
1785 {
1786 for (j = 0; j < group_size; ++j)
1787 {
1788 if (matches[j] != !swap_not_matching)
1789 continue;
1790 stmt_vec_info stmt_info = stmts[j];
1791 /* Verify if we can swap operands of this stmt. */
1792 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1793 if (!stmt
1794 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1795 {
1796 if (!swap_not_matching)
1797 goto fail;
1798 swap_not_matching = false;
1799 break;
1800 }
1801 }
1802 }
1803 while (j != group_size);
1804
1805 /* Swap mismatched definition stmts. */
1806 if (dump_enabled_p ())
1807 dump_printf_loc (MSG_NOTE, vect_location,
1808 "Re-trying with swapped operands of stmts ");
1809 for (j = 0; j < group_size; ++j)
1810 if (matches[j] == !swap_not_matching)
1811 {
1812 std::swap (oprnds_info[0]->def_stmts[j],
1813 oprnds_info[1]->def_stmts[j]);
1814 std::swap (oprnds_info[0]->ops[j],
1815 oprnds_info[1]->ops[j]);
1816 if (dump_enabled_p ())
1817 dump_printf (MSG_NOTE, "%d ", j);
1818 }
1819 if (dump_enabled_p ())
1820 dump_printf (MSG_NOTE, "\n");
1821 /* And try again with scratch 'matches' ... */
1822 bool *tem = XALLOCAVEC (bool, group_size);
1823 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1824 group_size, &this_max_nunits,
1825 tem, limit,
1826 &this_tree_size, bst_map)) != NULL)
1827 {
1828 oprnd_info->def_stmts = vNULL;
1829 children.safe_push (child);
1830 continue;
1831 }
1832 }
1833 fail:
1834
1835 /* If the SLP build failed and we analyze a basic-block
1836 simply treat nodes we fail to build as externally defined
1837 (and thus build vectors from the scalar defs).
1838 The cost model will reject outright expensive cases.
1839 ??? This doesn't treat cases where permutation ultimately
1840 fails (or we don't try permutation below). Ideally we'd
1841 even compute a permutation that will end up with the maximum
1842 SLP tree size... */
1843 if (is_a <bb_vec_info> (vinfo)
1844 /* ??? Rejecting patterns this way doesn't work. We'd have to
1845 do extra work to cancel the pattern so the uses see the
1846 scalar version. */
1847 && !is_pattern_stmt_p (stmt_info)
1848 && !oprnd_info->any_pattern)
1849 {
1850 /* But if there's a leading vector-sized set of matching stmts,
1851 fail here so we can split the group. This matches the condition
1852 vect_analyze_slp_instance uses. */
1853 /* ??? We might want to split here and combine the results to support
1854 multiple vector sizes better. */
1855 for (j = 0; j < group_size; ++j)
1856 if (!matches[j])
1857 break;
1858 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
1859 {
1860 if (dump_enabled_p ())
1861 dump_printf_loc (MSG_NOTE, vect_location,
1862 "Building vector operands from scalars\n");
1863 this_tree_size++;
1864 child = vect_create_new_slp_node (oprnd_info->ops);
1865 children.safe_push (child);
1866 oprnd_info->ops = vNULL;
1867 continue;
1868 }
1869 }
1870
1871 gcc_assert (child == NULL);
1872 FOR_EACH_VEC_ELT (children, j, child)
1873 if (child)
1874 vect_free_slp_tree (child);
1875 vect_free_oprnd_info (oprnds_info);
1876 return NULL;
1877 }
1878
1879 vect_free_oprnd_info (oprnds_info);
1880
1881 /* If all children of this node are built up from uniform scalars,
1882 or if building them requires more than one possibly expensive
1883 vector construction, throw the node away so it is built up from
1884 scalars instead. The exception is the SLP node for the vector store. */
1885 if (is_a <bb_vec_info> (vinfo)
1886 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1887 /* ??? Rejecting patterns this way doesn't work. We'd have to
1888 do extra work to cancel the pattern so the uses see the
1889 scalar version. */
1890 && !is_pattern_stmt_p (stmt_info))
1891 {
1892 slp_tree child;
1893 unsigned j;
1894 bool all_uniform_p = true;
1895 unsigned n_vector_builds = 0;
1896 FOR_EACH_VEC_ELT (children, j, child)
1897 {
1898 if (!child)
1899 ;
1900 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1901 all_uniform_p = false;
1902 else if (!vect_slp_tree_uniform_p (child))
1903 {
1904 all_uniform_p = false;
1905 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
1906 n_vector_builds++;
1907 }
1908 }
1909 if (all_uniform_p
1910 || n_vector_builds > 1
1911 || (n_vector_builds == children.length ()
1912 && is_a <gphi *> (stmt_info->stmt)))
1913 {
1914 /* Roll back. */
1915 matches[0] = false;
1916 FOR_EACH_VEC_ELT (children, j, child)
1917 if (child)
1918 vect_free_slp_tree (child);
1919
1920 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_NOTE, vect_location,
1922 "Building parent vector operands from "
1923 "scalars instead\n");
1924 return NULL;
1925 }
1926 }
1927
1928 *tree_size += this_tree_size + 1;
1929 *max_nunits = this_max_nunits;
1930
1931 if (two_operators)
1932 {
1933 /* ??? We'd likely want to either cache in bst_map sth like
1934 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1935 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1936 explicit stmts to put in so the keying on 'stmts' doesn't
1937 work (but we have the same issue with nodes that use 'ops'). */
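/* The group mixes exactly two operation codes, e.g. alternating
   PLUS_EXPR and MINUS_EXPR lanes.  Build one SLP node per code over
   the same children and turn this node into a VEC_PERM_EXPR that
   selects, for every lane, the result from whichever of the two
   nodes carries the lane's original code.  */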
1938 slp_tree one = new _slp_tree;
1939 slp_tree two = new _slp_tree;
1940 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1941 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1942 SLP_TREE_VECTYPE (one) = vectype;
1943 SLP_TREE_VECTYPE (two) = vectype;
1944 SLP_TREE_CHILDREN (one).safe_splice (children);
1945 SLP_TREE_CHILDREN (two).safe_splice (children);
1946 slp_tree child;
1947 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1948 SLP_TREE_REF_COUNT (child)++;
1949
1950 /* Here we record the original defs since this
1951 node represents the final lane configuration. */
1952 node = vect_create_new_slp_node (node, stmts, 2);
1953 SLP_TREE_VECTYPE (node) = vectype;
1954 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1955 SLP_TREE_CHILDREN (node).quick_push (one);
1956 SLP_TREE_CHILDREN (node).quick_push (two);
1957 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1958 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1959 enum tree_code ocode = ERROR_MARK;
1960 stmt_vec_info ostmt_info;
1961 unsigned j = 0;
1962 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1963 {
1964 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1965 if (gimple_assign_rhs_code (ostmt) != code0)
1966 {
1967 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1968 ocode = gimple_assign_rhs_code (ostmt);
1969 j = i;
1970 }
1971 else
1972 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1973 }
1974 SLP_TREE_CODE (one) = code0;
1975 SLP_TREE_CODE (two) = ocode;
1976 SLP_TREE_LANES (one) = stmts.length ();
1977 SLP_TREE_LANES (two) = stmts.length ();
1978 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1979 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1980 return node;
1981 }
1982
1983 node = vect_create_new_slp_node (node, stmts, nops);
1984 SLP_TREE_VECTYPE (node) = vectype;
1985 SLP_TREE_CHILDREN (node).splice (children);
1986 return node;
1987 }
1988
1989 /* Dump a single SLP tree NODE. */
1990
1991 static void
1992 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1993 slp_tree node)
1994 {
1995 unsigned i, j;
1996 slp_tree child;
1997 stmt_vec_info stmt_info;
1998 tree op;
1999
2000 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2001 dump_user_location_t user_loc = loc.get_user_location ();
2002 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
2003 SLP_TREE_DEF_TYPE (node) == vect_external_def
2004 ? " (external)"
2005 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2006 ? " (constant)"
2007 : ""), node,
2008 estimated_poly_value (node->max_nunits),
2009 SLP_TREE_REF_COUNT (node));
2010 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2011 {
2012 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2013 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2014 else
2015 dump_printf_loc (metadata, user_loc, "op template: %G",
2016 SLP_TREE_REPRESENTATIVE (node)->stmt);
2017 }
2018 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2019 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2020 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2021 else
2022 {
2023 dump_printf_loc (metadata, user_loc, "\t{ ");
2024 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2025 dump_printf (metadata, "%T%s ", op,
2026 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2027 dump_printf (metadata, "}\n");
2028 }
2029 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2030 {
2031 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2032 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2033 dump_printf (dump_kind, " %u", j);
2034 dump_printf (dump_kind, " }\n");
2035 }
2036 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2037 {
2038 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2039 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2040 dump_printf (dump_kind, " %u[%u]",
2041 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2042 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2043 dump_printf (dump_kind, " }\n");
2044 }
2045 if (SLP_TREE_CHILDREN (node).is_empty ())
2046 return;
2047 dump_printf_loc (metadata, user_loc, "\tchildren");
2048 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2049 dump_printf (dump_kind, " %p", (void *)child);
2050 dump_printf (dump_kind, "\n");
2051 }
2052
2053 DEBUG_FUNCTION void
2054 debug (slp_tree node)
2055 {
2056 debug_dump_context ctx;
2057 vect_print_slp_tree (MSG_NOTE,
2058 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2059 node);
2060 }
2061
2062 /* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND. */
2063
2064 static void
2065 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2066 slp_tree node, hash_set<slp_tree> &visited)
2067 {
2068 unsigned i;
2069 slp_tree child;
2070
2071 if (visited.add (node))
2072 return;
2073
2074 vect_print_slp_tree (dump_kind, loc, node);
2075
2076 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2077 if (child)
2078 vect_print_slp_graph (dump_kind, loc, child, visited);
2079 }
2080
2081 static void
2082 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2083 slp_tree entry)
2084 {
2085 hash_set<slp_tree> visited;
2086 vect_print_slp_graph (dump_kind, loc, entry, visited);
2087 }
2088
2089 /* Mark the tree rooted at NODE with PURE_SLP. */
2090
2091 static void
2092 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2093 {
2094 int i;
2095 stmt_vec_info stmt_info;
2096 slp_tree child;
2097
2098 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2099 return;
2100
2101 if (visited.add (node))
2102 return;
2103
2104 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2105 STMT_SLP_TYPE (stmt_info) = pure_slp;
2106
2107 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2108 if (child)
2109 vect_mark_slp_stmts (child, visited);
2110 }
2111
2112 static void
2113 vect_mark_slp_stmts (slp_tree node)
2114 {
2115 hash_set<slp_tree> visited;
2116 vect_mark_slp_stmts (node, visited);
2117 }
2118
2119 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2120
2121 static void
2122 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2123 {
2124 int i;
2125 stmt_vec_info stmt_info;
2126 slp_tree child;
2127
2128 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2129 return;
2130
2131 if (visited.add (node))
2132 return;
2133
2134 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2135 {
2136 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2137 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2138 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2139 }
2140
2141 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2142 if (child)
2143 vect_mark_slp_stmts_relevant (child, visited);
2144 }
2145
2146 static void
2147 vect_mark_slp_stmts_relevant (slp_tree node)
2148 {
2149 hash_set<slp_tree> visited;
2150 vect_mark_slp_stmts_relevant (node, visited);
2151 }
2152
2153
2154 /* Gather loads in the SLP graph rooted at NODE and populate the LOADS array. */
2155
2156 static void
2157 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2158 hash_set<slp_tree> &visited)
2159 {
2160 if (!node || visited.add (node))
2161 return;
2162
2163 if (SLP_TREE_CHILDREN (node).length () == 0)
2164 {
2165 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2166 return;
2167 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2168 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2169 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2170 loads.safe_push (node);
2171 }
2172 else
2173 {
2174 unsigned i;
2175 slp_tree child;
2176 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2177 vect_gather_slp_loads (loads, child, visited);
2178 }
2179 }
2180
2181
2182 /* Find the last scalar stmt in NODE. */
2183
2184 stmt_vec_info
2185 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2186 {
2187 stmt_vec_info last = NULL;
2188 stmt_vec_info stmt_vinfo;
2189
2190 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2191 {
2192 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2193 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2194 }
2195
2196 return last;
2197 }
2198
2199 /* Find the first scalar stmt in NODE. */
2200
2201 stmt_vec_info
2202 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2203 {
2204 stmt_vec_info first = NULL;
2205 stmt_vec_info stmt_vinfo;
2206
2207 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2208 {
2209 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2210 if (!first
2211 || get_later_stmt (stmt_vinfo, first) == first)
2212 first = stmt_vinfo;
2213 }
2214
2215 return first;
2216 }
2217
2218 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2219 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2220 (also containing the first GROUP1_SIZE stmts, since stores are
2221 consecutive), the second containing the remainder.
2222 Return the first stmt in the second group. */
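/* For example splitting an eight element store group that has no gap
   at GROUP1_SIZE == 4 leaves two four element groups, each with
   DR_GROUP_GAP == 4 so that it skips over the other half's stmts.  */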
2223
2224 static stmt_vec_info
2225 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2226 {
2227 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2228 gcc_assert (group1_size > 0);
2229 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2230 gcc_assert (group2_size > 0);
2231 DR_GROUP_SIZE (first_vinfo) = group1_size;
2232
2233 stmt_vec_info stmt_info = first_vinfo;
2234 for (unsigned i = group1_size; i > 1; i--)
2235 {
2236 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2237 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2238 }
2239 /* STMT is now the last element of the first group. */
2240 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2241 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2242
2243 DR_GROUP_SIZE (group2) = group2_size;
2244 for (stmt_info = group2; stmt_info;
2245 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2246 {
2247 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2248 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2249 }
2250
2251 /* For the second group, the DR_GROUP_GAP is that before the original group,
2252 plus skipping over the first group. */
2253 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2254
2255 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2256 DR_GROUP_GAP (first_vinfo) += group2_size;
2257
2258 if (dump_enabled_p ())
2259 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2260 group1_size, group2_size);
2261
2262 return group2;
2263 }
2264
2265 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2266 statements and a vector of NUNITS elements. */
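/* For example for NUNITS == 4 and GROUP_SIZE == 6 this computes
   common_multiple (4, 6) / 6 == 12 / 6 == 2, i.e. two copies of the
   six stmts are needed to fill whole vectors.  */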
2267
2268 static poly_uint64
2269 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2270 {
2271 return exact_div (common_multiple (nunits, group_size), group_size);
2272 }
2273
2274 /* Helper that checks to see if a node is a load node. */
2275
2276 static inline bool
2277 vect_is_slp_load_node (slp_tree root)
2278 {
2279 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2280 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2281 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2282 }
2283
2284
2285 /* Helper function of optimize_load_redistribution that performs the operation
2286 recursively. */
2287
2288 static slp_tree
2289 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2290 vec_info *vinfo, unsigned int group_size,
2291 hash_map<slp_tree, slp_tree> *load_map,
2292 slp_tree root)
2293 {
2294 if (slp_tree *leader = load_map->get (root))
2295 return *leader;
2296
2297 slp_tree node;
2298 unsigned i;
2299
2300 /* For now, we don't know anything about externals so do not do anything. */
2301 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2302 return NULL;
2303 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2304 {
2305 /* First convert this node into a load node and add it to the leaves
2306 list and flatten the permute from a lane to a load one. If it's
2307 unneeded it will be elided later. */
2308 vec<stmt_vec_info> stmts;
2309 stmts.create (SLP_TREE_LANES (root));
2310 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2311 for (unsigned j = 0; j < lane_perm.length (); j++)
2312 {
2313 std::pair<unsigned, unsigned> perm = lane_perm[j];
2314 node = SLP_TREE_CHILDREN (root)[perm.first];
2315
2316 if (!vect_is_slp_load_node (node)
2317 || SLP_TREE_CHILDREN (node).exists ())
2318 {
2319 stmts.release ();
2320 goto next;
2321 }
2322
2323 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2324 }
2325
2326 if (dump_enabled_p ())
2327 dump_printf_loc (MSG_NOTE, vect_location,
2328 "converting stmts on permute node %p\n", root);
2329
2330 bool *matches = XALLOCAVEC (bool, group_size);
2331 poly_uint64 max_nunits = 1;
2332 unsigned tree_size = 0, limit = 1;
2333 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2334 matches, &limit, &tree_size, bst_map);
2335 if (!node)
2336 stmts.release ();
2337
2338 load_map->put (root, node);
2339 return node;
2340 }
2341
2342 next:
2343 load_map->put (root, NULL);
2344
2345 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2346 {
2347 slp_tree value
2348 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2349 node);
2350 if (value)
2351 {
2352 SLP_TREE_REF_COUNT (value)++;
2353 SLP_TREE_CHILDREN (root)[i] = value;
2354 /* ??? We know the original leafs of the replaced nodes will
2355 be referenced by bst_map, only the permutes created by
2356 pattern matching are not. */
2357 if (SLP_TREE_REF_COUNT (node) == 1)
2358 load_map->remove (node);
2359 vect_free_slp_tree (node);
2360 }
2361 }
2362
2363 return NULL;
2364 }
2365
2366 /* Temporary workaround for loads not being CSEd during SLP build. This
2367 function traverses the SLP tree rooted in ROOT and finds VEC_PERM nodes
2368 that blend vectors from multiple nodes that all read from the same DR
2369 such that the final operation is equal to a permuted load. Such nodes
2370 are then directly converted into loads themselves. The nodes are CSEd
2371 using BST_MAP. */
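/* For example a VEC_PERM_EXPR node whose lanes all pick elements from
   single-lane load nodes reading the same interleaved access group can
   be rebuilt as one multi-lane load node, possibly carrying a load
   permutation.  */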
2372
2373 static void
2374 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2375 vec_info *vinfo, unsigned int group_size,
2376 hash_map<slp_tree, slp_tree> *load_map,
2377 slp_tree root)
2378 {
2379 slp_tree node;
2380 unsigned i;
2381
2382 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2383 {
2384 slp_tree value
2385 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2386 node);
2387 if (value)
2388 {
2389 SLP_TREE_REF_COUNT (value)++;
2390 SLP_TREE_CHILDREN (root)[i] = value;
2391 /* ??? We know the original leafs of the replaced nodes will
2392 be referenced by bst_map, only the permutes created by
2393 pattern matching are not. */
2394 if (SLP_TREE_REF_COUNT (node) == 1)
2395 load_map->remove (node);
2396 vect_free_slp_tree (node);
2397 }
2398 }
2399 }
2400
2401 /* Helper function of vect_match_slp_patterns.
2402
2403 Attempts to match patterns against the slp tree rooted in REF_NODE using
2404 VINFO. Patterns are matched in post-order traversal.
2405
2406 If matching is successful the node referenced by REF_NODE is updated in
2407 place. Return true if any pattern matched. */
2408
2409 static bool
2410 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2411 slp_tree_to_load_perm_map_t *perm_cache,
2412 hash_set<slp_tree> *visited)
2413 {
2414 unsigned i;
2415 slp_tree node = *ref_node;
2416 bool found_p = false;
2417 if (!node || visited->add (node))
2418 return false;
2419
2420 slp_tree child;
2421 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2422 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2423 vinfo, perm_cache, visited);
2424
2425 for (unsigned x = 0; x < num__slp_patterns; x++)
2426 {
2427 vect_pattern *pattern = slp_patterns[x] (perm_cache, ref_node);
2428 if (pattern)
2429 {
2430 pattern->build (vinfo);
2431 delete pattern;
2432 found_p = true;
2433 }
2434 }
2435
2436 return found_p;
2437 }
2438
2439 /* Applies pattern matching to the SLP tree of INSTANCE using
2440 vec_info VINFO.
2441 
2442 The tree is modified in place; return true if any pattern matched.
2443 Patterns are tried in order and multiple patterns may match. */
2444
2445 static bool
2446 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2447 hash_set<slp_tree> *visited,
2448 slp_tree_to_load_perm_map_t *perm_cache)
2449 {
2450 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2451 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2452
2453 if (dump_enabled_p ())
2454 dump_printf_loc (MSG_NOTE, vect_location,
2455 "Analyzing SLP tree %p for patterns\n",
2456 SLP_INSTANCE_TREE (instance));
2457
2458 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
2459 }
2460
2461 /* Analyze an SLP instance starting from a group of grouped stores. Call
2462 vect_build_slp_tree to build a tree of packed stmts if possible.
2463 Return FALSE if it's impossible to SLP any stmt in the loop. */
2464
2465 static bool
2466 vect_analyze_slp_instance (vec_info *vinfo,
2467 scalar_stmts_to_slp_tree_map_t *bst_map,
2468 stmt_vec_info stmt_info, slp_instance_kind kind,
2469 unsigned max_tree_size, unsigned *limit);
2470
2471 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2472 of KIND. Return true if successful. */
2473
2474 static bool
2475 vect_build_slp_instance (vec_info *vinfo,
2476 slp_instance_kind kind,
2477 vec<stmt_vec_info> &scalar_stmts,
2478 stmt_vec_info root_stmt_info,
2479 unsigned max_tree_size, unsigned *limit,
2480 scalar_stmts_to_slp_tree_map_t *bst_map,
2481 /* ??? We need stmt_info for group splitting. */
2482 stmt_vec_info stmt_info_)
2483 {
2484 if (dump_enabled_p ())
2485 {
2486 dump_printf_loc (MSG_NOTE, vect_location,
2487 "Starting SLP discovery for\n");
2488 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
2489 dump_printf_loc (MSG_NOTE, vect_location,
2490 " %G", scalar_stmts[i]->stmt);
2491 }
2492
2493 /* Build the tree for the SLP instance. */
2494 unsigned int group_size = scalar_stmts.length ();
2495 bool *matches = XALLOCAVEC (bool, group_size);
2496 poly_uint64 max_nunits = 1;
2497 unsigned tree_size = 0;
2498 unsigned i;
2499 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2500 &max_nunits, matches, limit,
2501 &tree_size, bst_map);
2502 if (node != NULL)
2503 {
2504 /* Calculate the unrolling factor based on the smallest type. */
2505 poly_uint64 unrolling_factor
2506 = calculate_unrolling_factor (max_nunits, group_size);
2507
2508 if (maybe_ne (unrolling_factor, 1U)
2509 && is_a <bb_vec_info> (vinfo))
2510 {
2511 unsigned HOST_WIDE_INT const_max_nunits;
2512 if (!max_nunits.is_constant (&const_max_nunits)
2513 || const_max_nunits > group_size)
2514 {
2515 if (dump_enabled_p ())
2516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2517 "Build SLP failed: store group "
2518 "size not a multiple of the vector size "
2519 "in basic block SLP\n");
2520 vect_free_slp_tree (node);
2521 return false;
2522 }
2523 /* Fatal mismatch. */
2524 if (dump_enabled_p ())
2525 dump_printf_loc (MSG_NOTE, vect_location,
2526 "SLP discovery succeeded but node needs "
2527 "splitting\n");
2528 memset (matches, true, group_size);
2529 matches[group_size / const_max_nunits * const_max_nunits] = false;
2530 vect_free_slp_tree (node);
2531 }
2532 else
2533 {
2534 /* Create a new SLP instance. */
2535 slp_instance new_instance = XNEW (class _slp_instance);
2536 SLP_INSTANCE_TREE (new_instance) = node;
2537 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2538 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2539 SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
2540 SLP_INSTANCE_KIND (new_instance) = kind;
2541 new_instance->reduc_phis = NULL;
2542 new_instance->cost_vec = vNULL;
2543 new_instance->subgraph_entries = vNULL;
2544
2545 if (dump_enabled_p ())
2546 dump_printf_loc (MSG_NOTE, vect_location,
2547 "SLP size %u vs. limit %u.\n",
2548 tree_size, max_tree_size);
2549
2550 /* Fixup SLP reduction chains. */
2551 if (kind == slp_inst_kind_reduc_chain)
2552 {
2553 /* If this is a reduction chain with a conversion in front
2554 amend the SLP tree with a node for that. */
2555 gimple *scalar_def
2556 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2557 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
2558 {
2559 /* Get at the conversion stmt - we know it's the single use
2560 of the last stmt of the reduction chain. */
2561 use_operand_p use_p;
2562 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
2563 &use_p, &scalar_def);
2564 gcc_assert (r);
2565 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
2566 next_info = vect_stmt_to_vectorize (next_info);
2567 scalar_stmts = vNULL;
2568 scalar_stmts.create (group_size);
2569 for (unsigned i = 0; i < group_size; ++i)
2570 scalar_stmts.quick_push (next_info);
2571 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2572 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2573 SLP_TREE_CHILDREN (conv).quick_push (node);
2574 SLP_INSTANCE_TREE (new_instance) = conv;
2575 /* We also have to fake this conversion stmt as SLP reduction
2576 group so we don't have to mess with too much code
2577 elsewhere. */
2578 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2579 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2580 }
2581 /* Fill the backedge child of the PHI SLP node. The
2582 general matching code cannot find it because the
2583 scalar code does not reflect how we vectorize the
2584 reduction. */
2585 use_operand_p use_p;
2586 imm_use_iterator imm_iter;
2587 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
2588 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
2589 gimple_get_lhs (scalar_def))
2590 /* There are exactly two non-debug uses, the reduction
2591 PHI and the loop-closed PHI node. */
2592 if (!is_gimple_debug (USE_STMT (use_p))
2593 && gimple_bb (USE_STMT (use_p)) == loop->header)
2594 {
2595 auto_vec<stmt_vec_info, 64> phis (group_size);
2596 stmt_vec_info phi_info
2597 = vinfo->lookup_stmt (USE_STMT (use_p));
2598 for (unsigned i = 0; i < group_size; ++i)
2599 phis.quick_push (phi_info);
2600 slp_tree *phi_node = bst_map->get (phis);
2601 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
2602 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
2603 = SLP_INSTANCE_TREE (new_instance);
2604 SLP_INSTANCE_TREE (new_instance)->refcnt++;
2605 }
2606 }
2607
2608 vinfo->slp_instances.safe_push (new_instance);
2609
2610 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2611 the number of scalar stmts in the root in a few places.
2612 Verify that assumption holds. */
2613 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2614 .length () == group_size);
2615
2616 if (dump_enabled_p ())
2617 {
2618 dump_printf_loc (MSG_NOTE, vect_location,
2619 "Final SLP tree for instance %p:\n", new_instance);
2620 vect_print_slp_graph (MSG_NOTE, vect_location,
2621 SLP_INSTANCE_TREE (new_instance));
2622 }
2623
2624 return true;
2625 }
2626 }
2627 else
2628 {
2629 /* Failed to SLP. */
2630 /* Free the allocated memory. */
2631 scalar_stmts.release ();
2632 }
2633
2634 stmt_vec_info stmt_info = stmt_info_;
2635 /* Try to break the group up into pieces. */
2636 if (kind == slp_inst_kind_store)
2637 {
2638 /* ??? We could delay all the actual splitting of store-groups
2639 until after SLP discovery of the original group completed.
2640 Then we can recurse to vect_build_slp_instance directly. */
2641 for (i = 0; i < group_size; i++)
2642 if (!matches[i])
2643 break;
2644
2645 /* For basic block SLP, try to break the group up into multiples of
2646 a vector size. */
2647 if (is_a <bb_vec_info> (vinfo)
2648 && (i > 1 && i < group_size))
2649 {
2650 tree scalar_type
2651 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2652 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2653 1 << floor_log2 (i));
2654 unsigned HOST_WIDE_INT const_nunits;
2655 if (vectype
2656 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2657 {
2658 /* Split into two groups at the first vector boundary. */
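/* E.g. a first mismatch at lane I == 5 with CONST_NUNITS == 4 yields
   GROUP1_SIZE == 4, the largest multiple of the vector size not
   exceeding I.  */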
2659 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2660 unsigned group1_size = i & ~(const_nunits - 1);
2661
2662 if (dump_enabled_p ())
2663 dump_printf_loc (MSG_NOTE, vect_location,
2664 "Splitting SLP group at stmt %u\n", i);
2665 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2666 group1_size);
2667 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2668 kind, max_tree_size,
2669 limit);
2670 /* Split the rest at the failure point and possibly
2671 re-analyze the remaining matching part if it has
2672 at least two lanes. */
2673 if (group1_size < i
2674 && (i + 1 < group_size
2675 || i - group1_size > 1))
2676 {
2677 stmt_vec_info rest2 = rest;
2678 rest = vect_split_slp_store_group (rest, i - group1_size);
2679 if (i - group1_size > 1)
2680 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
2681 kind, max_tree_size,
2682 limit);
2683 }
2684 /* Re-analyze the non-matching tail if it has at least
2685 two lanes. */
2686 if (i + 1 < group_size)
2687 res |= vect_analyze_slp_instance (vinfo, bst_map,
2688 rest, kind, max_tree_size,
2689 limit);
2690 return res;
2691 }
2692 }
2693
2694 /* For loop vectorization split into arbitrary pieces of size > 1. */
2695 if (is_a <loop_vec_info> (vinfo)
2696 && (i > 1 && i < group_size))
2697 {
2698 unsigned group1_size = i;
2699
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_NOTE, vect_location,
2702 "Splitting SLP group at stmt %u\n", i);
2703
2704 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2705 group1_size);
2706 /* Loop vectorization cannot handle gaps in stores, make sure
2707 the split group appears as strided. */
2708 STMT_VINFO_STRIDED_P (rest) = 1;
2709 DR_GROUP_GAP (rest) = 0;
2710 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2711 DR_GROUP_GAP (stmt_info) = 0;
2712
2713 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2714 kind, max_tree_size, limit);
2715 if (i + 1 < group_size)
2716 res |= vect_analyze_slp_instance (vinfo, bst_map,
2717 rest, kind, max_tree_size, limit);
2718
2719 return res;
2720 }
2721
2722 /* Even though the first vector did not all match, we might be able to SLP
2723 (some) of the remainder. FORNOW ignore this possibility. */
2724 }
2725
2726 /* Failed to SLP. */
2727 if (dump_enabled_p ())
2728 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2729 return false;
2730 }
2731
2732
2733 /* Analyze an SLP instance starting from a group of grouped stores. Call
2734 vect_build_slp_tree to build a tree of packed stmts if possible.
2735 Return FALSE if it's impossible to SLP any stmt in the loop. */
2736
2737 static bool
2738 vect_analyze_slp_instance (vec_info *vinfo,
2739 scalar_stmts_to_slp_tree_map_t *bst_map,
2740 stmt_vec_info stmt_info,
2741 slp_instance_kind kind,
2742 unsigned max_tree_size, unsigned *limit)
2743 {
2744 unsigned int i;
2745 vec<stmt_vec_info> scalar_stmts;
2746
2747 if (is_a <bb_vec_info> (vinfo))
2748 vect_location = stmt_info->stmt;
2749
2750 stmt_vec_info next_info = stmt_info;
2751 if (kind == slp_inst_kind_store)
2752 {
2753 /* Collect the stores and store them in scalar_stmts. */
2754 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
2755 while (next_info)
2756 {
2757 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2758 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2759 }
2760 }
2761 else if (kind == slp_inst_kind_reduc_chain)
2762 {
2763 /* Collect the reduction stmts and store them in scalar_stmts. */
2764 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
2765 while (next_info)
2766 {
2767 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2768 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2769 }
2770 /* Mark the first element of the reduction chain as reduction to properly
2771 transform the node. In the reduction analysis phase only the last
2772 element of the chain is marked as reduction. */
2773 STMT_VINFO_DEF_TYPE (stmt_info)
2774 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2775 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2776 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2777 }
2778 else if (kind == slp_inst_kind_ctor)
2779 {
2780 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2781 tree val;
2782 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
2783 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2784 {
2785 stmt_vec_info def_info = vinfo->lookup_def (val);
2786 def_info = vect_stmt_to_vectorize (def_info);
2787 scalar_stmts.quick_push (def_info);
2788 }
2789 if (dump_enabled_p ())
2790 dump_printf_loc (MSG_NOTE, vect_location,
2791 "Analyzing vectorizable constructor: %G\n",
2792 stmt_info->stmt);
2793 }
2794 else if (kind == slp_inst_kind_reduc_group)
2795 {
2796 /* Collect reduction statements. */
2797 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2798 scalar_stmts.create (reductions.length ());
2799 for (i = 0; reductions.iterate (i, &next_info); i++)
2800 if (STMT_VINFO_RELEVANT_P (next_info)
2801 || STMT_VINFO_LIVE_P (next_info))
2802 scalar_stmts.quick_push (next_info);
2803 /* If less than two were relevant/live there's nothing to SLP. */
2804 if (scalar_stmts.length () < 2)
2805 return false;
2806 }
2807 else
2808 gcc_unreachable ();
2809
2810 /* Build the tree for the SLP instance. */
2811 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
2812 kind == slp_inst_kind_ctor
2813 ? stmt_info : NULL,
2814 max_tree_size, limit, bst_map,
2815 kind == slp_inst_kind_store
2816 ? stmt_info : NULL);
2817
2818 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2819 where we should do store group splitting. */
2820
2821 return res;
2822 }
2823
2824 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2825 trees of packed scalar stmts if SLP is possible. */
2826
2827 opt_result
2828 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2829 {
2830 unsigned int i;
2831 stmt_vec_info first_element;
2832 slp_instance instance;
2833
2834 DUMP_VECT_SCOPE ("vect_analyze_slp");
2835
2836 unsigned limit = max_tree_size;
2837
2838 scalar_stmts_to_slp_tree_map_t *bst_map
2839 = new scalar_stmts_to_slp_tree_map_t ();
2840
2841 /* Find SLP sequences starting from groups of grouped stores. */
2842 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2843 vect_analyze_slp_instance (vinfo, bst_map, first_element,
2844 STMT_VINFO_GROUPED_ACCESS (first_element)
2845 ? slp_inst_kind_store : slp_inst_kind_ctor,
2846 max_tree_size, &limit);
2847
2848 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
2849 {
2850 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
2851 {
2852 vect_location = bb_vinfo->roots[i].root->stmt;
2853 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
2854 bb_vinfo->roots[i].stmts,
2855 bb_vinfo->roots[i].root,
2856 max_tree_size, &limit, bst_map, NULL))
2857 bb_vinfo->roots[i].stmts = vNULL;
2858 }
2859 }
2860
2861 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2862 {
2863 /* Find SLP sequences starting from reduction chains. */
2864 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2865 if (! STMT_VINFO_RELEVANT_P (first_element)
2866 && ! STMT_VINFO_LIVE_P (first_element))
2867 ;
2868 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2869 slp_inst_kind_reduc_chain,
2870 max_tree_size, &limit))
2871 {
2872 /* Dissolve reduction chain group. */
2873 stmt_vec_info vinfo = first_element;
2874 stmt_vec_info last = NULL;
2875 while (vinfo)
2876 {
2877 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2878 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2879 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2880 last = vinfo;
2881 vinfo = next;
2882 }
2883 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2884 /* It can still be vectorized as part of an SLP reduction. */
2885 loop_vinfo->reductions.safe_push (last);
2886 }
2887
2888 /* Find SLP sequences starting from groups of reductions. */
2889 if (loop_vinfo->reductions.length () > 1)
2890 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2891 slp_inst_kind_reduc_group, max_tree_size,
2892 &limit);
2893 }
2894
2895 hash_set<slp_tree> visited_patterns;
2896 slp_tree_to_load_perm_map_t perm_cache;
2897
2898 /* See if any patterns can be found in the SLP tree. */
2899 bool pattern_found = false;
2900 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
2901 pattern_found |= vect_match_slp_patterns (instance, vinfo,
2902 &visited_patterns, &perm_cache);
2903
2904 /* If any were found optimize permutations of loads. */
2905 if (pattern_found)
2906 {
2907 hash_map<slp_tree, slp_tree> load_map;
2908 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
2909 {
2910 slp_tree root = SLP_INSTANCE_TREE (instance);
2911 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
2912 &load_map, root);
2913 }
2914 }
2915
2916
2917
2918 /* The map keeps a reference on SLP nodes built, release that. */
2919 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2920 it != bst_map->end (); ++it)
2921 if ((*it).second)
2922 vect_free_slp_tree ((*it).second);
2923 delete bst_map;
2924
2925 if (pattern_found && dump_enabled_p ())
2926 {
2927 dump_printf_loc (MSG_NOTE, vect_location,
2928 "Pattern matched SLP tree\n");
2929 hash_set<slp_tree> visited;
2930 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
2931 vect_print_slp_graph (MSG_NOTE, vect_location,
2932 SLP_INSTANCE_TREE (instance), visited);
2933 }
2934
2935 return opt_result::success ();
2936 }
2937
2938 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2939
2940 static void
2941 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2942 vec<slp_tree> &vertices, vec<int> &leafs)
2943 {
2944 unsigned i;
2945 slp_tree child;
2946
2947 if (visited.add (node))
2948 return;
2949
2950 node->vertex = vertices.length ();
2951 vertices.safe_push (node);
2952
2953 bool leaf = true;
2954 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2955 if (child)
2956 {
2957 leaf = false;
2958 vect_slp_build_vertices (visited, child, vertices, leafs);
2959 }
2960 if (leaf)
2961 leafs.safe_push (node->vertex);
2962 }
2963
2964 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2965
2966 static void
2967 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2968 vec<int> &leafs)
2969 {
2970 hash_set<slp_tree> visited;
2971 unsigned i;
2972 slp_instance instance;
2973 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2974 {
2975 unsigned n_v = vertices.length ();
2976 unsigned n_l = leafs.length ();
2977 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2978 leafs);
2979 /* If we added vertices but no entries to the reverse graph we've
2980 added a cycle that is not backwards-reachable. Push the instance
2981 entry so it acts as a leaf then. */
2982 if (vertices.length () > n_v
2983 && leafs.length () == n_l)
2984 leafs.safe_push (SLP_INSTANCE_TREE (instance)->vertex);
2985 }
2986 }
2987
2988 /* Apply (reverse) bijective PERM to VEC. */
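/* For example with PERM == { 2, 0, 1 } the forward direction turns
   { a, b, c } into { c, a, b } while REVERSE applies the inverse and
   turns { a, b, c } into { b, c, a }.  */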
2989
2990 template <class T>
2991 static void
2992 vect_slp_permute (vec<unsigned> perm,
2993 vec<T> &vec, bool reverse)
2994 {
2995 auto_vec<T, 64> saved;
2996 saved.create (vec.length ());
2997 for (unsigned i = 0; i < vec.length (); ++i)
2998 saved.quick_push (vec[i]);
2999
3000 if (reverse)
3001 {
3002 for (unsigned i = 0; i < vec.length (); ++i)
3003 vec[perm[i]] = saved[i];
3004 for (unsigned i = 0; i < vec.length (); ++i)
3005 gcc_assert (vec[perm[i]] == saved[i]);
3006 }
3007 else
3008 {
3009 for (unsigned i = 0; i < vec.length (); ++i)
3010 vec[i] = saved[perm[i]];
3011 for (unsigned i = 0; i < vec.length (); ++i)
3012 gcc_assert (vec[i] == saved[perm[i]]);
3013 }
3014 }
3015
3016 /* Return whether permutations PERM_A and PERM_B as recorded in the
3017 PERMS vector are equal. */
3018
3019 static bool
3020 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
3021 int perm_a, int perm_b)
3022 {
3023 return (perm_a == perm_b
3024 || (perms[perm_a].length () == perms[perm_b].length ()
3025 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
3026 sizeof (unsigned) * perms[perm_a].length ()) == 0));
3027 }
3028
3029 /* Optimize the SLP graph of VINFO. */
3030
3031 void
3032 vect_optimize_slp (vec_info *vinfo)
3033 {
3034 if (vinfo->slp_instances.is_empty ())
3035 return;
3036
3037 slp_tree node;
3038 unsigned i;
3039 auto_vec<slp_tree> vertices;
3040 auto_vec<int> leafs;
3041 vect_slp_build_vertices (vinfo, vertices, leafs);
3042
3043 struct graph *slpg = new_graph (vertices.length ());
3044 FOR_EACH_VEC_ELT (vertices, i, node)
3045 {
3046 unsigned j;
3047 slp_tree child;
3048 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3049 if (child)
3050 add_edge (slpg, i, child->vertex);
3051 }
3052
3053 /* Compute (reverse) postorder on the inverted graph. */
3054 auto_vec<int> ipo;
3055 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3056
3057 auto_sbitmap n_visited (vertices.length ());
3058 auto_sbitmap n_materialize (vertices.length ());
3059 auto_vec<int> n_perm (vertices.length ());
3060 auto_vec<vec<unsigned> > perms;
3061
3062 bitmap_clear (n_visited);
3063 bitmap_clear (n_materialize);
3064 n_perm.quick_grow_cleared (vertices.length ());
3065 perms.safe_push (vNULL); /* zero is no permute */
3066
3067 /* Produce initial permutations. */
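/* For example a load node with load permutation { 5, 4, 7, 6 } has
   IMIN == 4 and gets the bijective permute { 1, 0, 3, 2 } recorded
   for it.  */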
3068 for (i = 0; i < leafs.length (); ++i)
3069 {
3070 int idx = leafs[i];
3071 slp_tree node = vertices[idx];
3072
3073 /* Handle externals and constants optimistically throughout the
3074 iteration. */
3075 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3076 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3077 continue;
3078
3079 /* Leafs do not change across iterations. Note leafs also double
3080 as entries to the reverse graph. */
3081 if (!slpg->vertices[idx].succ)
3082 bitmap_set_bit (n_visited, idx);
3083 /* Loads are the only thing generating permutes. */
3084 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3085 continue;
3086
3087 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3088 node unpermuted, record this permute. */
3089 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3090 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3091 continue;
3092 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3093 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3094 bool any_permute = false;
3095 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3096 {
3097 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3098 imin = MIN (imin, idx);
3099 imax = MAX (imax, idx);
3100 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3101 any_permute = true;
3102 }
3103 /* If there's no permute no need to split one out. */
3104 if (!any_permute)
3105 continue;
3106 /* If the span doesn't match we'd disrupt VF computation, avoid
3107 that for now. */
3108 if (imax - imin + 1 != SLP_TREE_LANES (node))
3109 continue;
3110
3111 /* For now only handle true permutes, like
3112 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3113 when permuting constants and invariants, keeping the permute
3114 bijective. */
3115 auto_sbitmap load_index (SLP_TREE_LANES (node));
3116 bitmap_clear (load_index);
3117 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3118 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3119 unsigned j;
3120 for (j = 0; j < SLP_TREE_LANES (node); ++j)
3121 if (!bitmap_bit_p (load_index, j))
3122 break;
3123 if (j != SLP_TREE_LANES (node))
3124 continue;
3125
3126 vec<unsigned> perm = vNULL;
3127 perm.safe_grow (SLP_TREE_LANES (node), true);
3128 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3129 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
3130 perms.safe_push (perm);
3131 n_perm[idx] = perms.length () - 1;
3132 }
3133
3134 /* Propagate permutes along the graph and compute materialization points. */
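/* This is a fixed-point iteration over the inverted graph: a node
   takes over the permute all of its already visited successors agree
   on and falls back to "no permute" (index zero) as soon as they
   disagree or one of them is unpermuted.  */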
3135 bool changed;
3136 unsigned iteration = 0;
3137 do
3138 {
3139 changed = false;
3140 ++iteration;
3141
3142 for (i = vertices.length (); i > 0 ; --i)
3143 {
3144 int idx = ipo[i-1];
3145 slp_tree node = vertices[idx];
3146 /* For leafs there's nothing to do - we've seeded permutes
3147 on those above. */
3148 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3149 continue;
3150
3151 bitmap_set_bit (n_visited, idx);
3152
3153 /* We cannot move a permute across a store. */
3154 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
3155 && DR_IS_WRITE
3156 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
3157 continue;
3158
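/* PERM == -1 means no successor has constrained the node yet, zero is
   the "no permute" state and any other value indexes the PERMS
   vector.  */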
3159 int perm = -1;
3160 for (graph_edge *succ = slpg->vertices[idx].succ;
3161 succ; succ = succ->succ_next)
3162 {
3163 int succ_idx = succ->dest;
3164 /* Handle unvisited nodes optimistically. */
3165 /* ??? But for constants once we want to handle non-bijective
3166 permutes we have to verify the permute, when unifying lanes,
3167 will not unify different constants. For example see
3168 gcc.dg/vect/bb-slp-14.c for a case that would break. */
3169 if (!bitmap_bit_p (n_visited, succ_idx))
3170 continue;
3171 int succ_perm = n_perm[succ_idx];
3172 /* Once we materialize the successor's permutation its output lanes
3173 appear unpermuted to us. */
3174 if (bitmap_bit_p (n_materialize, succ_idx))
3175 succ_perm = 0;
3176 if (perm == -1)
3177 perm = succ_perm;
3178 else if (succ_perm == 0)
3179 {
3180 perm = 0;
3181 break;
3182 }
3183 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
3184 {
3185 perm = 0;
3186 break;
3187 }
3188 }
3189
3190 if (perm == -1)
3191 /* Pick up pre-computed leaf values. */
3192 perm = n_perm[idx];
3193 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
3194 {
3195 if (iteration > 1)
3196 /* Make sure we eventually converge. */
3197 gcc_checking_assert (perm == 0);
3198 n_perm[idx] = perm;
3199 if (perm == 0)
3200 bitmap_clear_bit (n_materialize, idx);
3201 changed = true;
3202 }
3203
3204 if (perm == 0)
3205 continue;
3206
3207 /* Elide pruning at materialization points in the first
3208 iteration so every node is visited at least once. */
3209 if (iteration == 1)
3210 continue;
3211
3212 /* Decide on permute materialization. Look whether there's
3213 a use (pred) edge that is permuted differently than us.
3214 In that case mark ourselves so the permutation is applied.
3215 For VEC_PERM_EXPRs the permutation doesn't carry along
3216 from children to parents so force materialization at the
3217 point of the VEC_PERM_EXPR. In principle VEC_PERM_EXPRs
3218 are a source of an arbitrary permutation again, similar
3219 to constants/externals - that's something we do not yet
3220 optimally handle. */
3221 bool all_preds_permuted = (SLP_TREE_CODE (node) != VEC_PERM_EXPR
3222 && slpg->vertices[idx].pred != NULL);
3223 if (all_preds_permuted)
3224 for (graph_edge *pred = slpg->vertices[idx].pred;
3225 pred; pred = pred->pred_next)
3226 {
3227 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
3228 int pred_perm = n_perm[pred->src];
3229 if (!vect_slp_perms_eq (perms, perm, pred_perm))
3230 {
3231 all_preds_permuted = false;
3232 break;
3233 }
3234 }
3235 if (!all_preds_permuted)
3236 {
3237 if (!bitmap_bit_p (n_materialize, idx))
3238 changed = true;
3239 bitmap_set_bit (n_materialize, idx);
3240 }
3241 }
3242 }
3243 while (changed || iteration == 1);
3244
3245 /* Materialize. */
3246 for (i = 0; i < vertices.length (); ++i)
3247 {
3248 int perm = n_perm[i];
3249 if (perm <= 0)
3250 continue;
3251
3252 slp_tree node = vertices[i];
3253
3254 /* First permute invariant/external original successors. */
3255 unsigned j;
3256 slp_tree child;
3257 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3258 {
3259 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3260 continue;
3261
3262 /* If the vector is uniform there's nothing to do. */
3263 if (vect_slp_tree_uniform_p (child))
3264 continue;
3265
3266 /* We can end up sharing some externals via two_operator
3267 handling. Be prepared to unshare those. */
3268 if (child->refcnt != 1)
3269 {
3270 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3271 SLP_TREE_CHILDREN (node)[j] = child
3272 = vect_create_new_slp_node
3273 (SLP_TREE_SCALAR_OPS (child).copy ());
3274 }
3275 vect_slp_permute (perms[perm],
3276 SLP_TREE_SCALAR_OPS (child), true);
3277 }
3278
3279 if (bitmap_bit_p (n_materialize, i))
3280 {
3281 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3282 /* For loads simply drop the permutation, the load permutation
3283 already performs the desired permutation. */
3284 ;
3285 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3286 {
3287 /* If the node is already a permute node we can apply
3288 the permutation to the lane selection, effectively
3289 materializing it on the incoming vectors. */
3290 if (dump_enabled_p ())
3291 dump_printf_loc (MSG_NOTE, vect_location,
3292 "simplifying permute node %p\n",
3293 node);
3294
3295 for (unsigned k = 0;
3296 k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3297 SLP_TREE_LANE_PERMUTATION (node)[k].second
3298 = perms[perm][SLP_TREE_LANE_PERMUTATION (node)[k].second];
3299 }
3300 else
3301 {
3302 if (dump_enabled_p ())
3303 dump_printf_loc (MSG_NOTE, vect_location,
3304 "inserting permute node in place of %p\n",
3305 node);
3306
3307 /* Make a copy of NODE and in-place change it to a
3308 VEC_PERM node to permute the lanes of the copy. */
3309 slp_tree copy = new _slp_tree;
3310 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3311 SLP_TREE_CHILDREN (node) = vNULL;
3312 SLP_TREE_SCALAR_STMTS (copy)
3313 = SLP_TREE_SCALAR_STMTS (node).copy ();
3314 vect_slp_permute (perms[perm],
3315 SLP_TREE_SCALAR_STMTS (copy), true);
3316 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3317 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3318 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3319 SLP_TREE_LANE_PERMUTATION (copy)
3320 = SLP_TREE_LANE_PERMUTATION (node);
3321 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3322 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3323 copy->refcnt = 1;
3324 copy->max_nunits = node->max_nunits;
3325 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3326 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3327 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3328
3329 /* Now turn NODE into a VEC_PERM. */
3330 SLP_TREE_CHILDREN (node).safe_push (copy);
3331 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3332 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3333 SLP_TREE_LANE_PERMUTATION (node)
3334 .quick_push (std::make_pair (0, perms[perm][j]));
3335 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3336 }
3337 }
3338 else
3339 {
3340 /* Apply the reverse permutation to our stmts. */
3341 vect_slp_permute (perms[perm],
3342 SLP_TREE_SCALAR_STMTS (node), true);
3343 /* And to the load permutation, which we can simply
3344 make regular by design. */
3345 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3346 {
3347 /* ??? When we handle non-bijective permutes the idea
3348 is that we can force the load-permutation to be
3349 { min, min + 1, min + 2, ... max }. But then the
3350 scalar defs might no longer match the lane content
3351 which means wrong-code with live lane vectorization.
3352 So we possibly have to have NULL entries for those. */
3353 vect_slp_permute (perms[perm],
3354 SLP_TREE_LOAD_PERMUTATION (node), true);
3355 }
3356 }
3357 }
3358
3359 /* Free the perms vector used for propagation. */
3360 while (!perms.is_empty ())
3361 perms.pop ().release ();
3362 free_graph (slpg);
3363
3364
3365 /* Now elide load permutations that are not necessary. */
3366 for (i = 0; i < leafs.length (); ++i)
3367 {
3368 node = vertices[leafs[i]];
3369 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3370 continue;
3371
3372 /* In basic block vectorization we allow any subchain of an interleaving
3373 chain.
3374 FORNOW: not in loop SLP because of realignment complications. */
3375 if (is_a <bb_vec_info> (vinfo))
3376 {
3377 bool subchain_p = true;
3378 stmt_vec_info next_load_info = NULL;
3379 stmt_vec_info load_info;
3380 unsigned j;
3381 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3382 {
3383 if (j != 0
3384 && (next_load_info != load_info
3385 || DR_GROUP_GAP (load_info) != 1))
3386 {
3387 subchain_p = false;
3388 break;
3389 }
3390 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3391 }
3392 if (subchain_p)
3393 {
3394 SLP_TREE_LOAD_PERMUTATION (node).release ();
3395 continue;
3396 }
3397 }
3398 else
3399 {
3400 stmt_vec_info load_info;
3401 bool this_load_permuted = false;
3402 unsigned j;
3403 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3404 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3405 {
3406 this_load_permuted = true;
3407 break;
3408 }
3409 stmt_vec_info first_stmt_info
3410 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3411 if (!this_load_permuted
3412 /* The load requires permutation when unrolling exposes
3413 a gap either because the group is larger than the SLP
3414 group-size or because there is a gap between the groups. */
3415 && (known_eq (LOOP_VINFO_VECT_FACTOR
3416 (as_a <loop_vec_info> (vinfo)), 1U)
3417 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3418 && DR_GROUP_GAP (first_stmt_info) == 0)))
3419 {
3420 SLP_TREE_LOAD_PERMUTATION (node).release ();
3421 continue;
3422 }
3423 }
3424 }
3425 }
3426
3427 /* Gather loads reachable from the individual SLP graph entries. */
3428
3429 void
3430 vect_gather_slp_loads (vec_info *vinfo)
3431 {
3432 unsigned i;
3433 slp_instance instance;
3434 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
3435 {
3436 hash_set<slp_tree> visited;
3437 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
3438 SLP_INSTANCE_TREE (instance), visited);
3439 }
3440 }
3441
3442
3443 /* For each possible SLP instance decide whether to SLP it and calculate overall
3444 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3445 least one instance. */
3446
3447 bool
3448 vect_make_slp_decision (loop_vec_info loop_vinfo)
3449 {
3450 unsigned int i;
3451 poly_uint64 unrolling_factor = 1;
3452 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3453 slp_instance instance;
3454 int decided_to_slp = 0;
3455
3456 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3457
3458 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3459 {
3460 /* FORNOW: SLP if you can. */
3461 /* All unroll factors have the form:
3462
3463 GET_MODE_SIZE (vinfo->vector_mode) * X
3464
3465 for some rational X, so they must have a common multiple. */
3466 unrolling_factor
3467 = force_common_multiple (unrolling_factor,
3468 SLP_INSTANCE_UNROLLING_FACTOR (instance));
3469
3470 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3471 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3472 loop-based vectorization. Such stmts will be marked as HYBRID. */
3473 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3474 decided_to_slp++;
3475 }
3476
3477 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
3478
3479 if (decided_to_slp && dump_enabled_p ())
3480 {
3481 dump_printf_loc (MSG_NOTE, vect_location,
3482 "Decided to SLP %d instances. Unrolling factor ",
3483 decided_to_slp);
3484 dump_dec (MSG_NOTE, unrolling_factor);
3485 dump_printf (MSG_NOTE, "\n");
3486 }
3487
3488 return (decided_to_slp > 0);
3489 }
3490
3491 /* Private data for vect_detect_hybrid_slp. */
3492 struct vdhs_data
3493 {
3494 loop_vec_info loop_vinfo;
3495 vec<stmt_vec_info> *worklist;
3496 };
3497
3498 /* Walker for walk_gimple_op. */
3499
3500 static tree
3501 vect_detect_hybrid_slp (tree *tp, int *, void *data)
3502 {
3503 walk_stmt_info *wi = (walk_stmt_info *)data;
3504 vdhs_data *dat = (vdhs_data *)wi->info;
3505
3506 if (wi->is_lhs)
3507 return NULL_TREE;
3508
3509 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
3510 if (!def_stmt_info)
3511 return NULL_TREE;
3512 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3513 if (PURE_SLP_STMT (def_stmt_info))
3514 {
3515 if (dump_enabled_p ())
3516 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3517 def_stmt_info->stmt);
3518 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3519 dat->worklist->safe_push (def_stmt_info);
3520 }
3521
3522 return NULL_TREE;
3523 }
3524
3525 /* Check whether STMT_INFO is (indirectly) only consumed by SLP and mark it
3526    pure_slp if so, otherwise push it to WORKLIST. */
3527
3528 static void
3529 maybe_push_to_hybrid_worklist (vec_info *vinfo,
3530 vec<stmt_vec_info> &worklist,
3531 stmt_vec_info stmt_info)
3532 {
3533 if (dump_enabled_p ())
3534 dump_printf_loc (MSG_NOTE, vect_location,
3535 "Processing hybrid candidate : %G", stmt_info->stmt);
3536 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
3537 imm_use_iterator iter2;
3538 ssa_op_iter iter1;
3539 use_operand_p use_p;
3540 def_operand_p def_p;
3541 bool any_def = false;
3542 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
3543 {
3544 any_def = true;
3545 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
3546 {
3547 if (is_gimple_debug (USE_STMT (use_p)))
3548 continue;
3549 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
3550 	    /* An out-of-loop use means this is a loop_vect sink. */
3551 if (!use_info)
3552 {
3553 if (dump_enabled_p ())
3554 dump_printf_loc (MSG_NOTE, vect_location,
3555 "Found loop_vect sink: %G", stmt_info->stmt);
3556 worklist.safe_push (stmt_info);
3557 return;
3558 }
3559 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
3560 {
3561 if (dump_enabled_p ())
3562 dump_printf_loc (MSG_NOTE, vect_location,
3563 "Found loop_vect use: %G", use_info->stmt);
3564 worklist.safe_push (stmt_info);
3565 return;
3566 }
3567 }
3568 }
3569   /* No def means this is a loop_vect sink. */
3570 if (!any_def)
3571 {
3572 if (dump_enabled_p ())
3573 dump_printf_loc (MSG_NOTE, vect_location,
3574 "Found loop_vect sink: %G", stmt_info->stmt);
3575 worklist.safe_push (stmt_info);
3576 return;
3577 }
3578 if (dump_enabled_p ())
3579 dump_printf_loc (MSG_NOTE, vect_location,
3580 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
3581 STMT_SLP_TYPE (stmt_info) = pure_slp;
3582 }
3583
3584 /* Find stmts that must be both vectorized and SLPed. */
3585
3586 void
3587 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3588 {
3589 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3590
3591 /* All stmts participating in SLP are marked pure_slp, all other
3592 stmts are loop_vect.
3593 First collect all loop_vect stmts into a worklist.
3594 SLP patterns cause not all original scalar stmts to appear in
3595 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
3596 Rectify this here and do a backward walk over the IL only considering
3597 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
3598 mark them as pure_slp. */
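  /* As an illustration (a sketched example, not from any particular
     testcase): in

	for (i = 0; i < n; i += 2)
	  {
	    int t = a[i] + a[i + 1];
	    b[i] = t;
	    b[i + 1] = t;
	    sum += t;
	  }

     the def of T feeds the SLP store group b[i], b[i+1] (pure_slp) but is
     also used by the loop_vect reduction sum += t, so the walk below ends
     up marking it hybrid.  */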
3599 auto_vec<stmt_vec_info> worklist;
3600 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
3601 {
3602 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3603 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3604 gsi_next (&gsi))
3605 {
3606 gphi *phi = gsi.phi ();
3607 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3608 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3609 maybe_push_to_hybrid_worklist (loop_vinfo,
3610 worklist, stmt_info);
3611 }
3612 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
3613 gsi_prev (&gsi))
3614 {
3615 gimple *stmt = gsi_stmt (gsi);
3616 if (is_gimple_debug (stmt))
3617 continue;
3618 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3619 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3620 {
3621 for (gimple_stmt_iterator gsi2
3622 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3623 !gsi_end_p (gsi2); gsi_next (&gsi2))
3624 {
3625 stmt_vec_info patt_info
3626 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3627 if (!STMT_SLP_TYPE (patt_info)
3628 && STMT_VINFO_RELEVANT (patt_info))
3629 maybe_push_to_hybrid_worklist (loop_vinfo,
3630 worklist, patt_info);
3631 }
3632 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3633 }
3634 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3635 maybe_push_to_hybrid_worklist (loop_vinfo,
3636 worklist, stmt_info);
3637 }
3638 }
3639
3640   /* Now we have a worklist of non-SLP stmts; follow use->def chains and
3641 mark any SLP vectorized stmt as hybrid.
3642 ??? We're visiting def stmts N times (once for each non-SLP and
3643 once for each hybrid-SLP use). */
3644 walk_stmt_info wi;
3645 vdhs_data dat;
3646 dat.worklist = &worklist;
3647 dat.loop_vinfo = loop_vinfo;
3648 memset (&wi, 0, sizeof (wi));
3649 wi.info = (void *)&dat;
3650 while (!worklist.is_empty ())
3651 {
3652 stmt_vec_info stmt_info = worklist.pop ();
3653 /* Since SSA operands are not set up for pattern stmts we need
3654 to use walk_gimple_op. */
3655 wi.is_lhs = 0;
3656 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3657 }
3658 }
3659
3660
3661 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3662
3663 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3664 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs), roots (vNULL)
3665 {
3666 for (unsigned i = 0; i < bbs.length (); ++i)
3667 {
3668 if (i != 0)
3669 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3670 gsi_next (&si))
3671 {
3672 gphi *phi = si.phi ();
3673 gimple_set_uid (phi, 0);
3674 add_stmt (phi);
3675 }
3676 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3677 !gsi_end_p (gsi); gsi_next (&gsi))
3678 {
3679 gimple *stmt = gsi_stmt (gsi);
3680 gimple_set_uid (stmt, 0);
3681 if (is_gimple_debug (stmt))
3682 continue;
3683 add_stmt (stmt);
3684 }
3685 }
3686 }
3687
3688
3689 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3690 stmts in the basic block. */
3691
3692 _bb_vec_info::~_bb_vec_info ()
3693 {
3694 /* Reset region marker. */
3695 for (unsigned i = 0; i < bbs.length (); ++i)
3696 {
3697 if (i != 0)
3698 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3699 gsi_next (&si))
3700 {
3701 gphi *phi = si.phi ();
3702 gimple_set_uid (phi, -1);
3703 }
3704 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3705 !gsi_end_p (gsi); gsi_next (&gsi))
3706 {
3707 gimple *stmt = gsi_stmt (gsi);
3708 gimple_set_uid (stmt, -1);
3709 }
3710 }
3711
3712 for (unsigned i = 0; i < roots.length (); ++i)
3713 roots[i].stmts.release ();
3714 roots.release ();
3715 }
3716
3717 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3718    given that child nodes have already been processed, and that
3719 their def types currently match their SLP node's def type. */
3720
3721 static bool
3722 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3723 slp_instance node_instance,
3724 stmt_vector_for_cost *cost_vec)
3725 {
3726 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3727
3728 /* Calculate the number of vector statements to be created for the
3729 scalar stmts in this node. For SLP reductions it is equal to the
3730 number of vector statements in the children (which has already been
3731 calculated by the recursive call). Otherwise it is the number of
3732 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3733 VF divided by the number of elements in a vector. */
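  /* For example (numbers purely illustrative): with SLP_TREE_LANES == 4,
     a vectorization factor of 8 and 4-element vectors this yields
     4 * 8 / 4 == 8 vector statements.  */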
3734 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3735 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3736 {
3737 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3738 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3739 {
3740 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3741 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3742 break;
3743 }
3744 }
3745 else
3746 {
3747 poly_uint64 vf;
3748 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3749 vf = loop_vinfo->vectorization_factor;
3750 else
3751 vf = 1;
3752 unsigned int group_size = SLP_TREE_LANES (node);
3753 tree vectype = SLP_TREE_VECTYPE (node);
3754 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3755 = vect_get_num_vectors (vf * group_size, vectype);
3756 }
3757
3758 /* Handle purely internal nodes. */
3759 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3760 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3761
3762 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3763 if (is_a <bb_vec_info> (vinfo)
3764 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3765 {
3766 if (dump_enabled_p ())
3767 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3768 "desired vector type conflicts with earlier one "
3769 "for %G", stmt_info->stmt);
3770 return false;
3771 }
3772
3773 bool dummy;
3774 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3775 node, node_instance, cost_vec);
3776 }
3777
3778 /* Try to build NODE from scalars, returning true on success.
3779 NODE_INSTANCE is the SLP instance that contains NODE. */
3780
3781 static bool
3782 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3783 slp_instance node_instance)
3784 {
3785 stmt_vec_info stmt_info;
3786 unsigned int i;
3787
3788 if (!is_a <bb_vec_info> (vinfo)
3789 || node == SLP_INSTANCE_TREE (node_instance)
3790 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3791 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3792 return false;
3793
3794 if (dump_enabled_p ())
3795 dump_printf_loc (MSG_NOTE, vect_location,
3796 "Building vector operands of %p from scalars instead\n", node);
3797
3798 /* Don't remove and free the child nodes here, since they could be
3799 referenced by other structures. The analysis and scheduling phases
3800 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3801 unsigned int group_size = SLP_TREE_LANES (node);
3802 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3803 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3804 SLP_TREE_LOAD_PERMUTATION (node).release ();
3805 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3806 {
3807 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3808 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3809 }
3810 return true;
3811 }
3812
3813 /* Compute the prologue cost for invariant or constant operands represented
3814 by NODE. */
3815
3816 static void
3817 vect_prologue_cost_for_slp (slp_tree node,
3818 stmt_vector_for_cost *cost_vec)
3819 {
3820 /* There's a special case of an existing vector, that costs nothing. */
3821 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3822 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3823 return;
3824 /* Without looking at the actual initializer a vector of
3825      constants can be implemented as a load from the constant pool.
3826 When all elements are the same we can use a splat. */
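  /* E.g. (illustrative) a constant V4SI initializer { 1, 2, 3, 4 } is
     costed as a vector_load from the constant pool; an external one built
     from four distinct SSA names as a vec_construct, and an external one
     with all-equal elements as a scalar_to_vec splat.  */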
3827 tree vectype = SLP_TREE_VECTYPE (node);
3828 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3829 unsigned num_vects_to_check;
3830 unsigned HOST_WIDE_INT const_nunits;
3831 unsigned nelt_limit;
3832 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3833 && ! multiple_p (const_nunits, group_size))
3834 {
3835 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3836 nelt_limit = const_nunits;
3837 }
3838 else
3839 {
3840 /* If either the vector has variable length or the vectors
3841 are composed of repeated whole groups we only need to
3842 cost construction once. All vectors will be the same. */
3843 num_vects_to_check = 1;
3844 nelt_limit = group_size;
3845 }
3846 tree elt = NULL_TREE;
3847 unsigned nelt = 0;
3848 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3849 {
3850 unsigned si = j % group_size;
3851 if (nelt == 0)
3852 elt = SLP_TREE_SCALAR_OPS (node)[si];
3853 /* ??? We're just tracking whether all operands of a single
3854 vector initializer are the same, ideally we'd check if
3855 we emitted the same one already. */
3856 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3857 elt = NULL_TREE;
3858 nelt++;
3859 if (nelt == nelt_limit)
3860 {
3861 record_stmt_cost (cost_vec, 1,
3862 SLP_TREE_DEF_TYPE (node) == vect_external_def
3863 ? (elt ? scalar_to_vec : vec_construct)
3864 : vector_load,
3865 NULL, vectype, 0, vect_prologue);
3866 nelt = 0;
3867 }
3868 }
3869 }
3870
3871 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3872 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3873
3874 Return true if the operations are supported. */
3875
3876 static bool
3877 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3878 slp_instance node_instance,
3879 hash_set<slp_tree> &visited_set,
3880 vec<slp_tree> &visited_vec,
3881 stmt_vector_for_cost *cost_vec)
3882 {
3883 int i, j;
3884 slp_tree child;
3885
3886 /* Assume we can code-generate all invariants. */
3887 if (!node
3888 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
3889 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
3890 return true;
3891
3892 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
3893 {
3894 if (dump_enabled_p ())
3895 dump_printf_loc (MSG_NOTE, vect_location,
3896 "Failed cyclic SLP reference in %p\n", node);
3897 return false;
3898 }
3899 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
3900
3901 /* If we already analyzed the exact same set of scalar stmts we're done.
3902 We share the generated vector stmts for those. */
3903 if (visited_set.add (node))
3904 return true;
3905 visited_vec.safe_push (node);
3906
3907 bool res = true;
3908 unsigned visited_rec_start = visited_vec.length ();
3909 unsigned cost_vec_rec_start = cost_vec->length ();
3910 bool seen_non_constant_child = false;
3911 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3912 {
3913 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3914 visited_set, visited_vec,
3915 cost_vec);
3916 if (!res)
3917 break;
3918 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
3919 seen_non_constant_child = true;
3920 }
3921 /* We're having difficulties scheduling nodes with just constant
3922 operands and no scalar stmts since we then cannot compute a stmt
3923 insertion place. */
3924 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
3925 {
3926 if (dump_enabled_p ())
3927 dump_printf_loc (MSG_NOTE, vect_location,
3928 "Cannot vectorize all-constant op node %p\n", node);
3929 res = false;
3930 }
3931
3932 if (res)
3933 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3934 cost_vec);
3935 /* If analysis failed we have to pop all recursive visited nodes
3936 plus ourselves. */
3937 if (!res)
3938 {
3939 while (visited_vec.length () >= visited_rec_start)
3940 visited_set.remove (visited_vec.pop ());
3941 cost_vec->truncate (cost_vec_rec_start);
3942 }
3943
3944   /* When the node can be vectorized, cost the invariant nodes it references.
3945      This is not done in DFS order to allow the referring node's
3946      vectorizable_* calls to nail down the invariant nodes' vector type
3947      and possibly unshare it if it needs a different vector type than
3948      other referrers. */
3949 if (res)
3950 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3951 if (child
3952 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
3953 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3954 	/* Perform the usual caching; note that code-generation still
3955 code-gens these nodes multiple times but we expect
3956 to CSE them later. */
3957 && !visited_set.add (child))
3958 {
3959 visited_vec.safe_push (child);
3960 /* ??? After auditing more code paths make a "default"
3961 and push the vector type from NODE to all children
3962 if it is not already set. */
3963 /* Compute the number of vectors to be generated. */
3964 tree vector_type = SLP_TREE_VECTYPE (child);
3965 if (!vector_type)
3966 {
3967 /* For shifts with a scalar argument we don't need
3968 to cost or code-generate anything.
3969 	       ??? Represent this more explicitly. */
3970 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3971 == shift_vec_info_type)
3972 && j == 1);
3973 continue;
3974 }
3975 unsigned group_size = SLP_TREE_LANES (child);
3976 poly_uint64 vf = 1;
3977 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3978 vf = loop_vinfo->vectorization_factor;
3979 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3980 = vect_get_num_vectors (vf * group_size, vector_type);
3981 /* And cost them. */
3982 vect_prologue_cost_for_slp (child, cost_vec);
3983 }
3984
3985 /* If this node or any of its children can't be vectorized, try pruning
3986 the tree here rather than felling the whole thing. */
3987 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3988 {
3989 /* We'll need to revisit this for invariant costing and number
3990 of vectorized stmt setting. */
3991 res = true;
3992 }
3993
3994 return res;
3995 }
3996
3997
3998 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3999 region and that can be vectorized using vectorizable_live_operation
4000    with STMT_VINFO_LIVE_P. Live operations that cannot be handled cause the
4001    scalar code computing them to be retained. */
4002
4003 static void
4004 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
4005 slp_instance instance,
4006 stmt_vector_for_cost *cost_vec,
4007 hash_set<stmt_vec_info> &svisited,
4008 hash_set<slp_tree> &visited)
4009 {
4010 if (visited.add (node))
4011 return;
4012
4013 unsigned i;
4014 stmt_vec_info stmt_info;
4015 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
4016 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4017 {
4018 if (svisited.contains (stmt_info))
4019 continue;
4020 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4021 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
4022 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
4023 /* Only the pattern root stmt computes the original scalar value. */
4024 continue;
4025 bool mark_visited = true;
4026 gimple *orig_stmt = orig_stmt_info->stmt;
4027 ssa_op_iter op_iter;
4028 def_operand_p def_p;
4029 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4030 {
4031 imm_use_iterator use_iter;
4032 gimple *use_stmt;
4033 stmt_vec_info use_stmt_info;
4034 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4035 if (!is_gimple_debug (use_stmt))
4036 {
4037 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
4038 if (!use_stmt_info
4039 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4040 {
4041 STMT_VINFO_LIVE_P (stmt_info) = true;
4042 if (vectorizable_live_operation (bb_vinfo, stmt_info,
4043 NULL, node, instance, i,
4044 false, cost_vec))
4045 /* ??? So we know we can vectorize the live stmt
4046 from one SLP node. If we cannot do so from all
4047 or none consistently we'd have to record which
4048 SLP node (and lane) we want to use for the live
4049 operation. So make sure we can code-generate
4050 from all nodes. */
4051 mark_visited = false;
4052 else
4053 STMT_VINFO_LIVE_P (stmt_info) = false;
4054 break;
4055 }
4056 }
4057 /* We have to verify whether we can insert the lane extract
4058 before all uses. The following is a conservative approximation.
4059 We cannot put this into vectorizable_live_operation because
4060 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4061 doesn't work.
4062 	 Note that while the fact that we emit code for loads at the
4063 	 first load should make this a non-problem, leafs we construct
4064 	 from scalars are vectorized after the last scalar def.
4065 ??? If we'd actually compute the insert location during
4066 analysis we could use sth less conservative than the last
4067 scalar stmt in the node for the dominance check. */
4068 /* ??? What remains is "live" uses in vector CTORs in the same
4069 SLP graph which is where those uses can end up code-generated
4070 right after their definition instead of close to their original
4071 use. But that would restrict us to code-generate lane-extracts
4072 from the latest stmt in a node. So we compensate for this
4073 during code-generation, simply not replacing uses for those
4074 hopefully rare cases. */
4075 if (STMT_VINFO_LIVE_P (stmt_info))
4076 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4077 if (!is_gimple_debug (use_stmt)
4078 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
4079 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4080 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
4081 {
4082 if (dump_enabled_p ())
4083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4084 "Cannot determine insertion place for "
4085 "lane extract\n");
4086 STMT_VINFO_LIVE_P (stmt_info) = false;
4087 mark_visited = true;
4088 }
4089 }
4090 if (mark_visited)
4091 svisited.add (stmt_info);
4092 }
4093
4094 slp_tree child;
4095 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4096 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4097 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
4098 cost_vec, svisited, visited);
4099 }
4100
4101 /* Analyze statements in SLP instances of VINFO. Return true if the
4102 operations are supported. */
4103
4104 bool
4105 vect_slp_analyze_operations (vec_info *vinfo)
4106 {
4107 slp_instance instance;
4108 int i;
4109
4110 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4111
4112 hash_set<slp_tree> visited;
4113 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4114 {
4115 auto_vec<slp_tree> visited_vec;
4116 stmt_vector_for_cost cost_vec;
4117 cost_vec.create (2);
4118 if (is_a <bb_vec_info> (vinfo))
4119 vect_location = instance->location ();
4120 if (!vect_slp_analyze_node_operations (vinfo,
4121 SLP_INSTANCE_TREE (instance),
4122 instance, visited, visited_vec,
4123 &cost_vec)
4124 /* Instances with a root stmt require vectorized defs for the
4125 SLP tree root. */
4126 || (SLP_INSTANCE_ROOT_STMT (instance)
4127 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
4128 != vect_internal_def)))
4129 {
4130 slp_tree node = SLP_INSTANCE_TREE (instance);
4131 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4132 if (dump_enabled_p ())
4133 dump_printf_loc (MSG_NOTE, vect_location,
4134 "removing SLP instance operations starting from: %G",
4135 stmt_info->stmt);
4136 vect_free_slp_instance (instance);
4137 vinfo->slp_instances.ordered_remove (i);
4138 cost_vec.release ();
4139 while (!visited_vec.is_empty ())
4140 visited.remove (visited_vec.pop ());
4141 }
4142 else
4143 {
4144 i++;
4145
4146 /* For BB vectorization remember the SLP graph entry
4147 cost for later. */
4148 if (is_a <bb_vec_info> (vinfo))
4149 instance->cost_vec = cost_vec;
4150 else
4151 {
4152 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
4153 cost_vec.release ();
4154 }
4155 }
4156 }
4157
4158 /* Compute vectorizable live stmts. */
4159 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
4160 {
4161 hash_set<stmt_vec_info> svisited;
4162 hash_set<slp_tree> visited;
4163 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4164 {
4165 vect_location = instance->location ();
4166 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
4167 instance, &instance->cost_vec, svisited,
4168 visited);
4169 }
4170 }
4171
4172 return !vinfo->slp_instances.is_empty ();
4173 }
4174
4175 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
4176    transitively compressing any leader chain on the way. */
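/* For instance (illustrative): with a recorded leader chain A -> B -> C,
   calling this on A returns C and rewrites the map so that both A and B
   point directly at C afterwards.  */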
4177
4178 static slp_instance
4179 get_ultimate_leader (slp_instance instance,
4180 hash_map<slp_instance, slp_instance> &instance_leader)
4181 {
4182 auto_vec<slp_instance *, 8> chain;
4183 slp_instance *tem;
4184 while (*(tem = instance_leader.get (instance)) != instance)
4185 {
4186 chain.safe_push (tem);
4187 instance = *tem;
4188 }
4189 while (!chain.is_empty ())
4190 *chain.pop () = instance;
4191 return instance;
4192 }
4193
4194 /* Worker of vect_bb_partition_graph, recurse on NODE. */
4195
4196 static void
4197 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
4198 slp_instance instance, slp_tree node,
4199 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
4200 hash_map<slp_instance, slp_instance> &instance_leader,
4201 hash_set<slp_tree> &visited)
4202 {
4203 stmt_vec_info stmt_info;
4204 unsigned i;
4205
4206 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4207 {
4208 bool existed_p;
4209 slp_instance &stmt_instance
4210 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
4211 if (!existed_p)
4212 ;
4213 else if (stmt_instance != instance)
4214 {
4215 	  /* If we run into a previously marked stmt, make the current instance
4216 	     the leader of that stmt's ultimate leader. This keeps the
4217 leader chain acyclic and works even when the current instance
4218 connects two previously independent graph parts. */
4219 slp_instance stmt_leader
4220 = get_ultimate_leader (stmt_instance, instance_leader);
4221 if (stmt_leader != instance)
4222 instance_leader.put (stmt_leader, instance);
4223 }
4224 stmt_instance = instance;
4225 }
4226
4227 if (visited.add (node))
4228 return;
4229
4230 slp_tree child;
4231 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4232 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4233 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
4234 instance_leader, visited);
4235 }
4236
4237 /* Partition the SLP graph into pieces that can be costed independently. */
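/* For example (illustrative): if instances I1 and I2 share a scalar stmt
   they end up in the same subgraph and are costed together, while an
   unrelated instance I3 forms its own subgraph_entries set.  */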
4238
4239 static void
4240 vect_bb_partition_graph (bb_vec_info bb_vinfo)
4241 {
4242 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
4243
4244 /* First walk the SLP graph assigning each involved scalar stmt a
4245 corresponding SLP graph entry and upon visiting a previously
4246      marked stmt, make the stmt's leader the current SLP graph entry. */
4247 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
4248 hash_map<slp_instance, slp_instance> instance_leader;
4249 hash_set<slp_tree> visited;
4250 slp_instance instance;
4251 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4252 {
4253 instance_leader.put (instance, instance);
4254 vect_bb_partition_graph_r (bb_vinfo,
4255 instance, SLP_INSTANCE_TREE (instance),
4256 stmt_to_instance, instance_leader,
4257 visited);
4258 }
4259
4260 /* Then collect entries to each independent subgraph. */
4261 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4262 {
4263 slp_instance leader = get_ultimate_leader (instance, instance_leader);
4264 leader->subgraph_entries.safe_push (instance);
4265 if (dump_enabled_p ()
4266 && leader != instance)
4267 dump_printf_loc (MSG_NOTE, vect_location,
4268 "instance %p is leader of %p\n",
4269 leader, instance);
4270 }
4271 }
4272
4273 /* Compute the scalar cost of the SLP node NODE and its children
4274    and record it in COST_VEC. Do not account defs that are marked in LIFE
4275    and update LIFE according to uses of NODE. */
4276
4277 static void
4278 vect_bb_slp_scalar_cost (vec_info *vinfo,
4279 slp_tree node, vec<bool, va_heap> *life,
4280 stmt_vector_for_cost *cost_vec,
4281 hash_set<slp_tree> &visited)
4282 {
4283 unsigned i;
4284 stmt_vec_info stmt_info;
4285 slp_tree child;
4286
4287 if (visited.add (node))
4288 return;
4289
4290 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4291 {
4292 ssa_op_iter op_iter;
4293 def_operand_p def_p;
4294
4295 if ((*life)[i])
4296 continue;
4297
4298 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4299 gimple *orig_stmt = orig_stmt_info->stmt;
4300
4301 /* If there is a non-vectorized use of the defs then the scalar
4302 stmt is kept live in which case we do not account it or any
4303 required defs in the SLP children in the scalar cost. This
4304 way we make the vectorization more costly when compared to
4305 the scalar cost. */
4306 if (!STMT_VINFO_LIVE_P (stmt_info))
4307 {
4308 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4309 {
4310 imm_use_iterator use_iter;
4311 gimple *use_stmt;
4312 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4313 if (!is_gimple_debug (use_stmt))
4314 {
4315 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
4316 if (!use_stmt_info
4317 || !PURE_SLP_STMT
4318 (vect_stmt_to_vectorize (use_stmt_info)))
4319 {
4320 (*life)[i] = true;
4321 break;
4322 }
4323 }
4324 }
4325 if ((*life)[i])
4326 continue;
4327 }
4328
4329 /* Count scalar stmts only once. */
4330 if (gimple_visited_p (orig_stmt))
4331 continue;
4332 gimple_set_visited (orig_stmt, true);
4333
4334 vect_cost_for_stmt kind;
4335 if (STMT_VINFO_DATA_REF (orig_stmt_info))
4336 {
4337 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
4338 kind = scalar_load;
4339 else
4340 kind = scalar_store;
4341 }
4342 else if (vect_nop_conversion_p (orig_stmt_info))
4343 continue;
4344       /* For single-argument PHIs assume coalescing, which means zero cost
4345 for the scalar and the vector PHIs. This avoids artificially
4346 favoring the vector path (but may pessimize it in some cases). */
4347 else if (is_a <gphi *> (orig_stmt_info->stmt)
4348 && gimple_phi_num_args
4349 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
4350 continue;
4351 else
4352 kind = scalar_stmt;
4353 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
4354 SLP_TREE_VECTYPE (node), 0, vect_body);
4355 }
4356
4357 auto_vec<bool, 20> subtree_life;
4358 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4359 {
4360 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4361 {
4362 /* Do not directly pass LIFE to the recursive call, copy it to
4363 confine changes in the callee to the current child/subtree. */
4364 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4365 {
4366 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
4367 for (unsigned j = 0;
4368 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
4369 {
4370 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
4371 if (perm.first == i)
4372 subtree_life[perm.second] = (*life)[j];
4373 }
4374 }
4375 else
4376 {
4377 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
4378 subtree_life.safe_splice (*life);
4379 }
4380 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
4381 visited);
4382 subtree_life.truncate (0);
4383 }
4384 }
4385 }
4386
4387 /* Comparator for the loop-index sorted cost vectors. */
4388
4389 static int
4390 li_cost_vec_cmp (const void *a_, const void *b_)
4391 {
4392 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
4393 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
4394 if (a->first < b->first)
4395 return -1;
4396 else if (a->first == b->first)
4397 return 0;
4398 return 1;
4399 }
4400
4401 /* Check if vectorization of the basic block is profitable for the
4402 subgraph denoted by SLP_INSTANCES. */
4403
4404 static bool
4405 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
4406 vec<slp_instance> slp_instances)
4407 {
4408 slp_instance instance;
4409 int i;
4410 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
4411 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
4412
4413 if (dump_enabled_p ())
4414 {
4415 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
4416 hash_set<slp_tree> visited;
4417 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4418 vect_print_slp_graph (MSG_NOTE, vect_location,
4419 SLP_INSTANCE_TREE (instance), visited);
4420 }
4421
4422 /* Calculate scalar cost and sum the cost for the vector stmts
4423 previously collected. */
4424 stmt_vector_for_cost scalar_costs = vNULL;
4425 stmt_vector_for_cost vector_costs = vNULL;
4426 hash_set<slp_tree> visited;
4427 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4428 {
4429 auto_vec<bool, 20> life;
4430 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
4431 true);
4432 if (SLP_INSTANCE_ROOT_STMT (instance))
4433 record_stmt_cost (&scalar_costs, 1, scalar_stmt,
4434 SLP_INSTANCE_ROOT_STMT (instance), 0, vect_body);
4435 vect_bb_slp_scalar_cost (bb_vinfo,
4436 SLP_INSTANCE_TREE (instance),
4437 &life, &scalar_costs, visited);
4438 vector_costs.safe_splice (instance->cost_vec);
4439 instance->cost_vec.release ();
4440 }
4441 /* Unset visited flag. */
4442 stmt_info_for_cost *cost;
4443 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
4444 gimple_set_visited (cost->stmt_info->stmt, false);
4445
4446 if (dump_enabled_p ())
4447 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4448
4449 /* When costing non-loop vectorization we need to consider each covered
4450 loop independently and make sure vectorization is profitable. For
4451      now we assume a loop may not be entered or may execute an arbitrary
4452      number of iterations (??? static information can provide more
4453      precise info here), which means we can simply cost each containing
4454      loop's stmts separately. */
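  /* E.g. (illustrative) if the region spans a preheader block and the body
     of an inner loop, the inner-loop scalar and vector costs are compared
     against each other and the preheader parts likewise, rather than
     summing everything up into one global comparison.  */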
4455
4456 /* First produce cost vectors sorted by loop index. */
4457 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
4458 li_scalar_costs (scalar_costs.length ());
4459 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
4460 li_vector_costs (vector_costs.length ());
4461 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
4462 {
4463 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
4464 li_scalar_costs.quick_push (std::make_pair (l, cost));
4465 }
4466   /* Use an arbitrarily chosen covered loop as fallback in case the first
4467      vector_costs entry does not have a stmt_info associated with it. */
4468 unsigned l = li_scalar_costs[0].first;
4469 FOR_EACH_VEC_ELT (vector_costs, i, cost)
4470 {
4471       /* We inherit the loop from the previous COST; invariants, externals and
4472 	 extracts immediately follow the cost for the related stmt. */
4473 if (cost->stmt_info)
4474 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
4475 li_vector_costs.quick_push (std::make_pair (l, cost));
4476 }
4477 li_scalar_costs.qsort (li_cost_vec_cmp);
4478 li_vector_costs.qsort (li_cost_vec_cmp);
4479
4480 /* Now cost the portions individually. */
4481 unsigned vi = 0;
4482 unsigned si = 0;
4483 while (si < li_scalar_costs.length ()
4484 && vi < li_vector_costs.length ())
4485 {
4486 unsigned sl = li_scalar_costs[si].first;
4487 unsigned vl = li_vector_costs[vi].first;
4488 if (sl != vl)
4489 {
4490 if (dump_enabled_p ())
4491 dump_printf_loc (MSG_NOTE, vect_location,
4492 "Scalar %d and vector %d loop part do not "
4493 "match up, skipping scalar part\n", sl, vl);
4494 /* Skip the scalar part, assuming zero cost on the vector side. */
4495 do
4496 {
4497 si++;
4498 }
4499 while (si < li_scalar_costs.length ()
4500 && li_scalar_costs[si].first == sl);
4501 continue;
4502 }
4503
4504 void *scalar_target_cost_data = init_cost (NULL);
4505 do
4506 {
4507 add_stmt_cost (bb_vinfo, scalar_target_cost_data,
4508 li_scalar_costs[si].second);
4509 si++;
4510 }
4511 while (si < li_scalar_costs.length ()
4512 && li_scalar_costs[si].first == sl);
4513 unsigned dummy;
4514 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
4515 destroy_cost_data (scalar_target_cost_data);
4516
4517 /* Complete the target-specific vector cost calculation. */
4518 void *vect_target_cost_data = init_cost (NULL);
4519 do
4520 {
4521 add_stmt_cost (bb_vinfo, vect_target_cost_data,
4522 li_vector_costs[vi].second);
4523 vi++;
4524 }
4525 while (vi < li_vector_costs.length ()
4526 && li_vector_costs[vi].first == vl);
4527 finish_cost (vect_target_cost_data, &vec_prologue_cost,
4528 &vec_inside_cost, &vec_epilogue_cost);
4529 destroy_cost_data (vect_target_cost_data);
4530
4531 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
4532
4533 if (dump_enabled_p ())
4534 {
4535 dump_printf_loc (MSG_NOTE, vect_location,
4536 "Cost model analysis for part in loop %d:\n", sl);
4537 dump_printf (MSG_NOTE, " Vector cost: %d\n",
4538 vec_inside_cost + vec_outside_cost);
4539 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
4540 }
4541
4542       /* Vectorization is profitable if its cost is not more than the cost of
4543 	 the scalar version. Note that we err on the vector side for equal cost
4544 	 because the cost estimate is otherwise quite pessimistic (constant uses
4545 	 are free on the scalar side but cost a load on the vector side, for
4546 	 example). */
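      /* As a purely illustrative data point: vec_inside_cost == 4,
	 vec_outside_cost == 2 and scalar_cost == 6 is still accepted here,
	 while a scalar_cost of 5 would reject the subgraph.  */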
4547 if (vec_outside_cost + vec_inside_cost > scalar_cost)
4548 {
4549 scalar_costs.release ();
4550 vector_costs.release ();
4551 return false;
4552 }
4553 }
4554 if (vi < li_vector_costs.length ())
4555 {
4556 if (dump_enabled_p ())
4557 dump_printf_loc (MSG_NOTE, vect_location,
4558 "Excess vector cost for part in loop %d:\n",
4559 li_vector_costs[vi].first);
4560 scalar_costs.release ();
4561 vector_costs.release ();
4562 return false;
4563 }
4564
4565 scalar_costs.release ();
4566 vector_costs.release ();
4567 return true;
4568 }
4569
4570 /* qsort comparator for lane defs. */
4571
4572 static int
4573 vld_cmp (const void *a_, const void *b_)
4574 {
4575 auto *a = (const std::pair<unsigned, tree> *)a_;
4576 auto *b = (const std::pair<unsigned, tree> *)b_;
4577 return a->first - b->first;
4578 }
4579
4580 /* Return true if USE_STMT is a vector lane insert into VEC and set
4581 *THIS_LANE to the lane number that is set. */
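/* E.g. (illustrative gimple) for a V4SI vector
     vec_1 = BIT_INSERT_EXPR <vec_0, s_2, 64>;
   is such an insert, setting *THIS_LANE to 2 since the bit position 64
   divided by the 32-bit element size selects lane 2.  */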
4582
4583 static bool
4584 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
4585 {
4586 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
4587 if (!use_ass
4588 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
4589 || (vec
4590 ? gimple_assign_rhs1 (use_ass) != vec
4591 : ((vec = gimple_assign_rhs1 (use_ass)), false))
4592 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
4593 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
4594 || !constant_multiple_p
4595 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
4596 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
4597 this_lane))
4598 return false;
4599 return true;
4600 }
4601
4602 /* Find any vectorizable constructors and add them to the grouped_stores
4603    array. */
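/* Two illustrative shapes we look for below: a vector CONSTRUCTOR built
   from SSA names defined in the region,
     v_1 = { a_2, b_3, c_4, d_5 };
   and a chain of BIT_INSERT_EXPRs filling a vector lane by lane, starting
   from lane zero.  */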
4604
4605 static void
4606 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
4607 {
4608 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
4609 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
4610 !gsi_end_p (gsi); gsi_next (&gsi))
4611 {
4612 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
4613 if (!assign)
4614 continue;
4615
4616 tree rhs = gimple_assign_rhs1 (assign);
4617 if (gimple_assign_rhs_code (assign) == CONSTRUCTOR)
4618 {
4619 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
4620 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
4621 CONSTRUCTOR_NELTS (rhs))
4622 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
4623 || uniform_vector_p (rhs))
4624 continue;
4625
4626 unsigned j;
4627 tree val;
4628 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
4629 if (TREE_CODE (val) != SSA_NAME
4630 || !bb_vinfo->lookup_def (val))
4631 break;
4632 if (j != CONSTRUCTOR_NELTS (rhs))
4633 continue;
4634
4635 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
4636 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
4637 }
4638 else if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR
4639 && VECTOR_TYPE_P (TREE_TYPE (rhs))
4640 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
4641 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
4642 && integer_zerop (gimple_assign_rhs3 (assign))
4643 && useless_type_conversion_p
4644 (TREE_TYPE (TREE_TYPE (rhs)),
4645 TREE_TYPE (gimple_assign_rhs2 (assign)))
4646 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
4647 {
4648 /* We start to match on insert to lane zero but since the
4649 inserts need not be ordered we'd have to search both
4650 the def and the use chains. */
4651 tree vectype = TREE_TYPE (rhs);
4652 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4653 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
4654 auto_sbitmap lanes (nlanes);
4655 bitmap_clear (lanes);
4656 bitmap_set_bit (lanes, 0);
4657 tree def = gimple_assign_lhs (assign);
4658 lane_defs.quick_push
4659 (std::make_pair (0, gimple_assign_rhs2 (assign)));
4660 unsigned lanes_found = 1;
4661 /* Start with the use chains, the last stmt will be the root. */
4662 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
4663 do
4664 {
4665 use_operand_p use_p;
4666 gimple *use_stmt;
4667 if (!single_imm_use (def, &use_p, &use_stmt))
4668 break;
4669 unsigned this_lane;
4670 if (!bb_vinfo->lookup_stmt (use_stmt)
4671 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
4672 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
4673 break;
4674 if (bitmap_bit_p (lanes, this_lane))
4675 break;
4676 lanes_found++;
4677 bitmap_set_bit (lanes, this_lane);
4678 gassign *use_ass = as_a <gassign *> (use_stmt);
4679 lane_defs.quick_push (std::make_pair
4680 (this_lane, gimple_assign_rhs2 (use_ass)));
4681 last = bb_vinfo->lookup_stmt (use_ass);
4682 def = gimple_assign_lhs (use_ass);
4683 }
4684 while (lanes_found < nlanes);
4685 if (lanes_found < nlanes)
4686 {
4687 /* Now search the def chain. */
4688 def = gimple_assign_rhs1 (assign);
4689 do
4690 {
4691 if (TREE_CODE (def) != SSA_NAME
4692 || !has_single_use (def))
4693 break;
4694 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
4695 unsigned this_lane;
4696 if (!bb_vinfo->lookup_stmt (def_stmt)
4697 || !vect_slp_is_lane_insert (def_stmt,
4698 NULL_TREE, &this_lane)
4699 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
4700 break;
4701 if (bitmap_bit_p (lanes, this_lane))
4702 break;
4703 lanes_found++;
4704 bitmap_set_bit (lanes, this_lane);
4705 lane_defs.quick_push (std::make_pair
4706 (this_lane,
4707 gimple_assign_rhs2 (def_stmt)));
4708 def = gimple_assign_rhs1 (def_stmt);
4709 }
4710 while (lanes_found < nlanes);
4711 }
4712 if (lanes_found == nlanes)
4713 {
4714 	      /* Sort lane_defs by lane index and register the root. */
4715 lane_defs.qsort (vld_cmp);
4716 vec<stmt_vec_info> stmts;
4717 stmts.create (nlanes);
4718 for (unsigned i = 0; i < nlanes; ++i)
4719 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
4720 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
4721 stmts, last));
4722 }
4723 }
4724 }
4725 }
4726
4727 /* Walk the grouped store chains and replace entries with their
4728 pattern variant if any. */
4729
4730 static void
4731 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
4732 {
4733 stmt_vec_info first_element;
4734 unsigned i;
4735
4736 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4737 {
4738 /* We also have CTORs in this array. */
4739 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
4740 continue;
4741 if (STMT_VINFO_IN_PATTERN_P (first_element))
4742 {
4743 stmt_vec_info orig = first_element;
4744 first_element = STMT_VINFO_RELATED_STMT (first_element);
4745 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
4746 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
4747 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
4748 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
4749 vinfo->grouped_stores[i] = first_element;
4750 }
4751 stmt_vec_info prev = first_element;
4752 while (DR_GROUP_NEXT_ELEMENT (prev))
4753 {
4754 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
4755 if (STMT_VINFO_IN_PATTERN_P (elt))
4756 {
4757 stmt_vec_info orig = elt;
4758 elt = STMT_VINFO_RELATED_STMT (elt);
4759 DR_GROUP_NEXT_ELEMENT (prev) = elt;
4760 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
4761 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
4762 }
4763 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
4764 prev = elt;
4765 }
4766 }
4767 }
4768
4769 /* Check if the region described by BB_VINFO can be vectorized, returning
4770 true if so. When returning false, set FATAL to true if the same failure
4771 would prevent vectorization at other vector sizes, false if it is still
4772 worth trying other sizes. N_STMTS is the number of statements in the
4773 region. */
4774
4775 static bool
4776 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
4777 vec<int> *dataref_groups)
4778 {
4779 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4780
4781 slp_instance instance;
4782 int i;
4783 poly_uint64 min_vf = 2;
4784
4785 /* The first group of checks is independent of the vector size. */
4786 fatal = true;
4787
4788 /* Analyze the data references. */
4789
4790 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
4791 {
4792 if (dump_enabled_p ())
4793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4794 "not vectorized: unhandled data-ref in basic "
4795 "block.\n");
4796 return false;
4797 }
4798
4799 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
4800 {
4801 if (dump_enabled_p ())
4802 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4803 "not vectorized: unhandled data access in "
4804 "basic block.\n");
4805 return false;
4806 }
4807
4808 vect_slp_check_for_constructors (bb_vinfo);
4809
4810 /* If there are no grouped stores and no constructors in the region
4811 there is no need to continue with pattern recog as vect_analyze_slp
4812 will fail anyway. */
4813 if (bb_vinfo->grouped_stores.is_empty ()
4814 && bb_vinfo->roots.is_empty ())
4815 {
4816 if (dump_enabled_p ())
4817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4818 "not vectorized: no grouped stores in "
4819 "basic block.\n");
4820 return false;
4821 }
4822
4823 /* While the rest of the analysis below depends on it in some way. */
4824 fatal = false;
4825
4826 vect_pattern_recog (bb_vinfo);
4827
4828 /* Update store groups from pattern processing. */
4829 vect_fixup_store_groups_with_patterns (bb_vinfo);
4830
4831 /* Check the SLP opportunities in the basic block, analyze and build SLP
4832 trees. */
4833 if (!vect_analyze_slp (bb_vinfo, n_stmts))
4834 {
4835 if (dump_enabled_p ())
4836 {
4837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4838 "Failed to SLP the basic block.\n");
4839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4840 "not vectorized: failed to find SLP opportunities "
4841 "in basic block.\n");
4842 }
4843 return false;
4844 }
4845
4846 /* Optimize permutations. */
4847 vect_optimize_slp (bb_vinfo);
4848
4849 /* Gather the loads reachable from the SLP graph entries. */
4850 vect_gather_slp_loads (bb_vinfo);
4851
4852 vect_record_base_alignments (bb_vinfo);
4853
4854 /* Analyze and verify the alignment of data references and the
4855 dependence in the SLP instances. */
4856 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
4857 {
4858 vect_location = instance->location ();
4859 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
4860 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
4861 {
4862 slp_tree node = SLP_INSTANCE_TREE (instance);
4863 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4864 if (dump_enabled_p ())
4865 dump_printf_loc (MSG_NOTE, vect_location,
4866 "removing SLP instance operations starting from: %G",
4867 stmt_info->stmt);
4868 vect_free_slp_instance (instance);
4869 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
4870 continue;
4871 }
4872
4873 /* Mark all the statements that we want to vectorize as pure SLP and
4874 relevant. */
4875 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4876 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
4877 if (stmt_vec_info root = SLP_INSTANCE_ROOT_STMT (instance))
4878 {
4879 STMT_SLP_TYPE (root) = pure_slp;
4880 if (is_gimple_assign (root->stmt)
4881 && gimple_assign_rhs_code (root->stmt) == BIT_INSERT_EXPR)
4882 {
4883 /* ??? We should probably record the whole vector of
4884 root stmts so we do not have to back-track here... */
4885 for (unsigned n = SLP_TREE_LANES (SLP_INSTANCE_TREE (instance));
4886 n != 1; --n)
4887 {
4888 root = bb_vinfo->lookup_def (gimple_assign_rhs1 (root->stmt));
4889 STMT_SLP_TYPE (root) = pure_slp;
4890 }
4891 }
4892 }
4893
4894 i++;
4895 }
4896 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
4897 return false;
4898
4899 if (!vect_slp_analyze_operations (bb_vinfo))
4900 {
4901 if (dump_enabled_p ())
4902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4903 "not vectorized: bad operation in basic block.\n");
4904 return false;
4905 }
4906
4907 vect_bb_partition_graph (bb_vinfo);
4908
4909 return true;
4910 }
4911
4912 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4913 basic blocks in BBS, returning true on success.
4914 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4915
4916 static bool
4917 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
4918 vec<int> *dataref_groups, unsigned int n_stmts)
4919 {
4920 bb_vec_info bb_vinfo;
4921 auto_vector_modes vector_modes;
4922
4923 /* Autodetect first vector size we try. */
4924 machine_mode next_vector_mode = VOIDmode;
4925 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
4926 unsigned int mode_i = 0;
4927
4928 vec_info_shared shared;
4929
4930 machine_mode autodetected_vector_mode = VOIDmode;
4931 while (1)
4932 {
4933 bool vectorized = false;
4934 bool fatal = false;
4935 bb_vinfo = new _bb_vec_info (bbs, &shared);
4936
4937 bool first_time_p = shared.datarefs.is_empty ();
4938 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
4939 if (first_time_p)
4940 bb_vinfo->shared->save_datarefs ();
4941 else
4942 bb_vinfo->shared->check_datarefs ();
4943 bb_vinfo->vector_mode = next_vector_mode;
4944
4945 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
4946 {
4947 if (dump_enabled_p ())
4948 {
4949 dump_printf_loc (MSG_NOTE, vect_location,
4950 "***** Analysis succeeded with vector mode"
4951 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4952 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4953 }
4954
4955 bb_vinfo->shared->check_datarefs ();
4956
4957 unsigned i;
4958 slp_instance instance;
4959 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4960 {
4961 if (instance->subgraph_entries.is_empty ())
4962 continue;
4963
4964 vect_location = instance->location ();
4965 if (!unlimited_cost_model (NULL)
4966 && !vect_bb_vectorization_profitable_p
4967 (bb_vinfo, instance->subgraph_entries))
4968 {
4969 if (dump_enabled_p ())
4970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4971 "not vectorized: vectorization is not "
4972 "profitable.\n");
4973 continue;
4974 }
4975
4976 if (!dbg_cnt (vect_slp))
4977 continue;
4978
4979 if (!vectorized && dump_enabled_p ())
4980 dump_printf_loc (MSG_NOTE, vect_location,
4981 "Basic block will be vectorized "
4982 "using SLP\n");
4983 vectorized = true;
4984
4985 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4986
4987 unsigned HOST_WIDE_INT bytes;
4988 if (dump_enabled_p ())
4989 {
4990 if (GET_MODE_SIZE
4991 (bb_vinfo->vector_mode).is_constant (&bytes))
4992 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4993 "basic block part vectorized using %wu "
4994 "byte vectors\n", bytes);
4995 else
4996 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4997 "basic block part vectorized using "
4998 "variable length vectors\n");
4999 }
5000 }
5001 }
5002 else
5003 {
5004 if (dump_enabled_p ())
5005 dump_printf_loc (MSG_NOTE, vect_location,
5006 "***** Analysis failed with vector mode %s\n",
5007 GET_MODE_NAME (bb_vinfo->vector_mode));
5008 }
5009
5010 if (mode_i == 0)
5011 autodetected_vector_mode = bb_vinfo->vector_mode;
5012
5013 if (!fatal)
5014 while (mode_i < vector_modes.length ()
5015 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
5016 {
5017 if (dump_enabled_p ())
5018 dump_printf_loc (MSG_NOTE, vect_location,
5019 "***** The result for vector mode %s would"
5020 " be the same\n",
5021 GET_MODE_NAME (vector_modes[mode_i]));
5022 mode_i += 1;
5023 }
5024
5025 delete bb_vinfo;
5026
5027 if (mode_i < vector_modes.length ()
5028 && VECTOR_MODE_P (autodetected_vector_mode)
5029 && (related_vector_mode (vector_modes[mode_i],
5030 GET_MODE_INNER (autodetected_vector_mode))
5031 == autodetected_vector_mode)
5032 && (related_vector_mode (autodetected_vector_mode,
5033 GET_MODE_INNER (vector_modes[mode_i]))
5034 == vector_modes[mode_i]))
5035 {
5036 if (dump_enabled_p ())
5037 dump_printf_loc (MSG_NOTE, vect_location,
5038 "***** Skipping vector mode %s, which would"
5039 " repeat the analysis for %s\n",
5040 GET_MODE_NAME (vector_modes[mode_i]),
5041 GET_MODE_NAME (autodetected_vector_mode));
5042 mode_i += 1;
5043 }
5044
5045 if (vectorized
5046 || mode_i == vector_modes.length ()
5047 || autodetected_vector_mode == VOIDmode
5048 /* If vect_slp_analyze_bb_1 signaled that analysis for all
5049 vector sizes will fail do not bother iterating. */
5050 || fatal)
5051 return vectorized;
5052
5053 /* Try the next biggest vector size. */
5054 next_vector_mode = vector_modes[mode_i++];
5055 if (dump_enabled_p ())
5056 dump_printf_loc (MSG_NOTE, vect_location,
5057 "***** Re-trying analysis with vector mode %s\n",
5058 GET_MODE_NAME (next_vector_mode));
5059 }
5060 }
5061
5062
5063 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
5064 true if anything in the basic-block was vectorized. */
5065
5066 static bool
5067 vect_slp_bbs (vec<basic_block> bbs)
5068 {
5069 vec<data_reference_p> datarefs = vNULL;
5070 auto_vec<int> dataref_groups;
5071 int insns = 0;
5072 int current_group = 0;
5073
5074 for (unsigned i = 0; i < bbs.length (); i++)
5075 {
5076 basic_block bb = bbs[i];
5077 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
5078 gsi_next (&gsi))
5079 {
5080 gimple *stmt = gsi_stmt (gsi);
5081 if (is_gimple_debug (stmt))
5082 continue;
5083
5084 insns++;
5085
5086 if (gimple_location (stmt) != UNKNOWN_LOCATION)
5087 vect_location = stmt;
5088
5089 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
5090 &dataref_groups, current_group))
5091 ++current_group;
5092 }
5093 }
5094
5095 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
5096 }
5097
5098 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5099 true if anything in the basic-block was vectorized. */
5100
5101 bool
5102 vect_slp_bb (basic_block bb)
5103 {
5104 auto_vec<basic_block> bbs;
5105 bbs.safe_push (bb);
5106 return vect_slp_bbs (bbs);
5107 }
5108
5109 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5110 true if anything in the basic-block was vectorized. */
5111
5112 bool
5113 vect_slp_function (function *fun)
5114 {
5115 bool r = false;
5116 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
5117 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
5118
5119 /* For the moment split the function into pieces to avoid making
5120 the iteration on the vector mode moot. Split at points we know
5121      to not handle well, which are CFG merges (SLP discovery doesn't
5122      handle non-loop-header PHIs) and loop exits. Since pattern
5123      recog requires reverse iteration to visit uses before defs,
5124      simply chop the RPO into pieces. */
5125 auto_vec<basic_block> bbs;
5126 for (unsigned i = 0; i < n; i++)
5127 {
5128 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
5129 bool split = false;
5130
5131 /* Split when a BB is not dominated by the first block. */
5132 if (!bbs.is_empty ()
5133 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
5134 {
5135 if (dump_enabled_p ())
5136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5137 "splitting region at dominance boundary bb%d\n",
5138 bb->index);
5139 split = true;
5140 }
5141 /* Split when the loop determined by the first block
5142 is exited. This is because we eventually insert
5143 	 invariants at the region's beginning. */
5144 else if (!bbs.is_empty ()
5145 && bbs[0]->loop_father != bb->loop_father
5146 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
5147 {
5148 if (dump_enabled_p ())
5149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5150 "splitting region at loop %d exit at bb%d\n",
5151 bbs[0]->loop_father->num, bb->index);
5152 split = true;
5153 }
5154
5155 if (split && !bbs.is_empty ())
5156 {
5157 r |= vect_slp_bbs (bbs);
5158 bbs.truncate (0);
5159 bbs.quick_push (bb);
5160 }
5161 else
5162 bbs.safe_push (bb);
5163
5164       /* When we have a stmt that ends this block and defines a value,
5165 	 inserting after it for a vector containing its definition would
5166 	 require inserting on edges. Avoid this for now. */
5167 if (gimple *last = last_stmt (bb))
5168 if (gimple_get_lhs (last)
5169 && is_ctrl_altering_stmt (last))
5170 {
5171 if (dump_enabled_p ())
5172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5173 "splitting region at control altering "
5174 "definition %G", last);
5175 r |= vect_slp_bbs (bbs);
5176 bbs.truncate (0);
5177 }
5178 }
5179
5180 if (!bbs.is_empty ())
5181 r |= vect_slp_bbs (bbs);
5182
5183 free (rpo);
5184
5185 return r;
5186 }
5187
5188 /* Build a variable-length vector in which the elements in ELTS are repeated
5189    to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
5190 RESULTS and add any new instructions to SEQ.
5191
5192 The approach we use is:
5193
5194 (1) Find a vector mode VM with integer elements of mode IM.
5195
5196 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5197 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
5198 from small vectors to IM.
5199
5200 (3) Duplicate each ELTS'[I] into a vector of mode VM.
5201
5202 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
5203 correct byte contents.
5204
5205 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
5206
5207 We try to find the largest IM for which this sequence works, in order
5208 to cut down on the number of interleaves. */
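/* For example (a sketch; the mode IM actually chosen depends on the
   target via can_duplicate_and_interleave_p): with ELTS = { a, b, c, d }
   and an IM twice the element width, step (2) creates two IM-mode
   values by view-converting the small vectors { a, b } and { c, d },
   step (3) duplicates each across a VM vector, a single interleaving
   VEC_PERM_EXPR in step (4) then yields VM vectors whose byte contents
   are a, b, c, d, a, b, c, d, ... and step (5) view-converts those
   to VECTOR_TYPE. */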
5209
5210 void
5211 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
5212 vec<tree> elts, unsigned int nresults,
5213 vec<tree> &results)
5214 {
5215 unsigned int nelts = elts.length ();
5216 tree element_type = TREE_TYPE (vector_type);
5217
5218 /* (1) Find a vector mode VM with integer elements of mode IM. */
5219 unsigned int nvectors = 1;
5220 tree new_vector_type;
5221 tree permutes[2];
5222 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
5223 &nvectors, &new_vector_type,
5224 permutes))
5225 gcc_unreachable ();
5226
5227 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
5228 unsigned int partial_nelts = nelts / nvectors;
5229 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
5230
5231 tree_vector_builder partial_elts;
5232 auto_vec<tree, 32> pieces (nvectors * 2);
5233 pieces.quick_grow_cleared (nvectors * 2);
5234 for (unsigned int i = 0; i < nvectors; ++i)
5235 {
5236 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5237 ELTS' has mode IM. */
5238 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
5239 for (unsigned int j = 0; j < partial_nelts; ++j)
5240 partial_elts.quick_push (elts[i * partial_nelts + j]);
5241 tree t = gimple_build_vector (seq, &partial_elts);
5242 t = gimple_build (seq, VIEW_CONVERT_EXPR,
5243 TREE_TYPE (new_vector_type), t);
5244
5245 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
5246 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
5247 }
5248
5249 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
5250 correct byte contents.
5251
5252 Conceptually, we need to repeat the following operation log2(nvectors)
5253 times, where hi_start = nvectors / 2:
5254
5255 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
5256 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
5257
5258 However, if each input repeats every N elements and the VF is
5259 a multiple of N * 2, the HI result is the same as the LO result.
5260 This will be true for the first N1 iterations of the outer loop,
5261 followed by N2 iterations for which both the LO and HI results
5262 are needed. I.e.:
5263
5264 N1 + N2 = log2(nvectors)
5265
5266 Each "N1 iteration" doubles the number of redundant vectors and the
5267 effect of the process as a whole is to have a sequence of nvectors/2**N1
5268 vectors that repeats 2**N1 times. Rather than generate these redundant
5269 vectors, we halve the number of vectors for each N1 iteration. */
5270 unsigned int in_start = 0;
5271 unsigned int out_start = nvectors;
5272 unsigned int new_nvectors = nvectors;
5273 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
5274 {
5275 unsigned int hi_start = new_nvectors / 2;
5276 unsigned int out_i = 0;
5277 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
5278 {
5279 if ((in_i & 1) != 0
5280 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
5281 2 * in_repeat))
5282 continue;
5283
5284 tree output = make_ssa_name (new_vector_type);
5285 tree input1 = pieces[in_start + (in_i / 2)];
5286 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
5287 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
5288 input1, input2,
5289 permutes[in_i & 1]);
5290 gimple_seq_add_stmt (seq, stmt);
5291 pieces[out_start + out_i] = output;
5292 out_i += 1;
5293 }
5294 std::swap (in_start, out_start);
5295 new_nvectors = out_i;
5296 }
5297
5298 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
5299 results.reserve (nresults);
5300 for (unsigned int i = 0; i < nresults; ++i)
5301 if (i < new_nvectors)
5302 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
5303 pieces[in_start + i]));
5304 else
5305 results.quick_push (results[i - new_nvectors]);
5306 }
5307
5308
5309 /* For constant and loop invariant defs in OP_NODE this function creates
5310 vector defs that will be used in the vectorized stmts and stores them
5311 to SLP_TREE_VEC_DEFS of OP_NODE. */
5312
5313 static void
5314 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
5315 {
5316 unsigned HOST_WIDE_INT nunits;
5317 tree vec_cst;
5318 unsigned j, number_of_places_left_in_vector;
5319 tree vector_type;
5320 tree vop;
5321 int group_size = op_node->ops.length ();
5322 unsigned int vec_num, i;
5323 unsigned number_of_copies = 1;
5324 bool constant_p;
5325 gimple_seq ctor_seq = NULL;
5326 auto_vec<tree, 16> permute_results;
5327
5328 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
5329 vector_type = SLP_TREE_VECTYPE (op_node);
5330
5331 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
5332 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
5333 auto_vec<tree> voprnds (number_of_vectors);
5334
5335 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5336 created vectors. It is greater than 1 if unrolling is performed.
5337
5338 For example, we have two scalar operands, s1 and s2 (e.g., group of
5339 strided accesses of size two), while NUNITS is four (i.e., four scalars
5340 of this type can be packed in a vector). The output vector will contain
5341 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5342 will be 2).
5343
5344 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
5345 containing the operands.
5346
5347 For example, NUNITS is four as before, and the group size is 8
5348 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5349 {s5, s6, s7, s8}. */
5350
5351 /* When using duplicate_and_interleave, we just need one element for
5352 each scalar statement. */
5353 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5354 nunits = group_size;
5355
5356 number_of_copies = nunits * number_of_vectors / group_size;
5357
5358 number_of_places_left_in_vector = nunits;
5359 constant_p = true;
5360 tree_vector_builder elts (vector_type, nunits, 1);
5361 elts.quick_grow (nunits);
5362 stmt_vec_info insert_after = NULL;
5363 for (j = 0; j < number_of_copies; j++)
5364 {
5365 tree op;
5366 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
5367 {
5368 /* Create 'vect_ = {op0,op1,...,opn}'. */
5369 number_of_places_left_in_vector--;
5370 tree orig_op = op;
5371 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
5372 {
5373 if (CONSTANT_CLASS_P (op))
5374 {
5375 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
5376 {
5377 /* Can't use VIEW_CONVERT_EXPR for booleans because
5378 of possibly different sizes of scalar value and
5379 vector element. */
5380 if (integer_zerop (op))
5381 op = build_int_cst (TREE_TYPE (vector_type), 0);
5382 else if (integer_onep (op))
5383 op = build_all_ones_cst (TREE_TYPE (vector_type));
5384 else
5385 gcc_unreachable ();
5386 }
5387 else
5388 op = fold_unary (VIEW_CONVERT_EXPR,
5389 TREE_TYPE (vector_type), op);
5390 gcc_assert (op && CONSTANT_CLASS_P (op));
5391 }
5392 else
5393 {
5394 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
5395 gimple *init_stmt;
5396 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
5397 {
5398 tree true_val
5399 = build_all_ones_cst (TREE_TYPE (vector_type));
5400 tree false_val
5401 = build_zero_cst (TREE_TYPE (vector_type));
5402 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
5403 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
5404 op, true_val,
5405 false_val);
5406 }
5407 else
5408 {
5409 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
5410 op);
5411 init_stmt
5412 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
5413 op);
5414 }
5415 gimple_seq_add_stmt (&ctor_seq, init_stmt);
5416 op = new_temp;
5417 }
5418 }
5419 elts[number_of_places_left_in_vector] = op;
5420 if (!CONSTANT_CLASS_P (op))
5421 constant_p = false;
5422 /* For BB vectorization we have to compute an insert location
5423 when a def is inside the analyzed region since we cannot
5424 simply insert at the BB start in this case. */
5425 stmt_vec_info opdef;
5426 if (TREE_CODE (orig_op) == SSA_NAME
5427 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
5428 && is_a <bb_vec_info> (vinfo)
5429 && (opdef = vinfo->lookup_def (orig_op)))
5430 {
5431 if (!insert_after)
5432 insert_after = opdef;
5433 else
5434 insert_after = get_later_stmt (insert_after, opdef);
5435 }
5436
5437 if (number_of_places_left_in_vector == 0)
5438 {
5439 if (constant_p
5440 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5441 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5442 vec_cst = gimple_build_vector (&ctor_seq, &elts);
5443 else
5444 {
5445 if (permute_results.is_empty ())
5446 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
5447 elts, number_of_vectors,
5448 permute_results);
5449 vec_cst = permute_results[number_of_vectors - j - 1];
5450 }
5451 if (!gimple_seq_empty_p (ctor_seq))
5452 {
5453 if (insert_after)
5454 {
5455 gimple_stmt_iterator gsi;
5456 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
5457 {
5458 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
5459 gsi_insert_seq_before (&gsi, ctor_seq,
5460 GSI_CONTINUE_LINKING);
5461 }
5462 else if (!stmt_ends_bb_p (insert_after->stmt))
5463 {
5464 gsi = gsi_for_stmt (insert_after->stmt);
5465 gsi_insert_seq_after (&gsi, ctor_seq,
5466 GSI_CONTINUE_LINKING);
5467 }
5468 else
5469 {
5470 /* When we want to insert after a def whose
5471 defining stmt throws, insert on the fallthru
5472 edge instead. */
5473 edge e = find_fallthru_edge
5474 (gimple_bb (insert_after->stmt)->succs);
5475 basic_block new_bb
5476 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
5477 gcc_assert (!new_bb);
5478 }
5479 }
5480 else
5481 vinfo->insert_seq_on_entry (NULL, ctor_seq);
5482 ctor_seq = NULL;
5483 }
5484 voprnds.quick_push (vec_cst);
5485 insert_after = NULL;
5486 number_of_places_left_in_vector = nunits;
5487 constant_p = true;
5488 elts.new_vector (vector_type, nunits, 1);
5489 elts.quick_grow (nunits);
5490 }
5491 }
5492 }
5493
5494 /* Since the vectors were created in reverse order, push them onto
5495 OP_NODE's vector defs in reverse to restore the intended order. */
5496 vec_num = voprnds.length ();
5497 for (j = vec_num; j != 0; j--)
5498 {
5499 vop = voprnds[j - 1];
5500 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5501 }
5502
5503 /* When the VF is greater than the unrolling factor needed for the SLP
5504 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
5505 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
5506 to replicate the vectors. */
5507 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
5508 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
5509 i++)
5510 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5511 }
5512
5513 /* Get the Ith vectorized definition from SLP_NODE. */
5514
5515 tree
5516 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
5517 {
5518 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
5519 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
5520 else
5521 return SLP_TREE_VEC_DEFS (slp_node)[i];
5522 }
5523
5524 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
5525
5526 void
5527 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
5528 {
5529 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
5530 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
5531 {
5532 unsigned j;
5533 gimple *vec_def_stmt;
5534 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
5535 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
5536 }
5537 else
5538 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
5539 }
5540
5541 /* Get N vectorized definitions for SLP_NODE. */
5542
5543 void
5544 vect_get_slp_defs (vec_info *,
5545 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
5546 {
5547 if (n == -1U)
5548 n = SLP_TREE_CHILDREN (slp_node).length ();
5549
5550 for (unsigned i = 0; i < n; ++i)
5551 {
5552 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
5553 vec<tree> vec_defs = vNULL;
5554 vect_get_slp_defs (child, &vec_defs);
5555 vec_oprnds->quick_push (vec_defs);
5556 }
5557 }
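/* A rough usage sketch for the overload above (illustrative only; the
   calling convention in the actual vectorizable_* routines may differ):

     auto_vec<vec<tree> > vec_oprnds;
     vec_oprnds.create (SLP_TREE_CHILDREN (node).length ());
     vect_get_slp_defs (vinfo, node, &vec_oprnds,
			SLP_TREE_CHILDREN (node).length ());
     ... vec_oprnds[i][j] is then the J-th vector def of child I,
     and each inner vec must be released by the caller ...  */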
5558
5559 /* Generate vector permute statements from a list of loads in DR_CHAIN.
5560 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
5561 permute statements for the SLP node NODE. Store the number of vector
5562 permute instructions in *N_PERMS and the number of vector load
5563 instructions in *N_LOADS. */
5564
5565 bool
5566 vect_transform_slp_perm_load (vec_info *vinfo,
5567 slp_tree node, vec<tree> dr_chain,
5568 gimple_stmt_iterator *gsi, poly_uint64 vf,
5569 bool analyze_only, unsigned *n_perms,
5570 unsigned int *n_loads)
5571 {
5572 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5573 int vec_index = 0;
5574 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5575 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
5576 unsigned int mask_element;
5577 machine_mode mode;
5578
5579 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
5580 return false;
5581
5582 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
5583
5584 mode = TYPE_MODE (vectype);
5585 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5586
5587 /* Initialize the vect stmts of NODE to properly insert the generated
5588 stmts later. */
5589 if (! analyze_only)
5590 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
5591 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
5592 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
5593
5594 /* Generate permutation masks for every NODE. Number of masks for each NODE
5595 is equal to GROUP_SIZE.
5596 E.g., we have a group of three nodes with three loads from the same
5597 location in each node, and the vector size is 4. I.e., we have a
5598 a0b0c0a1b1c1... sequence and we need to create the following vectors:
5599 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
5600 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
5601 ...
5602
5603 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
5604 The last mask is illegal since we assume two operands for permute
5605 operation, and the mask element values can't be outside that range.
5606 Hence, the last mask must be converted into {2,5,5,5}.
5607 For the first two permutations we need the first and the second input
5608 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
5609 we need the second and the third vectors: {b1,c1,a2,b2} and
5610 {c2,a3,b3,c3}. */
5611
5612 int vect_stmts_counter = 0;
5613 unsigned int index = 0;
5614 int first_vec_index = -1;
5615 int second_vec_index = -1;
5616 bool noop_p = true;
5617 *n_perms = 0;
5618
5619 vec_perm_builder mask;
5620 unsigned int nelts_to_build;
5621 unsigned int nvectors_per_build;
5622 unsigned int in_nlanes;
5623 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
5624 && multiple_p (nunits, group_size));
5625 if (repeating_p)
5626 {
5627 /* A single vector contains a whole number of copies of the node, so:
5628 (a) all permutes can use the same mask; and
5629 (b) the permutes only need a single vector input. */
5630 mask.new_vector (nunits, group_size, 3);
5631 nelts_to_build = mask.encoded_nelts ();
5632 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
5633 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
5634 }
5635 else
5636 {
5637 /* We need to construct a separate mask for each vector statement. */
5638 unsigned HOST_WIDE_INT const_nunits, const_vf;
5639 if (!nunits.is_constant (&const_nunits)
5640 || !vf.is_constant (&const_vf))
5641 return false;
5642 mask.new_vector (const_nunits, const_nunits, 1);
5643 nelts_to_build = const_vf * group_size;
5644 nvectors_per_build = 1;
5645 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
5646 }
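/* For example, in the repeating_p case with a group size of 2, a load
   permutation of { 1, 0 } and 8 lanes per vector, the encoded mask
   built below is { 1, 0, 3, 2, 5, 4 } which the vec_perm_indices
   expansion extends to { 1, 0, 3, 2, 5, 4, 7, 6 }, i.e. a swap of
   adjacent lanes, and that single mask serves all vector stmts. */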
5647 auto_sbitmap used_in_lanes (in_nlanes);
5648 bitmap_clear (used_in_lanes);
5649
5650 unsigned int count = mask.encoded_nelts ();
5651 mask.quick_grow (count);
5652 vec_perm_indices indices;
5653
5654 for (unsigned int j = 0; j < nelts_to_build; j++)
5655 {
5656 unsigned int iter_num = j / group_size;
5657 unsigned int stmt_num = j % group_size;
5658 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
5659 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
5660 bitmap_set_bit (used_in_lanes, i);
5661 if (repeating_p)
5662 {
5663 first_vec_index = 0;
5664 mask_element = i;
5665 }
5666 else
5667 {
5668 /* Enforced before the loop when !repeating_p. */
5669 unsigned int const_nunits = nunits.to_constant ();
5670 vec_index = i / const_nunits;
5671 mask_element = i % const_nunits;
5672 if (vec_index == first_vec_index
5673 || first_vec_index == -1)
5674 {
5675 first_vec_index = vec_index;
5676 }
5677 else if (vec_index == second_vec_index
5678 || second_vec_index == -1)
5679 {
5680 second_vec_index = vec_index;
5681 mask_element += const_nunits;
5682 }
5683 else
5684 {
5685 if (dump_enabled_p ())
5686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5687 "permutation requires at "
5688 "least three vectors %G",
5689 stmt_info->stmt);
5690 gcc_assert (analyze_only);
5691 return false;
5692 }
5693
5694 gcc_assert (mask_element < 2 * const_nunits);
5695 }
5696
5697 if (mask_element != index)
5698 noop_p = false;
5699 mask[index++] = mask_element;
5700
5701 if (index == count && !noop_p)
5702 {
5703 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
5704 if (!can_vec_perm_const_p (mode, indices))
5705 {
5706 if (dump_enabled_p ())
5707 {
5708 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5709 vect_location,
5710 "unsupported vect permute { ");
5711 for (i = 0; i < count; ++i)
5712 {
5713 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5714 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5715 }
5716 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5717 }
5718 gcc_assert (analyze_only);
5719 return false;
5720 }
5721
5722 ++*n_perms;
5723 }
5724
5725 if (index == count)
5726 {
5727 if (!analyze_only)
5728 {
5729 tree mask_vec = NULL_TREE;
5730
5731 if (! noop_p)
5732 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5733
5734 if (second_vec_index == -1)
5735 second_vec_index = first_vec_index;
5736
5737 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
5738 {
5739 /* Generate the permute statement if necessary. */
5740 tree first_vec = dr_chain[first_vec_index + ri];
5741 tree second_vec = dr_chain[second_vec_index + ri];
5742 gimple *perm_stmt;
5743 if (! noop_p)
5744 {
5745 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5746 tree perm_dest
5747 = vect_create_destination_var (gimple_assign_lhs (stmt),
5748 vectype);
5749 perm_dest = make_ssa_name (perm_dest);
5750 perm_stmt
5751 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5752 first_vec, second_vec,
5753 mask_vec);
5754 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
5755 gsi);
5756 }
5757 else
5758 /* If mask was NULL_TREE generate the requested
5759 identity transform. */
5760 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
5761
5762 /* Store the vector statement in NODE. */
5763 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
5764 }
5765 }
5766
5767 index = 0;
5768 first_vec_index = -1;
5769 second_vec_index = -1;
5770 noop_p = true;
5771 }
5772 }
5773
5774 if (n_loads)
5775 {
5776 if (repeating_p)
5777 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5778 else
5779 {
5780 /* Enforced above when !repeating_p. */
5781 unsigned int const_nunits = nunits.to_constant ();
5782 *n_loads = 0;
5783 bool load_seen = false;
5784 for (unsigned i = 0; i < in_nlanes; ++i)
5785 {
5786 if (i % const_nunits == 0)
5787 {
5788 if (load_seen)
5789 *n_loads += 1;
5790 load_seen = false;
5791 }
5792 if (bitmap_bit_p (used_in_lanes, i))
5793 load_seen = true;
5794 }
5795 if (load_seen)
5796 *n_loads += 1;
5797 }
5798 }
5799
5800 return true;
5801 }
5802
5803
5804 /* Vectorize the SLP permutations in NODE as specified
5805 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5806 child number and lane number.
5807 Interleaving of two two-lane two-child SLP subtrees (not supported):
5808 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5809 A blend of two four-lane two-child SLP subtrees:
5810 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5811 Highpart of a four-lane one-child SLP subtree (not supported):
5812 [ { 0, 2 }, { 0, 3 } ]
5813 Currently only a subset of this is supported by the code generation below. */
5814
5815 static bool
5816 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5817 slp_tree node, stmt_vector_for_cost *cost_vec)
5818 {
5819 tree vectype = SLP_TREE_VECTYPE (node);
5820
5821 /* ??? We currently only support the case where all vector input and
5822 output types are the same, while the SLP IL should really do a concat
5823 + select and thus accept arbitrary mismatches. */
5824 slp_tree child;
5825 unsigned i;
5826 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5827 if (!vect_maybe_update_slp_op_vectype (child, vectype)
5828 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
5829 {
5830 if (dump_enabled_p ())
5831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5832 "Unsupported lane permutation\n");
5833 return false;
5834 }
5835
5836 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
5837 gcc_assert (perm.length () == SLP_TREE_LANES (node));
5838 if (dump_enabled_p ())
5839 {
5840 dump_printf_loc (MSG_NOTE, vect_location,
5841 "vectorizing permutation");
5842 for (unsigned i = 0; i < perm.length (); ++i)
5843 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
5844 dump_printf (MSG_NOTE, "\n");
5845 }
5846
5847 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5848 if (!nunits.is_constant ())
5849 return false;
5850 unsigned HOST_WIDE_INT vf = 1;
5851 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
5852 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
5853 return false;
5854 unsigned olanes = vf * SLP_TREE_LANES (node);
5855 gcc_assert (multiple_p (olanes, nunits));
5856
5857 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
5858 from the { SLP operand, scalar lane } permutation as recorded in the
5859 SLP node as an intermediate step. This part should already work
5860 with SLP children with an arbitrary number of lanes. */
5861 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
5862 auto_vec<unsigned> active_lane;
5863 vperm.create (olanes);
5864 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
5865 for (unsigned i = 0; i < vf; ++i)
5866 {
5867 for (unsigned pi = 0; pi < perm.length (); ++pi)
5868 {
5869 std::pair<unsigned, unsigned> p = perm[pi];
5870 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
5871 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
5872 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
5873 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
5874 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
5875 }
5876 /* Advance to the next group. */
5877 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
5878 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
5879 }
5880
5881 if (dump_enabled_p ())
5882 {
5883 dump_printf_loc (MSG_NOTE, vect_location, "as");
5884 for (unsigned i = 0; i < vperm.length (); ++i)
5885 {
5886 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
5887 dump_printf (MSG_NOTE, ",");
5888 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
5889 vperm[i].first.first, vperm[i].first.second,
5890 vperm[i].second);
5891 }
5892 dump_printf (MSG_NOTE, "\n");
5893 }
5894
5895 /* We can only handle two-vector permutes; everything else should
5896 be lowered on the SLP level. The following is closely inspired
5897 by vect_transform_slp_perm_load and is supposed to eventually
5898 replace it.
5899 ??? As an intermediate step do code-gen in the SLP tree
5900 representation somehow? */
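/* For example, the four-lane blend from the comment above,
   [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ], becomes with
   const_nunits == 4 the single mask { 0, 5, 2, 7 } selecting from
   the concatenation of the two child vectors. */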
5901 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
5902 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
5903 unsigned int const_nunits = nunits.to_constant ();
5904 unsigned int index = 0;
5905 unsigned int mask_element;
5906 vec_perm_builder mask;
5907 mask.new_vector (const_nunits, const_nunits, 1);
5908 unsigned int count = mask.encoded_nelts ();
5909 mask.quick_grow (count);
5910 vec_perm_indices indices;
5911 unsigned nperms = 0;
5912 for (unsigned i = 0; i < vperm.length (); ++i)
5913 {
5914 mask_element = vperm[i].second;
5915 if (first_vec.first == -1U
5916 || first_vec == vperm[i].first)
5917 first_vec = vperm[i].first;
5918 else if (second_vec.first == -1U
5919 || second_vec == vperm[i].first)
5920 {
5921 second_vec = vperm[i].first;
5922 mask_element += const_nunits;
5923 }
5924 else
5925 {
5926 if (dump_enabled_p ())
5927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5928 "permutation requires at "
5929 "least three vectors\n");
5930 gcc_assert (!gsi);
5931 return false;
5932 }
5933
5934 mask[index++] = mask_element;
5935
5936 if (index == count)
5937 {
5938 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
5939 const_nunits);
5940 bool identity_p = indices.series_p (0, 1, 0, 1);
5941 if (!identity_p
5942 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5943 {
5944 if (dump_enabled_p ())
5945 {
5946 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5947 vect_location,
5948 "unsupported vect permute { ");
5949 for (i = 0; i < count; ++i)
5950 {
5951 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5952 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5953 }
5954 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5955 }
5956 gcc_assert (!gsi);
5957 return false;
5958 }
5959
5960 if (!identity_p)
5961 nperms++;
5962 if (gsi)
5963 {
5964 if (second_vec.first == -1U)
5965 second_vec = first_vec;
5966
5967 /* Generate the permute statement if necessary. */
5968 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
5969 tree first_def
5970 = vect_get_slp_vect_def (first_node, first_vec.second);
5971 /* ??? We SLP match existing vector element extracts but
5972 allow punning, which we need to re-instantiate at uses
5973 yet have no good way of representing explicitly. */
5974 if (!types_compatible_p (TREE_TYPE (first_def), vectype))
5975 {
5976 gassign *conv_stmt;
5977 conv_stmt = gimple_build_assign (make_ssa_name (vectype),
5978 build1 (VIEW_CONVERT_EXPR,
5979 vectype, first_def));
5980 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
5981 first_def = gimple_assign_lhs (conv_stmt);
5982 }
5983 gassign *perm_stmt;
5984 tree perm_dest = make_ssa_name (vectype);
5985 if (!identity_p)
5986 {
5987 slp_tree second_node
5988 = SLP_TREE_CHILDREN (node)[second_vec.first];
5989 tree second_def
5990 = vect_get_slp_vect_def (second_node, second_vec.second);
5991 if (!types_compatible_p (TREE_TYPE (second_def), vectype))
5992 {
5993 gassign *conv_stmt;
5994 conv_stmt = gimple_build_assign (make_ssa_name (vectype),
5995 build1
5996 (VIEW_CONVERT_EXPR,
5997 vectype, second_def));
5998 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
5999 second_def = gimple_assign_lhs (conv_stmt);
6000 }
6001 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
6002 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6003 first_def, second_def,
6004 mask_vec);
6005 }
6006 else
6007 /* We need a copy here in case the def was external. */
6008 perm_stmt = gimple_build_assign (perm_dest, first_def);
6009 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
6010 /* Store the vector statement in NODE. */
6011 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
6012 }
6013
6014 index = 0;
6015 first_vec = std::make_pair (-1U, -1U);
6016 second_vec = std::make_pair (-1U, -1U);
6017 }
6018 }
6019
6020 if (!gsi)
6021 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
6022
6023 return true;
6024 }
6025
6026 /* Vectorize SLP NODE. */
6027
6028 static void
6029 vect_schedule_slp_node (vec_info *vinfo,
6030 slp_tree node, slp_instance instance)
6031 {
6032 gimple_stmt_iterator si;
6033 int i;
6034 slp_tree child;
6035
6036 /* For existing vectors there's nothing to do. */
6037 if (SLP_TREE_VEC_DEFS (node).exists ())
6038 return;
6039
6040 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
6041
6042 /* Vectorize externals and constants. */
6043 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
6044 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6045 {
6046 /* ??? vectorizable_shift can end up using a scalar operand which is
6047 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
6048 node in this case. */
6049 if (!SLP_TREE_VECTYPE (node))
6050 return;
6051
6052 vect_create_constant_vectors (vinfo, node);
6053 return;
6054 }
6055
6056 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6057
6058 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
6059 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6060
6061 if (dump_enabled_p ())
6062 dump_printf_loc (MSG_NOTE, vect_location,
6063 "------>vectorizing SLP node starting from: %G",
6064 stmt_info->stmt);
6065
6066 if (STMT_VINFO_DATA_REF (stmt_info)
6067 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
6068 {
6069 /* Vectorized loads go before the first scalar load to make it
6070 ready early, vectorized stores go before the last scalar
6071 stmt which is where all uses are ready. */
6072 stmt_vec_info last_stmt_info = NULL;
6073 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
6074 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
6075 else /* DR_IS_WRITE */
6076 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
6077 si = gsi_for_stmt (last_stmt_info->stmt);
6078 }
6079 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
6080 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
6081 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
6082 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
6083 {
6084 /* For PHI node vectorization we do not use the insertion iterator. */
6085 si = gsi_none ();
6086 }
6087 else
6088 {
6089 /* Emit other stmts after the children's vectorized defs, which is
6090 the earliest position possible. */
6091 gimple *last_stmt = NULL;
6092 bool seen_vector_def = false;
6093 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6094 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6095 {
6096 /* For fold-left reductions we are retaining the scalar
6097 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
6098 set so the representation isn't perfect. Resort to the
6099 last scalar def here. */
6100 if (SLP_TREE_VEC_STMTS (child).is_empty ())
6101 {
6102 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
6103 == cycle_phi_info_type);
6104 gphi *phi = as_a <gphi *>
6105 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
6106 if (!last_stmt
6107 || vect_stmt_dominates_stmt_p (last_stmt, phi))
6108 last_stmt = phi;
6109 }
6110 /* We are emitting all vectorized stmts in the same place and
6111 the last one emitted is the latest in the IL.
6112 ??? Unless a load permutation is applied that happens
6113 to re-use an earlier generated load. */
6114 unsigned j;
6115 gimple *vstmt;
6116 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
6117 if (!last_stmt
6118 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
6119 last_stmt = vstmt;
6120 }
6121 else if (!SLP_TREE_VECTYPE (child))
6122 {
6123 /* Externals without a vector type are used unvectorized, so look at all of their scalar defs. */
6124 unsigned j;
6125 tree def;
6126 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
6127 if (TREE_CODE (def) == SSA_NAME
6128 && !SSA_NAME_IS_DEFAULT_DEF (def))
6129 {
6130 gimple *stmt = SSA_NAME_DEF_STMT (def);
6131 if (!last_stmt
6132 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
6133 last_stmt = stmt;
6134 }
6135 }
6136 else
6137 {
6138 /* For externals we have to look at all defs since their
6139 insertion place is decided per vector. But beware
6140 of pre-existing vectors where we need to make sure
6141 we do not insert before the region boundary. */
6142 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
6143 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
6144 seen_vector_def = true;
6145 else
6146 {
6147 unsigned j;
6148 tree vdef;
6149 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
6150 if (TREE_CODE (vdef) == SSA_NAME
6151 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
6152 {
6153 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
6154 if (!last_stmt
6155 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
6156 last_stmt = vstmt;
6157 }
6158 }
6159 }
6160 /* This can happen when all children are pre-existing vectors or
6161 constants. */
6162 if (!last_stmt)
6163 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
6164 if (!last_stmt)
6165 {
6166 gcc_assert (seen_vector_def);
6167 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
6168 }
6169 else if (is_a <gphi *> (last_stmt))
6170 si = gsi_after_labels (gimple_bb (last_stmt));
6171 else
6172 {
6173 si = gsi_for_stmt (last_stmt);
6174 gsi_next (&si);
6175 }
6176 }
6177
6178 bool done_p = false;
6179
6180 /* Handle purely internal nodes. */
6181 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6182 {
6183 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
6184 be shared between different SLP nodes (but usually it's the same
6185 operation apart from the case where the stmt is only there to denote
6186 the actual scalar lane defs ...). So do not call vect_transform_stmt
6187 but open-code it here (partly). */
6188 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
6189 gcc_assert (done);
6190 done_p = true;
6191 }
6192 if (!done_p)
6193 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
6194 }
6195
6196 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
6197 For loop vectorization this is done in vectorizable_call, but for SLP
6198 it needs to be deferred until end of vect_schedule_slp, because multiple
6199 SLP instances may refer to the same scalar stmt. */
6200
6201 static void
6202 vect_remove_slp_scalar_calls (vec_info *vinfo,
6203 slp_tree node, hash_set<slp_tree> &visited)
6204 {
6205 gimple *new_stmt;
6206 gimple_stmt_iterator gsi;
6207 int i;
6208 slp_tree child;
6209 tree lhs;
6210 stmt_vec_info stmt_info;
6211
6212 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
6213 return;
6214
6215 if (visited.add (node))
6216 return;
6217
6218 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6219 vect_remove_slp_scalar_calls (vinfo, child, visited);
6220
6221 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6222 {
6223 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
6224 if (!stmt || gimple_bb (stmt) == NULL)
6225 continue;
6226 if (is_pattern_stmt_p (stmt_info)
6227 || !PURE_SLP_STMT (stmt_info))
6228 continue;
6229 lhs = gimple_call_lhs (stmt);
6230 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
6231 gsi = gsi_for_stmt (stmt);
6232 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
6233 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
6234 }
6235 }
6236
6237 static void
6238 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
6239 {
6240 hash_set<slp_tree> visited;
6241 vect_remove_slp_scalar_calls (vinfo, node, visited);
6242 }
6243
6244 /* Vectorize the instance root. */
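/* For example, for a BB SLP instance rooted at a CONSTRUCTOR stmt like
   x = { a_1, b_2, c_3, d_4 }, a single vector stmt simply replaces the
   rhs with its (possibly view-converted) lhs, while multiple vector
   stmts are combined into a CONSTRUCTOR of their lhs values. */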
6245
6246 void
6247 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
6248 {
6249 gassign *rstmt = NULL;
6250
6251 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
6252 {
6253 gimple *child_stmt;
6254 int j;
6255
6256 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
6257 {
6258 tree vect_lhs = gimple_get_lhs (child_stmt);
6259 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
6260 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
6261 TREE_TYPE (vect_lhs)))
6262 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
6263 vect_lhs);
6264 rstmt = gimple_build_assign (root_lhs, vect_lhs);
6265 break;
6266 }
6267 }
6268 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
6269 {
6270 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6271 gimple *child_stmt;
6272 int j;
6273 vec<constructor_elt, va_gc> *v;
6274 vec_alloc (v, nelts);
6275
6276 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
6277 {
6278 CONSTRUCTOR_APPEND_ELT (v,
6279 NULL_TREE,
6280 gimple_get_lhs (child_stmt));
6281 }
6282 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
6283 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
6284 tree r_constructor = build_constructor (rtype, v);
6285 rstmt = gimple_build_assign (lhs, r_constructor);
6286 }
6287
6288 gcc_assert (rstmt);
6289
6290 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
6291 gsi_replace (&rgsi, rstmt, true);
6292 }
6293
6294 struct slp_scc_info
6295 {
6296 bool on_stack;
6297 int dfs;
6298 int lowlink;
6299 };
6300
6301 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
6302
6303 static void
6304 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
6305 hash_map<slp_tree, slp_scc_info> &scc_info,
6306 int &maxdfs, vec<slp_tree> &stack)
6307 {
6308 bool existed_p;
6309 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
6310 gcc_assert (!existed_p);
6311 info->dfs = maxdfs;
6312 info->lowlink = maxdfs;
6313 maxdfs++;
6314
6315 /* Leaf. */
6316 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
6317 {
6318 info->on_stack = false;
6319 vect_schedule_slp_node (vinfo, node, instance);
6320 return;
6321 }
6322
6323 info->on_stack = true;
6324 stack.safe_push (node);
6325
6326 unsigned i;
6327 slp_tree child;
6328 /* DFS recurse. */
6329 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6330 {
6331 if (!child)
6332 continue;
6333 slp_scc_info *child_info = scc_info.get (child);
6334 if (!child_info)
6335 {
6336 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
6337 /* Recursion might have grown SCC_INFO and re-allocated its entries, so re-fetch the pointers. */
6338 info = scc_info.get (node);
6339 child_info = scc_info.get (child);
6340 info->lowlink = MIN (info->lowlink, child_info->lowlink);
6341 }
6342 else if (child_info->on_stack)
6343 info->lowlink = MIN (info->lowlink, child_info->dfs);
6344 }
6345 if (info->lowlink != info->dfs)
6346 return;
6347
6348 auto_vec<slp_tree, 4> phis_to_fixup;
6349
6350 /* Singleton. */
6351 if (stack.last () == node)
6352 {
6353 stack.pop ();
6354 info->on_stack = false;
6355 vect_schedule_slp_node (vinfo, node, instance);
6356 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6357 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
6358 phis_to_fixup.quick_push (node);
6359 }
6360 else
6361 {
6362 /* SCC. */
6363 int last_idx = stack.length () - 1;
6364 while (stack[last_idx] != node)
6365 last_idx--;
6366 /* We can break the cycle at PHIs that have at least one
6367 code-generated child. Then we could re-start the DFS walk until
6368 all nodes in the SCC are covered (we might have new entries
6369 for only back-reachable nodes). But it's simpler to just
6370 iterate and schedule those that are ready. */
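/* For example, for a reduction cycle consisting of a PHI and the
   statement feeding its backedge, the PHI is ready first (its
   preheader argument comes from outside the SCC), the feeding
   statement becomes ready once the PHI is scheduled, and the PHI's
   backedge argument is filled in by the fixup loop below. */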
6371 unsigned todo = stack.length () - last_idx;
6372 do
6373 {
6374 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
6375 {
6376 slp_tree entry = stack[idx];
6377 if (!entry)
6378 continue;
6379 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
6380 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
6381 bool ready = !phi;
6382 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
6383 if (!child)
6384 {
6385 gcc_assert (phi);
6386 ready = true;
6387 break;
6388 }
6389 else if (scc_info.get (child)->on_stack)
6390 {
6391 if (!phi)
6392 {
6393 ready = false;
6394 break;
6395 }
6396 }
6397 else
6398 {
6399 if (phi)
6400 {
6401 ready = true;
6402 break;
6403 }
6404 }
6405 if (ready)
6406 {
6407 vect_schedule_slp_node (vinfo, entry, instance);
6408 scc_info.get (entry)->on_stack = false;
6409 stack[idx] = NULL;
6410 todo--;
6411 if (phi)
6412 phis_to_fixup.safe_push (entry);
6413 }
6414 }
6415 }
6416 while (todo != 0);
6417
6418 /* Pop the SCC. */
6419 stack.truncate (last_idx);
6420 }
6421
6422 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
6423 slp_tree phi_node;
6424 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
6425 {
6426 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
6427 edge_iterator ei;
6428 edge e;
6429 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
6430 {
6431 unsigned dest_idx = e->dest_idx;
6432 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
6433 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
6434 continue;
6435 /* Simply fill all args. */
6436 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
6437 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
6438 vect_get_slp_vect_def (child, i),
6439 e, gimple_phi_arg_location (phi, dest_idx));
6440 }
6441 }
6442 }
6443
6444 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
6445
6446 void
6447 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
6448 {
6449 slp_instance instance;
6450 unsigned int i;
6451
6452 hash_map<slp_tree, slp_scc_info> scc_info;
6453 int maxdfs = 0;
6454 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6455 {
6456 slp_tree node = SLP_INSTANCE_TREE (instance);
6457 if (dump_enabled_p ())
6458 {
6459 dump_printf_loc (MSG_NOTE, vect_location,
6460 "Vectorizing SLP tree:\n");
6461 if (SLP_INSTANCE_ROOT_STMT (instance))
6462 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
6463 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
6464 vect_print_slp_graph (MSG_NOTE, vect_location,
6465 SLP_INSTANCE_TREE (instance));
6466 }
6467 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
6468 have a PHI be the node breaking the cycle. */
6469 auto_vec<slp_tree> stack;
6470 if (!scc_info.get (node))
6471 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
6472
6473 if (SLP_INSTANCE_ROOT_STMT (instance))
6474 vectorize_slp_instance_root_stmt (node, instance);
6475
6476 if (dump_enabled_p ())
6477 dump_printf_loc (MSG_NOTE, vect_location,
6478 "vectorizing stmts using SLP.\n");
6479 }
6480
6481 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6482 {
6483 slp_tree root = SLP_INSTANCE_TREE (instance);
6484 stmt_vec_info store_info;
6485 unsigned int j;
6486
6487 /* Remove scalar call stmts. Do not do this for basic-block
6488 vectorization as not all uses may be vectorized.
6489 ??? Why should this be necessary? DCE should be able to
6490 remove the stmts itself.
6491 ??? For BB vectorization we can as well remove scalar
6492 stmts starting from the SLP tree root if they have no
6493 uses. */
6494 if (is_a <loop_vec_info> (vinfo))
6495 vect_remove_slp_scalar_calls (vinfo, root);
6496
6497 /* Remove the original scalar stmts of vectorized stores. */
6498 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
6499 {
6500 if (!STMT_VINFO_DATA_REF (store_info)
6501 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
6502 break;
6503
6504 store_info = vect_orig_stmt (store_info);
6505 /* Free the attached stmt_vec_info and remove the stmt. */
6506 vinfo->remove_stmt (store_info);
6507
6508 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
6509 to not crash in vect_free_slp_tree later. */
6510 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
6511 SLP_TREE_REPRESENTATIVE (root) = NULL;
6512 }
6513 }
6514 }