1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
30 #include "tree-pass.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
51 #include "alloc-pool.h"
/* Forward declaration; the definition appears later in this file.  */
53 static bool vectorizable_slp_permutation (vec_info
*, gimple_stmt_iterator
*,
54 slp_tree
, stmt_vector_for_cost
*);
/* Object pool from which all _slp_tree nodes are allocated; see
   _slp_tree::operator new/delete below.  */
56 static object_allocator
<_slp_tree
> *slp_tree_pool
;
/* Head of the global list of live SLP nodes; the _slp_tree constructor
   links each new node at the front of this list.  */
57 static slp_tree slp_first_node
;
62 slp_tree_pool
= new object_allocator
<_slp_tree
> ("SLP nodes");
68 while (slp_first_node
)
69 delete slp_first_node
;
/* Allocate an SLP node from the dedicated node pool instead of the heap.
   Only whole _slp_tree objects may be allocated this way, hence the
   size assertion.  */
75 _slp_tree::operator new (size_t n
)
77 gcc_assert (n
== sizeof (_slp_tree
));
78 return slp_tree_pool
->allocate_raw ();
/* Return NODE's storage to the SLP node pool.  The size assertion mirrors
   the one in operator new: only whole _slp_tree objects live in the pool.  */
82 _slp_tree::operator delete (void *node
, size_t n
)
84 gcc_assert (n
== sizeof (_slp_tree
));
85 slp_tree_pool
->remove_raw (node
);
89 /* Initialize a SLP node.  Links the node at the head of the global
   slp_first_node list and clears all member vectors and scalar fields.
   NOTE(review): this chunk is an extraction with elided lines — a guard
   around the slp_first_node->prev_node update appears to be missing
   here; confirm against the full source.  */
91 _slp_tree::_slp_tree ()
93 this->prev_node
= NULL
;
95 slp_first_node
->prev_node
= this;
96 this->next_node
= slp_first_node
;
97 slp_first_node
= this;
/* Start with all lane/def vectors empty; they are filled in by the
   vect_create_new_slp_node overloads.  */
98 SLP_TREE_SCALAR_STMTS (this) = vNULL
;
99 SLP_TREE_SCALAR_OPS (this) = vNULL
;
100 SLP_TREE_VEC_STMTS (this) = vNULL
;
101 SLP_TREE_VEC_DEFS (this) = vNULL
;
102 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
103 SLP_TREE_CHILDREN (this) = vNULL
;
104 SLP_TREE_LOAD_PERMUTATION (this) = vNULL
;
105 SLP_TREE_LANE_PERMUTATION (this) = vNULL
;
/* Def type and code are unknown until the node is classified.  */
106 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def
;
107 SLP_TREE_CODE (this) = ERROR_MARK
;
108 SLP_TREE_VECTYPE (this) = NULL_TREE
;
109 SLP_TREE_REPRESENTATIVE (this) = NULL
;
/* The creator holds the first reference.  */
110 SLP_TREE_REF_COUNT (this) = 1;
111 this->max_nunits
= 1;
115 /* Tear down a SLP node.  Unlinks the node from the global
   slp_first_node list and releases every vector the node owns.
   NOTE(review): the extraction has elided lines here — the conditionals
   choosing between the prev_node/slp_first_node updates are not visible;
   confirm against the full source.  */
117 _slp_tree::~_slp_tree ()
120 this->prev_node
->next_node
= this->next_node
;
122 slp_first_node
= this->next_node
;
124 this->next_node
->prev_node
= this->prev_node
;
/* Release all owned vectors; vec<>::release is safe on vNULL.  */
125 SLP_TREE_CHILDREN (this).release ();
126 SLP_TREE_SCALAR_STMTS (this).release ();
127 SLP_TREE_SCALAR_OPS (this).release ();
128 SLP_TREE_VEC_STMTS (this).release ();
129 SLP_TREE_VEC_DEFS (this).release ();
130 SLP_TREE_LOAD_PERMUTATION (this).release ();
131 SLP_TREE_LANE_PERMUTATION (this).release ();
134 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
137 vect_free_slp_tree (slp_tree node
)
142 if (--SLP_TREE_REF_COUNT (node
) != 0)
145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
147 vect_free_slp_tree (child
);
149 /* If the node defines any SLP only patterns then those patterns are no
150 longer valid and should be removed. */
151 stmt_vec_info rep_stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
152 if (rep_stmt_info
&& STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info
))
154 stmt_vec_info stmt_info
= vect_orig_stmt (rep_stmt_info
);
155 STMT_VINFO_IN_PATTERN_P (stmt_info
) = false;
156 STMT_SLP_TYPE (stmt_info
) = STMT_SLP_TYPE (rep_stmt_info
);
162 /* Return a location suitable for dumpings related to the SLP instance. */
165 _slp_instance::location () const
167 if (!root_stmts
.is_empty ())
168 return root_stmts
[0]->stmt
;
170 return SLP_TREE_SCALAR_STMTS (root
)[0]->stmt
;
174 /* Free the memory allocated for the SLP instance. */
177 vect_free_slp_instance (slp_instance instance
)
179 vect_free_slp_tree (SLP_INSTANCE_TREE (instance
));
180 SLP_INSTANCE_LOADS (instance
).release ();
181 SLP_INSTANCE_ROOT_STMTS (instance
).release ();
182 instance
->subgraph_entries
.release ();
183 instance
->cost_vec
.release ();
188 /* Create an SLP node for SCALAR_STMTS. */
191 vect_create_new_slp_node (unsigned nops
, tree_code code
)
193 slp_tree node
= new _slp_tree
;
194 SLP_TREE_SCALAR_STMTS (node
) = vNULL
;
195 SLP_TREE_CHILDREN (node
).create (nops
);
196 SLP_TREE_DEF_TYPE (node
) = vect_internal_def
;
197 SLP_TREE_CODE (node
) = code
;
200 /* Create an SLP node for SCALAR_STMTS. */
203 vect_create_new_slp_node (slp_tree node
,
204 vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
206 SLP_TREE_SCALAR_STMTS (node
) = scalar_stmts
;
207 SLP_TREE_CHILDREN (node
).create (nops
);
208 SLP_TREE_DEF_TYPE (node
) = vect_internal_def
;
209 SLP_TREE_REPRESENTATIVE (node
) = scalar_stmts
[0];
210 SLP_TREE_LANES (node
) = scalar_stmts
.length ();
214 /* Create an SLP node for SCALAR_STMTS. */
217 vect_create_new_slp_node (vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
219 return vect_create_new_slp_node (new _slp_tree
, scalar_stmts
, nops
);
222 /* Create an SLP node for OPS. */
225 vect_create_new_slp_node (slp_tree node
, vec
<tree
> ops
)
227 SLP_TREE_SCALAR_OPS (node
) = ops
;
228 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
229 SLP_TREE_LANES (node
) = ops
.length ();
233 /* Create an SLP node for OPS. */
236 vect_create_new_slp_node (vec
<tree
> ops
)
238 return vect_create_new_slp_node (new _slp_tree
, ops
);
242 /* This structure is used in creation of an SLP tree. Each instance
243 corresponds to the same operand in a group of scalar stmts in an SLP
245 typedef struct _slp_oprnd_info
247 /* Def-stmts for the operands. */
248 vec
<stmt_vec_info
> def_stmts
;
251 /* Information about the first statement, its vector def-type, type, the
252 operand itself in case it's constant, and an indication if it's a pattern
255 enum vect_def_type first_dt
;
260 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
262 static vec
<slp_oprnd_info
>
263 vect_create_oprnd_info (int nops
, int group_size
)
266 slp_oprnd_info oprnd_info
;
267 vec
<slp_oprnd_info
> oprnds_info
;
269 oprnds_info
.create (nops
);
270 for (i
= 0; i
< nops
; i
++)
272 oprnd_info
= XNEW (struct _slp_oprnd_info
);
273 oprnd_info
->def_stmts
.create (group_size
);
274 oprnd_info
->ops
.create (group_size
);
275 oprnd_info
->first_dt
= vect_uninitialized_def
;
276 oprnd_info
->first_op_type
= NULL_TREE
;
277 oprnd_info
->any_pattern
= false;
278 oprnds_info
.quick_push (oprnd_info
);
285 /* Free operands info. */
288 vect_free_oprnd_info (vec
<slp_oprnd_info
> &oprnds_info
)
291 slp_oprnd_info oprnd_info
;
293 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
295 oprnd_info
->def_stmts
.release ();
296 oprnd_info
->ops
.release ();
297 XDELETE (oprnd_info
);
300 oprnds_info
.release ();
304 /* Return true if STMTS contains a pattern statement. */
307 vect_contains_pattern_stmt_p (vec
<stmt_vec_info
> stmts
)
309 stmt_vec_info stmt_info
;
311 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
312 if (is_pattern_stmt_p (stmt_info
))
317 /* Return true when all lanes in the external or constant NODE have
   the same scalar operand (checked with operand_equal_p).  Only valid
   for constant/external nodes, as asserted below.  */
321 vect_slp_tree_uniform_p (slp_tree node
)
323 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
324 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
);
326 /* Pre-existing vectors.  */
327 if (SLP_TREE_SCALAR_OPS (node
).is_empty ())
331 tree op
, first
= NULL_TREE
;
/* Compare every lane's scalar op against the first one.  */
332 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
335 else if (!operand_equal_p (first
, op
, 0))
341 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
342 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
346 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info
,
347 stmt_vec_info first_stmt_info
)
349 stmt_vec_info next_stmt_info
= first_stmt_info
;
352 if (first_stmt_info
!= DR_GROUP_FIRST_ELEMENT (stmt_info
))
357 if (next_stmt_info
== stmt_info
)
359 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
361 result
+= DR_GROUP_GAP (next_stmt_info
);
363 while (next_stmt_info
);
368 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
369 using the method implemented by duplicate_and_interleave. Return true
370 if so, returning the number of intermediate vectors in *NVECTORS_OUT
371 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
375 can_duplicate_and_interleave_p (vec_info
*vinfo
, unsigned int count
,
376 tree elt_type
, unsigned int *nvectors_out
,
377 tree
*vector_type_out
,
380 tree base_vector_type
= get_vectype_for_scalar_type (vinfo
, elt_type
, count
);
381 if (!base_vector_type
|| !VECTOR_MODE_P (TYPE_MODE (base_vector_type
)))
384 machine_mode base_vector_mode
= TYPE_MODE (base_vector_type
);
385 poly_int64 elt_bytes
= count
* GET_MODE_UNIT_SIZE (base_vector_mode
);
386 unsigned int nvectors
= 1;
389 scalar_int_mode int_mode
;
390 poly_int64 elt_bits
= elt_bytes
* BITS_PER_UNIT
;
391 if (int_mode_for_size (elt_bits
, 1).exists (&int_mode
))
393 /* Get the natural vector type for this SLP group size. */
394 tree int_type
= build_nonstandard_integer_type
395 (GET_MODE_BITSIZE (int_mode
), 1);
397 = get_vectype_for_scalar_type (vinfo
, int_type
, count
);
399 && VECTOR_MODE_P (TYPE_MODE (vector_type
))
400 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type
)),
401 GET_MODE_SIZE (base_vector_mode
)))
403 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
404 together into elements of type INT_TYPE and using the result
405 to build NVECTORS vectors. */
406 poly_uint64 nelts
= GET_MODE_NUNITS (TYPE_MODE (vector_type
));
407 vec_perm_builder
sel1 (nelts
, 2, 3);
408 vec_perm_builder
sel2 (nelts
, 2, 3);
409 poly_int64 half_nelts
= exact_div (nelts
, 2);
410 for (unsigned int i
= 0; i
< 3; ++i
)
413 sel1
.quick_push (i
+ nelts
);
414 sel2
.quick_push (half_nelts
+ i
);
415 sel2
.quick_push (half_nelts
+ i
+ nelts
);
417 vec_perm_indices
indices1 (sel1
, 2, nelts
);
418 vec_perm_indices
indices2 (sel2
, 2, nelts
);
419 if (can_vec_perm_const_p (TYPE_MODE (vector_type
), indices1
)
420 && can_vec_perm_const_p (TYPE_MODE (vector_type
), indices2
))
423 *nvectors_out
= nvectors
;
425 *vector_type_out
= vector_type
;
428 permutes
[0] = vect_gen_perm_mask_checked (vector_type
,
430 permutes
[1] = vect_gen_perm_mask_checked (vector_type
,
437 if (!multiple_p (elt_bytes
, 2, &elt_bytes
))
443 /* Return true if DTA and DTB match. */
446 vect_def_types_match (enum vect_def_type dta
, enum vect_def_type dtb
)
449 || ((dta
== vect_external_def
|| dta
== vect_constant_def
)
450 && (dtb
== vect_external_def
|| dtb
== vect_constant_def
)));
453 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
454 they are of a valid type and that they match the defs of the first stmt of
455 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
456 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
457 indicates swap is required for cond_expr stmts. Specifically, *SWAP
458 is 1 if STMT is cond and operands of comparison need to be swapped;
459 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
460 If there is any operand swap in this function, *SWAP is set to non-zero
462 If there was a fatal error return -1; if the error could be corrected by
463 swapping operands of father node of this one, return 1; if everything is
466 vect_get_and_check_slp_defs (vec_info
*vinfo
, unsigned char swap
,
468 vec
<stmt_vec_info
> stmts
, unsigned stmt_num
,
469 vec
<slp_oprnd_info
> *oprnds_info
)
471 stmt_vec_info stmt_info
= stmts
[stmt_num
];
473 unsigned int i
, number_of_oprnds
;
474 enum vect_def_type dt
= vect_uninitialized_def
;
475 slp_oprnd_info oprnd_info
;
476 int first_op_idx
= 1;
477 unsigned int commutative_op
= -1U;
478 bool first_op_cond
= false;
479 bool first
= stmt_num
== 0;
481 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
483 number_of_oprnds
= gimple_call_num_args (stmt
);
485 if (gimple_call_internal_p (stmt
))
487 internal_fn ifn
= gimple_call_internal_fn (stmt
);
488 commutative_op
= first_commutative_argument (ifn
);
490 /* Masked load, only look at mask. */
491 if (ifn
== IFN_MASK_LOAD
)
493 number_of_oprnds
= 1;
494 /* Mask operand index. */
499 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
501 enum tree_code code
= gimple_assign_rhs_code (stmt
);
502 number_of_oprnds
= gimple_num_ops (stmt
) - 1;
503 /* Swap can only be done for cond_expr if asked to, otherwise we
504 could result in different comparison code to the first stmt. */
505 if (code
== COND_EXPR
506 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt
)))
508 first_op_cond
= true;
512 commutative_op
= commutative_tree_code (code
) ? 0U : -1U;
514 else if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
515 number_of_oprnds
= gimple_phi_num_args (stmt
);
519 bool swapped
= (swap
!= 0);
520 bool backedge
= false;
521 gcc_assert (!swapped
|| first_op_cond
);
522 enum vect_def_type
*dts
= XALLOCAVEC (enum vect_def_type
, number_of_oprnds
);
523 for (i
= 0; i
< number_of_oprnds
; i
++)
527 /* Map indicating how operands of cond_expr should be swapped. */
528 int maps
[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
529 int *map
= maps
[swap
];
532 oprnd
= TREE_OPERAND (gimple_op (stmt_info
->stmt
,
533 first_op_idx
), map
[i
]);
535 oprnd
= gimple_op (stmt_info
->stmt
, map
[i
]);
537 else if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
539 oprnd
= gimple_phi_arg_def (stmt
, i
);
540 backedge
= dominated_by_p (CDI_DOMINATORS
,
541 gimple_phi_arg_edge (stmt
, i
)->src
,
542 gimple_bb (stmt_info
->stmt
));
545 oprnd
= gimple_op (stmt_info
->stmt
, first_op_idx
+ (swapped
? !i
: i
));
546 if (TREE_CODE (oprnd
) == VIEW_CONVERT_EXPR
)
547 oprnd
= TREE_OPERAND (oprnd
, 0);
549 oprnd_info
= (*oprnds_info
)[i
];
551 stmt_vec_info def_stmt_info
;
552 if (!vect_is_simple_use (oprnd
, vinfo
, &dts
[i
], &def_stmt_info
))
554 if (dump_enabled_p ())
555 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
556 "Build SLP failed: can't analyze def for %T\n",
564 oprnd_info
->def_stmts
.quick_push (NULL
);
565 oprnd_info
->ops
.quick_push (NULL_TREE
);
566 oprnd_info
->first_dt
= vect_uninitialized_def
;
570 oprnd_info
->def_stmts
.quick_push (def_stmt_info
);
571 oprnd_info
->ops
.quick_push (oprnd
);
574 && is_pattern_stmt_p (def_stmt_info
))
576 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info
))
578 oprnd_info
->any_pattern
= true;
580 /* If we promote this to external use the original stmt def. */
581 oprnd_info
->ops
.last ()
582 = gimple_get_lhs (vect_orig_stmt (def_stmt_info
)->stmt
);
585 /* If there's a extern def on a backedge make sure we can
586 code-generate at the region start.
587 ??? This is another case that could be fixed by adjusting
588 how we split the function but at the moment we'd have conflicting
591 && dts
[i
] == vect_external_def
592 && is_a
<bb_vec_info
> (vinfo
)
593 && TREE_CODE (oprnd
) == SSA_NAME
594 && !SSA_NAME_IS_DEFAULT_DEF (oprnd
)
595 && !dominated_by_p (CDI_DOMINATORS
,
596 as_a
<bb_vec_info
> (vinfo
)->bbs
[0],
597 gimple_bb (SSA_NAME_DEF_STMT (oprnd
))))
599 if (dump_enabled_p ())
600 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
601 "Build SLP failed: extern def %T only defined "
602 "on backedge\n", oprnd
);
608 tree type
= TREE_TYPE (oprnd
);
610 if ((dt
== vect_constant_def
611 || dt
== vect_external_def
)
612 && !GET_MODE_SIZE (vinfo
->vector_mode
).is_constant ()
613 && (TREE_CODE (type
) == BOOLEAN_TYPE
614 || !can_duplicate_and_interleave_p (vinfo
, stmts
.length (),
617 if (dump_enabled_p ())
618 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
619 "Build SLP failed: invalid type of def "
620 "for variable-length SLP %T\n", oprnd
);
624 /* For the swapping logic below force vect_reduction_def
625 for the reduction op in a SLP reduction group. */
626 if (!STMT_VINFO_DATA_REF (stmt_info
)
627 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
628 && (int)i
== STMT_VINFO_REDUC_IDX (stmt_info
)
630 dts
[i
] = dt
= vect_reduction_def
;
632 /* Check the types of the definition. */
635 case vect_external_def
:
636 case vect_constant_def
:
637 case vect_internal_def
:
638 case vect_reduction_def
:
639 case vect_induction_def
:
640 case vect_nested_cycle
:
644 /* FORNOW: Not supported. */
645 if (dump_enabled_p ())
646 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
647 "Build SLP failed: illegal type of def %T\n",
652 oprnd_info
->first_dt
= dt
;
653 oprnd_info
->first_op_type
= type
;
659 /* Now match the operand definition types to that of the first stmt. */
660 for (i
= 0; i
< number_of_oprnds
;)
668 oprnd_info
= (*oprnds_info
)[i
];
670 stmt_vec_info def_stmt_info
= oprnd_info
->def_stmts
[stmt_num
];
671 oprnd
= oprnd_info
->ops
[stmt_num
];
672 tree type
= TREE_TYPE (oprnd
);
674 if (!types_compatible_p (oprnd_info
->first_op_type
, type
))
676 if (dump_enabled_p ())
677 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
678 "Build SLP failed: different operand types\n");
682 /* Not first stmt of the group, check that the def-stmt/s match
683 the def-stmt/s of the first stmt. Allow different definition
684 types for reduction chains: the first stmt must be a
685 vect_reduction_def (a phi node), and the rest
686 end in the reduction chain. */
687 if ((!vect_def_types_match (oprnd_info
->first_dt
, dt
)
688 && !(oprnd_info
->first_dt
== vect_reduction_def
689 && !STMT_VINFO_DATA_REF (stmt_info
)
690 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
692 && !STMT_VINFO_DATA_REF (def_stmt_info
)
693 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
694 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
))))
695 || (!STMT_VINFO_DATA_REF (stmt_info
)
696 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
698 || STMT_VINFO_DATA_REF (def_stmt_info
)
699 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
700 != REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
701 != (oprnd_info
->first_dt
!= vect_reduction_def
))))
703 /* Try swapping operands if we got a mismatch. For BB
704 vectorization only in case it will clearly improve things. */
705 if (i
== commutative_op
&& !swapped
706 && (!is_a
<bb_vec_info
> (vinfo
)
707 || (!vect_def_types_match ((*oprnds_info
)[i
+1]->first_dt
,
709 && (vect_def_types_match (oprnd_info
->first_dt
, dts
[i
+1])
710 || vect_def_types_match
711 ((*oprnds_info
)[i
+1]->first_dt
, dts
[i
])))))
713 if (dump_enabled_p ())
714 dump_printf_loc (MSG_NOTE
, vect_location
,
715 "trying swapped operands\n");
716 std::swap (dts
[i
], dts
[i
+1]);
717 std::swap ((*oprnds_info
)[i
]->def_stmts
[stmt_num
],
718 (*oprnds_info
)[i
+1]->def_stmts
[stmt_num
]);
719 std::swap ((*oprnds_info
)[i
]->ops
[stmt_num
],
720 (*oprnds_info
)[i
+1]->ops
[stmt_num
]);
725 if (is_a
<bb_vec_info
> (vinfo
)
726 && !oprnd_info
->any_pattern
)
728 /* Now for commutative ops we should see whether we can
729 make the other operand matching. */
730 if (dump_enabled_p ())
731 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
732 "treating operand as external\n");
733 oprnd_info
->first_dt
= dt
= vect_external_def
;
737 if (dump_enabled_p ())
738 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
739 "Build SLP failed: different types\n");
744 /* Make sure to demote the overall operand to external. */
745 if (dt
== vect_external_def
)
746 oprnd_info
->first_dt
= vect_external_def
;
747 /* For a SLP reduction chain we want to duplicate the reduction to
748 each of the chain members. That gets us a sane SLP graph (still
749 the stmts are not 100% correct wrt the initial values). */
750 else if ((dt
== vect_internal_def
751 || dt
== vect_reduction_def
)
752 && oprnd_info
->first_dt
== vect_reduction_def
753 && !STMT_VINFO_DATA_REF (stmt_info
)
754 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
755 && !STMT_VINFO_DATA_REF (def_stmt_info
)
756 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
757 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
759 oprnd_info
->def_stmts
[stmt_num
] = oprnd_info
->def_stmts
[0];
760 oprnd_info
->ops
[stmt_num
] = oprnd_info
->ops
[0];
769 if (dump_enabled_p ())
770 dump_printf_loc (MSG_NOTE
, vect_location
,
771 "swapped operands to match def types in %G",
778 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
779 Return true if we can, meaning that this choice doesn't conflict with
780 existing SLP nodes that use STMT_INFO. */
783 vect_update_shared_vectype (stmt_vec_info stmt_info
, tree vectype
)
785 tree old_vectype
= STMT_VINFO_VECTYPE (stmt_info
);
787 return useless_type_conversion_p (vectype
, old_vectype
);
789 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
791 /* We maintain the invariant that if any statement in the group is
792 used, all other members of the group have the same vector type. */
793 stmt_vec_info first_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
794 stmt_vec_info member_info
= first_info
;
795 for (; member_info
; member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
796 if (is_pattern_stmt_p (member_info
)
797 && !useless_type_conversion_p (vectype
,
798 STMT_VINFO_VECTYPE (member_info
)))
803 for (member_info
= first_info
; member_info
;
804 member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
805 STMT_VINFO_VECTYPE (member_info
) = vectype
;
809 else if (!is_pattern_stmt_p (stmt_info
))
811 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
815 if (dump_enabled_p ())
817 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
818 "Build SLP failed: incompatible vector"
819 " types for: %G", stmt_info
->stmt
);
820 dump_printf_loc (MSG_NOTE
, vect_location
,
821 " old vector type: %T\n", old_vectype
);
822 dump_printf_loc (MSG_NOTE
, vect_location
,
823 " new vector type: %T\n", vectype
);
828 /* Return true if call statements CALL1 and CALL2 are similar enough
829 to be combined into the same SLP group. */
832 compatible_calls_p (gcall
*call1
, gcall
*call2
)
834 unsigned int nargs
= gimple_call_num_args (call1
);
835 if (nargs
!= gimple_call_num_args (call2
))
838 if (gimple_call_combined_fn (call1
) != gimple_call_combined_fn (call2
))
841 if (gimple_call_internal_p (call1
))
843 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1
)),
844 TREE_TYPE (gimple_call_lhs (call2
))))
846 for (unsigned int i
= 0; i
< nargs
; ++i
)
847 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1
, i
)),
848 TREE_TYPE (gimple_call_arg (call2
, i
))))
853 if (!operand_equal_p (gimple_call_fn (call1
),
854 gimple_call_fn (call2
), 0))
857 if (gimple_call_fntype (call1
) != gimple_call_fntype (call2
))
863 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
864 caller's attempt to find the vector type in STMT_INFO with the narrowest
865 element type. Return true if VECTYPE is nonnull and if it is valid
866 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
867 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
868 vect_build_slp_tree. */
871 vect_record_max_nunits (vec_info
*vinfo
, stmt_vec_info stmt_info
,
872 unsigned int group_size
,
873 tree vectype
, poly_uint64
*max_nunits
)
877 if (dump_enabled_p ())
878 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
879 "Build SLP failed: unsupported data-type in %G\n",
881 /* Fatal mismatch. */
885 /* If populating the vector type requires unrolling then fail
886 before adjusting *max_nunits for basic-block vectorization. */
887 if (is_a
<bb_vec_info
> (vinfo
)
888 && !multiple_p (group_size
, TYPE_VECTOR_SUBPARTS (vectype
)))
890 if (dump_enabled_p ())
891 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
892 "Build SLP failed: unrolling required "
893 "in basic block SLP\n");
894 /* Fatal mismatch. */
898 /* In case of multiple types we need to detect the smallest type. */
899 vect_update_max_nunits (max_nunits
, vectype
);
903 /* Verify if the scalar stmts STMTS are isomorphic, require data
904 permutation or are of unsupported types of operation. Return
905 true if they are, otherwise return false and indicate in *MATCHES
906 which stmts are not isomorphic to the first one. If MATCHES[0]
907 is false then this indicates the comparison could not be
908 carried out or the stmts will never be vectorized by SLP.
910 Note COND_EXPR is possibly isomorphic to another one after swapping its
911 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
912 the first stmt by swapping the two operands of comparison; set SWAP[i]
913 to 2 if stmt I is isomorphic to the first stmt by inverting the code
914 of comparison. Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
915 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
918 vect_build_slp_tree_1 (vec_info
*vinfo
, unsigned char *swap
,
919 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
920 poly_uint64
*max_nunits
, bool *matches
,
921 bool *two_operators
, tree
*node_vectype
)
924 stmt_vec_info first_stmt_info
= stmts
[0];
925 enum tree_code first_stmt_code
= ERROR_MARK
;
926 enum tree_code alt_stmt_code
= ERROR_MARK
;
927 enum tree_code rhs_code
= ERROR_MARK
;
928 enum tree_code first_cond_code
= ERROR_MARK
;
930 bool need_same_oprnds
= false;
931 tree vectype
= NULL_TREE
, first_op1
= NULL_TREE
;
934 machine_mode optab_op2_mode
;
935 machine_mode vec_mode
;
936 stmt_vec_info first_load
= NULL
, prev_first_load
= NULL
;
937 bool first_stmt_load_p
= false, load_p
= false;
938 bool first_stmt_phi_p
= false, phi_p
= false;
939 bool maybe_soft_fail
= false;
940 tree soft_fail_nunits_vectype
= NULL_TREE
;
942 /* For every stmt in NODE find its def stmt/s. */
943 stmt_vec_info stmt_info
;
944 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
946 gimple
*stmt
= stmt_info
->stmt
;
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE
, vect_location
, "Build SLP for %G", stmt
);
953 /* Fail to vectorize statements marked as unvectorizable, throw
955 if (!STMT_VINFO_VECTORIZABLE (stmt_info
)
956 || stmt_can_throw_internal (cfun
, stmt
)
957 || gimple_has_volatile_ops (stmt
))
959 if (dump_enabled_p ())
960 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
961 "Build SLP failed: unvectorizable statement %G",
963 /* ??? For BB vectorization we want to commutate operands in a way
964 to shuffle all unvectorizable defs into one operand and have
965 the other still vectorized. The following doesn't reliably
966 work for this though but it's the easiest we can do here. */
967 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
969 /* Fatal mismatch. */
974 lhs
= gimple_get_lhs (stmt
);
975 if (lhs
== NULL_TREE
)
977 if (dump_enabled_p ())
978 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
979 "Build SLP failed: not GIMPLE_ASSIGN nor "
980 "GIMPLE_CALL %G", stmt
);
981 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
983 /* Fatal mismatch. */
989 if (!vect_get_vector_types_for_stmt (vinfo
, stmt_info
, &vectype
,
990 &nunits_vectype
, group_size
))
992 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
994 /* Fatal mismatch. */
998 /* Record nunits required but continue analysis, producing matches[]
999 as if nunits was not an issue. This allows splitting of groups
1002 && !vect_record_max_nunits (vinfo
, stmt_info
, group_size
,
1003 nunits_vectype
, max_nunits
))
1005 gcc_assert (is_a
<bb_vec_info
> (vinfo
));
1006 maybe_soft_fail
= true;
1007 soft_fail_nunits_vectype
= nunits_vectype
;
1010 gcc_assert (vectype
);
1012 gcall
*call_stmt
= dyn_cast
<gcall
*> (stmt
);
1015 rhs_code
= CALL_EXPR
;
1017 if (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
))
1019 else if ((gimple_call_internal_p (call_stmt
)
1020 && (!vectorizable_internal_fn_p
1021 (gimple_call_internal_fn (call_stmt
))))
1022 || gimple_call_tail_p (call_stmt
)
1023 || gimple_call_noreturn_p (call_stmt
)
1024 || !gimple_call_nothrow_p (call_stmt
)
1025 || gimple_call_chain (call_stmt
))
1027 if (dump_enabled_p ())
1028 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1029 "Build SLP failed: unsupported call type %G",
1031 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1033 /* Fatal mismatch. */
1038 else if (gimple_code (stmt
) == GIMPLE_PHI
)
1040 rhs_code
= ERROR_MARK
;
1045 rhs_code
= gimple_assign_rhs_code (stmt
);
1046 load_p
= gimple_vuse (stmt
);
1049 /* Check the operation. */
1052 *node_vectype
= vectype
;
1053 first_stmt_code
= rhs_code
;
1054 first_stmt_load_p
= load_p
;
1055 first_stmt_phi_p
= phi_p
;
1057 /* Shift arguments should be equal in all the packed stmts for a
1058 vector shift with scalar shift operand. */
1059 if (rhs_code
== LSHIFT_EXPR
|| rhs_code
== RSHIFT_EXPR
1060 || rhs_code
== LROTATE_EXPR
1061 || rhs_code
== RROTATE_EXPR
)
1063 vec_mode
= TYPE_MODE (vectype
);
1065 /* First see if we have a vector/vector shift. */
1066 optab
= optab_for_tree_code (rhs_code
, vectype
,
1070 || optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
1072 /* No vector/vector shift, try for a vector/scalar shift. */
1073 optab
= optab_for_tree_code (rhs_code
, vectype
,
1078 if (dump_enabled_p ())
1079 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1080 "Build SLP failed: no optab.\n");
1081 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1083 /* Fatal mismatch. */
1087 icode
= (int) optab_handler (optab
, vec_mode
);
1088 if (icode
== CODE_FOR_nothing
)
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1092 "Build SLP failed: "
1093 "op not supported by target.\n");
1094 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1096 /* Fatal mismatch. */
1100 optab_op2_mode
= insn_data
[icode
].operand
[2].mode
;
1101 if (!VECTOR_MODE_P (optab_op2_mode
))
1103 need_same_oprnds
= true;
1104 first_op1
= gimple_assign_rhs2 (stmt
);
1108 else if (rhs_code
== WIDEN_LSHIFT_EXPR
)
1110 need_same_oprnds
= true;
1111 first_op1
= gimple_assign_rhs2 (stmt
);
1114 && rhs_code
== BIT_FIELD_REF
)
1116 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt
), 0);
1117 if (!is_a
<bb_vec_info
> (vinfo
)
1118 || TREE_CODE (vec
) != SSA_NAME
1119 || !operand_equal_p (TYPE_SIZE (vectype
),
1120 TYPE_SIZE (TREE_TYPE (vec
))))
1122 if (dump_enabled_p ())
1123 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1124 "Build SLP failed: "
1125 "BIT_FIELD_REF not supported\n");
1126 /* Fatal mismatch. */
1132 && gimple_call_internal_p (call_stmt
, IFN_DIV_POW2
))
1134 need_same_oprnds
= true;
1135 first_op1
= gimple_call_arg (call_stmt
, 1);
1140 if (first_stmt_code
!= rhs_code
1141 && alt_stmt_code
== ERROR_MARK
)
1142 alt_stmt_code
= rhs_code
;
1143 if ((first_stmt_code
!= rhs_code
1144 && (first_stmt_code
!= IMAGPART_EXPR
1145 || rhs_code
!= REALPART_EXPR
)
1146 && (first_stmt_code
!= REALPART_EXPR
1147 || rhs_code
!= IMAGPART_EXPR
)
1148 /* Handle mismatches in plus/minus by computing both
1149 and merging the results. */
1150 && !((first_stmt_code
== PLUS_EXPR
1151 || first_stmt_code
== MINUS_EXPR
)
1152 && (alt_stmt_code
== PLUS_EXPR
1153 || alt_stmt_code
== MINUS_EXPR
)
1154 && rhs_code
== alt_stmt_code
)
1155 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1156 && (first_stmt_code
== ARRAY_REF
1157 || first_stmt_code
== BIT_FIELD_REF
1158 || first_stmt_code
== INDIRECT_REF
1159 || first_stmt_code
== COMPONENT_REF
1160 || first_stmt_code
== MEM_REF
)))
1161 || first_stmt_load_p
!= load_p
1162 || first_stmt_phi_p
!= phi_p
)
1164 if (dump_enabled_p ())
1166 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1167 "Build SLP failed: different operation "
1168 "in stmt %G", stmt
);
1169 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1170 "original stmt %G", first_stmt_info
->stmt
);
1176 if (need_same_oprnds
)
1178 tree other_op1
= (call_stmt
1179 ? gimple_call_arg (call_stmt
, 1)
1180 : gimple_assign_rhs2 (stmt
));
1181 if (!operand_equal_p (first_op1
, other_op1
, 0))
1183 if (dump_enabled_p ())
1184 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1185 "Build SLP failed: different shift "
1186 "arguments in %G", stmt
);
1192 && first_stmt_code
== BIT_FIELD_REF
1193 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info
->stmt
), 0)
1194 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0)))
1196 if (dump_enabled_p ())
1197 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1198 "Build SLP failed: different BIT_FIELD_REF "
1199 "arguments in %G", stmt
);
1204 if (!load_p
&& rhs_code
== CALL_EXPR
)
1206 if (!compatible_calls_p (as_a
<gcall
*> (stmts
[0]->stmt
),
1207 as_a
<gcall
*> (stmt
)))
1209 if (dump_enabled_p ())
1210 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1211 "Build SLP failed: different calls in %G",
1218 if ((phi_p
|| gimple_could_trap_p (stmt_info
->stmt
))
1219 && (gimple_bb (first_stmt_info
->stmt
)
1220 != gimple_bb (stmt_info
->stmt
)))
1222 if (dump_enabled_p ())
1223 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1224 "Build SLP failed: different BB for PHI "
1225 "or possibly trapping operation in %G", stmt
);
1230 if (!types_compatible_p (vectype
, *node_vectype
))
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1234 "Build SLP failed: different vector type "
1241 /* Grouped store or load. */
1242 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1244 if (REFERENCE_CLASS_P (lhs
))
1252 first_load
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
1253 if (prev_first_load
)
1255 /* Check that there are no loads from different interleaving
1256 chains in the same node. */
1257 if (prev_first_load
!= first_load
)
1259 if (dump_enabled_p ())
1260 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
1262 "Build SLP failed: different "
1263 "interleaving chains in one node %G",
1270 prev_first_load
= first_load
;
1272 } /* Grouped access. */
1277 /* Not grouped load. */
1278 if (dump_enabled_p ())
1279 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1280 "Build SLP failed: not grouped load %G", stmt
);
1282 /* FORNOW: Not grouped loads are not supported. */
1283 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1285 /* Fatal mismatch. */
1290 /* Not memory operation. */
1292 && TREE_CODE_CLASS (rhs_code
) != tcc_binary
1293 && TREE_CODE_CLASS (rhs_code
) != tcc_unary
1294 && TREE_CODE_CLASS (rhs_code
) != tcc_expression
1295 && TREE_CODE_CLASS (rhs_code
) != tcc_comparison
1296 && rhs_code
!= VIEW_CONVERT_EXPR
1297 && rhs_code
!= CALL_EXPR
1298 && rhs_code
!= BIT_FIELD_REF
)
1300 if (dump_enabled_p ())
1301 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1302 "Build SLP failed: operation unsupported %G",
1304 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1306 /* Fatal mismatch. */
1311 if (rhs_code
== COND_EXPR
)
1313 tree cond_expr
= gimple_assign_rhs1 (stmt
);
1314 enum tree_code cond_code
= TREE_CODE (cond_expr
);
1315 enum tree_code swap_code
= ERROR_MARK
;
1316 enum tree_code invert_code
= ERROR_MARK
;
1319 first_cond_code
= TREE_CODE (cond_expr
);
1320 else if (TREE_CODE_CLASS (cond_code
) == tcc_comparison
)
1322 bool honor_nans
= HONOR_NANS (TREE_OPERAND (cond_expr
, 0));
1323 swap_code
= swap_tree_comparison (cond_code
);
1324 invert_code
= invert_tree_comparison (cond_code
, honor_nans
);
1327 if (first_cond_code
== cond_code
)
1329 /* Isomorphic can be achieved by swapping. */
1330 else if (first_cond_code
== swap_code
)
1332 /* Isomorphic can be achieved by inverting. */
1333 else if (first_cond_code
== invert_code
)
1337 if (dump_enabled_p ())
1338 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1339 "Build SLP failed: different"
1340 " operation %G", stmt
);
1350 for (i
= 0; i
< group_size
; ++i
)
1354 /* If we allowed a two-operation SLP node verify the target can cope
1355 with the permute we are going to use. */
1356 if (alt_stmt_code
!= ERROR_MARK
1357 && TREE_CODE_CLASS (alt_stmt_code
) != tcc_reference
)
1359 *two_operators
= true;
1362 if (maybe_soft_fail
)
1364 unsigned HOST_WIDE_INT const_nunits
;
1365 if (!TYPE_VECTOR_SUBPARTS
1366 (soft_fail_nunits_vectype
).is_constant (&const_nunits
)
1367 || const_nunits
> group_size
)
1371 /* With constant vector elements simulate a mismatch at the
1372 point we need to split. */
1373 unsigned tail
= group_size
& (const_nunits
- 1);
1374 memset (&matches
[group_size
- tail
], 0, sizeof (bool) * tail
);
1382 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1383 Note we never remove apart from at destruction time so we do not
1384 need a special value for deleted that differs from empty. */
1387 typedef vec
<stmt_vec_info
> value_type
;
1388 typedef vec
<stmt_vec_info
> compare_type
;
1389 static inline hashval_t
hash (value_type
);
1390 static inline bool equal (value_type existing
, value_type candidate
);
1391 static inline bool is_empty (value_type x
) { return !x
.exists (); }
1392 static inline bool is_deleted (value_type x
) { return !x
.exists (); }
1393 static const bool empty_zero_p
= true;
1394 static inline void mark_empty (value_type
&x
) { x
.release (); }
1395 static inline void mark_deleted (value_type
&x
) { x
.release (); }
1396 static inline void remove (value_type
&x
) { x
.release (); }
1399 bst_traits::hash (value_type x
)
1402 for (unsigned i
= 0; i
< x
.length (); ++i
)
1403 h
.add_int (gimple_uid (x
[i
]->stmt
));
1407 bst_traits::equal (value_type existing
, value_type candidate
)
1409 if (existing
.length () != candidate
.length ())
1411 for (unsigned i
= 0; i
< existing
.length (); ++i
)
1412 if (existing
[i
] != candidate
[i
])
1417 typedef hash_map
<vec
<stmt_vec_info
>, slp_tree
,
1418 simple_hashmap_traits
<bst_traits
, slp_tree
> >
1419 scalar_stmts_to_slp_tree_map_t
;
1422 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1423 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1424 poly_uint64
*max_nunits
,
1425 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1426 scalar_stmts_to_slp_tree_map_t
*bst_map
);
1429 vect_build_slp_tree (vec_info
*vinfo
,
1430 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1431 poly_uint64
*max_nunits
,
1432 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1433 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1435 if (slp_tree
*leader
= bst_map
->get (stmts
))
1437 if (dump_enabled_p ())
1438 dump_printf_loc (MSG_NOTE
, vect_location
, "re-using %sSLP tree %p\n",
1439 *leader
? "" : "failed ", *leader
);
1442 SLP_TREE_REF_COUNT (*leader
)++;
1443 vect_update_max_nunits (max_nunits
, (*leader
)->max_nunits
);
1449 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1450 so we can pick up backedge destinations during discovery. */
1451 slp_tree res
= new _slp_tree
;
1452 SLP_TREE_DEF_TYPE (res
) = vect_internal_def
;
1453 SLP_TREE_SCALAR_STMTS (res
) = stmts
;
1454 bst_map
->put (stmts
.copy (), res
);
1458 if (dump_enabled_p ())
1459 dump_printf_loc (MSG_NOTE
, vect_location
,
1460 "SLP discovery limit exceeded\n");
1461 bool existed_p
= bst_map
->put (stmts
, NULL
);
1462 gcc_assert (existed_p
);
1463 /* Mark the node invalid so we can detect those when still in use
1464 as backedge destinations. */
1465 SLP_TREE_SCALAR_STMTS (res
) = vNULL
;
1466 SLP_TREE_DEF_TYPE (res
) = vect_uninitialized_def
;
1467 vect_free_slp_tree (res
);
1468 memset (matches
, 0, sizeof (bool) * group_size
);
1473 poly_uint64 this_max_nunits
= 1;
1474 slp_tree res_
= vect_build_slp_tree_2 (vinfo
, res
, stmts
, group_size
,
1476 matches
, limit
, tree_size
, bst_map
);
1479 bool existed_p
= bst_map
->put (stmts
, NULL
);
1480 gcc_assert (existed_p
);
1481 /* Mark the node invalid so we can detect those when still in use
1482 as backedge destinations. */
1483 SLP_TREE_SCALAR_STMTS (res
) = vNULL
;
1484 SLP_TREE_DEF_TYPE (res
) = vect_uninitialized_def
;
1485 vect_free_slp_tree (res
);
1489 gcc_assert (res_
== res
);
1490 res
->max_nunits
= this_max_nunits
;
1491 vect_update_max_nunits (max_nunits
, this_max_nunits
);
1492 /* Keep a reference for the bst_map use. */
1493 SLP_TREE_REF_COUNT (res
)++;
1498 /* Recursively build an SLP tree starting from NODE.
1499 Fail (and return a value not equal to zero) if def-stmts are not
1500 isomorphic, require data permutation or are of unsupported types of
1501 operation. Otherwise, return 0.
1502 The value returned is the depth in the SLP tree where a mismatch
1506 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1507 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1508 poly_uint64
*max_nunits
,
1509 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1510 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1512 unsigned nops
, i
, this_tree_size
= 0;
1513 poly_uint64 this_max_nunits
= *max_nunits
;
1517 stmt_vec_info stmt_info
= stmts
[0];
1518 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1519 nops
= gimple_call_num_args (stmt
);
1520 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
1522 nops
= gimple_num_ops (stmt
) - 1;
1523 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
1526 else if (gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1527 nops
= gimple_phi_num_args (phi
);
1531 /* If the SLP node is a PHI (induction or reduction), terminate
1533 bool *skip_args
= XALLOCAVEC (bool, nops
);
1534 memset (skip_args
, 0, sizeof (bool) * nops
);
1535 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
1536 if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1538 tree scalar_type
= TREE_TYPE (PHI_RESULT (stmt
));
1539 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
1541 if (!vect_record_max_nunits (vinfo
, stmt_info
, group_size
, vectype
,
1545 vect_def_type def_type
= STMT_VINFO_DEF_TYPE (stmt_info
);
1546 if (def_type
== vect_induction_def
)
1548 /* Induction PHIs are not cycles but walk the initial
1549 value. Only for inner loops through, for outer loops
1550 we need to pick up the value from the actual PHIs
1551 to more easily support peeling and epilogue vectorization. */
1552 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1553 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1554 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1557 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1559 else if (def_type
== vect_reduction_def
1560 || def_type
== vect_double_reduction_def
1561 || def_type
== vect_nested_cycle
)
1563 /* Else def types have to match. */
1564 stmt_vec_info other_info
;
1565 bool all_same
= true;
1566 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1568 if (STMT_VINFO_DEF_TYPE (other_info
) != def_type
)
1570 if (other_info
!= stmt_info
)
1573 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1574 /* Reduction initial values are not explicitely represented. */
1575 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1576 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1577 /* Reduction chain backedge defs are filled manually.
1578 ??? Need a better way to identify a SLP reduction chain PHI.
1579 Or a better overall way to SLP match those. */
1580 if (all_same
&& def_type
== vect_reduction_def
)
1581 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1583 else if (def_type
!= vect_internal_def
)
1588 bool two_operators
= false;
1589 unsigned char *swap
= XALLOCAVEC (unsigned char, group_size
);
1590 tree vectype
= NULL_TREE
;
1591 if (!vect_build_slp_tree_1 (vinfo
, swap
, stmts
, group_size
,
1592 &this_max_nunits
, matches
, &two_operators
,
1596 /* If the SLP node is a load, terminate the recursion unless masked. */
1597 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1598 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1600 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1603 gcc_assert (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
));
1608 *max_nunits
= this_max_nunits
;
1610 node
= vect_create_new_slp_node (node
, stmts
, 0);
1611 SLP_TREE_VECTYPE (node
) = vectype
;
1612 /* And compute the load permutation. Whether it is actually
1613 a permutation depends on the unrolling factor which is
1615 vec
<unsigned> load_permutation
;
1617 stmt_vec_info load_info
;
1618 load_permutation
.create (group_size
);
1619 stmt_vec_info first_stmt_info
1620 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
1621 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
1623 int load_place
= vect_get_place_in_interleaving_chain
1624 (load_info
, first_stmt_info
);
1625 gcc_assert (load_place
!= -1);
1626 load_permutation
.safe_push (load_place
);
1628 SLP_TREE_LOAD_PERMUTATION (node
) = load_permutation
;
1632 else if (gimple_assign_single_p (stmt_info
->stmt
)
1633 && !gimple_vuse (stmt_info
->stmt
)
1634 && gimple_assign_rhs_code (stmt_info
->stmt
) == BIT_FIELD_REF
)
1636 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1637 the same SSA name vector of a compatible type to vectype. */
1638 vec
<std::pair
<unsigned, unsigned> > lperm
= vNULL
;
1639 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0);
1640 stmt_vec_info estmt_info
;
1641 FOR_EACH_VEC_ELT (stmts
, i
, estmt_info
)
1643 gassign
*estmt
= as_a
<gassign
*> (estmt_info
->stmt
);
1644 tree bfref
= gimple_assign_rhs1 (estmt
);
1646 if (!known_eq (bit_field_size (bfref
),
1647 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype
))))
1648 || !constant_multiple_p (bit_field_offset (bfref
),
1649 bit_field_size (bfref
), &lane
))
1654 lperm
.safe_push (std::make_pair (0, (unsigned)lane
));
1656 slp_tree vnode
= vect_create_new_slp_node (vNULL
);
1657 /* ??? We record vectype here but we hide eventually necessary
1658 punning and instead rely on code generation to materialize
1659 VIEW_CONVERT_EXPRs as necessary. We instead should make
1660 this explicit somehow. */
1661 SLP_TREE_VECTYPE (vnode
) = vectype
;
1662 SLP_TREE_VEC_DEFS (vnode
).safe_push (vec
);
1663 /* We are always building a permutation node even if it is an identity
1664 permute to shield the rest of the vectorizer from the odd node
1665 representing an actual vector without any scalar ops.
1666 ??? We could hide it completely with making the permute node
1668 node
= vect_create_new_slp_node (node
, stmts
, 1);
1669 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1670 SLP_TREE_LANE_PERMUTATION (node
) = lperm
;
1671 SLP_TREE_VECTYPE (node
) = vectype
;
1672 SLP_TREE_CHILDREN (node
).quick_push (vnode
);
1676 /* Get at the operands, verifying they are compatible. */
1677 vec
<slp_oprnd_info
> oprnds_info
= vect_create_oprnd_info (nops
, group_size
);
1678 slp_oprnd_info oprnd_info
;
1679 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
1681 int res
= vect_get_and_check_slp_defs (vinfo
, swap
[i
], skip_args
,
1682 stmts
, i
, &oprnds_info
);
1684 matches
[(res
== -1) ? 0 : i
] = false;
1688 for (i
= 0; i
< group_size
; ++i
)
1691 vect_free_oprnd_info (oprnds_info
);
1696 auto_vec
<slp_tree
, 4> children
;
1698 stmt_info
= stmts
[0];
1700 /* Create SLP_TREE nodes for the definition node/s. */
1701 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
1706 /* We're skipping certain operands from processing, for example
1707 outer loop reduction initial defs. */
1710 children
.safe_push (NULL
);
1714 if (oprnd_info
->first_dt
== vect_uninitialized_def
)
1716 /* COND_EXPR have one too many eventually if the condition
1718 gcc_assert (i
== 3 && nops
== 4);
1722 if (is_a
<bb_vec_info
> (vinfo
)
1723 && oprnd_info
->first_dt
== vect_internal_def
1724 && !oprnd_info
->any_pattern
)
1726 /* For BB vectorization, if all defs are the same do not
1727 bother to continue the build along the single-lane
1728 graph but use a splat of the scalar value. */
1729 stmt_vec_info first_def
= oprnd_info
->def_stmts
[0];
1730 for (j
= 1; j
< group_size
; ++j
)
1731 if (oprnd_info
->def_stmts
[j
] != first_def
)
1734 /* But avoid doing this for loads where we may be
1735 able to CSE things, unless the stmt is not
1737 && (!STMT_VINFO_VECTORIZABLE (first_def
)
1738 || !gimple_vuse (first_def
->stmt
)))
1740 if (dump_enabled_p ())
1741 dump_printf_loc (MSG_NOTE
, vect_location
,
1742 "Using a splat of the uniform operand\n");
1743 oprnd_info
->first_dt
= vect_external_def
;
1747 if (oprnd_info
->first_dt
== vect_external_def
1748 || oprnd_info
->first_dt
== vect_constant_def
)
1750 slp_tree invnode
= vect_create_new_slp_node (oprnd_info
->ops
);
1751 SLP_TREE_DEF_TYPE (invnode
) = oprnd_info
->first_dt
;
1752 oprnd_info
->ops
= vNULL
;
1753 children
.safe_push (invnode
);
1757 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1758 group_size
, &this_max_nunits
,
1760 &this_tree_size
, bst_map
)) != NULL
)
1762 oprnd_info
->def_stmts
= vNULL
;
1763 children
.safe_push (child
);
1767 /* If the SLP build for operand zero failed and operand zero
1768 and one can be commutated try that for the scalar stmts
1769 that failed the match. */
1771 /* A first scalar stmt mismatch signals a fatal mismatch. */
1773 /* ??? For COND_EXPRs we can swap the comparison operands
1774 as well as the arms under some constraints. */
1776 && oprnds_info
[1]->first_dt
== vect_internal_def
1777 && is_gimple_assign (stmt_info
->stmt
)
1778 /* Swapping operands for reductions breaks assumptions later on. */
1779 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
1780 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
1782 /* See whether we can swap the matching or the non-matching
1784 bool swap_not_matching
= true;
1787 for (j
= 0; j
< group_size
; ++j
)
1789 if (matches
[j
] != !swap_not_matching
)
1791 stmt_vec_info stmt_info
= stmts
[j
];
1792 /* Verify if we can swap operands of this stmt. */
1793 gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
1795 || !commutative_tree_code (gimple_assign_rhs_code (stmt
)))
1797 if (!swap_not_matching
)
1799 swap_not_matching
= false;
1804 while (j
!= group_size
);
1806 /* Swap mismatched definition stmts. */
1807 if (dump_enabled_p ())
1808 dump_printf_loc (MSG_NOTE
, vect_location
,
1809 "Re-trying with swapped operands of stmts ");
1810 for (j
= 0; j
< group_size
; ++j
)
1811 if (matches
[j
] == !swap_not_matching
)
1813 std::swap (oprnds_info
[0]->def_stmts
[j
],
1814 oprnds_info
[1]->def_stmts
[j
]);
1815 std::swap (oprnds_info
[0]->ops
[j
],
1816 oprnds_info
[1]->ops
[j
]);
1817 if (dump_enabled_p ())
1818 dump_printf (MSG_NOTE
, "%d ", j
);
1820 if (dump_enabled_p ())
1821 dump_printf (MSG_NOTE
, "\n");
1822 /* And try again with scratch 'matches' ... */
1823 bool *tem
= XALLOCAVEC (bool, group_size
);
1824 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1825 group_size
, &this_max_nunits
,
1827 &this_tree_size
, bst_map
)) != NULL
)
1829 oprnd_info
->def_stmts
= vNULL
;
1830 children
.safe_push (child
);
1836 /* If the SLP build failed and we analyze a basic-block
1837 simply treat nodes we fail to build as externally defined
1838 (and thus build vectors from the scalar defs).
1839 The cost model will reject outright expensive cases.
1840 ??? This doesn't treat cases where permutation ultimatively
1841 fails (or we don't try permutation below). Ideally we'd
1842 even compute a permutation that will end up with the maximum
1844 if (is_a
<bb_vec_info
> (vinfo
)
1845 /* ??? Rejecting patterns this way doesn't work. We'd have to
1846 do extra work to cancel the pattern so the uses see the
1848 && !is_pattern_stmt_p (stmt_info
)
1849 && !oprnd_info
->any_pattern
)
1851 /* But if there's a leading vector sized set of matching stmts
1852 fail here so we can split the group. This matches the condition
1853 vect_analyze_slp_instance uses. */
1854 /* ??? We might want to split here and combine the results to support
1855 multiple vector sizes better. */
1856 for (j
= 0; j
< group_size
; ++j
)
1859 if (!known_ge (j
, TYPE_VECTOR_SUBPARTS (vectype
)))
1861 if (dump_enabled_p ())
1862 dump_printf_loc (MSG_NOTE
, vect_location
,
1863 "Building vector operands from scalars\n");
1865 child
= vect_create_new_slp_node (oprnd_info
->ops
);
1866 children
.safe_push (child
);
1867 oprnd_info
->ops
= vNULL
;
1872 gcc_assert (child
== NULL
);
1873 FOR_EACH_VEC_ELT (children
, j
, child
)
1875 vect_free_slp_tree (child
);
1876 vect_free_oprnd_info (oprnds_info
);
1880 vect_free_oprnd_info (oprnds_info
);
1882 /* If we have all children of a child built up from uniform scalars
1883 or does more than one possibly expensive vector construction then
1884 just throw that away, causing it built up from scalars.
1885 The exception is the SLP node for the vector store. */
1886 if (is_a
<bb_vec_info
> (vinfo
)
1887 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1888 /* ??? Rejecting patterns this way doesn't work. We'd have to
1889 do extra work to cancel the pattern so the uses see the
1891 && !is_pattern_stmt_p (stmt_info
))
1895 bool all_uniform_p
= true;
1896 unsigned n_vector_builds
= 0;
1897 FOR_EACH_VEC_ELT (children
, j
, child
)
1901 else if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
1902 all_uniform_p
= false;
1903 else if (!vect_slp_tree_uniform_p (child
))
1905 all_uniform_p
= false;
1906 if (SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
1911 || n_vector_builds
> 1
1912 || (n_vector_builds
== children
.length ()
1913 && is_a
<gphi
*> (stmt_info
->stmt
)))
1917 FOR_EACH_VEC_ELT (children
, j
, child
)
1919 vect_free_slp_tree (child
);
1921 if (dump_enabled_p ())
1922 dump_printf_loc (MSG_NOTE
, vect_location
,
1923 "Building parent vector operands from "
1924 "scalars instead\n");
1929 *tree_size
+= this_tree_size
+ 1;
1930 *max_nunits
= this_max_nunits
;
1934 /* ??? We'd likely want to either cache in bst_map sth like
1935 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1936 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1937 explicit stmts to put in so the keying on 'stmts' doesn't
1938 work (but we have the same issue with nodes that use 'ops'). */
1939 slp_tree one
= new _slp_tree
;
1940 slp_tree two
= new _slp_tree
;
1941 SLP_TREE_DEF_TYPE (one
) = vect_internal_def
;
1942 SLP_TREE_DEF_TYPE (two
) = vect_internal_def
;
1943 SLP_TREE_VECTYPE (one
) = vectype
;
1944 SLP_TREE_VECTYPE (two
) = vectype
;
1945 SLP_TREE_CHILDREN (one
).safe_splice (children
);
1946 SLP_TREE_CHILDREN (two
).safe_splice (children
);
1948 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two
), i
, child
)
1949 SLP_TREE_REF_COUNT (child
)++;
1951 /* Here we record the original defs since this
1952 node represents the final lane configuration. */
1953 node
= vect_create_new_slp_node (node
, stmts
, 2);
1954 SLP_TREE_VECTYPE (node
) = vectype
;
1955 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1956 SLP_TREE_CHILDREN (node
).quick_push (one
);
1957 SLP_TREE_CHILDREN (node
).quick_push (two
);
1958 gassign
*stmt
= as_a
<gassign
*> (stmts
[0]->stmt
);
1959 enum tree_code code0
= gimple_assign_rhs_code (stmt
);
1960 enum tree_code ocode
= ERROR_MARK
;
1961 stmt_vec_info ostmt_info
;
1963 FOR_EACH_VEC_ELT (stmts
, i
, ostmt_info
)
1965 gassign
*ostmt
= as_a
<gassign
*> (ostmt_info
->stmt
);
1966 if (gimple_assign_rhs_code (ostmt
) != code0
)
1968 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (1, i
));
1969 ocode
= gimple_assign_rhs_code (ostmt
);
1973 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (0, i
));
1975 SLP_TREE_CODE (one
) = code0
;
1976 SLP_TREE_CODE (two
) = ocode
;
1977 SLP_TREE_LANES (one
) = stmts
.length ();
1978 SLP_TREE_LANES (two
) = stmts
.length ();
1979 SLP_TREE_REPRESENTATIVE (one
) = stmts
[0];
1980 SLP_TREE_REPRESENTATIVE (two
) = stmts
[j
];
1984 node
= vect_create_new_slp_node (node
, stmts
, nops
);
1985 SLP_TREE_VECTYPE (node
) = vectype
;
1986 SLP_TREE_CHILDREN (node
).splice (children
);
1990 /* Dump a single SLP tree NODE. */
1993 vect_print_slp_tree (dump_flags_t dump_kind
, dump_location_t loc
,
1998 stmt_vec_info stmt_info
;
2001 dump_metadata_t
metadata (dump_kind
, loc
.get_impl_location ());
2002 dump_user_location_t user_loc
= loc
.get_user_location ();
2003 dump_printf_loc (metadata
, user_loc
, "node%s %p (max_nunits=%u, refcnt=%u)\n",
2004 SLP_TREE_DEF_TYPE (node
) == vect_external_def
2006 : (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
2009 estimated_poly_value (node
->max_nunits
),
2010 SLP_TREE_REF_COUNT (node
));
2011 if (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
)
2013 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
2014 dump_printf_loc (metadata
, user_loc
, "op: VEC_PERM_EXPR\n");
2016 dump_printf_loc (metadata
, user_loc
, "op template: %G",
2017 SLP_TREE_REPRESENTATIVE (node
)->stmt
);
2019 if (SLP_TREE_SCALAR_STMTS (node
).exists ())
2020 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2021 dump_printf_loc (metadata
, user_loc
, "\tstmt %u %G", i
, stmt_info
->stmt
);
2024 dump_printf_loc (metadata
, user_loc
, "\t{ ");
2025 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
2026 dump_printf (metadata
, "%T%s ", op
,
2027 i
< SLP_TREE_SCALAR_OPS (node
).length () - 1 ? "," : "");
2028 dump_printf (metadata
, "}\n");
2030 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2032 dump_printf_loc (metadata
, user_loc
, "\tload permutation {");
2033 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node
), i
, j
)
2034 dump_printf (dump_kind
, " %u", j
);
2035 dump_printf (dump_kind
, " }\n");
2037 if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
2039 dump_printf_loc (metadata
, user_loc
, "\tlane permutation {");
2040 for (i
= 0; i
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++i
)
2041 dump_printf (dump_kind
, " %u[%u]",
2042 SLP_TREE_LANE_PERMUTATION (node
)[i
].first
,
2043 SLP_TREE_LANE_PERMUTATION (node
)[i
].second
);
2044 dump_printf (dump_kind
, " }\n");
2046 if (SLP_TREE_CHILDREN (node
).is_empty ())
2048 dump_printf_loc (metadata
, user_loc
, "\tchildren");
2049 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2050 dump_printf (dump_kind
, " %p", (void *)child
);
2051 dump_printf (dump_kind
, "\n");
2055 debug (slp_tree node
)
2057 debug_dump_context ctx
;
2058 vect_print_slp_tree (MSG_NOTE
,
2059 dump_location_t::from_location_t (UNKNOWN_LOCATION
),
2063 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2066 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
2067 slp_tree node
, hash_set
<slp_tree
> &visited
)
2072 if (visited
.add (node
))
2075 vect_print_slp_tree (dump_kind
, loc
, node
);
2077 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2079 vect_print_slp_graph (dump_kind
, loc
, child
, visited
);
2083 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
2086 hash_set
<slp_tree
> visited
;
2087 vect_print_slp_graph (dump_kind
, loc
, entry
, visited
);
2090 /* Mark the tree rooted at NODE with PURE_SLP. */
2093 vect_mark_slp_stmts (slp_tree node
, hash_set
<slp_tree
> &visited
)
2096 stmt_vec_info stmt_info
;
2099 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2102 if (visited
.add (node
))
2105 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2106 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
2108 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2110 vect_mark_slp_stmts (child
, visited
);
2114 vect_mark_slp_stmts (slp_tree node
)
2116 hash_set
<slp_tree
> visited
;
2117 vect_mark_slp_stmts (node
, visited
);
2120 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2123 vect_mark_slp_stmts_relevant (slp_tree node
, hash_set
<slp_tree
> &visited
)
2126 stmt_vec_info stmt_info
;
2129 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2132 if (visited
.add (node
))
2135 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2137 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info
)
2138 || STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
);
2139 STMT_VINFO_RELEVANT (stmt_info
) = vect_used_in_scope
;
2142 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2144 vect_mark_slp_stmts_relevant (child
, visited
);
2148 vect_mark_slp_stmts_relevant (slp_tree node
)
2150 hash_set
<slp_tree
> visited
;
2151 vect_mark_slp_stmts_relevant (node
, visited
);
2155 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2158 vect_gather_slp_loads (vec
<slp_tree
> &loads
, slp_tree node
,
2159 hash_set
<slp_tree
> &visited
)
2161 if (!node
|| visited
.add (node
))
2164 if (SLP_TREE_CHILDREN (node
).length () == 0)
2166 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2168 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2169 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2170 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
2171 loads
.safe_push (node
);
2177 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2178 vect_gather_slp_loads (loads
, child
, visited
);
2183 /* Find the last store in SLP INSTANCE. */
2186 vect_find_last_scalar_stmt_in_slp (slp_tree node
)
2188 stmt_vec_info last
= NULL
;
2189 stmt_vec_info stmt_vinfo
;
2191 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2193 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2194 last
= last
? get_later_stmt (stmt_vinfo
, last
) : stmt_vinfo
;
2200 /* Find the first stmt in NODE. */
2203 vect_find_first_scalar_stmt_in_slp (slp_tree node
)
2205 stmt_vec_info first
= NULL
;
2206 stmt_vec_info stmt_vinfo
;
2208 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2210 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2212 || get_later_stmt (stmt_vinfo
, first
) == first
)
2219 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2220 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2221 (also containing the first GROUP1_SIZE stmts, since stores are
2222 consecutive), the second containing the remainder.
2223 Return the first stmt in the second group. */
2225 static stmt_vec_info
2226 vect_split_slp_store_group (stmt_vec_info first_vinfo
, unsigned group1_size
)
2228 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo
) == first_vinfo
);
2229 gcc_assert (group1_size
> 0);
2230 int group2_size
= DR_GROUP_SIZE (first_vinfo
) - group1_size
;
2231 gcc_assert (group2_size
> 0);
2232 DR_GROUP_SIZE (first_vinfo
) = group1_size
;
2234 stmt_vec_info stmt_info
= first_vinfo
;
2235 for (unsigned i
= group1_size
; i
> 1; i
--)
2237 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2238 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2240 /* STMT is now the last element of the first group. */
2241 stmt_vec_info group2
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2242 DR_GROUP_NEXT_ELEMENT (stmt_info
) = 0;
2244 DR_GROUP_SIZE (group2
) = group2_size
;
2245 for (stmt_info
= group2
; stmt_info
;
2246 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
))
2248 DR_GROUP_FIRST_ELEMENT (stmt_info
) = group2
;
2249 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2252 /* For the second group, the DR_GROUP_GAP is that before the original group,
2253 plus skipping over the first vector. */
2254 DR_GROUP_GAP (group2
) = DR_GROUP_GAP (first_vinfo
) + group1_size
;
2256 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2257 DR_GROUP_GAP (first_vinfo
) += group2_size
;
2259 if (dump_enabled_p ())
2260 dump_printf_loc (MSG_NOTE
, vect_location
, "Split group into %d and %d\n",
2261 group1_size
, group2_size
);
2266 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2267 statements and a vector of NUNITS elements. */
2270 calculate_unrolling_factor (poly_uint64 nunits
, unsigned int group_size
)
2272 return exact_div (common_multiple (nunits
, group_size
), group_size
);
2275 /* Helper that checks to see if a node is a load node. */
2278 vect_is_slp_load_node (slp_tree root
)
2280 return SLP_TREE_DEF_TYPE (root
) == vect_internal_def
2281 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root
))
2282 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root
)));
2286 /* Helper function of optimize_load_redistribution that performs the operation
2290 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t
*bst_map
,
2291 vec_info
*vinfo
, unsigned int group_size
,
2292 hash_map
<slp_tree
, slp_tree
> *load_map
,
2295 if (slp_tree
*leader
= load_map
->get (root
))
2301 /* For now, we don't know anything about externals so do not do anything. */
2302 if (!root
|| SLP_TREE_DEF_TYPE (root
) != vect_internal_def
)
2304 else if (SLP_TREE_CODE (root
) == VEC_PERM_EXPR
)
2306 /* First convert this node into a load node and add it to the leaves
2307 list and flatten the permute from a lane to a load one. If it's
2308 unneeded it will be elided later. */
2309 vec
<stmt_vec_info
> stmts
;
2310 stmts
.create (SLP_TREE_LANES (root
));
2311 lane_permutation_t lane_perm
= SLP_TREE_LANE_PERMUTATION (root
);
2312 for (unsigned j
= 0; j
< lane_perm
.length (); j
++)
2314 std::pair
<unsigned, unsigned> perm
= lane_perm
[j
];
2315 node
= SLP_TREE_CHILDREN (root
)[perm
.first
];
2317 if (!vect_is_slp_load_node (node
)
2318 || SLP_TREE_CHILDREN (node
).exists ())
2324 stmts
.quick_push (SLP_TREE_SCALAR_STMTS (node
)[perm
.second
]);
2327 if (dump_enabled_p ())
2328 dump_printf_loc (MSG_NOTE
, vect_location
,
2329 "converting stmts on permute node %p\n", root
);
2331 bool *matches
= XALLOCAVEC (bool, group_size
);
2332 poly_uint64 max_nunits
= 1;
2333 unsigned tree_size
= 0, limit
= 1;
2334 node
= vect_build_slp_tree (vinfo
, stmts
, group_size
, &max_nunits
,
2335 matches
, &limit
, &tree_size
, bst_map
);
2339 load_map
->put (root
, node
);
2344 load_map
->put (root
, NULL
);
2346 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root
), i
, node
)
2349 = optimize_load_redistribution_1 (bst_map
, vinfo
, group_size
, load_map
,
2353 SLP_TREE_REF_COUNT (value
)++;
2354 SLP_TREE_CHILDREN (root
)[i
] = value
;
2355 /* ??? We know the original leafs of the replaced nodes will
2356 be referenced by bst_map, only the permutes created by
2357 pattern matching are not. */
2358 if (SLP_TREE_REF_COUNT (node
) == 1)
2359 load_map
->remove (node
);
2360 vect_free_slp_tree (node
);
2367 /* Temporary workaround for loads not being CSEd during SLP build. This
2368 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2369 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2370 same DR such that the final operation is equal to a permuted load. Such
2371 NODES are then directly converted into LOADS themselves. The nodes are
2372 CSEd using BST_MAP. */
2375 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t
*bst_map
,
2376 vec_info
*vinfo
, unsigned int group_size
,
2377 hash_map
<slp_tree
, slp_tree
> *load_map
,
2383 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root
), i
, node
)
2386 = optimize_load_redistribution_1 (bst_map
, vinfo
, group_size
, load_map
,
2390 SLP_TREE_REF_COUNT (value
)++;
2391 SLP_TREE_CHILDREN (root
)[i
] = value
;
2392 /* ??? We know the original leafs of the replaced nodes will
2393 be referenced by bst_map, only the permutes created by
2394 pattern matching are not. */
2395 if (SLP_TREE_REF_COUNT (node
) == 1)
2396 load_map
->remove (node
);
2397 vect_free_slp_tree (node
);
2402 /* Helper function of vect_match_slp_patterns.
2404 Attempts to match patterns against the slp tree rooted in REF_NODE using
2405 VINFO. Patterns are matched in post-order traversal.
2407 If matching is successful the value in REF_NODE is updated and returned, if
2408 not then it is returned unchanged. */
2411 vect_match_slp_patterns_2 (slp_tree
*ref_node
, vec_info
*vinfo
,
2412 slp_tree_to_load_perm_map_t
*perm_cache
,
2413 hash_set
<slp_tree
> *visited
)
2416 slp_tree node
= *ref_node
;
2417 bool found_p
= false;
2418 if (!node
|| visited
->add (node
))
2422 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2423 found_p
|= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node
)[i
],
2424 vinfo
, perm_cache
, visited
);
2426 for (unsigned x
= 0; x
< num__slp_patterns
; x
++)
2428 vect_pattern
*pattern
= slp_patterns
[x
] (perm_cache
, ref_node
);
2431 pattern
->build (vinfo
);
2440 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
2443 The modified tree is returned. Patterns are tried in order and multiple
2444 patterns may match. */
2447 vect_match_slp_patterns (slp_instance instance
, vec_info
*vinfo
,
2448 hash_set
<slp_tree
> *visited
,
2449 slp_tree_to_load_perm_map_t
*perm_cache
)
2451 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2452 slp_tree
*ref_node
= &SLP_INSTANCE_TREE (instance
);
2454 if (dump_enabled_p ())
2455 dump_printf_loc (MSG_NOTE
, vect_location
,
2456 "Analyzing SLP tree %p for patterns\n",
2457 SLP_INSTANCE_TREE (instance
));
2459 return vect_match_slp_patterns_2 (ref_node
, vinfo
, perm_cache
, visited
);
2462 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
2463 splitting into two, with the first split group having size NEW_GROUP_SIZE.
2464 Return true if we could use IFN_STORE_LANES instead and if that appears
2465 to be the better approach. */
2468 vect_slp_prefer_store_lanes_p (vec_info
*vinfo
, stmt_vec_info stmt_info
,
2469 unsigned int group_size
,
2470 unsigned int new_group_size
)
2472 tree scalar_type
= TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info
)));
2473 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
);
2476 /* Allow the split if one of the two new groups would operate on full
2477 vectors *within* rather than across one scalar loop iteration.
2478 This is purely a heuristic, but it should work well for group
2479 sizes of 3 and 4, where the possible splits are:
2481 3->2+1: OK if the vector has exactly two elements
2483 4->3+1: Less clear-cut. */
2484 if (multiple_p (group_size
- new_group_size
, TYPE_VECTOR_SUBPARTS (vectype
))
2485 || multiple_p (new_group_size
, TYPE_VECTOR_SUBPARTS (vectype
)))
2487 return vect_store_lanes_supported (vectype
, group_size
, false);
2490 /* Analyze an SLP instance starting from a group of grouped stores. Call
2491 vect_build_slp_tree to build a tree of packed stmts if possible.
2492 Return FALSE if it's impossible to SLP any stmt in the loop. */
2495 vect_analyze_slp_instance (vec_info
*vinfo
,
2496 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2497 stmt_vec_info stmt_info
, slp_instance_kind kind
,
2498 unsigned max_tree_size
, unsigned *limit
);
2500 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2501 of KIND. Return true if successful. */
2504 vect_build_slp_instance (vec_info
*vinfo
,
2505 slp_instance_kind kind
,
2506 vec
<stmt_vec_info
> &scalar_stmts
,
2507 vec
<stmt_vec_info
> &root_stmt_infos
,
2508 unsigned max_tree_size
, unsigned *limit
,
2509 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2510 /* ??? We need stmt_info for group splitting. */
2511 stmt_vec_info stmt_info_
)
2513 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_NOTE
, vect_location
,
2516 "Starting SLP discovery for\n");
2517 for (unsigned i
= 0; i
< scalar_stmts
.length (); ++i
)
2518 dump_printf_loc (MSG_NOTE
, vect_location
,
2519 " %G", scalar_stmts
[i
]->stmt
);
2522 /* Build the tree for the SLP instance. */
2523 unsigned int group_size
= scalar_stmts
.length ();
2524 bool *matches
= XALLOCAVEC (bool, group_size
);
2525 poly_uint64 max_nunits
= 1;
2526 unsigned tree_size
= 0;
2528 slp_tree node
= vect_build_slp_tree (vinfo
, scalar_stmts
, group_size
,
2529 &max_nunits
, matches
, limit
,
2530 &tree_size
, bst_map
);
2533 /* Calculate the unrolling factor based on the smallest type. */
2534 poly_uint64 unrolling_factor
2535 = calculate_unrolling_factor (max_nunits
, group_size
);
2537 if (maybe_ne (unrolling_factor
, 1U)
2538 && is_a
<bb_vec_info
> (vinfo
))
2540 unsigned HOST_WIDE_INT const_max_nunits
;
2541 if (!max_nunits
.is_constant (&const_max_nunits
)
2542 || const_max_nunits
> group_size
)
2544 if (dump_enabled_p ())
2545 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2546 "Build SLP failed: store group "
2547 "size not a multiple of the vector size "
2548 "in basic block SLP\n");
2549 vect_free_slp_tree (node
);
2552 /* Fatal mismatch. */
2553 if (dump_enabled_p ())
2554 dump_printf_loc (MSG_NOTE
, vect_location
,
2555 "SLP discovery succeeded but node needs "
2557 memset (matches
, true, group_size
);
2558 matches
[group_size
/ const_max_nunits
* const_max_nunits
] = false;
2559 vect_free_slp_tree (node
);
2563 /* Create a new SLP instance. */
2564 slp_instance new_instance
= XNEW (class _slp_instance
);
2565 SLP_INSTANCE_TREE (new_instance
) = node
;
2566 SLP_INSTANCE_UNROLLING_FACTOR (new_instance
) = unrolling_factor
;
2567 SLP_INSTANCE_LOADS (new_instance
) = vNULL
;
2568 SLP_INSTANCE_ROOT_STMTS (new_instance
) = root_stmt_infos
;
2569 SLP_INSTANCE_KIND (new_instance
) = kind
;
2570 new_instance
->reduc_phis
= NULL
;
2571 new_instance
->cost_vec
= vNULL
;
2572 new_instance
->subgraph_entries
= vNULL
;
2574 if (dump_enabled_p ())
2575 dump_printf_loc (MSG_NOTE
, vect_location
,
2576 "SLP size %u vs. limit %u.\n",
2577 tree_size
, max_tree_size
);
2579 /* Fixup SLP reduction chains. */
2580 if (kind
== slp_inst_kind_reduc_chain
)
2582 /* If this is a reduction chain with a conversion in front
2583 amend the SLP tree with a node for that. */
2585 = vect_orig_stmt (scalar_stmts
[group_size
- 1])->stmt
;
2586 if (STMT_VINFO_DEF_TYPE (scalar_stmts
[0]) != vect_reduction_def
)
2588 /* Get at the conversion stmt - we know it's the single use
2589 of the last stmt of the reduction chain. */
2590 use_operand_p use_p
;
2591 bool r
= single_imm_use (gimple_assign_lhs (scalar_def
),
2592 &use_p
, &scalar_def
);
2594 stmt_vec_info next_info
= vinfo
->lookup_stmt (scalar_def
);
2595 next_info
= vect_stmt_to_vectorize (next_info
);
2596 scalar_stmts
= vNULL
;
2597 scalar_stmts
.create (group_size
);
2598 for (unsigned i
= 0; i
< group_size
; ++i
)
2599 scalar_stmts
.quick_push (next_info
);
2600 slp_tree conv
= vect_create_new_slp_node (scalar_stmts
, 1);
2601 SLP_TREE_VECTYPE (conv
) = STMT_VINFO_VECTYPE (next_info
);
2602 SLP_TREE_CHILDREN (conv
).quick_push (node
);
2603 SLP_INSTANCE_TREE (new_instance
) = conv
;
2604 /* We also have to fake this conversion stmt as SLP reduction
2605 group so we don't have to mess with too much code
2607 REDUC_GROUP_FIRST_ELEMENT (next_info
) = next_info
;
2608 REDUC_GROUP_NEXT_ELEMENT (next_info
) = NULL
;
2610 /* Fill the backedge child of the PHI SLP node. The
2611 general matching code cannot find it because the
2612 scalar code does not reflect how we vectorize the
2614 use_operand_p use_p
;
2615 imm_use_iterator imm_iter
;
2616 class loop
*loop
= LOOP_VINFO_LOOP (as_a
<loop_vec_info
> (vinfo
));
2617 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
,
2618 gimple_get_lhs (scalar_def
))
2619 /* There are exactly two non-debug uses, the reduction
2620 PHI and the loop-closed PHI node. */
2621 if (!is_gimple_debug (USE_STMT (use_p
))
2622 && gimple_bb (USE_STMT (use_p
)) == loop
->header
)
2624 auto_vec
<stmt_vec_info
, 64> phis (group_size
);
2625 stmt_vec_info phi_info
2626 = vinfo
->lookup_stmt (USE_STMT (use_p
));
2627 for (unsigned i
= 0; i
< group_size
; ++i
)
2628 phis
.quick_push (phi_info
);
2629 slp_tree
*phi_node
= bst_map
->get (phis
);
2630 unsigned dest_idx
= loop_latch_edge (loop
)->dest_idx
;
2631 SLP_TREE_CHILDREN (*phi_node
)[dest_idx
]
2632 = SLP_INSTANCE_TREE (new_instance
);
2633 SLP_INSTANCE_TREE (new_instance
)->refcnt
++;
2637 vinfo
->slp_instances
.safe_push (new_instance
);
2639 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2640 the number of scalar stmts in the root in a few places.
2641 Verify that assumption holds. */
2642 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance
))
2643 .length () == group_size
);
2645 if (dump_enabled_p ())
2647 dump_printf_loc (MSG_NOTE
, vect_location
,
2648 "Final SLP tree for instance %p:\n", new_instance
);
2649 vect_print_slp_graph (MSG_NOTE
, vect_location
,
2650 SLP_INSTANCE_TREE (new_instance
));
2658 /* Failed to SLP. */
2659 /* Free the allocated memory. */
2660 scalar_stmts
.release ();
2663 stmt_vec_info stmt_info
= stmt_info_
;
2664 /* Try to break the group up into pieces. */
2665 if (kind
== slp_inst_kind_store
)
2667 /* ??? We could delay all the actual splitting of store-groups
2668 until after SLP discovery of the original group completed.
2669 Then we can recurse to vect_build_slp_instance directly. */
2670 for (i
= 0; i
< group_size
; i
++)
2674 /* For basic block SLP, try to break the group up into multiples of
2676 if (is_a
<bb_vec_info
> (vinfo
)
2677 && (i
> 1 && i
< group_size
))
2680 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info
)));
2681 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
2682 1 << floor_log2 (i
));
2683 unsigned HOST_WIDE_INT const_nunits
;
2685 && TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
))
2687 /* Split into two groups at the first vector boundary. */
2688 gcc_assert ((const_nunits
& (const_nunits
- 1)) == 0);
2689 unsigned group1_size
= i
& ~(const_nunits
- 1);
2691 if (dump_enabled_p ())
2692 dump_printf_loc (MSG_NOTE
, vect_location
,
2693 "Splitting SLP group at stmt %u\n", i
);
2694 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2696 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2697 kind
, max_tree_size
,
2699 /* Split the rest at the failure point and possibly
2700 re-analyze the remaining matching part if it has
2701 at least two lanes. */
2703 && (i
+ 1 < group_size
2704 || i
- group1_size
> 1))
2706 stmt_vec_info rest2
= rest
;
2707 rest
= vect_split_slp_store_group (rest
, i
- group1_size
);
2708 if (i
- group1_size
> 1)
2709 res
|= vect_analyze_slp_instance (vinfo
, bst_map
, rest2
,
2710 kind
, max_tree_size
,
2713 /* Re-analyze the non-matching tail if it has at least
2715 if (i
+ 1 < group_size
)
2716 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2717 rest
, kind
, max_tree_size
,
2723 /* For loop vectorization split into arbitrary pieces of size > 1. */
2724 if (is_a
<loop_vec_info
> (vinfo
)
2725 && (i
> 1 && i
< group_size
)
2726 && !vect_slp_prefer_store_lanes_p (vinfo
, stmt_info
, group_size
, i
))
2728 unsigned group1_size
= i
;
2730 if (dump_enabled_p ())
2731 dump_printf_loc (MSG_NOTE
, vect_location
,
2732 "Splitting SLP group at stmt %u\n", i
);
2734 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2736 /* Loop vectorization cannot handle gaps in stores, make sure
2737 the split group appears as strided. */
2738 STMT_VINFO_STRIDED_P (rest
) = 1;
2739 DR_GROUP_GAP (rest
) = 0;
2740 STMT_VINFO_STRIDED_P (stmt_info
) = 1;
2741 DR_GROUP_GAP (stmt_info
) = 0;
2743 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2744 kind
, max_tree_size
, limit
);
2745 if (i
+ 1 < group_size
)
2746 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2747 rest
, kind
, max_tree_size
, limit
);
2752 /* Even though the first vector did not all match, we might be able to SLP
2753 (some) of the remainder. FORNOW ignore this possibility. */
2756 /* Failed to SLP. */
2757 if (dump_enabled_p ())
2758 dump_printf_loc (MSG_NOTE
, vect_location
, "SLP discovery failed\n");
2763 /* Analyze an SLP instance starting from a group of grouped stores. Call
2764 vect_build_slp_tree to build a tree of packed stmts if possible.
2765 Return FALSE if it's impossible to SLP any stmt in the loop. */
2768 vect_analyze_slp_instance (vec_info
*vinfo
,
2769 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2770 stmt_vec_info stmt_info
,
2771 slp_instance_kind kind
,
2772 unsigned max_tree_size
, unsigned *limit
)
2775 vec
<stmt_vec_info
> scalar_stmts
;
2777 if (is_a
<bb_vec_info
> (vinfo
))
2778 vect_location
= stmt_info
->stmt
;
2780 stmt_vec_info next_info
= stmt_info
;
2781 if (kind
== slp_inst_kind_store
)
2783 /* Collect the stores and store them in scalar_stmts. */
2784 scalar_stmts
.create (DR_GROUP_SIZE (stmt_info
));
2787 scalar_stmts
.quick_push (vect_stmt_to_vectorize (next_info
));
2788 next_info
= DR_GROUP_NEXT_ELEMENT (next_info
);
2791 else if (kind
== slp_inst_kind_reduc_chain
)
2793 /* Collect the reduction stmts and store them in scalar_stmts. */
2794 scalar_stmts
.create (REDUC_GROUP_SIZE (stmt_info
));
2797 scalar_stmts
.quick_push (vect_stmt_to_vectorize (next_info
));
2798 next_info
= REDUC_GROUP_NEXT_ELEMENT (next_info
);
2800 /* Mark the first element of the reduction chain as reduction to properly
2801 transform the node. In the reduction analysis phase only the last
2802 element of the chain is marked as reduction. */
2803 STMT_VINFO_DEF_TYPE (stmt_info
)
2804 = STMT_VINFO_DEF_TYPE (scalar_stmts
.last ());
2805 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))
2806 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts
.last ()));
2808 else if (kind
== slp_inst_kind_ctor
)
2810 tree rhs
= gimple_assign_rhs1 (stmt_info
->stmt
);
2812 scalar_stmts
.create (CONSTRUCTOR_NELTS (rhs
));
2813 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), i
, val
)
2815 stmt_vec_info def_info
= vinfo
->lookup_def (val
);
2816 def_info
= vect_stmt_to_vectorize (def_info
);
2817 scalar_stmts
.quick_push (def_info
);
2819 if (dump_enabled_p ())
2820 dump_printf_loc (MSG_NOTE
, vect_location
,
2821 "Analyzing vectorizable constructor: %G\n",
2824 else if (kind
== slp_inst_kind_reduc_group
)
2826 /* Collect reduction statements. */
2827 vec
<stmt_vec_info
> reductions
= as_a
<loop_vec_info
> (vinfo
)->reductions
;
2828 scalar_stmts
.create (reductions
.length ());
2829 for (i
= 0; reductions
.iterate (i
, &next_info
); i
++)
2830 if (STMT_VINFO_RELEVANT_P (next_info
)
2831 || STMT_VINFO_LIVE_P (next_info
))
2832 scalar_stmts
.quick_push (next_info
);
2833 /* If less than two were relevant/live there's nothing to SLP. */
2834 if (scalar_stmts
.length () < 2)
2840 vec
<stmt_vec_info
> roots
= vNULL
;
2841 if (kind
== slp_inst_kind_ctor
)
2844 roots
.quick_push (stmt_info
);
2846 /* Build the tree for the SLP instance. */
2847 bool res
= vect_build_slp_instance (vinfo
, kind
, scalar_stmts
,
2849 max_tree_size
, limit
, bst_map
,
2850 kind
== slp_inst_kind_store
2851 ? stmt_info
: NULL
);
2855 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2856 where we should do store group splitting. */
2861 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
2862 trees of packed scalar stmts if SLP is possible. */
2865 vect_analyze_slp (vec_info
*vinfo
, unsigned max_tree_size
)
2868 stmt_vec_info first_element
;
2869 slp_instance instance
;
2871 DUMP_VECT_SCOPE ("vect_analyze_slp");
2873 unsigned limit
= max_tree_size
;
2875 scalar_stmts_to_slp_tree_map_t
*bst_map
2876 = new scalar_stmts_to_slp_tree_map_t ();
2878 /* Find SLP sequences starting from groups of grouped stores. */
2879 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
2880 vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
2881 STMT_VINFO_GROUPED_ACCESS (first_element
)
2882 ? slp_inst_kind_store
: slp_inst_kind_ctor
,
2883 max_tree_size
, &limit
);
2885 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
2887 for (unsigned i
= 0; i
< bb_vinfo
->roots
.length (); ++i
)
2889 vect_location
= bb_vinfo
->roots
[i
].roots
[0]->stmt
;
2890 if (vect_build_slp_instance (bb_vinfo
, bb_vinfo
->roots
[i
].kind
,
2891 bb_vinfo
->roots
[i
].stmts
,
2892 bb_vinfo
->roots
[i
].roots
,
2893 max_tree_size
, &limit
, bst_map
, NULL
))
2895 bb_vinfo
->roots
[i
].stmts
= vNULL
;
2896 bb_vinfo
->roots
[i
].roots
= vNULL
;
2901 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
2903 /* Find SLP sequences starting from reduction chains. */
2904 FOR_EACH_VEC_ELT (loop_vinfo
->reduction_chains
, i
, first_element
)
2905 if (! STMT_VINFO_RELEVANT_P (first_element
)
2906 && ! STMT_VINFO_LIVE_P (first_element
))
2908 else if (! vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
2909 slp_inst_kind_reduc_chain
,
2910 max_tree_size
, &limit
))
2912 /* Dissolve reduction chain group. */
2913 stmt_vec_info vinfo
= first_element
;
2914 stmt_vec_info last
= NULL
;
2917 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
2918 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2919 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2923 STMT_VINFO_DEF_TYPE (first_element
) = vect_internal_def
;
2924 /* It can be still vectorized as part of an SLP reduction. */
2925 loop_vinfo
->reductions
.safe_push (last
);
2928 /* Find SLP sequences starting from groups of reductions. */
2929 if (loop_vinfo
->reductions
.length () > 1)
2930 vect_analyze_slp_instance (vinfo
, bst_map
, loop_vinfo
->reductions
[0],
2931 slp_inst_kind_reduc_group
, max_tree_size
,
2935 hash_set
<slp_tree
> visited_patterns
;
2936 slp_tree_to_load_perm_map_t perm_cache
;
2938 /* See if any patterns can be found in the SLP tree. */
2939 bool pattern_found
= false;
2940 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo
), i
, instance
)
2941 pattern_found
|= vect_match_slp_patterns (instance
, vinfo
,
2942 &visited_patterns
, &perm_cache
);
2944 /* If any were found optimize permutations of loads. */
2947 hash_map
<slp_tree
, slp_tree
> load_map
;
2948 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo
), i
, instance
)
2950 slp_tree root
= SLP_INSTANCE_TREE (instance
);
2951 optimize_load_redistribution (bst_map
, vinfo
, SLP_TREE_LANES (root
),
2958 /* The map keeps a reference on SLP nodes built, release that. */
2959 for (scalar_stmts_to_slp_tree_map_t::iterator it
= bst_map
->begin ();
2960 it
!= bst_map
->end (); ++it
)
2962 vect_free_slp_tree ((*it
).second
);
2965 if (pattern_found
&& dump_enabled_p ())
2967 dump_printf_loc (MSG_NOTE
, vect_location
,
2968 "Pattern matched SLP tree\n");
2969 hash_set
<slp_tree
> visited
;
2970 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo
), i
, instance
)
2971 vect_print_slp_graph (MSG_NOTE
, vect_location
,
2972 SLP_INSTANCE_TREE (instance
), visited
);
2975 return opt_result::success ();
2978 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2981 vect_slp_build_vertices (hash_set
<slp_tree
> &visited
, slp_tree node
,
2982 vec
<slp_tree
> &vertices
, vec
<int> &leafs
)
2987 if (visited
.add (node
))
2990 node
->vertex
= vertices
.length ();
2991 vertices
.safe_push (node
);
2994 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2998 vect_slp_build_vertices (visited
, child
, vertices
, leafs
);
3001 leafs
.safe_push (node
->vertex
);
3004 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3007 vect_slp_build_vertices (vec_info
*info
, vec
<slp_tree
> &vertices
,
3010 hash_set
<slp_tree
> visited
;
3012 slp_instance instance
;
3013 FOR_EACH_VEC_ELT (info
->slp_instances
, i
, instance
)
3015 unsigned n_v
= vertices
.length ();
3016 unsigned n_l
= leafs
.length ();
3017 vect_slp_build_vertices (visited
, SLP_INSTANCE_TREE (instance
), vertices
,
3019 /* If we added vertices but no entries to the reverse graph we've
3020 added a cycle that is not backwards-reachable. Push the entry
3021 to mimic as leaf then. */
3022 if (vertices
.length () > n_v
3023 && leafs
.length () == n_l
)
3024 leafs
.safe_push (SLP_INSTANCE_TREE (instance
)->vertex
);
3028 /* Apply (reverse) bijectite PERM to VEC. */
3032 vect_slp_permute (vec
<unsigned> perm
,
3033 vec
<T
> &vec
, bool reverse
)
3035 auto_vec
<T
, 64> saved
;
3036 saved
.create (vec
.length ());
3037 for (unsigned i
= 0; i
< vec
.length (); ++i
)
3038 saved
.quick_push (vec
[i
]);
3042 for (unsigned i
= 0; i
< vec
.length (); ++i
)
3043 vec
[perm
[i
]] = saved
[i
];
3044 for (unsigned i
= 0; i
< vec
.length (); ++i
)
3045 gcc_assert (vec
[perm
[i
]] == saved
[i
]);
3049 for (unsigned i
= 0; i
< vec
.length (); ++i
)
3050 vec
[i
] = saved
[perm
[i
]];
3051 for (unsigned i
= 0; i
< vec
.length (); ++i
)
3052 gcc_assert (vec
[i
] == saved
[perm
[i
]]);
3056 /* Return whether permutations PERM_A and PERM_B as recorded in the
3057 PERMS vector are equal. */
3060 vect_slp_perms_eq (const vec
<vec
<unsigned> > &perms
,
3061 int perm_a
, int perm_b
)
3063 return (perm_a
== perm_b
3064 || (perms
[perm_a
].length () == perms
[perm_b
].length ()
3065 && memcmp (&perms
[perm_a
][0], &perms
[perm_b
][0],
3066 sizeof (unsigned) * perms
[perm_a
].length ()) == 0));
3069 /* Optimize the SLP graph of VINFO. */
3072 vect_optimize_slp (vec_info
*vinfo
)
3074 if (vinfo
->slp_instances
.is_empty ())
3079 auto_vec
<slp_tree
> vertices
;
3080 auto_vec
<int> leafs
;
3081 vect_slp_build_vertices (vinfo
, vertices
, leafs
);
3083 struct graph
*slpg
= new_graph (vertices
.length ());
3084 FOR_EACH_VEC_ELT (vertices
, i
, node
)
3088 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
3090 add_edge (slpg
, i
, child
->vertex
);
3093 /* Compute (reverse) postorder on the inverted graph. */
3095 graphds_dfs (slpg
, &leafs
[0], leafs
.length (), &ipo
, false, NULL
, NULL
);
3097 auto_sbitmap
n_visited (vertices
.length ());
3098 auto_sbitmap
n_materialize (vertices
.length ());
3099 auto_vec
<int> n_perm (vertices
.length ());
3100 auto_vec
<vec
<unsigned> > perms
;
3102 bitmap_clear (n_visited
);
3103 bitmap_clear (n_materialize
);
3104 n_perm
.quick_grow_cleared (vertices
.length ());
3105 perms
.safe_push (vNULL
); /* zero is no permute */
3107 /* Produce initial permutations. */
3108 for (i
= 0; i
< leafs
.length (); ++i
)
3111 slp_tree node
= vertices
[idx
];
3113 /* Handle externals and constants optimistically throughout the
3115 if (SLP_TREE_DEF_TYPE (node
) == vect_external_def
3116 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
)
3119 /* Leafs do not change across iterations. Note leafs also double
3120 as entries to the reverse graph. */
3121 if (!slpg
->vertices
[idx
].succ
)
3122 bitmap_set_bit (n_visited
, idx
);
3123 /* Loads are the only thing generating permutes. */
3124 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3127 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3128 node unpermuted, record this permute. */
3129 stmt_vec_info dr_stmt
= SLP_TREE_REPRESENTATIVE (node
);
3130 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt
))
3132 dr_stmt
= DR_GROUP_FIRST_ELEMENT (dr_stmt
);
3133 unsigned imin
= DR_GROUP_SIZE (dr_stmt
) + 1, imax
= 0;
3134 bool any_permute
= false;
3135 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3137 unsigned idx
= SLP_TREE_LOAD_PERMUTATION (node
)[j
];
3138 imin
= MIN (imin
, idx
);
3139 imax
= MAX (imax
, idx
);
3140 if (idx
- SLP_TREE_LOAD_PERMUTATION (node
)[0] != j
)
3143 /* If there's no permute no need to split one out. */
3146 /* If the span doesn't match we'd disrupt VF computation, avoid
3148 if (imax
- imin
+ 1 != SLP_TREE_LANES (node
))
3151 /* For now only handle true permutes, like
3152 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3153 when permuting constants and invariants keeping the permute
3155 auto_sbitmap
load_index (SLP_TREE_LANES (node
));
3156 bitmap_clear (load_index
);
3157 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3158 bitmap_set_bit (load_index
, SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
);
3160 for (j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3161 if (!bitmap_bit_p (load_index
, j
))
3163 if (j
!= SLP_TREE_LANES (node
))
3166 vec
<unsigned> perm
= vNULL
;
3167 perm
.safe_grow (SLP_TREE_LANES (node
), true);
3168 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3169 perm
[j
] = SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
;
3170 perms
.safe_push (perm
);
3171 n_perm
[idx
] = perms
.length () - 1;
3174 /* Propagate permutes along the graph and compute materialization points. */
3176 unsigned iteration
= 0;
3182 for (i
= vertices
.length (); i
> 0 ; --i
)
3185 slp_tree node
= vertices
[idx
];
3186 /* For leafs there's nothing to do - we've seeded permutes
3188 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
3191 bitmap_set_bit (n_visited
, idx
);
3193 /* We cannot move a permute across a store. */
3194 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))
3196 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))))
3200 for (graph_edge
*succ
= slpg
->vertices
[idx
].succ
;
3201 succ
; succ
= succ
->succ_next
)
3203 int succ_idx
= succ
->dest
;
3204 /* Handle unvisited nodes optimistically. */
3205 /* ??? But for constants once we want to handle non-bijective
3206 permutes we have to verify the permute, when unifying lanes,
3207 will not unify different constants. For example see
3208 gcc.dg/vect/bb-slp-14.c for a case that would break. */
3209 if (!bitmap_bit_p (n_visited
, succ_idx
))
3211 int succ_perm
= n_perm
[succ_idx
];
3212 /* Once we materialize succs permutation its output lanes
3213 appear unpermuted to us. */
3214 if (bitmap_bit_p (n_materialize
, succ_idx
))
3218 else if (succ_perm
== 0)
3223 else if (!vect_slp_perms_eq (perms
, perm
, succ_perm
))
3231 /* Pick up pre-computed leaf values. */
3233 else if (!vect_slp_perms_eq (perms
, perm
, n_perm
[idx
]))
3236 /* Make sure we eventually converge. */
3237 gcc_checking_assert (perm
== 0);
3240 bitmap_clear_bit (n_materialize
, idx
);
3247 /* Elide pruning at materialization points in the first
3248 iteration so every node was visited once at least. */
3252 /* Decide on permute materialization. Look whether there's
3253 a use (pred) edge that is permuted differently than us.
3254 In that case mark ourselves so the permutation is applied.
3255 For VEC_PERM_EXPRs the permutation doesn't carry along
3256 from children to parents so force materialization at the
3257 point of the VEC_PERM_EXPR. In principle VEC_PERM_EXPRs
3258 are a source of an arbitrary permutation again, similar
3259 to constants/externals - that's something we do not yet
3260 optimally handle. */
3261 bool all_preds_permuted
= (SLP_TREE_CODE (node
) != VEC_PERM_EXPR
3262 && slpg
->vertices
[idx
].pred
!= NULL
);
3263 if (all_preds_permuted
)
3264 for (graph_edge
*pred
= slpg
->vertices
[idx
].pred
;
3265 pred
; pred
= pred
->pred_next
)
3267 gcc_checking_assert (bitmap_bit_p (n_visited
, pred
->src
));
3268 int pred_perm
= n_perm
[pred
->src
];
3269 if (!vect_slp_perms_eq (perms
, perm
, pred_perm
))
3271 all_preds_permuted
= false;
3275 if (!all_preds_permuted
)
3277 if (!bitmap_bit_p (n_materialize
, idx
))
3279 bitmap_set_bit (n_materialize
, idx
);
3283 while (changed
|| iteration
== 1);
3286 for (i
= 0; i
< vertices
.length (); ++i
)
3288 int perm
= n_perm
[i
];
3292 slp_tree node
= vertices
[i
];
3294 /* First permute invariant/external original successors. */
3297 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
3299 if (!child
|| SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3302 /* If the vector is uniform there's nothing to do. */
3303 if (vect_slp_tree_uniform_p (child
))
3306 /* We can end up sharing some externals via two_operator
3307 handling. Be prepared to unshare those. */
3308 if (child
->refcnt
!= 1)
3310 gcc_assert (slpg
->vertices
[child
->vertex
].pred
->pred_next
);
3311 SLP_TREE_CHILDREN (node
)[j
] = child
3312 = vect_create_new_slp_node
3313 (SLP_TREE_SCALAR_OPS (child
).copy ());
3315 vect_slp_permute (perms
[perm
],
3316 SLP_TREE_SCALAR_OPS (child
), true);
3319 if (bitmap_bit_p (n_materialize
, i
))
3321 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3322 /* For loads simply drop the permutation, the load permutation
3323 already performs the desired permutation. */
3325 else if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
3327 /* If the node is already a permute node we can apply
3328 the permutation to the lane selection, effectively
3329 materializing it on the incoming vectors. */
3330 if (dump_enabled_p ())
3331 dump_printf_loc (MSG_NOTE
, vect_location
,
3332 "simplifying permute node %p\n",
3335 for (unsigned k
= 0;
3336 k
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++k
)
3337 SLP_TREE_LANE_PERMUTATION (node
)[k
].second
3338 = perms
[perm
][SLP_TREE_LANE_PERMUTATION (node
)[k
].second
];
3342 if (dump_enabled_p ())
3343 dump_printf_loc (MSG_NOTE
, vect_location
,
3344 "inserting permute node in place of %p\n",
3347 /* Make a copy of NODE and in-place change it to a
3348 VEC_PERM node to permute the lanes of the copy. */
3349 slp_tree copy
= new _slp_tree
;
3350 SLP_TREE_CHILDREN (copy
) = SLP_TREE_CHILDREN (node
);
3351 SLP_TREE_CHILDREN (node
) = vNULL
;
3352 SLP_TREE_SCALAR_STMTS (copy
)
3353 = SLP_TREE_SCALAR_STMTS (node
).copy ();
3354 vect_slp_permute (perms
[perm
],
3355 SLP_TREE_SCALAR_STMTS (copy
), true);
3356 gcc_assert (!SLP_TREE_SCALAR_OPS (node
).exists ());
3357 SLP_TREE_REPRESENTATIVE (copy
) = SLP_TREE_REPRESENTATIVE (node
);
3358 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node
).exists ());
3359 SLP_TREE_LANE_PERMUTATION (copy
)
3360 = SLP_TREE_LANE_PERMUTATION (node
);
3361 SLP_TREE_LANE_PERMUTATION (node
) = vNULL
;
3362 SLP_TREE_VECTYPE (copy
) = SLP_TREE_VECTYPE (node
);
3364 copy
->max_nunits
= node
->max_nunits
;
3365 SLP_TREE_DEF_TYPE (copy
) = SLP_TREE_DEF_TYPE (node
);
3366 SLP_TREE_LANES (copy
) = SLP_TREE_LANES (node
);
3367 SLP_TREE_CODE (copy
) = SLP_TREE_CODE (node
);
3369 /* Now turn NODE into a VEC_PERM. */
3370 SLP_TREE_CHILDREN (node
).safe_push (copy
);
3371 SLP_TREE_LANE_PERMUTATION (node
).create (SLP_TREE_LANES (node
));
3372 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3373 SLP_TREE_LANE_PERMUTATION (node
)
3374 .quick_push (std::make_pair (0, perms
[perm
][j
]));
3375 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
3380 /* Apply the reverse permutation to our stmts. */
3381 vect_slp_permute (perms
[perm
],
3382 SLP_TREE_SCALAR_STMTS (node
), true);
3383 /* And to the load permutation, which we can simply
3384 make regular by design. */
3385 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3387 /* ??? When we handle non-bijective permutes the idea
3388 is that we can force the load-permutation to be
3389 { min, min + 1, min + 2, ... max }. But then the
3390 scalar defs might no longer match the lane content
3391 which means wrong-code with live lane vectorization.
3392 So we possibly have to have NULL entries for those. */
3393 vect_slp_permute (perms
[perm
],
3394 SLP_TREE_LOAD_PERMUTATION (node
), true);
3399 /* Free the perms vector used for propagation. */
3400 while (!perms
.is_empty ())
3401 perms
.pop ().release ();
3405 /* Now elide load permutations that are not necessary. */
3406 for (i
= 0; i
< leafs
.length (); ++i
)
3408 node
= vertices
[leafs
[i
]];
3409 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3412 /* In basic block vectorization we allow any subchain of an interleaving
3414 FORNOW: not in loop SLP because of realignment complications. */
3415 if (is_a
<bb_vec_info
> (vinfo
))
3417 bool subchain_p
= true;
3418 stmt_vec_info next_load_info
= NULL
;
3419 stmt_vec_info load_info
;
3421 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
3424 && (next_load_info
!= load_info
3425 || DR_GROUP_GAP (load_info
) != 1))
3430 next_load_info
= DR_GROUP_NEXT_ELEMENT (load_info
);
3434 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3440 stmt_vec_info load_info
;
3441 bool this_load_permuted
= false;
3443 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
3444 if (SLP_TREE_LOAD_PERMUTATION (node
)[j
] != j
)
3446 this_load_permuted
= true;
3449 stmt_vec_info first_stmt_info
3450 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
3451 if (!this_load_permuted
3452 /* The load requires permutation when unrolling exposes
3453 a gap either because the group is larger than the SLP
3454 group-size or because there is a gap between the groups. */
3455 && (known_eq (LOOP_VINFO_VECT_FACTOR
3456 (as_a
<loop_vec_info
> (vinfo
)), 1U)
3457 || ((SLP_TREE_LANES (node
) == DR_GROUP_SIZE (first_stmt_info
))
3458 && DR_GROUP_GAP (first_stmt_info
) == 0)))
3460 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3467 /* Gather loads reachable from the individual SLP graph entries. */
3470 vect_gather_slp_loads (vec_info
*vinfo
)
3473 slp_instance instance
;
3474 FOR_EACH_VEC_ELT (vinfo
->slp_instances
, i
, instance
)
3476 hash_set
<slp_tree
> visited
;
3477 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance
),
3478 SLP_INSTANCE_TREE (instance
), visited
);
3483 /* For each possible SLP instance decide whether to SLP it and calculate overall
3484 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3485 least one instance. */
3488 vect_make_slp_decision (loop_vec_info loop_vinfo
)
3491 poly_uint64 unrolling_factor
= 1;
3492 vec
<slp_instance
> slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
3493 slp_instance instance
;
3494 int decided_to_slp
= 0;
3496 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3498 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
3500 /* FORNOW: SLP if you can. */
3501 /* All unroll factors have the form:
3503 GET_MODE_SIZE (vinfo->vector_mode) * X
3505 for some rational X, so they must have a common multiple. */
3507 = force_common_multiple (unrolling_factor
,
3508 SLP_INSTANCE_UNROLLING_FACTOR (instance
));
3510 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3511 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3512 loop-based vectorization. Such stmts will be marked as HYBRID. */
3513 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
3517 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
) = unrolling_factor
;
3519 if (decided_to_slp
&& dump_enabled_p ())
3521 dump_printf_loc (MSG_NOTE
, vect_location
,
3522 "Decided to SLP %d instances. Unrolling factor ",
3524 dump_dec (MSG_NOTE
, unrolling_factor
);
3525 dump_printf (MSG_NOTE
, "\n");
3528 return (decided_to_slp
> 0);
3531 /* Private data for vect_detect_hybrid_slp. */
3534 loop_vec_info loop_vinfo
;
3535 vec
<stmt_vec_info
> *worklist
;
3538 /* Walker for walk_gimple_op. */
3541 vect_detect_hybrid_slp (tree
*tp
, int *, void *data
)
3543 walk_stmt_info
*wi
= (walk_stmt_info
*)data
;
3544 vdhs_data
*dat
= (vdhs_data
*)wi
->info
;
3549 stmt_vec_info def_stmt_info
= dat
->loop_vinfo
->lookup_def (*tp
);
3552 def_stmt_info
= vect_stmt_to_vectorize (def_stmt_info
);
3553 if (PURE_SLP_STMT (def_stmt_info
))
3555 if (dump_enabled_p ())
3556 dump_printf_loc (MSG_NOTE
, vect_location
, "marking hybrid: %G",
3557 def_stmt_info
->stmt
);
3558 STMT_SLP_TYPE (def_stmt_info
) = hybrid
;
3559 dat
->worklist
->safe_push (def_stmt_info
);
3565 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
3566 if so, otherwise pushing it to WORKLIST. */
3569 maybe_push_to_hybrid_worklist (vec_info
*vinfo
,
3570 vec
<stmt_vec_info
> &worklist
,
3571 stmt_vec_info stmt_info
)
3573 if (dump_enabled_p ())
3574 dump_printf_loc (MSG_NOTE
, vect_location
,
3575 "Processing hybrid candidate : %G", stmt_info
->stmt
);
3576 stmt_vec_info orig_info
= vect_orig_stmt (stmt_info
);
3577 imm_use_iterator iter2
;
3579 use_operand_p use_p
;
3580 def_operand_p def_p
;
3581 bool any_def
= false;
3582 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_info
->stmt
, iter1
, SSA_OP_DEF
)
3585 FOR_EACH_IMM_USE_FAST (use_p
, iter2
, DEF_FROM_PTR (def_p
))
3587 if (is_gimple_debug (USE_STMT (use_p
)))
3589 stmt_vec_info use_info
= vinfo
->lookup_stmt (USE_STMT (use_p
));
3590 /* An out-of loop use means this is a loop_vect sink. */
3593 if (dump_enabled_p ())
3594 dump_printf_loc (MSG_NOTE
, vect_location
,
3595 "Found loop_vect sink: %G", stmt_info
->stmt
);
3596 worklist
.safe_push (stmt_info
);
3599 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info
)))
3601 if (dump_enabled_p ())
3602 dump_printf_loc (MSG_NOTE
, vect_location
,
3603 "Found loop_vect use: %G", use_info
->stmt
);
3604 worklist
.safe_push (stmt_info
);
3609 /* No def means this is a loo_vect sink. */
3612 if (dump_enabled_p ())
3613 dump_printf_loc (MSG_NOTE
, vect_location
,
3614 "Found loop_vect sink: %G", stmt_info
->stmt
);
3615 worklist
.safe_push (stmt_info
);
3618 if (dump_enabled_p ())
3619 dump_printf_loc (MSG_NOTE
, vect_location
,
3620 "Marked SLP consumed stmt pure: %G", stmt_info
->stmt
);
3621 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
3624 /* Find stmts that must be both vectorized and SLPed. */
3627 vect_detect_hybrid_slp (loop_vec_info loop_vinfo
)
3629 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3631 /* All stmts participating in SLP are marked pure_slp, all other
3632 stmts are loop_vect.
3633 First collect all loop_vect stmts into a worklist.
3634 SLP patterns cause not all original scalar stmts to appear in
3635 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
3636 Rectify this here and do a backward walk over the IL only considering
3637 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
3638 mark them as pure_slp. */
3639 auto_vec
<stmt_vec_info
> worklist
;
3640 for (int i
= LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
- 1; i
>= 0; --i
)
3642 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
3643 for (gphi_iterator gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
);
3646 gphi
*phi
= gsi
.phi ();
3647 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (phi
);
3648 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
3649 maybe_push_to_hybrid_worklist (loop_vinfo
,
3650 worklist
, stmt_info
);
3652 for (gimple_stmt_iterator gsi
= gsi_last_bb (bb
); !gsi_end_p (gsi
);
3655 gimple
*stmt
= gsi_stmt (gsi
);
3656 if (is_gimple_debug (stmt
))
3658 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
3659 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
3661 for (gimple_stmt_iterator gsi2
3662 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
));
3663 !gsi_end_p (gsi2
); gsi_next (&gsi2
))
3665 stmt_vec_info patt_info
3666 = loop_vinfo
->lookup_stmt (gsi_stmt (gsi2
));
3667 if (!STMT_SLP_TYPE (patt_info
)
3668 && STMT_VINFO_RELEVANT (patt_info
))
3669 maybe_push_to_hybrid_worklist (loop_vinfo
,
3670 worklist
, patt_info
);
3672 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
3674 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
3675 maybe_push_to_hybrid_worklist (loop_vinfo
,
3676 worklist
, stmt_info
);
3680 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3681 mark any SLP vectorized stmt as hybrid.
3682 ??? We're visiting def stmts N times (once for each non-SLP and
3683 once for each hybrid-SLP use). */
3686 dat
.worklist
= &worklist
;
3687 dat
.loop_vinfo
= loop_vinfo
;
3688 memset (&wi
, 0, sizeof (wi
));
3689 wi
.info
= (void *)&dat
;
3690 while (!worklist
.is_empty ())
3692 stmt_vec_info stmt_info
= worklist
.pop ();
3693 /* Since SSA operands are not set up for pattern stmts we need
3694 to use walk_gimple_op. */
3696 walk_gimple_op (stmt_info
->stmt
, vect_detect_hybrid_slp
, &wi
);
3701 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3703 _bb_vec_info::_bb_vec_info (vec
<basic_block
> _bbs
, vec_info_shared
*shared
)
3704 : vec_info (vec_info::bb
, init_cost (NULL
, false), shared
),
3708 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
3711 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
3714 gphi
*phi
= si
.phi ();
3715 gimple_set_uid (phi
, 0);
3718 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3719 !gsi_end_p (gsi
); gsi_next (&gsi
))
3721 gimple
*stmt
= gsi_stmt (gsi
);
3722 gimple_set_uid (stmt
, 0);
3723 if (is_gimple_debug (stmt
))
3731 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3732 stmts in the basic block. */
3734 _bb_vec_info::~_bb_vec_info ()
3736 /* Reset region marker. */
3737 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
3740 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
3743 gphi
*phi
= si
.phi ();
3744 gimple_set_uid (phi
, -1);
3746 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3747 !gsi_end_p (gsi
); gsi_next (&gsi
))
3749 gimple
*stmt
= gsi_stmt (gsi
);
3750 gimple_set_uid (stmt
, -1);
3754 for (unsigned i
= 0; i
< roots
.length (); ++i
)
3756 roots
[i
].stmts
.release ();
3757 roots
[i
].roots
.release ();
3762 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3763 given then that child nodes have already been processed, and that
3764 their def types currently match their SLP node's def type. */
3767 vect_slp_analyze_node_operations_1 (vec_info
*vinfo
, slp_tree node
,
3768 slp_instance node_instance
,
3769 stmt_vector_for_cost
*cost_vec
)
3771 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
3773 /* Calculate the number of vector statements to be created for the
3774 scalar stmts in this node. For SLP reductions it is equal to the
3775 number of vector statements in the children (which has already been
3776 calculated by the recursive call). Otherwise it is the number of
3777 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3778 VF divided by the number of elements in a vector. */
3779 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
3780 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
3782 for (unsigned i
= 0; i
< SLP_TREE_CHILDREN (node
).length (); ++i
)
3783 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node
)[i
]) == vect_internal_def
)
3785 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
3786 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node
)[i
]);
3793 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3794 vf
= loop_vinfo
->vectorization_factor
;
3797 unsigned int group_size
= SLP_TREE_LANES (node
);
3798 tree vectype
= SLP_TREE_VECTYPE (node
);
3799 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
3800 = vect_get_num_vectors (vf
* group_size
, vectype
);
3803 /* Handle purely internal nodes. */
3804 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
3805 return vectorizable_slp_permutation (vinfo
, NULL
, node
, cost_vec
);
3807 gcc_assert (STMT_SLP_TYPE (stmt_info
) != loop_vect
);
3808 if (is_a
<bb_vec_info
> (vinfo
)
3809 && !vect_update_shared_vectype (stmt_info
, SLP_TREE_VECTYPE (node
)))
3811 if (dump_enabled_p ())
3812 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3813 "desired vector type conflicts with earlier one "
3814 "for %G", stmt_info
->stmt
);
3819 return vect_analyze_stmt (vinfo
, stmt_info
, &dummy
,
3820 node
, node_instance
, cost_vec
);
3823 /* Try to build NODE from scalars, returning true on success.
3824 NODE_INSTANCE is the SLP instance that contains NODE. */
3827 vect_slp_convert_to_external (vec_info
*vinfo
, slp_tree node
,
3828 slp_instance node_instance
)
3830 stmt_vec_info stmt_info
;
3833 if (!is_a
<bb_vec_info
> (vinfo
)
3834 || node
== SLP_INSTANCE_TREE (node_instance
)
3835 || !SLP_TREE_SCALAR_STMTS (node
).exists ()
3836 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node
)))
3839 if (dump_enabled_p ())
3840 dump_printf_loc (MSG_NOTE
, vect_location
,
3841 "Building vector operands of %p from scalars instead\n", node
);
3843 /* Don't remove and free the child nodes here, since they could be
3844 referenced by other structures. The analysis and scheduling phases
3845 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3846 unsigned int group_size
= SLP_TREE_LANES (node
);
3847 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
3848 SLP_TREE_SCALAR_OPS (node
).safe_grow (group_size
, true);
3849 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3850 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3852 tree lhs
= gimple_get_lhs (vect_orig_stmt (stmt_info
)->stmt
);
3853 SLP_TREE_SCALAR_OPS (node
)[i
] = lhs
;
3858 /* Compute the prologue cost for invariant or constant operands represented
3862 vect_prologue_cost_for_slp (slp_tree node
,
3863 stmt_vector_for_cost
*cost_vec
)
3865 /* There's a special case of an existing vector, that costs nothing. */
3866 if (SLP_TREE_SCALAR_OPS (node
).length () == 0
3867 && !SLP_TREE_VEC_DEFS (node
).is_empty ())
3869 /* Without looking at the actual initializer a vector of
3870 constants can be implemented as load from the constant pool.
3871 When all elements are the same we can use a splat. */
3872 tree vectype
= SLP_TREE_VECTYPE (node
);
3873 unsigned group_size
= SLP_TREE_SCALAR_OPS (node
).length ();
3874 unsigned num_vects_to_check
;
3875 unsigned HOST_WIDE_INT const_nunits
;
3876 unsigned nelt_limit
;
3877 if (TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
)
3878 && ! multiple_p (const_nunits
, group_size
))
3880 num_vects_to_check
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
3881 nelt_limit
= const_nunits
;
3885 /* If either the vector has variable length or the vectors
3886 are composed of repeated whole groups we only need to
3887 cost construction once. All vectors will be the same. */
3888 num_vects_to_check
= 1;
3889 nelt_limit
= group_size
;
3891 tree elt
= NULL_TREE
;
3893 for (unsigned j
= 0; j
< num_vects_to_check
* nelt_limit
; ++j
)
3895 unsigned si
= j
% group_size
;
3897 elt
= SLP_TREE_SCALAR_OPS (node
)[si
];
3898 /* ??? We're just tracking whether all operands of a single
3899 vector initializer are the same, ideally we'd check if
3900 we emitted the same one already. */
3901 else if (elt
!= SLP_TREE_SCALAR_OPS (node
)[si
])
3904 if (nelt
== nelt_limit
)
3906 record_stmt_cost (cost_vec
, 1,
3907 SLP_TREE_DEF_TYPE (node
) == vect_external_def
3908 ? (elt
? scalar_to_vec
: vec_construct
)
3910 NULL
, vectype
, 0, vect_prologue
);
3916 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3917 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3919 Return true if the operations are supported. */
3922 vect_slp_analyze_node_operations (vec_info
*vinfo
, slp_tree node
,
3923 slp_instance node_instance
,
3924 hash_set
<slp_tree
> &visited_set
,
3925 vec
<slp_tree
> &visited_vec
,
3926 stmt_vector_for_cost
*cost_vec
)
3931 /* Assume we can code-generate all invariants. */
3933 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
3934 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
3937 if (SLP_TREE_DEF_TYPE (node
) == vect_uninitialized_def
)
3939 if (dump_enabled_p ())
3940 dump_printf_loc (MSG_NOTE
, vect_location
,
3941 "Failed cyclic SLP reference in %p\n", node
);
3944 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
);
3946 /* If we already analyzed the exact same set of scalar stmts we're done.
3947 We share the generated vector stmts for those. */
3948 if (visited_set
.add (node
))
3950 visited_vec
.safe_push (node
);
3953 unsigned visited_rec_start
= visited_vec
.length ();
3954 unsigned cost_vec_rec_start
= cost_vec
->length ();
3955 bool seen_non_constant_child
= false;
3956 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3958 res
= vect_slp_analyze_node_operations (vinfo
, child
, node_instance
,
3959 visited_set
, visited_vec
,
3963 if (child
&& SLP_TREE_DEF_TYPE (child
) != vect_constant_def
)
3964 seen_non_constant_child
= true;
3966 /* We're having difficulties scheduling nodes with just constant
3967 operands and no scalar stmts since we then cannot compute a stmt
3969 if (!seen_non_constant_child
&& SLP_TREE_SCALAR_STMTS (node
).is_empty ())
3971 if (dump_enabled_p ())
3972 dump_printf_loc (MSG_NOTE
, vect_location
,
3973 "Cannot vectorize all-constant op node %p\n", node
);
3978 res
= vect_slp_analyze_node_operations_1 (vinfo
, node
, node_instance
,
3980 /* If analysis failed we have to pop all recursive visited nodes
3984 while (visited_vec
.length () >= visited_rec_start
)
3985 visited_set
.remove (visited_vec
.pop ());
3986 cost_vec
->truncate (cost_vec_rec_start
);
3989 /* When the node can be vectorized cost invariant nodes it references.
3990 This is not done in DFS order to allow the refering node
3991 vectorizable_* calls to nail down the invariant nodes vector type
3992 and possibly unshare it if it needs a different vector type than
3995 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
3997 && (SLP_TREE_DEF_TYPE (child
) == vect_constant_def
3998 || SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
3999 /* Perform usual caching, note code-generation still
4000 code-gens these nodes multiple times but we expect
4001 to CSE them later. */
4002 && !visited_set
.add (child
))
4004 visited_vec
.safe_push (child
);
4005 /* ??? After auditing more code paths make a "default"
4006 and push the vector type from NODE to all children
4007 if it is not already set. */
4008 /* Compute the number of vectors to be generated. */
4009 tree vector_type
= SLP_TREE_VECTYPE (child
);
4012 /* For shifts with a scalar argument we don't need
4013 to cost or code-generate anything.
4014 ??? Represent this more explicitely. */
4015 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node
))
4016 == shift_vec_info_type
)
4020 unsigned group_size
= SLP_TREE_LANES (child
);
4022 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
4023 vf
= loop_vinfo
->vectorization_factor
;
4024 SLP_TREE_NUMBER_OF_VEC_STMTS (child
)
4025 = vect_get_num_vectors (vf
* group_size
, vector_type
);
4026 /* And cost them. */
4027 vect_prologue_cost_for_slp (child
, cost_vec
);
4030 /* If this node or any of its children can't be vectorized, try pruning
4031 the tree here rather than felling the whole thing. */
4032 if (!res
&& vect_slp_convert_to_external (vinfo
, node
, node_instance
))
4034 /* We'll need to revisit this for invariant costing and number
4035 of vectorized stmt setting. */
4043 /* Mark lanes of NODE that are live outside of the basic-block vectorized
4044 region and that can be vectorized using vectorizable_live_operation
4045 with STMT_VINFO_LIVE_P. Not handled live operations will cause the
4046 scalar code computing it to be retained. */
4049 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo
, slp_tree node
,
4050 slp_instance instance
,
4051 stmt_vector_for_cost
*cost_vec
,
4052 hash_set
<stmt_vec_info
> &svisited
,
4053 hash_set
<slp_tree
> &visited
)
4055 if (visited
.add (node
))
4059 stmt_vec_info stmt_info
;
4060 stmt_vec_info last_stmt
= vect_find_last_scalar_stmt_in_slp (node
);
4061 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
4063 if (svisited
.contains (stmt_info
))
4065 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
4066 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
)
4067 && STMT_VINFO_RELATED_STMT (orig_stmt_info
) != stmt_info
)
4068 /* Only the pattern root stmt computes the original scalar value. */
4070 bool mark_visited
= true;
4071 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
4072 ssa_op_iter op_iter
;
4073 def_operand_p def_p
;
4074 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
4076 imm_use_iterator use_iter
;
4078 stmt_vec_info use_stmt_info
;
4079 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
4080 if (!is_gimple_debug (use_stmt
))
4082 use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
);
4084 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
4086 STMT_VINFO_LIVE_P (stmt_info
) = true;
4087 if (vectorizable_live_operation (bb_vinfo
, stmt_info
,
4088 NULL
, node
, instance
, i
,
4090 /* ??? So we know we can vectorize the live stmt
4091 from one SLP node. If we cannot do so from all
4092 or none consistently we'd have to record which
4093 SLP node (and lane) we want to use for the live
4094 operation. So make sure we can code-generate
4096 mark_visited
= false;
4098 STMT_VINFO_LIVE_P (stmt_info
) = false;
4102 /* We have to verify whether we can insert the lane extract
4103 before all uses. The following is a conservative approximation.
4104 We cannot put this into vectorizable_live_operation because
4105 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4107 Note that while the fact that we emit code for loads at the
4108 first load should make this a non-problem leafs we construct
4109 from scalars are vectorized after the last scalar def.
4110 ??? If we'd actually compute the insert location during
4111 analysis we could use sth less conservative than the last
4112 scalar stmt in the node for the dominance check. */
4113 /* ??? What remains is "live" uses in vector CTORs in the same
4114 SLP graph which is where those uses can end up code-generated
4115 right after their definition instead of close to their original
4116 use. But that would restrict us to code-generate lane-extracts
4117 from the latest stmt in a node. So we compensate for this
4118 during code-generation, simply not replacing uses for those
4119 hopefully rare cases. */
4120 if (STMT_VINFO_LIVE_P (stmt_info
))
4121 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
4122 if (!is_gimple_debug (use_stmt
)
4123 && (!(use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
))
4124 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
4125 && !vect_stmt_dominates_stmt_p (last_stmt
->stmt
, use_stmt
))
4127 if (dump_enabled_p ())
4128 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4129 "Cannot determine insertion place for "
4131 STMT_VINFO_LIVE_P (stmt_info
) = false;
4132 mark_visited
= true;
4136 svisited
.add (stmt_info
);
4140 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4141 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
4142 vect_bb_slp_mark_live_stmts (bb_vinfo
, child
, instance
,
4143 cost_vec
, svisited
, visited
);
4146 /* Analyze statements in SLP instances of VINFO. Return true if the
4147 operations are supported. */
4150 vect_slp_analyze_operations (vec_info
*vinfo
)
4152 slp_instance instance
;
4155 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4157 hash_set
<slp_tree
> visited
;
4158 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); )
4160 auto_vec
<slp_tree
> visited_vec
;
4161 stmt_vector_for_cost cost_vec
;
4162 cost_vec
.create (2);
4163 if (is_a
<bb_vec_info
> (vinfo
))
4164 vect_location
= instance
->location ();
4165 if (!vect_slp_analyze_node_operations (vinfo
,
4166 SLP_INSTANCE_TREE (instance
),
4167 instance
, visited
, visited_vec
,
4169 /* Instances with a root stmt require vectorized defs for the
4171 /* ??? Do inst->kind check instead. */
4172 || (!SLP_INSTANCE_ROOT_STMTS (instance
).is_empty ()
4173 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance
))
4174 != vect_internal_def
)))
4176 slp_tree node
= SLP_INSTANCE_TREE (instance
);
4177 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
4178 if (dump_enabled_p ())
4179 dump_printf_loc (MSG_NOTE
, vect_location
,
4180 "removing SLP instance operations starting from: %G",
4182 vect_free_slp_instance (instance
);
4183 vinfo
->slp_instances
.ordered_remove (i
);
4184 cost_vec
.release ();
4185 while (!visited_vec
.is_empty ())
4186 visited
.remove (visited_vec
.pop ());
4192 /* For BB vectorization remember the SLP graph entry
4194 if (is_a
<bb_vec_info
> (vinfo
))
4195 instance
->cost_vec
= cost_vec
;
4198 add_stmt_costs (vinfo
, vinfo
->target_cost_data
, &cost_vec
);
4199 cost_vec
.release ();
4204 /* Compute vectorizable live stmts. */
4205 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
4207 hash_set
<stmt_vec_info
> svisited
;
4208 hash_set
<slp_tree
> visited
;
4209 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
4211 vect_location
= instance
->location ();
4212 vect_bb_slp_mark_live_stmts (bb_vinfo
, SLP_INSTANCE_TREE (instance
),
4213 instance
, &instance
->cost_vec
, svisited
,
4218 return !vinfo
->slp_instances
.is_empty ();
4221 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
4222 closing the eventual chain. */
4225 get_ultimate_leader (slp_instance instance
,
4226 hash_map
<slp_instance
, slp_instance
> &instance_leader
)
4228 auto_vec
<slp_instance
*, 8> chain
;
4230 while (*(tem
= instance_leader
.get (instance
)) != instance
)
4232 chain
.safe_push (tem
);
4235 while (!chain
.is_empty ())
4236 *chain
.pop () = instance
;
4240 /* Worker of vect_bb_partition_graph, recurse on NODE. */
4243 vect_bb_partition_graph_r (bb_vec_info bb_vinfo
,
4244 slp_instance instance
, slp_tree node
,
4245 hash_map
<stmt_vec_info
, slp_instance
> &stmt_to_instance
,
4246 hash_map
<slp_instance
, slp_instance
> &instance_leader
,
4247 hash_set
<slp_tree
> &visited
)
4249 stmt_vec_info stmt_info
;
4252 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
4255 slp_instance
&stmt_instance
4256 = stmt_to_instance
.get_or_insert (stmt_info
, &existed_p
);
4259 else if (stmt_instance
!= instance
)
4261 /* If we're running into a previously marked stmt make us the
4262 leader of the current ultimate leader. This keeps the
4263 leader chain acyclic and works even when the current instance
4264 connects two previously independent graph parts. */
4265 slp_instance stmt_leader
4266 = get_ultimate_leader (stmt_instance
, instance_leader
);
4267 if (stmt_leader
!= instance
)
4268 instance_leader
.put (stmt_leader
, instance
);
4270 stmt_instance
= instance
;
4273 if (!SLP_TREE_SCALAR_STMTS (node
).is_empty () && visited
.add (node
))
4277 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4278 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
4279 vect_bb_partition_graph_r (bb_vinfo
, instance
, child
, stmt_to_instance
,
4280 instance_leader
, visited
);
4283 /* Partition the SLP graph into pieces that can be costed independently. */
4286 vect_bb_partition_graph (bb_vec_info bb_vinfo
)
4288 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
4290 /* First walk the SLP graph assigning each involved scalar stmt a
4291 corresponding SLP graph entry and upon visiting a previously
4292 marked stmt, make the stmts leader the current SLP graph entry. */
4293 hash_map
<stmt_vec_info
, slp_instance
> stmt_to_instance
;
4294 hash_map
<slp_instance
, slp_instance
> instance_leader
;
4295 hash_set
<slp_tree
> visited
;
4296 slp_instance instance
;
4297 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
4299 instance_leader
.put (instance
, instance
);
4300 vect_bb_partition_graph_r (bb_vinfo
,
4301 instance
, SLP_INSTANCE_TREE (instance
),
4302 stmt_to_instance
, instance_leader
,
4306 /* Then collect entries to each independent subgraph. */
4307 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
4309 slp_instance leader
= get_ultimate_leader (instance
, instance_leader
);
4310 leader
->subgraph_entries
.safe_push (instance
);
4311 if (dump_enabled_p ()
4312 && leader
!= instance
)
4313 dump_printf_loc (MSG_NOTE
, vect_location
,
4314 "instance %p is leader of %p\n",
4319 /* Compute the scalar cost of the SLP node NODE and its children
4320 and return it. Do not account defs that are marked in LIFE and
4321 update LIFE according to uses of NODE. */
4324 vect_bb_slp_scalar_cost (vec_info
*vinfo
,
4325 slp_tree node
, vec
<bool, va_heap
> *life
,
4326 stmt_vector_for_cost
*cost_vec
,
4327 hash_set
<slp_tree
> &visited
)
4330 stmt_vec_info stmt_info
;
4333 if (visited
.add (node
))
4336 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
4338 ssa_op_iter op_iter
;
4339 def_operand_p def_p
;
4344 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
4345 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
4347 /* If there is a non-vectorized use of the defs then the scalar
4348 stmt is kept live in which case we do not account it or any
4349 required defs in the SLP children in the scalar cost. This
4350 way we make the vectorization more costly when compared to
4352 if (!STMT_VINFO_LIVE_P (stmt_info
))
4354 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
4356 imm_use_iterator use_iter
;
4358 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
4359 if (!is_gimple_debug (use_stmt
))
4361 stmt_vec_info use_stmt_info
= vinfo
->lookup_stmt (use_stmt
);
4364 (vect_stmt_to_vectorize (use_stmt_info
)))
4375 /* Count scalar stmts only once. */
4376 if (gimple_visited_p (orig_stmt
))
4378 gimple_set_visited (orig_stmt
, true);
4380 vect_cost_for_stmt kind
;
4381 if (STMT_VINFO_DATA_REF (orig_stmt_info
))
4383 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info
)))
4386 kind
= scalar_store
;
4388 else if (vect_nop_conversion_p (orig_stmt_info
))
4390 /* For single-argument PHIs assume coalescing which means zero cost
4391 for the scalar and the vector PHIs. This avoids artificially
4392 favoring the vector path (but may pessimize it in some cases). */
4393 else if (is_a
<gphi
*> (orig_stmt_info
->stmt
)
4394 && gimple_phi_num_args
4395 (as_a
<gphi
*> (orig_stmt_info
->stmt
)) == 1)
4399 record_stmt_cost (cost_vec
, 1, kind
, orig_stmt_info
,
4400 SLP_TREE_VECTYPE (node
), 0, vect_body
);
4403 auto_vec
<bool, 20> subtree_life
;
4404 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4406 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
4408 /* Do not directly pass LIFE to the recursive call, copy it to
4409 confine changes in the callee to the current child/subtree. */
4410 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
4412 subtree_life
.safe_grow_cleared (SLP_TREE_LANES (child
), true);
4413 for (unsigned j
= 0;
4414 j
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++j
)
4416 auto perm
= SLP_TREE_LANE_PERMUTATION (node
)[j
];
4417 if (perm
.first
== i
)
4418 subtree_life
[perm
.second
] = (*life
)[j
];
4423 gcc_assert (SLP_TREE_LANES (node
) == SLP_TREE_LANES (child
));
4424 subtree_life
.safe_splice (*life
);
4426 vect_bb_slp_scalar_cost (vinfo
, child
, &subtree_life
, cost_vec
,
4428 subtree_life
.truncate (0);
4433 /* Comparator for the loop-index sorted cost vectors. */
4436 li_cost_vec_cmp (const void *a_
, const void *b_
)
4438 auto *a
= (const std::pair
<unsigned, stmt_info_for_cost
*> *)a_
;
4439 auto *b
= (const std::pair
<unsigned, stmt_info_for_cost
*> *)b_
;
4440 if (a
->first
< b
->first
)
4442 else if (a
->first
== b
->first
)
4447 /* Check if vectorization of the basic block is profitable for the
4448 subgraph denoted by SLP_INSTANCES. */
4451 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo
,
4452 vec
<slp_instance
> slp_instances
)
4454 slp_instance instance
;
4456 unsigned int vec_inside_cost
= 0, vec_outside_cost
= 0, scalar_cost
= 0;
4457 unsigned int vec_prologue_cost
= 0, vec_epilogue_cost
= 0;
4459 if (dump_enabled_p ())
4461 dump_printf_loc (MSG_NOTE
, vect_location
, "Costing subgraph: \n");
4462 hash_set
<slp_tree
> visited
;
4463 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
4464 vect_print_slp_graph (MSG_NOTE
, vect_location
,
4465 SLP_INSTANCE_TREE (instance
), visited
);
4468 /* Calculate scalar cost and sum the cost for the vector stmts
4469 previously collected. */
4470 stmt_vector_for_cost scalar_costs
= vNULL
;
4471 stmt_vector_for_cost vector_costs
= vNULL
;
4472 hash_set
<slp_tree
> visited
;
4473 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
4475 auto_vec
<bool, 20> life
;
4476 life
.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance
)),
4478 if (!SLP_INSTANCE_ROOT_STMTS (instance
).is_empty ())
4479 record_stmt_cost (&scalar_costs
,
4480 SLP_INSTANCE_ROOT_STMTS (instance
).length (),
4482 SLP_INSTANCE_ROOT_STMTS (instance
)[0], 0, vect_body
);
4483 vect_bb_slp_scalar_cost (bb_vinfo
,
4484 SLP_INSTANCE_TREE (instance
),
4485 &life
, &scalar_costs
, visited
);
4486 vector_costs
.safe_splice (instance
->cost_vec
);
4487 instance
->cost_vec
.release ();
4489 /* Unset visited flag. */
4490 stmt_info_for_cost
*cost
;
4491 FOR_EACH_VEC_ELT (scalar_costs
, i
, cost
)
4492 gimple_set_visited (cost
->stmt_info
->stmt
, false);
4494 if (dump_enabled_p ())
4495 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
4497 /* When costing non-loop vectorization we need to consider each covered
4498 loop independently and make sure vectorization is profitable. For
4499 now we assume a loop may be not entered or executed an arbitrary
4500 number of iterations (??? static information can provide more
4501 precise info here) which means we can simply cost each containing
4502 loops stmts separately. */
4504 /* First produce cost vectors sorted by loop index. */
4505 auto_vec
<std::pair
<unsigned, stmt_info_for_cost
*> >
4506 li_scalar_costs (scalar_costs
.length ());
4507 auto_vec
<std::pair
<unsigned, stmt_info_for_cost
*> >
4508 li_vector_costs (vector_costs
.length ());
4509 FOR_EACH_VEC_ELT (scalar_costs
, i
, cost
)
4511 unsigned l
= gimple_bb (cost
->stmt_info
->stmt
)->loop_father
->num
;
4512 li_scalar_costs
.quick_push (std::make_pair (l
, cost
));
4514 /* Use a random used loop as fallback in case the first vector_costs
4515 entry does not have a stmt_info associated with it. */
4516 unsigned l
= li_scalar_costs
[0].first
;
4517 FOR_EACH_VEC_ELT (vector_costs
, i
, cost
)
4519 /* We inherit from the previous COST, invariants, externals and
4520 extracts immediately follow the cost for the related stmt. */
4521 if (cost
->stmt_info
)
4522 l
= gimple_bb (cost
->stmt_info
->stmt
)->loop_father
->num
;
4523 li_vector_costs
.quick_push (std::make_pair (l
, cost
));
4525 li_scalar_costs
.qsort (li_cost_vec_cmp
);
4526 li_vector_costs
.qsort (li_cost_vec_cmp
);
4528 /* Now cost the portions individually. */
4531 while (si
< li_scalar_costs
.length ()
4532 && vi
< li_vector_costs
.length ())
4534 unsigned sl
= li_scalar_costs
[si
].first
;
4535 unsigned vl
= li_vector_costs
[vi
].first
;
4538 if (dump_enabled_p ())
4539 dump_printf_loc (MSG_NOTE
, vect_location
,
4540 "Scalar %d and vector %d loop part do not "
4541 "match up, skipping scalar part\n", sl
, vl
);
4542 /* Skip the scalar part, assuming zero cost on the vector side. */
4547 while (si
< li_scalar_costs
.length ()
4548 && li_scalar_costs
[si
].first
== sl
);
4552 void *scalar_target_cost_data
= init_cost (NULL
, true);
4555 add_stmt_cost (bb_vinfo
, scalar_target_cost_data
,
4556 li_scalar_costs
[si
].second
);
4559 while (si
< li_scalar_costs
.length ()
4560 && li_scalar_costs
[si
].first
== sl
);
4562 finish_cost (scalar_target_cost_data
, &dummy
, &scalar_cost
, &dummy
);
4563 destroy_cost_data (scalar_target_cost_data
);
4565 /* Complete the target-specific vector cost calculation. */
4566 void *vect_target_cost_data
= init_cost (NULL
, false);
4569 add_stmt_cost (bb_vinfo
, vect_target_cost_data
,
4570 li_vector_costs
[vi
].second
);
4573 while (vi
< li_vector_costs
.length ()
4574 && li_vector_costs
[vi
].first
== vl
);
4575 finish_cost (vect_target_cost_data
, &vec_prologue_cost
,
4576 &vec_inside_cost
, &vec_epilogue_cost
);
4577 destroy_cost_data (vect_target_cost_data
);
4579 vec_outside_cost
= vec_prologue_cost
+ vec_epilogue_cost
;
4581 if (dump_enabled_p ())
4583 dump_printf_loc (MSG_NOTE
, vect_location
,
4584 "Cost model analysis for part in loop %d:\n", sl
);
4585 dump_printf (MSG_NOTE
, " Vector cost: %d\n",
4586 vec_inside_cost
+ vec_outside_cost
);
4587 dump_printf (MSG_NOTE
, " Scalar cost: %d\n", scalar_cost
);
4590 /* Vectorization is profitable if its cost is more than the cost of scalar
4591 version. Note that we err on the vector side for equal cost because
4592 the cost estimate is otherwise quite pessimistic (constant uses are
4593 free on the scalar side but cost a load on the vector side for
4595 if (vec_outside_cost
+ vec_inside_cost
> scalar_cost
)
4597 scalar_costs
.release ();
4598 vector_costs
.release ();
4602 if (vi
< li_vector_costs
.length ())
4604 if (dump_enabled_p ())
4605 dump_printf_loc (MSG_NOTE
, vect_location
,
4606 "Excess vector cost for part in loop %d:\n",
4607 li_vector_costs
[vi
].first
);
4608 scalar_costs
.release ();
4609 vector_costs
.release ();
4613 scalar_costs
.release ();
4614 vector_costs
.release ();
4618 /* qsort comparator for lane defs. */
4621 vld_cmp (const void *a_
, const void *b_
)
4623 auto *a
= (const std::pair
<unsigned, tree
> *)a_
;
4624 auto *b
= (const std::pair
<unsigned, tree
> *)b_
;
4625 return a
->first
- b
->first
;
4628 /* Return true if USE_STMT is a vector lane insert into VEC and set
4629 *THIS_LANE to the lane number that is set. */
4632 vect_slp_is_lane_insert (gimple
*use_stmt
, tree vec
, unsigned *this_lane
)
4634 gassign
*use_ass
= dyn_cast
<gassign
*> (use_stmt
);
4636 || gimple_assign_rhs_code (use_ass
) != BIT_INSERT_EXPR
4638 ? gimple_assign_rhs1 (use_ass
) != vec
4639 : ((vec
= gimple_assign_rhs1 (use_ass
)), false))
4640 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec
)),
4641 TREE_TYPE (gimple_assign_rhs2 (use_ass
)))
4642 || !constant_multiple_p
4643 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass
)),
4644 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec
)))),
4650 /* Find any vectorizable constructors and add them to the grouped_store
4654 vect_slp_check_for_constructors (bb_vec_info bb_vinfo
)
4656 for (unsigned i
= 0; i
< bb_vinfo
->bbs
.length (); ++i
)
4657 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb_vinfo
->bbs
[i
]);
4658 !gsi_end_p (gsi
); gsi_next (&gsi
))
4660 gassign
*assign
= dyn_cast
<gassign
*> (gsi_stmt (gsi
));
4664 tree rhs
= gimple_assign_rhs1 (assign
);
4665 if (gimple_assign_rhs_code (assign
) == CONSTRUCTOR
)
4667 if (!VECTOR_TYPE_P (TREE_TYPE (rhs
))
4668 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)),
4669 CONSTRUCTOR_NELTS (rhs
))
4670 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs
, 0)->value
))
4671 || uniform_vector_p (rhs
))
4676 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), j
, val
)
4677 if (TREE_CODE (val
) != SSA_NAME
4678 || !bb_vinfo
->lookup_def (val
))
4680 if (j
!= CONSTRUCTOR_NELTS (rhs
))
4683 stmt_vec_info stmt_info
= bb_vinfo
->lookup_stmt (assign
);
4684 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
4686 else if (gimple_assign_rhs_code (assign
) == BIT_INSERT_EXPR
4687 && VECTOR_TYPE_P (TREE_TYPE (rhs
))
4688 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)).is_constant ()
4689 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)).to_constant () > 1
4690 && integer_zerop (gimple_assign_rhs3 (assign
))
4691 && useless_type_conversion_p
4692 (TREE_TYPE (TREE_TYPE (rhs
)),
4693 TREE_TYPE (gimple_assign_rhs2 (assign
)))
4694 && bb_vinfo
->lookup_def (gimple_assign_rhs2 (assign
)))
4696 /* We start to match on insert to lane zero but since the
4697 inserts need not be ordered we'd have to search both
4698 the def and the use chains. */
4699 tree vectype
= TREE_TYPE (rhs
);
4700 unsigned nlanes
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
4701 auto_vec
<std::pair
<unsigned, tree
> > lane_defs (nlanes
);
4702 auto_sbitmap
lanes (nlanes
);
4703 bitmap_clear (lanes
);
4704 bitmap_set_bit (lanes
, 0);
4705 tree def
= gimple_assign_lhs (assign
);
4706 lane_defs
.quick_push
4707 (std::make_pair (0, gimple_assign_rhs2 (assign
)));
4708 unsigned lanes_found
= 1;
4709 /* Start with the use chains, the last stmt will be the root. */
4710 stmt_vec_info last
= bb_vinfo
->lookup_stmt (assign
);
4711 vec
<stmt_vec_info
> roots
= vNULL
;
4712 roots
.safe_push (last
);
4715 use_operand_p use_p
;
4717 if (!single_imm_use (def
, &use_p
, &use_stmt
))
4720 if (!bb_vinfo
->lookup_stmt (use_stmt
)
4721 || !vect_slp_is_lane_insert (use_stmt
, def
, &this_lane
)
4722 || !bb_vinfo
->lookup_def (gimple_assign_rhs2 (use_stmt
)))
4724 if (bitmap_bit_p (lanes
, this_lane
))
4727 bitmap_set_bit (lanes
, this_lane
);
4728 gassign
*use_ass
= as_a
<gassign
*> (use_stmt
);
4729 lane_defs
.quick_push (std::make_pair
4730 (this_lane
, gimple_assign_rhs2 (use_ass
)));
4731 last
= bb_vinfo
->lookup_stmt (use_ass
);
4732 roots
.safe_push (last
);
4733 def
= gimple_assign_lhs (use_ass
);
4735 while (lanes_found
< nlanes
);
4736 if (roots
.length () > 1)
4737 std::swap(roots
[0], roots
[roots
.length () - 1]);
4738 if (lanes_found
< nlanes
)
4740 /* Now search the def chain. */
4741 def
= gimple_assign_rhs1 (assign
);
4744 if (TREE_CODE (def
) != SSA_NAME
4745 || !has_single_use (def
))
4747 gimple
*def_stmt
= SSA_NAME_DEF_STMT (def
);
4749 if (!bb_vinfo
->lookup_stmt (def_stmt
)
4750 || !vect_slp_is_lane_insert (def_stmt
,
4751 NULL_TREE
, &this_lane
)
4752 || !bb_vinfo
->lookup_def (gimple_assign_rhs2 (def_stmt
)))
4754 if (bitmap_bit_p (lanes
, this_lane
))
4757 bitmap_set_bit (lanes
, this_lane
);
4758 lane_defs
.quick_push (std::make_pair
4760 gimple_assign_rhs2 (def_stmt
)));
4761 roots
.safe_push (bb_vinfo
->lookup_stmt (def_stmt
));
4762 def
= gimple_assign_rhs1 (def_stmt
);
4764 while (lanes_found
< nlanes
);
4766 if (lanes_found
== nlanes
)
4768 /* Sort lane_defs after the lane index and register the root. */
4769 lane_defs
.qsort (vld_cmp
);
4770 vec
<stmt_vec_info
> stmts
;
4771 stmts
.create (nlanes
);
4772 for (unsigned i
= 0; i
< nlanes
; ++i
)
4773 stmts
.quick_push (bb_vinfo
->lookup_def (lane_defs
[i
].second
));
4774 bb_vinfo
->roots
.safe_push (slp_root (slp_inst_kind_ctor
,
4783 /* Walk the grouped store chains and replace entries with their
4784 pattern variant if any. */
4787 vect_fixup_store_groups_with_patterns (vec_info
*vinfo
)
4789 stmt_vec_info first_element
;
4792 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
4794 /* We also have CTORs in this array. */
4795 if (!STMT_VINFO_GROUPED_ACCESS (first_element
))
4797 if (STMT_VINFO_IN_PATTERN_P (first_element
))
4799 stmt_vec_info orig
= first_element
;
4800 first_element
= STMT_VINFO_RELATED_STMT (first_element
);
4801 DR_GROUP_FIRST_ELEMENT (first_element
) = first_element
;
4802 DR_GROUP_SIZE (first_element
) = DR_GROUP_SIZE (orig
);
4803 DR_GROUP_GAP (first_element
) = DR_GROUP_GAP (orig
);
4804 DR_GROUP_NEXT_ELEMENT (first_element
) = DR_GROUP_NEXT_ELEMENT (orig
);
4805 vinfo
->grouped_stores
[i
] = first_element
;
4807 stmt_vec_info prev
= first_element
;
4808 while (DR_GROUP_NEXT_ELEMENT (prev
))
4810 stmt_vec_info elt
= DR_GROUP_NEXT_ELEMENT (prev
);
4811 if (STMT_VINFO_IN_PATTERN_P (elt
))
4813 stmt_vec_info orig
= elt
;
4814 elt
= STMT_VINFO_RELATED_STMT (elt
);
4815 DR_GROUP_NEXT_ELEMENT (prev
) = elt
;
4816 DR_GROUP_GAP (elt
) = DR_GROUP_GAP (orig
);
4817 DR_GROUP_NEXT_ELEMENT (elt
) = DR_GROUP_NEXT_ELEMENT (orig
);
4819 DR_GROUP_FIRST_ELEMENT (elt
) = first_element
;
4825 /* Check if the region described by BB_VINFO can be vectorized, returning
4826 true if so. When returning false, set FATAL to true if the same failure
4827 would prevent vectorization at other vector sizes, false if it is still
4828 worth trying other sizes. N_STMTS is the number of statements in the
4832 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo
, int n_stmts
, bool &fatal
,
4833 vec
<int> *dataref_groups
)
4835 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4837 slp_instance instance
;
4839 poly_uint64 min_vf
= 2;
4841 /* The first group of checks is independent of the vector size. */
4844 /* Analyze the data references. */
4846 if (!vect_analyze_data_refs (bb_vinfo
, &min_vf
, NULL
))
4848 if (dump_enabled_p ())
4849 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4850 "not vectorized: unhandled data-ref in basic "
4855 if (!vect_analyze_data_ref_accesses (bb_vinfo
, dataref_groups
))
4857 if (dump_enabled_p ())
4858 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4859 "not vectorized: unhandled data access in "
4864 vect_slp_check_for_constructors (bb_vinfo
);
4866 /* If there are no grouped stores and no constructors in the region
4867 there is no need to continue with pattern recog as vect_analyze_slp
4868 will fail anyway. */
4869 if (bb_vinfo
->grouped_stores
.is_empty ()
4870 && bb_vinfo
->roots
.is_empty ())
4872 if (dump_enabled_p ())
4873 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4874 "not vectorized: no grouped stores in "
4879 /* While the rest of the analysis below depends on it in some way. */
4882 vect_pattern_recog (bb_vinfo
);
4884 /* Update store groups from pattern processing. */
4885 vect_fixup_store_groups_with_patterns (bb_vinfo
);
4887 /* Check the SLP opportunities in the basic block, analyze and build SLP
4889 if (!vect_analyze_slp (bb_vinfo
, n_stmts
))
4891 if (dump_enabled_p ())
4893 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4894 "Failed to SLP the basic block.\n");
4895 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4896 "not vectorized: failed to find SLP opportunities "
4897 "in basic block.\n");
4902 /* Optimize permutations. */
4903 vect_optimize_slp (bb_vinfo
);
4905 /* Gather the loads reachable from the SLP graph entries. */
4906 vect_gather_slp_loads (bb_vinfo
);
4908 vect_record_base_alignments (bb_vinfo
);
4910 /* Analyze and verify the alignment of data references and the
4911 dependence in the SLP instances. */
4912 for (i
= 0; BB_VINFO_SLP_INSTANCES (bb_vinfo
).iterate (i
, &instance
); )
4914 vect_location
= instance
->location ();
4915 if (! vect_slp_analyze_instance_alignment (bb_vinfo
, instance
)
4916 || ! vect_slp_analyze_instance_dependence (bb_vinfo
, instance
))
4918 slp_tree node
= SLP_INSTANCE_TREE (instance
);
4919 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
4920 if (dump_enabled_p ())
4921 dump_printf_loc (MSG_NOTE
, vect_location
,
4922 "removing SLP instance operations starting from: %G",
4924 vect_free_slp_instance (instance
);
4925 BB_VINFO_SLP_INSTANCES (bb_vinfo
).ordered_remove (i
);
4929 /* Mark all the statements that we want to vectorize as pure SLP and
4931 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
4932 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance
));
4935 /* Likewise consider instance root stmts as vectorized. */
4936 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance
), j
, root
)
4937 STMT_SLP_TYPE (root
) = pure_slp
;
4941 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo
).length ())
4944 if (!vect_slp_analyze_operations (bb_vinfo
))
4946 if (dump_enabled_p ())
4947 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4948 "not vectorized: bad operation in basic block.\n");
4952 vect_bb_partition_graph (bb_vinfo
);
4957 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4958 basic blocks in BBS, returning true on success.
4959 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4962 vect_slp_region (vec
<basic_block
> bbs
, vec
<data_reference_p
> datarefs
,
4963 vec
<int> *dataref_groups
, unsigned int n_stmts
)
4965 bb_vec_info bb_vinfo
;
4966 auto_vector_modes vector_modes
;
4968 /* Autodetect first vector size we try. */
4969 machine_mode next_vector_mode
= VOIDmode
;
4970 targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
, false);
4971 unsigned int mode_i
= 0;
4973 vec_info_shared shared
;
4975 machine_mode autodetected_vector_mode
= VOIDmode
;
4978 bool vectorized
= false;
4980 bb_vinfo
= new _bb_vec_info (bbs
, &shared
);
4982 bool first_time_p
= shared
.datarefs
.is_empty ();
4983 BB_VINFO_DATAREFS (bb_vinfo
) = datarefs
;
4985 bb_vinfo
->shared
->save_datarefs ();
4987 bb_vinfo
->shared
->check_datarefs ();
4988 bb_vinfo
->vector_mode
= next_vector_mode
;
4990 if (vect_slp_analyze_bb_1 (bb_vinfo
, n_stmts
, fatal
, dataref_groups
))
4992 if (dump_enabled_p ())
4994 dump_printf_loc (MSG_NOTE
, vect_location
,
4995 "***** Analysis succeeded with vector mode"
4996 " %s\n", GET_MODE_NAME (bb_vinfo
->vector_mode
));
4997 dump_printf_loc (MSG_NOTE
, vect_location
, "SLPing BB part\n");
5000 bb_vinfo
->shared
->check_datarefs ();
5003 slp_instance instance
;
5004 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo
), i
, instance
)
5006 if (instance
->subgraph_entries
.is_empty ())
5009 vect_location
= instance
->location ();
5010 if (!unlimited_cost_model (NULL
)
5011 && !vect_bb_vectorization_profitable_p
5012 (bb_vinfo
, instance
->subgraph_entries
))
5014 if (dump_enabled_p ())
5015 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5016 "not vectorized: vectorization is not "
5021 if (!dbg_cnt (vect_slp
))
5024 if (!vectorized
&& dump_enabled_p ())
5025 dump_printf_loc (MSG_NOTE
, vect_location
,
5026 "Basic block will be vectorized "
5030 vect_schedule_slp (bb_vinfo
, instance
->subgraph_entries
);
5032 unsigned HOST_WIDE_INT bytes
;
5033 if (dump_enabled_p ())
5036 (bb_vinfo
->vector_mode
).is_constant (&bytes
))
5037 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
5038 "basic block part vectorized using %wu "
5039 "byte vectors\n", bytes
);
5041 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
5042 "basic block part vectorized using "
5043 "variable length vectors\n");
5049 if (dump_enabled_p ())
5050 dump_printf_loc (MSG_NOTE
, vect_location
,
5051 "***** Analysis failed with vector mode %s\n",
5052 GET_MODE_NAME (bb_vinfo
->vector_mode
));
5056 autodetected_vector_mode
= bb_vinfo
->vector_mode
;
5059 while (mode_i
< vector_modes
.length ()
5060 && vect_chooses_same_modes_p (bb_vinfo
, vector_modes
[mode_i
]))
5062 if (dump_enabled_p ())
5063 dump_printf_loc (MSG_NOTE
, vect_location
,
5064 "***** The result for vector mode %s would"
5066 GET_MODE_NAME (vector_modes
[mode_i
]));
5072 if (mode_i
< vector_modes
.length ()
5073 && VECTOR_MODE_P (autodetected_vector_mode
)
5074 && (related_vector_mode (vector_modes
[mode_i
],
5075 GET_MODE_INNER (autodetected_vector_mode
))
5076 == autodetected_vector_mode
)
5077 && (related_vector_mode (autodetected_vector_mode
,
5078 GET_MODE_INNER (vector_modes
[mode_i
]))
5079 == vector_modes
[mode_i
]))
5081 if (dump_enabled_p ())
5082 dump_printf_loc (MSG_NOTE
, vect_location
,
5083 "***** Skipping vector mode %s, which would"
5084 " repeat the analysis for %s\n",
5085 GET_MODE_NAME (vector_modes
[mode_i
]),
5086 GET_MODE_NAME (autodetected_vector_mode
));
5091 || mode_i
== vector_modes
.length ()
5092 || autodetected_vector_mode
== VOIDmode
5093 /* If vect_slp_analyze_bb_1 signaled that analysis for all
5094 vector sizes will fail do not bother iterating. */
5098 /* Try the next biggest vector size. */
5099 next_vector_mode
= vector_modes
[mode_i
++];
5100 if (dump_enabled_p ())
5101 dump_printf_loc (MSG_NOTE
, vect_location
,
5102 "***** Re-trying analysis with vector mode %s\n",
5103 GET_MODE_NAME (next_vector_mode
));
5108 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
5109 true if anything in the basic-block was vectorized. */
5112 vect_slp_bbs (vec
<basic_block
> bbs
)
5114 vec
<data_reference_p
> datarefs
= vNULL
;
5115 auto_vec
<int> dataref_groups
;
5117 int current_group
= 0;
5119 for (unsigned i
= 0; i
< bbs
.length (); i
++)
5121 basic_block bb
= bbs
[i
];
5122 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
5125 gimple
*stmt
= gsi_stmt (gsi
);
5126 if (is_gimple_debug (stmt
))
5131 if (gimple_location (stmt
) != UNKNOWN_LOCATION
)
5132 vect_location
= stmt
;
5134 if (!vect_find_stmt_data_reference (NULL
, stmt
, &datarefs
,
5135 &dataref_groups
, current_group
))
5140 return vect_slp_region (bbs
, datarefs
, &dataref_groups
, insns
);
5143 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5144 true if anything in the basic-block was vectorized. */
5147 vect_slp_bb (basic_block bb
)
5149 auto_vec
<basic_block
> bbs
;
5151 return vect_slp_bbs (bbs
);
5154 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5155 true if anything in the basic-block was vectorized. */
5158 vect_slp_function (function
*fun
)
5161 int *rpo
= XNEWVEC (int, n_basic_blocks_for_fn (fun
));
5162 unsigned n
= pre_and_rev_post_order_compute_fn (fun
, NULL
, rpo
, false);
5164 /* For the moment split the function into pieces to avoid making
5165 the iteration on the vector mode moot. Split at points we know
5166 to not handle well which is CFG merges (SLP discovery doesn't
5167 handle non-loop-header PHIs) and loop exits. Since pattern
5168 recog requires reverse iteration to visit uses before defs
5169 simply chop RPO into pieces. */
5170 auto_vec
<basic_block
> bbs
;
5171 for (unsigned i
= 0; i
< n
; i
++)
5173 basic_block bb
= BASIC_BLOCK_FOR_FN (fun
, rpo
[i
]);
5176 /* Split when a BB is not dominated by the first block. */
5177 if (!bbs
.is_empty ()
5178 && !dominated_by_p (CDI_DOMINATORS
, bb
, bbs
[0]))
5180 if (dump_enabled_p ())
5181 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5182 "splitting region at dominance boundary bb%d\n",
5186 /* Split when the loop determined by the first block
5187 is exited. This is because we eventually insert
5188 invariants at region begin. */
5189 else if (!bbs
.is_empty ()
5190 && bbs
[0]->loop_father
!= bb
->loop_father
5191 && !flow_loop_nested_p (bbs
[0]->loop_father
, bb
->loop_father
))
5193 if (dump_enabled_p ())
5194 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5195 "splitting region at loop %d exit at bb%d\n",
5196 bbs
[0]->loop_father
->num
, bb
->index
);
5200 if (split
&& !bbs
.is_empty ())
5202 r
|= vect_slp_bbs (bbs
);
5204 bbs
.quick_push (bb
);
5209 /* When we have a stmt ending this block and defining a
5210 value we have to insert on edges when inserting after it for
5211 a vector containing its definition. Avoid this for now. */
5212 if (gimple
*last
= last_stmt (bb
))
5213 if (gimple_get_lhs (last
)
5214 && is_ctrl_altering_stmt (last
))
5216 if (dump_enabled_p ())
5217 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5218 "splitting region at control altering "
5219 "definition %G", last
);
5220 r
|= vect_slp_bbs (bbs
);
5225 if (!bbs
.is_empty ())
5226 r
|= vect_slp_bbs (bbs
);
5233 /* Build a variable-length vector in which the elements in ELTS are repeated
5234 to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
5235 RESULTS and add any new instructions to SEQ.
5237 The approach we use is:
5239 (1) Find a vector mode VM with integer elements of mode IM.
5241 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5242 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
5243 from small vectors to IM.
5245 (3) Duplicate each ELTS'[I] into a vector of mode VM.
5247 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
5248 correct byte contents.
5250 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
5252 We try to find the largest IM for which this sequence works, in order
5253 to cut down on the number of interleaves. */
5256 duplicate_and_interleave (vec_info
*vinfo
, gimple_seq
*seq
, tree vector_type
,
5257 vec
<tree
> elts
, unsigned int nresults
,
5260 unsigned int nelts
= elts
.length ();
5261 tree element_type
= TREE_TYPE (vector_type
);
5263 /* (1) Find a vector mode VM with integer elements of mode IM. */
5264 unsigned int nvectors
= 1;
5265 tree new_vector_type
;
5267 if (!can_duplicate_and_interleave_p (vinfo
, nelts
, element_type
,
5268 &nvectors
, &new_vector_type
,
5272 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
5273 unsigned int partial_nelts
= nelts
/ nvectors
;
5274 tree partial_vector_type
= build_vector_type (element_type
, partial_nelts
);
5276 tree_vector_builder partial_elts
;
5277 auto_vec
<tree
, 32> pieces (nvectors
* 2);
5278 pieces
.quick_grow_cleared (nvectors
* 2);
5279 for (unsigned int i
= 0; i
< nvectors
; ++i
)
5281 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5282 ELTS' has mode IM. */
5283 partial_elts
.new_vector (partial_vector_type
, partial_nelts
, 1);
5284 for (unsigned int j
= 0; j
< partial_nelts
; ++j
)
5285 partial_elts
.quick_push (elts
[i
* partial_nelts
+ j
]);
5286 tree t
= gimple_build_vector (seq
, &partial_elts
);
5287 t
= gimple_build (seq
, VIEW_CONVERT_EXPR
,
5288 TREE_TYPE (new_vector_type
), t
);
5290 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
5291 pieces
[i
] = gimple_build_vector_from_val (seq
, new_vector_type
, t
);
5294 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
5295 correct byte contents.
5297 Conceptually, we need to repeat the following operation log2(nvectors)
5298 times, where hi_start = nvectors / 2:
5300 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
5301 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
5303 However, if each input repeats every N elements and the VF is
5304 a multiple of N * 2, the HI result is the same as the LO result.
5305 This will be true for the first N1 iterations of the outer loop,
5306 followed by N2 iterations for which both the LO and HI results
5309 N1 + N2 = log2(nvectors)
5311 Each "N1 iteration" doubles the number of redundant vectors and the
5312 effect of the process as a whole is to have a sequence of nvectors/2**N1
5313 vectors that repeats 2**N1 times. Rather than generate these redundant
5314 vectors, we halve the number of vectors for each N1 iteration. */
5315 unsigned int in_start
= 0;
5316 unsigned int out_start
= nvectors
;
5317 unsigned int new_nvectors
= nvectors
;
5318 for (unsigned int in_repeat
= 1; in_repeat
< nvectors
; in_repeat
*= 2)
5320 unsigned int hi_start
= new_nvectors
/ 2;
5321 unsigned int out_i
= 0;
5322 for (unsigned int in_i
= 0; in_i
< new_nvectors
; ++in_i
)
5325 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type
),
5329 tree output
= make_ssa_name (new_vector_type
);
5330 tree input1
= pieces
[in_start
+ (in_i
/ 2)];
5331 tree input2
= pieces
[in_start
+ (in_i
/ 2) + hi_start
];
5332 gassign
*stmt
= gimple_build_assign (output
, VEC_PERM_EXPR
,
5334 permutes
[in_i
& 1]);
5335 gimple_seq_add_stmt (seq
, stmt
);
5336 pieces
[out_start
+ out_i
] = output
;
5339 std::swap (in_start
, out_start
);
5340 new_nvectors
= out_i
;
5343 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
5344 results
.reserve (nresults
);
5345 for (unsigned int i
= 0; i
< nresults
; ++i
)
5346 if (i
< new_nvectors
)
5347 results
.quick_push (gimple_build (seq
, VIEW_CONVERT_EXPR
, vector_type
,
5348 pieces
[in_start
+ i
]));
5350 results
.quick_push (results
[i
- new_nvectors
]);
5354 /* For constant and loop invariant defs in OP_NODE this function creates
5355 vector defs that will be used in the vectorized stmts and stores them
5356 to SLP_TREE_VEC_DEFS of OP_NODE. */
5359 vect_create_constant_vectors (vec_info
*vinfo
, slp_tree op_node
)
5361 unsigned HOST_WIDE_INT nunits
;
5363 unsigned j
, number_of_places_left_in_vector
;
5366 int group_size
= op_node
->ops
.length ();
5367 unsigned int vec_num
, i
;
5368 unsigned number_of_copies
= 1;
5370 gimple_seq ctor_seq
= NULL
;
5371 auto_vec
<tree
, 16> permute_results
;
5373 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
5374 vector_type
= SLP_TREE_VECTYPE (op_node
);
5376 unsigned int number_of_vectors
= SLP_TREE_NUMBER_OF_VEC_STMTS (op_node
);
5377 SLP_TREE_VEC_DEFS (op_node
).create (number_of_vectors
);
5378 auto_vec
<tree
> voprnds (number_of_vectors
);
5380 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5381 created vectors. It is greater than 1 if unrolling is performed.
5383 For example, we have two scalar operands, s1 and s2 (e.g., group of
5384 strided accesses of size two), while NUNITS is four (i.e., four scalars
5385 of this type can be packed in a vector). The output vector will contain
5386 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5389 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
5390 containing the operands.
5392 For example, NUNITS is four as before, and the group size is 8
5393 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5394 {s5, s6, s7, s8}. */
5396 /* When using duplicate_and_interleave, we just need one element for
5397 each scalar statement. */
5398 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
5399 nunits
= group_size
;
5401 number_of_copies
= nunits
* number_of_vectors
/ group_size
;
5403 number_of_places_left_in_vector
= nunits
;
5405 tree_vector_builder
elts (vector_type
, nunits
, 1);
5406 elts
.quick_grow (nunits
);
5407 stmt_vec_info insert_after
= NULL
;
5408 for (j
= 0; j
< number_of_copies
; j
++)
5411 for (i
= group_size
- 1; op_node
->ops
.iterate (i
, &op
); i
--)
5413 /* Create 'vect_ = {op0,op1,...,opn}'. */
5414 number_of_places_left_in_vector
--;
5416 if (!types_compatible_p (TREE_TYPE (vector_type
), TREE_TYPE (op
)))
5418 if (CONSTANT_CLASS_P (op
))
5420 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
5422 /* Can't use VIEW_CONVERT_EXPR for booleans because
5423 of possibly different sizes of scalar value and
5425 if (integer_zerop (op
))
5426 op
= build_int_cst (TREE_TYPE (vector_type
), 0);
5427 else if (integer_onep (op
))
5428 op
= build_all_ones_cst (TREE_TYPE (vector_type
));
5433 op
= fold_unary (VIEW_CONVERT_EXPR
,
5434 TREE_TYPE (vector_type
), op
);
5435 gcc_assert (op
&& CONSTANT_CLASS_P (op
));
5439 tree new_temp
= make_ssa_name (TREE_TYPE (vector_type
));
5441 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
5444 = build_all_ones_cst (TREE_TYPE (vector_type
));
5446 = build_zero_cst (TREE_TYPE (vector_type
));
5447 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op
)));
5448 init_stmt
= gimple_build_assign (new_temp
, COND_EXPR
,
5454 op
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (vector_type
),
5457 = gimple_build_assign (new_temp
, VIEW_CONVERT_EXPR
,
5460 gimple_seq_add_stmt (&ctor_seq
, init_stmt
);
5464 elts
[number_of_places_left_in_vector
] = op
;
5465 if (!CONSTANT_CLASS_P (op
))
5467 /* For BB vectorization we have to compute an insert location
5468 when a def is inside the analyzed region since we cannot
5469 simply insert at the BB start in this case. */
5470 stmt_vec_info opdef
;
5471 if (TREE_CODE (orig_op
) == SSA_NAME
5472 && !SSA_NAME_IS_DEFAULT_DEF (orig_op
)
5473 && is_a
<bb_vec_info
> (vinfo
)
5474 && (opdef
= vinfo
->lookup_def (orig_op
)))
5477 insert_after
= opdef
;
5479 insert_after
= get_later_stmt (insert_after
, opdef
);
5482 if (number_of_places_left_in_vector
== 0)
5485 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
5486 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
5487 vec_cst
= gimple_build_vector (&ctor_seq
, &elts
);
5490 if (permute_results
.is_empty ())
5491 duplicate_and_interleave (vinfo
, &ctor_seq
, vector_type
,
5492 elts
, number_of_vectors
,
5494 vec_cst
= permute_results
[number_of_vectors
- j
- 1];
5496 if (!gimple_seq_empty_p (ctor_seq
))
5500 gimple_stmt_iterator gsi
;
5501 if (gimple_code (insert_after
->stmt
) == GIMPLE_PHI
)
5503 gsi
= gsi_after_labels (gimple_bb (insert_after
->stmt
));
5504 gsi_insert_seq_before (&gsi
, ctor_seq
,
5505 GSI_CONTINUE_LINKING
);
5507 else if (!stmt_ends_bb_p (insert_after
->stmt
))
5509 gsi
= gsi_for_stmt (insert_after
->stmt
);
5510 gsi_insert_seq_after (&gsi
, ctor_seq
,
5511 GSI_CONTINUE_LINKING
);
5515 /* When we want to insert after a def where the
5516 defining stmt throws then insert on the fallthru
5518 edge e
= find_fallthru_edge
5519 (gimple_bb (insert_after
->stmt
)->succs
);
5521 = gsi_insert_seq_on_edge_immediate (e
, ctor_seq
);
5522 gcc_assert (!new_bb
);
5526 vinfo
->insert_seq_on_entry (NULL
, ctor_seq
);
5529 voprnds
.quick_push (vec_cst
);
5530 insert_after
= NULL
;
5531 number_of_places_left_in_vector
= nunits
;
5533 elts
.new_vector (vector_type
, nunits
, 1);
5534 elts
.quick_grow (nunits
);
5539 /* Since the vectors are created in the reverse order, we should invert
5541 vec_num
= voprnds
.length ();
5542 for (j
= vec_num
; j
!= 0; j
--)
5544 vop
= voprnds
[j
- 1];
5545 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
5548 /* In case that VF is greater than the unrolling factor needed for the SLP
5549 group of stmts, NUMBER_OF_VECTORS to be created is greater than
5550 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
5551 to replicate the vectors. */
5552 while (number_of_vectors
> SLP_TREE_VEC_DEFS (op_node
).length ())
5553 for (i
= 0; SLP_TREE_VEC_DEFS (op_node
).iterate (i
, &vop
) && i
< vec_num
;
5555 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
5558 /* Get the Ith vectorized definition from SLP_NODE. */
5561 vect_get_slp_vect_def (slp_tree slp_node
, unsigned i
)
5563 if (SLP_TREE_VEC_STMTS (slp_node
).exists ())
5564 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node
)[i
]);
5566 return SLP_TREE_VEC_DEFS (slp_node
)[i
];
5569 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
5572 vect_get_slp_defs (slp_tree slp_node
, vec
<tree
> *vec_defs
)
5574 vec_defs
->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
));
5575 if (SLP_TREE_DEF_TYPE (slp_node
) == vect_internal_def
)
5578 gimple
*vec_def_stmt
;
5579 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node
), j
, vec_def_stmt
)
5580 vec_defs
->quick_push (gimple_get_lhs (vec_def_stmt
));
5583 vec_defs
->splice (SLP_TREE_VEC_DEFS (slp_node
));
5586 /* Get N vectorized definitions for SLP_NODE. */
5589 vect_get_slp_defs (vec_info
*,
5590 slp_tree slp_node
, vec
<vec
<tree
> > *vec_oprnds
, unsigned n
)
5593 n
= SLP_TREE_CHILDREN (slp_node
).length ();
5595 for (unsigned i
= 0; i
< n
; ++i
)
5597 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
5598 vec
<tree
> vec_defs
= vNULL
;
5599 vect_get_slp_defs (child
, &vec_defs
);
5600 vec_oprnds
->quick_push (vec_defs
);
5604 /* Generate vector permute statements from a list of loads in DR_CHAIN.
5605 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
5606 permute statements for the SLP node NODE. Store the number of vector
5607 permute instructions in *N_PERMS and the number of vector load
5608 instructions in *N_LOADS. */
5611 vect_transform_slp_perm_load (vec_info
*vinfo
,
5612 slp_tree node
, vec
<tree
> dr_chain
,
5613 gimple_stmt_iterator
*gsi
, poly_uint64 vf
,
5614 bool analyze_only
, unsigned *n_perms
,
5615 unsigned int *n_loads
)
5617 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
5619 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5620 unsigned int group_size
= SLP_TREE_SCALAR_STMTS (node
).length ();
5621 unsigned int mask_element
;
5624 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
5627 stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
5629 mode
= TYPE_MODE (vectype
);
5630 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
5632 /* Initialize the vect stmts of NODE to properly insert the generated
5635 for (unsigned i
= SLP_TREE_VEC_STMTS (node
).length ();
5636 i
< SLP_TREE_NUMBER_OF_VEC_STMTS (node
); i
++)
5637 SLP_TREE_VEC_STMTS (node
).quick_push (NULL
);
5639 /* Generate permutation masks for every NODE. Number of masks for each NODE
5640 is equal to GROUP_SIZE.
5641 E.g., we have a group of three nodes with three loads from the same
5642 location in each node, and the vector size is 4. I.e., we have a
5643 a0b0c0a1b1c1... sequence and we need to create the following vectors:
5644 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
5645 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
5648 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
5649 The last mask is illegal since we assume two operands for permute
5650 operation, and the mask element values can't be outside that range.
5651 Hence, the last mask must be converted into {2,5,5,5}.
5652 For the first two permutations we need the first and the second input
5653 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
5654 we need the second and the third vectors: {b1,c1,a2,b2} and
5657 int vect_stmts_counter
= 0;
5658 unsigned int index
= 0;
5659 int first_vec_index
= -1;
5660 int second_vec_index
= -1;
5664 vec_perm_builder mask
;
5665 unsigned int nelts_to_build
;
5666 unsigned int nvectors_per_build
;
5667 unsigned int in_nlanes
;
5668 bool repeating_p
= (group_size
== DR_GROUP_SIZE (stmt_info
)
5669 && multiple_p (nunits
, group_size
));
5672 /* A single vector contains a whole number of copies of the node, so:
5673 (a) all permutes can use the same mask; and
5674 (b) the permutes only need a single vector input. */
5675 mask
.new_vector (nunits
, group_size
, 3);
5676 nelts_to_build
= mask
.encoded_nelts ();
5677 nvectors_per_build
= SLP_TREE_VEC_STMTS (node
).length ();
5678 in_nlanes
= DR_GROUP_SIZE (stmt_info
) * 3;
5682 /* We need to construct a separate mask for each vector statement. */
5683 unsigned HOST_WIDE_INT const_nunits
, const_vf
;
5684 if (!nunits
.is_constant (&const_nunits
)
5685 || !vf
.is_constant (&const_vf
))
5687 mask
.new_vector (const_nunits
, const_nunits
, 1);
5688 nelts_to_build
= const_vf
* group_size
;
5689 nvectors_per_build
= 1;
5690 in_nlanes
= const_vf
* DR_GROUP_SIZE (stmt_info
);
5692 auto_sbitmap
used_in_lanes (in_nlanes
);
5693 bitmap_clear (used_in_lanes
);
5695 unsigned int count
= mask
.encoded_nelts ();
5696 mask
.quick_grow (count
);
5697 vec_perm_indices indices
;
5699 for (unsigned int j
= 0; j
< nelts_to_build
; j
++)
5701 unsigned int iter_num
= j
/ group_size
;
5702 unsigned int stmt_num
= j
% group_size
;
5703 unsigned int i
= (iter_num
* DR_GROUP_SIZE (stmt_info
)
5704 + SLP_TREE_LOAD_PERMUTATION (node
)[stmt_num
]);
5705 bitmap_set_bit (used_in_lanes
, i
);
5708 first_vec_index
= 0;
5713 /* Enforced before the loop when !repeating_p. */
5714 unsigned int const_nunits
= nunits
.to_constant ();
5715 vec_index
= i
/ const_nunits
;
5716 mask_element
= i
% const_nunits
;
5717 if (vec_index
== first_vec_index
5718 || first_vec_index
== -1)
5720 first_vec_index
= vec_index
;
5722 else if (vec_index
== second_vec_index
5723 || second_vec_index
== -1)
5725 second_vec_index
= vec_index
;
5726 mask_element
+= const_nunits
;
5730 if (dump_enabled_p ())
5731 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5732 "permutation requires at "
5733 "least three vectors %G",
5735 gcc_assert (analyze_only
);
5739 gcc_assert (mask_element
< 2 * const_nunits
);
5742 if (mask_element
!= index
)
5744 mask
[index
++] = mask_element
;
5746 if (index
== count
&& !noop_p
)
5748 indices
.new_vector (mask
, second_vec_index
== -1 ? 1 : 2, nunits
);
5749 if (!can_vec_perm_const_p (mode
, indices
))
5751 if (dump_enabled_p ())
5753 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
5755 "unsupported vect permute { ");
5756 for (i
= 0; i
< count
; ++i
)
5758 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
5759 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
5761 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
5763 gcc_assert (analyze_only
);
5774 tree mask_vec
= NULL_TREE
;
5777 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
5779 if (second_vec_index
== -1)
5780 second_vec_index
= first_vec_index
;
5782 for (unsigned int ri
= 0; ri
< nvectors_per_build
; ++ri
)
5784 /* Generate the permute statement if necessary. */
5785 tree first_vec
= dr_chain
[first_vec_index
+ ri
];
5786 tree second_vec
= dr_chain
[second_vec_index
+ ri
];
5790 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
5792 = vect_create_destination_var (gimple_assign_lhs (stmt
),
5794 perm_dest
= make_ssa_name (perm_dest
);
5796 = gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
5797 first_vec
, second_vec
,
5799 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
,
5803 /* If mask was NULL_TREE generate the requested
5804 identity transform. */
5805 perm_stmt
= SSA_NAME_DEF_STMT (first_vec
);
5807 /* Store the vector statement in NODE. */
5808 SLP_TREE_VEC_STMTS (node
)[vect_stmts_counter
++] = perm_stmt
;
5813 first_vec_index
= -1;
5814 second_vec_index
= -1;
5822 *n_loads
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
5825 /* Enforced above when !repeating_p. */
5826 unsigned int const_nunits
= nunits
.to_constant ();
5828 bool load_seen
= false;
5829 for (unsigned i
= 0; i
< in_nlanes
; ++i
)
5831 if (i
% const_nunits
== 0)
5837 if (bitmap_bit_p (used_in_lanes
, i
))
5848 /* Produce the next vector result for SLP permutation NODE by adding a vector
5849 statement at GSI. If MASK_VEC is nonnull, add:
5851 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
5855 <new SSA name> = FIRST_DEF. */
5858 vect_add_slp_permutation (vec_info
*vinfo
, gimple_stmt_iterator
*gsi
,
5859 slp_tree node
, tree first_def
, tree second_def
,
5862 tree vectype
= SLP_TREE_VECTYPE (node
);
5864 /* ??? We SLP match existing vector element extracts but
5865 allow punning which we need to re-instantiate at uses
5866 but have no good way of explicitly representing. */
5867 if (!types_compatible_p (TREE_TYPE (first_def
), vectype
))
5870 = gimple_build_assign (make_ssa_name (vectype
),
5871 build1 (VIEW_CONVERT_EXPR
, vectype
, first_def
));
5872 vect_finish_stmt_generation (vinfo
, NULL
, conv_stmt
, gsi
);
5873 first_def
= gimple_assign_lhs (conv_stmt
);
5876 tree perm_dest
= make_ssa_name (vectype
);
5879 if (!types_compatible_p (TREE_TYPE (second_def
), vectype
))
5882 = gimple_build_assign (make_ssa_name (vectype
),
5883 build1 (VIEW_CONVERT_EXPR
,
5884 vectype
, second_def
));
5885 vect_finish_stmt_generation (vinfo
, NULL
, conv_stmt
, gsi
);
5886 second_def
= gimple_assign_lhs (conv_stmt
);
5888 perm_stmt
= gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
5889 first_def
, second_def
,
5893 /* We need a copy here in case the def was external. */
5894 perm_stmt
= gimple_build_assign (perm_dest
, first_def
);
5895 vect_finish_stmt_generation (vinfo
, NULL
, perm_stmt
, gsi
);
5896 /* Store the vector statement in NODE. */
5897 SLP_TREE_VEC_STMTS (node
).quick_push (perm_stmt
);
5900 /* Vectorize the SLP permutations in NODE as specified
5901 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5902 child number and lane number.
5903 Interleaving of two two-lane two-child SLP subtrees (not supported):
5904 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5905 A blend of two four-lane two-child SLP subtrees:
5906 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5907 Highpart of a four-lane one-child SLP subtree (not supported):
5908 [ { 0, 2 }, { 0, 3 } ]
5909 Where currently only a subset is supported by code generating below. */
5912 vectorizable_slp_permutation (vec_info
*vinfo
, gimple_stmt_iterator
*gsi
,
5913 slp_tree node
, stmt_vector_for_cost
*cost_vec
)
5915 tree vectype
= SLP_TREE_VECTYPE (node
);
5917 /* ??? We currently only support all same vector input and output types
5918 while the SLP IL should really do a concat + select and thus accept
5919 arbitrary mismatches. */
5922 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
5923 bool repeating_p
= multiple_p (nunits
, SLP_TREE_LANES (node
));
5924 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5926 if (!vect_maybe_update_slp_op_vectype (child
, vectype
)
5927 || !types_compatible_p (SLP_TREE_VECTYPE (child
), vectype
))
5929 if (dump_enabled_p ())
5930 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5931 "Unsupported lane permutation\n");
5934 if (SLP_TREE_LANES (child
) != SLP_TREE_LANES (node
))
5935 repeating_p
= false;
5938 vec
<std::pair
<unsigned, unsigned> > &perm
= SLP_TREE_LANE_PERMUTATION (node
);
5939 gcc_assert (perm
.length () == SLP_TREE_LANES (node
));
5940 if (dump_enabled_p ())
5942 dump_printf_loc (MSG_NOTE
, vect_location
,
5943 "vectorizing permutation");
5944 for (unsigned i
= 0; i
< perm
.length (); ++i
)
5945 dump_printf (MSG_NOTE
, " op%u[%u]", perm
[i
].first
, perm
[i
].second
);
5947 dump_printf (MSG_NOTE
, " (repeat %d)\n", SLP_TREE_LANES (node
));
5948 dump_printf (MSG_NOTE
, "\n");
5951 /* REPEATING_P is true if every output vector is guaranteed to use the
5952 same permute vector. We can handle that case for both variable-length
5953 and constant-length vectors, but we only handle other cases for
5954 constant-length vectors.
5958 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
5959 mask vector that we want to build.
5961 - NCOPIES to the number of copies of PERM that we need in order
5962 to build the necessary permute mask vectors.
5964 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
5965 for each permute mask vector. This is only relevant when GSI is
5968 unsigned nelts_per_pattern
;
5970 unsigned noutputs_per_mask
;
5973 /* We need a single permute mask vector that has the form:
5975 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
5977 In other words, the original n-element permute in PERM is
5978 "unrolled" to fill a full vector. The stepped vector encoding
5979 that we use for permutes requires 3n elements. */
5980 npatterns
= SLP_TREE_LANES (node
);
5981 nelts_per_pattern
= ncopies
= 3;
5982 noutputs_per_mask
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
5986 /* Calculate every element of every permute mask vector explicitly,
5987 instead of relying on the pattern described above. */
5988 if (!nunits
.is_constant (&npatterns
))
5990 nelts_per_pattern
= ncopies
= 1;
5991 if (loop_vec_info linfo
= dyn_cast
<loop_vec_info
> (vinfo
))
5992 if (!LOOP_VINFO_VECT_FACTOR (linfo
).is_constant (&ncopies
))
5994 noutputs_per_mask
= 1;
5996 unsigned olanes
= ncopies
* SLP_TREE_LANES (node
);
5997 gcc_assert (repeating_p
|| multiple_p (olanes
, nunits
));
5999 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
6000 from the { SLP operand, scalar lane } permutation as recorded in the
6001 SLP node as intermediate step. This part should already work
6002 with SLP children with arbitrary number of lanes. */
6003 auto_vec
<std::pair
<std::pair
<unsigned, unsigned>, unsigned> > vperm
;
6004 auto_vec
<unsigned> active_lane
;
6005 vperm
.create (olanes
);
6006 active_lane
.safe_grow_cleared (SLP_TREE_CHILDREN (node
).length (), true);
6007 for (unsigned i
= 0; i
< ncopies
; ++i
)
6009 for (unsigned pi
= 0; pi
< perm
.length (); ++pi
)
6011 std::pair
<unsigned, unsigned> p
= perm
[pi
];
6012 tree vtype
= SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node
)[p
.first
]);
6014 vperm
.quick_push ({{p
.first
, 0}, p
.second
+ active_lane
[p
.first
]});
6017 /* We checked above that the vectors are constant-length. */
6018 unsigned vnunits
= TYPE_VECTOR_SUBPARTS (vtype
).to_constant ();
6019 unsigned vi
= (active_lane
[p
.first
] + p
.second
) / vnunits
;
6020 unsigned vl
= (active_lane
[p
.first
] + p
.second
) % vnunits
;
6021 vperm
.quick_push ({{p
.first
, vi
}, vl
});
6024 /* Advance to the next group. */
6025 for (unsigned j
= 0; j
< SLP_TREE_CHILDREN (node
).length (); ++j
)
6026 active_lane
[j
] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node
)[j
]);
6029 if (dump_enabled_p ())
6031 dump_printf_loc (MSG_NOTE
, vect_location
, "as");
6032 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
6036 ? multiple_p (i
, npatterns
)
6037 : multiple_p (i
, TYPE_VECTOR_SUBPARTS (vectype
))))
6038 dump_printf (MSG_NOTE
, ",");
6039 dump_printf (MSG_NOTE
, " vops%u[%u][%u]",
6040 vperm
[i
].first
.first
, vperm
[i
].first
.second
,
6043 dump_printf (MSG_NOTE
, "\n");
6046 /* We can only handle two-vector permutes, everything else should
6047 be lowered on the SLP level. The following is closely inspired
6048 by vect_transform_slp_perm_load and is supposed to eventually
6050 ??? As intermediate step do code-gen in the SLP tree representation
6052 std::pair
<unsigned, unsigned> first_vec
= std::make_pair (-1U, -1U);
6053 std::pair
<unsigned, unsigned> second_vec
= std::make_pair (-1U, -1U);
6054 unsigned int index
= 0;
6055 poly_uint64 mask_element
;
6056 vec_perm_builder mask
;
6057 mask
.new_vector (nunits
, npatterns
, nelts_per_pattern
);
6058 unsigned int count
= mask
.encoded_nelts ();
6059 mask
.quick_grow (count
);
6060 vec_perm_indices indices
;
6061 unsigned nperms
= 0;
6062 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
6064 mask_element
= vperm
[i
].second
;
6065 if (first_vec
.first
== -1U
6066 || first_vec
== vperm
[i
].first
)
6067 first_vec
= vperm
[i
].first
;
6068 else if (second_vec
.first
== -1U
6069 || second_vec
== vperm
[i
].first
)
6071 second_vec
= vperm
[i
].first
;
6072 mask_element
+= nunits
;
6076 if (dump_enabled_p ())
6077 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6078 "permutation requires at "
6079 "least three vectors\n");
6084 mask
[index
++] = mask_element
;
6088 indices
.new_vector (mask
, second_vec
.first
== -1U ? 1 : 2, nunits
);
6089 bool identity_p
= indices
.series_p (0, 1, 0, 1);
6091 && !can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6093 if (dump_enabled_p ())
6095 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
6097 "unsupported vect permute { ");
6098 for (i
= 0; i
< count
; ++i
)
6100 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
6101 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
6103 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
6113 if (second_vec
.first
== -1U)
6114 second_vec
= first_vec
;
6117 first_node
= SLP_TREE_CHILDREN (node
)[first_vec
.first
],
6118 second_node
= SLP_TREE_CHILDREN (node
)[second_vec
.first
];
6120 tree mask_vec
= NULL_TREE
;
6122 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
6124 for (unsigned int vi
= 0; vi
< noutputs_per_mask
; ++vi
)
6127 = vect_get_slp_vect_def (first_node
,
6128 first_vec
.second
+ vi
);
6130 = vect_get_slp_vect_def (second_node
,
6131 second_vec
.second
+ vi
);
6132 vect_add_slp_permutation (vinfo
, gsi
, node
, first_def
,
6133 second_def
, mask_vec
);
6138 first_vec
= std::make_pair (-1U, -1U);
6139 second_vec
= std::make_pair (-1U, -1U);
6144 record_stmt_cost (cost_vec
, nperms
, vec_perm
, NULL
, vectype
, 0, vect_body
);
6149 /* Vectorize SLP NODE. */
6152 vect_schedule_slp_node (vec_info
*vinfo
,
6153 slp_tree node
, slp_instance instance
)
6155 gimple_stmt_iterator si
;
6159 /* For existing vectors there's nothing to do. */
6160 if (SLP_TREE_VEC_DEFS (node
).exists ())
6163 gcc_assert (SLP_TREE_VEC_STMTS (node
).is_empty ());
6165 /* Vectorize externals and constants. */
6166 if (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
6167 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
6169 /* ??? vectorizable_shift can end up using a scalar operand which is
6170 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
6171 node in this case. */
6172 if (!SLP_TREE_VECTYPE (node
))
6175 vect_create_constant_vectors (vinfo
, node
);
6179 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
6181 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) != 0);
6182 SLP_TREE_VEC_STMTS (node
).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node
));
6184 if (dump_enabled_p ())
6185 dump_printf_loc (MSG_NOTE
, vect_location
,
6186 "------>vectorizing SLP node starting from: %G",
6189 if (STMT_VINFO_DATA_REF (stmt_info
)
6190 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
6192 /* Vectorized loads go before the first scalar load to make it
6193 ready early, vectorized stores go before the last scalar
6194 stmt which is where all uses are ready. */
6195 stmt_vec_info last_stmt_info
= NULL
;
6196 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
6197 last_stmt_info
= vect_find_first_scalar_stmt_in_slp (node
);
6198 else /* DR_IS_WRITE */
6199 last_stmt_info
= vect_find_last_scalar_stmt_in_slp (node
);
6200 si
= gsi_for_stmt (last_stmt_info
->stmt
);
6202 else if ((STMT_VINFO_TYPE (stmt_info
) == cycle_phi_info_type
6203 || STMT_VINFO_TYPE (stmt_info
) == induc_vec_info_type
6204 || STMT_VINFO_TYPE (stmt_info
) == phi_info_type
)
6205 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
6207 /* For PHI node vectorization we do not use the insertion iterator. */
6212 /* Emit other stmts after the children vectorized defs which is
6213 earliest possible. */
6214 gimple
*last_stmt
= NULL
;
6215 bool seen_vector_def
= false;
6216 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
6217 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
6219 /* For fold-left reductions we are retaining the scalar
6220 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
6221 set so the representation isn't perfect. Resort to the
6222 last scalar def here. */
6223 if (SLP_TREE_VEC_STMTS (child
).is_empty ())
6225 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child
))
6226 == cycle_phi_info_type
);
6227 gphi
*phi
= as_a
<gphi
*>
6228 (vect_find_last_scalar_stmt_in_slp (child
)->stmt
);
6230 || vect_stmt_dominates_stmt_p (last_stmt
, phi
))
6233 /* We are emitting all vectorized stmts in the same place and
6234 the last one is the last.
6235 ??? Unless we have a load permutation applied and that
6236 figures to re-use an earlier generated load. */
6239 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child
), j
, vstmt
)
6241 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
6244 else if (!SLP_TREE_VECTYPE (child
))
6246 /* For externals we use unvectorized at all scalar defs. */
6249 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child
), j
, def
)
6250 if (TREE_CODE (def
) == SSA_NAME
6251 && !SSA_NAME_IS_DEFAULT_DEF (def
))
6253 gimple
*stmt
= SSA_NAME_DEF_STMT (def
);
6255 || vect_stmt_dominates_stmt_p (last_stmt
, stmt
))
6261 /* For externals we have to look at all defs since their
6262 insertion place is decided per vector. But beware
6263 of pre-existing vectors where we need to make sure
6264 we do not insert before the region boundary. */
6265 if (SLP_TREE_SCALAR_OPS (child
).is_empty ()
6266 && !vinfo
->lookup_def (SLP_TREE_VEC_DEFS (child
)[0]))
6267 seen_vector_def
= true;
6272 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child
), j
, vdef
)
6273 if (TREE_CODE (vdef
) == SSA_NAME
6274 && !SSA_NAME_IS_DEFAULT_DEF (vdef
))
6276 gimple
*vstmt
= SSA_NAME_DEF_STMT (vdef
);
6278 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
6283 /* This can happen when all children are pre-existing vectors or
6286 last_stmt
= vect_find_first_scalar_stmt_in_slp (node
)->stmt
;
6289 gcc_assert (seen_vector_def
);
6290 si
= gsi_after_labels (as_a
<bb_vec_info
> (vinfo
)->bbs
[0]);
6292 else if (is_a
<gphi
*> (last_stmt
))
6293 si
= gsi_after_labels (gimple_bb (last_stmt
));
6296 si
= gsi_for_stmt (last_stmt
);
6301 bool done_p
= false;
6303 /* Handle purely internal nodes. */
6304 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
6306 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
6307 be shared with different SLP nodes (but usually it's the same
6308 operation apart from the case the stmt is only there for denoting
6309 the actual scalar lane defs ...). So do not call vect_transform_stmt
6310 but open-code it here (partly). */
6311 bool done
= vectorizable_slp_permutation (vinfo
, &si
, node
, NULL
);
6316 vect_transform_stmt (vinfo
, stmt_info
, &si
, node
, instance
);
6319 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
6320 For loop vectorization this is done in vectorizable_call, but for SLP
6321 it needs to be deferred until end of vect_schedule_slp, because multiple
6322 SLP instances may refer to the same scalar stmt. */
6325 vect_remove_slp_scalar_calls (vec_info
*vinfo
,
6326 slp_tree node
, hash_set
<slp_tree
> &visited
)
6329 gimple_stmt_iterator gsi
;
6333 stmt_vec_info stmt_info
;
6335 if (!node
|| SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
6338 if (visited
.add (node
))
6341 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
6342 vect_remove_slp_scalar_calls (vinfo
, child
, visited
);
6344 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
6346 gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
6347 if (!stmt
|| gimple_bb (stmt
) == NULL
)
6349 if (is_pattern_stmt_p (stmt_info
)
6350 || !PURE_SLP_STMT (stmt_info
))
6352 lhs
= gimple_call_lhs (stmt
);
6353 new_stmt
= gimple_build_assign (lhs
, build_zero_cst (TREE_TYPE (lhs
)));
6354 gsi
= gsi_for_stmt (stmt
);
6355 vinfo
->replace_stmt (&gsi
, stmt_info
, new_stmt
);
6356 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt
)) = new_stmt
;
6361 vect_remove_slp_scalar_calls (vec_info
*vinfo
, slp_tree node
)
6363 hash_set
<slp_tree
> visited
;
6364 vect_remove_slp_scalar_calls (vinfo
, node
, visited
);
6367 /* Vectorize the instance root. */
6370 vectorize_slp_instance_root_stmt (slp_tree node
, slp_instance instance
)
6372 gassign
*rstmt
= NULL
;
6374 if (instance
->kind
== slp_inst_kind_ctor
)
6376 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) == 1)
6381 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
6383 tree vect_lhs
= gimple_get_lhs (child_stmt
);
6384 tree root_lhs
= gimple_get_lhs (instance
->root_stmts
[0]->stmt
);
6385 if (!useless_type_conversion_p (TREE_TYPE (root_lhs
),
6386 TREE_TYPE (vect_lhs
)))
6387 vect_lhs
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (root_lhs
),
6389 rstmt
= gimple_build_assign (root_lhs
, vect_lhs
);
6393 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) > 1)
6395 int nelts
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
6398 vec
<constructor_elt
, va_gc
> *v
;
6399 vec_alloc (v
, nelts
);
6401 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
6402 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
,
6403 gimple_get_lhs (child_stmt
));
6404 tree lhs
= gimple_get_lhs (instance
->root_stmts
[0]->stmt
);
6406 = TREE_TYPE (gimple_assign_rhs1 (instance
->root_stmts
[0]->stmt
));
6407 tree r_constructor
= build_constructor (rtype
, v
);
6408 rstmt
= gimple_build_assign (lhs
, r_constructor
);
6416 gimple_stmt_iterator rgsi
= gsi_for_stmt (instance
->root_stmts
[0]->stmt
);
6417 gsi_replace (&rgsi
, rstmt
, true);
6427 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
6430 vect_schedule_scc (vec_info
*vinfo
, slp_tree node
, slp_instance instance
,
6431 hash_map
<slp_tree
, slp_scc_info
> &scc_info
,
6432 int &maxdfs
, vec
<slp_tree
> &stack
)
6435 slp_scc_info
*info
= &scc_info
.get_or_insert (node
, &existed_p
);
6436 gcc_assert (!existed_p
);
6438 info
->lowlink
= maxdfs
;
6442 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
6444 info
->on_stack
= false;
6445 vect_schedule_slp_node (vinfo
, node
, instance
);
6449 info
->on_stack
= true;
6450 stack
.safe_push (node
);
6455 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
6459 slp_scc_info
*child_info
= scc_info
.get (child
);
6462 vect_schedule_scc (vinfo
, child
, instance
, scc_info
, maxdfs
, stack
);
6463 /* Recursion might have re-allocated the node. */
6464 info
= scc_info
.get (node
);
6465 child_info
= scc_info
.get (child
);
6466 info
->lowlink
= MIN (info
->lowlink
, child_info
->lowlink
);
6468 else if (child_info
->on_stack
)
6469 info
->lowlink
= MIN (info
->lowlink
, child_info
->dfs
);
6471 if (info
->lowlink
!= info
->dfs
)
6474 auto_vec
<slp_tree
, 4> phis_to_fixup
;
6477 if (stack
.last () == node
)
6480 info
->on_stack
= false;
6481 vect_schedule_slp_node (vinfo
, node
, instance
);
6482 if (SLP_TREE_CODE (node
) != VEC_PERM_EXPR
6483 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (node
)->stmt
))
6484 phis_to_fixup
.quick_push (node
);
6489 int last_idx
= stack
.length () - 1;
6490 while (stack
[last_idx
] != node
)
6492 /* We can break the cycle at PHIs who have at least one child
6493 code generated. Then we could re-start the DFS walk until
6494 all nodes in the SCC are covered (we might have new entries
6495 for only back-reachable nodes). But it's simpler to just
6496 iterate and schedule those that are ready. */
6497 unsigned todo
= stack
.length () - last_idx
;
6500 for (int idx
= stack
.length () - 1; idx
>= last_idx
; --idx
)
6502 slp_tree entry
= stack
[idx
];
6505 bool phi
= (SLP_TREE_CODE (entry
) != VEC_PERM_EXPR
6506 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (entry
)->stmt
));
6508 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry
), i
, child
)
6515 else if (scc_info
.get (child
)->on_stack
)
6533 vect_schedule_slp_node (vinfo
, entry
, instance
);
6534 scc_info
.get (entry
)->on_stack
= false;
6538 phis_to_fixup
.safe_push (entry
);
6545 stack
.truncate (last_idx
);
6548 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
6550 FOR_EACH_VEC_ELT (phis_to_fixup
, i
, phi_node
)
6552 gphi
*phi
= as_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (phi_node
)->stmt
);
6555 FOR_EACH_EDGE (e
, ei
, gimple_bb (phi
)->preds
)
6557 unsigned dest_idx
= e
->dest_idx
;
6558 child
= SLP_TREE_CHILDREN (phi_node
)[dest_idx
];
6559 if (!child
|| SLP_TREE_DEF_TYPE (child
) != vect_internal_def
)
6561 /* Simply fill all args. */
6562 for (unsigned i
= 0; i
< SLP_TREE_VEC_STMTS (phi_node
).length (); ++i
)
6563 add_phi_arg (as_a
<gphi
*> (SLP_TREE_VEC_STMTS (phi_node
)[i
]),
6564 vect_get_slp_vect_def (child
, i
),
6565 e
, gimple_phi_arg_location (phi
, dest_idx
));
6570 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
6573 vect_schedule_slp (vec_info
*vinfo
, vec
<slp_instance
> slp_instances
)
6575 slp_instance instance
;
6578 hash_map
<slp_tree
, slp_scc_info
> scc_info
;
6580 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
6582 slp_tree node
= SLP_INSTANCE_TREE (instance
);
6583 if (dump_enabled_p ())
6585 dump_printf_loc (MSG_NOTE
, vect_location
,
6586 "Vectorizing SLP tree:\n");
6588 if (!SLP_INSTANCE_ROOT_STMTS (instance
).is_empty ())
6589 dump_printf_loc (MSG_NOTE
, vect_location
, "Root stmt: %G",
6590 SLP_INSTANCE_ROOT_STMTS (instance
)[0]->stmt
);
6591 vect_print_slp_graph (MSG_NOTE
, vect_location
,
6592 SLP_INSTANCE_TREE (instance
));
6594 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
6595 have a PHI be the node breaking the cycle. */
6596 auto_vec
<slp_tree
> stack
;
6597 if (!scc_info
.get (node
))
6598 vect_schedule_scc (vinfo
, node
, instance
, scc_info
, maxdfs
, stack
);
6600 if (!SLP_INSTANCE_ROOT_STMTS (instance
).is_empty ())
6601 vectorize_slp_instance_root_stmt (node
, instance
);
6603 if (dump_enabled_p ())
6604 dump_printf_loc (MSG_NOTE
, vect_location
,
6605 "vectorizing stmts using SLP.\n");
6608 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
6610 slp_tree root
= SLP_INSTANCE_TREE (instance
);
6611 stmt_vec_info store_info
;
6614 /* Remove scalar call stmts. Do not do this for basic-block
6615 vectorization as not all uses may be vectorized.
6616 ??? Why should this be necessary? DCE should be able to
6617 remove the stmts itself.
6618 ??? For BB vectorization we can as well remove scalar
6619 stmts starting from the SLP tree root if they have no
6621 if (is_a
<loop_vec_info
> (vinfo
))
6622 vect_remove_slp_scalar_calls (vinfo
, root
);
6624 /* Remove vectorized stores original scalar stmts. */
6625 for (j
= 0; SLP_TREE_SCALAR_STMTS (root
).iterate (j
, &store_info
); j
++)
6627 if (!STMT_VINFO_DATA_REF (store_info
)
6628 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info
)))
6631 store_info
= vect_orig_stmt (store_info
);
6632 /* Free the attached stmt_vec_info and remove the stmt. */
6633 vinfo
->remove_stmt (store_info
);
6635 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
6636 to not crash in vect_free_slp_tree later. */
6637 if (SLP_TREE_REPRESENTATIVE (root
) == store_info
)
6638 SLP_TREE_REPRESENTATIVE (root
) = NULL
;