1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
30 #include "tree-pass.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
51 #include "alloc-pool.h"
53 static bool vectorizable_slp_permutation (vec_info
*, gimple_stmt_iterator
*,
54 slp_tree
, stmt_vector_for_cost
*);
56 static object_allocator
<_slp_tree
> *slp_tree_pool
;
57 static slp_tree slp_first_node
;
62 slp_tree_pool
= new object_allocator
<_slp_tree
> ("SLP nodes");
68 while (slp_first_node
)
69 delete slp_first_node
;
75 _slp_tree::operator new (size_t n
)
77 gcc_assert (n
== sizeof (_slp_tree
));
78 return slp_tree_pool
->allocate_raw ();
82 _slp_tree::operator delete (void *node
, size_t n
)
84 gcc_assert (n
== sizeof (_slp_tree
));
85 slp_tree_pool
->remove_raw (node
);
89 /* Initialize a SLP node. */
91 _slp_tree::_slp_tree ()
93 this->prev_node
= NULL
;
95 slp_first_node
->prev_node
= this;
96 this->next_node
= slp_first_node
;
97 slp_first_node
= this;
98 SLP_TREE_SCALAR_STMTS (this) = vNULL
;
99 SLP_TREE_SCALAR_OPS (this) = vNULL
;
100 SLP_TREE_VEC_STMTS (this) = vNULL
;
101 SLP_TREE_VEC_DEFS (this) = vNULL
;
102 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
103 SLP_TREE_CHILDREN (this) = vNULL
;
104 SLP_TREE_LOAD_PERMUTATION (this) = vNULL
;
105 SLP_TREE_LANE_PERMUTATION (this) = vNULL
;
106 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def
;
107 SLP_TREE_CODE (this) = ERROR_MARK
;
108 SLP_TREE_VECTYPE (this) = NULL_TREE
;
109 SLP_TREE_REPRESENTATIVE (this) = NULL
;
110 SLP_TREE_REF_COUNT (this) = 1;
111 this->max_nunits
= 1;
115 /* Tear down a SLP node. */
117 _slp_tree::~_slp_tree ()
120 this->prev_node
->next_node
= this->next_node
;
122 slp_first_node
= this->next_node
;
124 this->next_node
->prev_node
= this->prev_node
;
125 SLP_TREE_CHILDREN (this).release ();
126 SLP_TREE_SCALAR_STMTS (this).release ();
127 SLP_TREE_SCALAR_OPS (this).release ();
128 SLP_TREE_VEC_STMTS (this).release ();
129 SLP_TREE_VEC_DEFS (this).release ();
130 SLP_TREE_LOAD_PERMUTATION (this).release ();
131 SLP_TREE_LANE_PERMUTATION (this).release ();
134 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
137 vect_free_slp_tree (slp_tree node
)
142 if (--SLP_TREE_REF_COUNT (node
) != 0)
145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
147 vect_free_slp_tree (child
);
149 /* If the node defines any SLP only patterns then those patterns are no
150 longer valid and should be removed. */
151 stmt_vec_info rep_stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
152 if (rep_stmt_info
&& STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info
))
154 stmt_vec_info stmt_info
= vect_orig_stmt (rep_stmt_info
);
155 STMT_VINFO_IN_PATTERN_P (stmt_info
) = false;
156 STMT_SLP_TYPE (stmt_info
) = STMT_SLP_TYPE (rep_stmt_info
);
162 /* Return a location suitable for dumpings related to the SLP instance. */
165 _slp_instance::location () const
168 return root_stmt
->stmt
;
170 return SLP_TREE_SCALAR_STMTS (root
)[0]->stmt
;
174 /* Free the memory allocated for the SLP instance. */
177 vect_free_slp_instance (slp_instance instance
)
179 vect_free_slp_tree (SLP_INSTANCE_TREE (instance
));
180 SLP_INSTANCE_LOADS (instance
).release ();
181 instance
->subgraph_entries
.release ();
182 instance
->cost_vec
.release ();
187 /* Create an SLP node for SCALAR_STMTS. */
190 vect_create_new_slp_node (unsigned nops
, tree_code code
)
192 slp_tree node
= new _slp_tree
;
193 SLP_TREE_SCALAR_STMTS (node
) = vNULL
;
194 SLP_TREE_CHILDREN (node
).create (nops
);
195 SLP_TREE_DEF_TYPE (node
) = vect_internal_def
;
196 SLP_TREE_CODE (node
) = code
;
199 /* Create an SLP node for SCALAR_STMTS. */
202 vect_create_new_slp_node (slp_tree node
,
203 vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
205 SLP_TREE_SCALAR_STMTS (node
) = scalar_stmts
;
206 SLP_TREE_CHILDREN (node
).create (nops
);
207 SLP_TREE_DEF_TYPE (node
) = vect_internal_def
;
208 SLP_TREE_REPRESENTATIVE (node
) = scalar_stmts
[0];
209 SLP_TREE_LANES (node
) = scalar_stmts
.length ();
213 /* Create an SLP node for SCALAR_STMTS. */
216 vect_create_new_slp_node (vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
218 return vect_create_new_slp_node (new _slp_tree
, scalar_stmts
, nops
);
221 /* Create an SLP node for OPS. */
224 vect_create_new_slp_node (slp_tree node
, vec
<tree
> ops
)
226 SLP_TREE_SCALAR_OPS (node
) = ops
;
227 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
228 SLP_TREE_LANES (node
) = ops
.length ();
232 /* Create an SLP node for OPS. */
235 vect_create_new_slp_node (vec
<tree
> ops
)
237 return vect_create_new_slp_node (new _slp_tree
, ops
);
241 /* This structure is used in creation of an SLP tree. Each instance
242 corresponds to the same operand in a group of scalar stmts in an SLP
244 typedef struct _slp_oprnd_info
246 /* Def-stmts for the operands. */
247 vec
<stmt_vec_info
> def_stmts
;
250 /* Information about the first statement, its vector def-type, type, the
251 operand itself in case it's constant, and an indication if it's a pattern
254 enum vect_def_type first_dt
;
259 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
261 static vec
<slp_oprnd_info
>
262 vect_create_oprnd_info (int nops
, int group_size
)
265 slp_oprnd_info oprnd_info
;
266 vec
<slp_oprnd_info
> oprnds_info
;
268 oprnds_info
.create (nops
);
269 for (i
= 0; i
< nops
; i
++)
271 oprnd_info
= XNEW (struct _slp_oprnd_info
);
272 oprnd_info
->def_stmts
.create (group_size
);
273 oprnd_info
->ops
.create (group_size
);
274 oprnd_info
->first_dt
= vect_uninitialized_def
;
275 oprnd_info
->first_op_type
= NULL_TREE
;
276 oprnd_info
->any_pattern
= false;
277 oprnds_info
.quick_push (oprnd_info
);
284 /* Free operands info. */
287 vect_free_oprnd_info (vec
<slp_oprnd_info
> &oprnds_info
)
290 slp_oprnd_info oprnd_info
;
292 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
294 oprnd_info
->def_stmts
.release ();
295 oprnd_info
->ops
.release ();
296 XDELETE (oprnd_info
);
299 oprnds_info
.release ();
303 /* Return true if STMTS contains a pattern statement. */
306 vect_contains_pattern_stmt_p (vec
<stmt_vec_info
> stmts
)
308 stmt_vec_info stmt_info
;
310 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
311 if (is_pattern_stmt_p (stmt_info
))
316 /* Return true when all lanes in the external or constant NODE have
320 vect_slp_tree_uniform_p (slp_tree node
)
322 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
323 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
);
325 /* Pre-exsting vectors. */
326 if (SLP_TREE_SCALAR_OPS (node
).is_empty ())
330 tree op
, first
= NULL_TREE
;
331 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
334 else if (!operand_equal_p (first
, op
, 0))
340 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
341 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
345 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info
,
346 stmt_vec_info first_stmt_info
)
348 stmt_vec_info next_stmt_info
= first_stmt_info
;
351 if (first_stmt_info
!= DR_GROUP_FIRST_ELEMENT (stmt_info
))
356 if (next_stmt_info
== stmt_info
)
358 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
360 result
+= DR_GROUP_GAP (next_stmt_info
);
362 while (next_stmt_info
);
367 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
368 using the method implemented by duplicate_and_interleave. Return true
369 if so, returning the number of intermediate vectors in *NVECTORS_OUT
370 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
374 can_duplicate_and_interleave_p (vec_info
*vinfo
, unsigned int count
,
375 tree elt_type
, unsigned int *nvectors_out
,
376 tree
*vector_type_out
,
379 tree base_vector_type
= get_vectype_for_scalar_type (vinfo
, elt_type
, count
);
380 if (!base_vector_type
|| !VECTOR_MODE_P (TYPE_MODE (base_vector_type
)))
383 machine_mode base_vector_mode
= TYPE_MODE (base_vector_type
);
384 poly_int64 elt_bytes
= count
* GET_MODE_UNIT_SIZE (base_vector_mode
);
385 unsigned int nvectors
= 1;
388 scalar_int_mode int_mode
;
389 poly_int64 elt_bits
= elt_bytes
* BITS_PER_UNIT
;
390 if (int_mode_for_size (elt_bits
, 1).exists (&int_mode
))
392 /* Get the natural vector type for this SLP group size. */
393 tree int_type
= build_nonstandard_integer_type
394 (GET_MODE_BITSIZE (int_mode
), 1);
396 = get_vectype_for_scalar_type (vinfo
, int_type
, count
);
398 && VECTOR_MODE_P (TYPE_MODE (vector_type
))
399 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type
)),
400 GET_MODE_SIZE (base_vector_mode
)))
402 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
403 together into elements of type INT_TYPE and using the result
404 to build NVECTORS vectors. */
405 poly_uint64 nelts
= GET_MODE_NUNITS (TYPE_MODE (vector_type
));
406 vec_perm_builder
sel1 (nelts
, 2, 3);
407 vec_perm_builder
sel2 (nelts
, 2, 3);
408 poly_int64 half_nelts
= exact_div (nelts
, 2);
409 for (unsigned int i
= 0; i
< 3; ++i
)
412 sel1
.quick_push (i
+ nelts
);
413 sel2
.quick_push (half_nelts
+ i
);
414 sel2
.quick_push (half_nelts
+ i
+ nelts
);
416 vec_perm_indices
indices1 (sel1
, 2, nelts
);
417 vec_perm_indices
indices2 (sel2
, 2, nelts
);
418 if (can_vec_perm_const_p (TYPE_MODE (vector_type
), indices1
)
419 && can_vec_perm_const_p (TYPE_MODE (vector_type
), indices2
))
422 *nvectors_out
= nvectors
;
424 *vector_type_out
= vector_type
;
427 permutes
[0] = vect_gen_perm_mask_checked (vector_type
,
429 permutes
[1] = vect_gen_perm_mask_checked (vector_type
,
436 if (!multiple_p (elt_bytes
, 2, &elt_bytes
))
442 /* Return true if DTA and DTB match. */
445 vect_def_types_match (enum vect_def_type dta
, enum vect_def_type dtb
)
448 || ((dta
== vect_external_def
|| dta
== vect_constant_def
)
449 && (dtb
== vect_external_def
|| dtb
== vect_constant_def
)));
452 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
453 they are of a valid type and that they match the defs of the first stmt of
454 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
455 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
456 indicates swap is required for cond_expr stmts. Specifically, *SWAP
457 is 1 if STMT is cond and operands of comparison need to be swapped;
458 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
459 If there is any operand swap in this function, *SWAP is set to non-zero
461 If there was a fatal error return -1; if the error could be corrected by
462 swapping operands of father node of this one, return 1; if everything is
465 vect_get_and_check_slp_defs (vec_info
*vinfo
, unsigned char swap
,
467 vec
<stmt_vec_info
> stmts
, unsigned stmt_num
,
468 vec
<slp_oprnd_info
> *oprnds_info
)
470 stmt_vec_info stmt_info
= stmts
[stmt_num
];
472 unsigned int i
, number_of_oprnds
;
473 enum vect_def_type dt
= vect_uninitialized_def
;
474 slp_oprnd_info oprnd_info
;
475 int first_op_idx
= 1;
476 unsigned int commutative_op
= -1U;
477 bool first_op_cond
= false;
478 bool first
= stmt_num
== 0;
480 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
482 number_of_oprnds
= gimple_call_num_args (stmt
);
484 if (gimple_call_internal_p (stmt
))
486 internal_fn ifn
= gimple_call_internal_fn (stmt
);
487 commutative_op
= first_commutative_argument (ifn
);
489 /* Masked load, only look at mask. */
490 if (ifn
== IFN_MASK_LOAD
)
492 number_of_oprnds
= 1;
493 /* Mask operand index. */
498 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
500 enum tree_code code
= gimple_assign_rhs_code (stmt
);
501 number_of_oprnds
= gimple_num_ops (stmt
) - 1;
502 /* Swap can only be done for cond_expr if asked to, otherwise we
503 could result in different comparison code to the first stmt. */
504 if (code
== COND_EXPR
505 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt
)))
507 first_op_cond
= true;
511 commutative_op
= commutative_tree_code (code
) ? 0U : -1U;
513 else if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
514 number_of_oprnds
= gimple_phi_num_args (stmt
);
518 bool swapped
= (swap
!= 0);
519 bool backedge
= false;
520 gcc_assert (!swapped
|| first_op_cond
);
521 enum vect_def_type
*dts
= XALLOCAVEC (enum vect_def_type
, number_of_oprnds
);
522 for (i
= 0; i
< number_of_oprnds
; i
++)
526 /* Map indicating how operands of cond_expr should be swapped. */
527 int maps
[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
528 int *map
= maps
[swap
];
531 oprnd
= TREE_OPERAND (gimple_op (stmt_info
->stmt
,
532 first_op_idx
), map
[i
]);
534 oprnd
= gimple_op (stmt_info
->stmt
, map
[i
]);
536 else if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
538 oprnd
= gimple_phi_arg_def (stmt
, i
);
539 backedge
= dominated_by_p (CDI_DOMINATORS
,
540 gimple_phi_arg_edge (stmt
, i
)->src
,
541 gimple_bb (stmt_info
->stmt
));
544 oprnd
= gimple_op (stmt_info
->stmt
, first_op_idx
+ (swapped
? !i
: i
));
545 if (TREE_CODE (oprnd
) == VIEW_CONVERT_EXPR
)
546 oprnd
= TREE_OPERAND (oprnd
, 0);
548 oprnd_info
= (*oprnds_info
)[i
];
550 stmt_vec_info def_stmt_info
;
551 if (!vect_is_simple_use (oprnd
, vinfo
, &dts
[i
], &def_stmt_info
))
553 if (dump_enabled_p ())
554 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
555 "Build SLP failed: can't analyze def for %T\n",
563 oprnd_info
->def_stmts
.quick_push (NULL
);
564 oprnd_info
->ops
.quick_push (NULL_TREE
);
565 oprnd_info
->first_dt
= vect_uninitialized_def
;
569 oprnd_info
->def_stmts
.quick_push (def_stmt_info
);
570 oprnd_info
->ops
.quick_push (oprnd
);
573 && is_pattern_stmt_p (def_stmt_info
))
575 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info
))
577 oprnd_info
->any_pattern
= true;
579 /* If we promote this to external use the original stmt def. */
580 oprnd_info
->ops
.last ()
581 = gimple_get_lhs (vect_orig_stmt (def_stmt_info
)->stmt
);
584 /* If there's a extern def on a backedge make sure we can
585 code-generate at the region start.
586 ??? This is another case that could be fixed by adjusting
587 how we split the function but at the moment we'd have conflicting
590 && dts
[i
] == vect_external_def
591 && is_a
<bb_vec_info
> (vinfo
)
592 && TREE_CODE (oprnd
) == SSA_NAME
593 && !SSA_NAME_IS_DEFAULT_DEF (oprnd
)
594 && !dominated_by_p (CDI_DOMINATORS
,
595 as_a
<bb_vec_info
> (vinfo
)->bbs
[0],
596 gimple_bb (SSA_NAME_DEF_STMT (oprnd
))))
598 if (dump_enabled_p ())
599 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
600 "Build SLP failed: extern def %T only defined "
601 "on backedge\n", oprnd
);
607 tree type
= TREE_TYPE (oprnd
);
609 if ((dt
== vect_constant_def
610 || dt
== vect_external_def
)
611 && !GET_MODE_SIZE (vinfo
->vector_mode
).is_constant ()
612 && (TREE_CODE (type
) == BOOLEAN_TYPE
613 || !can_duplicate_and_interleave_p (vinfo
, stmts
.length (),
616 if (dump_enabled_p ())
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
618 "Build SLP failed: invalid type of def "
619 "for variable-length SLP %T\n", oprnd
);
623 /* For the swapping logic below force vect_reduction_def
624 for the reduction op in a SLP reduction group. */
625 if (!STMT_VINFO_DATA_REF (stmt_info
)
626 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
627 && (int)i
== STMT_VINFO_REDUC_IDX (stmt_info
)
629 dts
[i
] = dt
= vect_reduction_def
;
631 /* Check the types of the definition. */
634 case vect_external_def
:
635 case vect_constant_def
:
636 case vect_internal_def
:
637 case vect_reduction_def
:
638 case vect_induction_def
:
639 case vect_nested_cycle
:
643 /* FORNOW: Not supported. */
644 if (dump_enabled_p ())
645 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
646 "Build SLP failed: illegal type of def %T\n",
651 oprnd_info
->first_dt
= dt
;
652 oprnd_info
->first_op_type
= type
;
658 /* Now match the operand definition types to that of the first stmt. */
659 for (i
= 0; i
< number_of_oprnds
;)
667 oprnd_info
= (*oprnds_info
)[i
];
669 stmt_vec_info def_stmt_info
= oprnd_info
->def_stmts
[stmt_num
];
670 oprnd
= oprnd_info
->ops
[stmt_num
];
671 tree type
= TREE_TYPE (oprnd
);
673 if (!types_compatible_p (oprnd_info
->first_op_type
, type
))
675 if (dump_enabled_p ())
676 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
677 "Build SLP failed: different operand types\n");
681 /* Not first stmt of the group, check that the def-stmt/s match
682 the def-stmt/s of the first stmt. Allow different definition
683 types for reduction chains: the first stmt must be a
684 vect_reduction_def (a phi node), and the rest
685 end in the reduction chain. */
686 if ((!vect_def_types_match (oprnd_info
->first_dt
, dt
)
687 && !(oprnd_info
->first_dt
== vect_reduction_def
688 && !STMT_VINFO_DATA_REF (stmt_info
)
689 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
691 && !STMT_VINFO_DATA_REF (def_stmt_info
)
692 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
693 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
))))
694 || (!STMT_VINFO_DATA_REF (stmt_info
)
695 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
697 || STMT_VINFO_DATA_REF (def_stmt_info
)
698 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
699 != REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
700 != (oprnd_info
->first_dt
!= vect_reduction_def
))))
702 /* Try swapping operands if we got a mismatch. For BB
703 vectorization only in case it will clearly improve things. */
704 if (i
== commutative_op
&& !swapped
705 && (!is_a
<bb_vec_info
> (vinfo
)
706 || (!vect_def_types_match ((*oprnds_info
)[i
+1]->first_dt
,
708 && (vect_def_types_match (oprnd_info
->first_dt
, dts
[i
+1])
709 || vect_def_types_match
710 ((*oprnds_info
)[i
+1]->first_dt
, dts
[i
])))))
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_NOTE
, vect_location
,
714 "trying swapped operands\n");
715 std::swap (dts
[i
], dts
[i
+1]);
716 std::swap ((*oprnds_info
)[i
]->def_stmts
[stmt_num
],
717 (*oprnds_info
)[i
+1]->def_stmts
[stmt_num
]);
718 std::swap ((*oprnds_info
)[i
]->ops
[stmt_num
],
719 (*oprnds_info
)[i
+1]->ops
[stmt_num
]);
724 if (is_a
<bb_vec_info
> (vinfo
)
725 && !oprnd_info
->any_pattern
)
727 /* Now for commutative ops we should see whether we can
728 make the other operand matching. */
729 if (dump_enabled_p ())
730 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
731 "treating operand as external\n");
732 oprnd_info
->first_dt
= dt
= vect_external_def
;
736 if (dump_enabled_p ())
737 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
738 "Build SLP failed: different types\n");
743 /* Make sure to demote the overall operand to external. */
744 if (dt
== vect_external_def
)
745 oprnd_info
->first_dt
= vect_external_def
;
746 /* For a SLP reduction chain we want to duplicate the reduction to
747 each of the chain members. That gets us a sane SLP graph (still
748 the stmts are not 100% correct wrt the initial values). */
749 else if ((dt
== vect_internal_def
750 || dt
== vect_reduction_def
)
751 && oprnd_info
->first_dt
== vect_reduction_def
752 && !STMT_VINFO_DATA_REF (stmt_info
)
753 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
754 && !STMT_VINFO_DATA_REF (def_stmt_info
)
755 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
756 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
758 oprnd_info
->def_stmts
[stmt_num
] = oprnd_info
->def_stmts
[0];
759 oprnd_info
->ops
[stmt_num
] = oprnd_info
->ops
[0];
768 if (dump_enabled_p ())
769 dump_printf_loc (MSG_NOTE
, vect_location
,
770 "swapped operands to match def types in %G",
777 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
778 Return true if we can, meaning that this choice doesn't conflict with
779 existing SLP nodes that use STMT_INFO. */
782 vect_update_shared_vectype (stmt_vec_info stmt_info
, tree vectype
)
784 tree old_vectype
= STMT_VINFO_VECTYPE (stmt_info
);
786 return useless_type_conversion_p (vectype
, old_vectype
);
788 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
790 /* We maintain the invariant that if any statement in the group is
791 used, all other members of the group have the same vector type. */
792 stmt_vec_info first_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
793 stmt_vec_info member_info
= first_info
;
794 for (; member_info
; member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
795 if (is_pattern_stmt_p (member_info
)
796 && !useless_type_conversion_p (vectype
,
797 STMT_VINFO_VECTYPE (member_info
)))
802 for (member_info
= first_info
; member_info
;
803 member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
804 STMT_VINFO_VECTYPE (member_info
) = vectype
;
808 else if (!is_pattern_stmt_p (stmt_info
))
810 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
814 if (dump_enabled_p ())
816 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
817 "Build SLP failed: incompatible vector"
818 " types for: %G", stmt_info
->stmt
);
819 dump_printf_loc (MSG_NOTE
, vect_location
,
820 " old vector type: %T\n", old_vectype
);
821 dump_printf_loc (MSG_NOTE
, vect_location
,
822 " new vector type: %T\n", vectype
);
827 /* Return true if call statements CALL1 and CALL2 are similar enough
828 to be combined into the same SLP group. */
831 compatible_calls_p (gcall
*call1
, gcall
*call2
)
833 unsigned int nargs
= gimple_call_num_args (call1
);
834 if (nargs
!= gimple_call_num_args (call2
))
837 if (gimple_call_combined_fn (call1
) != gimple_call_combined_fn (call2
))
840 if (gimple_call_internal_p (call1
))
842 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1
)),
843 TREE_TYPE (gimple_call_lhs (call2
))))
845 for (unsigned int i
= 0; i
< nargs
; ++i
)
846 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1
, i
)),
847 TREE_TYPE (gimple_call_arg (call2
, i
))))
852 if (!operand_equal_p (gimple_call_fn (call1
),
853 gimple_call_fn (call2
), 0))
856 if (gimple_call_fntype (call1
) != gimple_call_fntype (call2
))
862 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
863 caller's attempt to find the vector type in STMT_INFO with the narrowest
864 element type. Return true if VECTYPE is nonnull and if it is valid
865 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
866 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
867 vect_build_slp_tree. */
870 vect_record_max_nunits (vec_info
*vinfo
, stmt_vec_info stmt_info
,
871 unsigned int group_size
,
872 tree vectype
, poly_uint64
*max_nunits
)
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
878 "Build SLP failed: unsupported data-type in %G\n",
880 /* Fatal mismatch. */
884 /* If populating the vector type requires unrolling then fail
885 before adjusting *max_nunits for basic-block vectorization. */
886 if (is_a
<bb_vec_info
> (vinfo
)
887 && !multiple_p (group_size
, TYPE_VECTOR_SUBPARTS (vectype
)))
889 if (dump_enabled_p ())
890 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
891 "Build SLP failed: unrolling required "
892 "in basic block SLP\n");
893 /* Fatal mismatch. */
897 /* In case of multiple types we need to detect the smallest type. */
898 vect_update_max_nunits (max_nunits
, vectype
);
902 /* Verify if the scalar stmts STMTS are isomorphic, require data
903 permutation or are of unsupported types of operation. Return
904 true if they are, otherwise return false and indicate in *MATCHES
905 which stmts are not isomorphic to the first one. If MATCHES[0]
906 is false then this indicates the comparison could not be
907 carried out or the stmts will never be vectorized by SLP.
909 Note COND_EXPR is possibly isomorphic to another one after swapping its
910 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
911 the first stmt by swapping the two operands of comparison; set SWAP[i]
912 to 2 if stmt I is isormorphic to the first stmt by inverting the code
913 of comparison. Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
914 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
917 vect_build_slp_tree_1 (vec_info
*vinfo
, unsigned char *swap
,
918 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
919 poly_uint64
*max_nunits
, bool *matches
,
920 bool *two_operators
, tree
*node_vectype
)
923 stmt_vec_info first_stmt_info
= stmts
[0];
924 enum tree_code first_stmt_code
= ERROR_MARK
;
925 enum tree_code alt_stmt_code
= ERROR_MARK
;
926 enum tree_code rhs_code
= ERROR_MARK
;
927 enum tree_code first_cond_code
= ERROR_MARK
;
929 bool need_same_oprnds
= false;
930 tree vectype
= NULL_TREE
, first_op1
= NULL_TREE
;
933 machine_mode optab_op2_mode
;
934 machine_mode vec_mode
;
935 stmt_vec_info first_load
= NULL
, prev_first_load
= NULL
;
936 bool first_stmt_load_p
= false, load_p
= false;
937 bool first_stmt_phi_p
= false, phi_p
= false;
938 bool maybe_soft_fail
= false;
939 tree soft_fail_nunits_vectype
= NULL_TREE
;
941 /* For every stmt in NODE find its def stmt/s. */
942 stmt_vec_info stmt_info
;
943 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
945 gimple
*stmt
= stmt_info
->stmt
;
949 if (dump_enabled_p ())
950 dump_printf_loc (MSG_NOTE
, vect_location
, "Build SLP for %G", stmt
);
952 /* Fail to vectorize statements marked as unvectorizable, throw
954 if (!STMT_VINFO_VECTORIZABLE (stmt_info
)
955 || stmt_can_throw_internal (cfun
, stmt
)
956 || gimple_has_volatile_ops (stmt
))
958 if (dump_enabled_p ())
959 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
960 "Build SLP failed: unvectorizable statement %G",
962 /* ??? For BB vectorization we want to commutate operands in a way
963 to shuffle all unvectorizable defs into one operand and have
964 the other still vectorized. The following doesn't reliably
965 work for this though but it's the easiest we can do here. */
966 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
968 /* Fatal mismatch. */
973 lhs
= gimple_get_lhs (stmt
);
974 if (lhs
== NULL_TREE
)
976 if (dump_enabled_p ())
977 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
978 "Build SLP failed: not GIMPLE_ASSIGN nor "
979 "GIMPLE_CALL %G", stmt
);
980 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
982 /* Fatal mismatch. */
988 if (!vect_get_vector_types_for_stmt (vinfo
, stmt_info
, &vectype
,
989 &nunits_vectype
, group_size
))
991 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
993 /* Fatal mismatch. */
997 /* Record nunits required but continue analysis, producing matches[]
998 as if nunits was not an issue. This allows splitting of groups
1001 && !vect_record_max_nunits (vinfo
, stmt_info
, group_size
,
1002 nunits_vectype
, max_nunits
))
1004 gcc_assert (is_a
<bb_vec_info
> (vinfo
));
1005 maybe_soft_fail
= true;
1006 soft_fail_nunits_vectype
= nunits_vectype
;
1009 gcc_assert (vectype
);
1011 gcall
*call_stmt
= dyn_cast
<gcall
*> (stmt
);
1014 rhs_code
= CALL_EXPR
;
1016 if (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
))
1018 else if ((gimple_call_internal_p (call_stmt
)
1019 && (!vectorizable_internal_fn_p
1020 (gimple_call_internal_fn (call_stmt
))))
1021 || gimple_call_tail_p (call_stmt
)
1022 || gimple_call_noreturn_p (call_stmt
)
1023 || !gimple_call_nothrow_p (call_stmt
)
1024 || gimple_call_chain (call_stmt
))
1026 if (dump_enabled_p ())
1027 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1028 "Build SLP failed: unsupported call type %G",
1030 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1032 /* Fatal mismatch. */
1037 else if (gimple_code (stmt
) == GIMPLE_PHI
)
1039 rhs_code
= ERROR_MARK
;
1044 rhs_code
= gimple_assign_rhs_code (stmt
);
1045 load_p
= gimple_vuse (stmt
);
1048 /* Check the operation. */
1051 *node_vectype
= vectype
;
1052 first_stmt_code
= rhs_code
;
1053 first_stmt_load_p
= load_p
;
1054 first_stmt_phi_p
= phi_p
;
1056 /* Shift arguments should be equal in all the packed stmts for a
1057 vector shift with scalar shift operand. */
1058 if (rhs_code
== LSHIFT_EXPR
|| rhs_code
== RSHIFT_EXPR
1059 || rhs_code
== LROTATE_EXPR
1060 || rhs_code
== RROTATE_EXPR
)
1062 vec_mode
= TYPE_MODE (vectype
);
1064 /* First see if we have a vector/vector shift. */
1065 optab
= optab_for_tree_code (rhs_code
, vectype
,
1069 || optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
1071 /* No vector/vector shift, try for a vector/scalar shift. */
1072 optab
= optab_for_tree_code (rhs_code
, vectype
,
1077 if (dump_enabled_p ())
1078 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1079 "Build SLP failed: no optab.\n");
1080 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1082 /* Fatal mismatch. */
1086 icode
= (int) optab_handler (optab
, vec_mode
);
1087 if (icode
== CODE_FOR_nothing
)
1089 if (dump_enabled_p ())
1090 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1091 "Build SLP failed: "
1092 "op not supported by target.\n");
1093 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1095 /* Fatal mismatch. */
1099 optab_op2_mode
= insn_data
[icode
].operand
[2].mode
;
1100 if (!VECTOR_MODE_P (optab_op2_mode
))
1102 need_same_oprnds
= true;
1103 first_op1
= gimple_assign_rhs2 (stmt
);
1107 else if (rhs_code
== WIDEN_LSHIFT_EXPR
)
1109 need_same_oprnds
= true;
1110 first_op1
= gimple_assign_rhs2 (stmt
);
1113 && rhs_code
== BIT_FIELD_REF
)
1115 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt
), 0);
1116 if (!is_a
<bb_vec_info
> (vinfo
)
1117 || TREE_CODE (vec
) != SSA_NAME
1118 || !operand_equal_p (TYPE_SIZE (vectype
),
1119 TYPE_SIZE (TREE_TYPE (vec
))))
1121 if (dump_enabled_p ())
1122 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1123 "Build SLP failed: "
1124 "BIT_FIELD_REF not supported\n");
1125 /* Fatal mismatch. */
1131 && gimple_call_internal_p (call_stmt
, IFN_DIV_POW2
))
1133 need_same_oprnds
= true;
1134 first_op1
= gimple_call_arg (call_stmt
, 1);
1139 if (first_stmt_code
!= rhs_code
1140 && alt_stmt_code
== ERROR_MARK
)
1141 alt_stmt_code
= rhs_code
;
1142 if ((first_stmt_code
!= rhs_code
1143 && (first_stmt_code
!= IMAGPART_EXPR
1144 || rhs_code
!= REALPART_EXPR
)
1145 && (first_stmt_code
!= REALPART_EXPR
1146 || rhs_code
!= IMAGPART_EXPR
)
1147 /* Handle mismatches in plus/minus by computing both
1148 and merging the results. */
1149 && !((first_stmt_code
== PLUS_EXPR
1150 || first_stmt_code
== MINUS_EXPR
)
1151 && (alt_stmt_code
== PLUS_EXPR
1152 || alt_stmt_code
== MINUS_EXPR
)
1153 && rhs_code
== alt_stmt_code
)
1154 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1155 && (first_stmt_code
== ARRAY_REF
1156 || first_stmt_code
== BIT_FIELD_REF
1157 || first_stmt_code
== INDIRECT_REF
1158 || first_stmt_code
== COMPONENT_REF
1159 || first_stmt_code
== MEM_REF
)))
1160 || first_stmt_load_p
!= load_p
1161 || first_stmt_phi_p
!= phi_p
)
1163 if (dump_enabled_p ())
1165 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1166 "Build SLP failed: different operation "
1167 "in stmt %G", stmt
);
1168 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1169 "original stmt %G", first_stmt_info
->stmt
);
1175 if (need_same_oprnds
)
1177 tree other_op1
= (call_stmt
1178 ? gimple_call_arg (call_stmt
, 1)
1179 : gimple_assign_rhs2 (stmt
));
1180 if (!operand_equal_p (first_op1
, other_op1
, 0))
1182 if (dump_enabled_p ())
1183 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1184 "Build SLP failed: different shift "
1185 "arguments in %G", stmt
);
1191 && first_stmt_code
== BIT_FIELD_REF
1192 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info
->stmt
), 0)
1193 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0)))
1195 if (dump_enabled_p ())
1196 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1197 "Build SLP failed: different BIT_FIELD_REF "
1198 "arguments in %G", stmt
);
1203 if (!load_p
&& rhs_code
== CALL_EXPR
)
1205 if (!compatible_calls_p (as_a
<gcall
*> (stmts
[0]->stmt
),
1206 as_a
<gcall
*> (stmt
)))
1208 if (dump_enabled_p ())
1209 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1210 "Build SLP failed: different calls in %G",
1218 && (gimple_bb (first_stmt_info
->stmt
)
1219 != gimple_bb (stmt_info
->stmt
)))
1221 if (dump_enabled_p ())
1222 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1223 "Build SLP failed: different BB for PHI "
1229 if (!types_compatible_p (vectype
, *node_vectype
))
1231 if (dump_enabled_p ())
1232 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1233 "Build SLP failed: different vector type "
1240 /* Grouped store or load. */
1241 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1243 if (REFERENCE_CLASS_P (lhs
))
1251 first_load
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
1252 if (prev_first_load
)
1254 /* Check that there are no loads from different interleaving
1255 chains in the same node. */
1256 if (prev_first_load
!= first_load
)
1258 if (dump_enabled_p ())
1259 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
1261 "Build SLP failed: different "
1262 "interleaving chains in one node %G",
1269 prev_first_load
= first_load
;
1271 } /* Grouped access. */
1276 /* Not grouped load. */
1277 if (dump_enabled_p ())
1278 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1279 "Build SLP failed: not grouped load %G", stmt
);
1281 /* FORNOW: Not grouped loads are not supported. */
1282 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1284 /* Fatal mismatch. */
1289 /* Not memory operation. */
1291 && TREE_CODE_CLASS (rhs_code
) != tcc_binary
1292 && TREE_CODE_CLASS (rhs_code
) != tcc_unary
1293 && TREE_CODE_CLASS (rhs_code
) != tcc_expression
1294 && TREE_CODE_CLASS (rhs_code
) != tcc_comparison
1295 && rhs_code
!= VIEW_CONVERT_EXPR
1296 && rhs_code
!= CALL_EXPR
1297 && rhs_code
!= BIT_FIELD_REF
)
1299 if (dump_enabled_p ())
1300 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1301 "Build SLP failed: operation unsupported %G",
1303 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1305 /* Fatal mismatch. */
1310 if (rhs_code
== COND_EXPR
)
1312 tree cond_expr
= gimple_assign_rhs1 (stmt
);
1313 enum tree_code cond_code
= TREE_CODE (cond_expr
);
1314 enum tree_code swap_code
= ERROR_MARK
;
1315 enum tree_code invert_code
= ERROR_MARK
;
1318 first_cond_code
= TREE_CODE (cond_expr
);
1319 else if (TREE_CODE_CLASS (cond_code
) == tcc_comparison
)
1321 bool honor_nans
= HONOR_NANS (TREE_OPERAND (cond_expr
, 0));
1322 swap_code
= swap_tree_comparison (cond_code
);
1323 invert_code
= invert_tree_comparison (cond_code
, honor_nans
);
1326 if (first_cond_code
== cond_code
)
1328 /* Isomorphic can be achieved by swapping. */
1329 else if (first_cond_code
== swap_code
)
1331 /* Isomorphic can be achieved by inverting. */
1332 else if (first_cond_code
== invert_code
)
1336 if (dump_enabled_p ())
1337 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1338 "Build SLP failed: different"
1339 " operation %G", stmt
);
1349 for (i
= 0; i
< group_size
; ++i
)
1353 /* If we allowed a two-operation SLP node verify the target can cope
1354 with the permute we are going to use. */
1355 if (alt_stmt_code
!= ERROR_MARK
1356 && TREE_CODE_CLASS (alt_stmt_code
) != tcc_reference
)
1358 *two_operators
= true;
1361 if (maybe_soft_fail
)
1363 unsigned HOST_WIDE_INT const_nunits
;
1364 if (!TYPE_VECTOR_SUBPARTS
1365 (soft_fail_nunits_vectype
).is_constant (&const_nunits
)
1366 || const_nunits
> group_size
)
1370 /* With constant vector elements simulate a mismatch at the
1371 point we need to split. */
1372 unsigned tail
= group_size
& (const_nunits
- 1);
1373 memset (&matches
[group_size
- tail
], 0, sizeof (bool) * tail
);
1381 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1382 Note we never remove apart from at destruction time so we do not
1383 need a special value for deleted that differs from empty. */
1386 typedef vec
<stmt_vec_info
> value_type
;
1387 typedef vec
<stmt_vec_info
> compare_type
;
1388 static inline hashval_t
hash (value_type
);
1389 static inline bool equal (value_type existing
, value_type candidate
);
1390 static inline bool is_empty (value_type x
) { return !x
.exists (); }
1391 static inline bool is_deleted (value_type x
) { return !x
.exists (); }
1392 static const bool empty_zero_p
= true;
1393 static inline void mark_empty (value_type
&x
) { x
.release (); }
1394 static inline void mark_deleted (value_type
&x
) { x
.release (); }
1395 static inline void remove (value_type
&x
) { x
.release (); }
1398 bst_traits::hash (value_type x
)
1401 for (unsigned i
= 0; i
< x
.length (); ++i
)
1402 h
.add_int (gimple_uid (x
[i
]->stmt
));
1406 bst_traits::equal (value_type existing
, value_type candidate
)
1408 if (existing
.length () != candidate
.length ())
1410 for (unsigned i
= 0; i
< existing
.length (); ++i
)
1411 if (existing
[i
] != candidate
[i
])
1416 typedef hash_map
<vec
<stmt_vec_info
>, slp_tree
,
1417 simple_hashmap_traits
<bst_traits
, slp_tree
> >
1418 scalar_stmts_to_slp_tree_map_t
;
1421 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1422 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1423 poly_uint64
*max_nunits
,
1424 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1425 scalar_stmts_to_slp_tree_map_t
*bst_map
);
1428 vect_build_slp_tree (vec_info
*vinfo
,
1429 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1430 poly_uint64
*max_nunits
,
1431 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1432 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1434 if (slp_tree
*leader
= bst_map
->get (stmts
))
1436 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_NOTE
, vect_location
, "re-using %sSLP tree %p\n",
1438 *leader
? "" : "failed ", *leader
);
1441 SLP_TREE_REF_COUNT (*leader
)++;
1442 vect_update_max_nunits (max_nunits
, (*leader
)->max_nunits
);
1448 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1449 so we can pick up backedge destinations during discovery. */
1450 slp_tree res
= new _slp_tree
;
1451 SLP_TREE_DEF_TYPE (res
) = vect_internal_def
;
1452 SLP_TREE_SCALAR_STMTS (res
) = stmts
;
1453 bst_map
->put (stmts
.copy (), res
);
1457 if (dump_enabled_p ())
1458 dump_printf_loc (MSG_NOTE
, vect_location
,
1459 "SLP discovery limit exceeded\n");
1460 bool existed_p
= bst_map
->put (stmts
, NULL
);
1461 gcc_assert (existed_p
);
1462 /* Mark the node invalid so we can detect those when still in use
1463 as backedge destinations. */
1464 SLP_TREE_SCALAR_STMTS (res
) = vNULL
;
1465 SLP_TREE_DEF_TYPE (res
) = vect_uninitialized_def
;
1466 vect_free_slp_tree (res
);
1467 memset (matches
, 0, sizeof (bool) * group_size
);
1472 poly_uint64 this_max_nunits
= 1;
1473 slp_tree res_
= vect_build_slp_tree_2 (vinfo
, res
, stmts
, group_size
,
1475 matches
, limit
, tree_size
, bst_map
);
1478 bool existed_p
= bst_map
->put (stmts
, NULL
);
1479 gcc_assert (existed_p
);
1480 /* Mark the node invalid so we can detect those when still in use
1481 as backedge destinations. */
1482 SLP_TREE_SCALAR_STMTS (res
) = vNULL
;
1483 SLP_TREE_DEF_TYPE (res
) = vect_uninitialized_def
;
1484 vect_free_slp_tree (res
);
1488 gcc_assert (res_
== res
);
1489 res
->max_nunits
= this_max_nunits
;
1490 vect_update_max_nunits (max_nunits
, this_max_nunits
);
1491 /* Keep a reference for the bst_map use. */
1492 SLP_TREE_REF_COUNT (res
)++;
1497 /* Recursively build an SLP tree starting from NODE.
1498 Fail (and return a value not equal to zero) if def-stmts are not
1499 isomorphic, require data permutation or are of unsupported types of
1500 operation. Otherwise, return 0.
1501 The value returned is the depth in the SLP tree where a mismatch
1505 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1506 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1507 poly_uint64
*max_nunits
,
1508 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1509 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1511 unsigned nops
, i
, this_tree_size
= 0;
1512 poly_uint64 this_max_nunits
= *max_nunits
;
1516 stmt_vec_info stmt_info
= stmts
[0];
1517 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1518 nops
= gimple_call_num_args (stmt
);
1519 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
1521 nops
= gimple_num_ops (stmt
) - 1;
1522 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
1525 else if (gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1526 nops
= gimple_phi_num_args (phi
);
1530 /* If the SLP node is a PHI (induction or reduction), terminate
1532 bool *skip_args
= XALLOCAVEC (bool, nops
);
1533 memset (skip_args
, 0, sizeof (bool) * nops
);
1534 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
1535 if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1537 tree scalar_type
= TREE_TYPE (PHI_RESULT (stmt
));
1538 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
1540 if (!vect_record_max_nunits (vinfo
, stmt_info
, group_size
, vectype
,
1544 vect_def_type def_type
= STMT_VINFO_DEF_TYPE (stmt_info
);
1545 if (def_type
== vect_induction_def
)
1547 /* Induction PHIs are not cycles but walk the initial
1548 value. Only for inner loops through, for outer loops
1549 we need to pick up the value from the actual PHIs
1550 to more easily support peeling and epilogue vectorization. */
1551 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1552 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1553 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1556 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1558 else if (def_type
== vect_reduction_def
1559 || def_type
== vect_double_reduction_def
1560 || def_type
== vect_nested_cycle
)
1562 /* Else def types have to match. */
1563 stmt_vec_info other_info
;
1564 bool all_same
= true;
1565 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1567 if (STMT_VINFO_DEF_TYPE (other_info
) != def_type
)
1569 if (other_info
!= stmt_info
)
1572 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1573 /* Reduction initial values are not explicitely represented. */
1574 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1575 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1576 /* Reduction chain backedge defs are filled manually.
1577 ??? Need a better way to identify a SLP reduction chain PHI.
1578 Or a better overall way to SLP match those. */
1579 if (all_same
&& def_type
== vect_reduction_def
)
1580 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1582 else if (def_type
!= vect_internal_def
)
1587 bool two_operators
= false;
1588 unsigned char *swap
= XALLOCAVEC (unsigned char, group_size
);
1589 tree vectype
= NULL_TREE
;
1590 if (!vect_build_slp_tree_1 (vinfo
, swap
, stmts
, group_size
,
1591 &this_max_nunits
, matches
, &two_operators
,
1595 /* If the SLP node is a load, terminate the recursion unless masked. */
1596 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1597 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1599 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1602 gcc_assert (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
));
1607 *max_nunits
= this_max_nunits
;
1609 node
= vect_create_new_slp_node (node
, stmts
, 0);
1610 SLP_TREE_VECTYPE (node
) = vectype
;
1611 /* And compute the load permutation. Whether it is actually
1612 a permutation depends on the unrolling factor which is
1614 vec
<unsigned> load_permutation
;
1616 stmt_vec_info load_info
;
1617 load_permutation
.create (group_size
);
1618 stmt_vec_info first_stmt_info
1619 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
1620 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
1622 int load_place
= vect_get_place_in_interleaving_chain
1623 (load_info
, first_stmt_info
);
1624 gcc_assert (load_place
!= -1);
1625 load_permutation
.safe_push (load_place
);
1627 SLP_TREE_LOAD_PERMUTATION (node
) = load_permutation
;
1631 else if (gimple_assign_single_p (stmt_info
->stmt
)
1632 && !gimple_vuse (stmt_info
->stmt
)
1633 && gimple_assign_rhs_code (stmt_info
->stmt
) == BIT_FIELD_REF
)
1635 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1636 the same SSA name vector of a compatible type to vectype. */
1637 vec
<std::pair
<unsigned, unsigned> > lperm
= vNULL
;
1638 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0);
1639 stmt_vec_info estmt_info
;
1640 FOR_EACH_VEC_ELT (stmts
, i
, estmt_info
)
1642 gassign
*estmt
= as_a
<gassign
*> (estmt_info
->stmt
);
1643 tree bfref
= gimple_assign_rhs1 (estmt
);
1645 if (!known_eq (bit_field_size (bfref
),
1646 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype
))))
1647 || !constant_multiple_p (bit_field_offset (bfref
),
1648 bit_field_size (bfref
), &lane
))
1653 lperm
.safe_push (std::make_pair (0, (unsigned)lane
));
1655 slp_tree vnode
= vect_create_new_slp_node (vNULL
);
1656 /* ??? We record vectype here but we hide eventually necessary
1657 punning and instead rely on code generation to materialize
1658 VIEW_CONVERT_EXPRs as necessary. We instead should make
1659 this explicit somehow. */
1660 SLP_TREE_VECTYPE (vnode
) = vectype
;
1661 SLP_TREE_VEC_DEFS (vnode
).safe_push (vec
);
1662 /* We are always building a permutation node even if it is an identity
1663 permute to shield the rest of the vectorizer from the odd node
1664 representing an actual vector without any scalar ops.
1665 ??? We could hide it completely with making the permute node
1667 node
= vect_create_new_slp_node (node
, stmts
, 1);
1668 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1669 SLP_TREE_LANE_PERMUTATION (node
) = lperm
;
1670 SLP_TREE_VECTYPE (node
) = vectype
;
1671 SLP_TREE_CHILDREN (node
).quick_push (vnode
);
1675 /* Get at the operands, verifying they are compatible. */
1676 vec
<slp_oprnd_info
> oprnds_info
= vect_create_oprnd_info (nops
, group_size
);
1677 slp_oprnd_info oprnd_info
;
1678 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
1680 int res
= vect_get_and_check_slp_defs (vinfo
, swap
[i
], skip_args
,
1681 stmts
, i
, &oprnds_info
);
1683 matches
[(res
== -1) ? 0 : i
] = false;
1687 for (i
= 0; i
< group_size
; ++i
)
1690 vect_free_oprnd_info (oprnds_info
);
1695 auto_vec
<slp_tree
, 4> children
;
1697 stmt_info
= stmts
[0];
1699 /* Create SLP_TREE nodes for the definition node/s. */
1700 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
1705 /* We're skipping certain operands from processing, for example
1706 outer loop reduction initial defs. */
1709 children
.safe_push (NULL
);
1713 if (oprnd_info
->first_dt
== vect_uninitialized_def
)
1715 /* COND_EXPR have one too many eventually if the condition
1717 gcc_assert (i
== 3 && nops
== 4);
1721 if (is_a
<bb_vec_info
> (vinfo
)
1722 && oprnd_info
->first_dt
== vect_internal_def
1723 && !oprnd_info
->any_pattern
)
1725 /* For BB vectorization, if all defs are the same do not
1726 bother to continue the build along the single-lane
1727 graph but use a splat of the scalar value. */
1728 stmt_vec_info first_def
= oprnd_info
->def_stmts
[0];
1729 for (j
= 1; j
< group_size
; ++j
)
1730 if (oprnd_info
->def_stmts
[j
] != first_def
)
1733 /* But avoid doing this for loads where we may be
1734 able to CSE things, unless the stmt is not
1736 && (!STMT_VINFO_VECTORIZABLE (first_def
)
1737 || !gimple_vuse (first_def
->stmt
)))
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_NOTE
, vect_location
,
1741 "Using a splat of the uniform operand\n");
1742 oprnd_info
->first_dt
= vect_external_def
;
1746 if (oprnd_info
->first_dt
== vect_external_def
1747 || oprnd_info
->first_dt
== vect_constant_def
)
1749 slp_tree invnode
= vect_create_new_slp_node (oprnd_info
->ops
);
1750 SLP_TREE_DEF_TYPE (invnode
) = oprnd_info
->first_dt
;
1751 oprnd_info
->ops
= vNULL
;
1752 children
.safe_push (invnode
);
1756 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1757 group_size
, &this_max_nunits
,
1759 &this_tree_size
, bst_map
)) != NULL
)
1761 oprnd_info
->def_stmts
= vNULL
;
1762 children
.safe_push (child
);
1766 /* If the SLP build for operand zero failed and operand zero
1767 and one can be commutated try that for the scalar stmts
1768 that failed the match. */
1770 /* A first scalar stmt mismatch signals a fatal mismatch. */
1772 /* ??? For COND_EXPRs we can swap the comparison operands
1773 as well as the arms under some constraints. */
1775 && oprnds_info
[1]->first_dt
== vect_internal_def
1776 && is_gimple_assign (stmt_info
->stmt
)
1777 /* Swapping operands for reductions breaks assumptions later on. */
1778 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
1779 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
1781 /* See whether we can swap the matching or the non-matching
1783 bool swap_not_matching
= true;
1786 for (j
= 0; j
< group_size
; ++j
)
1788 if (matches
[j
] != !swap_not_matching
)
1790 stmt_vec_info stmt_info
= stmts
[j
];
1791 /* Verify if we can swap operands of this stmt. */
1792 gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
1794 || !commutative_tree_code (gimple_assign_rhs_code (stmt
)))
1796 if (!swap_not_matching
)
1798 swap_not_matching
= false;
1803 while (j
!= group_size
);
1805 /* Swap mismatched definition stmts. */
1806 if (dump_enabled_p ())
1807 dump_printf_loc (MSG_NOTE
, vect_location
,
1808 "Re-trying with swapped operands of stmts ");
1809 for (j
= 0; j
< group_size
; ++j
)
1810 if (matches
[j
] == !swap_not_matching
)
1812 std::swap (oprnds_info
[0]->def_stmts
[j
],
1813 oprnds_info
[1]->def_stmts
[j
]);
1814 std::swap (oprnds_info
[0]->ops
[j
],
1815 oprnds_info
[1]->ops
[j
]);
1816 if (dump_enabled_p ())
1817 dump_printf (MSG_NOTE
, "%d ", j
);
1819 if (dump_enabled_p ())
1820 dump_printf (MSG_NOTE
, "\n");
1821 /* And try again with scratch 'matches' ... */
1822 bool *tem
= XALLOCAVEC (bool, group_size
);
1823 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1824 group_size
, &this_max_nunits
,
1826 &this_tree_size
, bst_map
)) != NULL
)
1828 oprnd_info
->def_stmts
= vNULL
;
1829 children
.safe_push (child
);
1835 /* If the SLP build failed and we analyze a basic-block
1836 simply treat nodes we fail to build as externally defined
1837 (and thus build vectors from the scalar defs).
1838 The cost model will reject outright expensive cases.
1839 ??? This doesn't treat cases where permutation ultimatively
1840 fails (or we don't try permutation below). Ideally we'd
1841 even compute a permutation that will end up with the maximum
1843 if (is_a
<bb_vec_info
> (vinfo
)
1844 /* ??? Rejecting patterns this way doesn't work. We'd have to
1845 do extra work to cancel the pattern so the uses see the
1847 && !is_pattern_stmt_p (stmt_info
)
1848 && !oprnd_info
->any_pattern
)
1850 /* But if there's a leading vector sized set of matching stmts
1851 fail here so we can split the group. This matches the condition
1852 vect_analyze_slp_instance uses. */
1853 /* ??? We might want to split here and combine the results to support
1854 multiple vector sizes better. */
1855 for (j
= 0; j
< group_size
; ++j
)
1858 if (!known_ge (j
, TYPE_VECTOR_SUBPARTS (vectype
)))
1860 if (dump_enabled_p ())
1861 dump_printf_loc (MSG_NOTE
, vect_location
,
1862 "Building vector operands from scalars\n");
1864 child
= vect_create_new_slp_node (oprnd_info
->ops
);
1865 children
.safe_push (child
);
1866 oprnd_info
->ops
= vNULL
;
1871 gcc_assert (child
== NULL
);
1872 FOR_EACH_VEC_ELT (children
, j
, child
)
1874 vect_free_slp_tree (child
);
1875 vect_free_oprnd_info (oprnds_info
);
1879 vect_free_oprnd_info (oprnds_info
);
1881 /* If we have all children of a child built up from uniform scalars
1882 or does more than one possibly expensive vector construction then
1883 just throw that away, causing it built up from scalars.
1884 The exception is the SLP node for the vector store. */
1885 if (is_a
<bb_vec_info
> (vinfo
)
1886 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1887 /* ??? Rejecting patterns this way doesn't work. We'd have to
1888 do extra work to cancel the pattern so the uses see the
1890 && !is_pattern_stmt_p (stmt_info
))
1894 bool all_uniform_p
= true;
1895 unsigned n_vector_builds
= 0;
1896 FOR_EACH_VEC_ELT (children
, j
, child
)
1900 else if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
1901 all_uniform_p
= false;
1902 else if (!vect_slp_tree_uniform_p (child
))
1904 all_uniform_p
= false;
1905 if (SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
1910 || n_vector_builds
> 1
1911 || (n_vector_builds
== children
.length ()
1912 && is_a
<gphi
*> (stmt_info
->stmt
)))
1916 FOR_EACH_VEC_ELT (children
, j
, child
)
1918 vect_free_slp_tree (child
);
1920 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_NOTE
, vect_location
,
1922 "Building parent vector operands from "
1923 "scalars instead\n");
1928 *tree_size
+= this_tree_size
+ 1;
1929 *max_nunits
= this_max_nunits
;
1933 /* ??? We'd likely want to either cache in bst_map sth like
1934 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1935 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1936 explicit stmts to put in so the keying on 'stmts' doesn't
1937 work (but we have the same issue with nodes that use 'ops'). */
1938 slp_tree one
= new _slp_tree
;
1939 slp_tree two
= new _slp_tree
;
1940 SLP_TREE_DEF_TYPE (one
) = vect_internal_def
;
1941 SLP_TREE_DEF_TYPE (two
) = vect_internal_def
;
1942 SLP_TREE_VECTYPE (one
) = vectype
;
1943 SLP_TREE_VECTYPE (two
) = vectype
;
1944 SLP_TREE_CHILDREN (one
).safe_splice (children
);
1945 SLP_TREE_CHILDREN (two
).safe_splice (children
);
1947 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two
), i
, child
)
1948 SLP_TREE_REF_COUNT (child
)++;
1950 /* Here we record the original defs since this
1951 node represents the final lane configuration. */
1952 node
= vect_create_new_slp_node (node
, stmts
, 2);
1953 SLP_TREE_VECTYPE (node
) = vectype
;
1954 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1955 SLP_TREE_CHILDREN (node
).quick_push (one
);
1956 SLP_TREE_CHILDREN (node
).quick_push (two
);
1957 gassign
*stmt
= as_a
<gassign
*> (stmts
[0]->stmt
);
1958 enum tree_code code0
= gimple_assign_rhs_code (stmt
);
1959 enum tree_code ocode
= ERROR_MARK
;
1960 stmt_vec_info ostmt_info
;
1962 FOR_EACH_VEC_ELT (stmts
, i
, ostmt_info
)
1964 gassign
*ostmt
= as_a
<gassign
*> (ostmt_info
->stmt
);
1965 if (gimple_assign_rhs_code (ostmt
) != code0
)
1967 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (1, i
));
1968 ocode
= gimple_assign_rhs_code (ostmt
);
1972 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (0, i
));
1974 SLP_TREE_CODE (one
) = code0
;
1975 SLP_TREE_CODE (two
) = ocode
;
1976 SLP_TREE_LANES (one
) = stmts
.length ();
1977 SLP_TREE_LANES (two
) = stmts
.length ();
1978 SLP_TREE_REPRESENTATIVE (one
) = stmts
[0];
1979 SLP_TREE_REPRESENTATIVE (two
) = stmts
[j
];
1983 node
= vect_create_new_slp_node (node
, stmts
, nops
);
1984 SLP_TREE_VECTYPE (node
) = vectype
;
1985 SLP_TREE_CHILDREN (node
).splice (children
);
1989 /* Dump a single SLP tree NODE. */
1992 vect_print_slp_tree (dump_flags_t dump_kind
, dump_location_t loc
,
1997 stmt_vec_info stmt_info
;
2000 dump_metadata_t
metadata (dump_kind
, loc
.get_impl_location ());
2001 dump_user_location_t user_loc
= loc
.get_user_location ();
2002 dump_printf_loc (metadata
, user_loc
, "node%s %p (max_nunits=%u, refcnt=%u)\n",
2003 SLP_TREE_DEF_TYPE (node
) == vect_external_def
2005 : (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
2008 estimated_poly_value (node
->max_nunits
),
2009 SLP_TREE_REF_COUNT (node
));
2010 if (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
)
2012 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
2013 dump_printf_loc (metadata
, user_loc
, "op: VEC_PERM_EXPR\n");
2015 dump_printf_loc (metadata
, user_loc
, "op template: %G",
2016 SLP_TREE_REPRESENTATIVE (node
)->stmt
);
2018 if (SLP_TREE_SCALAR_STMTS (node
).exists ())
2019 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2020 dump_printf_loc (metadata
, user_loc
, "\tstmt %u %G", i
, stmt_info
->stmt
);
2023 dump_printf_loc (metadata
, user_loc
, "\t{ ");
2024 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
2025 dump_printf (metadata
, "%T%s ", op
,
2026 i
< SLP_TREE_SCALAR_OPS (node
).length () - 1 ? "," : "");
2027 dump_printf (metadata
, "}\n");
2029 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2031 dump_printf_loc (metadata
, user_loc
, "\tload permutation {");
2032 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node
), i
, j
)
2033 dump_printf (dump_kind
, " %u", j
);
2034 dump_printf (dump_kind
, " }\n");
2036 if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
2038 dump_printf_loc (metadata
, user_loc
, "\tlane permutation {");
2039 for (i
= 0; i
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++i
)
2040 dump_printf (dump_kind
, " %u[%u]",
2041 SLP_TREE_LANE_PERMUTATION (node
)[i
].first
,
2042 SLP_TREE_LANE_PERMUTATION (node
)[i
].second
);
2043 dump_printf (dump_kind
, " }\n");
2045 if (SLP_TREE_CHILDREN (node
).is_empty ())
2047 dump_printf_loc (metadata
, user_loc
, "\tchildren");
2048 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2049 dump_printf (dump_kind
, " %p", (void *)child
);
2050 dump_printf (dump_kind
, "\n");
2054 debug (slp_tree node
)
2056 debug_dump_context ctx
;
2057 vect_print_slp_tree (MSG_NOTE
,
2058 dump_location_t::from_location_t (UNKNOWN_LOCATION
),
2062 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2065 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
2066 slp_tree node
, hash_set
<slp_tree
> &visited
)
2071 if (visited
.add (node
))
2074 vect_print_slp_tree (dump_kind
, loc
, node
);
2076 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2078 vect_print_slp_graph (dump_kind
, loc
, child
, visited
);
2082 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
2085 hash_set
<slp_tree
> visited
;
2086 vect_print_slp_graph (dump_kind
, loc
, entry
, visited
);
2089 /* Mark the tree rooted at NODE with PURE_SLP. */
2092 vect_mark_slp_stmts (slp_tree node
, hash_set
<slp_tree
> &visited
)
2095 stmt_vec_info stmt_info
;
2098 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2101 if (visited
.add (node
))
2104 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2105 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
2107 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2109 vect_mark_slp_stmts (child
, visited
);
2113 vect_mark_slp_stmts (slp_tree node
)
2115 hash_set
<slp_tree
> visited
;
2116 vect_mark_slp_stmts (node
, visited
);
2119 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2122 vect_mark_slp_stmts_relevant (slp_tree node
, hash_set
<slp_tree
> &visited
)
2125 stmt_vec_info stmt_info
;
2128 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2131 if (visited
.add (node
))
2134 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2136 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info
)
2137 || STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
);
2138 STMT_VINFO_RELEVANT (stmt_info
) = vect_used_in_scope
;
2141 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2143 vect_mark_slp_stmts_relevant (child
, visited
);
2147 vect_mark_slp_stmts_relevant (slp_tree node
)
2149 hash_set
<slp_tree
> visited
;
2150 vect_mark_slp_stmts_relevant (node
, visited
);
2154 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2157 vect_gather_slp_loads (vec
<slp_tree
> &loads
, slp_tree node
,
2158 hash_set
<slp_tree
> &visited
)
2160 if (!node
|| visited
.add (node
))
2163 if (SLP_TREE_CHILDREN (node
).length () == 0)
2165 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2167 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2168 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2169 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
2170 loads
.safe_push (node
);
2176 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2177 vect_gather_slp_loads (loads
, child
, visited
);
2182 /* Find the last store in SLP INSTANCE. */
2185 vect_find_last_scalar_stmt_in_slp (slp_tree node
)
2187 stmt_vec_info last
= NULL
;
2188 stmt_vec_info stmt_vinfo
;
2190 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2192 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2193 last
= last
? get_later_stmt (stmt_vinfo
, last
) : stmt_vinfo
;
2199 /* Find the first stmt in NODE. */
2202 vect_find_first_scalar_stmt_in_slp (slp_tree node
)
2204 stmt_vec_info first
= NULL
;
2205 stmt_vec_info stmt_vinfo
;
2207 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2209 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2211 || get_later_stmt (stmt_vinfo
, first
) == first
)
2218 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2219 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2220 (also containing the first GROUP1_SIZE stmts, since stores are
2221 consecutive), the second containing the remainder.
2222 Return the first stmt in the second group. */
2224 static stmt_vec_info
2225 vect_split_slp_store_group (stmt_vec_info first_vinfo
, unsigned group1_size
)
2227 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo
) == first_vinfo
);
2228 gcc_assert (group1_size
> 0);
2229 int group2_size
= DR_GROUP_SIZE (first_vinfo
) - group1_size
;
2230 gcc_assert (group2_size
> 0);
2231 DR_GROUP_SIZE (first_vinfo
) = group1_size
;
2233 stmt_vec_info stmt_info
= first_vinfo
;
2234 for (unsigned i
= group1_size
; i
> 1; i
--)
2236 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2237 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2239 /* STMT is now the last element of the first group. */
2240 stmt_vec_info group2
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2241 DR_GROUP_NEXT_ELEMENT (stmt_info
) = 0;
2243 DR_GROUP_SIZE (group2
) = group2_size
;
2244 for (stmt_info
= group2
; stmt_info
;
2245 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
))
2247 DR_GROUP_FIRST_ELEMENT (stmt_info
) = group2
;
2248 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2251 /* For the second group, the DR_GROUP_GAP is that before the original group,
2252 plus skipping over the first vector. */
2253 DR_GROUP_GAP (group2
) = DR_GROUP_GAP (first_vinfo
) + group1_size
;
2255 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2256 DR_GROUP_GAP (first_vinfo
) += group2_size
;
2258 if (dump_enabled_p ())
2259 dump_printf_loc (MSG_NOTE
, vect_location
, "Split group into %d and %d\n",
2260 group1_size
, group2_size
);
2265 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2266 statements and a vector of NUNITS elements. */
2269 calculate_unrolling_factor (poly_uint64 nunits
, unsigned int group_size
)
2271 return exact_div (common_multiple (nunits
, group_size
), group_size
);
2274 /* Helper that checks to see if a node is a load node. */
2277 vect_is_slp_load_node (slp_tree root
)
2279 return SLP_TREE_DEF_TYPE (root
) == vect_internal_def
2280 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root
))
2281 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root
)));
2285 /* Helper function of optimize_load_redistribution that performs the operation
2289 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t
*bst_map
,
2290 vec_info
*vinfo
, unsigned int group_size
,
2291 hash_map
<slp_tree
, slp_tree
> *load_map
,
2294 if (slp_tree
*leader
= load_map
->get (root
))
2300 /* For now, we don't know anything about externals so do not do anything. */
2301 if (!root
|| SLP_TREE_DEF_TYPE (root
) != vect_internal_def
)
2303 else if (SLP_TREE_CODE (root
) == VEC_PERM_EXPR
)
2305 /* First convert this node into a load node and add it to the leaves
2306 list and flatten the permute from a lane to a load one. If it's
2307 unneeded it will be elided later. */
2308 vec
<stmt_vec_info
> stmts
;
2309 stmts
.create (SLP_TREE_LANES (root
));
2310 lane_permutation_t lane_perm
= SLP_TREE_LANE_PERMUTATION (root
);
2311 for (unsigned j
= 0; j
< lane_perm
.length (); j
++)
2313 std::pair
<unsigned, unsigned> perm
= lane_perm
[j
];
2314 node
= SLP_TREE_CHILDREN (root
)[perm
.first
];
2316 if (!vect_is_slp_load_node (node
)
2317 || SLP_TREE_CHILDREN (node
).exists ())
2323 stmts
.quick_push (SLP_TREE_SCALAR_STMTS (node
)[perm
.second
]);
2326 if (dump_enabled_p ())
2327 dump_printf_loc (MSG_NOTE
, vect_location
,
2328 "converting stmts on permute node %p\n", root
);
2330 bool *matches
= XALLOCAVEC (bool, group_size
);
2331 poly_uint64 max_nunits
= 1;
2332 unsigned tree_size
= 0, limit
= 1;
2333 node
= vect_build_slp_tree (vinfo
, stmts
, group_size
, &max_nunits
,
2334 matches
, &limit
, &tree_size
, bst_map
);
2338 load_map
->put (root
, node
);
2343 load_map
->put (root
, NULL
);
2345 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root
), i
, node
)
2348 = optimize_load_redistribution_1 (bst_map
, vinfo
, group_size
, load_map
,
2352 SLP_TREE_REF_COUNT (value
)++;
2353 SLP_TREE_CHILDREN (root
)[i
] = value
;
2354 /* ??? We know the original leafs of the replaced nodes will
2355 be referenced by bst_map, only the permutes created by
2356 pattern matching are not. */
2357 if (SLP_TREE_REF_COUNT (node
) == 1)
2358 load_map
->remove (node
);
2359 vect_free_slp_tree (node
);
2366 /* Temporary workaround for loads not being CSEd during SLP build. This
2367 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2368 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2369 same DR such that the final operation is equal to a permuted load. Such
2370 NODES are then directly converted into LOADS themselves. The nodes are
2371 CSEd using BST_MAP. */
2374 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t
*bst_map
,
2375 vec_info
*vinfo
, unsigned int group_size
,
2376 hash_map
<slp_tree
, slp_tree
> *load_map
,
2382 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root
), i
, node
)
2385 = optimize_load_redistribution_1 (bst_map
, vinfo
, group_size
, load_map
,
2389 SLP_TREE_REF_COUNT (value
)++;
2390 SLP_TREE_CHILDREN (root
)[i
] = value
;
2391 /* ??? We know the original leafs of the replaced nodes will
2392 be referenced by bst_map, only the permutes created by
2393 pattern matching are not. */
2394 if (SLP_TREE_REF_COUNT (node
) == 1)
2395 load_map
->remove (node
);
2396 vect_free_slp_tree (node
);
2401 /* Helper function of vect_match_slp_patterns.
2403 Attempts to match patterns against the slp tree rooted in REF_NODE using
2404 VINFO. Patterns are matched in post-order traversal.
2406 If matching is successful the value in REF_NODE is updated and returned, if
2407 not then it is returned unchanged. */
2410 vect_match_slp_patterns_2 (slp_tree
*ref_node
, vec_info
*vinfo
,
2411 slp_tree_to_load_perm_map_t
*perm_cache
,
2412 hash_set
<slp_tree
> *visited
)
2415 slp_tree node
= *ref_node
;
2416 bool found_p
= false;
2417 if (!node
|| visited
->add (node
))
2421 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2422 found_p
|= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node
)[i
],
2423 vinfo
, perm_cache
, visited
);
2425 for (unsigned x
= 0; x
< num__slp_patterns
; x
++)
2427 vect_pattern
*pattern
= slp_patterns
[x
] (perm_cache
, ref_node
);
2430 pattern
->build (vinfo
);
2439 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
2442 The modified tree is returned. Patterns are tried in order and multiple
2443 patterns may match. */
2446 vect_match_slp_patterns (slp_instance instance
, vec_info
*vinfo
,
2447 hash_set
<slp_tree
> *visited
,
2448 slp_tree_to_load_perm_map_t
*perm_cache
)
2450 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2451 slp_tree
*ref_node
= &SLP_INSTANCE_TREE (instance
);
2453 if (dump_enabled_p ())
2454 dump_printf_loc (MSG_NOTE
, vect_location
,
2455 "Analyzing SLP tree %p for patterns\n",
2456 SLP_INSTANCE_TREE (instance
));
2458 return vect_match_slp_patterns_2 (ref_node
, vinfo
, perm_cache
, visited
);
/* NOTE(review): this region of the file appears garbled by extraction --
   identifiers are split across physical lines and original line numbers are
   embedded in the text.  Tokens below are preserved verbatim.  */
/* Forward declaration: vect_build_slp_instance (defined next) recurses into
   vect_analyze_slp_instance when it splits store groups, so the prototype is
   needed before the definition that follows further down.  */
2461 /* Analyze an SLP instance starting from a group of grouped stores. Call
2462 vect_build_slp_tree to build a tree of packed stmts if possible.
2463 Return FALSE if it's impossible to SLP any stmt in the loop. */
2466 vect_analyze_slp_instance (vec_info
*vinfo
,
2467 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2468 stmt_vec_info stmt_info
, slp_instance_kind kind
,
2469 unsigned max_tree_size
, unsigned *limit
);
2471 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2472 of KIND. Return true if successful. */
2475 vect_build_slp_instance (vec_info
*vinfo
,
2476 slp_instance_kind kind
,
2477 vec
<stmt_vec_info
> &scalar_stmts
,
2478 stmt_vec_info root_stmt_info
,
2479 unsigned max_tree_size
, unsigned *limit
,
2480 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2481 /* ??? We need stmt_info for group splitting. */
2482 stmt_vec_info stmt_info_
)
2484 if (dump_enabled_p ())
2486 dump_printf_loc (MSG_NOTE
, vect_location
,
2487 "Starting SLP discovery for\n");
2488 for (unsigned i
= 0; i
< scalar_stmts
.length (); ++i
)
2489 dump_printf_loc (MSG_NOTE
, vect_location
,
2490 " %G", scalar_stmts
[i
]->stmt
);
2493 /* Build the tree for the SLP instance. */
2494 unsigned int group_size
= scalar_stmts
.length ();
2495 bool *matches
= XALLOCAVEC (bool, group_size
);
2496 poly_uint64 max_nunits
= 1;
2497 unsigned tree_size
= 0;
2499 slp_tree node
= vect_build_slp_tree (vinfo
, scalar_stmts
, group_size
,
2500 &max_nunits
, matches
, limit
,
2501 &tree_size
, bst_map
);
2504 /* Calculate the unrolling factor based on the smallest type. */
2505 poly_uint64 unrolling_factor
2506 = calculate_unrolling_factor (max_nunits
, group_size
);
2508 if (maybe_ne (unrolling_factor
, 1U)
2509 && is_a
<bb_vec_info
> (vinfo
))
2511 unsigned HOST_WIDE_INT const_max_nunits
;
2512 if (!max_nunits
.is_constant (&const_max_nunits
)
2513 || const_max_nunits
> group_size
)
2515 if (dump_enabled_p ())
2516 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2517 "Build SLP failed: store group "
2518 "size not a multiple of the vector size "
2519 "in basic block SLP\n");
2520 vect_free_slp_tree (node
);
2523 /* Fatal mismatch. */
2524 if (dump_enabled_p ())
2525 dump_printf_loc (MSG_NOTE
, vect_location
,
2526 "SLP discovery succeeded but node needs "
2528 memset (matches
, true, group_size
);
2529 matches
[group_size
/ const_max_nunits
* const_max_nunits
] = false;
2530 vect_free_slp_tree (node
);
2534 /* Create a new SLP instance. */
2535 slp_instance new_instance
= XNEW (class _slp_instance
);
2536 SLP_INSTANCE_TREE (new_instance
) = node
;
2537 SLP_INSTANCE_UNROLLING_FACTOR (new_instance
) = unrolling_factor
;
2538 SLP_INSTANCE_LOADS (new_instance
) = vNULL
;
2539 SLP_INSTANCE_ROOT_STMT (new_instance
) = root_stmt_info
;
2540 SLP_INSTANCE_KIND (new_instance
) = kind
;
2541 new_instance
->reduc_phis
= NULL
;
2542 new_instance
->cost_vec
= vNULL
;
2543 new_instance
->subgraph_entries
= vNULL
;
2545 if (dump_enabled_p ())
2546 dump_printf_loc (MSG_NOTE
, vect_location
,
2547 "SLP size %u vs. limit %u.\n",
2548 tree_size
, max_tree_size
);
2550 /* Fixup SLP reduction chains. */
2551 if (kind
== slp_inst_kind_reduc_chain
)
2553 /* If this is a reduction chain with a conversion in front
2554 amend the SLP tree with a node for that. */
2556 = vect_orig_stmt (scalar_stmts
[group_size
- 1])->stmt
;
2557 if (STMT_VINFO_DEF_TYPE (scalar_stmts
[0]) != vect_reduction_def
)
2559 /* Get at the conversion stmt - we know it's the single use
2560 of the last stmt of the reduction chain. */
2561 use_operand_p use_p
;
2562 bool r
= single_imm_use (gimple_assign_lhs (scalar_def
),
2563 &use_p
, &scalar_def
);
2565 stmt_vec_info next_info
= vinfo
->lookup_stmt (scalar_def
);
2566 next_info
= vect_stmt_to_vectorize (next_info
);
2567 scalar_stmts
= vNULL
;
2568 scalar_stmts
.create (group_size
);
2569 for (unsigned i
= 0; i
< group_size
; ++i
)
2570 scalar_stmts
.quick_push (next_info
);
2571 slp_tree conv
= vect_create_new_slp_node (scalar_stmts
, 1);
2572 SLP_TREE_VECTYPE (conv
) = STMT_VINFO_VECTYPE (next_info
);
2573 SLP_TREE_CHILDREN (conv
).quick_push (node
);
2574 SLP_INSTANCE_TREE (new_instance
) = conv
;
2575 /* We also have to fake this conversion stmt as SLP reduction
2576 group so we don't have to mess with too much code
2578 REDUC_GROUP_FIRST_ELEMENT (next_info
) = next_info
;
2579 REDUC_GROUP_NEXT_ELEMENT (next_info
) = NULL
;
2581 /* Fill the backedge child of the PHI SLP node. The
2582 general matching code cannot find it because the
2583 scalar code does not reflect how we vectorize the
2585 use_operand_p use_p
;
2586 imm_use_iterator imm_iter
;
2587 class loop
*loop
= LOOP_VINFO_LOOP (as_a
<loop_vec_info
> (vinfo
));
2588 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
,
2589 gimple_get_lhs (scalar_def
))
2590 /* There are exactly two non-debug uses, the reduction
2591 PHI and the loop-closed PHI node. */
2592 if (!is_gimple_debug (USE_STMT (use_p
))
2593 && gimple_bb (USE_STMT (use_p
)) == loop
->header
)
2595 auto_vec
<stmt_vec_info
, 64> phis (group_size
);
2596 stmt_vec_info phi_info
2597 = vinfo
->lookup_stmt (USE_STMT (use_p
));
2598 for (unsigned i
= 0; i
< group_size
; ++i
)
2599 phis
.quick_push (phi_info
);
2600 slp_tree
*phi_node
= bst_map
->get (phis
);
2601 unsigned dest_idx
= loop_latch_edge (loop
)->dest_idx
;
2602 SLP_TREE_CHILDREN (*phi_node
)[dest_idx
]
2603 = SLP_INSTANCE_TREE (new_instance
);
2604 SLP_INSTANCE_TREE (new_instance
)->refcnt
++;
2608 vinfo
->slp_instances
.safe_push (new_instance
);
2610 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2611 the number of scalar stmts in the root in a few places.
2612 Verify that assumption holds. */
2613 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance
))
2614 .length () == group_size
);
2616 if (dump_enabled_p ())
2618 dump_printf_loc (MSG_NOTE
, vect_location
,
2619 "Final SLP tree for instance %p:\n", new_instance
);
2620 vect_print_slp_graph (MSG_NOTE
, vect_location
,
2621 SLP_INSTANCE_TREE (new_instance
));
2629 /* Failed to SLP. */
2630 /* Free the allocated memory. */
2631 scalar_stmts
.release ();
2634 stmt_vec_info stmt_info
= stmt_info_
;
2635 /* Try to break the group up into pieces. */
2636 if (kind
== slp_inst_kind_store
)
2638 /* ??? We could delay all the actual splitting of store-groups
2639 until after SLP discovery of the original group completed.
2640 Then we can recurse to vect_build_slp_instance directly. */
2641 for (i
= 0; i
< group_size
; i
++)
2645 /* For basic block SLP, try to break the group up into multiples of
2647 if (is_a
<bb_vec_info
> (vinfo
)
2648 && (i
> 1 && i
< group_size
))
2651 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info
)));
2652 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
2653 1 << floor_log2 (i
));
2654 unsigned HOST_WIDE_INT const_nunits
;
2656 && TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
))
2658 /* Split into two groups at the first vector boundary. */
2659 gcc_assert ((const_nunits
& (const_nunits
- 1)) == 0);
2660 unsigned group1_size
= i
& ~(const_nunits
- 1);
2662 if (dump_enabled_p ())
2663 dump_printf_loc (MSG_NOTE
, vect_location
,
2664 "Splitting SLP group at stmt %u\n", i
);
2665 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2667 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2668 kind
, max_tree_size
,
2670 /* Split the rest at the failure point and possibly
2671 re-analyze the remaining matching part if it has
2672 at least two lanes. */
2674 && (i
+ 1 < group_size
2675 || i
- group1_size
> 1))
2677 stmt_vec_info rest2
= rest
;
2678 rest
= vect_split_slp_store_group (rest
, i
- group1_size
);
2679 if (i
- group1_size
> 1)
2680 res
|= vect_analyze_slp_instance (vinfo
, bst_map
, rest2
,
2681 kind
, max_tree_size
,
2684 /* Re-analyze the non-matching tail if it has at least
2686 if (i
+ 1 < group_size
)
2687 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2688 rest
, kind
, max_tree_size
,
2694 /* For loop vectorization split into arbitrary pieces of size > 1. */
2695 if (is_a
<loop_vec_info
> (vinfo
)
2696 && (i
> 1 && i
< group_size
))
2698 unsigned group1_size
= i
;
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_NOTE
, vect_location
,
2702 "Splitting SLP group at stmt %u\n", i
);
2704 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2706 /* Loop vectorization cannot handle gaps in stores, make sure
2707 the split group appears as strided. */
2708 STMT_VINFO_STRIDED_P (rest
) = 1;
2709 DR_GROUP_GAP (rest
) = 0;
2710 STMT_VINFO_STRIDED_P (stmt_info
) = 1;
2711 DR_GROUP_GAP (stmt_info
) = 0;
2713 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2714 kind
, max_tree_size
, limit
);
2715 if (i
+ 1 < group_size
)
2716 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2717 rest
, kind
, max_tree_size
, limit
);
2722 /* Even though the first vector did not all match, we might be able to SLP
2723 (some) of the remainder. FORNOW ignore this possibility. */
2726 /* Failed to SLP. */
2727 if (dump_enabled_p ())
2728 dump_printf_loc (MSG_NOTE
, vect_location
, "SLP discovery failed\n");
2733 /* Analyze an SLP instance starting from a group of grouped stores. Call
2734 vect_build_slp_tree to build a tree of packed stmts if possible.
2735 Return FALSE if it's impossible to SLP any stmt in the loop. */
2738 vect_analyze_slp_instance (vec_info
*vinfo
,
2739 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2740 stmt_vec_info stmt_info
,
2741 slp_instance_kind kind
,
2742 unsigned max_tree_size
, unsigned *limit
)
2745 vec
<stmt_vec_info
> scalar_stmts
;
2747 if (is_a
<bb_vec_info
> (vinfo
))
2748 vect_location
= stmt_info
->stmt
;
2750 stmt_vec_info next_info
= stmt_info
;
2751 if (kind
== slp_inst_kind_store
)
2753 /* Collect the stores and store them in scalar_stmts. */
2754 scalar_stmts
.create (DR_GROUP_SIZE (stmt_info
));
2757 scalar_stmts
.quick_push (vect_stmt_to_vectorize (next_info
));
2758 next_info
= DR_GROUP_NEXT_ELEMENT (next_info
);
2761 else if (kind
== slp_inst_kind_reduc_chain
)
2763 /* Collect the reduction stmts and store them in scalar_stmts. */
2764 scalar_stmts
.create (REDUC_GROUP_SIZE (stmt_info
));
2767 scalar_stmts
.quick_push (vect_stmt_to_vectorize (next_info
));
2768 next_info
= REDUC_GROUP_NEXT_ELEMENT (next_info
);
2770 /* Mark the first element of the reduction chain as reduction to properly
2771 transform the node. In the reduction analysis phase only the last
2772 element of the chain is marked as reduction. */
2773 STMT_VINFO_DEF_TYPE (stmt_info
)
2774 = STMT_VINFO_DEF_TYPE (scalar_stmts
.last ());
2775 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))
2776 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts
.last ()));
2778 else if (kind
== slp_inst_kind_ctor
)
2780 tree rhs
= gimple_assign_rhs1 (stmt_info
->stmt
);
2782 scalar_stmts
.create (CONSTRUCTOR_NELTS (rhs
));
2783 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), i
, val
)
2785 stmt_vec_info def_info
= vinfo
->lookup_def (val
);
2786 def_info
= vect_stmt_to_vectorize (def_info
);
2787 scalar_stmts
.quick_push (def_info
);
2789 if (dump_enabled_p ())
2790 dump_printf_loc (MSG_NOTE
, vect_location
,
2791 "Analyzing vectorizable constructor: %G\n",
2794 else if (kind
== slp_inst_kind_reduc_group
)
2796 /* Collect reduction statements. */
2797 vec
<stmt_vec_info
> reductions
= as_a
<loop_vec_info
> (vinfo
)->reductions
;
2798 scalar_stmts
.create (reductions
.length ());
2799 for (i
= 0; reductions
.iterate (i
, &next_info
); i
++)
2800 if (STMT_VINFO_RELEVANT_P (next_info
)
2801 || STMT_VINFO_LIVE_P (next_info
))
2802 scalar_stmts
.quick_push (next_info
);
2803 /* If less than two were relevant/live there's nothing to SLP. */
2804 if (scalar_stmts
.length () < 2)
2810 /* Build the tree for the SLP instance. */
2811 bool res
= vect_build_slp_instance (vinfo
, kind
, scalar_stmts
,
2812 kind
== slp_inst_kind_ctor
2814 max_tree_size
, limit
, bst_map
,
2815 kind
== slp_inst_kind_store
2816 ? stmt_info
: NULL
);
2818 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2819 where we should do store group splitting. */
2824 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
2825 trees of packed scalar stmts if SLP is possible. */
2828 vect_analyze_slp (vec_info
*vinfo
, unsigned max_tree_size
)
2831 stmt_vec_info first_element
;
2832 slp_instance instance
;
2834 DUMP_VECT_SCOPE ("vect_analyze_slp");
2836 unsigned limit
= max_tree_size
;
2838 scalar_stmts_to_slp_tree_map_t
*bst_map
2839 = new scalar_stmts_to_slp_tree_map_t ();
2841 /* Find SLP sequences starting from groups of grouped stores. */
2842 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
2843 vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
2844 STMT_VINFO_GROUPED_ACCESS (first_element
)
2845 ? slp_inst_kind_store
: slp_inst_kind_ctor
,
2846 max_tree_size
, &limit
);
2848 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
2850 for (unsigned i
= 0; i
< bb_vinfo
->roots
.length (); ++i
)
2852 vect_location
= bb_vinfo
->roots
[i
].root
->stmt
;
2853 if (vect_build_slp_instance (bb_vinfo
, bb_vinfo
->roots
[i
].kind
,
2854 bb_vinfo
->roots
[i
].stmts
,
2855 bb_vinfo
->roots
[i
].root
,
2856 max_tree_size
, &limit
, bst_map
, NULL
))
2857 bb_vinfo
->roots
[i
].stmts
= vNULL
;
2861 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
2863 /* Find SLP sequences starting from reduction chains. */
2864 FOR_EACH_VEC_ELT (loop_vinfo
->reduction_chains
, i
, first_element
)
2865 if (! STMT_VINFO_RELEVANT_P (first_element
)
2866 && ! STMT_VINFO_LIVE_P (first_element
))
2868 else if (! vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
2869 slp_inst_kind_reduc_chain
,
2870 max_tree_size
, &limit
))
2872 /* Dissolve reduction chain group. */
2873 stmt_vec_info vinfo
= first_element
;
2874 stmt_vec_info last
= NULL
;
2877 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
2878 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2879 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2883 STMT_VINFO_DEF_TYPE (first_element
) = vect_internal_def
;
2884 /* It can be still vectorized as part of an SLP reduction. */
2885 loop_vinfo
->reductions
.safe_push (last
);
2888 /* Find SLP sequences starting from groups of reductions. */
2889 if (loop_vinfo
->reductions
.length () > 1)
2890 vect_analyze_slp_instance (vinfo
, bst_map
, loop_vinfo
->reductions
[0],
2891 slp_inst_kind_reduc_group
, max_tree_size
,
2895 hash_set
<slp_tree
> visited_patterns
;
2896 slp_tree_to_load_perm_map_t perm_cache
;
2898 /* See if any patterns can be found in the SLP tree. */
2899 bool pattern_found
= false;
2900 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo
), i
, instance
)
2901 pattern_found
|= vect_match_slp_patterns (instance
, vinfo
,
2902 &visited_patterns
, &perm_cache
);
2904 /* If any were found optimize permutations of loads. */
2907 hash_map
<slp_tree
, slp_tree
> load_map
;
2908 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo
), i
, instance
)
2910 slp_tree root
= SLP_INSTANCE_TREE (instance
);
2911 optimize_load_redistribution (bst_map
, vinfo
, SLP_TREE_LANES (root
),
2918 /* The map keeps a reference on SLP nodes built, release that. */
2919 for (scalar_stmts_to_slp_tree_map_t::iterator it
= bst_map
->begin ();
2920 it
!= bst_map
->end (); ++it
)
2922 vect_free_slp_tree ((*it
).second
);
2925 if (pattern_found
&& dump_enabled_p ())
2927 dump_printf_loc (MSG_NOTE
, vect_location
,
2928 "Pattern matched SLP tree\n");
2929 hash_set
<slp_tree
> visited
;
2930 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo
), i
, instance
)
2931 vect_print_slp_graph (MSG_NOTE
, vect_location
,
2932 SLP_INSTANCE_TREE (instance
), visited
);
2935 return opt_result::success ();
2938 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2941 vect_slp_build_vertices (hash_set
<slp_tree
> &visited
, slp_tree node
,
2942 vec
<slp_tree
> &vertices
, vec
<int> &leafs
)
2947 if (visited
.add (node
))
2950 node
->vertex
= vertices
.length ();
2951 vertices
.safe_push (node
);
2954 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2958 vect_slp_build_vertices (visited
, child
, vertices
, leafs
);
2961 leafs
.safe_push (node
->vertex
);
2964 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2967 vect_slp_build_vertices (vec_info
*info
, vec
<slp_tree
> &vertices
,
2970 hash_set
<slp_tree
> visited
;
2972 slp_instance instance
;
2973 FOR_EACH_VEC_ELT (info
->slp_instances
, i
, instance
)
2975 unsigned n_v
= vertices
.length ();
2976 unsigned n_l
= leafs
.length ();
2977 vect_slp_build_vertices (visited
, SLP_INSTANCE_TREE (instance
), vertices
,
2979 /* If we added vertices but no entries to the reverse graph we've
2980 added a cycle that is not backwards-reachable. Push the entry
2981 to mimic as leaf then. */
2982 if (vertices
.length () > n_v
2983 && leafs
.length () == n_l
)
2984 leafs
.safe_push (SLP_INSTANCE_TREE (instance
)->vertex
);
2988 /* Apply (reverse) bijectite PERM to VEC. */
2992 vect_slp_permute (vec
<unsigned> perm
,
2993 vec
<T
> &vec
, bool reverse
)
2995 auto_vec
<T
, 64> saved
;
2996 saved
.create (vec
.length ());
2997 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2998 saved
.quick_push (vec
[i
]);
3002 for (unsigned i
= 0; i
< vec
.length (); ++i
)
3003 vec
[perm
[i
]] = saved
[i
];
3004 for (unsigned i
= 0; i
< vec
.length (); ++i
)
3005 gcc_assert (vec
[perm
[i
]] == saved
[i
]);
3009 for (unsigned i
= 0; i
< vec
.length (); ++i
)
3010 vec
[i
] = saved
[perm
[i
]];
3011 for (unsigned i
= 0; i
< vec
.length (); ++i
)
3012 gcc_assert (vec
[i
] == saved
[perm
[i
]]);
3016 /* Return whether permutations PERM_A and PERM_B as recorded in the
3017 PERMS vector are equal. */
3020 vect_slp_perms_eq (const vec
<vec
<unsigned> > &perms
,
3021 int perm_a
, int perm_b
)
3023 return (perm_a
== perm_b
3024 || (perms
[perm_a
].length () == perms
[perm_b
].length ()
3025 && memcmp (&perms
[perm_a
][0], &perms
[perm_b
][0],
3026 sizeof (unsigned) * perms
[perm_a
].length ()) == 0));
3029 /* Optimize the SLP graph of VINFO. */
3032 vect_optimize_slp (vec_info
*vinfo
)
3034 if (vinfo
->slp_instances
.is_empty ())
3039 auto_vec
<slp_tree
> vertices
;
3040 auto_vec
<int> leafs
;
3041 vect_slp_build_vertices (vinfo
, vertices
, leafs
);
3043 struct graph
*slpg
= new_graph (vertices
.length ());
3044 FOR_EACH_VEC_ELT (vertices
, i
, node
)
3048 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
3050 add_edge (slpg
, i
, child
->vertex
);
3053 /* Compute (reverse) postorder on the inverted graph. */
3055 graphds_dfs (slpg
, &leafs
[0], leafs
.length (), &ipo
, false, NULL
, NULL
);
3057 auto_sbitmap
n_visited (vertices
.length ());
3058 auto_sbitmap
n_materialize (vertices
.length ());
3059 auto_vec
<int> n_perm (vertices
.length ());
3060 auto_vec
<vec
<unsigned> > perms
;
3062 bitmap_clear (n_visited
);
3063 bitmap_clear (n_materialize
);
3064 n_perm
.quick_grow_cleared (vertices
.length ());
3065 perms
.safe_push (vNULL
); /* zero is no permute */
3067 /* Produce initial permutations. */
3068 for (i
= 0; i
< leafs
.length (); ++i
)
3071 slp_tree node
= vertices
[idx
];
3073 /* Handle externals and constants optimistically throughout the
3075 if (SLP_TREE_DEF_TYPE (node
) == vect_external_def
3076 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
)
3079 /* Leafs do not change across iterations. Note leafs also double
3080 as entries to the reverse graph. */
3081 if (!slpg
->vertices
[idx
].succ
)
3082 bitmap_set_bit (n_visited
, idx
);
3083 /* Loads are the only thing generating permutes. */
3084 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3087 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3088 node unpermuted, record this permute. */
3089 stmt_vec_info dr_stmt
= SLP_TREE_REPRESENTATIVE (node
);
3090 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt
))
3092 dr_stmt
= DR_GROUP_FIRST_ELEMENT (dr_stmt
);
3093 unsigned imin
= DR_GROUP_SIZE (dr_stmt
) + 1, imax
= 0;
3094 bool any_permute
= false;
3095 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3097 unsigned idx
= SLP_TREE_LOAD_PERMUTATION (node
)[j
];
3098 imin
= MIN (imin
, idx
);
3099 imax
= MAX (imax
, idx
);
3100 if (idx
- SLP_TREE_LOAD_PERMUTATION (node
)[0] != j
)
3103 /* If there's no permute no need to split one out. */
3106 /* If the span doesn't match we'd disrupt VF computation, avoid
3108 if (imax
- imin
+ 1 != SLP_TREE_LANES (node
))
3111 /* For now only handle true permutes, like
3112 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3113 when permuting constants and invariants keeping the permute
3115 auto_sbitmap
load_index (SLP_TREE_LANES (node
));
3116 bitmap_clear (load_index
);
3117 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3118 bitmap_set_bit (load_index
, SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
);
3120 for (j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3121 if (!bitmap_bit_p (load_index
, j
))
3123 if (j
!= SLP_TREE_LANES (node
))
3126 vec
<unsigned> perm
= vNULL
;
3127 perm
.safe_grow (SLP_TREE_LANES (node
), true);
3128 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3129 perm
[j
] = SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
;
3130 perms
.safe_push (perm
);
3131 n_perm
[idx
] = perms
.length () - 1;
3134 /* Propagate permutes along the graph and compute materialization points. */
3136 unsigned iteration
= 0;
3142 for (i
= vertices
.length (); i
> 0 ; --i
)
3145 slp_tree node
= vertices
[idx
];
3146 /* For leafs there's nothing to do - we've seeded permutes
3148 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
3151 bitmap_set_bit (n_visited
, idx
);
3153 /* We cannot move a permute across a store. */
3154 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))
3156 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))))
3160 for (graph_edge
*succ
= slpg
->vertices
[idx
].succ
;
3161 succ
; succ
= succ
->succ_next
)
3163 int succ_idx
= succ
->dest
;
3164 /* Handle unvisited nodes optimistically. */
3165 /* ??? But for constants once we want to handle non-bijective
3166 permutes we have to verify the permute, when unifying lanes,
3167 will not unify different constants. For example see
3168 gcc.dg/vect/bb-slp-14.c for a case that would break. */
3169 if (!bitmap_bit_p (n_visited
, succ_idx
))
3171 int succ_perm
= n_perm
[succ_idx
];
3172 /* Once we materialize succs permutation its output lanes
3173 appear unpermuted to us. */
3174 if (bitmap_bit_p (n_materialize
, succ_idx
))
3178 else if (succ_perm
== 0)
3183 else if (!vect_slp_perms_eq (perms
, perm
, succ_perm
))
3191 /* Pick up pre-computed leaf values. */
3193 else if (!vect_slp_perms_eq (perms
, perm
, n_perm
[idx
]))
3196 /* Make sure we eventually converge. */
3197 gcc_checking_assert (perm
== 0);
3200 bitmap_clear_bit (n_materialize
, idx
);
3207 /* Elide pruning at materialization points in the first
3208 iteration so every node was visited once at least. */
3212 /* Decide on permute materialization. Look whether there's
3213 a use (pred) edge that is permuted differently than us.
3214 In that case mark ourselves so the permutation is applied.
3215 For VEC_PERM_EXPRs the permutation doesn't carry along
3216 from children to parents so force materialization at the
3217 point of the VEC_PERM_EXPR. In principle VEC_PERM_EXPRs
3218 are a source of an arbitrary permutation again, similar
3219 to constants/externals - that's something we do not yet
3220 optimally handle. */
3221 bool all_preds_permuted
= (SLP_TREE_CODE (node
) != VEC_PERM_EXPR
3222 && slpg
->vertices
[idx
].pred
!= NULL
);
3223 if (all_preds_permuted
)
3224 for (graph_edge
*pred
= slpg
->vertices
[idx
].pred
;
3225 pred
; pred
= pred
->pred_next
)
3227 gcc_checking_assert (bitmap_bit_p (n_visited
, pred
->src
));
3228 int pred_perm
= n_perm
[pred
->src
];
3229 if (!vect_slp_perms_eq (perms
, perm
, pred_perm
))
3231 all_preds_permuted
= false;
3235 if (!all_preds_permuted
)
3237 if (!bitmap_bit_p (n_materialize
, idx
))
3239 bitmap_set_bit (n_materialize
, idx
);
3243 while (changed
|| iteration
== 1);
3246 for (i
= 0; i
< vertices
.length (); ++i
)
3248 int perm
= n_perm
[i
];
3252 slp_tree node
= vertices
[i
];
3254 /* First permute invariant/external original successors. */
3257 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
3259 if (!child
|| SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3262 /* If the vector is uniform there's nothing to do. */
3263 if (vect_slp_tree_uniform_p (child
))
3266 /* We can end up sharing some externals via two_operator
3267 handling. Be prepared to unshare those. */
3268 if (child
->refcnt
!= 1)
3270 gcc_assert (slpg
->vertices
[child
->vertex
].pred
->pred_next
);
3271 SLP_TREE_CHILDREN (node
)[j
] = child
3272 = vect_create_new_slp_node
3273 (SLP_TREE_SCALAR_OPS (child
).copy ());
3275 vect_slp_permute (perms
[perm
],
3276 SLP_TREE_SCALAR_OPS (child
), true);
3279 if (bitmap_bit_p (n_materialize
, i
))
3281 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3282 /* For loads simply drop the permutation, the load permutation
3283 already performs the desired permutation. */
3285 else if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
3287 /* If the node is already a permute node we can apply
3288 the permutation to the lane selection, effectively
3289 materializing it on the incoming vectors. */
3290 if (dump_enabled_p ())
3291 dump_printf_loc (MSG_NOTE
, vect_location
,
3292 "simplifying permute node %p\n",
3295 for (unsigned k
= 0;
3296 k
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++k
)
3297 SLP_TREE_LANE_PERMUTATION (node
)[k
].second
3298 = perms
[perm
][SLP_TREE_LANE_PERMUTATION (node
)[k
].second
];
3302 if (dump_enabled_p ())
3303 dump_printf_loc (MSG_NOTE
, vect_location
,
3304 "inserting permute node in place of %p\n",
3307 /* Make a copy of NODE and in-place change it to a
3308 VEC_PERM node to permute the lanes of the copy. */
3309 slp_tree copy
= new _slp_tree
;
3310 SLP_TREE_CHILDREN (copy
) = SLP_TREE_CHILDREN (node
);
3311 SLP_TREE_CHILDREN (node
) = vNULL
;
3312 SLP_TREE_SCALAR_STMTS (copy
)
3313 = SLP_TREE_SCALAR_STMTS (node
).copy ();
3314 vect_slp_permute (perms
[perm
],
3315 SLP_TREE_SCALAR_STMTS (copy
), true);
3316 gcc_assert (!SLP_TREE_SCALAR_OPS (node
).exists ());
3317 SLP_TREE_REPRESENTATIVE (copy
) = SLP_TREE_REPRESENTATIVE (node
);
3318 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node
).exists ());
3319 SLP_TREE_LANE_PERMUTATION (copy
)
3320 = SLP_TREE_LANE_PERMUTATION (node
);
3321 SLP_TREE_LANE_PERMUTATION (node
) = vNULL
;
3322 SLP_TREE_VECTYPE (copy
) = SLP_TREE_VECTYPE (node
);
3324 copy
->max_nunits
= node
->max_nunits
;
3325 SLP_TREE_DEF_TYPE (copy
) = SLP_TREE_DEF_TYPE (node
);
3326 SLP_TREE_LANES (copy
) = SLP_TREE_LANES (node
);
3327 SLP_TREE_CODE (copy
) = SLP_TREE_CODE (node
);
3329 /* Now turn NODE into a VEC_PERM. */
3330 SLP_TREE_CHILDREN (node
).safe_push (copy
);
3331 SLP_TREE_LANE_PERMUTATION (node
).create (SLP_TREE_LANES (node
));
3332 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3333 SLP_TREE_LANE_PERMUTATION (node
)
3334 .quick_push (std::make_pair (0, perms
[perm
][j
]));
3335 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
3340 /* Apply the reverse permutation to our stmts. */
3341 vect_slp_permute (perms
[perm
],
3342 SLP_TREE_SCALAR_STMTS (node
), true);
3343 /* And to the load permutation, which we can simply
3344 make regular by design. */
3345 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3347 /* ??? When we handle non-bijective permutes the idea
3348 is that we can force the load-permutation to be
3349 { min, min + 1, min + 2, ... max }. But then the
3350 scalar defs might no longer match the lane content
3351 which means wrong-code with live lane vectorization.
3352 So we possibly have to have NULL entries for those. */
3353 vect_slp_permute (perms
[perm
],
3354 SLP_TREE_LOAD_PERMUTATION (node
), true);
3359 /* Free the perms vector used for propagation. */
3360 while (!perms
.is_empty ())
3361 perms
.pop ().release ();
3365 /* Now elide load permutations that are not necessary. */
3366 for (i
= 0; i
< leafs
.length (); ++i
)
3368 node
= vertices
[leafs
[i
]];
3369 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3372 /* In basic block vectorization we allow any subchain of an interleaving
3374 FORNOW: not in loop SLP because of realignment complications. */
3375 if (is_a
<bb_vec_info
> (vinfo
))
3377 bool subchain_p
= true;
3378 stmt_vec_info next_load_info
= NULL
;
3379 stmt_vec_info load_info
;
3381 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
3384 && (next_load_info
!= load_info
3385 || DR_GROUP_GAP (load_info
) != 1))
3390 next_load_info
= DR_GROUP_NEXT_ELEMENT (load_info
);
3394 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3400 stmt_vec_info load_info
;
3401 bool this_load_permuted
= false;
3403 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
3404 if (SLP_TREE_LOAD_PERMUTATION (node
)[j
] != j
)
3406 this_load_permuted
= true;
3409 stmt_vec_info first_stmt_info
3410 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
3411 if (!this_load_permuted
3412 /* The load requires permutation when unrolling exposes
3413 a gap either because the group is larger than the SLP
3414 group-size or because there is a gap between the groups. */
3415 && (known_eq (LOOP_VINFO_VECT_FACTOR
3416 (as_a
<loop_vec_info
> (vinfo
)), 1U)
3417 || ((SLP_TREE_LANES (node
) == DR_GROUP_SIZE (first_stmt_info
))
3418 && DR_GROUP_GAP (first_stmt_info
) == 0)))
3420 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3427 /* Gather loads reachable from the individual SLP graph entries. */
3430 vect_gather_slp_loads (vec_info
*vinfo
)
3433 slp_instance instance
;
3434 FOR_EACH_VEC_ELT (vinfo
->slp_instances
, i
, instance
)
3436 hash_set
<slp_tree
> visited
;
3437 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance
),
3438 SLP_INSTANCE_TREE (instance
), visited
);
3443 /* For each possible SLP instance decide whether to SLP it and calculate overall
3444 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3445 least one instance. */
3448 vect_make_slp_decision (loop_vec_info loop_vinfo
)
3451 poly_uint64 unrolling_factor
= 1;
3452 vec
<slp_instance
> slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
3453 slp_instance instance
;
3454 int decided_to_slp
= 0;
3456 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3458 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
3460 /* FORNOW: SLP if you can. */
3461 /* All unroll factors have the form:
3463 GET_MODE_SIZE (vinfo->vector_mode) * X
3465 for some rational X, so they must have a common multiple. */
3467 = force_common_multiple (unrolling_factor
,
3468 SLP_INSTANCE_UNROLLING_FACTOR (instance
));
3470 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3471 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3472 loop-based vectorization. Such stmts will be marked as HYBRID. */
3473 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
3477 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
) = unrolling_factor
;
3479 if (decided_to_slp
&& dump_enabled_p ())
3481 dump_printf_loc (MSG_NOTE
, vect_location
,
3482 "Decided to SLP %d instances. Unrolling factor ",
3484 dump_dec (MSG_NOTE
, unrolling_factor
);
3485 dump_printf (MSG_NOTE
, "\n");
3488 return (decided_to_slp
> 0);
3491 /* Private data for vect_detect_hybrid_slp. */
3494 loop_vec_info loop_vinfo
;
3495 vec
<stmt_vec_info
> *worklist
;
3498 /* Walker for walk_gimple_op. */
3501 vect_detect_hybrid_slp (tree
*tp
, int *, void *data
)
3503 walk_stmt_info
*wi
= (walk_stmt_info
*)data
;
3504 vdhs_data
*dat
= (vdhs_data
*)wi
->info
;
3509 stmt_vec_info def_stmt_info
= dat
->loop_vinfo
->lookup_def (*tp
);
3512 def_stmt_info
= vect_stmt_to_vectorize (def_stmt_info
);
3513 if (PURE_SLP_STMT (def_stmt_info
))
3515 if (dump_enabled_p ())
3516 dump_printf_loc (MSG_NOTE
, vect_location
, "marking hybrid: %G",
3517 def_stmt_info
->stmt
);
3518 STMT_SLP_TYPE (def_stmt_info
) = hybrid
;
3519 dat
->worklist
->safe_push (def_stmt_info
);
3525 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
3526 if so, otherwise pushing it to WORKLIST. */
3529 maybe_push_to_hybrid_worklist (vec_info
*vinfo
,
3530 vec
<stmt_vec_info
> &worklist
,
3531 stmt_vec_info stmt_info
)
3533 if (dump_enabled_p ())
3534 dump_printf_loc (MSG_NOTE
, vect_location
,
3535 "Processing hybrid candidate : %G", stmt_info
->stmt
);
3536 stmt_vec_info orig_info
= vect_orig_stmt (stmt_info
);
3537 imm_use_iterator iter2
;
3539 use_operand_p use_p
;
3540 def_operand_p def_p
;
3541 bool any_def
= false;
3542 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_info
->stmt
, iter1
, SSA_OP_DEF
)
3545 FOR_EACH_IMM_USE_FAST (use_p
, iter2
, DEF_FROM_PTR (def_p
))
3547 if (is_gimple_debug (USE_STMT (use_p
)))
3549 stmt_vec_info use_info
= vinfo
->lookup_stmt (USE_STMT (use_p
));
3550 /* An out-of loop use means this is a loop_vect sink. */
3553 if (dump_enabled_p ())
3554 dump_printf_loc (MSG_NOTE
, vect_location
,
3555 "Found loop_vect sink: %G", stmt_info
->stmt
);
3556 worklist
.safe_push (stmt_info
);
3559 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info
)))
3561 if (dump_enabled_p ())
3562 dump_printf_loc (MSG_NOTE
, vect_location
,
3563 "Found loop_vect use: %G", use_info
->stmt
);
3564 worklist
.safe_push (stmt_info
);
3569 /* No def means this is a loo_vect sink. */
3572 if (dump_enabled_p ())
3573 dump_printf_loc (MSG_NOTE
, vect_location
,
3574 "Found loop_vect sink: %G", stmt_info
->stmt
);
3575 worklist
.safe_push (stmt_info
);
3578 if (dump_enabled_p ())
3579 dump_printf_loc (MSG_NOTE
, vect_location
,
3580 "Marked SLP consumed stmt pure: %G", stmt_info
->stmt
);
3581 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
3584 /* Find stmts that must be both vectorized and SLPed. */
3587 vect_detect_hybrid_slp (loop_vec_info loop_vinfo
)
3589 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3591 /* All stmts participating in SLP are marked pure_slp, all other
3592 stmts are loop_vect.
3593 First collect all loop_vect stmts into a worklist.
3594 SLP patterns cause not all original scalar stmts to appear in
3595 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
3596 Rectify this here and do a backward walk over the IL only considering
3597 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
3598 mark them as pure_slp. */
3599 auto_vec
<stmt_vec_info
> worklist
;
3600 for (int i
= LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
- 1; i
>= 0; --i
)
3602 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
3603 for (gphi_iterator gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
);
3606 gphi
*phi
= gsi
.phi ();
3607 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (phi
);
3608 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
3609 maybe_push_to_hybrid_worklist (loop_vinfo
,
3610 worklist
, stmt_info
);
3612 for (gimple_stmt_iterator gsi
= gsi_last_bb (bb
); !gsi_end_p (gsi
);
3615 gimple
*stmt
= gsi_stmt (gsi
);
3616 if (is_gimple_debug (stmt
))
3618 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
3619 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
3621 for (gimple_stmt_iterator gsi2
3622 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
));
3623 !gsi_end_p (gsi2
); gsi_next (&gsi2
))
3625 stmt_vec_info patt_info
3626 = loop_vinfo
->lookup_stmt (gsi_stmt (gsi2
));
3627 if (!STMT_SLP_TYPE (patt_info
)
3628 && STMT_VINFO_RELEVANT (patt_info
))
3629 maybe_push_to_hybrid_worklist (loop_vinfo
,
3630 worklist
, patt_info
);
3632 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
3634 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
3635 maybe_push_to_hybrid_worklist (loop_vinfo
,
3636 worklist
, stmt_info
);
3640 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3641 mark any SLP vectorized stmt as hybrid.
3642 ??? We're visiting def stmts N times (once for each non-SLP and
3643 once for each hybrid-SLP use). */
3646 dat
.worklist
= &worklist
;
3647 dat
.loop_vinfo
= loop_vinfo
;
3648 memset (&wi
, 0, sizeof (wi
));
3649 wi
.info
= (void *)&dat
;
3650 while (!worklist
.is_empty ())
3652 stmt_vec_info stmt_info
= worklist
.pop ();
3653 /* Since SSA operands are not set up for pattern stmts we need
3654 to use walk_gimple_op. */
3656 walk_gimple_op (stmt_info
->stmt
, vect_detect_hybrid_slp
, &wi
);
3661 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3663 _bb_vec_info::_bb_vec_info (vec
<basic_block
> _bbs
, vec_info_shared
*shared
)
3664 : vec_info (vec_info::bb
, init_cost (NULL
), shared
), bbs (_bbs
), roots (vNULL
)
3666 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
3669 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
3672 gphi
*phi
= si
.phi ();
3673 gimple_set_uid (phi
, 0);
3676 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3677 !gsi_end_p (gsi
); gsi_next (&gsi
))
3679 gimple
*stmt
= gsi_stmt (gsi
);
3680 gimple_set_uid (stmt
, 0);
3681 if (is_gimple_debug (stmt
))
3689 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3690 stmts in the basic block. */
3692 _bb_vec_info::~_bb_vec_info ()
3694 /* Reset region marker. */
3695 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
3698 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
3701 gphi
*phi
= si
.phi ();
3702 gimple_set_uid (phi
, -1);
3704 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3705 !gsi_end_p (gsi
); gsi_next (&gsi
))
3707 gimple
*stmt
= gsi_stmt (gsi
);
3708 gimple_set_uid (stmt
, -1);
3712 for (unsigned i
= 0; i
< roots
.length (); ++i
)
3713 roots
[i
].stmts
.release ();
3717 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3718 given then that child nodes have already been processed, and that
3719 their def types currently match their SLP node's def type. */
3722 vect_slp_analyze_node_operations_1 (vec_info
*vinfo
, slp_tree node
,
3723 slp_instance node_instance
,
3724 stmt_vector_for_cost
*cost_vec
)
3726 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
3728 /* Calculate the number of vector statements to be created for the
3729 scalar stmts in this node. For SLP reductions it is equal to the
3730 number of vector statements in the children (which has already been
3731 calculated by the recursive call). Otherwise it is the number of
3732 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3733 VF divided by the number of elements in a vector. */
3734 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
3735 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
3737 for (unsigned i
= 0; i
< SLP_TREE_CHILDREN (node
).length (); ++i
)
3738 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node
)[i
]) == vect_internal_def
)
3740 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
3741 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node
)[i
]);
3748 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3749 vf
= loop_vinfo
->vectorization_factor
;
3752 unsigned int group_size
= SLP_TREE_LANES (node
);
3753 tree vectype
= SLP_TREE_VECTYPE (node
);
3754 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
3755 = vect_get_num_vectors (vf
* group_size
, vectype
);
3758 /* Handle purely internal nodes. */
3759 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
3760 return vectorizable_slp_permutation (vinfo
, NULL
, node
, cost_vec
);
3762 gcc_assert (STMT_SLP_TYPE (stmt_info
) != loop_vect
);
3763 if (is_a
<bb_vec_info
> (vinfo
)
3764 && !vect_update_shared_vectype (stmt_info
, SLP_TREE_VECTYPE (node
)))
3766 if (dump_enabled_p ())
3767 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3768 "desired vector type conflicts with earlier one "
3769 "for %G", stmt_info
->stmt
);
3774 return vect_analyze_stmt (vinfo
, stmt_info
, &dummy
,
3775 node
, node_instance
, cost_vec
);
3778 /* Try to build NODE from scalars, returning true on success.
3779 NODE_INSTANCE is the SLP instance that contains NODE. */
3782 vect_slp_convert_to_external (vec_info
*vinfo
, slp_tree node
,
3783 slp_instance node_instance
)
3785 stmt_vec_info stmt_info
;
3788 if (!is_a
<bb_vec_info
> (vinfo
)
3789 || node
== SLP_INSTANCE_TREE (node_instance
)
3790 || !SLP_TREE_SCALAR_STMTS (node
).exists ()
3791 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node
)))
3794 if (dump_enabled_p ())
3795 dump_printf_loc (MSG_NOTE
, vect_location
,
3796 "Building vector operands of %p from scalars instead\n", node
);
3798 /* Don't remove and free the child nodes here, since they could be
3799 referenced by other structures. The analysis and scheduling phases
3800 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3801 unsigned int group_size
= SLP_TREE_LANES (node
);
3802 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
3803 SLP_TREE_SCALAR_OPS (node
).safe_grow (group_size
, true);
3804 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3805 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3807 tree lhs
= gimple_get_lhs (vect_orig_stmt (stmt_info
)->stmt
);
3808 SLP_TREE_SCALAR_OPS (node
)[i
] = lhs
;
3813 /* Compute the prologue cost for invariant or constant operands represented
3817 vect_prologue_cost_for_slp (slp_tree node
,
3818 stmt_vector_for_cost
*cost_vec
)
3820 /* There's a special case of an existing vector, that costs nothing. */
3821 if (SLP_TREE_SCALAR_OPS (node
).length () == 0
3822 && !SLP_TREE_VEC_DEFS (node
).is_empty ())
3824 /* Without looking at the actual initializer a vector of
3825 constants can be implemented as load from the constant pool.
3826 When all elements are the same we can use a splat. */
3827 tree vectype
= SLP_TREE_VECTYPE (node
);
3828 unsigned group_size
= SLP_TREE_SCALAR_OPS (node
).length ();
3829 unsigned num_vects_to_check
;
3830 unsigned HOST_WIDE_INT const_nunits
;
3831 unsigned nelt_limit
;
3832 if (TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
)
3833 && ! multiple_p (const_nunits
, group_size
))
3835 num_vects_to_check
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
3836 nelt_limit
= const_nunits
;
3840 /* If either the vector has variable length or the vectors
3841 are composed of repeated whole groups we only need to
3842 cost construction once. All vectors will be the same. */
3843 num_vects_to_check
= 1;
3844 nelt_limit
= group_size
;
3846 tree elt
= NULL_TREE
;
3848 for (unsigned j
= 0; j
< num_vects_to_check
* nelt_limit
; ++j
)
3850 unsigned si
= j
% group_size
;
3852 elt
= SLP_TREE_SCALAR_OPS (node
)[si
];
3853 /* ??? We're just tracking whether all operands of a single
3854 vector initializer are the same, ideally we'd check if
3855 we emitted the same one already. */
3856 else if (elt
!= SLP_TREE_SCALAR_OPS (node
)[si
])
3859 if (nelt
== nelt_limit
)
3861 record_stmt_cost (cost_vec
, 1,
3862 SLP_TREE_DEF_TYPE (node
) == vect_external_def
3863 ? (elt
? scalar_to_vec
: vec_construct
)
3865 NULL
, vectype
, 0, vect_prologue
);
3871 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3872 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3874 Return true if the operations are supported. */
3877 vect_slp_analyze_node_operations (vec_info
*vinfo
, slp_tree node
,
3878 slp_instance node_instance
,
3879 hash_set
<slp_tree
> &visited_set
,
3880 vec
<slp_tree
> &visited_vec
,
3881 stmt_vector_for_cost
*cost_vec
)
3886 /* Assume we can code-generate all invariants. */
3888 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
3889 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
3892 if (SLP_TREE_DEF_TYPE (node
) == vect_uninitialized_def
)
3894 if (dump_enabled_p ())
3895 dump_printf_loc (MSG_NOTE
, vect_location
,
3896 "Failed cyclic SLP reference in %p\n", node
);
3899 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
);
3901 /* If we already analyzed the exact same set of scalar stmts we're done.
3902 We share the generated vector stmts for those. */
3903 if (visited_set
.add (node
))
3905 visited_vec
.safe_push (node
);
3908 unsigned visited_rec_start
= visited_vec
.length ();
3909 unsigned cost_vec_rec_start
= cost_vec
->length ();
3910 bool seen_non_constant_child
= false;
3911 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3913 res
= vect_slp_analyze_node_operations (vinfo
, child
, node_instance
,
3914 visited_set
, visited_vec
,
3918 if (child
&& SLP_TREE_DEF_TYPE (child
) != vect_constant_def
)
3919 seen_non_constant_child
= true;
3921 /* We're having difficulties scheduling nodes with just constant
3922 operands and no scalar stmts since we then cannot compute a stmt
3924 if (!seen_non_constant_child
&& SLP_TREE_SCALAR_STMTS (node
).is_empty ())
3926 if (dump_enabled_p ())
3927 dump_printf_loc (MSG_NOTE
, vect_location
,
3928 "Cannot vectorize all-constant op node %p\n", node
);
3933 res
= vect_slp_analyze_node_operations_1 (vinfo
, node
, node_instance
,
3935 /* If analysis failed we have to pop all recursive visited nodes
3939 while (visited_vec
.length () >= visited_rec_start
)
3940 visited_set
.remove (visited_vec
.pop ());
3941 cost_vec
->truncate (cost_vec_rec_start
);
3944 /* When the node can be vectorized cost invariant nodes it references.
3945 This is not done in DFS order to allow the refering node
3946 vectorizable_* calls to nail down the invariant nodes vector type
3947 and possibly unshare it if it needs a different vector type than
3950 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
3952 && (SLP_TREE_DEF_TYPE (child
) == vect_constant_def
3953 || SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
3954 /* Perform usual caching, note code-generation still
3955 code-gens these nodes multiple times but we expect
3956 to CSE them later. */
3957 && !visited_set
.add (child
))
3959 visited_vec
.safe_push (child
);
3960 /* ??? After auditing more code paths make a "default"
3961 and push the vector type from NODE to all children
3962 if it is not already set. */
3963 /* Compute the number of vectors to be generated. */
3964 tree vector_type
= SLP_TREE_VECTYPE (child
);
3967 /* For shifts with a scalar argument we don't need
3968 to cost or code-generate anything.
3969 ??? Represent this more explicitely. */
3970 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node
))
3971 == shift_vec_info_type
)
3975 unsigned group_size
= SLP_TREE_LANES (child
);
3977 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3978 vf
= loop_vinfo
->vectorization_factor
;
3979 SLP_TREE_NUMBER_OF_VEC_STMTS (child
)
3980 = vect_get_num_vectors (vf
* group_size
, vector_type
);
3981 /* And cost them. */
3982 vect_prologue_cost_for_slp (child
, cost_vec
);
3985 /* If this node or any of its children can't be vectorized, try pruning
3986 the tree here rather than felling the whole thing. */
3987 if (!res
&& vect_slp_convert_to_external (vinfo
, node
, node_instance
))
3989 /* We'll need to revisit this for invariant costing and number
3990 of vectorized stmt setting. */
3998 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3999 region and that can be vectorized using vectorizable_live_operation
4000 with STMT_VINFO_LIVE_P. Not handled live operations will cause the
4001 scalar code computing it to be retained. */
4004 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo
, slp_tree node
,
4005 slp_instance instance
,
4006 stmt_vector_for_cost
*cost_vec
,
4007 hash_set
<stmt_vec_info
> &svisited
,
4008 hash_set
<slp_tree
> &visited
)
4010 if (visited
.add (node
))
4014 stmt_vec_info stmt_info
;
4015 stmt_vec_info last_stmt
= vect_find_last_scalar_stmt_in_slp (node
);
4016 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
4018 if (svisited
.contains (stmt_info
))
4020 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
4021 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
)
4022 && STMT_VINFO_RELATED_STMT (orig_stmt_info
) != stmt_info
)
4023 /* Only the pattern root stmt computes the original scalar value. */
4025 bool mark_visited
= true;
4026 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
4027 ssa_op_iter op_iter
;
4028 def_operand_p def_p
;
4029 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
4031 imm_use_iterator use_iter
;
4033 stmt_vec_info use_stmt_info
;
4034 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
4035 if (!is_gimple_debug (use_stmt
))
4037 use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
);
4039 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
4041 STMT_VINFO_LIVE_P (stmt_info
) = true;
4042 if (vectorizable_live_operation (bb_vinfo
, stmt_info
,
4043 NULL
, node
, instance
, i
,
4045 /* ??? So we know we can vectorize the live stmt
4046 from one SLP node. If we cannot do so from all
4047 or none consistently we'd have to record which
4048 SLP node (and lane) we want to use for the live
4049 operation. So make sure we can code-generate
4051 mark_visited
= false;
4053 STMT_VINFO_LIVE_P (stmt_info
) = false;
4057 /* We have to verify whether we can insert the lane extract
4058 before all uses. The following is a conservative approximation.
4059 We cannot put this into vectorizable_live_operation because
4060 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4062 Note that while the fact that we emit code for loads at the
4063 first load should make this a non-problem leafs we construct
4064 from scalars are vectorized after the last scalar def.
4065 ??? If we'd actually compute the insert location during
4066 analysis we could use sth less conservative than the last
4067 scalar stmt in the node for the dominance check. */
4068 /* ??? What remains is "live" uses in vector CTORs in the same
4069 SLP graph which is where those uses can end up code-generated
4070 right after their definition instead of close to their original
4071 use. But that would restrict us to code-generate lane-extracts
4072 from the latest stmt in a node. So we compensate for this
4073 during code-generation, simply not replacing uses for those
4074 hopefully rare cases. */
4075 if (STMT_VINFO_LIVE_P (stmt_info
))
4076 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
4077 if (!is_gimple_debug (use_stmt
)
4078 && (!(use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
))
4079 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
4080 && !vect_stmt_dominates_stmt_p (last_stmt
->stmt
, use_stmt
))
4082 if (dump_enabled_p ())
4083 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4084 "Cannot determine insertion place for "
4086 STMT_VINFO_LIVE_P (stmt_info
) = false;
4087 mark_visited
= true;
4091 svisited
.add (stmt_info
);
4095 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4096 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
4097 vect_bb_slp_mark_live_stmts (bb_vinfo
, child
, instance
,
4098 cost_vec
, svisited
, visited
);
4101 /* Analyze statements in SLP instances of VINFO. Return true if the
4102 operations are supported. */
4105 vect_slp_analyze_operations (vec_info
*vinfo
)
4107 slp_instance instance
;
4110 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4112 hash_set
<slp_tree
> visited
;
4113 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); )
4115 auto_vec
<slp_tree
> visited_vec
;
4116 stmt_vector_for_cost cost_vec
;
4117 cost_vec
.create (2);
4118 if (is_a
<bb_vec_info
> (vinfo
))
4119 vect_location
= instance
->location ();
4120 if (!vect_slp_analyze_node_operations (vinfo
,
4121 SLP_INSTANCE_TREE (instance
),
4122 instance
, visited
, visited_vec
,
4124 /* Instances with a root stmt require vectorized defs for the
4126 || (SLP_INSTANCE_ROOT_STMT (instance
)
4127 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance
))
4128 != vect_internal_def
)))
4130 slp_tree node
= SLP_INSTANCE_TREE (instance
);
4131 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
4132 if (dump_enabled_p ())
4133 dump_printf_loc (MSG_NOTE
, vect_location
,
4134 "removing SLP instance operations starting from: %G",
4136 vect_free_slp_instance (instance
);
4137 vinfo
->slp_instances
.ordered_remove (i
);
4138 cost_vec
.release ();
4139 while (!visited_vec
.is_empty ())
4140 visited
.remove (visited_vec
.pop ());
4146 /* For BB vectorization remember the SLP graph entry
4148 if (is_a
<bb_vec_info
> (vinfo
))
4149 instance
->cost_vec
= cost_vec
;
4152 add_stmt_costs (vinfo
, vinfo
->target_cost_data
, &cost_vec
);
4153 cost_vec
.release ();
4158 /* Compute vectorizable live stmts. */
4159 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
4161 hash_set
<stmt_vec_info
> svisited
;
4162 hash_set
<slp_tree
> visited
;
4163 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
4165 vect_location
= instance
->location ();
4166 vect_bb_slp_mark_live_stmts (bb_vinfo
, SLP_INSTANCE_TREE (instance
),
4167 instance
, &instance
->cost_vec
, svisited
,
4172 return !vinfo
->slp_instances
.is_empty ();
4175 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
4176 closing the eventual chain. */
4179 get_ultimate_leader (slp_instance instance
,
4180 hash_map
<slp_instance
, slp_instance
> &instance_leader
)
4182 auto_vec
<slp_instance
*, 8> chain
;
4184 while (*(tem
= instance_leader
.get (instance
)) != instance
)
4186 chain
.safe_push (tem
);
4189 while (!chain
.is_empty ())
4190 *chain
.pop () = instance
;
4194 /* Worker of vect_bb_partition_graph, recurse on NODE. */
4197 vect_bb_partition_graph_r (bb_vec_info bb_vinfo
,
4198 slp_instance instance
, slp_tree node
,
4199 hash_map
<stmt_vec_info
, slp_instance
> &stmt_to_instance
,
4200 hash_map
<slp_instance
, slp_instance
> &instance_leader
,
4201 hash_set
<slp_tree
> &visited
)
4203 stmt_vec_info stmt_info
;
4206 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
4209 slp_instance
&stmt_instance
4210 = stmt_to_instance
.get_or_insert (stmt_info
, &existed_p
);
4213 else if (stmt_instance
!= instance
)
4215 /* If we're running into a previously marked stmt make us the
4216 leader of the current ultimate leader. This keeps the
4217 leader chain acyclic and works even when the current instance
4218 connects two previously independent graph parts. */
4219 slp_instance stmt_leader
4220 = get_ultimate_leader (stmt_instance
, instance_leader
);
4221 if (stmt_leader
!= instance
)
4222 instance_leader
.put (stmt_leader
, instance
);
4224 stmt_instance
= instance
;
4227 if (visited
.add (node
))
4231 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4232 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
4233 vect_bb_partition_graph_r (bb_vinfo
, instance
, child
, stmt_to_instance
,
4234 instance_leader
, visited
);
4237 /* Partition the SLP graph into pieces that can be costed independently. */
4240 vect_bb_partition_graph (bb_vec_info bb_vinfo
)
4242 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
4244 /* First walk the SLP graph assigning each involved scalar stmt a
4245 corresponding SLP graph entry and upon visiting a previously
4246 marked stmt, make the stmts leader the current SLP graph entry. */
4247 hash_map
<stmt_vec_info
, slp_instance
> stmt_to_instance
;
4248 hash_map
<slp_instance
, slp_instance
> instance_leader
;
4249 hash_set
<slp_tree
> visited
;
4250 slp_instance instance
;
4251 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
4253 instance_leader
.put (instance
, instance
);
4254 vect_bb_partition_graph_r (bb_vinfo
,
4255 instance
, SLP_INSTANCE_TREE (instance
),
4256 stmt_to_instance
, instance_leader
,
4260 /* Then collect entries to each independent subgraph. */
4261 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
4263 slp_instance leader
= get_ultimate_leader (instance
, instance_leader
);
4264 leader
->subgraph_entries
.safe_push (instance
);
4265 if (dump_enabled_p ()
4266 && leader
!= instance
)
4267 dump_printf_loc (MSG_NOTE
, vect_location
,
4268 "instance %p is leader of %p\n",
4273 /* Compute the scalar cost of the SLP node NODE and its children
4274 and return it. Do not account defs that are marked in LIFE and
4275 update LIFE according to uses of NODE. */
4278 vect_bb_slp_scalar_cost (vec_info
*vinfo
,
4279 slp_tree node
, vec
<bool, va_heap
> *life
,
4280 stmt_vector_for_cost
*cost_vec
,
4281 hash_set
<slp_tree
> &visited
)
4284 stmt_vec_info stmt_info
;
4287 if (visited
.add (node
))
4290 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
4292 ssa_op_iter op_iter
;
4293 def_operand_p def_p
;
4298 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
4299 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
4301 /* If there is a non-vectorized use of the defs then the scalar
4302 stmt is kept live in which case we do not account it or any
4303 required defs in the SLP children in the scalar cost. This
4304 way we make the vectorization more costly when compared to
4306 if (!STMT_VINFO_LIVE_P (stmt_info
))
4308 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
4310 imm_use_iterator use_iter
;
4312 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
4313 if (!is_gimple_debug (use_stmt
))
4315 stmt_vec_info use_stmt_info
= vinfo
->lookup_stmt (use_stmt
);
4318 (vect_stmt_to_vectorize (use_stmt_info
)))
4329 /* Count scalar stmts only once. */
4330 if (gimple_visited_p (orig_stmt
))
4332 gimple_set_visited (orig_stmt
, true);
4334 vect_cost_for_stmt kind
;
4335 if (STMT_VINFO_DATA_REF (orig_stmt_info
))
4337 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info
)))
4340 kind
= scalar_store
;
4342 else if (vect_nop_conversion_p (orig_stmt_info
))
4344 /* For single-argument PHIs assume coalescing which means zero cost
4345 for the scalar and the vector PHIs. This avoids artificially
4346 favoring the vector path (but may pessimize it in some cases). */
4347 else if (is_a
<gphi
*> (orig_stmt_info
->stmt
)
4348 && gimple_phi_num_args
4349 (as_a
<gphi
*> (orig_stmt_info
->stmt
)) == 1)
4353 record_stmt_cost (cost_vec
, 1, kind
, orig_stmt_info
,
4354 SLP_TREE_VECTYPE (node
), 0, vect_body
);
4357 auto_vec
<bool, 20> subtree_life
;
4358 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4360 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
4362 /* Do not directly pass LIFE to the recursive call, copy it to
4363 confine changes in the callee to the current child/subtree. */
4364 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
4366 subtree_life
.safe_grow_cleared (SLP_TREE_LANES (child
), true);
4367 for (unsigned j
= 0;
4368 j
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++j
)
4370 auto perm
= SLP_TREE_LANE_PERMUTATION (node
)[j
];
4371 if (perm
.first
== i
)
4372 subtree_life
[perm
.second
] = (*life
)[j
];
4377 gcc_assert (SLP_TREE_LANES (node
) == SLP_TREE_LANES (child
));
4378 subtree_life
.safe_splice (*life
);
4380 vect_bb_slp_scalar_cost (vinfo
, child
, &subtree_life
, cost_vec
,
4382 subtree_life
.truncate (0);
4387 /* Comparator for the loop-index sorted cost vectors. */
4390 li_cost_vec_cmp (const void *a_
, const void *b_
)
4392 auto *a
= (const std::pair
<unsigned, stmt_info_for_cost
*> *)a_
;
4393 auto *b
= (const std::pair
<unsigned, stmt_info_for_cost
*> *)b_
;
4394 if (a
->first
< b
->first
)
4396 else if (a
->first
== b
->first
)
4401 /* Check if vectorization of the basic block is profitable for the
4402 subgraph denoted by SLP_INSTANCES. */
4405 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo
,
4406 vec
<slp_instance
> slp_instances
)
4408 slp_instance instance
;
4410 unsigned int vec_inside_cost
= 0, vec_outside_cost
= 0, scalar_cost
= 0;
4411 unsigned int vec_prologue_cost
= 0, vec_epilogue_cost
= 0;
4413 if (dump_enabled_p ())
4415 dump_printf_loc (MSG_NOTE
, vect_location
, "Costing subgraph: \n");
4416 hash_set
<slp_tree
> visited
;
4417 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
4418 vect_print_slp_graph (MSG_NOTE
, vect_location
,
4419 SLP_INSTANCE_TREE (instance
), visited
);
4422 /* Calculate scalar cost and sum the cost for the vector stmts
4423 previously collected. */
4424 stmt_vector_for_cost scalar_costs
= vNULL
;
4425 stmt_vector_for_cost vector_costs
= vNULL
;
4426 hash_set
<slp_tree
> visited
;
4427 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
4429 auto_vec
<bool, 20> life
;
4430 life
.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance
)),
4432 if (SLP_INSTANCE_ROOT_STMT (instance
))
4433 record_stmt_cost (&scalar_costs
, 1, scalar_stmt
,
4434 SLP_INSTANCE_ROOT_STMT (instance
), 0, vect_body
);
4435 vect_bb_slp_scalar_cost (bb_vinfo
,
4436 SLP_INSTANCE_TREE (instance
),
4437 &life
, &scalar_costs
, visited
);
4438 vector_costs
.safe_splice (instance
->cost_vec
);
4439 instance
->cost_vec
.release ();
4441 /* Unset visited flag. */
4442 stmt_info_for_cost
*cost
;
4443 FOR_EACH_VEC_ELT (scalar_costs
, i
, cost
)
4444 gimple_set_visited (cost
->stmt_info
->stmt
, false);
4446 if (dump_enabled_p ())
4447 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
4449 /* When costing non-loop vectorization we need to consider each covered
4450 loop independently and make sure vectorization is profitable. For
4451 now we assume a loop may be not entered or executed an arbitrary
4452 number of iterations (??? static information can provide more
4453 precise info here) which means we can simply cost each containing
4454 loops stmts separately. */
4456 /* First produce cost vectors sorted by loop index. */
4457 auto_vec
<std::pair
<unsigned, stmt_info_for_cost
*> >
4458 li_scalar_costs (scalar_costs
.length ());
4459 auto_vec
<std::pair
<unsigned, stmt_info_for_cost
*> >
4460 li_vector_costs (vector_costs
.length ());
4461 FOR_EACH_VEC_ELT (scalar_costs
, i
, cost
)
4463 unsigned l
= gimple_bb (cost
->stmt_info
->stmt
)->loop_father
->num
;
4464 li_scalar_costs
.quick_push (std::make_pair (l
, cost
));
4466 /* Use a random used loop as fallback in case the first vector_costs
4467 entry does not have a stmt_info associated with it. */
4468 unsigned l
= li_scalar_costs
[0].first
;
4469 FOR_EACH_VEC_ELT (vector_costs
, i
, cost
)
4471 /* We inherit from the previous COST, invariants, externals and
4472 extracts immediately follow the cost for the related stmt. */
4473 if (cost
->stmt_info
)
4474 l
= gimple_bb (cost
->stmt_info
->stmt
)->loop_father
->num
;
4475 li_vector_costs
.quick_push (std::make_pair (l
, cost
));
4477 li_scalar_costs
.qsort (li_cost_vec_cmp
);
4478 li_vector_costs
.qsort (li_cost_vec_cmp
);
4480 /* Now cost the portions individually. */
4483 while (si
< li_scalar_costs
.length ()
4484 && vi
< li_vector_costs
.length ())
4486 unsigned sl
= li_scalar_costs
[si
].first
;
4487 unsigned vl
= li_vector_costs
[vi
].first
;
4490 if (dump_enabled_p ())
4491 dump_printf_loc (MSG_NOTE
, vect_location
,
4492 "Scalar %d and vector %d loop part do not "
4493 "match up, skipping scalar part\n", sl
, vl
);
4494 /* Skip the scalar part, assuming zero cost on the vector side. */
4499 while (si
< li_scalar_costs
.length ()
4500 && li_scalar_costs
[si
].first
== sl
);
4504 void *scalar_target_cost_data
= init_cost (NULL
);
4507 add_stmt_cost (bb_vinfo
, scalar_target_cost_data
,
4508 li_scalar_costs
[si
].second
);
4511 while (si
< li_scalar_costs
.length ()
4512 && li_scalar_costs
[si
].first
== sl
);
4514 finish_cost (scalar_target_cost_data
, &dummy
, &scalar_cost
, &dummy
);
4515 destroy_cost_data (scalar_target_cost_data
);
4517 /* Complete the target-specific vector cost calculation. */
4518 void *vect_target_cost_data
= init_cost (NULL
);
4521 add_stmt_cost (bb_vinfo
, vect_target_cost_data
,
4522 li_vector_costs
[vi
].second
);
4525 while (vi
< li_vector_costs
.length ()
4526 && li_vector_costs
[vi
].first
== vl
);
4527 finish_cost (vect_target_cost_data
, &vec_prologue_cost
,
4528 &vec_inside_cost
, &vec_epilogue_cost
);
4529 destroy_cost_data (vect_target_cost_data
);
4531 vec_outside_cost
= vec_prologue_cost
+ vec_epilogue_cost
;
4533 if (dump_enabled_p ())
4535 dump_printf_loc (MSG_NOTE
, vect_location
,
4536 "Cost model analysis for part in loop %d:\n", sl
);
4537 dump_printf (MSG_NOTE
, " Vector cost: %d\n",
4538 vec_inside_cost
+ vec_outside_cost
);
4539 dump_printf (MSG_NOTE
, " Scalar cost: %d\n", scalar_cost
);
4542 /* Vectorization is profitable if its cost is more than the cost of scalar
4543 version. Note that we err on the vector side for equal cost because
4544 the cost estimate is otherwise quite pessimistic (constant uses are
4545 free on the scalar side but cost a load on the vector side for
4547 if (vec_outside_cost
+ vec_inside_cost
> scalar_cost
)
4549 scalar_costs
.release ();
4550 vector_costs
.release ();
4554 if (vi
< li_vector_costs
.length ())
4556 if (dump_enabled_p ())
4557 dump_printf_loc (MSG_NOTE
, vect_location
,
4558 "Excess vector cost for part in loop %d:\n",
4559 li_vector_costs
[vi
].first
);
4560 scalar_costs
.release ();
4561 vector_costs
.release ();
4565 scalar_costs
.release ();
4566 vector_costs
.release ();
4570 /* qsort comparator for lane defs. */
4573 vld_cmp (const void *a_
, const void *b_
)
4575 auto *a
= (const std::pair
<unsigned, tree
> *)a_
;
4576 auto *b
= (const std::pair
<unsigned, tree
> *)b_
;
4577 return a
->first
- b
->first
;
4580 /* Return true if USE_STMT is a vector lane insert into VEC and set
4581 *THIS_LANE to the lane number that is set. */
4584 vect_slp_is_lane_insert (gimple
*use_stmt
, tree vec
, unsigned *this_lane
)
4586 gassign
*use_ass
= dyn_cast
<gassign
*> (use_stmt
);
4588 || gimple_assign_rhs_code (use_ass
) != BIT_INSERT_EXPR
4590 ? gimple_assign_rhs1 (use_ass
) != vec
4591 : ((vec
= gimple_assign_rhs1 (use_ass
)), false))
4592 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec
)),
4593 TREE_TYPE (gimple_assign_rhs2 (use_ass
)))
4594 || !constant_multiple_p
4595 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass
)),
4596 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec
)))),
4602 /* Find any vectorizable constructors and add them to the grouped_store
4606 vect_slp_check_for_constructors (bb_vec_info bb_vinfo
)
4608 for (unsigned i
= 0; i
< bb_vinfo
->bbs
.length (); ++i
)
4609 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb_vinfo
->bbs
[i
]);
4610 !gsi_end_p (gsi
); gsi_next (&gsi
))
4612 gassign
*assign
= dyn_cast
<gassign
*> (gsi_stmt (gsi
));
4616 tree rhs
= gimple_assign_rhs1 (assign
);
4617 if (gimple_assign_rhs_code (assign
) == CONSTRUCTOR
)
4619 if (!VECTOR_TYPE_P (TREE_TYPE (rhs
))
4620 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)),
4621 CONSTRUCTOR_NELTS (rhs
))
4622 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs
, 0)->value
))
4623 || uniform_vector_p (rhs
))
4628 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), j
, val
)
4629 if (TREE_CODE (val
) != SSA_NAME
4630 || !bb_vinfo
->lookup_def (val
))
4632 if (j
!= CONSTRUCTOR_NELTS (rhs
))
4635 stmt_vec_info stmt_info
= bb_vinfo
->lookup_stmt (assign
);
4636 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
4638 else if (gimple_assign_rhs_code (assign
) == BIT_INSERT_EXPR
4639 && VECTOR_TYPE_P (TREE_TYPE (rhs
))
4640 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)).is_constant ()
4641 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)).to_constant () > 1
4642 && integer_zerop (gimple_assign_rhs3 (assign
))
4643 && useless_type_conversion_p
4644 (TREE_TYPE (TREE_TYPE (rhs
)),
4645 TREE_TYPE (gimple_assign_rhs2 (assign
)))
4646 && bb_vinfo
->lookup_def (gimple_assign_rhs2 (assign
)))
4648 /* We start to match on insert to lane zero but since the
4649 inserts need not be ordered we'd have to search both
4650 the def and the use chains. */
4651 tree vectype
= TREE_TYPE (rhs
);
4652 unsigned nlanes
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
4653 auto_vec
<std::pair
<unsigned, tree
> > lane_defs (nlanes
);
4654 auto_sbitmap
lanes (nlanes
);
4655 bitmap_clear (lanes
);
4656 bitmap_set_bit (lanes
, 0);
4657 tree def
= gimple_assign_lhs (assign
);
4658 lane_defs
.quick_push
4659 (std::make_pair (0, gimple_assign_rhs2 (assign
)));
4660 unsigned lanes_found
= 1;
4661 /* Start with the use chains, the last stmt will be the root. */
4662 stmt_vec_info last
= bb_vinfo
->lookup_stmt (assign
);
4665 use_operand_p use_p
;
4667 if (!single_imm_use (def
, &use_p
, &use_stmt
))
4670 if (!bb_vinfo
->lookup_stmt (use_stmt
)
4671 || !vect_slp_is_lane_insert (use_stmt
, def
, &this_lane
)
4672 || !bb_vinfo
->lookup_def (gimple_assign_rhs2 (use_stmt
)))
4674 if (bitmap_bit_p (lanes
, this_lane
))
4677 bitmap_set_bit (lanes
, this_lane
);
4678 gassign
*use_ass
= as_a
<gassign
*> (use_stmt
);
4679 lane_defs
.quick_push (std::make_pair
4680 (this_lane
, gimple_assign_rhs2 (use_ass
)));
4681 last
= bb_vinfo
->lookup_stmt (use_ass
);
4682 def
= gimple_assign_lhs (use_ass
);
4684 while (lanes_found
< nlanes
);
4685 if (lanes_found
< nlanes
)
4687 /* Now search the def chain. */
4688 def
= gimple_assign_rhs1 (assign
);
4691 if (TREE_CODE (def
) != SSA_NAME
4692 || !has_single_use (def
))
4694 gimple
*def_stmt
= SSA_NAME_DEF_STMT (def
);
4696 if (!bb_vinfo
->lookup_stmt (def_stmt
)
4697 || !vect_slp_is_lane_insert (def_stmt
,
4698 NULL_TREE
, &this_lane
)
4699 || !bb_vinfo
->lookup_def (gimple_assign_rhs2 (def_stmt
)))
4701 if (bitmap_bit_p (lanes
, this_lane
))
4704 bitmap_set_bit (lanes
, this_lane
);
4705 lane_defs
.quick_push (std::make_pair
4707 gimple_assign_rhs2 (def_stmt
)));
4708 def
= gimple_assign_rhs1 (def_stmt
);
4710 while (lanes_found
< nlanes
);
4712 if (lanes_found
== nlanes
)
4714 /* Sort lane_defs after the lane index and register the root. */
4715 lane_defs
.qsort (vld_cmp
);
4716 vec
<stmt_vec_info
> stmts
;
4717 stmts
.create (nlanes
);
4718 for (unsigned i
= 0; i
< nlanes
; ++i
)
4719 stmts
.quick_push (bb_vinfo
->lookup_def (lane_defs
[i
].second
));
4720 bb_vinfo
->roots
.safe_push (slp_root (slp_inst_kind_ctor
,
4727 /* Walk the grouped store chains and replace entries with their
4728 pattern variant if any. */
4731 vect_fixup_store_groups_with_patterns (vec_info
*vinfo
)
4733 stmt_vec_info first_element
;
4736 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
4738 /* We also have CTORs in this array. */
4739 if (!STMT_VINFO_GROUPED_ACCESS (first_element
))
4741 if (STMT_VINFO_IN_PATTERN_P (first_element
))
4743 stmt_vec_info orig
= first_element
;
4744 first_element
= STMT_VINFO_RELATED_STMT (first_element
);
4745 DR_GROUP_FIRST_ELEMENT (first_element
) = first_element
;
4746 DR_GROUP_SIZE (first_element
) = DR_GROUP_SIZE (orig
);
4747 DR_GROUP_GAP (first_element
) = DR_GROUP_GAP (orig
);
4748 DR_GROUP_NEXT_ELEMENT (first_element
) = DR_GROUP_NEXT_ELEMENT (orig
);
4749 vinfo
->grouped_stores
[i
] = first_element
;
4751 stmt_vec_info prev
= first_element
;
4752 while (DR_GROUP_NEXT_ELEMENT (prev
))
4754 stmt_vec_info elt
= DR_GROUP_NEXT_ELEMENT (prev
);
4755 if (STMT_VINFO_IN_PATTERN_P (elt
))
4757 stmt_vec_info orig
= elt
;
4758 elt
= STMT_VINFO_RELATED_STMT (elt
);
4759 DR_GROUP_NEXT_ELEMENT (prev
) = elt
;
4760 DR_GROUP_GAP (elt
) = DR_GROUP_GAP (orig
);
4761 DR_GROUP_NEXT_ELEMENT (elt
) = DR_GROUP_NEXT_ELEMENT (orig
);
4763 DR_GROUP_FIRST_ELEMENT (elt
) = first_element
;
4769 /* Check if the region described by BB_VINFO can be vectorized, returning
4770 true if so. When returning false, set FATAL to true if the same failure
4771 would prevent vectorization at other vector sizes, false if it is still
4772 worth trying other sizes. N_STMTS is the number of statements in the
4776 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo
, int n_stmts
, bool &fatal
,
4777 vec
<int> *dataref_groups
)
4779 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4781 slp_instance instance
;
4783 poly_uint64 min_vf
= 2;
4785 /* The first group of checks is independent of the vector size. */
4788 /* Analyze the data references. */
4790 if (!vect_analyze_data_refs (bb_vinfo
, &min_vf
, NULL
))
4792 if (dump_enabled_p ())
4793 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4794 "not vectorized: unhandled data-ref in basic "
4799 if (!vect_analyze_data_ref_accesses (bb_vinfo
, dataref_groups
))
4801 if (dump_enabled_p ())
4802 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4803 "not vectorized: unhandled data access in "
4808 vect_slp_check_for_constructors (bb_vinfo
);
4810 /* If there are no grouped stores and no constructors in the region
4811 there is no need to continue with pattern recog as vect_analyze_slp
4812 will fail anyway. */
4813 if (bb_vinfo
->grouped_stores
.is_empty ()
4814 && bb_vinfo
->roots
.is_empty ())
4816 if (dump_enabled_p ())
4817 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4818 "not vectorized: no grouped stores in "
4823 /* While the rest of the analysis below depends on it in some way. */
4826 vect_pattern_recog (bb_vinfo
);
4828 /* Update store groups from pattern processing. */
4829 vect_fixup_store_groups_with_patterns (bb_vinfo
);
4831 /* Check the SLP opportunities in the basic block, analyze and build SLP
4833 if (!vect_analyze_slp (bb_vinfo
, n_stmts
))
4835 if (dump_enabled_p ())
4837 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4838 "Failed to SLP the basic block.\n");
4839 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4840 "not vectorized: failed to find SLP opportunities "
4841 "in basic block.\n");
4846 /* Optimize permutations. */
4847 vect_optimize_slp (bb_vinfo
);
4849 /* Gather the loads reachable from the SLP graph entries. */
4850 vect_gather_slp_loads (bb_vinfo
);
4852 vect_record_base_alignments (bb_vinfo
);
4854 /* Analyze and verify the alignment of data references and the
4855 dependence in the SLP instances. */
4856 for (i
= 0; BB_VINFO_SLP_INSTANCES (bb_vinfo
).iterate (i
, &instance
); )
4858 vect_location
= instance
->location ();
4859 if (! vect_slp_analyze_instance_alignment (bb_vinfo
, instance
)
4860 || ! vect_slp_analyze_instance_dependence (bb_vinfo
, instance
))
4862 slp_tree node
= SLP_INSTANCE_TREE (instance
);
4863 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
4864 if (dump_enabled_p ())
4865 dump_printf_loc (MSG_NOTE
, vect_location
,
4866 "removing SLP instance operations starting from: %G",
4868 vect_free_slp_instance (instance
);
4869 BB_VINFO_SLP_INSTANCES (bb_vinfo
).ordered_remove (i
);
4873 /* Mark all the statements that we want to vectorize as pure SLP and
4875 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
4876 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance
));
4877 if (stmt_vec_info root
= SLP_INSTANCE_ROOT_STMT (instance
))
4879 STMT_SLP_TYPE (root
) = pure_slp
;
4880 if (is_gimple_assign (root
->stmt
)
4881 && gimple_assign_rhs_code (root
->stmt
) == BIT_INSERT_EXPR
)
4883 /* ??? We should probably record the whole vector of
4884 root stmts so we do not have to back-track here... */
4885 for (unsigned n
= SLP_TREE_LANES (SLP_INSTANCE_TREE (instance
));
4888 root
= bb_vinfo
->lookup_def (gimple_assign_rhs1 (root
->stmt
));
4889 STMT_SLP_TYPE (root
) = pure_slp
;
4896 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo
).length ())
4899 if (!vect_slp_analyze_operations (bb_vinfo
))
4901 if (dump_enabled_p ())
4902 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4903 "not vectorized: bad operation in basic block.\n");
4907 vect_bb_partition_graph (bb_vinfo
);
4912 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4913 basic blocks in BBS, returning true on success.
4914 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4917 vect_slp_region (vec
<basic_block
> bbs
, vec
<data_reference_p
> datarefs
,
4918 vec
<int> *dataref_groups
, unsigned int n_stmts
)
4920 bb_vec_info bb_vinfo
;
4921 auto_vector_modes vector_modes
;
4923 /* Autodetect first vector size we try. */
4924 machine_mode next_vector_mode
= VOIDmode
;
4925 targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
, false);
4926 unsigned int mode_i
= 0;
4928 vec_info_shared shared
;
4930 machine_mode autodetected_vector_mode
= VOIDmode
;
4933 bool vectorized
= false;
4935 bb_vinfo
= new _bb_vec_info (bbs
, &shared
);
4937 bool first_time_p
= shared
.datarefs
.is_empty ();
4938 BB_VINFO_DATAREFS (bb_vinfo
) = datarefs
;
4940 bb_vinfo
->shared
->save_datarefs ();
4942 bb_vinfo
->shared
->check_datarefs ();
4943 bb_vinfo
->vector_mode
= next_vector_mode
;
4945 if (vect_slp_analyze_bb_1 (bb_vinfo
, n_stmts
, fatal
, dataref_groups
))
4947 if (dump_enabled_p ())
4949 dump_printf_loc (MSG_NOTE
, vect_location
,
4950 "***** Analysis succeeded with vector mode"
4951 " %s\n", GET_MODE_NAME (bb_vinfo
->vector_mode
));
4952 dump_printf_loc (MSG_NOTE
, vect_location
, "SLPing BB part\n");
4955 bb_vinfo
->shared
->check_datarefs ();
4958 slp_instance instance
;
4959 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo
), i
, instance
)
4961 if (instance
->subgraph_entries
.is_empty ())
4964 vect_location
= instance
->location ();
4965 if (!unlimited_cost_model (NULL
)
4966 && !vect_bb_vectorization_profitable_p
4967 (bb_vinfo
, instance
->subgraph_entries
))
4969 if (dump_enabled_p ())
4970 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4971 "not vectorized: vectorization is not "
4976 if (!dbg_cnt (vect_slp
))
4979 if (!vectorized
&& dump_enabled_p ())
4980 dump_printf_loc (MSG_NOTE
, vect_location
,
4981 "Basic block will be vectorized "
4985 vect_schedule_slp (bb_vinfo
, instance
->subgraph_entries
);
4987 unsigned HOST_WIDE_INT bytes
;
4988 if (dump_enabled_p ())
4991 (bb_vinfo
->vector_mode
).is_constant (&bytes
))
4992 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
4993 "basic block part vectorized using %wu "
4994 "byte vectors\n", bytes
);
4996 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
4997 "basic block part vectorized using "
4998 "variable length vectors\n");
5004 if (dump_enabled_p ())
5005 dump_printf_loc (MSG_NOTE
, vect_location
,
5006 "***** Analysis failed with vector mode %s\n",
5007 GET_MODE_NAME (bb_vinfo
->vector_mode
));
5011 autodetected_vector_mode
= bb_vinfo
->vector_mode
;
5014 while (mode_i
< vector_modes
.length ()
5015 && vect_chooses_same_modes_p (bb_vinfo
, vector_modes
[mode_i
]))
5017 if (dump_enabled_p ())
5018 dump_printf_loc (MSG_NOTE
, vect_location
,
5019 "***** The result for vector mode %s would"
5021 GET_MODE_NAME (vector_modes
[mode_i
]));
5027 if (mode_i
< vector_modes
.length ()
5028 && VECTOR_MODE_P (autodetected_vector_mode
)
5029 && (related_vector_mode (vector_modes
[mode_i
],
5030 GET_MODE_INNER (autodetected_vector_mode
))
5031 == autodetected_vector_mode
)
5032 && (related_vector_mode (autodetected_vector_mode
,
5033 GET_MODE_INNER (vector_modes
[mode_i
]))
5034 == vector_modes
[mode_i
]))
5036 if (dump_enabled_p ())
5037 dump_printf_loc (MSG_NOTE
, vect_location
,
5038 "***** Skipping vector mode %s, which would"
5039 " repeat the analysis for %s\n",
5040 GET_MODE_NAME (vector_modes
[mode_i
]),
5041 GET_MODE_NAME (autodetected_vector_mode
));
5046 || mode_i
== vector_modes
.length ()
5047 || autodetected_vector_mode
== VOIDmode
5048 /* If vect_slp_analyze_bb_1 signaled that analysis for all
5049 vector sizes will fail do not bother iterating. */
5053 /* Try the next biggest vector size. */
5054 next_vector_mode
= vector_modes
[mode_i
++];
5055 if (dump_enabled_p ())
5056 dump_printf_loc (MSG_NOTE
, vect_location
,
5057 "***** Re-trying analysis with vector mode %s\n",
5058 GET_MODE_NAME (next_vector_mode
));
5063 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
5064 true if anything in the basic-block was vectorized. */
5067 vect_slp_bbs (vec
<basic_block
> bbs
)
5069 vec
<data_reference_p
> datarefs
= vNULL
;
5070 auto_vec
<int> dataref_groups
;
5072 int current_group
= 0;
5074 for (unsigned i
= 0; i
< bbs
.length (); i
++)
5076 basic_block bb
= bbs
[i
];
5077 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
5080 gimple
*stmt
= gsi_stmt (gsi
);
5081 if (is_gimple_debug (stmt
))
5086 if (gimple_location (stmt
) != UNKNOWN_LOCATION
)
5087 vect_location
= stmt
;
5089 if (!vect_find_stmt_data_reference (NULL
, stmt
, &datarefs
,
5090 &dataref_groups
, current_group
))
5095 return vect_slp_region (bbs
, datarefs
, &dataref_groups
, insns
);
5098 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5099 true if anything in the basic-block was vectorized. */
5102 vect_slp_bb (basic_block bb
)
5104 auto_vec
<basic_block
> bbs
;
5106 return vect_slp_bbs (bbs
);
5109 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5110 true if anything in the basic-block was vectorized. */
5113 vect_slp_function (function
*fun
)
5116 int *rpo
= XNEWVEC (int, n_basic_blocks_for_fn (fun
));
5117 unsigned n
= pre_and_rev_post_order_compute_fn (fun
, NULL
, rpo
, false);
5119 /* For the moment split the function into pieces to avoid making
5120 the iteration on the vector mode moot. Split at points we know
5121 to not handle well which is CFG merges (SLP discovery doesn't
5122 handle non-loop-header PHIs) and loop exits. Since pattern
5123 recog requires reverse iteration to visit uses before defs
5124 simply chop RPO into pieces. */
5125 auto_vec
<basic_block
> bbs
;
5126 for (unsigned i
= 0; i
< n
; i
++)
5128 basic_block bb
= BASIC_BLOCK_FOR_FN (fun
, rpo
[i
]);
5131 /* Split when a BB is not dominated by the first block. */
5132 if (!bbs
.is_empty ()
5133 && !dominated_by_p (CDI_DOMINATORS
, bb
, bbs
[0]))
5135 if (dump_enabled_p ())
5136 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5137 "splitting region at dominance boundary bb%d\n",
5141 /* Split when the loop determined by the first block
5142 is exited. This is because we eventually insert
5143 invariants at region begin. */
5144 else if (!bbs
.is_empty ()
5145 && bbs
[0]->loop_father
!= bb
->loop_father
5146 && !flow_loop_nested_p (bbs
[0]->loop_father
, bb
->loop_father
))
5148 if (dump_enabled_p ())
5149 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5150 "splitting region at loop %d exit at bb%d\n",
5151 bbs
[0]->loop_father
->num
, bb
->index
);
5155 if (split
&& !bbs
.is_empty ())
5157 r
|= vect_slp_bbs (bbs
);
5159 bbs
.quick_push (bb
);
5164 /* When we have a stmt ending this block and defining a
5165 value we have to insert on edges when inserting after it for
5166 a vector containing its definition. Avoid this for now. */
5167 if (gimple
*last
= last_stmt (bb
))
5168 if (gimple_get_lhs (last
)
5169 && is_ctrl_altering_stmt (last
))
5171 if (dump_enabled_p ())
5172 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5173 "splitting region at control altering "
5174 "definition %G", last
);
5175 r
|= vect_slp_bbs (bbs
);
5180 if (!bbs
.is_empty ())
5181 r
|= vect_slp_bbs (bbs
);
5188 /* Build a variable-length vector in which the elements in ELTS are repeated
5189 to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
5190 RESULTS and add any new instructions to SEQ.
5192 The approach we use is:
5194 (1) Find a vector mode VM with integer elements of mode IM.
5196 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5197 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
5198 from small vectors to IM.
5200 (3) Duplicate each ELTS'[I] into a vector of mode VM.
5202 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
5203 correct byte contents.
5205 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
5207 We try to find the largest IM for which this sequence works, in order
5208 to cut down on the number of interleaves. */
5211 duplicate_and_interleave (vec_info
*vinfo
, gimple_seq
*seq
, tree vector_type
,
5212 vec
<tree
> elts
, unsigned int nresults
,
5215 unsigned int nelts
= elts
.length ();
5216 tree element_type
= TREE_TYPE (vector_type
);
5218 /* (1) Find a vector mode VM with integer elements of mode IM. */
5219 unsigned int nvectors
= 1;
5220 tree new_vector_type
;
5222 if (!can_duplicate_and_interleave_p (vinfo
, nelts
, element_type
,
5223 &nvectors
, &new_vector_type
,
5227 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
5228 unsigned int partial_nelts
= nelts
/ nvectors
;
5229 tree partial_vector_type
= build_vector_type (element_type
, partial_nelts
);
5231 tree_vector_builder partial_elts
;
5232 auto_vec
<tree
, 32> pieces (nvectors
* 2);
5233 pieces
.quick_grow_cleared (nvectors
* 2);
5234 for (unsigned int i
= 0; i
< nvectors
; ++i
)
5236 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5237 ELTS' has mode IM. */
5238 partial_elts
.new_vector (partial_vector_type
, partial_nelts
, 1);
5239 for (unsigned int j
= 0; j
< partial_nelts
; ++j
)
5240 partial_elts
.quick_push (elts
[i
* partial_nelts
+ j
]);
5241 tree t
= gimple_build_vector (seq
, &partial_elts
);
5242 t
= gimple_build (seq
, VIEW_CONVERT_EXPR
,
5243 TREE_TYPE (new_vector_type
), t
);
5245 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
5246 pieces
[i
] = gimple_build_vector_from_val (seq
, new_vector_type
, t
);
5249 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
5250 correct byte contents.
5252 Conceptually, we need to repeat the following operation log2(nvectors)
5253 times, where hi_start = nvectors / 2:
5255 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
5256 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
5258 However, if each input repeats every N elements and the VF is
5259 a multiple of N * 2, the HI result is the same as the LO result.
5260 This will be true for the first N1 iterations of the outer loop,
5261 followed by N2 iterations for which both the LO and HI results
5264 N1 + N2 = log2(nvectors)
5266 Each "N1 iteration" doubles the number of redundant vectors and the
5267 effect of the process as a whole is to have a sequence of nvectors/2**N1
5268 vectors that repeats 2**N1 times. Rather than generate these redundant
5269 vectors, we halve the number of vectors for each N1 iteration. */
5270 unsigned int in_start
= 0;
5271 unsigned int out_start
= nvectors
;
5272 unsigned int new_nvectors
= nvectors
;
5273 for (unsigned int in_repeat
= 1; in_repeat
< nvectors
; in_repeat
*= 2)
5275 unsigned int hi_start
= new_nvectors
/ 2;
5276 unsigned int out_i
= 0;
5277 for (unsigned int in_i
= 0; in_i
< new_nvectors
; ++in_i
)
5280 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type
),
5284 tree output
= make_ssa_name (new_vector_type
);
5285 tree input1
= pieces
[in_start
+ (in_i
/ 2)];
5286 tree input2
= pieces
[in_start
+ (in_i
/ 2) + hi_start
];
5287 gassign
*stmt
= gimple_build_assign (output
, VEC_PERM_EXPR
,
5289 permutes
[in_i
& 1]);
5290 gimple_seq_add_stmt (seq
, stmt
);
5291 pieces
[out_start
+ out_i
] = output
;
5294 std::swap (in_start
, out_start
);
5295 new_nvectors
= out_i
;
5298 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
5299 results
.reserve (nresults
);
5300 for (unsigned int i
= 0; i
< nresults
; ++i
)
5301 if (i
< new_nvectors
)
5302 results
.quick_push (gimple_build (seq
, VIEW_CONVERT_EXPR
, vector_type
,
5303 pieces
[in_start
+ i
]));
5305 results
.quick_push (results
[i
- new_nvectors
]);
5309 /* For constant and loop invariant defs in OP_NODE this function creates
5310 vector defs that will be used in the vectorized stmts and stores them
5311 to SLP_TREE_VEC_DEFS of OP_NODE. */
5314 vect_create_constant_vectors (vec_info
*vinfo
, slp_tree op_node
)
5316 unsigned HOST_WIDE_INT nunits
;
5318 unsigned j
, number_of_places_left_in_vector
;
5321 int group_size
= op_node
->ops
.length ();
5322 unsigned int vec_num
, i
;
5323 unsigned number_of_copies
= 1;
5325 gimple_seq ctor_seq
= NULL
;
5326 auto_vec
<tree
, 16> permute_results
;
5328 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
5329 vector_type
= SLP_TREE_VECTYPE (op_node
);
5331 unsigned int number_of_vectors
= SLP_TREE_NUMBER_OF_VEC_STMTS (op_node
);
5332 SLP_TREE_VEC_DEFS (op_node
).create (number_of_vectors
);
5333 auto_vec
<tree
> voprnds (number_of_vectors
);
5335 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5336 created vectors. It is greater than 1 if unrolling is performed.
5338 For example, we have two scalar operands, s1 and s2 (e.g., group of
5339 strided accesses of size two), while NUNITS is four (i.e., four scalars
5340 of this type can be packed in a vector). The output vector will contain
5341 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5344 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
5345 containing the operands.
5347 For example, NUNITS is four as before, and the group size is 8
5348 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5349 {s5, s6, s7, s8}. */
5351 /* When using duplicate_and_interleave, we just need one element for
5352 each scalar statement. */
5353 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
5354 nunits
= group_size
;
5356 number_of_copies
= nunits
* number_of_vectors
/ group_size
;
5358 number_of_places_left_in_vector
= nunits
;
5360 tree_vector_builder
elts (vector_type
, nunits
, 1);
5361 elts
.quick_grow (nunits
);
5362 stmt_vec_info insert_after
= NULL
;
5363 for (j
= 0; j
< number_of_copies
; j
++)
5366 for (i
= group_size
- 1; op_node
->ops
.iterate (i
, &op
); i
--)
5368 /* Create 'vect_ = {op0,op1,...,opn}'. */
5369 number_of_places_left_in_vector
--;
5371 if (!types_compatible_p (TREE_TYPE (vector_type
), TREE_TYPE (op
)))
5373 if (CONSTANT_CLASS_P (op
))
5375 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
5377 /* Can't use VIEW_CONVERT_EXPR for booleans because
5378 of possibly different sizes of scalar value and
5380 if (integer_zerop (op
))
5381 op
= build_int_cst (TREE_TYPE (vector_type
), 0);
5382 else if (integer_onep (op
))
5383 op
= build_all_ones_cst (TREE_TYPE (vector_type
));
5388 op
= fold_unary (VIEW_CONVERT_EXPR
,
5389 TREE_TYPE (vector_type
), op
);
5390 gcc_assert (op
&& CONSTANT_CLASS_P (op
));
5394 tree new_temp
= make_ssa_name (TREE_TYPE (vector_type
));
5396 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
5399 = build_all_ones_cst (TREE_TYPE (vector_type
));
5401 = build_zero_cst (TREE_TYPE (vector_type
));
5402 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op
)));
5403 init_stmt
= gimple_build_assign (new_temp
, COND_EXPR
,
5409 op
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (vector_type
),
5412 = gimple_build_assign (new_temp
, VIEW_CONVERT_EXPR
,
5415 gimple_seq_add_stmt (&ctor_seq
, init_stmt
);
5419 elts
[number_of_places_left_in_vector
] = op
;
5420 if (!CONSTANT_CLASS_P (op
))
5422 /* For BB vectorization we have to compute an insert location
5423 when a def is inside the analyzed region since we cannot
5424 simply insert at the BB start in this case. */
5425 stmt_vec_info opdef
;
5426 if (TREE_CODE (orig_op
) == SSA_NAME
5427 && !SSA_NAME_IS_DEFAULT_DEF (orig_op
)
5428 && is_a
<bb_vec_info
> (vinfo
)
5429 && (opdef
= vinfo
->lookup_def (orig_op
)))
5432 insert_after
= opdef
;
5434 insert_after
= get_later_stmt (insert_after
, opdef
);
5437 if (number_of_places_left_in_vector
== 0)
5440 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
5441 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
5442 vec_cst
= gimple_build_vector (&ctor_seq
, &elts
);
5445 if (permute_results
.is_empty ())
5446 duplicate_and_interleave (vinfo
, &ctor_seq
, vector_type
,
5447 elts
, number_of_vectors
,
5449 vec_cst
= permute_results
[number_of_vectors
- j
- 1];
5451 if (!gimple_seq_empty_p (ctor_seq
))
5455 gimple_stmt_iterator gsi
;
5456 if (gimple_code (insert_after
->stmt
) == GIMPLE_PHI
)
5458 gsi
= gsi_after_labels (gimple_bb (insert_after
->stmt
));
5459 gsi_insert_seq_before (&gsi
, ctor_seq
,
5460 GSI_CONTINUE_LINKING
);
5462 else if (!stmt_ends_bb_p (insert_after
->stmt
))
5464 gsi
= gsi_for_stmt (insert_after
->stmt
);
5465 gsi_insert_seq_after (&gsi
, ctor_seq
,
5466 GSI_CONTINUE_LINKING
);
5470 /* When we want to insert after a def where the
5471 defining stmt throws then insert on the fallthru
5473 edge e
= find_fallthru_edge
5474 (gimple_bb (insert_after
->stmt
)->succs
);
5476 = gsi_insert_seq_on_edge_immediate (e
, ctor_seq
);
5477 gcc_assert (!new_bb
);
5481 vinfo
->insert_seq_on_entry (NULL
, ctor_seq
);
5484 voprnds
.quick_push (vec_cst
);
5485 insert_after
= NULL
;
5486 number_of_places_left_in_vector
= nunits
;
5488 elts
.new_vector (vector_type
, nunits
, 1);
5489 elts
.quick_grow (nunits
);
5494 /* Since the vectors are created in the reverse order, we should invert
5496 vec_num
= voprnds
.length ();
5497 for (j
= vec_num
; j
!= 0; j
--)
5499 vop
= voprnds
[j
- 1];
5500 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
5503 /* In case that VF is greater than the unrolling factor needed for the SLP
5504 group of stmts, NUMBER_OF_VECTORS to be created is greater than
5505 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
5506 to replicate the vectors. */
5507 while (number_of_vectors
> SLP_TREE_VEC_DEFS (op_node
).length ())
5508 for (i
= 0; SLP_TREE_VEC_DEFS (op_node
).iterate (i
, &vop
) && i
< vec_num
;
5510 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
5513 /* Get the Ith vectorized definition from SLP_NODE. */
/* NOTE(review): this region is a lossy extraction of GCC's tree-vect-slp.c;
   original lines (return type, braces) are missing between the numbered
   fragments.  Comments below describe only the visible code.  */
5516 vect_get_slp_vect_def (slp_tree slp_node
, unsigned i
)
/* Internal defs are recorded as vectorized stmts -- hand back the LHS of
   the I'th one.  */
5518 if (SLP_TREE_VEC_STMTS (slp_node
).exists ())
5519 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node
)[i
]);
/* Otherwise the node carries pre-computed vectorized defs (trees)
   directly, e.g. for constants/externals.  */
5521 return SLP_TREE_VEC_DEFS (slp_node
)[i
];
5524 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
/* NOTE(review): lossy extraction -- intermediate original lines are
   missing between the numbered fragments; comments describe only the
   visible code.  */
5527 vect_get_slp_defs (slp_tree slp_node
, vec
<tree
> *vec_defs
)
/* Allocate the output vector sized to the number of vector stmts.  */
5529 vec_defs
->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
));
5530 if (SLP_TREE_DEF_TYPE (slp_node
) == vect_internal_def
)
/* Internal defs: collect the LHS of every vectorized stmt.  */
5533 gimple
*vec_def_stmt
;
5534 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node
), j
, vec_def_stmt
)
5535 vec_defs
->quick_push (gimple_get_lhs (vec_def_stmt
));
/* Non-internal defs: splice the recorded vectorized defs wholesale.  */
5538 vec_defs
->splice (SLP_TREE_VEC_DEFS (slp_node
));
5541 /* Get N vectorized definitions for SLP_NODE. */
/* NOTE(review): lossy extraction -- intermediate original lines are
   missing between the numbered fragments; comments describe only the
   visible code.  The vec_info parameter is unnamed (unused here).  */
5544 vect_get_slp_defs (vec_info
*,
5545 slp_tree slp_node
, vec
<vec
<tree
> > *vec_oprnds
, unsigned n
)
/* Presumably N defaults to "all children" when not given explicitly --
   the guard for that is in a missing line; TODO confirm against the
   original source.  */
5548 n
= SLP_TREE_CHILDREN (slp_node
).length ();
/* For each child collect its vectorized defs into a fresh vec<tree>
   and push that vector onto *VEC_OPRNDS (ownership transferred).  */
5550 for (unsigned i
= 0; i
< n
; ++i
)
5552 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
5553 vec
<tree
> vec_defs
= vNULL
;
5554 vect_get_slp_defs (child
, &vec_defs
);
5555 vec_oprnds
->quick_push (vec_defs
);
5559 /* Generate vector permute statements from a list of loads in DR_CHAIN.
5560 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
5561 permute statements for the SLP node NODE. Store the number of vector
5562 permute instructions in *N_PERMS and the number of vector load
5563 instructions in *N_LOADS. */
/* NOTE(review): this function text is a lossy extraction -- many original
   lines (braces, else-arms, some statements) are missing between the
   numbered fragments.  Comments below annotate only the visible phases;
   do not treat the fragment sequence as complete control flow.  */
5566 vect_transform_slp_perm_load (vec_info
*vinfo
,
5567 slp_tree node
, vec
<tree
> dr_chain
,
5568 gimple_stmt_iterator
*gsi
, poly_uint64 vf
,
5569 bool analyze_only
, unsigned *n_perms
,
5570 unsigned int *n_loads
)
/* Phase 1: gather the group description from the node's first scalar
   stmt (vectype, group size, the DR group leader).  */
5572 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
5574 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5575 unsigned int group_size
= SLP_TREE_SCALAR_STMTS (node
).length ();
5576 unsigned int mask_element
;
5579 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
5582 stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
5584 mode
= TYPE_MODE (vectype
);
5585 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
5587 /* Initialize the vect stmts of NODE to properly insert the generated
5590 for (unsigned i
= SLP_TREE_VEC_STMTS (node
).length ();
5591 i
< SLP_TREE_NUMBER_OF_VEC_STMTS (node
); i
++)
5592 SLP_TREE_VEC_STMTS (node
).quick_push (NULL
);
5594 /* Generate permutation masks for every NODE. Number of masks for each NODE
5595 is equal to GROUP_SIZE.
5596 E.g., we have a group of three nodes with three loads from the same
5597 location in each node, and the vector size is 4. I.e., we have a
5598 a0b0c0a1b1c1... sequence and we need to create the following vectors:
5599 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
5600 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
5603 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
5604 The last mask is illegal since we assume two operands for permute
5605 operation, and the mask element values can't be outside that range.
5606 Hence, the last mask must be converted into {2,5,5,5}.
5607 For the first two permutations we need the first and the second input
5608 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
5609 we need the second and the third vectors: {b1,c1,a2,b2} and
5612 int vect_stmts_counter
= 0;
5613 unsigned int index
= 0;
5614 int first_vec_index
= -1;
5615 int second_vec_index
= -1;
/* Phase 2: choose the mask-building strategy.  REPEATING_P holds when a
   single vector covers a whole number of group copies, so one encoded
   mask serves all vector stmts.  */
5619 vec_perm_builder mask
;
5620 unsigned int nelts_to_build
;
5621 unsigned int nvectors_per_build
;
5622 unsigned int in_nlanes
;
5623 bool repeating_p
= (group_size
== DR_GROUP_SIZE (stmt_info
)
5624 && multiple_p (nunits
, group_size
));
5627 /* A single vector contains a whole number of copies of the node, so:
5628 (a) all permutes can use the same mask; and
5629 (b) the permutes only need a single vector input. */
5630 mask
.new_vector (nunits
, group_size
, 3);
5631 nelts_to_build
= mask
.encoded_nelts ();
5632 nvectors_per_build
= SLP_TREE_VEC_STMTS (node
).length ();
5633 in_nlanes
= DR_GROUP_SIZE (stmt_info
) * 3;
5637 /* We need to construct a separate mask for each vector statement. */
5638 unsigned HOST_WIDE_INT const_nunits
, const_vf
;
/* Non-repeating case needs constant NUNITS and VF -- the bail-out for
   the non-constant case sits on a missing line; TODO confirm.  */
5639 if (!nunits
.is_constant (&const_nunits
)
5640 || !vf
.is_constant (&const_vf
))
5642 mask
.new_vector (const_nunits
, const_nunits
, 1);
5643 nelts_to_build
= const_vf
* group_size
;
5644 nvectors_per_build
= 1;
5645 in_nlanes
= const_vf
* DR_GROUP_SIZE (stmt_info
);
/* Track which input lanes are actually referenced so the number of
   required vector loads can be derived at the end.  */
5647 auto_sbitmap
used_in_lanes (in_nlanes
);
5648 bitmap_clear (used_in_lanes
);
5650 unsigned int count
= mask
.encoded_nelts ();
5651 mask
.quick_grow (count
);
5652 vec_perm_indices indices
;
/* Phase 3: walk all mask elements to build; map each group lane through
   the load permutation into (input vector index, lane-in-vector).  */
5654 for (unsigned int j
= 0; j
< nelts_to_build
; j
++)
5656 unsigned int iter_num
= j
/ group_size
;
5657 unsigned int stmt_num
= j
% group_size
;
5658 unsigned int i
= (iter_num
* DR_GROUP_SIZE (stmt_info
)
5659 + SLP_TREE_LOAD_PERMUTATION (node
)[stmt_num
]);
5660 bitmap_set_bit (used_in_lanes
, i
);
5663 first_vec_index
= 0;
5668 /* Enforced before the loop when !repeating_p. */
5669 unsigned int const_nunits
= nunits
.to_constant ();
5670 vec_index
= i
/ const_nunits
;
5671 mask_element
= i
% const_nunits
;
/* A two-operand VEC_PERM_EXPR can reference at most two input vectors;
   the second vector's lanes are addressed with an offset of NUNITS.  */
5672 if (vec_index
== first_vec_index
5673 || first_vec_index
== -1)
5675 first_vec_index
= vec_index
;
5677 else if (vec_index
== second_vec_index
5678 || second_vec_index
== -1)
5680 second_vec_index
= vec_index
;
5681 mask_element
+= const_nunits
;
/* Three distinct input vectors would be needed -- unsupported; this
   can only be reached during analysis.  */
5685 if (dump_enabled_p ())
5686 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5687 "permutation requires at "
5688 "least three vectors %G",
5690 gcc_assert (analyze_only
);
5694 gcc_assert (mask_element
< 2 * const_nunits
);
/* NOOP_P tracking (mask equal to identity) happens here -- its
   declaration/reset lines are missing from this extraction.  */
5697 if (mask_element
!= index
)
5699 mask
[index
++] = mask_element
;
/* Phase 4: a full mask has been built -- validate it against the
   target and, unless analyzing only, emit the VEC_PERM_EXPRs.  */
5701 if (index
== count
&& !noop_p
)
5703 indices
.new_vector (mask
, second_vec_index
== -1 ? 1 : 2, nunits
);
5704 if (!can_vec_perm_const_p (mode
, indices
))
5706 if (dump_enabled_p ())
5708 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
5710 "unsupported vect permute { ");
5711 for (i
= 0; i
< count
; ++i
)
5713 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
5714 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
5716 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
5718 gcc_assert (analyze_only
);
5729 tree mask_vec
= NULL_TREE
;
5732 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
/* With only one distinct input, use it for both VEC_PERM operands.  */
5734 if (second_vec_index
== -1)
5735 second_vec_index
= first_vec_index
;
5737 for (unsigned int ri
= 0; ri
< nvectors_per_build
; ++ri
)
5739 /* Generate the permute statement if necessary. */
5740 tree first_vec
= dr_chain
[first_vec_index
+ ri
];
5741 tree second_vec
= dr_chain
[second_vec_index
+ ri
];
5745 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
5747 = vect_create_destination_var (gimple_assign_lhs (stmt
),
5749 perm_dest
= make_ssa_name (perm_dest
);
5751 = gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
5752 first_vec
, second_vec
,
5754 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
,
5758 /* If mask was NULL_TREE generate the requested
5759 identity transform. */
5760 perm_stmt
= SSA_NAME_DEF_STMT (first_vec
);
5762 /* Store the vector statement in NODE. */
5763 SLP_TREE_VEC_STMTS (node
)[vect_stmts_counter
++] = perm_stmt
;
/* Reset state for building the next mask.  */
5768 first_vec_index
= -1;
5769 second_vec_index
= -1;
/* Phase 5: report counts back to the caller.  With REPEATING_P the
   number of loads equals the number of vector stmts ...  */
5777 *n_loads
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
5780 /* Enforced above when !repeating_p. */
5781 unsigned int const_nunits
= nunits
.to_constant ();
/* ... otherwise count the distinct input vectors actually touched by
   scanning USED_IN_LANES in NUNITS-sized chunks.  */
5783 bool load_seen
= false;
5784 for (unsigned i
= 0; i
< in_nlanes
; ++i
)
5786 if (i
% const_nunits
== 0)
5792 if (bitmap_bit_p (used_in_lanes
, i
))
5804 /* Vectorize the SLP permutations in NODE as specified
5805 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5806 child number and lane number.
5807 Interleaving of two two-lane two-child SLP subtrees (not supported):
5808 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5809 A blend of two four-lane two-child SLP subtrees:
5810 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5811 Highpart of a four-lane one-child SLP subtree (not supported):
5812 [ { 0, 2 }, { 0, 3 } ]
5813 Where currently only a subset is supported by code generating below. */
/* NOTE(review): lossy extraction -- many original lines (braces, early
   returns, some operands) are missing between the numbered fragments.
   When GSI is NULL this runs in analysis/costing mode (see the
   record_stmt_cost at the end); codegen paths presumably guard on GSI
   on missing lines -- TODO confirm against the original source.  */
5816 vectorizable_slp_permutation (vec_info
*vinfo
, gimple_stmt_iterator
*gsi
,
5817 slp_tree node
, stmt_vector_for_cost
*cost_vec
)
5819 tree vectype
= SLP_TREE_VECTYPE (node
);
5821 /* ??? We currently only support all same vector input and output types
5822 while the SLP IL should really do a concat + select and thus accept
5823 arbitrary mismatches. */
/* Reject children whose vectype cannot be made to match NODE's.  */
5826 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5827 if (!vect_maybe_update_slp_op_vectype (child
, vectype
)
5828 || !types_compatible_p (SLP_TREE_VECTYPE (child
), vectype
))
5830 if (dump_enabled_p ())
5831 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5832 "Unsupported lane permutation\n");
5836 vec
<std::pair
<unsigned, unsigned> > &perm
= SLP_TREE_LANE_PERMUTATION (node
);
5837 gcc_assert (perm
.length () == SLP_TREE_LANES (node
));
5838 if (dump_enabled_p ())
5840 dump_printf_loc (MSG_NOTE
, vect_location
,
5841 "vectorizing permutation");
5842 for (unsigned i
= 0; i
< perm
.length (); ++i
)
5843 dump_printf (MSG_NOTE
, " op%u[%u]", perm
[i
].first
, perm
[i
].second
);
5844 dump_printf (MSG_NOTE
, "\n");
/* Only constant-NUNITS vector types are handled below (to_constant is
   used unconditionally later); the non-constant bail-out body sits on
   a missing line.  */
5847 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
5848 if (!nunits
.is_constant ())
5850 unsigned HOST_WIDE_INT vf
= 1;
5851 if (loop_vec_info linfo
= dyn_cast
<loop_vec_info
> (vinfo
))
5852 if (!LOOP_VINFO_VECT_FACTOR (linfo
).is_constant (&vf
))
/* OLANES = total output lanes over all VF unrollings; must fill a whole
   number of output vectors.  */
5854 unsigned olanes
= vf
* SLP_TREE_LANES (node
);
5855 gcc_assert (multiple_p (olanes
, nunits
));
5857 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
5858 from the { SLP operand, scalar lane } permutation as recorded in the
5859 SLP node as intermediate step. This part should already work
5860 with SLP children with arbitrary number of lanes. */
5861 auto_vec
<std::pair
<std::pair
<unsigned, unsigned>, unsigned> > vperm
;
5862 auto_vec
<unsigned> active_lane
;
5863 vperm
.create (olanes
);
/* ACTIVE_LANE[child] tracks the first scalar lane of the current
   unroll copy for that child.  */
5864 active_lane
.safe_grow_cleared (SLP_TREE_CHILDREN (node
).length (), true);
5865 for (unsigned i
= 0; i
< vf
; ++i
)
5867 for (unsigned pi
= 0; pi
< perm
.length (); ++pi
)
5869 std::pair
<unsigned, unsigned> p
= perm
[pi
];
5870 tree vtype
= SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node
)[p
.first
]);
5871 unsigned vnunits
= TYPE_VECTOR_SUBPARTS (vtype
).to_constant ();
/* Split the scalar lane into (vector index VI, lane-in-vector VL).  */
5872 unsigned vi
= (active_lane
[p
.first
] + p
.second
) / vnunits
;
5873 unsigned vl
= (active_lane
[p
.first
] + p
.second
) % vnunits
;
5874 vperm
.quick_push (std::make_pair (std::make_pair (p
.first
, vi
), vl
));
5876 /* Advance to the next group. */
5877 for (unsigned j
= 0; j
< SLP_TREE_CHILDREN (node
).length (); ++j
)
5878 active_lane
[j
] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node
)[j
]);
5881 if (dump_enabled_p ())
5883 dump_printf_loc (MSG_NOTE
, vect_location
, "as");
5884 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
5886 if (i
!= 0 && multiple_p (i
, TYPE_VECTOR_SUBPARTS (vectype
)))
5887 dump_printf (MSG_NOTE
, ",");
5888 dump_printf (MSG_NOTE
, " vops%u[%u][%u]",
5889 vperm
[i
].first
.first
, vperm
[i
].first
.second
,
5892 dump_printf (MSG_NOTE
, "\n");
5895 /* We can only handle two-vector permutes, everything else should
5896 be lowered on the SLP level. The following is closely inspired
5897 by vect_transform_slp_perm_load and is supposed to eventually
5899 ??? As intermediate step do code-gen in the SLP tree representation
/* FIRST_VEC/SECOND_VEC identify the (child, vector-index) pairs used as
   the two VEC_PERM_EXPR inputs; (-1U,-1U) means "unassigned".  */
5901 std::pair
<unsigned, unsigned> first_vec
= std::make_pair (-1U, -1U);
5902 std::pair
<unsigned, unsigned> second_vec
= std::make_pair (-1U, -1U);
5903 unsigned int const_nunits
= nunits
.to_constant ();
5904 unsigned int index
= 0;
5905 unsigned int mask_element
;
5906 vec_perm_builder mask
;
5907 mask
.new_vector (const_nunits
, const_nunits
, 1);
5908 unsigned int count
= mask
.encoded_nelts ();
5909 mask
.quick_grow (count
);
5910 vec_perm_indices indices
;
5911 unsigned nperms
= 0;
5912 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
5914 mask_element
= vperm
[i
].second
;
/* Same two-input discipline as vect_transform_slp_perm_load: lanes of
   the second input get an offset of CONST_NUNITS.  */
5915 if (first_vec
.first
== -1U
5916 || first_vec
== vperm
[i
].first
)
5917 first_vec
= vperm
[i
].first
;
5918 else if (second_vec
.first
== -1U
5919 || second_vec
== vperm
[i
].first
)
5921 second_vec
= vperm
[i
].first
;
5922 mask_element
+= const_nunits
;
/* A third distinct input vector is unsupported.  */
5926 if (dump_enabled_p ())
5927 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5928 "permutation requires at "
5929 "least three vectors\n");
5934 mask
[index
++] = mask_element
;
/* A full mask was accumulated (the "index == count" guard is on a
   missing line); check target support unless it is the identity.  */
5938 indices
.new_vector (mask
, second_vec
.first
== -1U ? 1 : 2,
5940 bool identity_p
= indices
.series_p (0, 1, 0, 1);
5942 && !can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5944 if (dump_enabled_p ())
5946 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
5948 "unsupported vect permute { ");
5949 for (i
= 0; i
< count
; ++i
)
5951 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
5952 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
5954 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
/* Codegen: fetch the two input defs from the children, inserting
   VIEW_CONVERTs where the SLP IL allowed type punning.  */
5964 if (second_vec
.first
== -1U)
5965 second_vec
= first_vec
;
5967 /* Generate the permute statement if necessary. */
5968 slp_tree first_node
= SLP_TREE_CHILDREN (node
)[first_vec
.first
];
5970 = vect_get_slp_vect_def (first_node
, first_vec
.second
);
5971 /* ??? We SLP match existing vector element extracts but
5972 allow punning which we need to re-instantiate at uses
5973 but have no good way of explicitely representing. */
5974 if (!types_compatible_p (TREE_TYPE (first_def
), vectype
))
5977 conv_stmt
= gimple_build_assign (make_ssa_name (vectype
),
5978 build1 (VIEW_CONVERT_EXPR
,
5979 vectype
, first_def
));
5980 vect_finish_stmt_generation (vinfo
, NULL
, conv_stmt
, gsi
);
5981 first_def
= gimple_assign_lhs (conv_stmt
);
5984 tree perm_dest
= make_ssa_name (vectype
);
5987 slp_tree second_node
5988 = SLP_TREE_CHILDREN (node
)[second_vec
.first
];
5990 = vect_get_slp_vect_def (second_node
, second_vec
.second
);
5991 if (!types_compatible_p (TREE_TYPE (second_def
), vectype
))
5994 conv_stmt
= gimple_build_assign (make_ssa_name (vectype
),
5997 vectype
, second_def
));
5998 vect_finish_stmt_generation (vinfo
, NULL
, conv_stmt
, gsi
);
5999 second_def
= gimple_assign_lhs (conv_stmt
);
6001 tree mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
6002 perm_stmt
= gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
6003 first_def
, second_def
,
6007 /* We need a copy here in case the def was external. */
6008 perm_stmt
= gimple_build_assign (perm_dest
, first_def
);
6009 vect_finish_stmt_generation (vinfo
, NULL
, perm_stmt
, gsi
);
6010 /* Store the vector statement in NODE. */
6011 SLP_TREE_VEC_STMTS (node
).quick_push (perm_stmt
);
/* Reset for the next output vector's mask.  */
6015 first_vec
= std::make_pair (-1U, -1U);
6016 second_vec
= std::make_pair (-1U, -1U);
/* Analysis mode: account NPERMS vector permutes in the cost vector.  */
6021 record_stmt_cost (cost_vec
, nperms
, vec_perm
, NULL
, vectype
, 0, vect_body
);
6026 /* Vectorize SLP NODE. */
/* NOTE(review): lossy extraction -- braces, early returns and several
   statements are missing between the numbered fragments; comments
   describe only the visible code.  */
6029 vect_schedule_slp_node (vec_info
*vinfo
,
6030 slp_tree node
, slp_instance instance
)
6032 gimple_stmt_iterator si
;
6036 /* For existing vectors there's nothing to do. */
6037 if (SLP_TREE_VEC_DEFS (node
).exists ())
6040 gcc_assert (SLP_TREE_VEC_STMTS (node
).is_empty ());
6042 /* Vectorize externals and constants. */
6043 if (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
6044 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
6046 /* ??? vectorizable_shift can end up using a scalar operand which is
6047 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
6048 node in this case. */
6049 if (!SLP_TREE_VECTYPE (node
))
6052 vect_create_constant_vectors (vinfo
, node
);
/* From here on NODE is an internal def; pick the insertion point SI.  */
6056 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
6058 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) != 0);
6059 SLP_TREE_VEC_STMTS (node
).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node
));
6061 if (dump_enabled_p ())
6062 dump_printf_loc (MSG_NOTE
, vect_location
,
6063 "------>vectorizing SLP node starting from: %G",
/* Case 1: data references (but not permute nodes).  */
6066 if (STMT_VINFO_DATA_REF (stmt_info
)
6067 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
6069 /* Vectorized loads go before the first scalar load to make it
6070 ready early, vectorized stores go before the last scalar
6071 stmt which is where all uses are ready. */
6072 stmt_vec_info last_stmt_info
= NULL
;
6073 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
6074 last_stmt_info
= vect_find_first_scalar_stmt_in_slp (node
);
6075 else /* DR_IS_WRITE */
6076 last_stmt_info
= vect_find_last_scalar_stmt_in_slp (node
);
6077 si
= gsi_for_stmt (last_stmt_info
->stmt
);
/* Case 2: PHI-like nodes need no insertion iterator.  */
6079 else if ((STMT_VINFO_TYPE (stmt_info
) == cycle_phi_info_type
6080 || STMT_VINFO_TYPE (stmt_info
) == induc_vec_info_type
6081 || STMT_VINFO_TYPE (stmt_info
) == phi_info_type
)
6082 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
6084 /* For PHI node vectorization we do not use the insertion iterator. */
/* Case 3: everything else -- insert after the latest def among all
   children so the operands are ready.  LAST_STMT tracks the dominance-
   latest def seen so far.  */
6089 /* Emit other stmts after the children vectorized defs which is
6090 earliest possible. */
6091 gimple
*last_stmt
= NULL
;
6092 bool seen_vector_def
= false;
6093 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
6094 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
6096 /* For fold-left reductions we are retaining the scalar
6097 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
6098 set so the representation isn't perfect. Resort to the
6099 last scalar def here. */
6100 if (SLP_TREE_VEC_STMTS (child
).is_empty ())
6102 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child
))
6103 == cycle_phi_info_type
);
6104 gphi
*phi
= as_a
<gphi
*>
6105 (vect_find_last_scalar_stmt_in_slp (child
)->stmt
);
6107 || vect_stmt_dominates_stmt_p (last_stmt
, phi
))
6110 /* We are emitting all vectorized stmts in the same place and
6111 the last one is the last.
6112 ??? Unless we have a load permutation applied and that
6113 figures to re-use an earlier generated load. */
6116 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child
), j
, vstmt
)
6118 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
6121 else if (!SLP_TREE_VECTYPE (child
))
6123 /* For externals we use unvectorized at all scalar defs. */
6126 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child
), j
, def
)
6127 if (TREE_CODE (def
) == SSA_NAME
6128 && !SSA_NAME_IS_DEFAULT_DEF (def
))
6130 gimple
*stmt
= SSA_NAME_DEF_STMT (def
);
6132 || vect_stmt_dominates_stmt_p (last_stmt
, stmt
))
6138 /* For externals we have to look at all defs since their
6139 insertion place is decided per vector. But beware
6140 of pre-existing vectors where we need to make sure
6141 we do not insert before the region boundary. */
6142 if (SLP_TREE_SCALAR_OPS (child
).is_empty ()
6143 && !vinfo
->lookup_def (SLP_TREE_VEC_DEFS (child
)[0]))
6144 seen_vector_def
= true;
6149 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child
), j
, vdef
)
6150 if (TREE_CODE (vdef
) == SSA_NAME
6151 && !SSA_NAME_IS_DEFAULT_DEF (vdef
))
6153 gimple
*vstmt
= SSA_NAME_DEF_STMT (vdef
);
6155 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
6160 /* This can happen when all children are pre-existing vectors or
6163 last_stmt
= vect_find_first_scalar_stmt_in_slp (node
)->stmt
;
/* No LAST_STMT: all inputs are pre-existing vectors -- insert at the
   start of the BB region.  */
6166 gcc_assert (seen_vector_def
);
6167 si
= gsi_after_labels (as_a
<bb_vec_info
> (vinfo
)->bbs
[0]);
/* Cannot insert right after a PHI; use the BB's post-label position.  */
6169 else if (is_a
<gphi
*> (last_stmt
))
6170 si
= gsi_after_labels (gimple_bb (last_stmt
));
6173 si
= gsi_for_stmt (last_stmt
);
6178 bool done_p
= false;
6180 /* Handle purely internal nodes. */
6181 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
6183 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
6184 be shared with different SLP nodes (but usually it's the same
6185 operation apart from the case the stmt is only there for denoting
6186 the actual scalar lane defs ...). So do not call vect_transform_stmt
6187 but open-code it here (partly). */
6188 bool done
= vectorizable_slp_permutation (vinfo
, &si
, node
, NULL
);
6193 vect_transform_stmt (vinfo
, stmt_info
, &si
, node
, instance
);
6196 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
6197 For loop vectorization this is done in vectorizable_call, but for SLP
6198 it needs to be deferred until end of vect_schedule_slp, because multiple
6199 SLP instances may refer to the same scalar stmt. */
/* NOTE(review): lossy extraction -- braces and `continue` statements are
   missing between the numbered fragments; comments describe only the
   visible code.  */
6202 vect_remove_slp_scalar_calls (vec_info
*vinfo
,
6203 slp_tree node
, hash_set
<slp_tree
> &visited
)
6206 gimple_stmt_iterator gsi
;
6210 stmt_vec_info stmt_info
;
/* Only internal-def nodes carry scalar stmts to rewrite.  */
6212 if (!node
|| SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
/* VISITED guards against re-processing shared subtrees.  */
6215 if (visited
.add (node
))
/* Depth-first over the children.  */
6218 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
6219 vect_remove_slp_scalar_calls (vinfo
, child
, visited
);
6221 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
/* Skip anything that is not a call still present in the IL.  */
6223 gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
6224 if (!stmt
|| gimple_bb (stmt
) == NULL
)
/* Pattern stmts and non-pure-SLP stmts may still have scalar uses.  */
6226 if (is_pattern_stmt_p (stmt_info
)
6227 || !PURE_SLP_STMT (stmt_info
))
/* Replace the call with `lhs = 0` so downstream DCE can remove it.  */
6229 lhs
= gimple_call_lhs (stmt
);
6230 new_stmt
= gimple_build_assign (lhs
, build_zero_cst (TREE_TYPE (lhs
)));
6231 gsi
= gsi_for_stmt (stmt
);
6232 vinfo
->replace_stmt (&gsi
, stmt_info
, new_stmt
);
6233 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt
)) = new_stmt
;
/* Public entry point: walk NODE with a fresh visited set.  */
6238 vect_remove_slp_scalar_calls (vec_info
*vinfo
, slp_tree node
)
6240 hash_set
<slp_tree
> visited
;
6241 vect_remove_slp_scalar_calls (vinfo
, node
, visited
);
6244 /* Vectorize the instance root. */
/* NOTE(review): lossy extraction -- braces and some operand lines are
   missing between the numbered fragments; comments describe only the
   visible code.  Replaces the scalar root stmt (e.g. a CONSTRUCTOR
   assignment for BB vectorization) with an assignment from the
   vectorized defs.  */
6247 vectorize_slp_instance_root_stmt (slp_tree node
, slp_instance instance
)
6249 gassign
*rstmt
= NULL
;
/* Single vector stmt: assign its LHS to the root LHS, with a
   VIEW_CONVERT if the types merely pun each other.  */
6251 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) == 1)
6256 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
6258 tree vect_lhs
= gimple_get_lhs (child_stmt
);
6259 tree root_lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
6260 if (!useless_type_conversion_p (TREE_TYPE (root_lhs
),
6261 TREE_TYPE (vect_lhs
)))
6262 vect_lhs
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (root_lhs
),
6264 rstmt
= gimple_build_assign (root_lhs
, vect_lhs
);
/* Multiple vector stmts: gather all their LHSs into a CONSTRUCTOR of
   the root's RHS type and assign that.  */
6268 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) > 1)
6270 int nelts
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
6273 vec
<constructor_elt
, va_gc
> *v
;
6274 vec_alloc (v
, nelts
);
6276 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
6278 CONSTRUCTOR_APPEND_ELT (v
,
6280 gimple_get_lhs (child_stmt
));
6282 tree lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
6283 tree rtype
= TREE_TYPE (gimple_assign_rhs1 (instance
->root_stmt
->stmt
));
6284 tree r_constructor
= build_constructor (rtype
, v
);
6285 rstmt
= gimple_build_assign (lhs
, r_constructor
);
/* Swap the scalar root stmt for the new assignment in place.  */
6290 gimple_stmt_iterator rgsi
= gsi_for_stmt (instance
->root_stmt
->stmt
);
6291 gsi_replace (&rgsi
, rstmt
, true);
6301 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
/* NOTE(review): lossy extraction -- braces, the DFS-number assignment
   and several loop bodies are missing between the numbered fragments.
   The visible lowlink/dfs/on_stack bookkeeping matches Tarjan's SCC
   algorithm; comments describe only the visible code.  */
6304 vect_schedule_scc (vec_info
*vinfo
, slp_tree node
, slp_instance instance
,
6305 hash_map
<slp_tree
, slp_scc_info
> &scc_info
,
6306 int &maxdfs
, vec
<slp_tree
> &stack
)
/* Each node gets exactly one scc_info entry -- first visit only.  */
6309 slp_scc_info
*info
= &scc_info
.get_or_insert (node
, &existed_p
);
6310 gcc_assert (!existed_p
);
6312 info
->lowlink
= maxdfs
;
/* Non-internal nodes (constants/externals) form trivial components and
   can be scheduled immediately.  */
6316 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
6318 info
->on_stack
= false;
6319 vect_schedule_slp_node (vinfo
, node
, instance
);
6323 info
->on_stack
= true;
6324 stack
.safe_push (node
);
/* DFS over children, updating this node's lowlink.  */
6329 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
6333 slp_scc_info
*child_info
= scc_info
.get (child
);
6336 vect_schedule_scc (vinfo
, child
, instance
, scc_info
, maxdfs
, stack
);
6337 /* Recursion might have re-allocated the node. */
6338 info
= scc_info
.get (node
);
6339 child_info
= scc_info
.get (child
);
6340 info
->lowlink
= MIN (info
->lowlink
, child_info
->lowlink
);
6342 else if (child_info
->on_stack
)
6343 info
->lowlink
= MIN (info
->lowlink
, child_info
->dfs
);
/* Not an SCC root yet -- leave the node on the stack.  */
6345 if (info
->lowlink
!= info
->dfs
)
/* SCC root: pop and schedule the whole component.  PHIs scheduled here
   get their backedge arguments fixed up afterwards.  */
6348 auto_vec
<slp_tree
, 4> phis_to_fixup
;
/* Singleton SCC: schedule directly.  */
6351 if (stack
.last () == node
)
6354 info
->on_stack
= false;
6355 vect_schedule_slp_node (vinfo
, node
, instance
);
6356 if (SLP_TREE_CODE (node
) != VEC_PERM_EXPR
6357 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (node
)->stmt
))
6358 phis_to_fixup
.quick_push (node
);
/* Multi-node SCC: find the component's extent on the stack.  */
6363 int last_idx
= stack
.length () - 1;
6364 while (stack
[last_idx
] != node
)
6366 /* We can break the cycle at PHIs who have at least one child
6367 code generated. Then we could re-start the DFS walk until
6368 all nodes in the SCC are covered (we might have new entries
6369 for only back-reachable nodes). But it's simpler to just
6370 iterate and schedule those that are ready. */
6371 unsigned todo
= stack
.length () - last_idx
;
6374 for (int idx
= stack
.length () - 1; idx
>= last_idx
; --idx
)
6376 slp_tree entry
= stack
[idx
];
/* A node is "ready" when, unless it is a PHI, none of its children
   are still unscheduled on the stack (the readiness checks sit on
   missing lines -- TODO confirm).  */
6379 bool phi
= (SLP_TREE_CODE (entry
) != VEC_PERM_EXPR
6380 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (entry
)->stmt
));
6382 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry
), i
, child
)
6389 else if (scc_info
.get (child
)->on_stack
)
6407 vect_schedule_slp_node (vinfo
, entry
, instance
);
6408 scc_info
.get (entry
)->on_stack
= false;
6412 phis_to_fixup
.safe_push (entry
);
/* Component fully scheduled -- pop it off the stack.  */
6419 stack
.truncate (last_idx
);
6422 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
6424 FOR_EACH_VEC_ELT (phis_to_fixup
, i
, phi_node
)
6426 gphi
*phi
= as_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (phi_node
)->stmt
);
6429 FOR_EACH_EDGE (e
, ei
, gimple_bb (phi
)->preds
)
6431 unsigned dest_idx
= e
->dest_idx
;
6432 child
= SLP_TREE_CHILDREN (phi_node
)[dest_idx
];
6433 if (!child
|| SLP_TREE_DEF_TYPE (child
) != vect_internal_def
)
6435 /* Simply fill all args. */
6436 for (unsigned i
= 0; i
< SLP_TREE_VEC_STMTS (phi_node
).length (); ++i
)
6437 add_phi_arg (as_a
<gphi
*> (SLP_TREE_VEC_STMTS (phi_node
)[i
]),
6438 vect_get_slp_vect_def (child
, i
),
6439 e
, gimple_phi_arg_location (phi
, dest_idx
));
6444 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
6447 vect_schedule_slp (vec_info
*vinfo
, vec
<slp_instance
> slp_instances
)
6449 slp_instance instance
;
6452 hash_map
<slp_tree
, slp_scc_info
> scc_info
;
6454 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
6456 slp_tree node
= SLP_INSTANCE_TREE (instance
);
6457 if (dump_enabled_p ())
6459 dump_printf_loc (MSG_NOTE
, vect_location
,
6460 "Vectorizing SLP tree:\n");
6461 if (SLP_INSTANCE_ROOT_STMT (instance
))
6462 dump_printf_loc (MSG_NOTE
, vect_location
, "Root stmt: %G",
6463 SLP_INSTANCE_ROOT_STMT (instance
)->stmt
);
6464 vect_print_slp_graph (MSG_NOTE
, vect_location
,
6465 SLP_INSTANCE_TREE (instance
));
6467 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
6468 have a PHI be the node breaking the cycle. */
6469 auto_vec
<slp_tree
> stack
;
6470 if (!scc_info
.get (node
))
6471 vect_schedule_scc (vinfo
, node
, instance
, scc_info
, maxdfs
, stack
);
6473 if (SLP_INSTANCE_ROOT_STMT (instance
))
6474 vectorize_slp_instance_root_stmt (node
, instance
);
6476 if (dump_enabled_p ())
6477 dump_printf_loc (MSG_NOTE
, vect_location
,
6478 "vectorizing stmts using SLP.\n");
6481 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
6483 slp_tree root
= SLP_INSTANCE_TREE (instance
);
6484 stmt_vec_info store_info
;
6487 /* Remove scalar call stmts. Do not do this for basic-block
6488 vectorization as not all uses may be vectorized.
6489 ??? Why should this be necessary? DCE should be able to
6490 remove the stmts itself.
6491 ??? For BB vectorization we can as well remove scalar
6492 stmts starting from the SLP tree root if they have no
6494 if (is_a
<loop_vec_info
> (vinfo
))
6495 vect_remove_slp_scalar_calls (vinfo
, root
);
6497 /* Remove vectorized stores original scalar stmts. */
6498 for (j
= 0; SLP_TREE_SCALAR_STMTS (root
).iterate (j
, &store_info
); j
++)
6500 if (!STMT_VINFO_DATA_REF (store_info
)
6501 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info
)))
6504 store_info
= vect_orig_stmt (store_info
);
6505 /* Free the attached stmt_vec_info and remove the stmt. */
6506 vinfo
->remove_stmt (store_info
);
6508 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
6509 to not crash in vect_free_slp_tree later. */
6510 if (SLP_TREE_REPRESENTATIVE (root
) == store_info
)
6511 SLP_TREE_REPRESENTATIVE (root
) = NULL
;