1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
30 #include "tree-pass.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
51 #include "alloc-pool.h"
/* NOTE(review): this chunk is a garbled extraction of GCC's SLP basic-block
   vectorizer (tree-vect-slp.c).  Original line numbers are fused into the
   text and many original lines (braces, return types, guards) are missing.
   All code below is kept byte-identical; only comments are added.  */

/* Forward declaration; the definition appears later in the file.  */
53 static bool vectorizable_slp_permutation (vec_info
*, gimple_stmt_iterator
*,
54 slp_tree
, stmt_vector_for_cost
*);

/* Object allocator backing every _slp_tree allocation; see the
   class-specific operator new / operator delete further down.  */
56 static object_allocator
<_slp_tree
> *slp_tree_pool
;
/* Head of an intrusive doubly-linked list threading all live SLP nodes,
   maintained by the _slp_tree constructor and destructor.  */
57 static slp_tree slp_first_node
;
/* Fragment of the pool setup hook (presumably the pass-global SLP init
   function -- the enclosing signature is missing from this extraction;
   TODO confirm against the full file): allocate the global allocator
   that backs all _slp_tree nodes.  */
62 slp_tree_pool
= new object_allocator
<_slp_tree
> ("SLP nodes");
/* Fragment of the matching teardown hook: deleting a node unlinks it
   from the list headed by slp_first_node (see ~_slp_tree below), so this
   loop terminates once every remaining live SLP node is destroyed.  */
68 while (slp_first_node
)
69 delete slp_first_node
;
/* Class-specific operator new: serve all _slp_tree allocations from the
   global slp_tree_pool allocator.  N must equal sizeof (_slp_tree) --
   this class is not intended to be derived from.  */
75 _slp_tree::operator new (size_t n
)
77 gcc_assert (n
== sizeof (_slp_tree
));
78 return slp_tree_pool
->allocate_raw ();
/* Class-specific operator delete: return NODE's storage to the global
   slp_tree_pool.  Mirrors operator new above; N must equal
   sizeof (_slp_tree).  */
82 _slp_tree::operator delete (void *node
, size_t n
)
84 gcc_assert (n
== sizeof (_slp_tree
));
85 slp_tree_pool
->remove_raw (node
);
89 /* Initialize a SLP node.  Links the new node at the head of the global
   slp_first_node list and default-initializes every field: all vecs
   empty, def-type uninitialized, code ERROR_MARK, refcount 1.
   NOTE(review): the extraction dropped original line 94, which
   presumably guarded the prev_node update with "if (slp_first_node)"
   -- confirm against the full file.  */
91 _slp_tree::_slp_tree ()
93 this->prev_node
= NULL
;
95 slp_first_node
->prev_node
= this;
96 this->next_node
= slp_first_node
;
97 slp_first_node
= this;
/* Start with all lane/defs vectors empty.  */
98 SLP_TREE_SCALAR_STMTS (this) = vNULL
;
99 SLP_TREE_SCALAR_OPS (this) = vNULL
;
100 SLP_TREE_VEC_STMTS (this) = vNULL
;
101 SLP_TREE_VEC_DEFS (this) = vNULL
;
102 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
103 SLP_TREE_CHILDREN (this) = vNULL
;
104 SLP_TREE_LOAD_PERMUTATION (this) = vNULL
;
105 SLP_TREE_LANE_PERMUTATION (this) = vNULL
;
106 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def
;
107 SLP_TREE_CODE (this) = ERROR_MARK
;
108 SLP_TREE_VECTYPE (this) = NULL_TREE
;
109 SLP_TREE_REPRESENTATIVE (this) = NULL
;
/* A freshly created node has a single owner.  */
110 SLP_TREE_REF_COUNT (this) = 1;
111 this->max_nunits
= 1;
115 /* Tear down a SLP node.  Unlinks the node from the global
   slp_first_node list and releases every owned vec.
   NOTE(review): original lines 119/121/123 (the null/head checks
   guarding each unlink statement) are missing from this extraction
   -- confirm against the full file.  */
117 _slp_tree::~_slp_tree ()
120 this->prev_node
->next_node
= this->next_node
;
122 slp_first_node
= this->next_node
;
124 this->next_node
->prev_node
= this->prev_node
;
/* Release all vecs owned by the node (children refs are dropped by
   vect_free_slp_tree, not here).  */
125 SLP_TREE_CHILDREN (this).release ();
126 SLP_TREE_SCALAR_STMTS (this).release ();
127 SLP_TREE_SCALAR_OPS (this).release ();
128 SLP_TREE_VEC_STMTS (this).release ();
129 SLP_TREE_VEC_DEFS (this).release ();
130 SLP_TREE_LOAD_PERMUTATION (this).release ();
131 SLP_TREE_LANE_PERMUTATION (this).release ();
134 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
137 vect_free_slp_tree (slp_tree node
)
/* Reference-counted: only the last owner actually frees the node.  */
142 if (--SLP_TREE_REF_COUNT (node
) != 0)
/* Drop one reference from each child first.  */
145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
147 vect_free_slp_tree (child
);
149 /* If the node defines any SLP only patterns then those patterns are no
150 longer valid and should be removed.  Reverting re-exposes the original
   statement for scalar code generation.  */
151 stmt_vec_info rep_stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
152 if (rep_stmt_info
&& STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info
))
154 stmt_vec_info stmt_info
= vect_orig_stmt (rep_stmt_info
);
155 STMT_VINFO_IN_PATTERN_P (stmt_info
) = false;
156 STMT_SLP_TYPE (stmt_info
) = STMT_SLP_TYPE (rep_stmt_info
);
162 /* Return a location suitable for dumpings related to the SLP instance.
   Prefers the instance's root statement; otherwise falls back to the
   first scalar statement of the root node.  NOTE(review): the guard
   selecting between the two returns (original line ~167) is missing
   from this extraction.  */
165 _slp_instance::location () const
168 return root_stmt
->stmt
;
170 return SLP_TREE_SCALAR_STMTS (root
)[0]->stmt
;
174 /* Free the memory allocated for the SLP instance.  Drops one reference
   on the instance's tree and releases all instance-owned vecs.  */
177 vect_free_slp_instance (slp_instance instance
)
179 vect_free_slp_tree (SLP_INSTANCE_TREE (instance
));
180 SLP_INSTANCE_LOADS (instance
).release ();
181 instance
->subgraph_entries
.release ();
182 instance
->cost_vec
.release ();
187 /* Create an SLP node without scalar stmts: NOPS children slots and
   tree code CODE.  The node starts as an internal def with empty
   scalar stmts.  */
190 vect_create_new_slp_node (unsigned nops
, tree_code code
)
192 slp_tree node
= new _slp_tree
;
193 SLP_TREE_SCALAR_STMTS (node
) = vNULL
;
/* Pre-reserve room for the children to allow quick_push later.  */
194 SLP_TREE_CHILDREN (node
).create (nops
);
195 SLP_TREE_DEF_TYPE (node
) = vect_internal_def
;
196 SLP_TREE_CODE (node
) = code
;
199 /* Create an SLP node for SCALAR_STMTS, initializing the preallocated
   NODE in place.  Takes ownership of the SCALAR_STMTS vec; the first
   stmt becomes the node's representative and the stmt count its lane
   count.  */
202 vect_create_new_slp_node (slp_tree node
,
203 vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
205 SLP_TREE_SCALAR_STMTS (node
) = scalar_stmts
;
206 SLP_TREE_CHILDREN (node
).create (nops
);
207 SLP_TREE_DEF_TYPE (node
) = vect_internal_def
;
208 SLP_TREE_REPRESENTATIVE (node
) = scalar_stmts
[0];
209 SLP_TREE_LANES (node
) = scalar_stmts
.length ();
213 /* Create an SLP node for SCALAR_STMTS.  Convenience overload that
   allocates a fresh node and forwards to the in-place initializer
   above.  */
216 vect_create_new_slp_node (vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
218 return vect_create_new_slp_node (new _slp_tree
, scalar_stmts
, nops
);
221 /* Create an SLP node for OPS, initializing the preallocated NODE in
   place as an external def (invariant/external scalar operands).
   Takes ownership of the OPS vec.  */
224 vect_create_new_slp_node (slp_tree node
, vec
<tree
> ops
)
226 SLP_TREE_SCALAR_OPS (node
) = ops
;
227 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
228 SLP_TREE_LANES (node
) = ops
.length ();
232 /* Create an SLP node for OPS.  Convenience overload that allocates a
   fresh node and forwards to the in-place external-def initializer
   above.  */
235 vect_create_new_slp_node (vec
<tree
> ops
)
237 return vect_create_new_slp_node (new _slp_tree
, ops
);
241 /* This structure is used in creation of an SLP tree. Each instance
242 corresponds to the same operand in a group of scalar stmts in an SLP
   node.
   NOTE(review): only two members are visible in this extraction; the
   functions below also reference ops, first_op_type and any_pattern
   members -- confirm against the full declaration.  */
244 typedef struct _slp_oprnd_info
246 /* Def-stmts for the operands. */
247 vec
<stmt_vec_info
> def_stmts
;
250 /* Information about the first statement, its vector def-type, type, the
251 operand itself in case it's constant, and an indication if it's a pattern
   stmt.  */
254 enum vect_def_type first_dt
;
259 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  Returns a vec of NOPS freshly XNEWed slp_oprnd_info records,
   each with def_stmts/ops preallocated to GROUP_SIZE and first_dt /
   first_op_type / any_pattern cleared.  Caller frees with
   vect_free_oprnd_info.  */
261 static vec
<slp_oprnd_info
>
262 vect_create_oprnd_info (int nops
, int group_size
)
265 slp_oprnd_info oprnd_info
;
266 vec
<slp_oprnd_info
> oprnds_info
;
268 oprnds_info
.create (nops
);
269 for (i
= 0; i
< nops
; i
++)
271 oprnd_info
= XNEW (struct _slp_oprnd_info
);
/* Reserve one slot per stmt in the group so quick_push is valid.  */
272 oprnd_info
->def_stmts
.create (group_size
);
273 oprnd_info
->ops
.create (group_size
);
274 oprnd_info
->first_dt
= vect_uninitialized_def
;
275 oprnd_info
->first_op_type
= NULL_TREE
;
276 oprnd_info
->any_pattern
= false;
277 oprnds_info
.quick_push (oprnd_info
);
284 /* Free operands info.  Releases each record's vecs, XDELETEs the
   records (counterpart of XNEW in vect_create_oprnd_info) and finally
   releases the outer vec itself.  */
287 vect_free_oprnd_info (vec
<slp_oprnd_info
> &oprnds_info
)
290 slp_oprnd_info oprnd_info
;
292 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
294 oprnd_info
->def_stmts
.release ();
295 oprnd_info
->ops
.release ();
296 XDELETE (oprnd_info
);
299 oprnds_info
.release ();
303 /* Return true if STMTS contains a pattern statement.  Simple linear
   scan; the false return on exhaustion is not visible in this
   extraction.  */
306 vect_contains_pattern_stmt_p (vec
<stmt_vec_info
> stmts
)
308 stmt_vec_info stmt_info
;
310 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
311 if (is_pattern_stmt_p (stmt_info
))
316 /* Return true when all lanes in the external or constant NODE have
   the same value (compared with operand_equal_p against the first
   lane).  Only valid for constant/external defs, as asserted.  */
320 vect_slp_tree_uniform_p (slp_tree node
)
322 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
323 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
);
325 /* Pre-existing vectors have no scalar ops to compare.  */
326 if (SLP_TREE_SCALAR_OPS (node
).is_empty ())
330 tree op
, first
= NULL_TREE
;
331 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
/* Any lane differing from the first makes the node non-uniform.  */
334 else if (!operand_equal_p (first
, op
, 0))
340 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
341 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
   of the chain.  Walks DR_GROUP_NEXT_ELEMENT links, accumulating
   DR_GROUP_GAP to account for holes in the access pattern.  */
345 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info
,
346 stmt_vec_info first_stmt_info
)
348 stmt_vec_info next_stmt_info
= first_stmt_info
;
/* STMT_INFO must belong to the group headed by FIRST_STMT_INFO.  */
351 if (first_stmt_info
!= DR_GROUP_FIRST_ELEMENT (stmt_info
))
356 if (next_stmt_info
== stmt_info
)
358 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
360 result
+= DR_GROUP_GAP (next_stmt_info
);
362 while (next_stmt_info
);
367 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
368 using the method implemented by duplicate_and_interleave. Return true
369 if so, returning the number of intermediate vectors in *NVECTORS_OUT
370 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
374 can_duplicate_and_interleave_p (vec_info
*vinfo
, unsigned int count
,
375 tree elt_type
, unsigned int *nvectors_out
,
376 tree
*vector_type_out
,
379 tree base_vector_type
= get_vectype_for_scalar_type (vinfo
, elt_type
, count
);
380 if (!base_vector_type
|| !VECTOR_MODE_P (TYPE_MODE (base_vector_type
)))
383 machine_mode base_vector_mode
= TYPE_MODE (base_vector_type
);
384 poly_int64 elt_bytes
= count
* GET_MODE_UNIT_SIZE (base_vector_mode
);
385 unsigned int nvectors
= 1;
388 scalar_int_mode int_mode
;
389 poly_int64 elt_bits
= elt_bytes
* BITS_PER_UNIT
;
390 if (int_mode_for_size (elt_bits
, 1).exists (&int_mode
))
392 /* Get the natural vector type for this SLP group size. */
393 tree int_type
= build_nonstandard_integer_type
394 (GET_MODE_BITSIZE (int_mode
), 1);
396 = get_vectype_for_scalar_type (vinfo
, int_type
, count
);
398 && VECTOR_MODE_P (TYPE_MODE (vector_type
))
399 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type
)),
400 GET_MODE_SIZE (base_vector_mode
)))
402 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
403 together into elements of type INT_TYPE and using the result
404 to build NVECTORS vectors. */
405 poly_uint64 nelts
= GET_MODE_NUNITS (TYPE_MODE (vector_type
));
406 vec_perm_builder
sel1 (nelts
, 2, 3);
407 vec_perm_builder
sel2 (nelts
, 2, 3);
408 poly_int64 half_nelts
= exact_div (nelts
, 2);
409 for (unsigned int i
= 0; i
< 3; ++i
)
412 sel1
.quick_push (i
+ nelts
);
413 sel2
.quick_push (half_nelts
+ i
);
414 sel2
.quick_push (half_nelts
+ i
+ nelts
);
416 vec_perm_indices
indices1 (sel1
, 2, nelts
);
417 vec_perm_indices
indices2 (sel2
, 2, nelts
);
418 if (can_vec_perm_const_p (TYPE_MODE (vector_type
), indices1
)
419 && can_vec_perm_const_p (TYPE_MODE (vector_type
), indices2
))
422 *nvectors_out
= nvectors
;
424 *vector_type_out
= vector_type
;
427 permutes
[0] = vect_gen_perm_mask_checked (vector_type
,
429 permutes
[1] = vect_gen_perm_mask_checked (vector_type
,
436 if (!multiple_p (elt_bytes
, 2, &elt_bytes
))
442 /* Return true if DTA and DTB match.  External and constant defs are
   treated as interchangeable (either can be materialized outside the
   vectorized region); the exact-equality arm of the disjunction is not
   visible in this extraction.  */
445 vect_def_types_match (enum vect_def_type dta
, enum vect_def_type dtb
)
448 || ((dta
== vect_external_def
|| dta
== vect_constant_def
)
449 && (dtb
== vect_external_def
|| dtb
== vect_constant_def
)));
452 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
453 they are of a valid type and that they match the defs of the first stmt of
454 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
455 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
456 indicates swap is required for cond_expr stmts. Specifically, *SWAP
457 is 1 if STMT is cond and operands of comparison need to be swapped;
458 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
459 If there is any operand swap in this function, *SWAP is set to non-zero
461 If there was a fatal error return -1; if the error could be corrected by
462 swapping operands of father node of this one, return 1; if everything is
465 vect_get_and_check_slp_defs (vec_info
*vinfo
, unsigned char swap
,
467 vec
<stmt_vec_info
> stmts
, unsigned stmt_num
,
468 vec
<slp_oprnd_info
> *oprnds_info
)
470 stmt_vec_info stmt_info
= stmts
[stmt_num
];
472 unsigned int i
, number_of_oprnds
;
473 enum vect_def_type dt
= vect_uninitialized_def
;
474 slp_oprnd_info oprnd_info
;
475 int first_op_idx
= 1;
476 unsigned int commutative_op
= -1U;
477 bool first_op_cond
= false;
478 bool first
= stmt_num
== 0;
480 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
482 number_of_oprnds
= gimple_call_num_args (stmt
);
484 if (gimple_call_internal_p (stmt
))
486 internal_fn ifn
= gimple_call_internal_fn (stmt
);
487 commutative_op
= first_commutative_argument (ifn
);
489 /* Masked load, only look at mask. */
490 if (ifn
== IFN_MASK_LOAD
)
492 number_of_oprnds
= 1;
493 /* Mask operand index. */
498 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
500 enum tree_code code
= gimple_assign_rhs_code (stmt
);
501 number_of_oprnds
= gimple_num_ops (stmt
) - 1;
502 /* Swap can only be done for cond_expr if asked to, otherwise we
503 could result in different comparison code to the first stmt. */
504 if (code
== COND_EXPR
505 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt
)))
507 first_op_cond
= true;
511 commutative_op
= commutative_tree_code (code
) ? 0U : -1U;
513 else if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
514 number_of_oprnds
= gimple_phi_num_args (stmt
);
518 bool swapped
= (swap
!= 0);
519 bool backedge
= false;
520 gcc_assert (!swapped
|| first_op_cond
);
521 enum vect_def_type
*dts
= XALLOCAVEC (enum vect_def_type
, number_of_oprnds
);
522 for (i
= 0; i
< number_of_oprnds
; i
++)
526 /* Map indicating how operands of cond_expr should be swapped. */
527 int maps
[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
528 int *map
= maps
[swap
];
531 oprnd
= TREE_OPERAND (gimple_op (stmt_info
->stmt
,
532 first_op_idx
), map
[i
]);
534 oprnd
= gimple_op (stmt_info
->stmt
, map
[i
]);
536 else if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
538 oprnd
= gimple_phi_arg_def (stmt
, i
);
539 backedge
= dominated_by_p (CDI_DOMINATORS
,
540 gimple_phi_arg_edge (stmt
, i
)->src
,
541 gimple_bb (stmt_info
->stmt
));
544 oprnd
= gimple_op (stmt_info
->stmt
, first_op_idx
+ (swapped
? !i
: i
));
545 if (TREE_CODE (oprnd
) == VIEW_CONVERT_EXPR
)
546 oprnd
= TREE_OPERAND (oprnd
, 0);
548 oprnd_info
= (*oprnds_info
)[i
];
550 stmt_vec_info def_stmt_info
;
551 if (!vect_is_simple_use (oprnd
, vinfo
, &dts
[i
], &def_stmt_info
))
553 if (dump_enabled_p ())
554 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
555 "Build SLP failed: can't analyze def for %T\n",
563 oprnd_info
->def_stmts
.quick_push (NULL
);
564 oprnd_info
->ops
.quick_push (NULL_TREE
);
565 oprnd_info
->first_dt
= vect_uninitialized_def
;
569 oprnd_info
->def_stmts
.quick_push (def_stmt_info
);
570 oprnd_info
->ops
.quick_push (oprnd
);
573 && is_pattern_stmt_p (def_stmt_info
))
575 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info
))
577 oprnd_info
->any_pattern
= true;
579 /* If we promote this to external use the original stmt def. */
580 oprnd_info
->ops
.last ()
581 = gimple_get_lhs (vect_orig_stmt (def_stmt_info
)->stmt
);
584 /* If there's a extern def on a backedge make sure we can
585 code-generate at the region start.
586 ??? This is another case that could be fixed by adjusting
587 how we split the function but at the moment we'd have conflicting
590 && dts
[i
] == vect_external_def
591 && is_a
<bb_vec_info
> (vinfo
)
592 && TREE_CODE (oprnd
) == SSA_NAME
593 && !SSA_NAME_IS_DEFAULT_DEF (oprnd
)
594 && !dominated_by_p (CDI_DOMINATORS
,
595 as_a
<bb_vec_info
> (vinfo
)->bbs
[0],
596 gimple_bb (SSA_NAME_DEF_STMT (oprnd
))))
598 if (dump_enabled_p ())
599 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
600 "Build SLP failed: extern def %T only defined "
601 "on backedge\n", oprnd
);
607 tree type
= TREE_TYPE (oprnd
);
609 if ((dt
== vect_constant_def
610 || dt
== vect_external_def
)
611 && !GET_MODE_SIZE (vinfo
->vector_mode
).is_constant ()
612 && (TREE_CODE (type
) == BOOLEAN_TYPE
613 || !can_duplicate_and_interleave_p (vinfo
, stmts
.length (),
616 if (dump_enabled_p ())
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
618 "Build SLP failed: invalid type of def "
619 "for variable-length SLP %T\n", oprnd
);
623 /* For the swapping logic below force vect_reduction_def
624 for the reduction op in a SLP reduction group. */
625 if (!STMT_VINFO_DATA_REF (stmt_info
)
626 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
627 && (int)i
== STMT_VINFO_REDUC_IDX (stmt_info
)
629 dts
[i
] = dt
= vect_reduction_def
;
631 /* Check the types of the definition. */
634 case vect_external_def
:
635 case vect_constant_def
:
636 case vect_internal_def
:
637 case vect_reduction_def
:
638 case vect_induction_def
:
639 case vect_nested_cycle
:
643 /* FORNOW: Not supported. */
644 if (dump_enabled_p ())
645 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
646 "Build SLP failed: illegal type of def %T\n",
651 oprnd_info
->first_dt
= dt
;
652 oprnd_info
->first_op_type
= type
;
658 /* Now match the operand definition types to that of the first stmt. */
659 for (i
= 0; i
< number_of_oprnds
;)
667 oprnd_info
= (*oprnds_info
)[i
];
669 stmt_vec_info def_stmt_info
= oprnd_info
->def_stmts
[stmt_num
];
670 oprnd
= oprnd_info
->ops
[stmt_num
];
671 tree type
= TREE_TYPE (oprnd
);
673 if (!types_compatible_p (oprnd_info
->first_op_type
, type
))
675 if (dump_enabled_p ())
676 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
677 "Build SLP failed: different operand types\n");
681 /* Not first stmt of the group, check that the def-stmt/s match
682 the def-stmt/s of the first stmt. Allow different definition
683 types for reduction chains: the first stmt must be a
684 vect_reduction_def (a phi node), and the rest
685 end in the reduction chain. */
686 if ((!vect_def_types_match (oprnd_info
->first_dt
, dt
)
687 && !(oprnd_info
->first_dt
== vect_reduction_def
688 && !STMT_VINFO_DATA_REF (stmt_info
)
689 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
691 && !STMT_VINFO_DATA_REF (def_stmt_info
)
692 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
693 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
))))
694 || (!STMT_VINFO_DATA_REF (stmt_info
)
695 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
697 || STMT_VINFO_DATA_REF (def_stmt_info
)
698 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
699 != REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
700 != (oprnd_info
->first_dt
!= vect_reduction_def
))))
702 /* Try swapping operands if we got a mismatch. For BB
703 vectorization only in case it will clearly improve things. */
704 if (i
== commutative_op
&& !swapped
705 && (!is_a
<bb_vec_info
> (vinfo
)
706 || (!vect_def_types_match ((*oprnds_info
)[i
+1]->first_dt
,
708 && (vect_def_types_match (oprnd_info
->first_dt
, dts
[i
+1])
709 || vect_def_types_match
710 ((*oprnds_info
)[i
+1]->first_dt
, dts
[i
])))))
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_NOTE
, vect_location
,
714 "trying swapped operands\n");
715 std::swap (dts
[i
], dts
[i
+1]);
716 std::swap ((*oprnds_info
)[i
]->def_stmts
[stmt_num
],
717 (*oprnds_info
)[i
+1]->def_stmts
[stmt_num
]);
718 std::swap ((*oprnds_info
)[i
]->ops
[stmt_num
],
719 (*oprnds_info
)[i
+1]->ops
[stmt_num
]);
724 if (is_a
<bb_vec_info
> (vinfo
)
725 && !oprnd_info
->any_pattern
)
727 /* Now for commutative ops we should see whether we can
728 make the other operand matching. */
729 if (dump_enabled_p ())
730 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
731 "treating operand as external\n");
732 oprnd_info
->first_dt
= dt
= vect_external_def
;
736 if (dump_enabled_p ())
737 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
738 "Build SLP failed: different types\n");
743 /* Make sure to demote the overall operand to external. */
744 if (dt
== vect_external_def
)
745 oprnd_info
->first_dt
= vect_external_def
;
746 /* For a SLP reduction chain we want to duplicate the reduction to
747 each of the chain members. That gets us a sane SLP graph (still
748 the stmts are not 100% correct wrt the initial values). */
749 else if ((dt
== vect_internal_def
750 || dt
== vect_reduction_def
)
751 && oprnd_info
->first_dt
== vect_reduction_def
752 && !STMT_VINFO_DATA_REF (stmt_info
)
753 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
754 && !STMT_VINFO_DATA_REF (def_stmt_info
)
755 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
756 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
758 oprnd_info
->def_stmts
[stmt_num
] = oprnd_info
->def_stmts
[0];
759 oprnd_info
->ops
[stmt_num
] = oprnd_info
->ops
[0];
768 if (dump_enabled_p ())
769 dump_printf_loc (MSG_NOTE
, vect_location
,
770 "swapped operands to match def types in %G",
777 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
778 Return true if we can, meaning that this choice doesn't conflict with
779 existing SLP nodes that use STMT_INFO. */
782 vect_update_shared_vectype (stmt_vec_info stmt_info
, tree vectype
)
784 tree old_vectype
= STMT_VINFO_VECTYPE (stmt_info
);
786 return useless_type_conversion_p (vectype
, old_vectype
);
788 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
790 /* We maintain the invariant that if any statement in the group is
791 used, all other members of the group have the same vector type. */
792 stmt_vec_info first_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
793 stmt_vec_info member_info
= first_info
;
794 for (; member_info
; member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
795 if (is_pattern_stmt_p (member_info
)
796 && !useless_type_conversion_p (vectype
,
797 STMT_VINFO_VECTYPE (member_info
)))
802 for (member_info
= first_info
; member_info
;
803 member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
804 STMT_VINFO_VECTYPE (member_info
) = vectype
;
808 else if (!is_pattern_stmt_p (stmt_info
))
810 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
814 if (dump_enabled_p ())
816 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
817 "Build SLP failed: incompatible vector"
818 " types for: %G", stmt_info
->stmt
);
819 dump_printf_loc (MSG_NOTE
, vect_location
,
820 " old vector type: %T\n", old_vectype
);
821 dump_printf_loc (MSG_NOTE
, vect_location
,
822 " new vector type: %T\n", vectype
);
827 /* Return true if call statements CALL1 and CALL2 are similar enough
828 to be combined into the same SLP group.  Requires matching argument
   count and combined_fn; internal calls additionally need compatible
   lhs and per-argument types, other calls need an identical callee and
   fntype.  */
831 compatible_calls_p (gcall
*call1
, gcall
*call2
)
833 unsigned int nargs
= gimple_call_num_args (call1
);
834 if (nargs
!= gimple_call_num_args (call2
))
837 if (gimple_call_combined_fn (call1
) != gimple_call_combined_fn (call2
))
840 if (gimple_call_internal_p (call1
))
/* Internal fns: result and each argument must be type-compatible.  */
842 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1
)),
843 TREE_TYPE (gimple_call_lhs (call2
))))
845 for (unsigned int i
= 0; i
< nargs
; ++i
)
846 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1
, i
)),
847 TREE_TYPE (gimple_call_arg (call2
, i
))))
/* Regular calls: same callee expression and same function type.  */
852 if (!operand_equal_p (gimple_call_fn (call1
),
853 gimple_call_fn (call2
), 0))
856 if (gimple_call_fntype (call1
) != gimple_call_fntype (call2
))
862 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
863 caller's attempt to find the vector type in STMT_INFO with the narrowest
864 element type. Return true if VECTYPE is nonnull and if it is valid
865 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
866 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
867 vect_build_slp_tree. */
870 vect_record_max_nunits (vec_info
*vinfo
, stmt_vec_info stmt_info
,
871 unsigned int group_size
,
872 tree vectype
, poly_uint64
*max_nunits
)
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
878 "Build SLP failed: unsupported data-type in %G\n",
880 /* Fatal mismatch. */
884 /* If populating the vector type requires unrolling then fail
885 before adjusting *max_nunits for basic-block vectorization. */
886 if (is_a
<bb_vec_info
> (vinfo
)
887 && !multiple_p (group_size
, TYPE_VECTOR_SUBPARTS (vectype
)))
889 if (dump_enabled_p ())
890 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
891 "Build SLP failed: unrolling required "
892 "in basic block SLP\n");
893 /* Fatal mismatch. */
897 /* In case of multiple types we need to detect the smallest type. */
898 vect_update_max_nunits (max_nunits
, vectype
);
902 /* Verify if the scalar stmts STMTS are isomorphic, require data
903 permutation or are of unsupported types of operation. Return
904 true if they are, otherwise return false and indicate in *MATCHES
905 which stmts are not isomorphic to the first one. If MATCHES[0]
906 is false then this indicates the comparison could not be
907 carried out or the stmts will never be vectorized by SLP.
909 Note COND_EXPR is possibly isomorphic to another one after swapping its
910 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
911 the first stmt by swapping the two operands of comparison; set SWAP[i]
912 to 2 if stmt I is isormorphic to the first stmt by inverting the code
913 of comparison. Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
914 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
917 vect_build_slp_tree_1 (vec_info
*vinfo
, unsigned char *swap
,
918 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
919 poly_uint64
*max_nunits
, bool *matches
,
920 bool *two_operators
, tree
*node_vectype
)
923 stmt_vec_info first_stmt_info
= stmts
[0];
924 enum tree_code first_stmt_code
= ERROR_MARK
;
925 enum tree_code alt_stmt_code
= ERROR_MARK
;
926 enum tree_code rhs_code
= ERROR_MARK
;
927 enum tree_code first_cond_code
= ERROR_MARK
;
929 bool need_same_oprnds
= false;
930 tree vectype
= NULL_TREE
, first_op1
= NULL_TREE
;
933 machine_mode optab_op2_mode
;
934 machine_mode vec_mode
;
935 stmt_vec_info first_load
= NULL
, prev_first_load
= NULL
;
936 bool first_stmt_load_p
= false, load_p
= false;
937 bool first_stmt_phi_p
= false, phi_p
= false;
938 bool maybe_soft_fail
= false;
939 tree soft_fail_nunits_vectype
= NULL_TREE
;
941 /* For every stmt in NODE find its def stmt/s. */
942 stmt_vec_info stmt_info
;
943 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
945 gimple
*stmt
= stmt_info
->stmt
;
949 if (dump_enabled_p ())
950 dump_printf_loc (MSG_NOTE
, vect_location
, "Build SLP for %G", stmt
);
952 /* Fail to vectorize statements marked as unvectorizable, throw
954 if (!STMT_VINFO_VECTORIZABLE (stmt_info
)
955 || stmt_can_throw_internal (cfun
, stmt
)
956 || gimple_has_volatile_ops (stmt
))
958 if (dump_enabled_p ())
959 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
960 "Build SLP failed: unvectorizable statement %G",
962 /* ??? For BB vectorization we want to commutate operands in a way
963 to shuffle all unvectorizable defs into one operand and have
964 the other still vectorized. The following doesn't reliably
965 work for this though but it's the easiest we can do here. */
966 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
968 /* Fatal mismatch. */
973 lhs
= gimple_get_lhs (stmt
);
974 if (lhs
== NULL_TREE
)
976 if (dump_enabled_p ())
977 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
978 "Build SLP failed: not GIMPLE_ASSIGN nor "
979 "GIMPLE_CALL %G", stmt
);
980 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
982 /* Fatal mismatch. */
988 if (!vect_get_vector_types_for_stmt (vinfo
, stmt_info
, &vectype
,
989 &nunits_vectype
, group_size
))
991 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
993 /* Fatal mismatch. */
997 /* Record nunits required but continue analysis, producing matches[]
998 as if nunits was not an issue. This allows splitting of groups
1001 && !vect_record_max_nunits (vinfo
, stmt_info
, group_size
,
1002 nunits_vectype
, max_nunits
))
1004 gcc_assert (is_a
<bb_vec_info
> (vinfo
));
1005 maybe_soft_fail
= true;
1006 soft_fail_nunits_vectype
= nunits_vectype
;
1009 gcc_assert (vectype
);
1011 gcall
*call_stmt
= dyn_cast
<gcall
*> (stmt
);
1014 rhs_code
= CALL_EXPR
;
1016 if (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
))
1018 else if ((gimple_call_internal_p (call_stmt
)
1019 && (!vectorizable_internal_fn_p
1020 (gimple_call_internal_fn (call_stmt
))))
1021 || gimple_call_tail_p (call_stmt
)
1022 || gimple_call_noreturn_p (call_stmt
)
1023 || !gimple_call_nothrow_p (call_stmt
)
1024 || gimple_call_chain (call_stmt
))
1026 if (dump_enabled_p ())
1027 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1028 "Build SLP failed: unsupported call type %G",
1030 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1032 /* Fatal mismatch. */
1037 else if (gimple_code (stmt
) == GIMPLE_PHI
)
1039 rhs_code
= ERROR_MARK
;
1044 rhs_code
= gimple_assign_rhs_code (stmt
);
1045 load_p
= gimple_vuse (stmt
);
1048 /* Check the operation. */
1051 *node_vectype
= vectype
;
1052 first_stmt_code
= rhs_code
;
1053 first_stmt_load_p
= load_p
;
1054 first_stmt_phi_p
= phi_p
;
1056 /* Shift arguments should be equal in all the packed stmts for a
1057 vector shift with scalar shift operand. */
1058 if (rhs_code
== LSHIFT_EXPR
|| rhs_code
== RSHIFT_EXPR
1059 || rhs_code
== LROTATE_EXPR
1060 || rhs_code
== RROTATE_EXPR
)
1062 vec_mode
= TYPE_MODE (vectype
);
1064 /* First see if we have a vector/vector shift. */
1065 optab
= optab_for_tree_code (rhs_code
, vectype
,
1069 || optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
1071 /* No vector/vector shift, try for a vector/scalar shift. */
1072 optab
= optab_for_tree_code (rhs_code
, vectype
,
1077 if (dump_enabled_p ())
1078 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1079 "Build SLP failed: no optab.\n");
1080 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1082 /* Fatal mismatch. */
1086 icode
= (int) optab_handler (optab
, vec_mode
);
1087 if (icode
== CODE_FOR_nothing
)
1089 if (dump_enabled_p ())
1090 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1091 "Build SLP failed: "
1092 "op not supported by target.\n");
1093 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1095 /* Fatal mismatch. */
1099 optab_op2_mode
= insn_data
[icode
].operand
[2].mode
;
1100 if (!VECTOR_MODE_P (optab_op2_mode
))
1102 need_same_oprnds
= true;
1103 first_op1
= gimple_assign_rhs2 (stmt
);
1107 else if (rhs_code
== WIDEN_LSHIFT_EXPR
)
1109 need_same_oprnds
= true;
1110 first_op1
= gimple_assign_rhs2 (stmt
);
1113 && rhs_code
== BIT_FIELD_REF
)
1115 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt
), 0);
1116 if (!is_a
<bb_vec_info
> (vinfo
)
1117 || TREE_CODE (vec
) != SSA_NAME
1118 || !operand_equal_p (TYPE_SIZE (vectype
),
1119 TYPE_SIZE (TREE_TYPE (vec
))))
1121 if (dump_enabled_p ())
1122 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1123 "Build SLP failed: "
1124 "BIT_FIELD_REF not supported\n");
1125 /* Fatal mismatch. */
1131 && gimple_call_internal_p (call_stmt
, IFN_DIV_POW2
))
1133 need_same_oprnds
= true;
1134 first_op1
= gimple_call_arg (call_stmt
, 1);
1139 if (first_stmt_code
!= rhs_code
1140 && alt_stmt_code
== ERROR_MARK
)
1141 alt_stmt_code
= rhs_code
;
1142 if ((first_stmt_code
!= rhs_code
1143 && (first_stmt_code
!= IMAGPART_EXPR
1144 || rhs_code
!= REALPART_EXPR
)
1145 && (first_stmt_code
!= REALPART_EXPR
1146 || rhs_code
!= IMAGPART_EXPR
)
1147 /* Handle mismatches in plus/minus by computing both
1148 and merging the results. */
1149 && !((first_stmt_code
== PLUS_EXPR
1150 || first_stmt_code
== MINUS_EXPR
)
1151 && (alt_stmt_code
== PLUS_EXPR
1152 || alt_stmt_code
== MINUS_EXPR
)
1153 && rhs_code
== alt_stmt_code
)
1154 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1155 && (first_stmt_code
== ARRAY_REF
1156 || first_stmt_code
== BIT_FIELD_REF
1157 || first_stmt_code
== INDIRECT_REF
1158 || first_stmt_code
== COMPONENT_REF
1159 || first_stmt_code
== MEM_REF
)))
1160 || first_stmt_load_p
!= load_p
1161 || first_stmt_phi_p
!= phi_p
)
1163 if (dump_enabled_p ())
1165 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1166 "Build SLP failed: different operation "
1167 "in stmt %G", stmt
);
1168 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1169 "original stmt %G", first_stmt_info
->stmt
);
1175 if (need_same_oprnds
)
1177 tree other_op1
= (call_stmt
1178 ? gimple_call_arg (call_stmt
, 1)
1179 : gimple_assign_rhs2 (stmt
));
1180 if (!operand_equal_p (first_op1
, other_op1
, 0))
1182 if (dump_enabled_p ())
1183 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1184 "Build SLP failed: different shift "
1185 "arguments in %G", stmt
);
1191 && first_stmt_code
== BIT_FIELD_REF
1192 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info
->stmt
), 0)
1193 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0)))
1195 if (dump_enabled_p ())
1196 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1197 "Build SLP failed: different BIT_FIELD_REF "
1198 "arguments in %G", stmt
);
1203 if (!load_p
&& rhs_code
== CALL_EXPR
)
1205 if (!compatible_calls_p (as_a
<gcall
*> (stmts
[0]->stmt
),
1206 as_a
<gcall
*> (stmt
)))
1208 if (dump_enabled_p ())
1209 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1210 "Build SLP failed: different calls in %G",
1218 && (gimple_bb (first_stmt_info
->stmt
)
1219 != gimple_bb (stmt_info
->stmt
)))
1221 if (dump_enabled_p ())
1222 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1223 "Build SLP failed: different BB for PHI "
1229 if (!types_compatible_p (vectype
, *node_vectype
))
1231 if (dump_enabled_p ())
1232 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1233 "Build SLP failed: different vector type "
1240 /* Grouped store or load. */
1241 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1243 if (REFERENCE_CLASS_P (lhs
))
1251 first_load
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
1252 if (prev_first_load
)
1254 /* Check that there are no loads from different interleaving
1255 chains in the same node. */
1256 if (prev_first_load
!= first_load
)
1258 if (dump_enabled_p ())
1259 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
1261 "Build SLP failed: different "
1262 "interleaving chains in one node %G",
1269 prev_first_load
= first_load
;
1271 } /* Grouped access. */
1276 /* Not grouped load. */
1277 if (dump_enabled_p ())
1278 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1279 "Build SLP failed: not grouped load %G", stmt
);
1281 /* FORNOW: Not grouped loads are not supported. */
1282 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1284 /* Fatal mismatch. */
1289 /* Not memory operation. */
1291 && TREE_CODE_CLASS (rhs_code
) != tcc_binary
1292 && TREE_CODE_CLASS (rhs_code
) != tcc_unary
1293 && TREE_CODE_CLASS (rhs_code
) != tcc_expression
1294 && TREE_CODE_CLASS (rhs_code
) != tcc_comparison
1295 && rhs_code
!= VIEW_CONVERT_EXPR
1296 && rhs_code
!= CALL_EXPR
1297 && rhs_code
!= BIT_FIELD_REF
)
1299 if (dump_enabled_p ())
1300 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1301 "Build SLP failed: operation unsupported %G",
1303 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1305 /* Fatal mismatch. */
1310 if (rhs_code
== COND_EXPR
)
1312 tree cond_expr
= gimple_assign_rhs1 (stmt
);
1313 enum tree_code cond_code
= TREE_CODE (cond_expr
);
1314 enum tree_code swap_code
= ERROR_MARK
;
1315 enum tree_code invert_code
= ERROR_MARK
;
1318 first_cond_code
= TREE_CODE (cond_expr
);
1319 else if (TREE_CODE_CLASS (cond_code
) == tcc_comparison
)
1321 bool honor_nans
= HONOR_NANS (TREE_OPERAND (cond_expr
, 0));
1322 swap_code
= swap_tree_comparison (cond_code
);
1323 invert_code
= invert_tree_comparison (cond_code
, honor_nans
);
1326 if (first_cond_code
== cond_code
)
1328 /* Isomorphic can be achieved by swapping. */
1329 else if (first_cond_code
== swap_code
)
1331 /* Isomorphic can be achieved by inverting. */
1332 else if (first_cond_code
== invert_code
)
1336 if (dump_enabled_p ())
1337 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1338 "Build SLP failed: different"
1339 " operation %G", stmt
);
1349 for (i
= 0; i
< group_size
; ++i
)
1353 /* If we allowed a two-operation SLP node verify the target can cope
1354 with the permute we are going to use. */
1355 if (alt_stmt_code
!= ERROR_MARK
1356 && TREE_CODE_CLASS (alt_stmt_code
) != tcc_reference
)
1358 *two_operators
= true;
1361 if (maybe_soft_fail
)
1363 unsigned HOST_WIDE_INT const_nunits
;
1364 if (!TYPE_VECTOR_SUBPARTS
1365 (soft_fail_nunits_vectype
).is_constant (&const_nunits
)
1366 || const_nunits
> group_size
)
1370 /* With constant vector elements simulate a mismatch at the
1371 point we need to split. */
1372 unsigned tail
= group_size
& (const_nunits
- 1);
1373 memset (&matches
[group_size
- tail
], 0, sizeof (bool) * tail
);
1381 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1382 Note we never remove apart from at destruction time so we do not
1383 need a special value for deleted that differs from empty. */
1386 typedef vec
<stmt_vec_info
> value_type
;
1387 typedef vec
<stmt_vec_info
> compare_type
;
1388 static inline hashval_t
hash (value_type
);
1389 static inline bool equal (value_type existing
, value_type candidate
);
1390 static inline bool is_empty (value_type x
) { return !x
.exists (); }
1391 static inline bool is_deleted (value_type x
) { return !x
.exists (); }
1392 static const bool empty_zero_p
= true;
1393 static inline void mark_empty (value_type
&x
) { x
.release (); }
1394 static inline void mark_deleted (value_type
&x
) { x
.release (); }
1395 static inline void remove (value_type
&x
) { x
.release (); }
1398 bst_traits::hash (value_type x
)
1401 for (unsigned i
= 0; i
< x
.length (); ++i
)
1402 h
.add_int (gimple_uid (x
[i
]->stmt
));
1406 bst_traits::equal (value_type existing
, value_type candidate
)
1408 if (existing
.length () != candidate
.length ())
1410 for (unsigned i
= 0; i
< existing
.length (); ++i
)
1411 if (existing
[i
] != candidate
[i
])
1416 typedef hash_map
<vec
<stmt_vec_info
>, slp_tree
,
1417 simple_hashmap_traits
<bst_traits
, slp_tree
> >
1418 scalar_stmts_to_slp_tree_map_t
;
1421 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1422 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1423 poly_uint64
*max_nunits
,
1424 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1425 scalar_stmts_to_slp_tree_map_t
*bst_map
);
1428 vect_build_slp_tree (vec_info
*vinfo
,
1429 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1430 poly_uint64
*max_nunits
,
1431 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1432 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1434 if (slp_tree
*leader
= bst_map
->get (stmts
))
1436 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_NOTE
, vect_location
, "re-using %sSLP tree %p\n",
1438 *leader
? "" : "failed ", *leader
);
1441 SLP_TREE_REF_COUNT (*leader
)++;
1442 vect_update_max_nunits (max_nunits
, (*leader
)->max_nunits
);
1448 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1449 so we can pick up backedge destinations during discovery. */
1450 slp_tree res
= new _slp_tree
;
1451 SLP_TREE_DEF_TYPE (res
) = vect_internal_def
;
1452 SLP_TREE_SCALAR_STMTS (res
) = stmts
;
1453 bst_map
->put (stmts
.copy (), res
);
1457 if (dump_enabled_p ())
1458 dump_printf_loc (MSG_NOTE
, vect_location
,
1459 "SLP discovery limit exceeded\n");
1460 bool existed_p
= bst_map
->put (stmts
, NULL
);
1461 gcc_assert (existed_p
);
1462 /* Mark the node invalid so we can detect those when still in use
1463 as backedge destinations. */
1464 SLP_TREE_SCALAR_STMTS (res
) = vNULL
;
1465 SLP_TREE_DEF_TYPE (res
) = vect_uninitialized_def
;
1466 vect_free_slp_tree (res
);
1467 memset (matches
, 0, sizeof (bool) * group_size
);
1472 poly_uint64 this_max_nunits
= 1;
1473 slp_tree res_
= vect_build_slp_tree_2 (vinfo
, res
, stmts
, group_size
,
1475 matches
, limit
, tree_size
, bst_map
);
1478 bool existed_p
= bst_map
->put (stmts
, NULL
);
1479 gcc_assert (existed_p
);
1480 /* Mark the node invalid so we can detect those when still in use
1481 as backedge destinations. */
1482 SLP_TREE_SCALAR_STMTS (res
) = vNULL
;
1483 SLP_TREE_DEF_TYPE (res
) = vect_uninitialized_def
;
1484 vect_free_slp_tree (res
);
1488 gcc_assert (res_
== res
);
1489 res
->max_nunits
= this_max_nunits
;
1490 vect_update_max_nunits (max_nunits
, this_max_nunits
);
1491 /* Keep a reference for the bst_map use. */
1492 SLP_TREE_REF_COUNT (res
)++;
1497 /* Recursively build an SLP tree starting from NODE.
1498 Fail (and return a value not equal to zero) if def-stmts are not
1499 isomorphic, require data permutation or are of unsupported types of
1500 operation. Otherwise, return 0.
1501 The value returned is the depth in the SLP tree where a mismatch
1505 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1506 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1507 poly_uint64
*max_nunits
,
1508 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1509 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1511 unsigned nops
, i
, this_tree_size
= 0;
1512 poly_uint64 this_max_nunits
= *max_nunits
;
1516 stmt_vec_info stmt_info
= stmts
[0];
1517 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1518 nops
= gimple_call_num_args (stmt
);
1519 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
1521 nops
= gimple_num_ops (stmt
) - 1;
1522 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
1525 else if (gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1526 nops
= gimple_phi_num_args (phi
);
1530 /* If the SLP node is a PHI (induction or reduction), terminate
1532 bool *skip_args
= XALLOCAVEC (bool, nops
);
1533 memset (skip_args
, 0, sizeof (bool) * nops
);
1534 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
1535 if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1537 tree scalar_type
= TREE_TYPE (PHI_RESULT (stmt
));
1538 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
1540 if (!vect_record_max_nunits (vinfo
, stmt_info
, group_size
, vectype
,
1544 vect_def_type def_type
= STMT_VINFO_DEF_TYPE (stmt_info
);
1545 if (def_type
== vect_induction_def
)
1547 /* Induction PHIs are not cycles but walk the initial
1548 value. Only for inner loops through, for outer loops
1549 we need to pick up the value from the actual PHIs
1550 to more easily support peeling and epilogue vectorization. */
1551 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1552 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1553 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1556 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1558 else if (def_type
== vect_reduction_def
1559 || def_type
== vect_double_reduction_def
1560 || def_type
== vect_nested_cycle
)
1562 /* Else def types have to match. */
1563 stmt_vec_info other_info
;
1564 bool all_same
= true;
1565 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1567 if (STMT_VINFO_DEF_TYPE (other_info
) != def_type
)
1569 if (other_info
!= stmt_info
)
1572 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1573 /* Reduction initial values are not explicitely represented. */
1574 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1575 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1576 /* Reduction chain backedge defs are filled manually.
1577 ??? Need a better way to identify a SLP reduction chain PHI.
1578 Or a better overall way to SLP match those. */
1579 if (all_same
&& def_type
== vect_reduction_def
)
1580 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1582 else if (def_type
!= vect_internal_def
)
1587 bool two_operators
= false;
1588 unsigned char *swap
= XALLOCAVEC (unsigned char, group_size
);
1589 tree vectype
= NULL_TREE
;
1590 if (!vect_build_slp_tree_1 (vinfo
, swap
, stmts
, group_size
,
1591 &this_max_nunits
, matches
, &two_operators
,
1595 /* If the SLP node is a load, terminate the recursion unless masked. */
1596 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1597 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1599 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1602 gcc_assert (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
));
1607 *max_nunits
= this_max_nunits
;
1609 node
= vect_create_new_slp_node (node
, stmts
, 0);
1610 SLP_TREE_VECTYPE (node
) = vectype
;
1611 /* And compute the load permutation. Whether it is actually
1612 a permutation depends on the unrolling factor which is
1614 vec
<unsigned> load_permutation
;
1616 stmt_vec_info load_info
;
1617 load_permutation
.create (group_size
);
1618 stmt_vec_info first_stmt_info
1619 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
1620 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
1622 int load_place
= vect_get_place_in_interleaving_chain
1623 (load_info
, first_stmt_info
);
1624 gcc_assert (load_place
!= -1);
1625 load_permutation
.safe_push (load_place
);
1627 SLP_TREE_LOAD_PERMUTATION (node
) = load_permutation
;
1631 else if (gimple_assign_single_p (stmt_info
->stmt
)
1632 && !gimple_vuse (stmt_info
->stmt
)
1633 && gimple_assign_rhs_code (stmt_info
->stmt
) == BIT_FIELD_REF
)
1635 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1636 the same SSA name vector of a compatible type to vectype. */
1637 vec
<std::pair
<unsigned, unsigned> > lperm
= vNULL
;
1638 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0);
1639 stmt_vec_info estmt_info
;
1640 FOR_EACH_VEC_ELT (stmts
, i
, estmt_info
)
1642 gassign
*estmt
= as_a
<gassign
*> (estmt_info
->stmt
);
1643 tree bfref
= gimple_assign_rhs1 (estmt
);
1645 if (!known_eq (bit_field_size (bfref
),
1646 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype
))))
1647 || !constant_multiple_p (bit_field_offset (bfref
),
1648 bit_field_size (bfref
), &lane
))
1653 lperm
.safe_push (std::make_pair (0, (unsigned)lane
));
1655 slp_tree vnode
= vect_create_new_slp_node (vNULL
);
1656 /* ??? We record vectype here but we hide eventually necessary
1657 punning and instead rely on code generation to materialize
1658 VIEW_CONVERT_EXPRs as necessary. We instead should make
1659 this explicit somehow. */
1660 SLP_TREE_VECTYPE (vnode
) = vectype
;
1661 SLP_TREE_VEC_DEFS (vnode
).safe_push (vec
);
1662 /* We are always building a permutation node even if it is an identity
1663 permute to shield the rest of the vectorizer from the odd node
1664 representing an actual vector without any scalar ops.
1665 ??? We could hide it completely with making the permute node
1667 node
= vect_create_new_slp_node (node
, stmts
, 1);
1668 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1669 SLP_TREE_LANE_PERMUTATION (node
) = lperm
;
1670 SLP_TREE_VECTYPE (node
) = vectype
;
1671 SLP_TREE_CHILDREN (node
).quick_push (vnode
);
1675 /* Get at the operands, verifying they are compatible. */
1676 vec
<slp_oprnd_info
> oprnds_info
= vect_create_oprnd_info (nops
, group_size
);
1677 slp_oprnd_info oprnd_info
;
1678 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
1680 int res
= vect_get_and_check_slp_defs (vinfo
, swap
[i
], skip_args
,
1681 stmts
, i
, &oprnds_info
);
1683 matches
[(res
== -1) ? 0 : i
] = false;
1687 for (i
= 0; i
< group_size
; ++i
)
1690 vect_free_oprnd_info (oprnds_info
);
1695 auto_vec
<slp_tree
, 4> children
;
1697 stmt_info
= stmts
[0];
1699 /* Create SLP_TREE nodes for the definition node/s. */
1700 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
1705 /* We're skipping certain operands from processing, for example
1706 outer loop reduction initial defs. */
1709 children
.safe_push (NULL
);
1713 if (oprnd_info
->first_dt
== vect_uninitialized_def
)
1715 /* COND_EXPR have one too many eventually if the condition
1717 gcc_assert (i
== 3 && nops
== 4);
1721 if (is_a
<bb_vec_info
> (vinfo
)
1722 && oprnd_info
->first_dt
== vect_internal_def
1723 && !oprnd_info
->any_pattern
)
1725 /* For BB vectorization, if all defs are the same do not
1726 bother to continue the build along the single-lane
1727 graph but use a splat of the scalar value. */
1728 stmt_vec_info first_def
= oprnd_info
->def_stmts
[0];
1729 for (j
= 1; j
< group_size
; ++j
)
1730 if (oprnd_info
->def_stmts
[j
] != first_def
)
1733 /* But avoid doing this for loads where we may be
1734 able to CSE things, unless the stmt is not
1736 && (!STMT_VINFO_VECTORIZABLE (first_def
)
1737 || !gimple_vuse (first_def
->stmt
)))
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_NOTE
, vect_location
,
1741 "Using a splat of the uniform operand\n");
1742 oprnd_info
->first_dt
= vect_external_def
;
1746 if (oprnd_info
->first_dt
== vect_external_def
1747 || oprnd_info
->first_dt
== vect_constant_def
)
1749 slp_tree invnode
= vect_create_new_slp_node (oprnd_info
->ops
);
1750 SLP_TREE_DEF_TYPE (invnode
) = oprnd_info
->first_dt
;
1751 oprnd_info
->ops
= vNULL
;
1752 children
.safe_push (invnode
);
1756 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1757 group_size
, &this_max_nunits
,
1759 &this_tree_size
, bst_map
)) != NULL
)
1761 oprnd_info
->def_stmts
= vNULL
;
1762 children
.safe_push (child
);
1766 /* If the SLP build for operand zero failed and operand zero
1767 and one can be commutated try that for the scalar stmts
1768 that failed the match. */
1770 /* A first scalar stmt mismatch signals a fatal mismatch. */
1772 /* ??? For COND_EXPRs we can swap the comparison operands
1773 as well as the arms under some constraints. */
1775 && oprnds_info
[1]->first_dt
== vect_internal_def
1776 && is_gimple_assign (stmt_info
->stmt
)
1777 /* Swapping operands for reductions breaks assumptions later on. */
1778 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
1779 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
1781 /* See whether we can swap the matching or the non-matching
1783 bool swap_not_matching
= true;
1786 for (j
= 0; j
< group_size
; ++j
)
1788 if (matches
[j
] != !swap_not_matching
)
1790 stmt_vec_info stmt_info
= stmts
[j
];
1791 /* Verify if we can swap operands of this stmt. */
1792 gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
1794 || !commutative_tree_code (gimple_assign_rhs_code (stmt
)))
1796 if (!swap_not_matching
)
1798 swap_not_matching
= false;
1803 while (j
!= group_size
);
1805 /* Swap mismatched definition stmts. */
1806 if (dump_enabled_p ())
1807 dump_printf_loc (MSG_NOTE
, vect_location
,
1808 "Re-trying with swapped operands of stmts ");
1809 for (j
= 0; j
< group_size
; ++j
)
1810 if (matches
[j
] == !swap_not_matching
)
1812 std::swap (oprnds_info
[0]->def_stmts
[j
],
1813 oprnds_info
[1]->def_stmts
[j
]);
1814 std::swap (oprnds_info
[0]->ops
[j
],
1815 oprnds_info
[1]->ops
[j
]);
1816 if (dump_enabled_p ())
1817 dump_printf (MSG_NOTE
, "%d ", j
);
1819 if (dump_enabled_p ())
1820 dump_printf (MSG_NOTE
, "\n");
1821 /* And try again with scratch 'matches' ... */
1822 bool *tem
= XALLOCAVEC (bool, group_size
);
1823 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1824 group_size
, &this_max_nunits
,
1826 &this_tree_size
, bst_map
)) != NULL
)
1828 oprnd_info
->def_stmts
= vNULL
;
1829 children
.safe_push (child
);
1835 /* If the SLP build failed and we analyze a basic-block
1836 simply treat nodes we fail to build as externally defined
1837 (and thus build vectors from the scalar defs).
1838 The cost model will reject outright expensive cases.
1839 ??? This doesn't treat cases where permutation ultimatively
1840 fails (or we don't try permutation below). Ideally we'd
1841 even compute a permutation that will end up with the maximum
1843 if (is_a
<bb_vec_info
> (vinfo
)
1844 /* ??? Rejecting patterns this way doesn't work. We'd have to
1845 do extra work to cancel the pattern so the uses see the
1847 && !is_pattern_stmt_p (stmt_info
)
1848 && !oprnd_info
->any_pattern
)
1850 /* But if there's a leading vector sized set of matching stmts
1851 fail here so we can split the group. This matches the condition
1852 vect_analyze_slp_instance uses. */
1853 /* ??? We might want to split here and combine the results to support
1854 multiple vector sizes better. */
1855 for (j
= 0; j
< group_size
; ++j
)
1858 if (!known_ge (j
, TYPE_VECTOR_SUBPARTS (vectype
)))
1860 if (dump_enabled_p ())
1861 dump_printf_loc (MSG_NOTE
, vect_location
,
1862 "Building vector operands from scalars\n");
1864 child
= vect_create_new_slp_node (oprnd_info
->ops
);
1865 children
.safe_push (child
);
1866 oprnd_info
->ops
= vNULL
;
1871 gcc_assert (child
== NULL
);
1872 FOR_EACH_VEC_ELT (children
, j
, child
)
1874 vect_free_slp_tree (child
);
1875 vect_free_oprnd_info (oprnds_info
);
1879 vect_free_oprnd_info (oprnds_info
);
1881 /* If we have all children of a child built up from uniform scalars
1882 or does more than one possibly expensive vector construction then
1883 just throw that away, causing it built up from scalars.
1884 The exception is the SLP node for the vector store. */
1885 if (is_a
<bb_vec_info
> (vinfo
)
1886 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1887 /* ??? Rejecting patterns this way doesn't work. We'd have to
1888 do extra work to cancel the pattern so the uses see the
1890 && !is_pattern_stmt_p (stmt_info
))
1894 bool all_uniform_p
= true;
1895 unsigned n_vector_builds
= 0;
1896 FOR_EACH_VEC_ELT (children
, j
, child
)
1900 else if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
1901 all_uniform_p
= false;
1902 else if (!vect_slp_tree_uniform_p (child
))
1904 all_uniform_p
= false;
1905 if (SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
1910 || n_vector_builds
> 1
1911 || (n_vector_builds
== children
.length ()
1912 && is_a
<gphi
*> (stmt_info
->stmt
)))
1916 FOR_EACH_VEC_ELT (children
, j
, child
)
1918 vect_free_slp_tree (child
);
1920 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_NOTE
, vect_location
,
1922 "Building parent vector operands from "
1923 "scalars instead\n");
1928 *tree_size
+= this_tree_size
+ 1;
1929 *max_nunits
= this_max_nunits
;
1933 /* ??? We'd likely want to either cache in bst_map sth like
1934 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1935 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1936 explicit stmts to put in so the keying on 'stmts' doesn't
1937 work (but we have the same issue with nodes that use 'ops'). */
1938 slp_tree one
= new _slp_tree
;
1939 slp_tree two
= new _slp_tree
;
1940 SLP_TREE_DEF_TYPE (one
) = vect_internal_def
;
1941 SLP_TREE_DEF_TYPE (two
) = vect_internal_def
;
1942 SLP_TREE_VECTYPE (one
) = vectype
;
1943 SLP_TREE_VECTYPE (two
) = vectype
;
1944 SLP_TREE_CHILDREN (one
).safe_splice (children
);
1945 SLP_TREE_CHILDREN (two
).safe_splice (children
);
1947 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two
), i
, child
)
1948 SLP_TREE_REF_COUNT (child
)++;
1950 /* Here we record the original defs since this
1951 node represents the final lane configuration. */
1952 node
= vect_create_new_slp_node (node
, stmts
, 2);
1953 SLP_TREE_VECTYPE (node
) = vectype
;
1954 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1955 SLP_TREE_CHILDREN (node
).quick_push (one
);
1956 SLP_TREE_CHILDREN (node
).quick_push (two
);
1957 gassign
*stmt
= as_a
<gassign
*> (stmts
[0]->stmt
);
1958 enum tree_code code0
= gimple_assign_rhs_code (stmt
);
1959 enum tree_code ocode
= ERROR_MARK
;
1960 stmt_vec_info ostmt_info
;
1962 FOR_EACH_VEC_ELT (stmts
, i
, ostmt_info
)
1964 gassign
*ostmt
= as_a
<gassign
*> (ostmt_info
->stmt
);
1965 if (gimple_assign_rhs_code (ostmt
) != code0
)
1967 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (1, i
));
1968 ocode
= gimple_assign_rhs_code (ostmt
);
1972 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (0, i
));
1974 SLP_TREE_CODE (one
) = code0
;
1975 SLP_TREE_CODE (two
) = ocode
;
1976 SLP_TREE_LANES (one
) = stmts
.length ();
1977 SLP_TREE_LANES (two
) = stmts
.length ();
1978 SLP_TREE_REPRESENTATIVE (one
) = stmts
[0];
1979 SLP_TREE_REPRESENTATIVE (two
) = stmts
[j
];
1983 node
= vect_create_new_slp_node (node
, stmts
, nops
);
1984 SLP_TREE_VECTYPE (node
) = vectype
;
1985 SLP_TREE_CHILDREN (node
).splice (children
);
1989 /* Dump a single SLP tree NODE. */
1992 vect_print_slp_tree (dump_flags_t dump_kind
, dump_location_t loc
,
1997 stmt_vec_info stmt_info
;
2000 dump_metadata_t
metadata (dump_kind
, loc
.get_impl_location ());
2001 dump_user_location_t user_loc
= loc
.get_user_location ();
2002 dump_printf_loc (metadata
, user_loc
, "node%s %p (max_nunits=%u, refcnt=%u)\n",
2003 SLP_TREE_DEF_TYPE (node
) == vect_external_def
2005 : (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
2008 estimated_poly_value (node
->max_nunits
),
2009 SLP_TREE_REF_COUNT (node
));
2010 if (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
)
2012 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
2013 dump_printf_loc (metadata
, user_loc
, "op: VEC_PERM_EXPR\n");
2015 dump_printf_loc (metadata
, user_loc
, "op template: %G",
2016 SLP_TREE_REPRESENTATIVE (node
)->stmt
);
2018 if (SLP_TREE_SCALAR_STMTS (node
).exists ())
2019 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2020 dump_printf_loc (metadata
, user_loc
, "\tstmt %u %G", i
, stmt_info
->stmt
);
2023 dump_printf_loc (metadata
, user_loc
, "\t{ ");
2024 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
2025 dump_printf (metadata
, "%T%s ", op
,
2026 i
< SLP_TREE_SCALAR_OPS (node
).length () - 1 ? "," : "");
2027 dump_printf (metadata
, "}\n");
2029 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2031 dump_printf_loc (metadata
, user_loc
, "\tload permutation {");
2032 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node
), i
, j
)
2033 dump_printf (dump_kind
, " %u", j
);
2034 dump_printf (dump_kind
, " }\n");
2036 if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
2038 dump_printf_loc (metadata
, user_loc
, "\tlane permutation {");
2039 for (i
= 0; i
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++i
)
2040 dump_printf (dump_kind
, " %u[%u]",
2041 SLP_TREE_LANE_PERMUTATION (node
)[i
].first
,
2042 SLP_TREE_LANE_PERMUTATION (node
)[i
].second
);
2043 dump_printf (dump_kind
, " }\n");
2045 if (SLP_TREE_CHILDREN (node
).is_empty ())
2047 dump_printf_loc (metadata
, user_loc
, "\tchildren");
2048 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2049 dump_printf (dump_kind
, " %p", (void *)child
);
2050 dump_printf (dump_kind
, "\n");
2054 debug (slp_tree node
)
2056 debug_dump_context ctx
;
2057 vect_print_slp_tree (MSG_NOTE
,
2058 dump_location_t::from_location_t (UNKNOWN_LOCATION
),
2062 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2065 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
2066 slp_tree node
, hash_set
<slp_tree
> &visited
)
2071 if (visited
.add (node
))
2074 vect_print_slp_tree (dump_kind
, loc
, node
);
2076 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2078 vect_print_slp_graph (dump_kind
, loc
, child
, visited
);
2082 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
2085 hash_set
<slp_tree
> visited
;
2086 vect_print_slp_graph (dump_kind
, loc
, entry
, visited
);
2089 /* Mark the tree rooted at NODE with PURE_SLP. */
2092 vect_mark_slp_stmts (slp_tree node
, hash_set
<slp_tree
> &visited
)
2095 stmt_vec_info stmt_info
;
2098 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2101 if (visited
.add (node
))
2104 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2105 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
2107 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2109 vect_mark_slp_stmts (child
, visited
);
2113 vect_mark_slp_stmts (slp_tree node
)
2115 hash_set
<slp_tree
> visited
;
2116 vect_mark_slp_stmts (node
, visited
);
2119 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2122 vect_mark_slp_stmts_relevant (slp_tree node
, hash_set
<slp_tree
> &visited
)
2125 stmt_vec_info stmt_info
;
2128 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2131 if (visited
.add (node
))
2134 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2136 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info
)
2137 || STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
);
2138 STMT_VINFO_RELEVANT (stmt_info
) = vect_used_in_scope
;
2141 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2143 vect_mark_slp_stmts_relevant (child
, visited
);
2147 vect_mark_slp_stmts_relevant (slp_tree node
)
2149 hash_set
<slp_tree
> visited
;
2150 vect_mark_slp_stmts_relevant (node
, visited
);
2154 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2157 vect_gather_slp_loads (vec
<slp_tree
> &loads
, slp_tree node
,
2158 hash_set
<slp_tree
> &visited
)
2160 if (!node
|| visited
.add (node
))
2163 if (SLP_TREE_CHILDREN (node
).length () == 0)
2165 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2167 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2168 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2169 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
2170 loads
.safe_push (node
);
2176 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2177 vect_gather_slp_loads (loads
, child
, visited
);
2182 /* Find the last store in SLP INSTANCE. */
2185 vect_find_last_scalar_stmt_in_slp (slp_tree node
)
2187 stmt_vec_info last
= NULL
;
2188 stmt_vec_info stmt_vinfo
;
2190 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2192 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2193 last
= last
? get_later_stmt (stmt_vinfo
, last
) : stmt_vinfo
;
2199 /* Find the first stmt in NODE. */
2202 vect_find_first_scalar_stmt_in_slp (slp_tree node
)
2204 stmt_vec_info first
= NULL
;
2205 stmt_vec_info stmt_vinfo
;
2207 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2209 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2211 || get_later_stmt (stmt_vinfo
, first
) == first
)
2218 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2219 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2220 (also containing the first GROUP1_SIZE stmts, since stores are
2221 consecutive), the second containing the remainder.
2222 Return the first stmt in the second group. */
2224 static stmt_vec_info
2225 vect_split_slp_store_group (stmt_vec_info first_vinfo
, unsigned group1_size
)
2227 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo
) == first_vinfo
);
2228 gcc_assert (group1_size
> 0);
2229 int group2_size
= DR_GROUP_SIZE (first_vinfo
) - group1_size
;
2230 gcc_assert (group2_size
> 0);
2231 DR_GROUP_SIZE (first_vinfo
) = group1_size
;
2233 stmt_vec_info stmt_info
= first_vinfo
;
2234 for (unsigned i
= group1_size
; i
> 1; i
--)
2236 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2237 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2239 /* STMT is now the last element of the first group. */
2240 stmt_vec_info group2
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2241 DR_GROUP_NEXT_ELEMENT (stmt_info
) = 0;
2243 DR_GROUP_SIZE (group2
) = group2_size
;
2244 for (stmt_info
= group2
; stmt_info
;
2245 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
))
2247 DR_GROUP_FIRST_ELEMENT (stmt_info
) = group2
;
2248 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2251 /* For the second group, the DR_GROUP_GAP is that before the original group,
2252 plus skipping over the first vector. */
2253 DR_GROUP_GAP (group2
) = DR_GROUP_GAP (first_vinfo
) + group1_size
;
2255 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2256 DR_GROUP_GAP (first_vinfo
) += group2_size
;
2258 if (dump_enabled_p ())
2259 dump_printf_loc (MSG_NOTE
, vect_location
, "Split group into %d and %d\n",
2260 group1_size
, group2_size
);
2265 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2266 statements and a vector of NUNITS elements. */
2269 calculate_unrolling_factor (poly_uint64 nunits
, unsigned int group_size
)
2271 return exact_div (common_multiple (nunits
, group_size
), group_size
);
2274 /* Helper that checks to see if a node is a load node. */
2277 vect_is_slp_load_node (slp_tree root
)
2279 return SLP_TREE_DEF_TYPE (root
) == vect_internal_def
2280 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root
))
2281 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root
)));
2285 /* Helper function of optimize_load_redistribution that performs the operation
2289 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t
*bst_map
,
2290 vec_info
*vinfo
, unsigned int group_size
,
2291 hash_map
<slp_tree
, slp_tree
> *load_map
,
2294 if (slp_tree
*leader
= load_map
->get (root
))
2300 /* For now, we don't know anything about externals so do not do anything. */
2301 if (SLP_TREE_DEF_TYPE (root
) != vect_internal_def
)
2303 else if (SLP_TREE_CODE (root
) == VEC_PERM_EXPR
)
2305 /* First convert this node into a load node and add it to the leaves
2306 list and flatten the permute from a lane to a load one. If it's
2307 unneeded it will be elided later. */
2308 vec
<stmt_vec_info
> stmts
;
2309 stmts
.create (SLP_TREE_LANES (root
));
2310 lane_permutation_t lane_perm
= SLP_TREE_LANE_PERMUTATION (root
);
2311 for (unsigned j
= 0; j
< lane_perm
.length (); j
++)
2313 std::pair
<unsigned, unsigned> perm
= lane_perm
[j
];
2314 node
= SLP_TREE_CHILDREN (root
)[perm
.first
];
2316 if (!vect_is_slp_load_node (node
)
2317 || SLP_TREE_CHILDREN (node
).exists ())
2323 stmts
.quick_push (SLP_TREE_SCALAR_STMTS (node
)[perm
.second
]);
2326 if (dump_enabled_p ())
2327 dump_printf_loc (MSG_NOTE
, vect_location
,
2328 "converting stmts on permute node %p\n", root
);
2330 bool *matches
= XALLOCAVEC (bool, group_size
);
2331 poly_uint64 max_nunits
= 1;
2332 unsigned tree_size
= 0, limit
= 1;
2333 node
= vect_build_slp_tree (vinfo
, stmts
, group_size
, &max_nunits
,
2334 matches
, &limit
, &tree_size
, bst_map
);
2338 load_map
->put (root
, node
);
2343 load_map
->put (root
, NULL
);
2345 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root
), i
, node
)
2348 = optimize_load_redistribution_1 (bst_map
, vinfo
, group_size
, load_map
,
2352 SLP_TREE_REF_COUNT (value
)++;
2353 SLP_TREE_CHILDREN (root
)[i
] = value
;
2354 /* ??? We know the original leafs of the replaced nodes will
2355 be referenced by bst_map, only the permutes created by
2356 pattern matching are not. */
2357 if (SLP_TREE_REF_COUNT (node
) == 1)
2358 load_map
->remove (node
);
2359 vect_free_slp_tree (node
);
2366 /* Temporary workaround for loads not being CSEd during SLP build. This
2367 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2368 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2369 same DR such that the final operation is equal to a permuted load. Such
2370 NODES are then directly converted into LOADS themselves. The nodes are
2371 CSEd using BST_MAP. */
2374 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t
*bst_map
,
2375 vec_info
*vinfo
, unsigned int group_size
,
2376 hash_map
<slp_tree
, slp_tree
> *load_map
,
2382 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root
), i
, node
)
2385 = optimize_load_redistribution_1 (bst_map
, vinfo
, group_size
, load_map
,
2389 SLP_TREE_REF_COUNT (value
)++;
2390 SLP_TREE_CHILDREN (root
)[i
] = value
;
2391 /* ??? We know the original leafs of the replaced nodes will
2392 be referenced by bst_map, only the permutes created by
2393 pattern matching are not. */
2394 if (SLP_TREE_REF_COUNT (node
) == 1)
2395 load_map
->remove (node
);
2396 vect_free_slp_tree (node
);
2401 /* Helper function of vect_match_slp_patterns.
2403 Attempts to match patterns against the slp tree rooted in REF_NODE using
2404 VINFO. Patterns are matched in post-order traversal.
2406 If matching is successful the value in REF_NODE is updated and returned, if
2407 not then it is returned unchanged. */
2410 vect_match_slp_patterns_2 (slp_tree
*ref_node
, vec_info
*vinfo
,
2411 slp_tree_to_load_perm_map_t
*perm_cache
,
2412 hash_set
<slp_tree
> *visited
)
2415 slp_tree node
= *ref_node
;
2416 bool found_p
= false;
2417 if (!node
|| visited
->add (node
))
2421 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2422 found_p
|= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node
)[i
],
2423 vinfo
, perm_cache
, visited
);
2425 for (unsigned x
= 0; x
< num__slp_patterns
; x
++)
2427 vect_pattern
*pattern
= slp_patterns
[x
] (perm_cache
, ref_node
);
2430 pattern
->build (vinfo
);
2439 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
2442 The modified tree is returned. Patterns are tried in order and multiple
2443 patterns may match. */
2446 vect_match_slp_patterns (slp_instance instance
, vec_info
*vinfo
,
2447 hash_set
<slp_tree
> *visited
,
2448 slp_tree_to_load_perm_map_t
*perm_cache
)
2450 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2451 slp_tree
*ref_node
= &SLP_INSTANCE_TREE (instance
);
2453 if (dump_enabled_p ())
2454 dump_printf_loc (MSG_NOTE
, vect_location
,
2455 "Analyzing SLP tree %p for patterns\n",
2456 SLP_INSTANCE_TREE (instance
));
2458 return vect_match_slp_patterns_2 (ref_node
, vinfo
, perm_cache
, visited
);
2461 /* Analyze an SLP instance starting from a group of grouped stores. Call
2462 vect_build_slp_tree to build a tree of packed stmts if possible.
2463 Return FALSE if it's impossible to SLP any stmt in the loop. */
2466 vect_analyze_slp_instance (vec_info
*vinfo
,
2467 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2468 stmt_vec_info stmt_info
, slp_instance_kind kind
,
2469 unsigned max_tree_size
, unsigned *limit
);
2471 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2472 of KIND. Return true if successful. */
2475 vect_build_slp_instance (vec_info
*vinfo
,
2476 slp_instance_kind kind
,
2477 vec
<stmt_vec_info
> &scalar_stmts
,
2478 stmt_vec_info root_stmt_info
,
2479 unsigned max_tree_size
, unsigned *limit
,
2480 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2481 /* ??? We need stmt_info for group splitting. */
2482 stmt_vec_info stmt_info_
)
2484 if (dump_enabled_p ())
2486 dump_printf_loc (MSG_NOTE
, vect_location
,
2487 "Starting SLP discovery for\n");
2488 for (unsigned i
= 0; i
< scalar_stmts
.length (); ++i
)
2489 dump_printf_loc (MSG_NOTE
, vect_location
,
2490 " %G", scalar_stmts
[i
]->stmt
);
2493 /* Build the tree for the SLP instance. */
2494 unsigned int group_size
= scalar_stmts
.length ();
2495 bool *matches
= XALLOCAVEC (bool, group_size
);
2496 poly_uint64 max_nunits
= 1;
2497 unsigned tree_size
= 0;
2499 slp_tree node
= vect_build_slp_tree (vinfo
, scalar_stmts
, group_size
,
2500 &max_nunits
, matches
, limit
,
2501 &tree_size
, bst_map
);
2504 /* Calculate the unrolling factor based on the smallest type. */
2505 poly_uint64 unrolling_factor
2506 = calculate_unrolling_factor (max_nunits
, group_size
);
2508 if (maybe_ne (unrolling_factor
, 1U)
2509 && is_a
<bb_vec_info
> (vinfo
))
2511 unsigned HOST_WIDE_INT const_max_nunits
;
2512 if (!max_nunits
.is_constant (&const_max_nunits
)
2513 || const_max_nunits
> group_size
)
2515 if (dump_enabled_p ())
2516 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2517 "Build SLP failed: store group "
2518 "size not a multiple of the vector size "
2519 "in basic block SLP\n");
2520 vect_free_slp_tree (node
);
2523 /* Fatal mismatch. */
2524 if (dump_enabled_p ())
2525 dump_printf_loc (MSG_NOTE
, vect_location
,
2526 "SLP discovery succeeded but node needs "
2528 memset (matches
, true, group_size
);
2529 matches
[group_size
/ const_max_nunits
* const_max_nunits
] = false;
2530 vect_free_slp_tree (node
);
2534 /* Create a new SLP instance. */
2535 slp_instance new_instance
= XNEW (class _slp_instance
);
2536 SLP_INSTANCE_TREE (new_instance
) = node
;
2537 SLP_INSTANCE_UNROLLING_FACTOR (new_instance
) = unrolling_factor
;
2538 SLP_INSTANCE_LOADS (new_instance
) = vNULL
;
2539 SLP_INSTANCE_ROOT_STMT (new_instance
) = root_stmt_info
;
2540 SLP_INSTANCE_KIND (new_instance
) = kind
;
2541 new_instance
->reduc_phis
= NULL
;
2542 new_instance
->cost_vec
= vNULL
;
2543 new_instance
->subgraph_entries
= vNULL
;
2545 if (dump_enabled_p ())
2546 dump_printf_loc (MSG_NOTE
, vect_location
,
2547 "SLP size %u vs. limit %u.\n",
2548 tree_size
, max_tree_size
);
2550 /* Fixup SLP reduction chains. */
2551 if (kind
== slp_inst_kind_reduc_chain
)
2553 /* If this is a reduction chain with a conversion in front
2554 amend the SLP tree with a node for that. */
2556 = vect_orig_stmt (scalar_stmts
[group_size
- 1])->stmt
;
2557 if (STMT_VINFO_DEF_TYPE (scalar_stmts
[0]) != vect_reduction_def
)
2559 /* Get at the conversion stmt - we know it's the single use
2560 of the last stmt of the reduction chain. */
2561 use_operand_p use_p
;
2562 bool r
= single_imm_use (gimple_assign_lhs (scalar_def
),
2563 &use_p
, &scalar_def
);
2565 stmt_vec_info next_info
= vinfo
->lookup_stmt (scalar_def
);
2566 next_info
= vect_stmt_to_vectorize (next_info
);
2567 scalar_stmts
= vNULL
;
2568 scalar_stmts
.create (group_size
);
2569 for (unsigned i
= 0; i
< group_size
; ++i
)
2570 scalar_stmts
.quick_push (next_info
);
2571 slp_tree conv
= vect_create_new_slp_node (scalar_stmts
, 1);
2572 SLP_TREE_VECTYPE (conv
) = STMT_VINFO_VECTYPE (next_info
);
2573 SLP_TREE_CHILDREN (conv
).quick_push (node
);
2574 SLP_INSTANCE_TREE (new_instance
) = conv
;
2575 /* We also have to fake this conversion stmt as SLP reduction
2576 group so we don't have to mess with too much code
2578 REDUC_GROUP_FIRST_ELEMENT (next_info
) = next_info
;
2579 REDUC_GROUP_NEXT_ELEMENT (next_info
) = NULL
;
2581 /* Fill the backedge child of the PHI SLP node. The
2582 general matching code cannot find it because the
2583 scalar code does not reflect how we vectorize the
2585 use_operand_p use_p
;
2586 imm_use_iterator imm_iter
;
2587 class loop
*loop
= LOOP_VINFO_LOOP (as_a
<loop_vec_info
> (vinfo
));
2588 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
,
2589 gimple_get_lhs (scalar_def
))
2590 /* There are exactly two non-debug uses, the reduction
2591 PHI and the loop-closed PHI node. */
2592 if (!is_gimple_debug (USE_STMT (use_p
))
2593 && gimple_bb (USE_STMT (use_p
)) == loop
->header
)
2595 auto_vec
<stmt_vec_info
, 64> phis (group_size
);
2596 stmt_vec_info phi_info
2597 = vinfo
->lookup_stmt (USE_STMT (use_p
));
2598 for (unsigned i
= 0; i
< group_size
; ++i
)
2599 phis
.quick_push (phi_info
);
2600 slp_tree
*phi_node
= bst_map
->get (phis
);
2601 unsigned dest_idx
= loop_latch_edge (loop
)->dest_idx
;
2602 SLP_TREE_CHILDREN (*phi_node
)[dest_idx
]
2603 = SLP_INSTANCE_TREE (new_instance
);
2604 SLP_INSTANCE_TREE (new_instance
)->refcnt
++;
2608 vinfo
->slp_instances
.safe_push (new_instance
);
2610 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2611 the number of scalar stmts in the root in a few places.
2612 Verify that assumption holds. */
2613 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance
))
2614 .length () == group_size
);
2616 if (dump_enabled_p ())
2618 dump_printf_loc (MSG_NOTE
, vect_location
,
2619 "Final SLP tree for instance %p:\n", new_instance
);
2620 vect_print_slp_graph (MSG_NOTE
, vect_location
,
2621 SLP_INSTANCE_TREE (new_instance
));
2629 /* Failed to SLP. */
2630 /* Free the allocated memory. */
2631 scalar_stmts
.release ();
2634 stmt_vec_info stmt_info
= stmt_info_
;
2635 /* Try to break the group up into pieces. */
2636 if (kind
== slp_inst_kind_store
)
2638 /* ??? We could delay all the actual splitting of store-groups
2639 until after SLP discovery of the original group completed.
2640 Then we can recurse to vect_build_slp_instance directly. */
2641 for (i
= 0; i
< group_size
; i
++)
2645 /* For basic block SLP, try to break the group up into multiples of
2647 if (is_a
<bb_vec_info
> (vinfo
)
2648 && (i
> 1 && i
< group_size
))
2651 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info
)));
2652 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
2653 1 << floor_log2 (i
));
2654 unsigned HOST_WIDE_INT const_nunits
;
2656 && TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
))
2658 /* Split into two groups at the first vector boundary. */
2659 gcc_assert ((const_nunits
& (const_nunits
- 1)) == 0);
2660 unsigned group1_size
= i
& ~(const_nunits
- 1);
2662 if (dump_enabled_p ())
2663 dump_printf_loc (MSG_NOTE
, vect_location
,
2664 "Splitting SLP group at stmt %u\n", i
);
2665 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2667 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2668 kind
, max_tree_size
,
2670 /* Split the rest at the failure point and possibly
2671 re-analyze the remaining matching part if it has
2672 at least two lanes. */
2674 && (i
+ 1 < group_size
2675 || i
- group1_size
> 1))
2677 stmt_vec_info rest2
= rest
;
2678 rest
= vect_split_slp_store_group (rest
, i
- group1_size
);
2679 if (i
- group1_size
> 1)
2680 res
|= vect_analyze_slp_instance (vinfo
, bst_map
, rest2
,
2681 kind
, max_tree_size
,
2684 /* Re-analyze the non-matching tail if it has at least
2686 if (i
+ 1 < group_size
)
2687 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2688 rest
, kind
, max_tree_size
,
2694 /* For loop vectorization split into arbitrary pieces of size > 1. */
2695 if (is_a
<loop_vec_info
> (vinfo
)
2696 && (i
> 1 && i
< group_size
))
2698 unsigned group1_size
= i
;
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_NOTE
, vect_location
,
2702 "Splitting SLP group at stmt %u\n", i
);
2704 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2706 /* Loop vectorization cannot handle gaps in stores, make sure
2707 the split group appears as strided. */
2708 STMT_VINFO_STRIDED_P (rest
) = 1;
2709 DR_GROUP_GAP (rest
) = 0;
2710 STMT_VINFO_STRIDED_P (stmt_info
) = 1;
2711 DR_GROUP_GAP (stmt_info
) = 0;
2713 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2714 kind
, max_tree_size
, limit
);
2715 if (i
+ 1 < group_size
)
2716 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2717 rest
, kind
, max_tree_size
, limit
);
2722 /* Even though the first vector did not all match, we might be able to SLP
2723 (some) of the remainder. FORNOW ignore this possibility. */
2726 /* Failed to SLP. */
2727 if (dump_enabled_p ())
2728 dump_printf_loc (MSG_NOTE
, vect_location
, "SLP discovery failed\n");
2733 /* Analyze an SLP instance starting from a group of grouped stores. Call
2734 vect_build_slp_tree to build a tree of packed stmts if possible.
2735 Return FALSE if it's impossible to SLP any stmt in the loop. */
2738 vect_analyze_slp_instance (vec_info
*vinfo
,
2739 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2740 stmt_vec_info stmt_info
,
2741 slp_instance_kind kind
,
2742 unsigned max_tree_size
, unsigned *limit
)
2745 vec
<stmt_vec_info
> scalar_stmts
;
2747 if (is_a
<bb_vec_info
> (vinfo
))
2748 vect_location
= stmt_info
->stmt
;
2750 stmt_vec_info next_info
= stmt_info
;
2751 if (kind
== slp_inst_kind_store
)
2753 /* Collect the stores and store them in scalar_stmts. */
2754 scalar_stmts
.create (DR_GROUP_SIZE (stmt_info
));
2757 scalar_stmts
.quick_push (vect_stmt_to_vectorize (next_info
));
2758 next_info
= DR_GROUP_NEXT_ELEMENT (next_info
);
2761 else if (kind
== slp_inst_kind_reduc_chain
)
2763 /* Collect the reduction stmts and store them in scalar_stmts. */
2764 scalar_stmts
.create (REDUC_GROUP_SIZE (stmt_info
));
2767 scalar_stmts
.quick_push (vect_stmt_to_vectorize (next_info
));
2768 next_info
= REDUC_GROUP_NEXT_ELEMENT (next_info
);
2770 /* Mark the first element of the reduction chain as reduction to properly
2771 transform the node. In the reduction analysis phase only the last
2772 element of the chain is marked as reduction. */
2773 STMT_VINFO_DEF_TYPE (stmt_info
)
2774 = STMT_VINFO_DEF_TYPE (scalar_stmts
.last ());
2775 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))
2776 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts
.last ()));
2778 else if (kind
== slp_inst_kind_ctor
)
2780 tree rhs
= gimple_assign_rhs1 (stmt_info
->stmt
);
2782 scalar_stmts
.create (CONSTRUCTOR_NELTS (rhs
));
2783 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), i
, val
)
2785 stmt_vec_info def_info
= vinfo
->lookup_def (val
);
2786 def_info
= vect_stmt_to_vectorize (def_info
);
2787 scalar_stmts
.quick_push (def_info
);
2789 if (dump_enabled_p ())
2790 dump_printf_loc (MSG_NOTE
, vect_location
,
2791 "Analyzing vectorizable constructor: %G\n",
2794 else if (kind
== slp_inst_kind_reduc_group
)
2796 /* Collect reduction statements. */
2797 vec
<stmt_vec_info
> reductions
= as_a
<loop_vec_info
> (vinfo
)->reductions
;
2798 scalar_stmts
.create (reductions
.length ());
2799 for (i
= 0; reductions
.iterate (i
, &next_info
); i
++)
2800 if (STMT_VINFO_RELEVANT_P (next_info
)
2801 || STMT_VINFO_LIVE_P (next_info
))
2802 scalar_stmts
.quick_push (next_info
);
2803 /* If less than two were relevant/live there's nothing to SLP. */
2804 if (scalar_stmts
.length () < 2)
2810 /* Build the tree for the SLP instance. */
2811 bool res
= vect_build_slp_instance (vinfo
, kind
, scalar_stmts
,
2812 kind
== slp_inst_kind_ctor
2814 max_tree_size
, limit
, bst_map
,
2815 kind
== slp_inst_kind_store
2816 ? stmt_info
: NULL
);
2818 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2819 where we should do store group splitting. */
2824 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
2825 trees of packed scalar stmts if SLP is possible. */
2828 vect_analyze_slp (vec_info
*vinfo
, unsigned max_tree_size
)
2831 stmt_vec_info first_element
;
2832 slp_instance instance
;
2834 DUMP_VECT_SCOPE ("vect_analyze_slp");
2836 unsigned limit
= max_tree_size
;
2838 scalar_stmts_to_slp_tree_map_t
*bst_map
2839 = new scalar_stmts_to_slp_tree_map_t ();
2841 /* Find SLP sequences starting from groups of grouped stores. */
2842 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
2843 vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
2844 STMT_VINFO_GROUPED_ACCESS (first_element
)
2845 ? slp_inst_kind_store
: slp_inst_kind_ctor
,
2846 max_tree_size
, &limit
);
2848 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
2850 for (unsigned i
= 0; i
< bb_vinfo
->roots
.length (); ++i
)
2852 vect_location
= bb_vinfo
->roots
[i
].root
->stmt
;
2853 if (vect_build_slp_instance (bb_vinfo
, bb_vinfo
->roots
[i
].kind
,
2854 bb_vinfo
->roots
[i
].stmts
,
2855 bb_vinfo
->roots
[i
].root
,
2856 max_tree_size
, &limit
, bst_map
, NULL
))
2857 bb_vinfo
->roots
[i
].stmts
= vNULL
;
2861 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
2863 /* Find SLP sequences starting from reduction chains. */
2864 FOR_EACH_VEC_ELT (loop_vinfo
->reduction_chains
, i
, first_element
)
2865 if (! STMT_VINFO_RELEVANT_P (first_element
)
2866 && ! STMT_VINFO_LIVE_P (first_element
))
2868 else if (! vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
2869 slp_inst_kind_reduc_chain
,
2870 max_tree_size
, &limit
))
2872 /* Dissolve reduction chain group. */
2873 stmt_vec_info vinfo
= first_element
;
2874 stmt_vec_info last
= NULL
;
2877 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
2878 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2879 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2883 STMT_VINFO_DEF_TYPE (first_element
) = vect_internal_def
;
2884 /* It can be still vectorized as part of an SLP reduction. */
2885 loop_vinfo
->reductions
.safe_push (last
);
2888 /* Find SLP sequences starting from groups of reductions. */
2889 if (loop_vinfo
->reductions
.length () > 1)
2890 vect_analyze_slp_instance (vinfo
, bst_map
, loop_vinfo
->reductions
[0],
2891 slp_inst_kind_reduc_group
, max_tree_size
,
2895 hash_set
<slp_tree
> visited_patterns
;
2896 slp_tree_to_load_perm_map_t perm_cache
;
2898 /* See if any patterns can be found in the SLP tree. */
2899 bool pattern_found
= false;
2900 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo
), i
, instance
)
2901 pattern_found
|= vect_match_slp_patterns (instance
, vinfo
,
2902 &visited_patterns
, &perm_cache
);
2904 /* If any were found optimize permutations of loads. */
2907 hash_map
<slp_tree
, slp_tree
> load_map
;
2908 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo
), i
, instance
)
2910 slp_tree root
= SLP_INSTANCE_TREE (instance
);
2911 optimize_load_redistribution (bst_map
, vinfo
, SLP_TREE_LANES (root
),
2918 /* The map keeps a reference on SLP nodes built, release that. */
2919 for (scalar_stmts_to_slp_tree_map_t::iterator it
= bst_map
->begin ();
2920 it
!= bst_map
->end (); ++it
)
2922 vect_free_slp_tree ((*it
).second
);
2925 if (pattern_found
&& dump_enabled_p ())
2927 dump_printf_loc (MSG_NOTE
, vect_location
,
2928 "Pattern matched SLP tree\n");
2929 hash_set
<slp_tree
> visited
;
2930 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo
), i
, instance
)
2931 vect_print_slp_graph (MSG_NOTE
, vect_location
,
2932 SLP_INSTANCE_TREE (instance
), visited
);
2935 return opt_result::success ();
2938 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2941 vect_slp_build_vertices (hash_set
<slp_tree
> &visited
, slp_tree node
,
2942 vec
<slp_tree
> &vertices
, vec
<int> &leafs
)
2947 if (visited
.add (node
))
2950 node
->vertex
= vertices
.length ();
2951 vertices
.safe_push (node
);
2954 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2958 vect_slp_build_vertices (visited
, child
, vertices
, leafs
);
2961 leafs
.safe_push (node
->vertex
);
2964 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2967 vect_slp_build_vertices (vec_info
*info
, vec
<slp_tree
> &vertices
,
2970 hash_set
<slp_tree
> visited
;
2972 slp_instance instance
;
2973 FOR_EACH_VEC_ELT (info
->slp_instances
, i
, instance
)
2975 unsigned n_v
= vertices
.length ();
2976 unsigned n_l
= leafs
.length ();
2977 vect_slp_build_vertices (visited
, SLP_INSTANCE_TREE (instance
), vertices
,
2979 /* If we added vertices but no entries to the reverse graph we've
2980 added a cycle that is not backwards-reachable. Push the entry
2981 to mimic as leaf then. */
2982 if (vertices
.length () > n_v
2983 && leafs
.length () == n_l
)
2984 leafs
.safe_push (SLP_INSTANCE_TREE (instance
)->vertex
);
2988 /* Apply (reverse) bijectite PERM to VEC. */
2992 vect_slp_permute (vec
<unsigned> perm
,
2993 vec
<T
> &vec
, bool reverse
)
2995 auto_vec
<T
, 64> saved
;
2996 saved
.create (vec
.length ());
2997 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2998 saved
.quick_push (vec
[i
]);
3002 for (unsigned i
= 0; i
< vec
.length (); ++i
)
3003 vec
[perm
[i
]] = saved
[i
];
3004 for (unsigned i
= 0; i
< vec
.length (); ++i
)
3005 gcc_assert (vec
[perm
[i
]] == saved
[i
]);
3009 for (unsigned i
= 0; i
< vec
.length (); ++i
)
3010 vec
[i
] = saved
[perm
[i
]];
3011 for (unsigned i
= 0; i
< vec
.length (); ++i
)
3012 gcc_assert (vec
[i
] == saved
[perm
[i
]]);
3016 /* Return whether permutations PERM_A and PERM_B as recorded in the
3017 PERMS vector are equal. */
3020 vect_slp_perms_eq (const vec
<vec
<unsigned> > &perms
,
3021 int perm_a
, int perm_b
)
3023 return (perm_a
== perm_b
3024 || (perms
[perm_a
].length () == perms
[perm_b
].length ()
3025 && memcmp (&perms
[perm_a
][0], &perms
[perm_b
][0],
3026 sizeof (unsigned) * perms
[perm_a
].length ()) == 0));
3029 /* Optimize the SLP graph of VINFO. */
3032 vect_optimize_slp (vec_info
*vinfo
)
3034 if (vinfo
->slp_instances
.is_empty ())
3039 auto_vec
<slp_tree
> vertices
;
3040 auto_vec
<int> leafs
;
3041 vect_slp_build_vertices (vinfo
, vertices
, leafs
);
3043 struct graph
*slpg
= new_graph (vertices
.length ());
3044 FOR_EACH_VEC_ELT (vertices
, i
, node
)
3048 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
3050 add_edge (slpg
, i
, child
->vertex
);
3053 /* Compute (reverse) postorder on the inverted graph. */
3055 graphds_dfs (slpg
, &leafs
[0], leafs
.length (), &ipo
, false, NULL
, NULL
);
3057 auto_sbitmap
n_visited (vertices
.length ());
3058 auto_sbitmap
n_materialize (vertices
.length ());
3059 auto_vec
<int> n_perm (vertices
.length ());
3060 auto_vec
<vec
<unsigned> > perms
;
3062 bitmap_clear (n_visited
);
3063 bitmap_clear (n_materialize
);
3064 n_perm
.quick_grow_cleared (vertices
.length ());
3065 perms
.safe_push (vNULL
); /* zero is no permute */
3067 /* Produce initial permutations. */
3068 for (i
= 0; i
< leafs
.length (); ++i
)
3071 slp_tree node
= vertices
[idx
];
3073 /* Handle externals and constants optimistically throughout the
3075 if (SLP_TREE_DEF_TYPE (node
) == vect_external_def
3076 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
)
3079 /* Leafs do not change across iterations. Note leafs also double
3080 as entries to the reverse graph. */
3081 if (!slpg
->vertices
[idx
].succ
)
3082 bitmap_set_bit (n_visited
, idx
);
3083 /* Loads are the only thing generating permutes. */
3084 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3087 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3088 node unpermuted, record this permute. */
3089 stmt_vec_info dr_stmt
= SLP_TREE_REPRESENTATIVE (node
);
3090 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt
))
3092 dr_stmt
= DR_GROUP_FIRST_ELEMENT (dr_stmt
);
3093 unsigned imin
= DR_GROUP_SIZE (dr_stmt
) + 1, imax
= 0;
3094 bool any_permute
= false;
3095 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3097 unsigned idx
= SLP_TREE_LOAD_PERMUTATION (node
)[j
];
3098 imin
= MIN (imin
, idx
);
3099 imax
= MAX (imax
, idx
);
3100 if (idx
- SLP_TREE_LOAD_PERMUTATION (node
)[0] != j
)
3103 /* If there's no permute no need to split one out. */
3106 /* If the span doesn't match we'd disrupt VF computation, avoid
3108 if (imax
- imin
+ 1 != SLP_TREE_LANES (node
))
3111 /* For now only handle true permutes, like
3112 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3113 when permuting constants and invariants keeping the permute
3115 auto_sbitmap
load_index (SLP_TREE_LANES (node
));
3116 bitmap_clear (load_index
);
3117 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3118 bitmap_set_bit (load_index
, SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
);
3120 for (j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3121 if (!bitmap_bit_p (load_index
, j
))
3123 if (j
!= SLP_TREE_LANES (node
))
3126 vec
<unsigned> perm
= vNULL
;
3127 perm
.safe_grow (SLP_TREE_LANES (node
), true);
3128 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3129 perm
[j
] = SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
;
3130 perms
.safe_push (perm
);
3131 n_perm
[idx
] = perms
.length () - 1;
3134 /* Propagate permutes along the graph and compute materialization points. */
3136 unsigned iteration
= 0;
3142 for (i
= vertices
.length (); i
> 0 ; --i
)
3145 slp_tree node
= vertices
[idx
];
3146 /* For leafs there's nothing to do - we've seeded permutes
3148 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
3151 bitmap_set_bit (n_visited
, idx
);
3153 /* We cannot move a permute across a store. */
3154 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))
3156 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))))
3160 for (graph_edge
*succ
= slpg
->vertices
[idx
].succ
;
3161 succ
; succ
= succ
->succ_next
)
3163 int succ_idx
= succ
->dest
;
3164 /* Handle unvisited nodes optimistically. */
3165 /* ??? But for constants once we want to handle non-bijective
3166 permutes we have to verify the permute, when unifying lanes,
3167 will not unify different constants. For example see
3168 gcc.dg/vect/bb-slp-14.c for a case that would break. */
3169 if (!bitmap_bit_p (n_visited
, succ_idx
))
3171 int succ_perm
= n_perm
[succ_idx
];
3172 /* Once we materialize succs permutation its output lanes
3173 appear unpermuted to us. */
3174 if (bitmap_bit_p (n_materialize
, succ_idx
))
3178 else if (succ_perm
== 0)
3183 else if (!vect_slp_perms_eq (perms
, perm
, succ_perm
))
3191 /* Pick up pre-computed leaf values. */
3193 else if (!vect_slp_perms_eq (perms
, perm
, n_perm
[idx
]))
3196 /* Make sure we eventually converge. */
3197 gcc_checking_assert (perm
== 0);
3200 bitmap_clear_bit (n_materialize
, idx
);
3207 /* Elide pruning at materialization points in the first
3208 iteration so every node was visited once at least. */
3212 /* Decide on permute materialization. Look whether there's
3213 a use (pred) edge that is permuted differently than us.
3214 In that case mark ourselves so the permutation is applied.
3215 For VEC_PERM_EXPRs the permutation doesn't carry along
3216 from children to parents so force materialization at the
3217 point of the VEC_PERM_EXPR. In principle VEC_PERM_EXPRs
3218 are a source of an arbitrary permutation again, similar
3219 to constants/externals - that's something we do not yet
3220 optimally handle. */
3221 bool all_preds_permuted
= (SLP_TREE_CODE (node
) != VEC_PERM_EXPR
3222 && slpg
->vertices
[idx
].pred
!= NULL
);
3223 if (all_preds_permuted
)
3224 for (graph_edge
*pred
= slpg
->vertices
[idx
].pred
;
3225 pred
; pred
= pred
->pred_next
)
3227 gcc_checking_assert (bitmap_bit_p (n_visited
, pred
->src
));
3228 int pred_perm
= n_perm
[pred
->src
];
3229 if (!vect_slp_perms_eq (perms
, perm
, pred_perm
))
3231 all_preds_permuted
= false;
3235 if (!all_preds_permuted
)
3237 if (!bitmap_bit_p (n_materialize
, idx
))
3239 bitmap_set_bit (n_materialize
, idx
);
3243 while (changed
|| iteration
== 1);
3246 for (i
= 0; i
< vertices
.length (); ++i
)
3248 int perm
= n_perm
[i
];
3252 slp_tree node
= vertices
[i
];
3254 /* First permute invariant/external original successors. */
3257 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
3259 if (!child
|| SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3262 /* If the vector is uniform there's nothing to do. */
3263 if (vect_slp_tree_uniform_p (child
))
3266 /* We can end up sharing some externals via two_operator
3267 handling. Be prepared to unshare those. */
3268 if (child
->refcnt
!= 1)
3270 gcc_assert (slpg
->vertices
[child
->vertex
].pred
->pred_next
);
3271 SLP_TREE_CHILDREN (node
)[j
] = child
3272 = vect_create_new_slp_node
3273 (SLP_TREE_SCALAR_OPS (child
).copy ());
3275 vect_slp_permute (perms
[perm
],
3276 SLP_TREE_SCALAR_OPS (child
), true);
3279 if (bitmap_bit_p (n_materialize
, i
))
3281 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3282 /* For loads simply drop the permutation, the load permutation
3283 already performs the desired permutation. */
3285 else if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
3287 /* If the node is already a permute node we can apply
3288 the permutation to the lane selection, effectively
3289 materializing it on the incoming vectors. */
3290 if (dump_enabled_p ())
3291 dump_printf_loc (MSG_NOTE
, vect_location
,
3292 "simplifying permute node %p\n",
3295 for (unsigned k
= 0;
3296 k
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++k
)
3297 SLP_TREE_LANE_PERMUTATION (node
)[k
].second
3298 = perms
[perm
][SLP_TREE_LANE_PERMUTATION (node
)[k
].second
];
3302 if (dump_enabled_p ())
3303 dump_printf_loc (MSG_NOTE
, vect_location
,
3304 "inserting permute node in place of %p\n",
3307 /* Make a copy of NODE and in-place change it to a
3308 VEC_PERM node to permute the lanes of the copy. */
3309 slp_tree copy
= new _slp_tree
;
3310 SLP_TREE_CHILDREN (copy
) = SLP_TREE_CHILDREN (node
);
3311 SLP_TREE_CHILDREN (node
) = vNULL
;
3312 SLP_TREE_SCALAR_STMTS (copy
)
3313 = SLP_TREE_SCALAR_STMTS (node
).copy ();
3314 vect_slp_permute (perms
[perm
],
3315 SLP_TREE_SCALAR_STMTS (copy
), true);
3316 gcc_assert (!SLP_TREE_SCALAR_OPS (node
).exists ());
3317 SLP_TREE_REPRESENTATIVE (copy
) = SLP_TREE_REPRESENTATIVE (node
);
3318 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node
).exists ());
3319 SLP_TREE_LANE_PERMUTATION (copy
)
3320 = SLP_TREE_LANE_PERMUTATION (node
);
3321 SLP_TREE_LANE_PERMUTATION (node
) = vNULL
;
3322 SLP_TREE_VECTYPE (copy
) = SLP_TREE_VECTYPE (node
);
3324 copy
->max_nunits
= node
->max_nunits
;
3325 SLP_TREE_DEF_TYPE (copy
) = SLP_TREE_DEF_TYPE (node
);
3326 SLP_TREE_LANES (copy
) = SLP_TREE_LANES (node
);
3327 SLP_TREE_CODE (copy
) = SLP_TREE_CODE (node
);
3329 /* Now turn NODE into a VEC_PERM. */
3330 SLP_TREE_CHILDREN (node
).safe_push (copy
);
3331 SLP_TREE_LANE_PERMUTATION (node
).create (SLP_TREE_LANES (node
));
3332 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3333 SLP_TREE_LANE_PERMUTATION (node
)
3334 .quick_push (std::make_pair (0, perms
[perm
][j
]));
3335 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
3340 /* Apply the reverse permutation to our stmts. */
3341 vect_slp_permute (perms
[perm
],
3342 SLP_TREE_SCALAR_STMTS (node
), true);
3343 /* And to the load permutation, which we can simply
3344 make regular by design. */
3345 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3347 /* ??? When we handle non-bijective permutes the idea
3348 is that we can force the load-permutation to be
3349 { min, min + 1, min + 2, ... max }. But then the
3350 scalar defs might no longer match the lane content
3351 which means wrong-code with live lane vectorization.
3352 So we possibly have to have NULL entries for those. */
3353 vect_slp_permute (perms
[perm
],
3354 SLP_TREE_LOAD_PERMUTATION (node
), true);
3359 /* Free the perms vector used for propagation. */
3360 while (!perms
.is_empty ())
3361 perms
.pop ().release ();
3365 /* Now elide load permutations that are not necessary. */
3366 for (i
= 0; i
< leafs
.length (); ++i
)
3368 node
= vertices
[leafs
[i
]];
3369 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3372 /* In basic block vectorization we allow any subchain of an interleaving
3374 FORNOW: not in loop SLP because of realignment complications. */
3375 if (is_a
<bb_vec_info
> (vinfo
))
3377 bool subchain_p
= true;
3378 stmt_vec_info next_load_info
= NULL
;
3379 stmt_vec_info load_info
;
3381 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
3384 && (next_load_info
!= load_info
3385 || DR_GROUP_GAP (load_info
) != 1))
3390 next_load_info
= DR_GROUP_NEXT_ELEMENT (load_info
);
3394 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3400 stmt_vec_info load_info
;
3401 bool this_load_permuted
= false;
3403 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
3404 if (SLP_TREE_LOAD_PERMUTATION (node
)[j
] != j
)
3406 this_load_permuted
= true;
3409 stmt_vec_info first_stmt_info
3410 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
3411 if (!this_load_permuted
3412 /* The load requires permutation when unrolling exposes
3413 a gap either because the group is larger than the SLP
3414 group-size or because there is a gap between the groups. */
3415 && (known_eq (LOOP_VINFO_VECT_FACTOR
3416 (as_a
<loop_vec_info
> (vinfo
)), 1U)
3417 || ((SLP_TREE_LANES (node
) == DR_GROUP_SIZE (first_stmt_info
))
3418 && DR_GROUP_GAP (first_stmt_info
) == 0)))
3420 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3427 /* Gather loads reachable from the individual SLP graph entries. */
3430 vect_gather_slp_loads (vec_info
*vinfo
)
3433 slp_instance instance
;
3434 FOR_EACH_VEC_ELT (vinfo
->slp_instances
, i
, instance
)
3436 hash_set
<slp_tree
> visited
;
3437 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance
),
3438 SLP_INSTANCE_TREE (instance
), visited
);
3443 /* For each possible SLP instance decide whether to SLP it and calculate overall
3444 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3445 least one instance. */
3448 vect_make_slp_decision (loop_vec_info loop_vinfo
)
3451 poly_uint64 unrolling_factor
= 1;
3452 vec
<slp_instance
> slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
3453 slp_instance instance
;
3454 int decided_to_slp
= 0;
3456 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3458 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
3460 /* FORNOW: SLP if you can. */
3461 /* All unroll factors have the form:
3463 GET_MODE_SIZE (vinfo->vector_mode) * X
3465 for some rational X, so they must have a common multiple. */
3467 = force_common_multiple (unrolling_factor
,
3468 SLP_INSTANCE_UNROLLING_FACTOR (instance
));
3470 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3471 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3472 loop-based vectorization. Such stmts will be marked as HYBRID. */
3473 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
3477 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
) = unrolling_factor
;
3479 if (decided_to_slp
&& dump_enabled_p ())
3481 dump_printf_loc (MSG_NOTE
, vect_location
,
3482 "Decided to SLP %d instances. Unrolling factor ",
3484 dump_dec (MSG_NOTE
, unrolling_factor
);
3485 dump_printf (MSG_NOTE
, "\n");
3488 return (decided_to_slp
> 0);
3491 /* Private data for vect_detect_hybrid_slp. */
3494 loop_vec_info loop_vinfo
;
3495 vec
<stmt_vec_info
> *worklist
;
3498 /* Walker for walk_gimple_op. */
3501 vect_detect_hybrid_slp (tree
*tp
, int *, void *data
)
3503 walk_stmt_info
*wi
= (walk_stmt_info
*)data
;
3504 vdhs_data
*dat
= (vdhs_data
*)wi
->info
;
3509 stmt_vec_info def_stmt_info
= dat
->loop_vinfo
->lookup_def (*tp
);
3512 def_stmt_info
= vect_stmt_to_vectorize (def_stmt_info
);
3513 if (PURE_SLP_STMT (def_stmt_info
))
3515 if (dump_enabled_p ())
3516 dump_printf_loc (MSG_NOTE
, vect_location
, "marking hybrid: %G",
3517 def_stmt_info
->stmt
);
3518 STMT_SLP_TYPE (def_stmt_info
) = hybrid
;
3519 dat
->worklist
->safe_push (def_stmt_info
);
3525 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
3526 if so, otherwise pushing it to WORKLIST. */
3529 maybe_push_to_hybrid_worklist (vec_info
*vinfo
,
3530 vec
<stmt_vec_info
> &worklist
,
3531 stmt_vec_info stmt_info
)
3533 if (dump_enabled_p ())
3534 dump_printf_loc (MSG_NOTE
, vect_location
,
3535 "Processing hybrid candidate : %G", stmt_info
->stmt
);
3536 stmt_vec_info orig_info
= vect_orig_stmt (stmt_info
);
3537 imm_use_iterator iter2
;
3539 use_operand_p use_p
;
3540 def_operand_p def_p
;
3541 bool any_def
= false;
3542 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_info
->stmt
, iter1
, SSA_OP_DEF
)
3545 FOR_EACH_IMM_USE_FAST (use_p
, iter2
, DEF_FROM_PTR (def_p
))
3547 if (is_gimple_debug (USE_STMT (use_p
)))
3549 stmt_vec_info use_info
= vinfo
->lookup_stmt (USE_STMT (use_p
));
3550 /* An out-of loop use means this is a loop_vect sink. */
3553 if (dump_enabled_p ())
3554 dump_printf_loc (MSG_NOTE
, vect_location
,
3555 "Found loop_vect sink: %G", stmt_info
->stmt
);
3556 worklist
.safe_push (stmt_info
);
3559 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info
)))
3561 if (dump_enabled_p ())
3562 dump_printf_loc (MSG_NOTE
, vect_location
,
3563 "Found loop_vect use: %G", use_info
->stmt
);
3564 worklist
.safe_push (stmt_info
);
3569 /* No def means this is a loo_vect sink. */
3572 if (dump_enabled_p ())
3573 dump_printf_loc (MSG_NOTE
, vect_location
,
3574 "Found loop_vect sink: %G", stmt_info
->stmt
);
3575 worklist
.safe_push (stmt_info
);
3578 if (dump_enabled_p ())
3579 dump_printf_loc (MSG_NOTE
, vect_location
,
3580 "Marked SLP consumed stmt pure: %G", stmt_info
->stmt
);
3581 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
3584 /* Find stmts that must be both vectorized and SLPed. */
3587 vect_detect_hybrid_slp (loop_vec_info loop_vinfo
)
3589 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3591 /* All stmts participating in SLP are marked pure_slp, all other
3592 stmts are loop_vect.
3593 First collect all loop_vect stmts into a worklist.
3594 SLP patterns cause not all original scalar stmts to appear in
3595 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
3596 Rectify this here and do a backward walk over the IL only considering
3597 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
3598 mark them as pure_slp. */
3599 auto_vec
<stmt_vec_info
> worklist
;
3600 for (int i
= LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
- 1; i
>= 0; --i
)
3602 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
3603 for (gphi_iterator gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
);
3606 gphi
*phi
= gsi
.phi ();
3607 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (phi
);
3608 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
3609 maybe_push_to_hybrid_worklist (loop_vinfo
,
3610 worklist
, stmt_info
);
3612 for (gimple_stmt_iterator gsi
= gsi_last_bb (bb
); !gsi_end_p (gsi
);
3615 gimple
*stmt
= gsi_stmt (gsi
);
3616 if (is_gimple_debug (stmt
))
3618 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
3619 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
3621 for (gimple_stmt_iterator gsi2
3622 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
));
3623 !gsi_end_p (gsi2
); gsi_next (&gsi2
))
3625 stmt_vec_info patt_info
3626 = loop_vinfo
->lookup_stmt (gsi_stmt (gsi2
));
3627 if (!STMT_SLP_TYPE (patt_info
)
3628 && STMT_VINFO_RELEVANT (patt_info
))
3629 maybe_push_to_hybrid_worklist (loop_vinfo
,
3630 worklist
, patt_info
);
3632 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
3634 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
3635 maybe_push_to_hybrid_worklist (loop_vinfo
,
3636 worklist
, stmt_info
);
3640 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3641 mark any SLP vectorized stmt as hybrid.
3642 ??? We're visiting def stmts N times (once for each non-SLP and
3643 once for each hybrid-SLP use). */
3646 dat
.worklist
= &worklist
;
3647 dat
.loop_vinfo
= loop_vinfo
;
3648 memset (&wi
, 0, sizeof (wi
));
3649 wi
.info
= (void *)&dat
;
3650 while (!worklist
.is_empty ())
3652 stmt_vec_info stmt_info
= worklist
.pop ();
3653 /* Since SSA operands are not set up for pattern stmts we need
3654 to use walk_gimple_op. */
3656 walk_gimple_op (stmt_info
->stmt
, vect_detect_hybrid_slp
, &wi
);
3661 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3663 _bb_vec_info::_bb_vec_info (vec
<basic_block
> _bbs
, vec_info_shared
*shared
)
3664 : vec_info (vec_info::bb
, init_cost (NULL
), shared
), bbs (_bbs
), roots (vNULL
)
3666 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
3669 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
3672 gphi
*phi
= si
.phi ();
3673 gimple_set_uid (phi
, 0);
3676 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3677 !gsi_end_p (gsi
); gsi_next (&gsi
))
3679 gimple
*stmt
= gsi_stmt (gsi
);
3680 gimple_set_uid (stmt
, 0);
3681 if (is_gimple_debug (stmt
))
3689 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3690 stmts in the basic block. */
3692 _bb_vec_info::~_bb_vec_info ()
3694 /* Reset region marker. */
3695 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
3698 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
3701 gphi
*phi
= si
.phi ();
3702 gimple_set_uid (phi
, -1);
3704 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3705 !gsi_end_p (gsi
); gsi_next (&gsi
))
3707 gimple
*stmt
= gsi_stmt (gsi
);
3708 gimple_set_uid (stmt
, -1);
3712 for (unsigned i
= 0; i
< roots
.length (); ++i
)
3713 roots
[i
].stmts
.release ();
3717 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3718 given then that child nodes have already been processed, and that
3719 their def types currently match their SLP node's def type. */
3722 vect_slp_analyze_node_operations_1 (vec_info
*vinfo
, slp_tree node
,
3723 slp_instance node_instance
,
3724 stmt_vector_for_cost
*cost_vec
)
3726 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
3727 gcc_assert (STMT_SLP_TYPE (stmt_info
) != loop_vect
);
3729 /* Calculate the number of vector statements to be created for the
3730 scalar stmts in this node. For SLP reductions it is equal to the
3731 number of vector statements in the children (which has already been
3732 calculated by the recursive call). Otherwise it is the number of
3733 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3734 VF divided by the number of elements in a vector. */
3735 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
3736 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
3738 for (unsigned i
= 0; i
< SLP_TREE_CHILDREN (node
).length (); ++i
)
3739 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node
)[i
]) == vect_internal_def
)
3741 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
3742 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node
)[i
]);
3749 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3750 vf
= loop_vinfo
->vectorization_factor
;
3753 unsigned int group_size
= SLP_TREE_LANES (node
);
3754 tree vectype
= SLP_TREE_VECTYPE (node
);
3755 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
3756 = vect_get_num_vectors (vf
* group_size
, vectype
);
3759 /* Handle purely internal nodes. */
3760 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
3761 return vectorizable_slp_permutation (vinfo
, NULL
, node
, cost_vec
);
3763 if (is_a
<bb_vec_info
> (vinfo
)
3764 && !vect_update_shared_vectype (stmt_info
, SLP_TREE_VECTYPE (node
)))
3766 if (dump_enabled_p ())
3767 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3768 "desired vector type conflicts with earlier one "
3769 "for %G", stmt_info
->stmt
);
3774 return vect_analyze_stmt (vinfo
, stmt_info
, &dummy
,
3775 node
, node_instance
, cost_vec
);
3778 /* Try to build NODE from scalars, returning true on success.
3779 NODE_INSTANCE is the SLP instance that contains NODE. */
3782 vect_slp_convert_to_external (vec_info
*vinfo
, slp_tree node
,
3783 slp_instance node_instance
)
3785 stmt_vec_info stmt_info
;
3788 if (!is_a
<bb_vec_info
> (vinfo
)
3789 || node
== SLP_INSTANCE_TREE (node_instance
)
3790 || !SLP_TREE_SCALAR_STMTS (node
).exists ()
3791 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node
)))
3794 if (dump_enabled_p ())
3795 dump_printf_loc (MSG_NOTE
, vect_location
,
3796 "Building vector operands of %p from scalars instead\n", node
);
3798 /* Don't remove and free the child nodes here, since they could be
3799 referenced by other structures. The analysis and scheduling phases
3800 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3801 unsigned int group_size
= SLP_TREE_LANES (node
);
3802 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
3803 SLP_TREE_SCALAR_OPS (node
).safe_grow (group_size
, true);
3804 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3805 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3807 tree lhs
= gimple_get_lhs (vect_orig_stmt (stmt_info
)->stmt
);
3808 SLP_TREE_SCALAR_OPS (node
)[i
] = lhs
;
3813 /* Compute the prologue cost for invariant or constant operands represented
3817 vect_prologue_cost_for_slp (slp_tree node
,
3818 stmt_vector_for_cost
*cost_vec
)
3820 /* There's a special case of an existing vector, that costs nothing. */
3821 if (SLP_TREE_SCALAR_OPS (node
).length () == 0
3822 && !SLP_TREE_VEC_DEFS (node
).is_empty ())
3824 /* Without looking at the actual initializer a vector of
3825 constants can be implemented as load from the constant pool.
3826 When all elements are the same we can use a splat. */
3827 tree vectype
= SLP_TREE_VECTYPE (node
);
3828 unsigned group_size
= SLP_TREE_SCALAR_OPS (node
).length ();
3829 unsigned num_vects_to_check
;
3830 unsigned HOST_WIDE_INT const_nunits
;
3831 unsigned nelt_limit
;
3832 if (TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
)
3833 && ! multiple_p (const_nunits
, group_size
))
3835 num_vects_to_check
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
3836 nelt_limit
= const_nunits
;
3840 /* If either the vector has variable length or the vectors
3841 are composed of repeated whole groups we only need to
3842 cost construction once. All vectors will be the same. */
3843 num_vects_to_check
= 1;
3844 nelt_limit
= group_size
;
3846 tree elt
= NULL_TREE
;
3848 for (unsigned j
= 0; j
< num_vects_to_check
* nelt_limit
; ++j
)
3850 unsigned si
= j
% group_size
;
3852 elt
= SLP_TREE_SCALAR_OPS (node
)[si
];
3853 /* ??? We're just tracking whether all operands of a single
3854 vector initializer are the same, ideally we'd check if
3855 we emitted the same one already. */
3856 else if (elt
!= SLP_TREE_SCALAR_OPS (node
)[si
])
3859 if (nelt
== nelt_limit
)
3861 record_stmt_cost (cost_vec
, 1,
3862 SLP_TREE_DEF_TYPE (node
) == vect_external_def
3863 ? (elt
? scalar_to_vec
: vec_construct
)
3865 NULL
, vectype
, 0, vect_prologue
);
3871 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3872 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3874 Return true if the operations are supported. */
3877 vect_slp_analyze_node_operations (vec_info
*vinfo
, slp_tree node
,
3878 slp_instance node_instance
,
3879 hash_set
<slp_tree
> &visited_set
,
3880 vec
<slp_tree
> &visited_vec
,
3881 stmt_vector_for_cost
*cost_vec
)
3886 /* Assume we can code-generate all invariants. */
3888 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
3889 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
3892 if (SLP_TREE_DEF_TYPE (node
) == vect_uninitialized_def
)
3894 if (dump_enabled_p ())
3895 dump_printf_loc (MSG_NOTE
, vect_location
,
3896 "Failed cyclic SLP reference in %p", node
);
3899 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
);
3901 /* If we already analyzed the exact same set of scalar stmts we're done.
3902 We share the generated vector stmts for those. */
3903 if (visited_set
.add (node
))
3905 visited_vec
.safe_push (node
);
3908 unsigned visited_rec_start
= visited_vec
.length ();
3909 unsigned cost_vec_rec_start
= cost_vec
->length ();
3910 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3912 res
= vect_slp_analyze_node_operations (vinfo
, child
, node_instance
,
3913 visited_set
, visited_vec
,
3920 res
= vect_slp_analyze_node_operations_1 (vinfo
, node
, node_instance
,
3922 /* If analysis failed we have to pop all recursive visited nodes
3926 while (visited_vec
.length () >= visited_rec_start
)
3927 visited_set
.remove (visited_vec
.pop ());
3928 cost_vec
->truncate (cost_vec_rec_start
);
3931 /* When the node can be vectorized cost invariant nodes it references.
3932 This is not done in DFS order to allow the refering node
3933 vectorizable_* calls to nail down the invariant nodes vector type
3934 and possibly unshare it if it needs a different vector type than
3937 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
3939 && (SLP_TREE_DEF_TYPE (child
) == vect_constant_def
3940 || SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
3941 /* Perform usual caching, note code-generation still
3942 code-gens these nodes multiple times but we expect
3943 to CSE them later. */
3944 && !visited_set
.add (child
))
3946 visited_vec
.safe_push (child
);
3947 /* ??? After auditing more code paths make a "default"
3948 and push the vector type from NODE to all children
3949 if it is not already set. */
3950 /* Compute the number of vectors to be generated. */
3951 tree vector_type
= SLP_TREE_VECTYPE (child
);
3954 /* For shifts with a scalar argument we don't need
3955 to cost or code-generate anything.
3956 ??? Represent this more explicitely. */
3957 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node
))
3958 == shift_vec_info_type
)
3962 unsigned group_size
= SLP_TREE_LANES (child
);
3964 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3965 vf
= loop_vinfo
->vectorization_factor
;
3966 SLP_TREE_NUMBER_OF_VEC_STMTS (child
)
3967 = vect_get_num_vectors (vf
* group_size
, vector_type
);
3968 /* And cost them. */
3969 vect_prologue_cost_for_slp (child
, cost_vec
);
3972 /* If this node or any of its children can't be vectorized, try pruning
3973 the tree here rather than felling the whole thing. */
3974 if (!res
&& vect_slp_convert_to_external (vinfo
, node
, node_instance
))
3976 /* We'll need to revisit this for invariant costing and number
3977 of vectorized stmt setting. */
3985 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3986 region and that can be vectorized using vectorizable_live_operation
3987 with STMT_VINFO_LIVE_P. Not handled live operations will cause the
3988 scalar code computing it to be retained. */
/* NOTE(review): this chunk is a garbled extraction -- structural lines
   (braces, some declarations and control flow) were lost, so the code
   below is kept byte-identical rather than reformatted.  */
3991 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo
, slp_tree node
,
3992 slp_instance instance
,
3993 stmt_vector_for_cost
*cost_vec
,
3994 hash_set
<stmt_vec_info
> &svisited
,
3995 hash_set
<slp_tree
> &visited
)
/* Visit every SLP node only once.  */
3997 if (visited
.add (node
))
4001 stmt_vec_info stmt_info
;
4002 stmt_vec_info last_stmt
= vect_find_last_scalar_stmt_in_slp (node
);
4003 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
4005 if (svisited
.contains (stmt_info
))
4007 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
)
;
4008 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
)
4009 && STMT_VINFO_RELATED_STMT (orig_stmt_info
) != stmt_info
)
4010 /* Only the pattern root stmt computes the original scalar value. */
4012 bool mark_visited
= true;
4013 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
4014 ssa_op_iter op_iter
;
4015 def_operand_p def_p
;
/* For each SSA def of the original stmt look for a use that is not
   consumed by a pure-SLP stmt in this region -- such a def is live.  */
4016 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
4018 imm_use_iterator use_iter
;
4020 stmt_vec_info use_stmt_info
;
4021 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
4022 if (!is_gimple_debug (use_stmt
))
4024 use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
);
4026 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
4028 STMT_VINFO_LIVE_P (stmt_info
) = true;
4029 if (vectorizable_live_operation (bb_vinfo
, stmt_info
,
4030 NULL
, node
, instance
, i
,
4032 /* ??? So we know we can vectorize the live stmt
4033 from one SLP node. If we cannot do so from all
4034 or none consistently we'd have to record which
4035 SLP node (and lane) we want to use for the live
4036 operation. So make sure we can code-generate
4038 mark_visited
= false;
4040 STMT_VINFO_LIVE_P (stmt_info
) = false;
4044 /* We have to verify whether we can insert the lane extract
4045 before all uses. The following is a conservative approximation.
4046 We cannot put this into vectorizable_live_operation because
4047 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4049 Note that while the fact that we emit code for loads at the
4050 first load should make this a non-problem leafs we construct
4051 from scalars are vectorized after the last scalar def.
4052 ??? If we'd actually compute the insert location during
4053 analysis we could use sth less conservative than the last
4054 scalar stmt in the node for the dominance check. */
4055 /* ??? What remains is "live" uses in vector CTORs in the same
4056 SLP graph which is where those uses can end up code-generated
4057 right after their definition instead of close to their original
4058 use. But that would restrict us to code-generate lane-extracts
4059 from the latest stmt in a node. So we compensate for this
4060 during code-generation, simply not replacing uses for those
4061 hopefully rare cases. */
4062 if (STMT_VINFO_LIVE_P (stmt_info
))
4063 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
4064 if (!is_gimple_debug (use_stmt
)
4065 && (!(use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
))
4066 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
4067 && !vect_stmt_dominates_stmt_p (last_stmt
->stmt
, use_stmt
))
4069 if (dump_enabled_p ())
4070 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4071 "Cannot determine insertion place for "
4073 STMT_VINFO_LIVE_P (stmt_info
) = false;
4074 mark_visited
= true;
4078 svisited
.add (stmt_info
);
/* Recurse into internal-def children of this node.  */
4082 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4083 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
4084 vect_bb_slp_mark_live_stmts (bb_vinfo
, child
, instance
,
4085 cost_vec
, svisited
, visited
);
4088 /* Analyze statements in SLP instances of VINFO. Return true if the
4089 operations are supported. */
4092 vect_slp_analyze_operations (vec_info
*vinfo
)
4094 slp_instance instance
;
4097 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4099 hash_set
<slp_tree
> visited
;
4100 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); )
4102 auto_vec
<slp_tree
> visited_vec
;
4103 stmt_vector_for_cost cost_vec
;
4104 cost_vec
.create (2);
4105 if (is_a
<bb_vec_info
> (vinfo
))
4106 vect_location
= instance
->location ();
4107 if (!vect_slp_analyze_node_operations (vinfo
,
4108 SLP_INSTANCE_TREE (instance
),
4109 instance
, visited
, visited_vec
,
4111 /* Instances with a root stmt require vectorized defs for the
4113 || (SLP_INSTANCE_ROOT_STMT (instance
)
4114 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance
))
4115 != vect_internal_def
)))
4117 slp_tree node
= SLP_INSTANCE_TREE (instance
);
4118 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
4119 if (dump_enabled_p ())
4120 dump_printf_loc (MSG_NOTE
, vect_location
,
4121 "removing SLP instance operations starting from: %G",
4123 vect_free_slp_instance (instance
);
4124 vinfo
->slp_instances
.ordered_remove (i
);
4125 cost_vec
.release ();
4126 while (!visited_vec
.is_empty ())
4127 visited
.remove (visited_vec
.pop ());
4133 /* For BB vectorization remember the SLP graph entry
4135 if (is_a
<bb_vec_info
> (vinfo
))
4136 instance
->cost_vec
= cost_vec
;
4139 add_stmt_costs (vinfo
, vinfo
->target_cost_data
, &cost_vec
);
4140 cost_vec
.release ();
4145 /* Compute vectorizable live stmts. */
4146 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
4148 hash_set
<stmt_vec_info
> svisited
;
4149 hash_set
<slp_tree
> visited
;
4150 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
4152 vect_location
= instance
->location ();
4153 vect_bb_slp_mark_live_stmts (bb_vinfo
, SLP_INSTANCE_TREE (instance
),
4154 instance
, &instance
->cost_vec
, svisited
,
4159 return !vinfo
->slp_instances
.is_empty ();
4162 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
4163 closing the eventual chain. */
4166 get_ultimate_leader (slp_instance instance
,
4167 hash_map
<slp_instance
, slp_instance
> &instance_leader
)
4169 auto_vec
<slp_instance
*, 8> chain
;
4171 while (*(tem
= instance_leader
.get (instance
)) != instance
)
4173 chain
.safe_push (tem
);
4176 while (!chain
.is_empty ())
4177 *chain
.pop () = instance
;
4181 /* Worker of vect_bb_partition_graph, recurse on NODE. */
4184 vect_bb_partition_graph_r (bb_vec_info bb_vinfo
,
4185 slp_instance instance
, slp_tree node
,
4186 hash_map
<stmt_vec_info
, slp_instance
> &stmt_to_instance
,
4187 hash_map
<slp_instance
, slp_instance
> &instance_leader
,
4188 hash_set
<slp_tree
> &visited
)
4190 stmt_vec_info stmt_info
;
4193 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
4196 slp_instance
&stmt_instance
4197 = stmt_to_instance
.get_or_insert (stmt_info
, &existed_p
);
4200 else if (stmt_instance
!= instance
)
4202 /* If we're running into a previously marked stmt make us the
4203 leader of the current ultimate leader. This keeps the
4204 leader chain acyclic and works even when the current instance
4205 connects two previously independent graph parts. */
4206 slp_instance stmt_leader
4207 = get_ultimate_leader (stmt_instance
, instance_leader
);
4208 if (stmt_leader
!= instance
)
4209 instance_leader
.put (stmt_leader
, instance
);
4211 stmt_instance
= instance
;
4214 if (visited
.add (node
))
4218 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4219 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
4220 vect_bb_partition_graph_r (bb_vinfo
, instance
, child
, stmt_to_instance
,
4221 instance_leader
, visited
);
4224 /* Partition the SLP graph into pieces that can be costed independently. */
4227 vect_bb_partition_graph (bb_vec_info bb_vinfo
)
4229 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
4231 /* First walk the SLP graph assigning each involved scalar stmt a
4232 corresponding SLP graph entry and upon visiting a previously
4233 marked stmt, make the stmts leader the current SLP graph entry. */
4234 hash_map
<stmt_vec_info
, slp_instance
> stmt_to_instance
;
4235 hash_map
<slp_instance
, slp_instance
> instance_leader
;
4236 hash_set
<slp_tree
> visited
;
4237 slp_instance instance
;
4238 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
4240 instance_leader
.put (instance
, instance
);
4241 vect_bb_partition_graph_r (bb_vinfo
,
4242 instance
, SLP_INSTANCE_TREE (instance
),
4243 stmt_to_instance
, instance_leader
,
4247 /* Then collect entries to each independent subgraph. */
4248 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
4250 slp_instance leader
= get_ultimate_leader (instance
, instance_leader
);
4251 leader
->subgraph_entries
.safe_push (instance
);
4252 if (dump_enabled_p ()
4253 && leader
!= instance
)
4254 dump_printf_loc (MSG_NOTE
, vect_location
,
4255 "instance %p is leader of %p\n",
4260 /* Compute the scalar cost of the SLP node NODE and its children
4261 and return it. Do not account defs that are marked in LIFE and
4262 update LIFE according to uses of NODE. */
/* NOTE(review): this chunk is a garbled extraction -- structural lines
   (braces, some declarations and control flow) were lost, so the code
   below is kept byte-identical rather than reformatted.  */
4265 vect_bb_slp_scalar_cost (vec_info
*vinfo
,
4266 slp_tree node
, vec
<bool, va_heap
> *life
,
4267 stmt_vector_for_cost
*cost_vec
,
4268 hash_set
<slp_tree
> &visited
)
4271 stmt_vec_info stmt_info
;
/* Visit every SLP node only once.  */
4274 if (visited
.add (node
))
4277 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
4279 ssa_op_iter op_iter
;
4280 def_operand_p def_p
;
4285 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
4286 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
4288 /* If there is a non-vectorized use of the defs then the scalar
4289 stmt is kept live in which case we do not account it or any
4290 required defs in the SLP children in the scalar cost. This
4291 way we make the vectorization more costly when compared to
4293 if (!STMT_VINFO_LIVE_P (stmt_info
))
/* Scan the immediate uses of each SSA def to detect uses outside
   the pure-SLP region.  */
4295 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
4297 imm_use_iterator use_iter
;
4299 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
4300 if (!is_gimple_debug (use_stmt
))
4302 stmt_vec_info use_stmt_info
= vinfo
->lookup_stmt (use_stmt
);
4305 (vect_stmt_to_vectorize (use_stmt_info
)))
4316 /* Count scalar stmts only once. */
4317 if (gimple_visited_p (orig_stmt
))
4319 gimple_set_visited (orig_stmt
, true);
/* Classify the scalar stmt for costing: load/store for data refs,
   nothing for no-op conversions and single-argument PHIs.  */
4321 vect_cost_for_stmt kind
;
4322 if (STMT_VINFO_DATA_REF (orig_stmt_info
))
4324 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info
)))
4327 kind
= scalar_store
;
4329 else if (vect_nop_conversion_p (orig_stmt_info
))
4331 /* For single-argument PHIs assume coalescing which means zero cost
4332 for the scalar and the vector PHIs. This avoids artificially
4333 favoring the vector path (but may pessimize it in some cases). */
4334 else if (is_a
<gphi
*> (orig_stmt_info
->stmt
)
4335 && gimple_phi_num_args
4336 (as_a
<gphi
*> (orig_stmt_info
->stmt
)) == 1)
4340 record_stmt_cost (cost_vec
, 1, kind
, orig_stmt_info
,
4341 SLP_TREE_VECTYPE (node
), 0, vect_body
);
/* Propagate the life information into each internal-def child,
   remapping lanes through any lane permutation of this node.  */
4344 auto_vec
<bool, 20> subtree_life
;
4345 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4347 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
4349 /* Do not directly pass LIFE to the recursive call, copy it to
4350 confine changes in the callee to the current child/subtree. */
4351 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
4353 subtree_life
.safe_grow_cleared (SLP_TREE_LANES (child
), true);
4354 for (unsigned j
= 0;
4355 j
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++j
)
4357 auto perm
= SLP_TREE_LANE_PERMUTATION (node
)[j
];
4358 if (perm
.first
== i
)
4359 subtree_life
[perm
.second
] = (*life
)[j
];
4364 gcc_assert (SLP_TREE_LANES (node
) == SLP_TREE_LANES (child
));
4365 subtree_life
.safe_splice (*life
);
4367 vect_bb_slp_scalar_cost (vinfo
, child
, &subtree_life
, cost_vec
,
4369 subtree_life
.truncate (0);
4374 /* Comparator for the loop-index sorted cost vectors. */
4377 li_cost_vec_cmp (const void *a_
, const void *b_
)
4379 auto *a
= (const std::pair
<unsigned, stmt_info_for_cost
*> *)a_
;
4380 auto *b
= (const std::pair
<unsigned, stmt_info_for_cost
*> *)b_
;
4381 if (a
->first
< b
->first
)
4383 else if (a
->first
== b
->first
)
4388 /* Check if vectorization of the basic block is profitable for the
4389 subgraph denoted by SLP_INSTANCES. */
4392 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo
,
4393 vec
<slp_instance
> slp_instances
)
4395 slp_instance instance
;
4397 unsigned int vec_inside_cost
= 0, vec_outside_cost
= 0, scalar_cost
= 0;
4398 unsigned int vec_prologue_cost
= 0, vec_epilogue_cost
= 0;
4400 if (dump_enabled_p ())
4402 dump_printf_loc (MSG_NOTE
, vect_location
, "Costing subgraph: \n");
4403 hash_set
<slp_tree
> visited
;
4404 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
4405 vect_print_slp_graph (MSG_NOTE
, vect_location
,
4406 SLP_INSTANCE_TREE (instance
), visited
);
4409 /* Calculate scalar cost and sum the cost for the vector stmts
4410 previously collected. */
4411 stmt_vector_for_cost scalar_costs
= vNULL
;
4412 stmt_vector_for_cost vector_costs
= vNULL
;
4413 hash_set
<slp_tree
> visited
;
4414 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
4416 auto_vec
<bool, 20> life
;
4417 life
.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance
)),
4419 if (SLP_INSTANCE_ROOT_STMT (instance
))
4420 record_stmt_cost (&scalar_costs
, 1, scalar_stmt
,
4421 SLP_INSTANCE_ROOT_STMT (instance
), 0, vect_body
);
4422 vect_bb_slp_scalar_cost (bb_vinfo
,
4423 SLP_INSTANCE_TREE (instance
),
4424 &life
, &scalar_costs
, visited
);
4425 vector_costs
.safe_splice (instance
->cost_vec
);
4426 instance
->cost_vec
.release ();
4428 /* Unset visited flag. */
4429 stmt_info_for_cost
*cost
;
4430 FOR_EACH_VEC_ELT (scalar_costs
, i
, cost
)
4431 gimple_set_visited (cost
->stmt_info
->stmt
, false);
4433 if (dump_enabled_p ())
4434 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
4436 /* When costing non-loop vectorization we need to consider each covered
4437 loop independently and make sure vectorization is profitable. For
4438 now we assume a loop may be not entered or executed an arbitrary
4439 number of iterations (??? static information can provide more
4440 precise info here) which means we can simply cost each containing
4441 loops stmts separately. */
4443 /* First produce cost vectors sorted by loop index. */
4444 auto_vec
<std::pair
<unsigned, stmt_info_for_cost
*> >
4445 li_scalar_costs (scalar_costs
.length ());
4446 auto_vec
<std::pair
<unsigned, stmt_info_for_cost
*> >
4447 li_vector_costs (vector_costs
.length ());
4448 FOR_EACH_VEC_ELT (scalar_costs
, i
, cost
)
4450 unsigned l
= gimple_bb (cost
->stmt_info
->stmt
)->loop_father
->num
;
4451 li_scalar_costs
.quick_push (std::make_pair (l
, cost
));
4453 /* Use a random used loop as fallback in case the first vector_costs
4454 entry does not have a stmt_info associated with it. */
4455 unsigned l
= li_scalar_costs
[0].first
;
4456 FOR_EACH_VEC_ELT (vector_costs
, i
, cost
)
4458 /* We inherit from the previous COST, invariants, externals and
4459 extracts immediately follow the cost for the related stmt. */
4460 if (cost
->stmt_info
)
4461 l
= gimple_bb (cost
->stmt_info
->stmt
)->loop_father
->num
;
4462 li_vector_costs
.quick_push (std::make_pair (l
, cost
));
4464 li_scalar_costs
.qsort (li_cost_vec_cmp
);
4465 li_vector_costs
.qsort (li_cost_vec_cmp
);
4467 /* Now cost the portions individually. */
4470 while (si
< li_scalar_costs
.length ()
4471 && vi
< li_vector_costs
.length ())
4473 unsigned sl
= li_scalar_costs
[si
].first
;
4474 unsigned vl
= li_vector_costs
[vi
].first
;
4477 if (dump_enabled_p ())
4478 dump_printf_loc (MSG_NOTE
, vect_location
,
4479 "Scalar %d and vector %d loop part do not "
4480 "match up, skipping scalar part\n", sl
, vl
);
4481 /* Skip the scalar part, assuming zero cost on the vector side. */
4486 while (si
< li_scalar_costs
.length ()
4487 && li_scalar_costs
[si
].first
== sl
);
4491 void *scalar_target_cost_data
= init_cost (NULL
);
4494 add_stmt_cost (bb_vinfo
, scalar_target_cost_data
,
4495 li_scalar_costs
[si
].second
);
4498 while (si
< li_scalar_costs
.length ()
4499 && li_scalar_costs
[si
].first
== sl
);
4501 finish_cost (scalar_target_cost_data
, &dummy
, &scalar_cost
, &dummy
);
4502 destroy_cost_data (scalar_target_cost_data
);
4504 /* Complete the target-specific vector cost calculation. */
4505 void *vect_target_cost_data
= init_cost (NULL
);
4508 add_stmt_cost (bb_vinfo
, vect_target_cost_data
,
4509 li_vector_costs
[vi
].second
);
4512 while (vi
< li_vector_costs
.length ()
4513 && li_vector_costs
[vi
].first
== vl
);
4514 finish_cost (vect_target_cost_data
, &vec_prologue_cost
,
4515 &vec_inside_cost
, &vec_epilogue_cost
);
4516 destroy_cost_data (vect_target_cost_data
);
4518 vec_outside_cost
= vec_prologue_cost
+ vec_epilogue_cost
;
4520 if (dump_enabled_p ())
4522 dump_printf_loc (MSG_NOTE
, vect_location
,
4523 "Cost model analysis for part in loop %d:\n", sl
);
4524 dump_printf (MSG_NOTE
, " Vector cost: %d\n",
4525 vec_inside_cost
+ vec_outside_cost
);
4526 dump_printf (MSG_NOTE
, " Scalar cost: %d\n", scalar_cost
);
4529 /* Vectorization is profitable if its cost is more than the cost of scalar
4530 version. Note that we err on the vector side for equal cost because
4531 the cost estimate is otherwise quite pessimistic (constant uses are
4532 free on the scalar side but cost a load on the vector side for
4534 if (vec_outside_cost
+ vec_inside_cost
> scalar_cost
)
4536 scalar_costs
.release ();
4537 vector_costs
.release ();
4541 if (vi
< li_vector_costs
.length ())
4543 if (dump_enabled_p ())
4544 dump_printf_loc (MSG_NOTE
, vect_location
,
4545 "Excess vector cost for part in loop %d:\n",
4546 li_vector_costs
[vi
].first
);
4547 scalar_costs
.release ();
4548 vector_costs
.release ();
4552 scalar_costs
.release ();
4553 vector_costs
.release ();
4557 /* qsort comparator for lane defs. */
4560 vld_cmp (const void *a_
, const void *b_
)
4562 auto *a
= (const std::pair
<unsigned, tree
> *)a_
;
4563 auto *b
= (const std::pair
<unsigned, tree
> *)b_
;
4564 return a
->first
- b
->first
;
4567 /* Return true if USE_STMT is a vector lane insert into VEC and set
4568 *THIS_LANE to the lane number that is set. */
4571 vect_slp_is_lane_insert (gimple
*use_stmt
, tree vec
, unsigned *this_lane
)
4573 gassign
*use_ass
= dyn_cast
<gassign
*> (use_stmt
);
4575 || gimple_assign_rhs_code (use_ass
) != BIT_INSERT_EXPR
4577 ? gimple_assign_rhs1 (use_ass
) != vec
4578 : ((vec
= gimple_assign_rhs1 (use_ass
)), false))
4579 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec
)),
4580 TREE_TYPE (gimple_assign_rhs2 (use_ass
)))
4581 || !constant_multiple_p
4582 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass
)),
4583 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec
)))),
4589 /* Find any vectorizable constructors and add them to the grouped_store
4593 vect_slp_check_for_constructors (bb_vec_info bb_vinfo
)
4595 for (unsigned i
= 0; i
< bb_vinfo
->bbs
.length (); ++i
)
4596 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb_vinfo
->bbs
[i
]);
4597 !gsi_end_p (gsi
); gsi_next (&gsi
))
4599 gassign
*assign
= dyn_cast
<gassign
*> (gsi_stmt (gsi
));
4603 tree rhs
= gimple_assign_rhs1 (assign
);
4604 if (gimple_assign_rhs_code (assign
) == CONSTRUCTOR
)
4606 if (!VECTOR_TYPE_P (TREE_TYPE (rhs
))
4607 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)),
4608 CONSTRUCTOR_NELTS (rhs
))
4609 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs
, 0)->value
))
4610 || uniform_vector_p (rhs
))
4615 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), j
, val
)
4616 if (TREE_CODE (val
) != SSA_NAME
4617 || !bb_vinfo
->lookup_def (val
))
4619 if (j
!= CONSTRUCTOR_NELTS (rhs
))
4622 stmt_vec_info stmt_info
= bb_vinfo
->lookup_stmt (assign
);
4623 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
4625 else if (gimple_assign_rhs_code (assign
) == BIT_INSERT_EXPR
4626 && VECTOR_TYPE_P (TREE_TYPE (rhs
))
4627 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)).is_constant ()
4628 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)).to_constant () > 1
4629 && integer_zerop (gimple_assign_rhs3 (assign
))
4630 && useless_type_conversion_p
4631 (TREE_TYPE (TREE_TYPE (rhs
)),
4632 TREE_TYPE (gimple_assign_rhs2 (assign
)))
4633 && bb_vinfo
->lookup_def (gimple_assign_rhs2 (assign
)))
4635 /* We start to match on insert to lane zero but since the
4636 inserts need not be ordered we'd have to search both
4637 the def and the use chains. */
4638 tree vectype
= TREE_TYPE (rhs
);
4639 unsigned nlanes
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
4640 auto_vec
<std::pair
<unsigned, tree
> > lane_defs (nlanes
);
4641 auto_sbitmap
lanes (nlanes
);
4642 bitmap_clear (lanes
);
4643 bitmap_set_bit (lanes
, 0);
4644 tree def
= gimple_assign_lhs (assign
);
4645 lane_defs
.quick_push
4646 (std::make_pair (0, gimple_assign_rhs2 (assign
)));
4647 unsigned lanes_found
= 1;
4648 /* Start with the use chains, the last stmt will be the root. */
4649 stmt_vec_info last
= bb_vinfo
->lookup_stmt (assign
);
4652 use_operand_p use_p
;
4654 if (!single_imm_use (def
, &use_p
, &use_stmt
))
4657 if (!bb_vinfo
->lookup_stmt (use_stmt
)
4658 || !vect_slp_is_lane_insert (use_stmt
, def
, &this_lane
)
4659 || !bb_vinfo
->lookup_def (gimple_assign_rhs2 (use_stmt
)))
4661 if (bitmap_bit_p (lanes
, this_lane
))
4664 bitmap_set_bit (lanes
, this_lane
);
4665 gassign
*use_ass
= as_a
<gassign
*> (use_stmt
);
4666 lane_defs
.quick_push (std::make_pair
4667 (this_lane
, gimple_assign_rhs2 (use_ass
)));
4668 last
= bb_vinfo
->lookup_stmt (use_ass
);
4669 def
= gimple_assign_lhs (use_ass
);
4671 while (lanes_found
< nlanes
);
4672 if (lanes_found
< nlanes
)
4674 /* Now search the def chain. */
4675 def
= gimple_assign_rhs1 (assign
);
4678 if (TREE_CODE (def
) != SSA_NAME
4679 || !has_single_use (def
))
4681 gimple
*def_stmt
= SSA_NAME_DEF_STMT (def
);
4683 if (!bb_vinfo
->lookup_stmt (def_stmt
)
4684 || !vect_slp_is_lane_insert (def_stmt
,
4685 NULL_TREE
, &this_lane
)
4686 || !bb_vinfo
->lookup_def (gimple_assign_rhs2 (def_stmt
)))
4688 if (bitmap_bit_p (lanes
, this_lane
))
4691 bitmap_set_bit (lanes
, this_lane
);
4692 lane_defs
.quick_push (std::make_pair
4694 gimple_assign_rhs2 (def_stmt
)));
4695 def
= gimple_assign_rhs1 (def_stmt
);
4697 while (lanes_found
< nlanes
);
4699 if (lanes_found
== nlanes
)
4701 /* Sort lane_defs after the lane index and register the root. */
4702 lane_defs
.qsort (vld_cmp
);
4703 vec
<stmt_vec_info
> stmts
;
4704 stmts
.create (nlanes
);
4705 for (unsigned i
= 0; i
< nlanes
; ++i
)
4706 stmts
.quick_push (bb_vinfo
->lookup_def (lane_defs
[i
].second
));
4707 bb_vinfo
->roots
.safe_push (slp_root (slp_inst_kind_ctor
,
4714 /* Walk the grouped store chains and replace entries with their
4715 pattern variant if any. */
4718 vect_fixup_store_groups_with_patterns (vec_info
*vinfo
)
4720 stmt_vec_info first_element
;
4723 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
4725 /* We also have CTORs in this array. */
4726 if (!STMT_VINFO_GROUPED_ACCESS (first_element
))
4728 if (STMT_VINFO_IN_PATTERN_P (first_element
))
4730 stmt_vec_info orig
= first_element
;
4731 first_element
= STMT_VINFO_RELATED_STMT (first_element
);
4732 DR_GROUP_FIRST_ELEMENT (first_element
) = first_element
;
4733 DR_GROUP_SIZE (first_element
) = DR_GROUP_SIZE (orig
);
4734 DR_GROUP_GAP (first_element
) = DR_GROUP_GAP (orig
);
4735 DR_GROUP_NEXT_ELEMENT (first_element
) = DR_GROUP_NEXT_ELEMENT (orig
);
4736 vinfo
->grouped_stores
[i
] = first_element
;
4738 stmt_vec_info prev
= first_element
;
4739 while (DR_GROUP_NEXT_ELEMENT (prev
))
4741 stmt_vec_info elt
= DR_GROUP_NEXT_ELEMENT (prev
);
4742 if (STMT_VINFO_IN_PATTERN_P (elt
))
4744 stmt_vec_info orig
= elt
;
4745 elt
= STMT_VINFO_RELATED_STMT (elt
);
4746 DR_GROUP_NEXT_ELEMENT (prev
) = elt
;
4747 DR_GROUP_GAP (elt
) = DR_GROUP_GAP (orig
);
4748 DR_GROUP_NEXT_ELEMENT (elt
) = DR_GROUP_NEXT_ELEMENT (orig
);
4750 DR_GROUP_FIRST_ELEMENT (elt
) = first_element
;
4756 /* Check if the region described by BB_VINFO can be vectorized, returning
4757 true if so. When returning false, set FATAL to true if the same failure
4758 would prevent vectorization at other vector sizes, false if it is still
4759 worth trying other sizes. N_STMTS is the number of statements in the
4763 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo
, int n_stmts
, bool &fatal
,
4764 vec
<int> *dataref_groups
)
4766 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4768 slp_instance instance
;
4770 poly_uint64 min_vf
= 2;
4772 /* The first group of checks is independent of the vector size. */
4775 /* Analyze the data references. */
4777 if (!vect_analyze_data_refs (bb_vinfo
, &min_vf
, NULL
))
4779 if (dump_enabled_p ())
4780 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4781 "not vectorized: unhandled data-ref in basic "
4786 if (!vect_analyze_data_ref_accesses (bb_vinfo
, dataref_groups
))
4788 if (dump_enabled_p ())
4789 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4790 "not vectorized: unhandled data access in "
4795 vect_slp_check_for_constructors (bb_vinfo
);
4797 /* If there are no grouped stores and no constructors in the region
4798 there is no need to continue with pattern recog as vect_analyze_slp
4799 will fail anyway. */
4800 if (bb_vinfo
->grouped_stores
.is_empty ()
4801 && bb_vinfo
->roots
.is_empty ())
4803 if (dump_enabled_p ())
4804 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4805 "not vectorized: no grouped stores in "
4810 /* While the rest of the analysis below depends on it in some way. */
4813 vect_pattern_recog (bb_vinfo
);
4815 /* Update store groups from pattern processing. */
4816 vect_fixup_store_groups_with_patterns (bb_vinfo
);
4818 /* Check the SLP opportunities in the basic block, analyze and build SLP
4820 if (!vect_analyze_slp (bb_vinfo
, n_stmts
))
4822 if (dump_enabled_p ())
4824 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4825 "Failed to SLP the basic block.\n");
4826 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4827 "not vectorized: failed to find SLP opportunities "
4828 "in basic block.\n");
4833 /* Optimize permutations. */
4834 vect_optimize_slp (bb_vinfo
);
4836 /* Gather the loads reachable from the SLP graph entries. */
4837 vect_gather_slp_loads (bb_vinfo
);
4839 vect_record_base_alignments (bb_vinfo
);
4841 /* Analyze and verify the alignment of data references and the
4842 dependence in the SLP instances. */
4843 for (i
= 0; BB_VINFO_SLP_INSTANCES (bb_vinfo
).iterate (i
, &instance
); )
4845 vect_location
= instance
->location ();
4846 if (! vect_slp_analyze_instance_alignment (bb_vinfo
, instance
)
4847 || ! vect_slp_analyze_instance_dependence (bb_vinfo
, instance
))
4849 slp_tree node
= SLP_INSTANCE_TREE (instance
);
4850 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
4851 if (dump_enabled_p ())
4852 dump_printf_loc (MSG_NOTE
, vect_location
,
4853 "removing SLP instance operations starting from: %G",
4855 vect_free_slp_instance (instance
);
4856 BB_VINFO_SLP_INSTANCES (bb_vinfo
).ordered_remove (i
);
4860 /* Mark all the statements that we want to vectorize as pure SLP and
4862 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
4863 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance
));
4864 if (stmt_vec_info root
= SLP_INSTANCE_ROOT_STMT (instance
))
4866 STMT_SLP_TYPE (root
) = pure_slp
;
4867 if (is_gimple_assign (root
->stmt
)
4868 && gimple_assign_rhs_code (root
->stmt
) == BIT_INSERT_EXPR
)
4870 /* ??? We should probably record the whole vector of
4871 root stmts so we do not have to back-track here... */
4872 for (unsigned n
= SLP_TREE_LANES (SLP_INSTANCE_TREE (instance
));
4875 root
= bb_vinfo
->lookup_def (gimple_assign_rhs1 (root
->stmt
));
4876 STMT_SLP_TYPE (root
) = pure_slp
;
4883 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo
).length ())
4886 if (!vect_slp_analyze_operations (bb_vinfo
))
4888 if (dump_enabled_p ())
4889 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4890 "not vectorized: bad operation in basic block.\n");
4894 vect_bb_partition_graph (bb_vinfo
);
4899 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4900 basic blocks in BBS, returning true on success.
4901 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4904 vect_slp_region (vec
<basic_block
> bbs
, vec
<data_reference_p
> datarefs
,
4905 vec
<int> *dataref_groups
, unsigned int n_stmts
)
4907 bb_vec_info bb_vinfo
;
4908 auto_vector_modes vector_modes
;
4910 /* Autodetect first vector size we try. */
4911 machine_mode next_vector_mode
= VOIDmode
;
4912 targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
, false);
4913 unsigned int mode_i
= 0;
4915 vec_info_shared shared
;
4917 machine_mode autodetected_vector_mode
= VOIDmode
;
4920 bool vectorized
= false;
4922 bb_vinfo
= new _bb_vec_info (bbs
, &shared
);
4924 bool first_time_p
= shared
.datarefs
.is_empty ();
4925 BB_VINFO_DATAREFS (bb_vinfo
) = datarefs
;
4927 bb_vinfo
->shared
->save_datarefs ();
4929 bb_vinfo
->shared
->check_datarefs ();
4930 bb_vinfo
->vector_mode
= next_vector_mode
;
4932 if (vect_slp_analyze_bb_1 (bb_vinfo
, n_stmts
, fatal
, dataref_groups
))
4934 if (dump_enabled_p ())
4936 dump_printf_loc (MSG_NOTE
, vect_location
,
4937 "***** Analysis succeeded with vector mode"
4938 " %s\n", GET_MODE_NAME (bb_vinfo
->vector_mode
));
4939 dump_printf_loc (MSG_NOTE
, vect_location
, "SLPing BB part\n");
4942 bb_vinfo
->shared
->check_datarefs ();
4945 slp_instance instance
;
4946 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo
), i
, instance
)
4948 if (instance
->subgraph_entries
.is_empty ())
4951 vect_location
= instance
->location ();
4952 if (!unlimited_cost_model (NULL
)
4953 && !vect_bb_vectorization_profitable_p
4954 (bb_vinfo
, instance
->subgraph_entries
))
4956 if (dump_enabled_p ())
4957 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4958 "not vectorized: vectorization is not "
4963 if (!dbg_cnt (vect_slp
))
4966 if (!vectorized
&& dump_enabled_p ())
4967 dump_printf_loc (MSG_NOTE
, vect_location
,
4968 "Basic block will be vectorized "
4972 vect_schedule_slp (bb_vinfo
, instance
->subgraph_entries
);
4974 unsigned HOST_WIDE_INT bytes
;
4975 if (dump_enabled_p ())
4978 (bb_vinfo
->vector_mode
).is_constant (&bytes
))
4979 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
4980 "basic block part vectorized using %wu "
4981 "byte vectors\n", bytes
);
4983 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
4984 "basic block part vectorized using "
4985 "variable length vectors\n");
4991 if (dump_enabled_p ())
4992 dump_printf_loc (MSG_NOTE
, vect_location
,
4993 "***** Analysis failed with vector mode %s\n",
4994 GET_MODE_NAME (bb_vinfo
->vector_mode
));
4998 autodetected_vector_mode
= bb_vinfo
->vector_mode
;
5001 while (mode_i
< vector_modes
.length ()
5002 && vect_chooses_same_modes_p (bb_vinfo
, vector_modes
[mode_i
]))
5004 if (dump_enabled_p ())
5005 dump_printf_loc (MSG_NOTE
, vect_location
,
5006 "***** The result for vector mode %s would"
5008 GET_MODE_NAME (vector_modes
[mode_i
]));
5014 if (mode_i
< vector_modes
.length ()
5015 && VECTOR_MODE_P (autodetected_vector_mode
)
5016 && (related_vector_mode (vector_modes
[mode_i
],
5017 GET_MODE_INNER (autodetected_vector_mode
))
5018 == autodetected_vector_mode
)
5019 && (related_vector_mode (autodetected_vector_mode
,
5020 GET_MODE_INNER (vector_modes
[mode_i
]))
5021 == vector_modes
[mode_i
]))
5023 if (dump_enabled_p ())
5024 dump_printf_loc (MSG_NOTE
, vect_location
,
5025 "***** Skipping vector mode %s, which would"
5026 " repeat the analysis for %s\n",
5027 GET_MODE_NAME (vector_modes
[mode_i
]),
5028 GET_MODE_NAME (autodetected_vector_mode
));
5033 || mode_i
== vector_modes
.length ()
5034 || autodetected_vector_mode
== VOIDmode
5035 /* If vect_slp_analyze_bb_1 signaled that analysis for all
5036 vector sizes will fail do not bother iterating. */
5040 /* Try the next biggest vector size. */
5041 next_vector_mode
= vector_modes
[mode_i
++];
5042 if (dump_enabled_p ())
5043 dump_printf_loc (MSG_NOTE
, vect_location
,
5044 "***** Re-trying analysis with vector mode %s\n",
5045 GET_MODE_NAME (next_vector_mode
));
5050 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
5051 true if anything in the basic-block was vectorized. */
5054 vect_slp_bbs (vec
<basic_block
> bbs
)
5056 vec
<data_reference_p
> datarefs
= vNULL
;
5057 auto_vec
<int> dataref_groups
;
5059 int current_group
= 0;
5061 for (unsigned i
= 0; i
< bbs
.length (); i
++)
5063 basic_block bb
= bbs
[i
];
5064 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
5067 gimple
*stmt
= gsi_stmt (gsi
);
5068 if (is_gimple_debug (stmt
))
5073 if (gimple_location (stmt
) != UNKNOWN_LOCATION
)
5074 vect_location
= stmt
;
5076 if (!vect_find_stmt_data_reference (NULL
, stmt
, &datarefs
,
5077 &dataref_groups
, current_group
))
5082 return vect_slp_region (bbs
, datarefs
, &dataref_groups
, insns
);
5085 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5086 true if anything in the basic-block was vectorized. */
5089 vect_slp_bb (basic_block bb
)
5091 auto_vec
<basic_block
> bbs
;
5093 return vect_slp_bbs (bbs
);
5096 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5097 true if anything in the basic-block was vectorized. */
5100 vect_slp_function (function
*fun
)
5103 int *rpo
= XNEWVEC (int, n_basic_blocks_for_fn (fun
));
5104 unsigned n
= pre_and_rev_post_order_compute_fn (fun
, NULL
, rpo
, false);
5106 /* For the moment split the function into pieces to avoid making
5107 the iteration on the vector mode moot. Split at points we know
5108 to not handle well which is CFG merges (SLP discovery doesn't
5109 handle non-loop-header PHIs) and loop exits. Since pattern
5110 recog requires reverse iteration to visit uses before defs
5111 simply chop RPO into pieces. */
5112 auto_vec
<basic_block
> bbs
;
5113 for (unsigned i
= 0; i
< n
; i
++)
5115 basic_block bb
= BASIC_BLOCK_FOR_FN (fun
, rpo
[i
]);
5118 /* Split when a BB is not dominated by the first block. */
5119 if (!bbs
.is_empty ()
5120 && !dominated_by_p (CDI_DOMINATORS
, bb
, bbs
[0]))
5122 if (dump_enabled_p ())
5123 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5124 "splitting region at dominance boundary bb%d\n",
5128 /* Split when the loop determined by the first block
5129 is exited. This is because we eventually insert
5130 invariants at region begin. */
5131 else if (!bbs
.is_empty ()
5132 && bbs
[0]->loop_father
!= bb
->loop_father
5133 && !flow_loop_nested_p (bbs
[0]->loop_father
, bb
->loop_father
))
5135 if (dump_enabled_p ())
5136 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5137 "splitting region at loop %d exit at bb%d\n",
5138 bbs
[0]->loop_father
->num
, bb
->index
);
5142 if (split
&& !bbs
.is_empty ())
5144 r
|= vect_slp_bbs (bbs
);
5146 bbs
.quick_push (bb
);
5151 /* When we have a stmt ending this block and defining a
5152 value we have to insert on edges when inserting after it for
5153 a vector containing its definition. Avoid this for now. */
5154 if (gimple
*last
= last_stmt (bb
))
5155 if (gimple_get_lhs (last
)
5156 && is_ctrl_altering_stmt (last
))
5158 if (dump_enabled_p ())
5159 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5160 "splitting region at control altering "
5161 "definition %G", last
);
5162 r
|= vect_slp_bbs (bbs
);
5167 if (!bbs
.is_empty ())
5168 r
|= vect_slp_bbs (bbs
);
5175 /* Build a variable-length vector in which the elements in ELTS are repeated
5176 to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
5177 RESULTS and add any new instructions to SEQ.
5179 The approach we use is:
5181 (1) Find a vector mode VM with integer elements of mode IM.
5183 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5184 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
5185 from small vectors to IM.
5187 (3) Duplicate each ELTS'[I] into a vector of mode VM.
5189 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
5190 correct byte contents.
5192 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
5194 We try to find the largest IM for which this sequence works, in order
5195 to cut down on the number of interleaves. */
5198 duplicate_and_interleave (vec_info
*vinfo
, gimple_seq
*seq
, tree vector_type
,
5199 vec
<tree
> elts
, unsigned int nresults
,
5202 unsigned int nelts
= elts
.length ();
5203 tree element_type
= TREE_TYPE (vector_type
);
5205 /* (1) Find a vector mode VM with integer elements of mode IM. */
5206 unsigned int nvectors
= 1;
5207 tree new_vector_type
;
5209 if (!can_duplicate_and_interleave_p (vinfo
, nelts
, element_type
,
5210 &nvectors
, &new_vector_type
,
5214 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
5215 unsigned int partial_nelts
= nelts
/ nvectors
;
5216 tree partial_vector_type
= build_vector_type (element_type
, partial_nelts
);
5218 tree_vector_builder partial_elts
;
5219 auto_vec
<tree
, 32> pieces (nvectors
* 2);
5220 pieces
.quick_grow_cleared (nvectors
* 2);
5221 for (unsigned int i
= 0; i
< nvectors
; ++i
)
5223 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5224 ELTS' has mode IM. */
5225 partial_elts
.new_vector (partial_vector_type
, partial_nelts
, 1);
5226 for (unsigned int j
= 0; j
< partial_nelts
; ++j
)
5227 partial_elts
.quick_push (elts
[i
* partial_nelts
+ j
]);
5228 tree t
= gimple_build_vector (seq
, &partial_elts
);
5229 t
= gimple_build (seq
, VIEW_CONVERT_EXPR
,
5230 TREE_TYPE (new_vector_type
), t
);
5232 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
5233 pieces
[i
] = gimple_build_vector_from_val (seq
, new_vector_type
, t
);
5236 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
5237 correct byte contents.
5239 Conceptually, we need to repeat the following operation log2(nvectors)
5240 times, where hi_start = nvectors / 2:
5242 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
5243 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
5245 However, if each input repeats every N elements and the VF is
5246 a multiple of N * 2, the HI result is the same as the LO result.
5247 This will be true for the first N1 iterations of the outer loop,
5248 followed by N2 iterations for which both the LO and HI results
5251 N1 + N2 = log2(nvectors)
5253 Each "N1 iteration" doubles the number of redundant vectors and the
5254 effect of the process as a whole is to have a sequence of nvectors/2**N1
5255 vectors that repeats 2**N1 times. Rather than generate these redundant
5256 vectors, we halve the number of vectors for each N1 iteration. */
5257 unsigned int in_start
= 0;
5258 unsigned int out_start
= nvectors
;
5259 unsigned int new_nvectors
= nvectors
;
5260 for (unsigned int in_repeat
= 1; in_repeat
< nvectors
; in_repeat
*= 2)
5262 unsigned int hi_start
= new_nvectors
/ 2;
5263 unsigned int out_i
= 0;
5264 for (unsigned int in_i
= 0; in_i
< new_nvectors
; ++in_i
)
5267 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type
),
5271 tree output
= make_ssa_name (new_vector_type
);
5272 tree input1
= pieces
[in_start
+ (in_i
/ 2)];
5273 tree input2
= pieces
[in_start
+ (in_i
/ 2) + hi_start
];
5274 gassign
*stmt
= gimple_build_assign (output
, VEC_PERM_EXPR
,
5276 permutes
[in_i
& 1]);
5277 gimple_seq_add_stmt (seq
, stmt
);
5278 pieces
[out_start
+ out_i
] = output
;
5281 std::swap (in_start
, out_start
);
5282 new_nvectors
= out_i
;
5285 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
5286 results
.reserve (nresults
);
5287 for (unsigned int i
= 0; i
< nresults
; ++i
)
5288 if (i
< new_nvectors
)
5289 results
.quick_push (gimple_build (seq
, VIEW_CONVERT_EXPR
, vector_type
,
5290 pieces
[in_start
+ i
]));
5292 results
.quick_push (results
[i
- new_nvectors
]);
5296 /* For constant and loop invariant defs in OP_NODE this function creates
5297 vector defs that will be used in the vectorized stmts and stores them
5298 to SLP_TREE_VEC_DEFS of OP_NODE. */
5301 vect_create_constant_vectors (vec_info
*vinfo
, slp_tree op_node
)
5303 unsigned HOST_WIDE_INT nunits
;
5305 unsigned j
, number_of_places_left_in_vector
;
5308 int group_size
= op_node
->ops
.length ();
5309 unsigned int vec_num
, i
;
5310 unsigned number_of_copies
= 1;
5312 gimple_seq ctor_seq
= NULL
;
5313 auto_vec
<tree
, 16> permute_results
;
5315 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
5316 vector_type
= SLP_TREE_VECTYPE (op_node
);
5318 unsigned int number_of_vectors
= SLP_TREE_NUMBER_OF_VEC_STMTS (op_node
);
5319 SLP_TREE_VEC_DEFS (op_node
).create (number_of_vectors
);
5320 auto_vec
<tree
> voprnds (number_of_vectors
);
5322 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5323 created vectors. It is greater than 1 if unrolling is performed.
5325 For example, we have two scalar operands, s1 and s2 (e.g., group of
5326 strided accesses of size two), while NUNITS is four (i.e., four scalars
5327 of this type can be packed in a vector). The output vector will contain
5328 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5331 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
5332 containing the operands.
5334 For example, NUNITS is four as before, and the group size is 8
5335 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5336 {s5, s6, s7, s8}. */
5338 /* When using duplicate_and_interleave, we just need one element for
5339 each scalar statement. */
5340 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
5341 nunits
= group_size
;
5343 number_of_copies
= nunits
* number_of_vectors
/ group_size
;
5345 number_of_places_left_in_vector
= nunits
;
5347 tree_vector_builder
elts (vector_type
, nunits
, 1);
5348 elts
.quick_grow (nunits
);
5349 stmt_vec_info insert_after
= NULL
;
5350 for (j
= 0; j
< number_of_copies
; j
++)
5353 for (i
= group_size
- 1; op_node
->ops
.iterate (i
, &op
); i
--)
5355 /* Create 'vect_ = {op0,op1,...,opn}'. */
5356 number_of_places_left_in_vector
--;
5358 if (!types_compatible_p (TREE_TYPE (vector_type
), TREE_TYPE (op
)))
5360 if (CONSTANT_CLASS_P (op
))
5362 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
5364 /* Can't use VIEW_CONVERT_EXPR for booleans because
5365 of possibly different sizes of scalar value and
5367 if (integer_zerop (op
))
5368 op
= build_int_cst (TREE_TYPE (vector_type
), 0);
5369 else if (integer_onep (op
))
5370 op
= build_all_ones_cst (TREE_TYPE (vector_type
));
5375 op
= fold_unary (VIEW_CONVERT_EXPR
,
5376 TREE_TYPE (vector_type
), op
);
5377 gcc_assert (op
&& CONSTANT_CLASS_P (op
));
5381 tree new_temp
= make_ssa_name (TREE_TYPE (vector_type
));
5383 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
5386 = build_all_ones_cst (TREE_TYPE (vector_type
));
5388 = build_zero_cst (TREE_TYPE (vector_type
));
5389 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op
)));
5390 init_stmt
= gimple_build_assign (new_temp
, COND_EXPR
,
5396 op
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (vector_type
),
5399 = gimple_build_assign (new_temp
, VIEW_CONVERT_EXPR
,
5402 gimple_seq_add_stmt (&ctor_seq
, init_stmt
);
5406 elts
[number_of_places_left_in_vector
] = op
;
5407 if (!CONSTANT_CLASS_P (op
))
5409 /* For BB vectorization we have to compute an insert location
5410 when a def is inside the analyzed region since we cannot
5411 simply insert at the BB start in this case. */
5412 stmt_vec_info opdef
;
5413 if (TREE_CODE (orig_op
) == SSA_NAME
5414 && !SSA_NAME_IS_DEFAULT_DEF (orig_op
)
5415 && is_a
<bb_vec_info
> (vinfo
)
5416 && (opdef
= vinfo
->lookup_def (orig_op
)))
5419 insert_after
= opdef
;
5421 insert_after
= get_later_stmt (insert_after
, opdef
);
5424 if (number_of_places_left_in_vector
== 0)
5427 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
5428 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
5429 vec_cst
= gimple_build_vector (&ctor_seq
, &elts
);
5432 if (permute_results
.is_empty ())
5433 duplicate_and_interleave (vinfo
, &ctor_seq
, vector_type
,
5434 elts
, number_of_vectors
,
5436 vec_cst
= permute_results
[number_of_vectors
- j
- 1];
5438 if (!gimple_seq_empty_p (ctor_seq
))
5442 gimple_stmt_iterator gsi
;
5443 if (gimple_code (insert_after
->stmt
) == GIMPLE_PHI
)
5445 gsi
= gsi_after_labels (gimple_bb (insert_after
->stmt
));
5446 gsi_insert_seq_before (&gsi
, ctor_seq
,
5447 GSI_CONTINUE_LINKING
);
5449 else if (!stmt_ends_bb_p (insert_after
->stmt
))
5451 gsi
= gsi_for_stmt (insert_after
->stmt
);
5452 gsi_insert_seq_after (&gsi
, ctor_seq
,
5453 GSI_CONTINUE_LINKING
);
5457 /* When we want to insert after a def where the
5458 defining stmt throws then insert on the fallthru
5460 edge e
= find_fallthru_edge
5461 (gimple_bb (insert_after
->stmt
)->succs
);
5463 = gsi_insert_seq_on_edge_immediate (e
, ctor_seq
);
5464 gcc_assert (!new_bb
);
5468 vinfo
->insert_seq_on_entry (NULL
, ctor_seq
);
5471 voprnds
.quick_push (vec_cst
);
5472 insert_after
= NULL
;
5473 number_of_places_left_in_vector
= nunits
;
5475 elts
.new_vector (vector_type
, nunits
, 1);
5476 elts
.quick_grow (nunits
);
5481 /* Since the vectors are created in the reverse order, we should invert
5483 vec_num
= voprnds
.length ();
5484 for (j
= vec_num
; j
!= 0; j
--)
5486 vop
= voprnds
[j
- 1];
5487 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
5490 /* In case that VF is greater than the unrolling factor needed for the SLP
5491 group of stmts, NUMBER_OF_VECTORS to be created is greater than
5492 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
5493 to replicate the vectors. */
5494 while (number_of_vectors
> SLP_TREE_VEC_DEFS (op_node
).length ())
5495 for (i
= 0; SLP_TREE_VEC_DEFS (op_node
).iterate (i
, &vop
) && i
< vec_num
;
5497 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
5500 /* Get the Ith vectorized definition from SLP_NODE. */
5503 vect_get_slp_vect_def (slp_tree slp_node
, unsigned i
)
5505 if (SLP_TREE_VEC_STMTS (slp_node
).exists ())
5506 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node
)[i
]);
5508 return SLP_TREE_VEC_DEFS (slp_node
)[i
];
5511 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
5514 vect_get_slp_defs (slp_tree slp_node
, vec
<tree
> *vec_defs
)
5516 vec_defs
->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
));
5517 if (SLP_TREE_DEF_TYPE (slp_node
) == vect_internal_def
)
5520 gimple
*vec_def_stmt
;
5521 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node
), j
, vec_def_stmt
)
5522 vec_defs
->quick_push (gimple_get_lhs (vec_def_stmt
));
5525 vec_defs
->splice (SLP_TREE_VEC_DEFS (slp_node
));
5528 /* Get N vectorized definitions for SLP_NODE. */
5531 vect_get_slp_defs (vec_info
*,
5532 slp_tree slp_node
, vec
<vec
<tree
> > *vec_oprnds
, unsigned n
)
5535 n
= SLP_TREE_CHILDREN (slp_node
).length ();
5537 for (unsigned i
= 0; i
< n
; ++i
)
5539 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
5540 vec
<tree
> vec_defs
= vNULL
;
5541 vect_get_slp_defs (child
, &vec_defs
);
5542 vec_oprnds
->quick_push (vec_defs
);
5546 /* Generate vector permute statements from a list of loads in DR_CHAIN.
5547 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
5548 permute statements for the SLP node NODE. Store the number of vector
5549 permute instructions in *N_PERMS and the number of vector load
5550 instructions in *N_LOADS. */
/* NOTE(review): this extract is line-mangled — the leading integers are
   original source line numbers and several original lines are missing
   entirely, so some statements below are incomplete fragments.  */
5553 vect_transform_slp_perm_load (vec_info
*vinfo
,
5554 slp_tree node
, vec
<tree
> dr_chain
,
5555 gimple_stmt_iterator
*gsi
, poly_uint64 vf
,
5556 bool analyze_only
, unsigned *n_perms
,
5557 unsigned int *n_loads
)
5559 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
5561 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5562 unsigned int group_size
= SLP_TREE_SCALAR_STMTS (node
).length ();
5563 unsigned int mask_element
;
5566 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
/* Use the first element of the group as the canonical stmt_info.  */
5569 stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
5571 mode
= TYPE_MODE (vectype
);
5572 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
5574 /* Initialize the vect stmts of NODE to properly insert the generated
5577 for (unsigned i
= SLP_TREE_VEC_STMTS (node
).length ();
5578 i
< SLP_TREE_NUMBER_OF_VEC_STMTS (node
); i
++)
5579 SLP_TREE_VEC_STMTS (node
).quick_push (NULL
);
5581 /* Generate permutation masks for every NODE. Number of masks for each NODE
5582 is equal to GROUP_SIZE.
5583 E.g., we have a group of three nodes with three loads from the same
5584 location in each node, and the vector size is 4. I.e., we have a
5585 a0b0c0a1b1c1... sequence and we need to create the following vectors:
5586 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
5587 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
5590 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
5591 The last mask is illegal since we assume two operands for permute
5592 operation, and the mask element values can't be outside that range.
5593 Hence, the last mask must be converted into {2,5,5,5}.
5594 For the first two permutations we need the first and the second input
5595 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
5596 we need the second and the third vectors: {b1,c1,a2,b2} and
5599 int vect_stmts_counter
= 0;
5600 unsigned int index
= 0;
5601 int first_vec_index
= -1;
5602 int second_vec_index
= -1;
5606 vec_perm_builder mask
;
5607 unsigned int nelts_to_build
;
5608 unsigned int nvectors_per_build
;
5609 unsigned int in_nlanes
;
5610 bool repeating_p
= (group_size
== DR_GROUP_SIZE (stmt_info
)
5611 && multiple_p (nunits
, group_size
));
5614 /* A single vector contains a whole number of copies of the node, so:
5615 (a) all permutes can use the same mask; and
5616 (b) the permutes only need a single vector input. */
5617 mask
.new_vector (nunits
, group_size
, 3);
5618 nelts_to_build
= mask
.encoded_nelts ();
5619 nvectors_per_build
= SLP_TREE_VEC_STMTS (node
).length ();
5620 in_nlanes
= DR_GROUP_SIZE (stmt_info
) * 3;
5624 /* We need to construct a separate mask for each vector statement. */
5625 unsigned HOST_WIDE_INT const_nunits
, const_vf
;
5626 if (!nunits
.is_constant (&const_nunits
)
5627 || !vf
.is_constant (&const_vf
))
5629 mask
.new_vector (const_nunits
, const_nunits
, 1);
5630 nelts_to_build
= const_vf
* group_size
;
5631 nvectors_per_build
= 1;
5632 in_nlanes
= const_vf
* DR_GROUP_SIZE (stmt_info
);
/* Bitmap of input lanes the permutation actually references —
   presumably used below to compute a tighter *N_LOADS; confirm
   against the complete source.  */
5634 auto_sbitmap
used_in_lanes (in_nlanes
);
5635 bitmap_clear (used_in_lanes
);
5637 unsigned int count
= mask
.encoded_nelts ();
5638 mask
.quick_grow (count
);
5639 vec_perm_indices indices
;
5641 for (unsigned int j
= 0; j
< nelts_to_build
; j
++)
5643 unsigned int iter_num
= j
/ group_size
;
5644 unsigned int stmt_num
= j
% group_size
;
5645 unsigned int i
= (iter_num
* DR_GROUP_SIZE (stmt_info
)
5646 + SLP_TREE_LOAD_PERMUTATION (node
)[stmt_num
]);
5647 bitmap_set_bit (used_in_lanes
, i
);
5650 first_vec_index
= 0;
5655 /* Enforced before the loop when !repeating_p. */
5656 unsigned int const_nunits
= nunits
.to_constant ();
5657 vec_index
= i
/ const_nunits
;
5658 mask_element
= i
% const_nunits
;
5659 if (vec_index
== first_vec_index
5660 || first_vec_index
== -1)
5662 first_vec_index
= vec_index
;
5664 else if (vec_index
== second_vec_index
5665 || second_vec_index
== -1)
5667 second_vec_index
= vec_index
;
5668 mask_element
+= const_nunits
;
/* A third distinct input vector cannot be expressed by the
   two-operand VEC_PERM_EXPR — reject (analysis only).  */
5672 if (dump_enabled_p ())
5673 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5674 "permutation requires at "
5675 "least three vectors %G",
5677 gcc_assert (analyze_only
);
5681 gcc_assert (mask_element
< 2 * const_nunits
);
5684 if (mask_element
!= index
)
5686 mask
[index
++] = mask_element
;
/* Mask is complete — noop_p appears to track an identity mask
   (its setter is among the missing lines; TODO confirm).  */
5688 if (index
== count
&& !noop_p
)
5690 indices
.new_vector (mask
, second_vec_index
== -1 ? 1 : 2, nunits
);
5691 if (!can_vec_perm_const_p (mode
, indices
))
5693 if (dump_enabled_p ())
5695 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
5697 "unsupported vect permute { ");
5698 for (i
= 0; i
< count
; ++i
)
5700 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
5701 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
5703 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
5705 gcc_assert (analyze_only
);
5716 tree mask_vec
= NULL_TREE
;
5719 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
5721 if (second_vec_index
== -1)
5722 second_vec_index
= first_vec_index
;
5724 for (unsigned int ri
= 0; ri
< nvectors_per_build
; ++ri
)
5726 /* Generate the permute statement if necessary. */
5727 tree first_vec
= dr_chain
[first_vec_index
+ ri
];
5728 tree second_vec
= dr_chain
[second_vec_index
+ ri
];
5732 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
5734 = vect_create_destination_var (gimple_assign_lhs (stmt
),
5736 perm_dest
= make_ssa_name (perm_dest
);
5738 = gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
5739 first_vec
, second_vec
,
5741 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
,
5745 /* If mask was NULL_TREE generate the requested
5746 identity transform. */
5747 perm_stmt
= SSA_NAME_DEF_STMT (first_vec
)
;
5749 /* Store the vector statement in NODE. */
5750 SLP_TREE_VEC_STMTS (node
)[vect_stmts_counter
++] = perm_stmt
;
5755 first_vec_index
= -1;
5756 second_vec_index
= -1;
5764 *n_loads
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
5767 /* Enforced above when !repeating_p. */
5768 unsigned int const_nunits
= nunits
.to_constant ();
/* Scan used lanes per input vector to count the loads really needed
   for *N_LOADS — hedged; the tail of this loop is missing here.  */
5770 bool load_seen
= false;
5771 for (unsigned i
= 0; i
< in_nlanes
; ++i
)
5773 if (i
% const_nunits
== 0)
5779 if (bitmap_bit_p (used_in_lanes
, i
))
5791 /* Vectorize the SLP permutations in NODE as specified
5792 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5793 child number and lane number.
5794 Interleaving of two two-lane two-child SLP subtrees (not supported):
5795 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5796 A blend of two four-lane two-child SLP subtrees:
5797 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5798 Highpart of a four-lane one-child SLP subtree (not supported):
5799 [ { 0, 2 }, { 0, 3 } ]
5800 Where currently only a subset is supported by code generating below. */
/* NOTE(review): this extract is line-mangled — the leading integers are
   original source line numbers and several original lines are missing,
   so some statements below are incomplete fragments.  */
5803 vectorizable_slp_permutation (vec_info
*vinfo
, gimple_stmt_iterator
*gsi
,
5804 slp_tree node
, stmt_vector_for_cost
*cost_vec
)
5806 tree vectype
= SLP_TREE_VECTYPE (node
);
5808 /* ??? We currently only support all same vector input and output types
5809 while the SLP IL should really do a concat + select and thus accept
5810 arbitrary mismatches. */
5813 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5814 if (!vect_maybe_update_slp_op_vectype (child
, vectype
)
5815 || !types_compatible_p (SLP_TREE_VECTYPE (child
), vectype
))
5817 if (dump_enabled_p ())
5818 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5819 "Unsupported lane permutation\n");
5823 vec
<std::pair
<unsigned, unsigned> > &perm
= SLP_TREE_LANE_PERMUTATION (node
);
5824 gcc_assert (perm
.length () == SLP_TREE_LANES (node
));
5825 if (dump_enabled_p ())
5827 dump_printf_loc (MSG_NOTE
, vect_location
,
5828 "vectorizing permutation");
5829 for (unsigned i
= 0; i
< perm
.length (); ++i
)
5830 dump_printf (MSG_NOTE
, " op%u[%u]", perm
[i
].first
, perm
[i
].second
);
5831 dump_printf (MSG_NOTE
, "\n");
5834 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
5835 if (!nunits
.is_constant ())
5837 unsigned HOST_WIDE_INT vf
= 1;
5838 if (loop_vec_info linfo
= dyn_cast
<loop_vec_info
> (vinfo
))
5839 if (!LOOP_VINFO_VECT_FACTOR (linfo
).is_constant (&vf
))
5841 unsigned olanes
= vf
* SLP_TREE_LANES (node
);
5842 gcc_assert (multiple_p (olanes
, nunits
));
5844 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
5845 from the { SLP operand, scalar lane } permutation as recorded in the
5846 SLP node as intermediate step. This part should already work
5847 with SLP children with arbitrary number of lanes. */
5848 auto_vec
<std::pair
<std::pair
<unsigned, unsigned>, unsigned> > vperm
;
5849 auto_vec
<unsigned> active_lane
;
5850 vperm
.create (olanes
);
5851 active_lane
.safe_grow_cleared (SLP_TREE_CHILDREN (node
).length (), true);
5852 for (unsigned i
= 0; i
< vf
; ++i
)
5854 for (unsigned pi
= 0; pi
< perm
.length (); ++pi
)
5856 std::pair
<unsigned, unsigned> p
= perm
[pi
];
5857 tree vtype
= SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node
)[p
.first
]);
5858 unsigned vnunits
= TYPE_VECTOR_SUBPARTS (vtype
).to_constant ();
/* Translate the scalar lane into (vector index, lane-in-vector).  */
5859 unsigned vi
= (active_lane
[p
.first
] + p
.second
) / vnunits
;
5860 unsigned vl
= (active_lane
[p
.first
] + p
.second
) % vnunits
;
5861 vperm
.quick_push (std::make_pair (std::make_pair (p
.first
, vi
), vl
));
5863 /* Advance to the next group. */
5864 for (unsigned j
= 0; j
< SLP_TREE_CHILDREN (node
).length (); ++j
)
5865 active_lane
[j
] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node
)[j
]);
5868 if (dump_enabled_p ())
5870 dump_printf_loc (MSG_NOTE
, vect_location
, "as");
5871 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
5873 if (i
!= 0 && multiple_p (i
, TYPE_VECTOR_SUBPARTS (vectype
)))
5874 dump_printf (MSG_NOTE
, ",");
5875 dump_printf (MSG_NOTE
, " vops%u[%u][%u]",
5876 vperm
[i
].first
.first
, vperm
[i
].first
.second
,
5879 dump_printf (MSG_NOTE
, "\n");
5882 /* We can only handle two-vector permutes, everything else should
5883 be lowered on the SLP level. The following is closely inspired
5884 by vect_transform_slp_perm_load and is supposed to eventually
5886 ??? As intermediate step do code-gen in the SLP tree representation
5888 std::pair
<unsigned, unsigned> first_vec
= std::make_pair (-1U, -1U);
5889 std::pair
<unsigned, unsigned> second_vec
= std::make_pair (-1U, -1U);
5890 unsigned int const_nunits
= nunits
.to_constant ();
5891 unsigned int index
= 0;
5892 unsigned int mask_element
;
5893 vec_perm_builder mask
;
5894 mask
.new_vector (const_nunits
, const_nunits
, 1);
5895 unsigned int count
= mask
.encoded_nelts ();
5896 mask
.quick_grow (count
);
5897 vec_perm_indices indices
;
5898 unsigned nperms
= 0;
5899 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
5901 mask_element
= vperm
[i
].second
;
5902 if (first_vec
.first
== -1U
5903 || first_vec
== vperm
[i
].first
)
5904 first_vec
= vperm
[i
].first
;
5905 else if (second_vec
.first
== -1U
5906 || second_vec
== vperm
[i
].first
)
5908 second_vec
= vperm
[i
].first
;
/* Elements from the second input are indexed past the first.  */
5909 mask_element
+= const_nunits
;
5913 if (dump_enabled_p ())
5914 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5915 "permutation requires at "
5916 "least three vectors\n");
5921 mask
[index
++] = mask_element
;
5925 indices
.new_vector (mask
, second_vec
.first
== -1U ? 1 : 2,
5927 bool identity_p
= indices
.series_p (0, 1, 0, 1);
5929 && !can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5931 if (dump_enabled_p ())
5933 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
5935 "unsupported vect permute { ");
5936 for (i
= 0; i
< count
; ++i
)
5938 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
5939 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
5941 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
5951 if (second_vec
.first
== -1U)
5952 second_vec
= first_vec
;
5954 /* Generate the permute statement if necessary. */
5955 slp_tree first_node
= SLP_TREE_CHILDREN (node
)[first_vec
.first
];
5957 = vect_get_slp_vect_def (first_node
, first_vec
.second
);
5958 /* ??? We SLP match existing vector element extracts but
5959 allow punning which we need to re-instantiate at uses
5960 but have no good way of explicitely representing. */
5961 if (!types_compatible_p (TREE_TYPE (first_def
), vectype
))
5964 conv_stmt
= gimple_build_assign (make_ssa_name (vectype
),
5965 build1 (VIEW_CONVERT_EXPR
,
5966 vectype
, first_def
));
5967 vect_finish_stmt_generation (vinfo
, NULL
, conv_stmt
, gsi
);
5968 first_def
= gimple_assign_lhs (conv_stmt
);
5971 tree perm_dest
= make_ssa_name (vectype
);
5974 slp_tree second_node
5975 = SLP_TREE_CHILDREN (node
)[second_vec
.first
];
5977 = vect_get_slp_vect_def (second_node
, second_vec
.second
);
5978 if (!types_compatible_p (TREE_TYPE (second_def
), vectype
))
5981 conv_stmt
= gimple_build_assign (make_ssa_name (vectype
),
5984 vectype
, second_def
));
5985 vect_finish_stmt_generation (vinfo
, NULL
, conv_stmt
, gsi
);
5986 second_def
= gimple_assign_lhs (conv_stmt
);
5988 tree mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
5989 perm_stmt
= gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
5990 first_def
, second_def
,
5994 /* We need a copy here in case the def was external. */
5995 perm_stmt
= gimple_build_assign (perm_dest
, first_def
);
5996 vect_finish_stmt_generation (vinfo
, NULL
, perm_stmt
, gsi
);
5997 /* Store the vector statement in NODE. */
5998 SLP_TREE_VEC_STMTS (node
).quick_push (perm_stmt
);
/* Reset for the next group of CONST_NUNITS output lanes.  */
6002 first_vec
= std::make_pair (-1U, -1U);
6003 second_vec
= std::make_pair (-1U, -1U);
/* Analysis path: account the permutes in the cost vector.  */
6008 record_stmt_cost (cost_vec
, nperms
, vec_perm
, NULL
, vectype
, 0, vect_body
);
6013 /* Vectorize SLP NODE. */
/* NOTE(review): this extract is line-mangled — the leading integers are
   original source line numbers and several original lines are missing,
   so some statements below are incomplete fragments.  */
6016 vect_schedule_slp_node (vec_info
*vinfo
,
6017 slp_tree node
, slp_instance instance
)
6019 gimple_stmt_iterator si
;
6023 /* For existing vectors there's nothing to do. */
6024 if (SLP_TREE_VEC_DEFS (node
).exists ())
6027 gcc_assert (SLP_TREE_VEC_STMTS (node
).is_empty ());
6029 /* Vectorize externals and constants. */
6030 if (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
6031 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
6033 /* ??? vectorizable_shift can end up using a scalar operand which is
6034 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
6035 node in this case. */
6036 if (!SLP_TREE_VECTYPE (node
))
6039 vect_create_constant_vectors (vinfo
, node
);
6043 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
6045 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) != 0);
6046 SLP_TREE_VEC_STMTS (node
).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node
));
6048 if (dump_enabled_p ())
6049 dump_printf_loc (MSG_NOTE
, vect_location
,
6050 "------>vectorizing SLP node starting from: %G",
/* Pick the insertion point SI for the vectorized stmts.  */
6053 if (STMT_VINFO_DATA_REF (stmt_info
)
6054 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
6056 /* Vectorized loads go before the first scalar load to make it
6057 ready early, vectorized stores go before the last scalar
6058 stmt which is where all uses are ready. */
6059 stmt_vec_info last_stmt_info
= NULL
;
6060 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
6061 last_stmt_info
= vect_find_first_scalar_stmt_in_slp (node
);
6062 else /* DR_IS_WRITE */
6063 last_stmt_info
= vect_find_last_scalar_stmt_in_slp (node
);
6064 si
= gsi_for_stmt (last_stmt_info
->stmt
);
6066 else if ((STMT_VINFO_TYPE (stmt_info
) == cycle_phi_info_type
6067 || STMT_VINFO_TYPE (stmt_info
) == induc_vec_info_type
6068 || STMT_VINFO_TYPE (stmt_info
) == phi_info_type
)
6069 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
6071 /* For PHI node vectorization we do not use the insertion iterator. */
6076 /* Emit other stmts after the children vectorized defs which is
6077 earliest possible. */
6078 gimple
*last_stmt
= NULL
;
6079 bool seen_vector_def
= false;
6080 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
6081 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
6083 /* For fold-left reductions we are retaining the scalar
6084 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
6085 set so the representation isn't perfect. Resort to the
6086 last scalar def here. */
6087 if (SLP_TREE_VEC_STMTS (child
).is_empty ())
6089 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child
))
6090 == cycle_phi_info_type
);
6091 gphi
*phi
= as_a
<gphi
*>
6092 (vect_find_last_scalar_stmt_in_slp (child
)->stmt
);
6094 || vect_stmt_dominates_stmt_p (last_stmt
, phi
))
6097 /* We are emitting all vectorized stmts in the same place and
6098 the last one is the last.
6099 ??? Unless we have a load permutation applied and that
6100 figures to re-use an earlier generated load. */
6103 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child
), j
, vstmt
)
6105 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
6108 else if (!SLP_TREE_VECTYPE (child
))
6110 /* For externals we use unvectorized at all scalar defs. */
6113 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child
), j
, def
)
6114 if (TREE_CODE (def
) == SSA_NAME
6115 && !SSA_NAME_IS_DEFAULT_DEF (def
))
6117 gimple
*stmt
= SSA_NAME_DEF_STMT (def
);
6119 || vect_stmt_dominates_stmt_p (last_stmt
, stmt
))
6125 /* For externals we have to look at all defs since their
6126 insertion place is decided per vector. But beware
6127 of pre-existing vectors where we need to make sure
6128 we do not insert before the region boundary. */
6129 if (SLP_TREE_SCALAR_OPS (child
).is_empty ()
6130 && !vinfo
->lookup_def (SLP_TREE_VEC_DEFS (child
)[0]))
6131 seen_vector_def
= true;
6136 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child
), j
, vdef
)
6137 if (TREE_CODE (vdef
) == SSA_NAME
6138 && !SSA_NAME_IS_DEFAULT_DEF (vdef
))
6140 gimple
*vstmt
= SSA_NAME_DEF_STMT (vdef
);
6142 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
6147 /* This can happen when all children are pre-existing vectors or
6150 last_stmt
= vect_find_first_scalar_stmt_in_slp (node
)->stmt
;
6153 gcc_assert (seen_vector_def
);
6154 si
= gsi_after_labels (as_a
<bb_vec_info
> (vinfo
)->bbs
[0]);
6156 else if (is_a
<gphi
*> (last_stmt
))
/* PHIs cannot be insertion points; insert after the labels.  */
6157 si
= gsi_after_labels (gimple_bb (last_stmt
));
6160 si
= gsi_for_stmt (last_stmt
);
6165 bool done_p
= false;
6167 /* Handle purely internal nodes. */
6168 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
6170 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
6171 be shared with different SLP nodes (but usually it's the same
6172 operation apart from the case the stmt is only there for denoting
6173 the actual scalar lane defs ...). So do not call vect_transform_stmt
6174 but open-code it here (partly). */
6175 bool done
= vectorizable_slp_permutation (vinfo
, &si
, node
, NULL
);
6180 vect_transform_stmt (vinfo
, stmt_info
, &si
, node
, instance
);
6183 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
6184 For loop vectorization this is done in vectorizable_call, but for SLP
6185 it needs to be deferred until end of vect_schedule_slp, because multiple
6186 SLP instances may refer to the same scalar stmt. */
6189 vect_remove_slp_scalar_calls (vec_info
*vinfo
,
6190 slp_tree node
, hash_set
<slp_tree
> &visited
)
6193 gimple_stmt_iterator gsi
;
6197 stmt_vec_info stmt_info
;
6199 if (!node
|| SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
6202 if (visited
.add (node
))
6205 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
6206 vect_remove_slp_scalar_calls (vinfo
, child
, visited
);
6208 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
6210 gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
6211 if (!stmt
|| gimple_bb (stmt
) == NULL
)
6213 if (is_pattern_stmt_p (stmt_info
)
6214 || !PURE_SLP_STMT (stmt_info
))
6216 lhs
= gimple_call_lhs (stmt
);
6217 new_stmt
= gimple_build_assign (lhs
, build_zero_cst (TREE_TYPE (lhs
)));
6218 gsi
= gsi_for_stmt (stmt
);
6219 vinfo
->replace_stmt (&gsi
, stmt_info
, new_stmt
);
6220 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt
)) = new_stmt
;
6225 vect_remove_slp_scalar_calls (vec_info
*vinfo
, slp_tree node
)
6227 hash_set
<slp_tree
> visited
;
6228 vect_remove_slp_scalar_calls (vinfo
, node
, visited
);
6231 /* Vectorize the instance root. */
/* NOTE(review): this extract is line-mangled — the leading integers are
   original source line numbers and several original lines are missing,
   so some statements below are incomplete fragments.  */
6234 vectorize_slp_instance_root_stmt (slp_tree node
, slp_instance instance
)
6236 gassign
*rstmt
= NULL
;
/* A single vectorized stmt: assign its LHS to the root stmt's LHS,
   view-converting if the types differ only in representation.  */
6238 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) == 1)
6243 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
6245 tree vect_lhs
= gimple_get_lhs (child_stmt
);
6246 tree root_lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
6247 if (!useless_type_conversion_p (TREE_TYPE (root_lhs
),
6248 TREE_TYPE (vect_lhs
)))
6249 vect_lhs
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (root_lhs
),
6251 rstmt
= gimple_build_assign (root_lhs
, vect_lhs
);
/* Multiple vectorized stmts: combine them with a CONSTRUCTOR.  */
6255 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) > 1)
6257 int nelts
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
6260 vec
<constructor_elt
, va_gc
> *v
;
6261 vec_alloc (v
, nelts
);
6263 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
6265 CONSTRUCTOR_APPEND_ELT (v
,
6267 gimple_get_lhs (child_stmt
));
6269 tree lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
6270 tree rtype
= TREE_TYPE (gimple_assign_rhs1 (instance
->root_stmt
->stmt
));
6271 tree r_constructor
= build_constructor (rtype
, v
);
6272 rstmt
= gimple_build_assign (lhs
, r_constructor
);
/* Replace the scalar root stmt with the combining assignment.  */
6277 gimple_stmt_iterator rgsi
= gsi_for_stmt (instance
->root_stmt
->stmt
);
6278 gsi_replace (&rgsi
, rstmt
, true);
6288 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
/* NOTE(review): this extract is line-mangled — the leading integers are
   original source line numbers and several original lines are missing,
   so some statements below are incomplete fragments.  This looks like a
   Tarjan-style SCC walk (dfs/lowlink/on_stack) — confirm against the
   full source.  */
6291 vect_schedule_scc (vec_info
*vinfo
, slp_tree node
, slp_instance instance
,
6292 hash_map
<slp_tree
, slp_scc_info
> &scc_info
,
6293 int &maxdfs
, vec
<slp_tree
> &stack
)
6296 slp_scc_info
*info
= &scc_info
.get_or_insert (node
, &existed_p
);
6297 gcc_assert (!existed_p
);
6299 info
->lowlink
= maxdfs
;
/* Leafs (non-internal defs) are trivially their own component and can
   be scheduled immediately.  */
6303 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
6305 info
->on_stack
= false;
6306 vect_schedule_slp_node (vinfo
, node
, instance
);
6310 info
->on_stack
= true;
6311 stack
.safe_push (node
);
/* DFS over the children, updating this node's lowlink.  */
6316 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
6320 slp_scc_info
*child_info
= scc_info
.get (child
);
6323 vect_schedule_scc (vinfo
, child
, instance
, scc_info
, maxdfs
, stack
);
6324 /* Recursion might have re-allocated the node. */
6325 info
= scc_info
.get (node
);
6326 child_info
= scc_info
.get (child
);
6327 info
->lowlink
= MIN (info
->lowlink
, child_info
->lowlink
);
6329 else if (child_info
->on_stack
)
6330 info
->lowlink
= MIN (info
->lowlink
, child_info
->dfs
);
/* Not an SCC root: leave the component on the stack.  */
6332 if (info
->lowlink
!= info
->dfs
)
6335 auto_vec
<slp_tree
, 4> phis_to_fixup
;
/* Trivial SCC: the node alone; schedule it directly.  */
6338 if (stack
.last () == node
)
6341 info
->on_stack
= false;
6342 vect_schedule_slp_node (vinfo
, node
, instance
);
6343 if (SLP_TREE_CODE (node
) != VEC_PERM_EXPR
6344 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (node
)->stmt
))
6345 phis_to_fixup
.quick_push (node
);
6350 int last_idx
= stack
.length () - 1;
6351 while (stack
[last_idx
] != node
)
6353 /* We can break the cycle at PHIs who have at least one child
6354 code generated. Then we could re-start the DFS walk until
6355 all nodes in the SCC are covered (we might have new entries
6356 for only back-reachable nodes). But it's simpler to just
6357 iterate and schedule those that are ready. */
6358 unsigned todo
= stack
.length () - last_idx
;
6361 for (int idx
= stack
.length () - 1; idx
>= last_idx
; --idx
)
6363 slp_tree entry
= stack
[idx
];
6366 bool phi
= (SLP_TREE_CODE (entry
) != VEC_PERM_EXPR
6367 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (entry
)->stmt
));
6369 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry
), i
, child
)
6376 else if (scc_info
.get (child
)->on_stack
)
6394 vect_schedule_slp_node (vinfo
, entry
, instance
);
6395 scc_info
.get (entry
)->on_stack
= false;
6399 phis_to_fixup
.safe_push (entry
);
6406 stack
.truncate (last_idx
);
6409 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
6411 FOR_EACH_VEC_ELT (phis_to_fixup
, i
, phi_node
)
6413 gphi
*phi
= as_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (phi_node
)->stmt
);
6416 FOR_EACH_EDGE (e
, ei
, gimple_bb (phi
)->preds
)
6418 unsigned dest_idx
= e
->dest_idx
;
6419 child
= SLP_TREE_CHILDREN (phi_node
)[dest_idx
];
6420 if (!child
|| SLP_TREE_DEF_TYPE (child
) != vect_internal_def
)
6422 /* Simply fill all args. */
6423 for (unsigned i
= 0; i
< SLP_TREE_VEC_STMTS (phi_node
).length (); ++i
)
6424 add_phi_arg (as_a
<gphi
*> (SLP_TREE_VEC_STMTS (phi_node
)[i
]),
6425 vect_get_slp_vect_def (child
, i
),
6426 e
, gimple_phi_arg_location (phi
, dest_idx
));
6431 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
6434 vect_schedule_slp (vec_info
*vinfo
, vec
<slp_instance
> slp_instances
)
6436 slp_instance instance
;
6439 hash_map
<slp_tree
, slp_scc_info
> scc_info
;
6441 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
6443 slp_tree node
= SLP_INSTANCE_TREE (instance
);
6444 if (dump_enabled_p ())
6446 dump_printf_loc (MSG_NOTE
, vect_location
,
6447 "Vectorizing SLP tree:\n");
6448 if (SLP_INSTANCE_ROOT_STMT (instance
))
6449 dump_printf_loc (MSG_NOTE
, vect_location
, "Root stmt: %G",
6450 SLP_INSTANCE_ROOT_STMT (instance
)->stmt
);
6451 vect_print_slp_graph (MSG_NOTE
, vect_location
,
6452 SLP_INSTANCE_TREE (instance
));
6454 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
6455 have a PHI be the node breaking the cycle. */
6456 auto_vec
<slp_tree
> stack
;
6457 if (!scc_info
.get (node
))
6458 vect_schedule_scc (vinfo
, node
, instance
, scc_info
, maxdfs
, stack
);
6460 if (SLP_INSTANCE_ROOT_STMT (instance
))
6461 vectorize_slp_instance_root_stmt (node
, instance
);
6463 if (dump_enabled_p ())
6464 dump_printf_loc (MSG_NOTE
, vect_location
,
6465 "vectorizing stmts using SLP.\n");
6468 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
6470 slp_tree root
= SLP_INSTANCE_TREE (instance
);
6471 stmt_vec_info store_info
;
6474 /* Remove scalar call stmts. Do not do this for basic-block
6475 vectorization as not all uses may be vectorized.
6476 ??? Why should this be necessary? DCE should be able to
6477 remove the stmts itself.
6478 ??? For BB vectorization we can as well remove scalar
6479 stmts starting from the SLP tree root if they have no
6481 if (is_a
<loop_vec_info
> (vinfo
))
6482 vect_remove_slp_scalar_calls (vinfo
, root
);
6484 /* Remove vectorized stores original scalar stmts. */
6485 for (j
= 0; SLP_TREE_SCALAR_STMTS (root
).iterate (j
, &store_info
); j
++)
6487 if (!STMT_VINFO_DATA_REF (store_info
)
6488 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info
)))
6491 store_info
= vect_orig_stmt (store_info
);
6492 /* Free the attached stmt_vec_info and remove the stmt. */
6493 vinfo
->remove_stmt (store_info
);
6495 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
6496 to not crash in vect_free_slp_tree later. */
6497 if (SLP_TREE_REPRESENTATIVE (root
) == store_info
)
6498 SLP_TREE_REPRESENTATIVE (root
) = NULL
;