/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2021 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
24 #include "coretypes.h"
30 #include "tree-pass.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
51 #include "alloc-pool.h"
53 static bool vectorizable_slp_permutation (vec_info
*, gimple_stmt_iterator
*,
54 slp_tree
, stmt_vector_for_cost
*);
56 static object_allocator
<_slp_tree
> *slp_tree_pool
;
57 static slp_tree slp_first_node
;
62 slp_tree_pool
= new object_allocator
<_slp_tree
> ("SLP nodes");
68 while (slp_first_node
)
69 delete slp_first_node
;
75 _slp_tree::operator new (size_t n
)
77 gcc_assert (n
== sizeof (_slp_tree
));
78 return slp_tree_pool
->allocate_raw ();
82 _slp_tree::operator delete (void *node
, size_t n
)
84 gcc_assert (n
== sizeof (_slp_tree
));
85 slp_tree_pool
->remove_raw (node
);
89 /* Initialize a SLP node. */
91 _slp_tree::_slp_tree ()
93 this->prev_node
= NULL
;
95 slp_first_node
->prev_node
= this;
96 this->next_node
= slp_first_node
;
97 slp_first_node
= this;
98 SLP_TREE_SCALAR_STMTS (this) = vNULL
;
99 SLP_TREE_SCALAR_OPS (this) = vNULL
;
100 SLP_TREE_VEC_STMTS (this) = vNULL
;
101 SLP_TREE_VEC_DEFS (this) = vNULL
;
102 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
103 SLP_TREE_CHILDREN (this) = vNULL
;
104 SLP_TREE_LOAD_PERMUTATION (this) = vNULL
;
105 SLP_TREE_LANE_PERMUTATION (this) = vNULL
;
106 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def
;
107 SLP_TREE_CODE (this) = ERROR_MARK
;
108 SLP_TREE_VECTYPE (this) = NULL_TREE
;
109 SLP_TREE_REPRESENTATIVE (this) = NULL
;
110 SLP_TREE_REF_COUNT (this) = 1;
111 this->max_nunits
= 1;
115 /* Tear down a SLP node. */
117 _slp_tree::~_slp_tree ()
120 this->prev_node
->next_node
= this->next_node
;
122 slp_first_node
= this->next_node
;
124 this->next_node
->prev_node
= this->prev_node
;
125 SLP_TREE_CHILDREN (this).release ();
126 SLP_TREE_SCALAR_STMTS (this).release ();
127 SLP_TREE_SCALAR_OPS (this).release ();
128 SLP_TREE_VEC_STMTS (this).release ();
129 SLP_TREE_VEC_DEFS (this).release ();
130 SLP_TREE_LOAD_PERMUTATION (this).release ();
131 SLP_TREE_LANE_PERMUTATION (this).release ();
134 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
137 vect_free_slp_tree (slp_tree node
)
142 if (--SLP_TREE_REF_COUNT (node
) != 0)
145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
147 vect_free_slp_tree (child
);
152 /* Return a location suitable for dumpings related to the SLP instance. */
155 _slp_instance::location () const
158 return root_stmt
->stmt
;
160 return SLP_TREE_SCALAR_STMTS (root
)[0]->stmt
;
164 /* Free the memory allocated for the SLP instance. */
167 vect_free_slp_instance (slp_instance instance
)
169 vect_free_slp_tree (SLP_INSTANCE_TREE (instance
));
170 SLP_INSTANCE_LOADS (instance
).release ();
171 instance
->subgraph_entries
.release ();
172 instance
->cost_vec
.release ();
177 /* Create an SLP node for SCALAR_STMTS. */
180 vect_create_new_slp_node (unsigned nops
, tree_code code
)
182 slp_tree node
= new _slp_tree
;
183 SLP_TREE_SCALAR_STMTS (node
) = vNULL
;
184 SLP_TREE_CHILDREN (node
).create (nops
);
185 SLP_TREE_DEF_TYPE (node
) = vect_internal_def
;
186 SLP_TREE_CODE (node
) = code
;
189 /* Create an SLP node for SCALAR_STMTS. */
192 vect_create_new_slp_node (slp_tree node
,
193 vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
195 SLP_TREE_SCALAR_STMTS (node
) = scalar_stmts
;
196 SLP_TREE_CHILDREN (node
).create (nops
);
197 SLP_TREE_DEF_TYPE (node
) = vect_internal_def
;
198 SLP_TREE_REPRESENTATIVE (node
) = scalar_stmts
[0];
199 SLP_TREE_LANES (node
) = scalar_stmts
.length ();
203 /* Create an SLP node for SCALAR_STMTS. */
206 vect_create_new_slp_node (vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
208 return vect_create_new_slp_node (new _slp_tree
, scalar_stmts
, nops
);
211 /* Create an SLP node for OPS. */
214 vect_create_new_slp_node (slp_tree node
, vec
<tree
> ops
)
216 SLP_TREE_SCALAR_OPS (node
) = ops
;
217 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
218 SLP_TREE_LANES (node
) = ops
.length ();
222 /* Create an SLP node for OPS. */
225 vect_create_new_slp_node (vec
<tree
> ops
)
227 return vect_create_new_slp_node (new _slp_tree
, ops
);
231 /* This structure is used in creation of an SLP tree. Each instance
232 corresponds to the same operand in a group of scalar stmts in an SLP
234 typedef struct _slp_oprnd_info
236 /* Def-stmts for the operands. */
237 vec
<stmt_vec_info
> def_stmts
;
240 /* Information about the first statement, its vector def-type, type, the
241 operand itself in case it's constant, and an indication if it's a pattern
244 enum vect_def_type first_dt
;
249 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
251 static vec
<slp_oprnd_info
>
252 vect_create_oprnd_info (int nops
, int group_size
)
255 slp_oprnd_info oprnd_info
;
256 vec
<slp_oprnd_info
> oprnds_info
;
258 oprnds_info
.create (nops
);
259 for (i
= 0; i
< nops
; i
++)
261 oprnd_info
= XNEW (struct _slp_oprnd_info
);
262 oprnd_info
->def_stmts
.create (group_size
);
263 oprnd_info
->ops
.create (group_size
);
264 oprnd_info
->first_dt
= vect_uninitialized_def
;
265 oprnd_info
->first_op_type
= NULL_TREE
;
266 oprnd_info
->any_pattern
= false;
267 oprnds_info
.quick_push (oprnd_info
);
274 /* Free operands info. */
277 vect_free_oprnd_info (vec
<slp_oprnd_info
> &oprnds_info
)
280 slp_oprnd_info oprnd_info
;
282 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
284 oprnd_info
->def_stmts
.release ();
285 oprnd_info
->ops
.release ();
286 XDELETE (oprnd_info
);
289 oprnds_info
.release ();
293 /* Return true if STMTS contains a pattern statement. */
296 vect_contains_pattern_stmt_p (vec
<stmt_vec_info
> stmts
)
298 stmt_vec_info stmt_info
;
300 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
301 if (is_pattern_stmt_p (stmt_info
))
306 /* Return true when all lanes in the external or constant NODE have
310 vect_slp_tree_uniform_p (slp_tree node
)
312 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
313 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
);
315 /* Pre-exsting vectors. */
316 if (SLP_TREE_SCALAR_OPS (node
).is_empty ())
320 tree op
, first
= NULL_TREE
;
321 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
324 else if (!operand_equal_p (first
, op
, 0))
330 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
331 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
335 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info
,
336 stmt_vec_info first_stmt_info
)
338 stmt_vec_info next_stmt_info
= first_stmt_info
;
341 if (first_stmt_info
!= DR_GROUP_FIRST_ELEMENT (stmt_info
))
346 if (next_stmt_info
== stmt_info
)
348 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
350 result
+= DR_GROUP_GAP (next_stmt_info
);
352 while (next_stmt_info
);
357 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
358 using the method implemented by duplicate_and_interleave. Return true
359 if so, returning the number of intermediate vectors in *NVECTORS_OUT
360 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
364 can_duplicate_and_interleave_p (vec_info
*vinfo
, unsigned int count
,
365 tree elt_type
, unsigned int *nvectors_out
,
366 tree
*vector_type_out
,
369 tree base_vector_type
= get_vectype_for_scalar_type (vinfo
, elt_type
, count
);
370 if (!base_vector_type
|| !VECTOR_MODE_P (TYPE_MODE (base_vector_type
)))
373 machine_mode base_vector_mode
= TYPE_MODE (base_vector_type
);
374 poly_int64 elt_bytes
= count
* GET_MODE_UNIT_SIZE (base_vector_mode
);
375 unsigned int nvectors
= 1;
378 scalar_int_mode int_mode
;
379 poly_int64 elt_bits
= elt_bytes
* BITS_PER_UNIT
;
380 if (int_mode_for_size (elt_bits
, 1).exists (&int_mode
))
382 /* Get the natural vector type for this SLP group size. */
383 tree int_type
= build_nonstandard_integer_type
384 (GET_MODE_BITSIZE (int_mode
), 1);
386 = get_vectype_for_scalar_type (vinfo
, int_type
, count
);
388 && VECTOR_MODE_P (TYPE_MODE (vector_type
))
389 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type
)),
390 GET_MODE_SIZE (base_vector_mode
)))
392 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
393 together into elements of type INT_TYPE and using the result
394 to build NVECTORS vectors. */
395 poly_uint64 nelts
= GET_MODE_NUNITS (TYPE_MODE (vector_type
));
396 vec_perm_builder
sel1 (nelts
, 2, 3);
397 vec_perm_builder
sel2 (nelts
, 2, 3);
398 poly_int64 half_nelts
= exact_div (nelts
, 2);
399 for (unsigned int i
= 0; i
< 3; ++i
)
402 sel1
.quick_push (i
+ nelts
);
403 sel2
.quick_push (half_nelts
+ i
);
404 sel2
.quick_push (half_nelts
+ i
+ nelts
);
406 vec_perm_indices
indices1 (sel1
, 2, nelts
);
407 vec_perm_indices
indices2 (sel2
, 2, nelts
);
408 if (can_vec_perm_const_p (TYPE_MODE (vector_type
), indices1
)
409 && can_vec_perm_const_p (TYPE_MODE (vector_type
), indices2
))
412 *nvectors_out
= nvectors
;
414 *vector_type_out
= vector_type
;
417 permutes
[0] = vect_gen_perm_mask_checked (vector_type
,
419 permutes
[1] = vect_gen_perm_mask_checked (vector_type
,
426 if (!multiple_p (elt_bytes
, 2, &elt_bytes
))
432 /* Return true if DTA and DTB match. */
435 vect_def_types_match (enum vect_def_type dta
, enum vect_def_type dtb
)
438 || ((dta
== vect_external_def
|| dta
== vect_constant_def
)
439 && (dtb
== vect_external_def
|| dtb
== vect_constant_def
)));
442 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
443 they are of a valid type and that they match the defs of the first stmt of
444 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
445 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
446 indicates swap is required for cond_expr stmts. Specifically, *SWAP
447 is 1 if STMT is cond and operands of comparison need to be swapped;
448 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
449 If there is any operand swap in this function, *SWAP is set to non-zero
451 If there was a fatal error return -1; if the error could be corrected by
452 swapping operands of father node of this one, return 1; if everything is
455 vect_get_and_check_slp_defs (vec_info
*vinfo
, unsigned char swap
,
457 vec
<stmt_vec_info
> stmts
, unsigned stmt_num
,
458 vec
<slp_oprnd_info
> *oprnds_info
)
460 stmt_vec_info stmt_info
= stmts
[stmt_num
];
462 unsigned int i
, number_of_oprnds
;
463 enum vect_def_type dt
= vect_uninitialized_def
;
464 slp_oprnd_info oprnd_info
;
465 int first_op_idx
= 1;
466 unsigned int commutative_op
= -1U;
467 bool first_op_cond
= false;
468 bool first
= stmt_num
== 0;
470 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
472 number_of_oprnds
= gimple_call_num_args (stmt
);
474 if (gimple_call_internal_p (stmt
))
476 internal_fn ifn
= gimple_call_internal_fn (stmt
);
477 commutative_op
= first_commutative_argument (ifn
);
479 /* Masked load, only look at mask. */
480 if (ifn
== IFN_MASK_LOAD
)
482 number_of_oprnds
= 1;
483 /* Mask operand index. */
488 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
490 enum tree_code code
= gimple_assign_rhs_code (stmt
);
491 number_of_oprnds
= gimple_num_ops (stmt
) - 1;
492 /* Swap can only be done for cond_expr if asked to, otherwise we
493 could result in different comparison code to the first stmt. */
494 if (code
== COND_EXPR
495 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt
)))
497 first_op_cond
= true;
501 commutative_op
= commutative_tree_code (code
) ? 0U : -1U;
503 else if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
504 number_of_oprnds
= gimple_phi_num_args (stmt
);
508 bool swapped
= (swap
!= 0);
509 bool backedge
= false;
510 gcc_assert (!swapped
|| first_op_cond
);
511 enum vect_def_type
*dts
= XALLOCAVEC (enum vect_def_type
, number_of_oprnds
);
512 for (i
= 0; i
< number_of_oprnds
; i
++)
516 /* Map indicating how operands of cond_expr should be swapped. */
517 int maps
[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
518 int *map
= maps
[swap
];
521 oprnd
= TREE_OPERAND (gimple_op (stmt_info
->stmt
,
522 first_op_idx
), map
[i
]);
524 oprnd
= gimple_op (stmt_info
->stmt
, map
[i
]);
526 else if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
528 oprnd
= gimple_phi_arg_def (stmt
, i
);
529 backedge
= dominated_by_p (CDI_DOMINATORS
,
530 gimple_phi_arg_edge (stmt
, i
)->src
,
531 gimple_bb (stmt_info
->stmt
));
534 oprnd
= gimple_op (stmt_info
->stmt
, first_op_idx
+ (swapped
? !i
: i
));
535 if (TREE_CODE (oprnd
) == VIEW_CONVERT_EXPR
)
536 oprnd
= TREE_OPERAND (oprnd
, 0);
538 oprnd_info
= (*oprnds_info
)[i
];
540 stmt_vec_info def_stmt_info
;
541 if (!vect_is_simple_use (oprnd
, vinfo
, &dts
[i
], &def_stmt_info
))
543 if (dump_enabled_p ())
544 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
545 "Build SLP failed: can't analyze def for %T\n",
553 oprnd_info
->def_stmts
.quick_push (NULL
);
554 oprnd_info
->ops
.quick_push (NULL_TREE
);
555 oprnd_info
->first_dt
= vect_uninitialized_def
;
559 oprnd_info
->def_stmts
.quick_push (def_stmt_info
);
560 oprnd_info
->ops
.quick_push (oprnd
);
563 && is_pattern_stmt_p (def_stmt_info
))
565 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info
))
567 oprnd_info
->any_pattern
= true;
569 /* If we promote this to external use the original stmt def. */
570 oprnd_info
->ops
.last ()
571 = gimple_get_lhs (vect_orig_stmt (def_stmt_info
)->stmt
);
574 /* If there's a extern def on a backedge make sure we can
575 code-generate at the region start.
576 ??? This is another case that could be fixed by adjusting
577 how we split the function but at the moment we'd have conflicting
580 && dts
[i
] == vect_external_def
581 && is_a
<bb_vec_info
> (vinfo
)
582 && TREE_CODE (oprnd
) == SSA_NAME
583 && !SSA_NAME_IS_DEFAULT_DEF (oprnd
)
584 && !dominated_by_p (CDI_DOMINATORS
,
585 as_a
<bb_vec_info
> (vinfo
)->bbs
[0],
586 gimple_bb (SSA_NAME_DEF_STMT (oprnd
))))
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
590 "Build SLP failed: extern def %T only defined "
591 "on backedge\n", oprnd
);
597 tree type
= TREE_TYPE (oprnd
);
599 if ((dt
== vect_constant_def
600 || dt
== vect_external_def
)
601 && !GET_MODE_SIZE (vinfo
->vector_mode
).is_constant ()
602 && (TREE_CODE (type
) == BOOLEAN_TYPE
603 || !can_duplicate_and_interleave_p (vinfo
, stmts
.length (),
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
608 "Build SLP failed: invalid type of def "
609 "for variable-length SLP %T\n", oprnd
);
613 /* For the swapping logic below force vect_reduction_def
614 for the reduction op in a SLP reduction group. */
615 if (!STMT_VINFO_DATA_REF (stmt_info
)
616 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
617 && (int)i
== STMT_VINFO_REDUC_IDX (stmt_info
)
619 dts
[i
] = dt
= vect_reduction_def
;
621 /* Check the types of the definition. */
624 case vect_external_def
:
625 case vect_constant_def
:
626 case vect_internal_def
:
627 case vect_reduction_def
:
628 case vect_induction_def
:
629 case vect_nested_cycle
:
633 /* FORNOW: Not supported. */
634 if (dump_enabled_p ())
635 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
636 "Build SLP failed: illegal type of def %T\n",
641 oprnd_info
->first_dt
= dt
;
642 oprnd_info
->first_op_type
= type
;
648 /* Now match the operand definition types to that of the first stmt. */
649 for (i
= 0; i
< number_of_oprnds
;)
657 oprnd_info
= (*oprnds_info
)[i
];
659 stmt_vec_info def_stmt_info
= oprnd_info
->def_stmts
[stmt_num
];
660 oprnd
= oprnd_info
->ops
[stmt_num
];
661 tree type
= TREE_TYPE (oprnd
);
663 if (!types_compatible_p (oprnd_info
->first_op_type
, type
))
665 if (dump_enabled_p ())
666 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
667 "Build SLP failed: different operand types\n");
671 /* Not first stmt of the group, check that the def-stmt/s match
672 the def-stmt/s of the first stmt. Allow different definition
673 types for reduction chains: the first stmt must be a
674 vect_reduction_def (a phi node), and the rest
675 end in the reduction chain. */
676 if ((!vect_def_types_match (oprnd_info
->first_dt
, dt
)
677 && !(oprnd_info
->first_dt
== vect_reduction_def
678 && !STMT_VINFO_DATA_REF (stmt_info
)
679 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
681 && !STMT_VINFO_DATA_REF (def_stmt_info
)
682 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
683 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
))))
684 || (!STMT_VINFO_DATA_REF (stmt_info
)
685 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
687 || STMT_VINFO_DATA_REF (def_stmt_info
)
688 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
689 != REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
690 != (oprnd_info
->first_dt
!= vect_reduction_def
))))
692 /* Try swapping operands if we got a mismatch. For BB
693 vectorization only in case it will clearly improve things. */
694 if (i
== commutative_op
&& !swapped
695 && (!is_a
<bb_vec_info
> (vinfo
)
696 || (!vect_def_types_match ((*oprnds_info
)[i
+1]->first_dt
,
698 && (vect_def_types_match (oprnd_info
->first_dt
, dts
[i
+1])
699 || vect_def_types_match
700 ((*oprnds_info
)[i
+1]->first_dt
, dts
[i
])))))
702 if (dump_enabled_p ())
703 dump_printf_loc (MSG_NOTE
, vect_location
,
704 "trying swapped operands\n");
705 std::swap (dts
[i
], dts
[i
+1]);
706 std::swap ((*oprnds_info
)[i
]->def_stmts
[stmt_num
],
707 (*oprnds_info
)[i
+1]->def_stmts
[stmt_num
]);
708 std::swap ((*oprnds_info
)[i
]->ops
[stmt_num
],
709 (*oprnds_info
)[i
+1]->ops
[stmt_num
]);
714 if (is_a
<bb_vec_info
> (vinfo
)
715 && !oprnd_info
->any_pattern
)
717 /* Now for commutative ops we should see whether we can
718 make the other operand matching. */
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
721 "treating operand as external\n");
722 oprnd_info
->first_dt
= dt
= vect_external_def
;
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
728 "Build SLP failed: different types\n");
733 /* Make sure to demote the overall operand to external. */
734 if (dt
== vect_external_def
)
735 oprnd_info
->first_dt
= vect_external_def
;
736 /* For a SLP reduction chain we want to duplicate the reduction to
737 each of the chain members. That gets us a sane SLP graph (still
738 the stmts are not 100% correct wrt the initial values). */
739 else if ((dt
== vect_internal_def
740 || dt
== vect_reduction_def
)
741 && oprnd_info
->first_dt
== vect_reduction_def
742 && !STMT_VINFO_DATA_REF (stmt_info
)
743 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
744 && !STMT_VINFO_DATA_REF (def_stmt_info
)
745 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
746 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
748 oprnd_info
->def_stmts
[stmt_num
] = oprnd_info
->def_stmts
[0];
749 oprnd_info
->ops
[stmt_num
] = oprnd_info
->ops
[0];
758 if (dump_enabled_p ())
759 dump_printf_loc (MSG_NOTE
, vect_location
,
760 "swapped operands to match def types in %G",
767 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
768 Return true if we can, meaning that this choice doesn't conflict with
769 existing SLP nodes that use STMT_INFO. */
772 vect_update_shared_vectype (stmt_vec_info stmt_info
, tree vectype
)
774 tree old_vectype
= STMT_VINFO_VECTYPE (stmt_info
);
776 return useless_type_conversion_p (vectype
, old_vectype
);
778 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
780 /* We maintain the invariant that if any statement in the group is
781 used, all other members of the group have the same vector type. */
782 stmt_vec_info first_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
783 stmt_vec_info member_info
= first_info
;
784 for (; member_info
; member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
785 if (is_pattern_stmt_p (member_info
)
786 && !useless_type_conversion_p (vectype
,
787 STMT_VINFO_VECTYPE (member_info
)))
792 for (member_info
= first_info
; member_info
;
793 member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
794 STMT_VINFO_VECTYPE (member_info
) = vectype
;
798 else if (!is_pattern_stmt_p (stmt_info
))
800 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
804 if (dump_enabled_p ())
806 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
807 "Build SLP failed: incompatible vector"
808 " types for: %G", stmt_info
->stmt
);
809 dump_printf_loc (MSG_NOTE
, vect_location
,
810 " old vector type: %T\n", old_vectype
);
811 dump_printf_loc (MSG_NOTE
, vect_location
,
812 " new vector type: %T\n", vectype
);
817 /* Return true if call statements CALL1 and CALL2 are similar enough
818 to be combined into the same SLP group. */
821 compatible_calls_p (gcall
*call1
, gcall
*call2
)
823 unsigned int nargs
= gimple_call_num_args (call1
);
824 if (nargs
!= gimple_call_num_args (call2
))
827 if (gimple_call_combined_fn (call1
) != gimple_call_combined_fn (call2
))
830 if (gimple_call_internal_p (call1
))
832 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1
)),
833 TREE_TYPE (gimple_call_lhs (call2
))))
835 for (unsigned int i
= 0; i
< nargs
; ++i
)
836 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1
, i
)),
837 TREE_TYPE (gimple_call_arg (call2
, i
))))
842 if (!operand_equal_p (gimple_call_fn (call1
),
843 gimple_call_fn (call2
), 0))
846 if (gimple_call_fntype (call1
) != gimple_call_fntype (call2
))
852 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
853 caller's attempt to find the vector type in STMT_INFO with the narrowest
854 element type. Return true if VECTYPE is nonnull and if it is valid
855 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
856 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
857 vect_build_slp_tree. */
860 vect_record_max_nunits (vec_info
*vinfo
, stmt_vec_info stmt_info
,
861 unsigned int group_size
,
862 tree vectype
, poly_uint64
*max_nunits
)
866 if (dump_enabled_p ())
867 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
868 "Build SLP failed: unsupported data-type in %G\n",
870 /* Fatal mismatch. */
874 /* If populating the vector type requires unrolling then fail
875 before adjusting *max_nunits for basic-block vectorization. */
876 if (is_a
<bb_vec_info
> (vinfo
)
877 && !multiple_p (group_size
, TYPE_VECTOR_SUBPARTS (vectype
)))
879 if (dump_enabled_p ())
880 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
881 "Build SLP failed: unrolling required "
882 "in basic block SLP\n");
883 /* Fatal mismatch. */
887 /* In case of multiple types we need to detect the smallest type. */
888 vect_update_max_nunits (max_nunits
, vectype
);
892 /* Verify if the scalar stmts STMTS are isomorphic, require data
893 permutation or are of unsupported types of operation. Return
894 true if they are, otherwise return false and indicate in *MATCHES
895 which stmts are not isomorphic to the first one. If MATCHES[0]
896 is false then this indicates the comparison could not be
897 carried out or the stmts will never be vectorized by SLP.
899 Note COND_EXPR is possibly isomorphic to another one after swapping its
900 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
901 the first stmt by swapping the two operands of comparison; set SWAP[i]
902 to 2 if stmt I is isormorphic to the first stmt by inverting the code
903 of comparison. Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
904 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
907 vect_build_slp_tree_1 (vec_info
*vinfo
, unsigned char *swap
,
908 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
909 poly_uint64
*max_nunits
, bool *matches
,
910 bool *two_operators
, tree
*node_vectype
)
913 stmt_vec_info first_stmt_info
= stmts
[0];
914 enum tree_code first_stmt_code
= ERROR_MARK
;
915 enum tree_code alt_stmt_code
= ERROR_MARK
;
916 enum tree_code rhs_code
= ERROR_MARK
;
917 enum tree_code first_cond_code
= ERROR_MARK
;
919 bool need_same_oprnds
= false;
920 tree vectype
= NULL_TREE
, first_op1
= NULL_TREE
;
923 machine_mode optab_op2_mode
;
924 machine_mode vec_mode
;
925 stmt_vec_info first_load
= NULL
, prev_first_load
= NULL
;
926 bool first_stmt_load_p
= false, load_p
= false;
927 bool first_stmt_phi_p
= false, phi_p
= false;
928 bool maybe_soft_fail
= false;
929 tree soft_fail_nunits_vectype
= NULL_TREE
;
931 /* For every stmt in NODE find its def stmt/s. */
932 stmt_vec_info stmt_info
;
933 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
935 gimple
*stmt
= stmt_info
->stmt
;
939 if (dump_enabled_p ())
940 dump_printf_loc (MSG_NOTE
, vect_location
, "Build SLP for %G", stmt
);
942 /* Fail to vectorize statements marked as unvectorizable, throw
944 if (!STMT_VINFO_VECTORIZABLE (stmt_info
)
945 || stmt_can_throw_internal (cfun
, stmt
)
946 || gimple_has_volatile_ops (stmt
))
948 if (dump_enabled_p ())
949 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
950 "Build SLP failed: unvectorizable statement %G",
952 /* ??? For BB vectorization we want to commutate operands in a way
953 to shuffle all unvectorizable defs into one operand and have
954 the other still vectorized. The following doesn't reliably
955 work for this though but it's the easiest we can do here. */
956 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
958 /* Fatal mismatch. */
963 lhs
= gimple_get_lhs (stmt
);
964 if (lhs
== NULL_TREE
)
966 if (dump_enabled_p ())
967 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
968 "Build SLP failed: not GIMPLE_ASSIGN nor "
969 "GIMPLE_CALL %G", stmt
);
970 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
972 /* Fatal mismatch. */
978 if (!vect_get_vector_types_for_stmt (vinfo
, stmt_info
, &vectype
,
979 &nunits_vectype
, group_size
))
981 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
983 /* Fatal mismatch. */
987 /* Record nunits required but continue analysis, producing matches[]
988 as if nunits was not an issue. This allows splitting of groups
991 && !vect_record_max_nunits (vinfo
, stmt_info
, group_size
,
992 nunits_vectype
, max_nunits
))
994 gcc_assert (is_a
<bb_vec_info
> (vinfo
));
995 maybe_soft_fail
= true;
996 soft_fail_nunits_vectype
= nunits_vectype
;
999 gcc_assert (vectype
);
1001 gcall
*call_stmt
= dyn_cast
<gcall
*> (stmt
);
1004 rhs_code
= CALL_EXPR
;
1006 if (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
))
1008 else if ((gimple_call_internal_p (call_stmt
)
1009 && (!vectorizable_internal_fn_p
1010 (gimple_call_internal_fn (call_stmt
))))
1011 || gimple_call_tail_p (call_stmt
)
1012 || gimple_call_noreturn_p (call_stmt
)
1013 || !gimple_call_nothrow_p (call_stmt
)
1014 || gimple_call_chain (call_stmt
))
1016 if (dump_enabled_p ())
1017 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1018 "Build SLP failed: unsupported call type %G",
1020 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1022 /* Fatal mismatch. */
1027 else if (gimple_code (stmt
) == GIMPLE_PHI
)
1029 rhs_code
= ERROR_MARK
;
1034 rhs_code
= gimple_assign_rhs_code (stmt
);
1035 load_p
= gimple_vuse (stmt
);
1038 /* Check the operation. */
1041 *node_vectype
= vectype
;
1042 first_stmt_code
= rhs_code
;
1043 first_stmt_load_p
= load_p
;
1044 first_stmt_phi_p
= phi_p
;
1046 /* Shift arguments should be equal in all the packed stmts for a
1047 vector shift with scalar shift operand. */
1048 if (rhs_code
== LSHIFT_EXPR
|| rhs_code
== RSHIFT_EXPR
1049 || rhs_code
== LROTATE_EXPR
1050 || rhs_code
== RROTATE_EXPR
)
1052 vec_mode
= TYPE_MODE (vectype
);
1054 /* First see if we have a vector/vector shift. */
1055 optab
= optab_for_tree_code (rhs_code
, vectype
,
1059 || optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
1061 /* No vector/vector shift, try for a vector/scalar shift. */
1062 optab
= optab_for_tree_code (rhs_code
, vectype
,
1067 if (dump_enabled_p ())
1068 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1069 "Build SLP failed: no optab.\n");
1070 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1072 /* Fatal mismatch. */
1076 icode
= (int) optab_handler (optab
, vec_mode
);
1077 if (icode
== CODE_FOR_nothing
)
1079 if (dump_enabled_p ())
1080 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1081 "Build SLP failed: "
1082 "op not supported by target.\n");
1083 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1085 /* Fatal mismatch. */
1089 optab_op2_mode
= insn_data
[icode
].operand
[2].mode
;
1090 if (!VECTOR_MODE_P (optab_op2_mode
))
1092 need_same_oprnds
= true;
1093 first_op1
= gimple_assign_rhs2 (stmt
);
1097 else if (rhs_code
== WIDEN_LSHIFT_EXPR
)
1099 need_same_oprnds
= true;
1100 first_op1
= gimple_assign_rhs2 (stmt
);
1103 && rhs_code
== BIT_FIELD_REF
)
1105 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt
), 0);
1106 if (!is_a
<bb_vec_info
> (vinfo
)
1107 || TREE_CODE (vec
) != SSA_NAME
1108 || !operand_equal_p (TYPE_SIZE (vectype
),
1109 TYPE_SIZE (TREE_TYPE (vec
))))
1111 if (dump_enabled_p ())
1112 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1113 "Build SLP failed: "
1114 "BIT_FIELD_REF not supported\n");
1115 /* Fatal mismatch. */
1121 && gimple_call_internal_p (call_stmt
, IFN_DIV_POW2
))
1123 need_same_oprnds
= true;
1124 first_op1
= gimple_call_arg (call_stmt
, 1);
1129 if (first_stmt_code
!= rhs_code
1130 && alt_stmt_code
== ERROR_MARK
)
1131 alt_stmt_code
= rhs_code
;
1132 if ((first_stmt_code
!= rhs_code
1133 && (first_stmt_code
!= IMAGPART_EXPR
1134 || rhs_code
!= REALPART_EXPR
)
1135 && (first_stmt_code
!= REALPART_EXPR
1136 || rhs_code
!= IMAGPART_EXPR
)
1137 /* Handle mismatches in plus/minus by computing both
1138 and merging the results. */
1139 && !((first_stmt_code
== PLUS_EXPR
1140 || first_stmt_code
== MINUS_EXPR
)
1141 && (alt_stmt_code
== PLUS_EXPR
1142 || alt_stmt_code
== MINUS_EXPR
)
1143 && rhs_code
== alt_stmt_code
)
1144 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1145 && (first_stmt_code
== ARRAY_REF
1146 || first_stmt_code
== BIT_FIELD_REF
1147 || first_stmt_code
== INDIRECT_REF
1148 || first_stmt_code
== COMPONENT_REF
1149 || first_stmt_code
== MEM_REF
)))
1150 || first_stmt_load_p
!= load_p
1151 || first_stmt_phi_p
!= phi_p
)
1153 if (dump_enabled_p ())
1155 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1156 "Build SLP failed: different operation "
1157 "in stmt %G", stmt
);
1158 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1159 "original stmt %G", first_stmt_info
->stmt
);
1165 if (need_same_oprnds
)
1167 tree other_op1
= (call_stmt
1168 ? gimple_call_arg (call_stmt
, 1)
1169 : gimple_assign_rhs2 (stmt
));
1170 if (!operand_equal_p (first_op1
, other_op1
, 0))
1172 if (dump_enabled_p ())
1173 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1174 "Build SLP failed: different shift "
1175 "arguments in %G", stmt
);
1181 && first_stmt_code
== BIT_FIELD_REF
1182 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info
->stmt
), 0)
1183 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0)))
1185 if (dump_enabled_p ())
1186 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1187 "Build SLP failed: different BIT_FIELD_REF "
1188 "arguments in %G", stmt
);
1193 if (!load_p
&& rhs_code
== CALL_EXPR
)
1195 if (!compatible_calls_p (as_a
<gcall
*> (stmts
[0]->stmt
),
1196 as_a
<gcall
*> (stmt
)))
1198 if (dump_enabled_p ())
1199 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1200 "Build SLP failed: different calls in %G",
1208 && (gimple_bb (first_stmt_info
->stmt
)
1209 != gimple_bb (stmt_info
->stmt
)))
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1213 "Build SLP failed: different BB for PHI "
1219 if (!types_compatible_p (vectype
, *node_vectype
))
1221 if (dump_enabled_p ())
1222 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1223 "Build SLP failed: different vector type "
1230 /* Grouped store or load. */
1231 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1233 if (REFERENCE_CLASS_P (lhs
))
1241 first_load
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
1242 if (prev_first_load
)
1244 /* Check that there are no loads from different interleaving
1245 chains in the same node. */
1246 if (prev_first_load
!= first_load
)
1248 if (dump_enabled_p ())
1249 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
1251 "Build SLP failed: different "
1252 "interleaving chains in one node %G",
1259 prev_first_load
= first_load
;
1261 } /* Grouped access. */
1266 /* Not grouped load. */
1267 if (dump_enabled_p ())
1268 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1269 "Build SLP failed: not grouped load %G", stmt
);
1271 /* FORNOW: Not grouped loads are not supported. */
1272 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1274 /* Fatal mismatch. */
1279 /* Not memory operation. */
1281 && TREE_CODE_CLASS (rhs_code
) != tcc_binary
1282 && TREE_CODE_CLASS (rhs_code
) != tcc_unary
1283 && TREE_CODE_CLASS (rhs_code
) != tcc_expression
1284 && TREE_CODE_CLASS (rhs_code
) != tcc_comparison
1285 && rhs_code
!= VIEW_CONVERT_EXPR
1286 && rhs_code
!= CALL_EXPR
1287 && rhs_code
!= BIT_FIELD_REF
)
1289 if (dump_enabled_p ())
1290 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1291 "Build SLP failed: operation unsupported %G",
1293 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1295 /* Fatal mismatch. */
1300 if (rhs_code
== COND_EXPR
)
1302 tree cond_expr
= gimple_assign_rhs1 (stmt
);
1303 enum tree_code cond_code
= TREE_CODE (cond_expr
);
1304 enum tree_code swap_code
= ERROR_MARK
;
1305 enum tree_code invert_code
= ERROR_MARK
;
1308 first_cond_code
= TREE_CODE (cond_expr
);
1309 else if (TREE_CODE_CLASS (cond_code
) == tcc_comparison
)
1311 bool honor_nans
= HONOR_NANS (TREE_OPERAND (cond_expr
, 0));
1312 swap_code
= swap_tree_comparison (cond_code
);
1313 invert_code
= invert_tree_comparison (cond_code
, honor_nans
);
1316 if (first_cond_code
== cond_code
)
1318 /* Isomorphic can be achieved by swapping. */
1319 else if (first_cond_code
== swap_code
)
1321 /* Isomorphic can be achieved by inverting. */
1322 else if (first_cond_code
== invert_code
)
1326 if (dump_enabled_p ())
1327 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1328 "Build SLP failed: different"
1329 " operation %G", stmt
);
1339 for (i
= 0; i
< group_size
; ++i
)
1343 /* If we allowed a two-operation SLP node verify the target can cope
1344 with the permute we are going to use. */
1345 if (alt_stmt_code
!= ERROR_MARK
1346 && TREE_CODE_CLASS (alt_stmt_code
) != tcc_reference
)
1348 *two_operators
= true;
1351 if (maybe_soft_fail
)
1353 unsigned HOST_WIDE_INT const_nunits
;
1354 if (!TYPE_VECTOR_SUBPARTS
1355 (soft_fail_nunits_vectype
).is_constant (&const_nunits
)
1356 || const_nunits
> group_size
)
1360 /* With constant vector elements simulate a mismatch at the
1361 point we need to split. */
1362 unsigned tail
= group_size
& (const_nunits
- 1);
1363 memset (&matches
[group_size
- tail
], 0, sizeof (bool) * tail
);
1371 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1372 Note we never remove apart from at destruction time so we do not
1373 need a special value for deleted that differs from empty. */
1376 typedef vec
<stmt_vec_info
> value_type
;
1377 typedef vec
<stmt_vec_info
> compare_type
;
1378 static inline hashval_t
hash (value_type
);
1379 static inline bool equal (value_type existing
, value_type candidate
);
1380 static inline bool is_empty (value_type x
) { return !x
.exists (); }
1381 static inline bool is_deleted (value_type x
) { return !x
.exists (); }
1382 static const bool empty_zero_p
= true;
1383 static inline void mark_empty (value_type
&x
) { x
.release (); }
1384 static inline void mark_deleted (value_type
&x
) { x
.release (); }
1385 static inline void remove (value_type
&x
) { x
.release (); }
1388 bst_traits::hash (value_type x
)
1391 for (unsigned i
= 0; i
< x
.length (); ++i
)
1392 h
.add_int (gimple_uid (x
[i
]->stmt
));
1396 bst_traits::equal (value_type existing
, value_type candidate
)
1398 if (existing
.length () != candidate
.length ())
1400 for (unsigned i
= 0; i
< existing
.length (); ++i
)
1401 if (existing
[i
] != candidate
[i
])
1406 typedef hash_map
<vec
<stmt_vec_info
>, slp_tree
,
1407 simple_hashmap_traits
<bst_traits
, slp_tree
> >
1408 scalar_stmts_to_slp_tree_map_t
;
1411 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1412 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1413 poly_uint64
*max_nunits
,
1414 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1415 scalar_stmts_to_slp_tree_map_t
*bst_map
);
1418 vect_build_slp_tree (vec_info
*vinfo
,
1419 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1420 poly_uint64
*max_nunits
,
1421 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1422 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1424 if (slp_tree
*leader
= bst_map
->get (stmts
))
1426 if (dump_enabled_p ())
1427 dump_printf_loc (MSG_NOTE
, vect_location
, "re-using %sSLP tree %p\n",
1428 *leader
? "" : "failed ", *leader
);
1431 SLP_TREE_REF_COUNT (*leader
)++;
1432 vect_update_max_nunits (max_nunits
, (*leader
)->max_nunits
);
1438 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1439 so we can pick up backedge destinations during discovery. */
1440 slp_tree res
= new _slp_tree
;
1441 SLP_TREE_DEF_TYPE (res
) = vect_internal_def
;
1442 SLP_TREE_SCALAR_STMTS (res
) = stmts
;
1443 bst_map
->put (stmts
.copy (), res
);
1447 if (dump_enabled_p ())
1448 dump_printf_loc (MSG_NOTE
, vect_location
,
1449 "SLP discovery limit exceeded\n");
1450 bool existed_p
= bst_map
->put (stmts
, NULL
);
1451 gcc_assert (existed_p
);
1452 /* Mark the node invalid so we can detect those when still in use
1453 as backedge destinations. */
1454 SLP_TREE_SCALAR_STMTS (res
) = vNULL
;
1455 SLP_TREE_DEF_TYPE (res
) = vect_uninitialized_def
;
1456 vect_free_slp_tree (res
);
1457 memset (matches
, 0, sizeof (bool) * group_size
);
1462 poly_uint64 this_max_nunits
= 1;
1463 slp_tree res_
= vect_build_slp_tree_2 (vinfo
, res
, stmts
, group_size
,
1465 matches
, limit
, tree_size
, bst_map
);
1468 bool existed_p
= bst_map
->put (stmts
, NULL
);
1469 gcc_assert (existed_p
);
1470 /* Mark the node invalid so we can detect those when still in use
1471 as backedge destinations. */
1472 SLP_TREE_SCALAR_STMTS (res
) = vNULL
;
1473 SLP_TREE_DEF_TYPE (res
) = vect_uninitialized_def
;
1474 vect_free_slp_tree (res
);
1478 gcc_assert (res_
== res
);
1479 res
->max_nunits
= this_max_nunits
;
1480 vect_update_max_nunits (max_nunits
, this_max_nunits
);
1481 /* Keep a reference for the bst_map use. */
1482 SLP_TREE_REF_COUNT (res
)++;
1487 /* Recursively build an SLP tree starting from NODE.
1488 Fail (and return a value not equal to zero) if def-stmts are not
1489 isomorphic, require data permutation or are of unsupported types of
1490 operation. Otherwise, return 0.
1491 The value returned is the depth in the SLP tree where a mismatch
1495 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1496 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1497 poly_uint64
*max_nunits
,
1498 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1499 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1501 unsigned nops
, i
, this_tree_size
= 0;
1502 poly_uint64 this_max_nunits
= *max_nunits
;
1506 stmt_vec_info stmt_info
= stmts
[0];
1507 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1508 nops
= gimple_call_num_args (stmt
);
1509 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
1511 nops
= gimple_num_ops (stmt
) - 1;
1512 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
1515 else if (gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1516 nops
= gimple_phi_num_args (phi
);
1520 /* If the SLP node is a PHI (induction or reduction), terminate
1522 bool *skip_args
= XALLOCAVEC (bool, nops
);
1523 memset (skip_args
, 0, sizeof (bool) * nops
);
1524 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
1525 if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1527 tree scalar_type
= TREE_TYPE (PHI_RESULT (stmt
));
1528 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
1530 if (!vect_record_max_nunits (vinfo
, stmt_info
, group_size
, vectype
,
1534 vect_def_type def_type
= STMT_VINFO_DEF_TYPE (stmt_info
);
1535 if (def_type
== vect_induction_def
)
1537 /* Induction PHIs are not cycles but walk the initial
1538 value. Only for inner loops through, for outer loops
1539 we need to pick up the value from the actual PHIs
1540 to more easily support peeling and epilogue vectorization. */
1541 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1542 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1543 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1546 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1548 else if (def_type
== vect_reduction_def
1549 || def_type
== vect_double_reduction_def
1550 || def_type
== vect_nested_cycle
)
1552 /* Else def types have to match. */
1553 stmt_vec_info other_info
;
1554 bool all_same
= true;
1555 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1557 if (STMT_VINFO_DEF_TYPE (other_info
) != def_type
)
1559 if (other_info
!= stmt_info
)
1562 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1563 /* Reduction initial values are not explicitely represented. */
1564 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1565 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1566 /* Reduction chain backedge defs are filled manually.
1567 ??? Need a better way to identify a SLP reduction chain PHI.
1568 Or a better overall way to SLP match those. */
1569 if (all_same
&& def_type
== vect_reduction_def
)
1570 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1572 else if (def_type
!= vect_internal_def
)
1577 bool two_operators
= false;
1578 unsigned char *swap
= XALLOCAVEC (unsigned char, group_size
);
1579 tree vectype
= NULL_TREE
;
1580 if (!vect_build_slp_tree_1 (vinfo
, swap
, stmts
, group_size
,
1581 &this_max_nunits
, matches
, &two_operators
,
1585 /* If the SLP node is a load, terminate the recursion unless masked. */
1586 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1587 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1589 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1592 gcc_assert (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
));
1597 *max_nunits
= this_max_nunits
;
1599 node
= vect_create_new_slp_node (node
, stmts
, 0);
1600 SLP_TREE_VECTYPE (node
) = vectype
;
1601 /* And compute the load permutation. Whether it is actually
1602 a permutation depends on the unrolling factor which is
1604 vec
<unsigned> load_permutation
;
1606 stmt_vec_info load_info
;
1607 load_permutation
.create (group_size
);
1608 stmt_vec_info first_stmt_info
1609 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
1610 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
1612 int load_place
= vect_get_place_in_interleaving_chain
1613 (load_info
, first_stmt_info
);
1614 gcc_assert (load_place
!= -1);
1615 load_permutation
.safe_push (load_place
);
1617 SLP_TREE_LOAD_PERMUTATION (node
) = load_permutation
;
1621 else if (gimple_assign_single_p (stmt_info
->stmt
)
1622 && !gimple_vuse (stmt_info
->stmt
)
1623 && gimple_assign_rhs_code (stmt_info
->stmt
) == BIT_FIELD_REF
)
1625 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1626 the same SSA name vector of a compatible type to vectype. */
1627 vec
<std::pair
<unsigned, unsigned> > lperm
= vNULL
;
1628 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0);
1629 stmt_vec_info estmt_info
;
1630 FOR_EACH_VEC_ELT (stmts
, i
, estmt_info
)
1632 gassign
*estmt
= as_a
<gassign
*> (estmt_info
->stmt
);
1633 tree bfref
= gimple_assign_rhs1 (estmt
);
1635 if (!known_eq (bit_field_size (bfref
),
1636 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype
))))
1637 || !constant_multiple_p (bit_field_offset (bfref
),
1638 bit_field_size (bfref
), &lane
))
1643 lperm
.safe_push (std::make_pair (0, (unsigned)lane
));
1645 slp_tree vnode
= vect_create_new_slp_node (vNULL
);
1646 /* ??? We record vectype here but we hide eventually necessary
1647 punning and instead rely on code generation to materialize
1648 VIEW_CONVERT_EXPRs as necessary. We instead should make
1649 this explicit somehow. */
1650 SLP_TREE_VECTYPE (vnode
) = vectype
;
1651 SLP_TREE_VEC_DEFS (vnode
).safe_push (vec
);
1652 /* We are always building a permutation node even if it is an identity
1653 permute to shield the rest of the vectorizer from the odd node
1654 representing an actual vector without any scalar ops.
1655 ??? We could hide it completely with making the permute node
1657 node
= vect_create_new_slp_node (node
, stmts
, 1);
1658 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1659 SLP_TREE_LANE_PERMUTATION (node
) = lperm
;
1660 SLP_TREE_VECTYPE (node
) = vectype
;
1661 SLP_TREE_CHILDREN (node
).quick_push (vnode
);
1665 /* Get at the operands, verifying they are compatible. */
1666 vec
<slp_oprnd_info
> oprnds_info
= vect_create_oprnd_info (nops
, group_size
);
1667 slp_oprnd_info oprnd_info
;
1668 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
1670 int res
= vect_get_and_check_slp_defs (vinfo
, swap
[i
], skip_args
,
1671 stmts
, i
, &oprnds_info
);
1673 matches
[(res
== -1) ? 0 : i
] = false;
1677 for (i
= 0; i
< group_size
; ++i
)
1680 vect_free_oprnd_info (oprnds_info
);
1685 auto_vec
<slp_tree
, 4> children
;
1687 stmt_info
= stmts
[0];
1689 /* Create SLP_TREE nodes for the definition node/s. */
1690 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
1695 /* We're skipping certain operands from processing, for example
1696 outer loop reduction initial defs. */
1699 children
.safe_push (NULL
);
1703 if (oprnd_info
->first_dt
== vect_uninitialized_def
)
1705 /* COND_EXPR have one too many eventually if the condition
1707 gcc_assert (i
== 3 && nops
== 4);
1711 if (is_a
<bb_vec_info
> (vinfo
)
1712 && oprnd_info
->first_dt
== vect_internal_def
1713 && !oprnd_info
->any_pattern
)
1715 /* For BB vectorization, if all defs are the same do not
1716 bother to continue the build along the single-lane
1717 graph but use a splat of the scalar value. */
1718 stmt_vec_info first_def
= oprnd_info
->def_stmts
[0];
1719 for (j
= 1; j
< group_size
; ++j
)
1720 if (oprnd_info
->def_stmts
[j
] != first_def
)
1723 /* But avoid doing this for loads where we may be
1724 able to CSE things, unless the stmt is not
1726 && (!STMT_VINFO_VECTORIZABLE (first_def
)
1727 || !gimple_vuse (first_def
->stmt
)))
1729 if (dump_enabled_p ())
1730 dump_printf_loc (MSG_NOTE
, vect_location
,
1731 "Using a splat of the uniform operand\n");
1732 oprnd_info
->first_dt
= vect_external_def
;
1736 if (oprnd_info
->first_dt
== vect_external_def
1737 || oprnd_info
->first_dt
== vect_constant_def
)
1739 slp_tree invnode
= vect_create_new_slp_node (oprnd_info
->ops
);
1740 SLP_TREE_DEF_TYPE (invnode
) = oprnd_info
->first_dt
;
1741 oprnd_info
->ops
= vNULL
;
1742 children
.safe_push (invnode
);
1746 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1747 group_size
, &this_max_nunits
,
1749 &this_tree_size
, bst_map
)) != NULL
)
1751 oprnd_info
->def_stmts
= vNULL
;
1752 children
.safe_push (child
);
1756 /* If the SLP build for operand zero failed and operand zero
1757 and one can be commutated try that for the scalar stmts
1758 that failed the match. */
1760 /* A first scalar stmt mismatch signals a fatal mismatch. */
1762 /* ??? For COND_EXPRs we can swap the comparison operands
1763 as well as the arms under some constraints. */
1765 && oprnds_info
[1]->first_dt
== vect_internal_def
1766 && is_gimple_assign (stmt_info
->stmt
)
1767 /* Swapping operands for reductions breaks assumptions later on. */
1768 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
1769 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
1771 /* See whether we can swap the matching or the non-matching
1773 bool swap_not_matching
= true;
1776 for (j
= 0; j
< group_size
; ++j
)
1778 if (matches
[j
] != !swap_not_matching
)
1780 stmt_vec_info stmt_info
= stmts
[j
];
1781 /* Verify if we can swap operands of this stmt. */
1782 gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
1784 || !commutative_tree_code (gimple_assign_rhs_code (stmt
)))
1786 if (!swap_not_matching
)
1788 swap_not_matching
= false;
1793 while (j
!= group_size
);
1795 /* Swap mismatched definition stmts. */
1796 if (dump_enabled_p ())
1797 dump_printf_loc (MSG_NOTE
, vect_location
,
1798 "Re-trying with swapped operands of stmts ");
1799 for (j
= 0; j
< group_size
; ++j
)
1800 if (matches
[j
] == !swap_not_matching
)
1802 std::swap (oprnds_info
[0]->def_stmts
[j
],
1803 oprnds_info
[1]->def_stmts
[j
]);
1804 std::swap (oprnds_info
[0]->ops
[j
],
1805 oprnds_info
[1]->ops
[j
]);
1806 if (dump_enabled_p ())
1807 dump_printf (MSG_NOTE
, "%d ", j
);
1809 if (dump_enabled_p ())
1810 dump_printf (MSG_NOTE
, "\n");
1811 /* And try again with scratch 'matches' ... */
1812 bool *tem
= XALLOCAVEC (bool, group_size
);
1813 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1814 group_size
, &this_max_nunits
,
1816 &this_tree_size
, bst_map
)) != NULL
)
1818 oprnd_info
->def_stmts
= vNULL
;
1819 children
.safe_push (child
);
1825 /* If the SLP build failed and we analyze a basic-block
1826 simply treat nodes we fail to build as externally defined
1827 (and thus build vectors from the scalar defs).
1828 The cost model will reject outright expensive cases.
1829 ??? This doesn't treat cases where permutation ultimatively
1830 fails (or we don't try permutation below). Ideally we'd
1831 even compute a permutation that will end up with the maximum
1833 if (is_a
<bb_vec_info
> (vinfo
)
1834 /* ??? Rejecting patterns this way doesn't work. We'd have to
1835 do extra work to cancel the pattern so the uses see the
1837 && !is_pattern_stmt_p (stmt_info
)
1838 && !oprnd_info
->any_pattern
)
1840 /* But if there's a leading vector sized set of matching stmts
1841 fail here so we can split the group. This matches the condition
1842 vect_analyze_slp_instance uses. */
1843 /* ??? We might want to split here and combine the results to support
1844 multiple vector sizes better. */
1845 for (j
= 0; j
< group_size
; ++j
)
1848 if (!known_ge (j
, TYPE_VECTOR_SUBPARTS (vectype
)))
1850 if (dump_enabled_p ())
1851 dump_printf_loc (MSG_NOTE
, vect_location
,
1852 "Building vector operands from scalars\n");
1854 child
= vect_create_new_slp_node (oprnd_info
->ops
);
1855 children
.safe_push (child
);
1856 oprnd_info
->ops
= vNULL
;
1861 gcc_assert (child
== NULL
);
1862 FOR_EACH_VEC_ELT (children
, j
, child
)
1864 vect_free_slp_tree (child
);
1865 vect_free_oprnd_info (oprnds_info
);
1869 vect_free_oprnd_info (oprnds_info
);
1871 /* If we have all children of a child built up from uniform scalars
1872 or does more than one possibly expensive vector construction then
1873 just throw that away, causing it built up from scalars.
1874 The exception is the SLP node for the vector store. */
1875 if (is_a
<bb_vec_info
> (vinfo
)
1876 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1877 /* ??? Rejecting patterns this way doesn't work. We'd have to
1878 do extra work to cancel the pattern so the uses see the
1880 && !is_pattern_stmt_p (stmt_info
))
1884 bool all_uniform_p
= true;
1885 unsigned n_vector_builds
= 0;
1886 FOR_EACH_VEC_ELT (children
, j
, child
)
1890 else if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
1891 all_uniform_p
= false;
1892 else if (!vect_slp_tree_uniform_p (child
))
1894 all_uniform_p
= false;
1895 if (SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
1900 || n_vector_builds
> 1
1901 || (n_vector_builds
== children
.length ()
1902 && is_a
<gphi
*> (stmt_info
->stmt
)))
1906 FOR_EACH_VEC_ELT (children
, j
, child
)
1908 vect_free_slp_tree (child
);
1910 if (dump_enabled_p ())
1911 dump_printf_loc (MSG_NOTE
, vect_location
,
1912 "Building parent vector operands from "
1913 "scalars instead\n");
1918 *tree_size
+= this_tree_size
+ 1;
1919 *max_nunits
= this_max_nunits
;
1923 /* ??? We'd likely want to either cache in bst_map sth like
1924 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1925 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1926 explicit stmts to put in so the keying on 'stmts' doesn't
1927 work (but we have the same issue with nodes that use 'ops'). */
1928 slp_tree one
= new _slp_tree
;
1929 slp_tree two
= new _slp_tree
;
1930 SLP_TREE_DEF_TYPE (one
) = vect_internal_def
;
1931 SLP_TREE_DEF_TYPE (two
) = vect_internal_def
;
1932 SLP_TREE_VECTYPE (one
) = vectype
;
1933 SLP_TREE_VECTYPE (two
) = vectype
;
1934 SLP_TREE_CHILDREN (one
).safe_splice (children
);
1935 SLP_TREE_CHILDREN (two
).safe_splice (children
);
1937 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two
), i
, child
)
1938 SLP_TREE_REF_COUNT (child
)++;
1940 /* Here we record the original defs since this
1941 node represents the final lane configuration. */
1942 node
= vect_create_new_slp_node (node
, stmts
, 2);
1943 SLP_TREE_VECTYPE (node
) = vectype
;
1944 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1945 SLP_TREE_CHILDREN (node
).quick_push (one
);
1946 SLP_TREE_CHILDREN (node
).quick_push (two
);
1947 gassign
*stmt
= as_a
<gassign
*> (stmts
[0]->stmt
);
1948 enum tree_code code0
= gimple_assign_rhs_code (stmt
);
1949 enum tree_code ocode
= ERROR_MARK
;
1950 stmt_vec_info ostmt_info
;
1952 FOR_EACH_VEC_ELT (stmts
, i
, ostmt_info
)
1954 gassign
*ostmt
= as_a
<gassign
*> (ostmt_info
->stmt
);
1955 if (gimple_assign_rhs_code (ostmt
) != code0
)
1957 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (1, i
));
1958 ocode
= gimple_assign_rhs_code (ostmt
);
1962 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (0, i
));
1964 SLP_TREE_CODE (one
) = code0
;
1965 SLP_TREE_CODE (two
) = ocode
;
1966 SLP_TREE_LANES (one
) = stmts
.length ();
1967 SLP_TREE_LANES (two
) = stmts
.length ();
1968 SLP_TREE_REPRESENTATIVE (one
) = stmts
[0];
1969 SLP_TREE_REPRESENTATIVE (two
) = stmts
[j
];
1973 node
= vect_create_new_slp_node (node
, stmts
, nops
);
1974 SLP_TREE_VECTYPE (node
) = vectype
;
1975 SLP_TREE_CHILDREN (node
).splice (children
);
1979 /* Dump a single SLP tree NODE. */
1982 vect_print_slp_tree (dump_flags_t dump_kind
, dump_location_t loc
,
1987 stmt_vec_info stmt_info
;
1990 dump_metadata_t
metadata (dump_kind
, loc
.get_impl_location ());
1991 dump_user_location_t user_loc
= loc
.get_user_location ();
1992 dump_printf_loc (metadata
, user_loc
, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1993 SLP_TREE_DEF_TYPE (node
) == vect_external_def
1995 : (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
1998 estimated_poly_value (node
->max_nunits
),
1999 SLP_TREE_REF_COUNT (node
));
2000 if (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
)
2002 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
2003 dump_printf_loc (metadata
, user_loc
, "op: VEC_PERM_EXPR\n");
2005 dump_printf_loc (metadata
, user_loc
, "op template: %G",
2006 SLP_TREE_REPRESENTATIVE (node
)->stmt
);
2008 if (SLP_TREE_SCALAR_STMTS (node
).exists ())
2009 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2010 dump_printf_loc (metadata
, user_loc
, "\tstmt %u %G", i
, stmt_info
->stmt
);
2013 dump_printf_loc (metadata
, user_loc
, "\t{ ");
2014 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
2015 dump_printf (metadata
, "%T%s ", op
,
2016 i
< SLP_TREE_SCALAR_OPS (node
).length () - 1 ? "," : "");
2017 dump_printf (metadata
, "}\n");
2019 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2021 dump_printf_loc (metadata
, user_loc
, "\tload permutation {");
2022 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node
), i
, j
)
2023 dump_printf (dump_kind
, " %u", j
);
2024 dump_printf (dump_kind
, " }\n");
2026 if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
2028 dump_printf_loc (metadata
, user_loc
, "\tlane permutation {");
2029 for (i
= 0; i
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++i
)
2030 dump_printf (dump_kind
, " %u[%u]",
2031 SLP_TREE_LANE_PERMUTATION (node
)[i
].first
,
2032 SLP_TREE_LANE_PERMUTATION (node
)[i
].second
);
2033 dump_printf (dump_kind
, " }\n");
2035 if (SLP_TREE_CHILDREN (node
).is_empty ())
2037 dump_printf_loc (metadata
, user_loc
, "\tchildren");
2038 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2039 dump_printf (dump_kind
, " %p", (void *)child
);
2040 dump_printf (dump_kind
, "\n");
2044 debug (slp_tree node
)
2046 debug_dump_context ctx
;
2047 vect_print_slp_tree (MSG_NOTE
,
2048 dump_location_t::from_location_t (UNKNOWN_LOCATION
),
2052 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2055 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
2056 slp_tree node
, hash_set
<slp_tree
> &visited
)
2061 if (visited
.add (node
))
2064 vect_print_slp_tree (dump_kind
, loc
, node
);
2066 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2068 vect_print_slp_graph (dump_kind
, loc
, child
, visited
);
2072 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
2075 hash_set
<slp_tree
> visited
;
2076 vect_print_slp_graph (dump_kind
, loc
, entry
, visited
);
2079 /* Mark the tree rooted at NODE with PURE_SLP. */
2082 vect_mark_slp_stmts (slp_tree node
, hash_set
<slp_tree
> &visited
)
2085 stmt_vec_info stmt_info
;
2088 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2091 if (visited
.add (node
))
2094 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2095 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
2097 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2099 vect_mark_slp_stmts (child
, visited
);
2103 vect_mark_slp_stmts (slp_tree node
)
2105 hash_set
<slp_tree
> visited
;
2106 vect_mark_slp_stmts (node
, visited
);
2109 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2112 vect_mark_slp_stmts_relevant (slp_tree node
, hash_set
<slp_tree
> &visited
)
2115 stmt_vec_info stmt_info
;
2118 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2121 if (visited
.add (node
))
2124 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2126 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info
)
2127 || STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
);
2128 STMT_VINFO_RELEVANT (stmt_info
) = vect_used_in_scope
;
2131 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2133 vect_mark_slp_stmts_relevant (child
, visited
);
2137 vect_mark_slp_stmts_relevant (slp_tree node
)
2139 hash_set
<slp_tree
> visited
;
2140 vect_mark_slp_stmts_relevant (node
, visited
);
2144 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2147 vect_gather_slp_loads (vec
<slp_tree
> &loads
, slp_tree node
,
2148 hash_set
<slp_tree
> &visited
)
2150 if (!node
|| visited
.add (node
))
2153 if (SLP_TREE_CHILDREN (node
).length () == 0)
2155 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2157 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2158 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2159 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
2160 loads
.safe_push (node
);
2166 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2167 vect_gather_slp_loads (loads
, child
, visited
);
2172 /* Find the last store in SLP INSTANCE. */
2175 vect_find_last_scalar_stmt_in_slp (slp_tree node
)
2177 stmt_vec_info last
= NULL
;
2178 stmt_vec_info stmt_vinfo
;
2180 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2182 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2183 last
= last
? get_later_stmt (stmt_vinfo
, last
) : stmt_vinfo
;
2189 /* Find the first stmt in NODE. */
2192 vect_find_first_scalar_stmt_in_slp (slp_tree node
)
2194 stmt_vec_info first
= NULL
;
2195 stmt_vec_info stmt_vinfo
;
2197 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2199 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2201 || get_later_stmt (stmt_vinfo
, first
) == first
)
2208 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2209 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2210 (also containing the first GROUP1_SIZE stmts, since stores are
2211 consecutive), the second containing the remainder.
2212 Return the first stmt in the second group. */
2214 static stmt_vec_info
2215 vect_split_slp_store_group (stmt_vec_info first_vinfo
, unsigned group1_size
)
2217 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo
) == first_vinfo
);
2218 gcc_assert (group1_size
> 0);
2219 int group2_size
= DR_GROUP_SIZE (first_vinfo
) - group1_size
;
2220 gcc_assert (group2_size
> 0);
2221 DR_GROUP_SIZE (first_vinfo
) = group1_size
;
2223 stmt_vec_info stmt_info
= first_vinfo
;
2224 for (unsigned i
= group1_size
; i
> 1; i
--)
2226 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2227 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2229 /* STMT is now the last element of the first group. */
2230 stmt_vec_info group2
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2231 DR_GROUP_NEXT_ELEMENT (stmt_info
) = 0;
2233 DR_GROUP_SIZE (group2
) = group2_size
;
2234 for (stmt_info
= group2
; stmt_info
;
2235 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
))
2237 DR_GROUP_FIRST_ELEMENT (stmt_info
) = group2
;
2238 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2241 /* For the second group, the DR_GROUP_GAP is that before the original group,
2242 plus skipping over the first vector. */
2243 DR_GROUP_GAP (group2
) = DR_GROUP_GAP (first_vinfo
) + group1_size
;
2245 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2246 DR_GROUP_GAP (first_vinfo
) += group2_size
;
2248 if (dump_enabled_p ())
2249 dump_printf_loc (MSG_NOTE
, vect_location
, "Split group into %d and %d\n",
2250 group1_size
, group2_size
);
2255 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2256 statements and a vector of NUNITS elements. */
2259 calculate_unrolling_factor (poly_uint64 nunits
, unsigned int group_size
)
2261 return exact_div (common_multiple (nunits
, group_size
), group_size
);
2264 /* Helper that checks to see if a node is a load node. */
2267 vect_is_slp_load_node (slp_tree root
)
2269 return SLP_TREE_DEF_TYPE (root
) == vect_internal_def
2270 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root
))
2271 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root
)));
2275 /* Helper function of optimize_load_redistribution that performs the operation
2279 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t
*bst_map
,
2280 vec_info
*vinfo
, unsigned int group_size
,
2281 hash_map
<slp_tree
, slp_tree
> *load_map
,
2284 if (slp_tree
*leader
= load_map
->get (root
))
2287 load_map
->put (root
, NULL
);
2292 /* For now, we don't know anything about externals so do not do anything. */
2293 if (SLP_TREE_DEF_TYPE (root
) != vect_internal_def
)
2295 else if (SLP_TREE_CODE (root
) == VEC_PERM_EXPR
)
2297 /* First convert this node into a load node and add it to the leaves
2298 list and flatten the permute from a lane to a load one. If it's
2299 unneeded it will be elided later. */
2300 vec
<stmt_vec_info
> stmts
;
2301 stmts
.create (SLP_TREE_LANES (root
));
2302 lane_permutation_t lane_perm
= SLP_TREE_LANE_PERMUTATION (root
);
2303 for (unsigned j
= 0; j
< lane_perm
.length (); j
++)
2305 std::pair
<unsigned, unsigned> perm
= lane_perm
[j
];
2306 node
= SLP_TREE_CHILDREN (root
)[perm
.first
];
2308 if (!vect_is_slp_load_node (node
)
2309 || SLP_TREE_CHILDREN (node
).exists ())
2315 stmts
.quick_push (SLP_TREE_SCALAR_STMTS (node
)[perm
.second
]);
2318 if (dump_enabled_p ())
2319 dump_printf_loc (MSG_NOTE
, vect_location
,
2320 "converting stmts on permute node %p\n", root
);
2322 bool *matches
= XALLOCAVEC (bool, group_size
);
2323 poly_uint64 max_nunits
= 1;
2324 unsigned tree_size
= 0, limit
= 1;
2325 node
= vect_build_slp_tree (vinfo
, stmts
, group_size
, &max_nunits
,
2326 matches
, &limit
, &tree_size
, bst_map
);
2330 load_map
->put (root
, node
);
2335 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root
), i
, node
)
2338 = optimize_load_redistribution_1 (bst_map
, vinfo
, group_size
, load_map
,
2342 SLP_TREE_REF_COUNT (value
)++;
2343 SLP_TREE_CHILDREN (root
)[i
] = value
;
2344 vect_free_slp_tree (node
);
2351 /* Temporary workaround for loads not being CSEd during SLP build. This
2352 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2353 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2354 same DR such that the final operation is equal to a permuted load. Such
2355 NODES are then directly converted into LOADS themselves. The nodes are
2356 CSEd using BST_MAP. */
2359 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t
*bst_map
,
2360 vec_info
*vinfo
, unsigned int group_size
,
2361 hash_map
<slp_tree
, slp_tree
> *load_map
,
2367 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root
), i
, node
)
2370 = optimize_load_redistribution_1 (bst_map
, vinfo
, group_size
, load_map
,
2374 SLP_TREE_REF_COUNT (value
)++;
2375 SLP_TREE_CHILDREN (root
)[i
] = value
;
2376 vect_free_slp_tree (node
);
2381 /* Helper function of vect_match_slp_patterns.
2383 Attempts to match patterns against the slp tree rooted in REF_NODE using
2384 VINFO. Patterns are matched in post-order traversal.
2386 If matching is successful the value in REF_NODE is updated and returned, if
2387 not then it is returned unchanged. */
2390 vect_match_slp_patterns_2 (slp_tree
*ref_node
, vec_info
*vinfo
,
2391 slp_tree_to_load_perm_map_t
*perm_cache
,
2392 hash_set
<slp_tree
> *visited
)
2395 slp_tree node
= *ref_node
;
2396 bool found_p
= false;
2397 if (!node
|| visited
->add (node
))
2401 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2402 found_p
|= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node
)[i
],
2403 vinfo
, perm_cache
, visited
);
2405 for (unsigned x
= 0; x
< num__slp_patterns
; x
++)
2407 vect_pattern
*pattern
= slp_patterns
[x
] (perm_cache
, ref_node
);
2410 pattern
->build (vinfo
);
2419 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
2422 The modified tree is returned. Patterns are tried in order and multiple
2423 patterns may match. */
2426 vect_match_slp_patterns (slp_instance instance
, vec_info
*vinfo
,
2427 hash_set
<slp_tree
> *visited
,
2428 slp_tree_to_load_perm_map_t
*perm_cache
)
2430 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2431 slp_tree
*ref_node
= &SLP_INSTANCE_TREE (instance
);
2433 if (dump_enabled_p ())
2434 dump_printf_loc (MSG_NOTE
, vect_location
,
2435 "Analyzing SLP tree %p for patterns\n",
2436 SLP_INSTANCE_TREE (instance
));
2438 return vect_match_slp_patterns_2 (ref_node
, vinfo
, perm_cache
, visited
);
2441 /* Analyze an SLP instance starting from a group of grouped stores. Call
2442 vect_build_slp_tree to build a tree of packed stmts if possible.
2443 Return FALSE if it's impossible to SLP any stmt in the loop. */
2446 vect_analyze_slp_instance (vec_info
*vinfo
,
2447 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2448 stmt_vec_info stmt_info
, slp_instance_kind kind
,
2449 unsigned max_tree_size
, unsigned *limit
);
2451 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2452 of KIND. Return true if successful. */
2455 vect_build_slp_instance (vec_info
*vinfo
,
2456 slp_instance_kind kind
,
2457 vec
<stmt_vec_info
> &scalar_stmts
,
2458 stmt_vec_info root_stmt_info
,
2459 unsigned max_tree_size
, unsigned *limit
,
2460 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2461 /* ??? We need stmt_info for group splitting. */
2462 stmt_vec_info stmt_info_
)
2464 if (dump_enabled_p ())
2466 dump_printf_loc (MSG_NOTE
, vect_location
,
2467 "Starting SLP discovery for\n");
2468 for (unsigned i
= 0; i
< scalar_stmts
.length (); ++i
)
2469 dump_printf_loc (MSG_NOTE
, vect_location
,
2470 " %G", scalar_stmts
[i
]->stmt
);
2473 /* Build the tree for the SLP instance. */
2474 unsigned int group_size
= scalar_stmts
.length ();
2475 bool *matches
= XALLOCAVEC (bool, group_size
);
2476 poly_uint64 max_nunits
= 1;
2477 unsigned tree_size
= 0;
2479 slp_tree node
= vect_build_slp_tree (vinfo
, scalar_stmts
, group_size
,
2480 &max_nunits
, matches
, limit
,
2481 &tree_size
, bst_map
);
2484 /* Calculate the unrolling factor based on the smallest type. */
2485 poly_uint64 unrolling_factor
2486 = calculate_unrolling_factor (max_nunits
, group_size
);
2488 if (maybe_ne (unrolling_factor
, 1U)
2489 && is_a
<bb_vec_info
> (vinfo
))
2491 unsigned HOST_WIDE_INT const_max_nunits
;
2492 if (!max_nunits
.is_constant (&const_max_nunits
)
2493 || const_max_nunits
> group_size
)
2495 if (dump_enabled_p ())
2496 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2497 "Build SLP failed: store group "
2498 "size not a multiple of the vector size "
2499 "in basic block SLP\n");
2500 vect_free_slp_tree (node
);
2503 /* Fatal mismatch. */
2504 if (dump_enabled_p ())
2505 dump_printf_loc (MSG_NOTE
, vect_location
,
2506 "SLP discovery succeeded but node needs "
2508 memset (matches
, true, group_size
);
2509 matches
[group_size
/ const_max_nunits
* const_max_nunits
] = false;
2510 vect_free_slp_tree (node
);
2514 /* Create a new SLP instance. */
2515 slp_instance new_instance
= XNEW (class _slp_instance
);
2516 SLP_INSTANCE_TREE (new_instance
) = node
;
2517 SLP_INSTANCE_UNROLLING_FACTOR (new_instance
) = unrolling_factor
;
2518 SLP_INSTANCE_LOADS (new_instance
) = vNULL
;
2519 SLP_INSTANCE_ROOT_STMT (new_instance
) = root_stmt_info
;
2520 SLP_INSTANCE_KIND (new_instance
) = kind
;
2521 new_instance
->reduc_phis
= NULL
;
2522 new_instance
->cost_vec
= vNULL
;
2523 new_instance
->subgraph_entries
= vNULL
;
2525 if (dump_enabled_p ())
2526 dump_printf_loc (MSG_NOTE
, vect_location
,
2527 "SLP size %u vs. limit %u.\n",
2528 tree_size
, max_tree_size
);
2530 /* Fixup SLP reduction chains. */
2531 if (kind
== slp_inst_kind_reduc_chain
)
2533 /* If this is a reduction chain with a conversion in front
2534 amend the SLP tree with a node for that. */
2536 = vect_orig_stmt (scalar_stmts
[group_size
- 1])->stmt
;
2537 if (STMT_VINFO_DEF_TYPE (scalar_stmts
[0]) != vect_reduction_def
)
2539 /* Get at the conversion stmt - we know it's the single use
2540 of the last stmt of the reduction chain. */
2541 use_operand_p use_p
;
2542 bool r
= single_imm_use (gimple_assign_lhs (scalar_def
),
2543 &use_p
, &scalar_def
);
2545 stmt_vec_info next_info
= vinfo
->lookup_stmt (scalar_def
);
2546 next_info
= vect_stmt_to_vectorize (next_info
);
2547 scalar_stmts
= vNULL
;
2548 scalar_stmts
.create (group_size
);
2549 for (unsigned i
= 0; i
< group_size
; ++i
)
2550 scalar_stmts
.quick_push (next_info
);
2551 slp_tree conv
= vect_create_new_slp_node (scalar_stmts
, 1);
2552 SLP_TREE_VECTYPE (conv
) = STMT_VINFO_VECTYPE (next_info
);
2553 SLP_TREE_CHILDREN (conv
).quick_push (node
);
2554 SLP_INSTANCE_TREE (new_instance
) = conv
;
2555 /* We also have to fake this conversion stmt as SLP reduction
2556 group so we don't have to mess with too much code
2558 REDUC_GROUP_FIRST_ELEMENT (next_info
) = next_info
;
2559 REDUC_GROUP_NEXT_ELEMENT (next_info
) = NULL
;
2561 /* Fill the backedge child of the PHI SLP node. The
2562 general matching code cannot find it because the
2563 scalar code does not reflect how we vectorize the
2565 use_operand_p use_p
;
2566 imm_use_iterator imm_iter
;
2567 class loop
*loop
= LOOP_VINFO_LOOP (as_a
<loop_vec_info
> (vinfo
));
2568 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
,
2569 gimple_get_lhs (scalar_def
))
2570 /* There are exactly two non-debug uses, the reduction
2571 PHI and the loop-closed PHI node. */
2572 if (!is_gimple_debug (USE_STMT (use_p
))
2573 && gimple_bb (USE_STMT (use_p
)) == loop
->header
)
2575 auto_vec
<stmt_vec_info
, 64> phis (group_size
);
2576 stmt_vec_info phi_info
2577 = vinfo
->lookup_stmt (USE_STMT (use_p
));
2578 for (unsigned i
= 0; i
< group_size
; ++i
)
2579 phis
.quick_push (phi_info
);
2580 slp_tree
*phi_node
= bst_map
->get (phis
);
2581 unsigned dest_idx
= loop_latch_edge (loop
)->dest_idx
;
2582 SLP_TREE_CHILDREN (*phi_node
)[dest_idx
]
2583 = SLP_INSTANCE_TREE (new_instance
);
2584 SLP_INSTANCE_TREE (new_instance
)->refcnt
++;
2588 vinfo
->slp_instances
.safe_push (new_instance
);
2590 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2591 the number of scalar stmts in the root in a few places.
2592 Verify that assumption holds. */
2593 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance
))
2594 .length () == group_size
);
2596 if (dump_enabled_p ())
2598 dump_printf_loc (MSG_NOTE
, vect_location
,
2599 "Final SLP tree for instance %p:\n", new_instance
);
2600 vect_print_slp_graph (MSG_NOTE
, vect_location
,
2601 SLP_INSTANCE_TREE (new_instance
));
2609 /* Failed to SLP. */
2610 /* Free the allocated memory. */
2611 scalar_stmts
.release ();
2614 stmt_vec_info stmt_info
= stmt_info_
;
2615 /* Try to break the group up into pieces. */
2616 if (kind
== slp_inst_kind_store
)
2618 /* ??? We could delay all the actual splitting of store-groups
2619 until after SLP discovery of the original group completed.
2620 Then we can recurse to vect_build_slp_instance directly. */
2621 for (i
= 0; i
< group_size
; i
++)
2625 /* For basic block SLP, try to break the group up into multiples of
2627 if (is_a
<bb_vec_info
> (vinfo
)
2628 && (i
> 1 && i
< group_size
))
2631 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info
)));
2632 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
2633 1 << floor_log2 (i
));
2634 unsigned HOST_WIDE_INT const_nunits
;
2636 && TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
))
2638 /* Split into two groups at the first vector boundary. */
2639 gcc_assert ((const_nunits
& (const_nunits
- 1)) == 0);
2640 unsigned group1_size
= i
& ~(const_nunits
- 1);
2642 if (dump_enabled_p ())
2643 dump_printf_loc (MSG_NOTE
, vect_location
,
2644 "Splitting SLP group at stmt %u\n", i
);
2645 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2647 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2648 kind
, max_tree_size
,
2650 /* Split the rest at the failure point and possibly
2651 re-analyze the remaining matching part if it has
2652 at least two lanes. */
2654 && (i
+ 1 < group_size
2655 || i
- group1_size
> 1))
2657 stmt_vec_info rest2
= rest
;
2658 rest
= vect_split_slp_store_group (rest
, i
- group1_size
);
2659 if (i
- group1_size
> 1)
2660 res
|= vect_analyze_slp_instance (vinfo
, bst_map
, rest2
,
2661 kind
, max_tree_size
,
2664 /* Re-analyze the non-matching tail if it has at least
2666 if (i
+ 1 < group_size
)
2667 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2668 rest
, kind
, max_tree_size
,
2674 /* For loop vectorization split into arbitrary pieces of size > 1. */
2675 if (is_a
<loop_vec_info
> (vinfo
)
2676 && (i
> 1 && i
< group_size
))
2678 unsigned group1_size
= i
;
2680 if (dump_enabled_p ())
2681 dump_printf_loc (MSG_NOTE
, vect_location
,
2682 "Splitting SLP group at stmt %u\n", i
);
2684 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2686 /* Loop vectorization cannot handle gaps in stores, make sure
2687 the split group appears as strided. */
2688 STMT_VINFO_STRIDED_P (rest
) = 1;
2689 DR_GROUP_GAP (rest
) = 0;
2690 STMT_VINFO_STRIDED_P (stmt_info
) = 1;
2691 DR_GROUP_GAP (stmt_info
) = 0;
2693 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2694 kind
, max_tree_size
, limit
);
2695 if (i
+ 1 < group_size
)
2696 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2697 rest
, kind
, max_tree_size
, limit
);
2702 /* Even though the first vector did not all match, we might be able to SLP
2703 (some) of the remainder. FORNOW ignore this possibility. */
2706 /* Failed to SLP. */
2707 if (dump_enabled_p ())
2708 dump_printf_loc (MSG_NOTE
, vect_location
, "SLP discovery failed\n");
2713 /* Analyze an SLP instance starting from a group of grouped stores. Call
2714 vect_build_slp_tree to build a tree of packed stmts if possible.
2715 Return FALSE if it's impossible to SLP any stmt in the loop. */
2718 vect_analyze_slp_instance (vec_info
*vinfo
,
2719 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2720 stmt_vec_info stmt_info
,
2721 slp_instance_kind kind
,
2722 unsigned max_tree_size
, unsigned *limit
)
2725 vec
<stmt_vec_info
> scalar_stmts
;
2727 if (is_a
<bb_vec_info
> (vinfo
))
2728 vect_location
= stmt_info
->stmt
;
2730 stmt_vec_info next_info
= stmt_info
;
2731 if (kind
== slp_inst_kind_store
)
2733 /* Collect the stores and store them in scalar_stmts. */
2734 scalar_stmts
.create (DR_GROUP_SIZE (stmt_info
));
2737 scalar_stmts
.quick_push (vect_stmt_to_vectorize (next_info
));
2738 next_info
= DR_GROUP_NEXT_ELEMENT (next_info
);
2741 else if (kind
== slp_inst_kind_reduc_chain
)
2743 /* Collect the reduction stmts and store them in scalar_stmts. */
2744 scalar_stmts
.create (REDUC_GROUP_SIZE (stmt_info
));
2747 scalar_stmts
.quick_push (vect_stmt_to_vectorize (next_info
));
2748 next_info
= REDUC_GROUP_NEXT_ELEMENT (next_info
);
2750 /* Mark the first element of the reduction chain as reduction to properly
2751 transform the node. In the reduction analysis phase only the last
2752 element of the chain is marked as reduction. */
2753 STMT_VINFO_DEF_TYPE (stmt_info
)
2754 = STMT_VINFO_DEF_TYPE (scalar_stmts
.last ());
2755 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))
2756 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts
.last ()));
2758 else if (kind
== slp_inst_kind_ctor
)
2760 tree rhs
= gimple_assign_rhs1 (stmt_info
->stmt
);
2762 scalar_stmts
.create (CONSTRUCTOR_NELTS (rhs
));
2763 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), i
, val
)
2765 stmt_vec_info def_info
= vinfo
->lookup_def (val
);
2766 def_info
= vect_stmt_to_vectorize (def_info
);
2767 scalar_stmts
.quick_push (def_info
);
2769 if (dump_enabled_p ())
2770 dump_printf_loc (MSG_NOTE
, vect_location
,
2771 "Analyzing vectorizable constructor: %G\n",
2774 else if (kind
== slp_inst_kind_reduc_group
)
2776 /* Collect reduction statements. */
2777 vec
<stmt_vec_info
> reductions
= as_a
<loop_vec_info
> (vinfo
)->reductions
;
2778 scalar_stmts
.create (reductions
.length ());
2779 for (i
= 0; reductions
.iterate (i
, &next_info
); i
++)
2780 if (STMT_VINFO_RELEVANT_P (next_info
)
2781 || STMT_VINFO_LIVE_P (next_info
))
2782 scalar_stmts
.quick_push (next_info
);
2783 /* If less than two were relevant/live there's nothing to SLP. */
2784 if (scalar_stmts
.length () < 2)
2790 /* Build the tree for the SLP instance. */
2791 bool res
= vect_build_slp_instance (vinfo
, kind
, scalar_stmts
,
2792 kind
== slp_inst_kind_ctor
2794 max_tree_size
, limit
, bst_map
,
2795 kind
== slp_inst_kind_store
2796 ? stmt_info
: NULL
);
2798 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2799 where we should do store group splitting. */
2804 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
2805 trees of packed scalar stmts if SLP is possible. */
2808 vect_analyze_slp (vec_info
*vinfo
, unsigned max_tree_size
)
2811 stmt_vec_info first_element
;
2812 slp_instance instance
;
2814 DUMP_VECT_SCOPE ("vect_analyze_slp");
2816 unsigned limit
= max_tree_size
;
2818 scalar_stmts_to_slp_tree_map_t
*bst_map
2819 = new scalar_stmts_to_slp_tree_map_t ();
2821 /* Find SLP sequences starting from groups of grouped stores. */
2822 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
2823 vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
2824 STMT_VINFO_GROUPED_ACCESS (first_element
)
2825 ? slp_inst_kind_store
: slp_inst_kind_ctor
,
2826 max_tree_size
, &limit
);
2828 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
2830 for (unsigned i
= 0; i
< bb_vinfo
->roots
.length (); ++i
)
2832 vect_location
= bb_vinfo
->roots
[i
].root
->stmt
;
2833 if (vect_build_slp_instance (bb_vinfo
, bb_vinfo
->roots
[i
].kind
,
2834 bb_vinfo
->roots
[i
].stmts
,
2835 bb_vinfo
->roots
[i
].root
,
2836 max_tree_size
, &limit
, bst_map
, NULL
))
2837 bb_vinfo
->roots
[i
].stmts
= vNULL
;
2841 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
2843 /* Find SLP sequences starting from reduction chains. */
2844 FOR_EACH_VEC_ELT (loop_vinfo
->reduction_chains
, i
, first_element
)
2845 if (! STMT_VINFO_RELEVANT_P (first_element
)
2846 && ! STMT_VINFO_LIVE_P (first_element
))
2848 else if (! vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
2849 slp_inst_kind_reduc_chain
,
2850 max_tree_size
, &limit
))
2852 /* Dissolve reduction chain group. */
2853 stmt_vec_info vinfo
= first_element
;
2854 stmt_vec_info last
= NULL
;
2857 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
2858 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2859 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2863 STMT_VINFO_DEF_TYPE (first_element
) = vect_internal_def
;
2864 /* It can be still vectorized as part of an SLP reduction. */
2865 loop_vinfo
->reductions
.safe_push (last
);
2868 /* Find SLP sequences starting from groups of reductions. */
2869 if (loop_vinfo
->reductions
.length () > 1)
2870 vect_analyze_slp_instance (vinfo
, bst_map
, loop_vinfo
->reductions
[0],
2871 slp_inst_kind_reduc_group
, max_tree_size
,
2875 hash_set
<slp_tree
> visited_patterns
;
2876 slp_tree_to_load_perm_map_t perm_cache
;
2877 hash_map
<slp_tree
, slp_tree
> load_map
;
2879 /* See if any patterns can be found in the SLP tree. */
2880 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo
), i
, instance
)
2881 if (vect_match_slp_patterns (instance
, vinfo
, &visited_patterns
,
2884 slp_tree root
= SLP_INSTANCE_TREE (instance
);
2885 optimize_load_redistribution (bst_map
, vinfo
, SLP_TREE_LANES (root
),
2887 if (dump_enabled_p ())
2889 dump_printf_loc (MSG_NOTE
, vect_location
,
2890 "Pattern matched SLP tree\n");
2891 vect_print_slp_graph (MSG_NOTE
, vect_location
, root
);
2897 /* The map keeps a reference on SLP nodes built, release that. */
2898 for (scalar_stmts_to_slp_tree_map_t::iterator it
= bst_map
->begin ();
2899 it
!= bst_map
->end (); ++it
)
2901 vect_free_slp_tree ((*it
).second
);
2904 return opt_result::success ();
2907 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2910 vect_slp_build_vertices (hash_set
<slp_tree
> &visited
, slp_tree node
,
2911 vec
<slp_tree
> &vertices
, vec
<int> &leafs
)
2916 if (visited
.add (node
))
2919 node
->vertex
= vertices
.length ();
2920 vertices
.safe_push (node
);
2923 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2927 vect_slp_build_vertices (visited
, child
, vertices
, leafs
);
2930 leafs
.safe_push (node
->vertex
);
2933 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2936 vect_slp_build_vertices (vec_info
*info
, vec
<slp_tree
> &vertices
,
2939 hash_set
<slp_tree
> visited
;
2941 slp_instance instance
;
2942 FOR_EACH_VEC_ELT (info
->slp_instances
, i
, instance
)
2944 unsigned n_v
= vertices
.length ();
2945 unsigned n_l
= leafs
.length ();
2946 vect_slp_build_vertices (visited
, SLP_INSTANCE_TREE (instance
), vertices
,
2948 /* If we added vertices but no entries to the reverse graph we've
2949 added a cycle that is not backwards-reachable. Push the entry
2950 to mimic as leaf then. */
2951 if (vertices
.length () > n_v
2952 && leafs
.length () == n_l
)
2953 leafs
.safe_push (SLP_INSTANCE_TREE (instance
)->vertex
);
2957 /* Apply (reverse) bijectite PERM to VEC. */
2961 vect_slp_permute (vec
<unsigned> perm
,
2962 vec
<T
> &vec
, bool reverse
)
2964 auto_vec
<T
, 64> saved
;
2965 saved
.create (vec
.length ());
2966 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2967 saved
.quick_push (vec
[i
]);
2971 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2972 vec
[perm
[i
]] = saved
[i
];
2973 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2974 gcc_assert (vec
[perm
[i
]] == saved
[i
]);
2978 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2979 vec
[i
] = saved
[perm
[i
]];
2980 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2981 gcc_assert (vec
[i
] == saved
[perm
[i
]]);
2985 /* Return whether permutations PERM_A and PERM_B as recorded in the
2986 PERMS vector are equal. */
2989 vect_slp_perms_eq (const vec
<vec
<unsigned> > &perms
,
2990 int perm_a
, int perm_b
)
2992 return (perm_a
== perm_b
2993 || (perms
[perm_a
].length () == perms
[perm_b
].length ()
2994 && memcmp (&perms
[perm_a
][0], &perms
[perm_b
][0],
2995 sizeof (unsigned) * perms
[perm_a
].length ()) == 0));
2998 /* Optimize the SLP graph of VINFO. */
3001 vect_optimize_slp (vec_info
*vinfo
)
3003 if (vinfo
->slp_instances
.is_empty ())
3008 auto_vec
<slp_tree
> vertices
;
3009 auto_vec
<int> leafs
;
3010 vect_slp_build_vertices (vinfo
, vertices
, leafs
);
3012 struct graph
*slpg
= new_graph (vertices
.length ());
3013 FOR_EACH_VEC_ELT (vertices
, i
, node
)
3017 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
3019 add_edge (slpg
, i
, child
->vertex
);
3022 /* Compute (reverse) postorder on the inverted graph. */
3024 graphds_dfs (slpg
, &leafs
[0], leafs
.length (), &ipo
, false, NULL
, NULL
);
3026 auto_sbitmap
n_visited (vertices
.length ());
3027 auto_sbitmap
n_materialize (vertices
.length ());
3028 auto_vec
<int> n_perm (vertices
.length ());
3029 auto_vec
<vec
<unsigned> > perms
;
3031 bitmap_clear (n_visited
);
3032 bitmap_clear (n_materialize
);
3033 n_perm
.quick_grow_cleared (vertices
.length ());
3034 perms
.safe_push (vNULL
); /* zero is no permute */
3036 /* Produce initial permutations. */
3037 for (i
= 0; i
< leafs
.length (); ++i
)
3040 slp_tree node
= vertices
[idx
];
3042 /* Handle externals and constants optimistically throughout the
3044 if (SLP_TREE_DEF_TYPE (node
) == vect_external_def
3045 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
)
3048 /* Leafs do not change across iterations. Note leafs also double
3049 as entries to the reverse graph. */
3050 if (!slpg
->vertices
[idx
].succ
)
3051 bitmap_set_bit (n_visited
, idx
);
3052 /* Loads are the only thing generating permutes. */
3053 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3056 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3057 node unpermuted, record this permute. */
3058 stmt_vec_info dr_stmt
= SLP_TREE_REPRESENTATIVE (node
);
3059 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt
))
3061 dr_stmt
= DR_GROUP_FIRST_ELEMENT (dr_stmt
);
3062 unsigned imin
= DR_GROUP_SIZE (dr_stmt
) + 1, imax
= 0;
3063 bool any_permute
= false;
3064 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3066 unsigned idx
= SLP_TREE_LOAD_PERMUTATION (node
)[j
];
3067 imin
= MIN (imin
, idx
);
3068 imax
= MAX (imax
, idx
);
3069 if (idx
- SLP_TREE_LOAD_PERMUTATION (node
)[0] != j
)
3072 /* If there's no permute no need to split one out. */
3075 /* If the span doesn't match we'd disrupt VF computation, avoid
3077 if (imax
- imin
+ 1 != SLP_TREE_LANES (node
))
3080 /* For now only handle true permutes, like
3081 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3082 when permuting constants and invariants keeping the permute
3084 auto_sbitmap
load_index (SLP_TREE_LANES (node
));
3085 bitmap_clear (load_index
);
3086 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3087 bitmap_set_bit (load_index
, SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
);
3089 for (j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3090 if (!bitmap_bit_p (load_index
, j
))
3092 if (j
!= SLP_TREE_LANES (node
))
3095 vec
<unsigned> perm
= vNULL
;
3096 perm
.safe_grow (SLP_TREE_LANES (node
), true);
3097 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3098 perm
[j
] = SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
;
3099 perms
.safe_push (perm
);
3100 n_perm
[idx
] = perms
.length () - 1;
3103 /* Propagate permutes along the graph and compute materialization points. */
3105 unsigned iteration
= 0;
3111 for (i
= vertices
.length (); i
> 0 ; --i
)
3114 slp_tree node
= vertices
[idx
];
3115 /* For leafs there's nothing to do - we've seeded permutes
3117 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
3120 bitmap_set_bit (n_visited
, idx
);
3122 /* We cannot move a permute across a store. */
3123 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))
3125 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))))
3129 for (graph_edge
*succ
= slpg
->vertices
[idx
].succ
;
3130 succ
; succ
= succ
->succ_next
)
3132 int succ_idx
= succ
->dest
;
3133 /* Handle unvisited nodes optimistically. */
3134 /* ??? But for constants once we want to handle non-bijective
3135 permutes we have to verify the permute, when unifying lanes,
3136 will not unify different constants. For example see
3137 gcc.dg/vect/bb-slp-14.c for a case that would break. */
3138 if (!bitmap_bit_p (n_visited
, succ_idx
))
3140 int succ_perm
= n_perm
[succ_idx
];
3141 /* Once we materialize succs permutation its output lanes
3142 appear unpermuted to us. */
3143 if (bitmap_bit_p (n_materialize
, succ_idx
))
3147 else if (succ_perm
== 0)
3152 else if (!vect_slp_perms_eq (perms
, perm
, succ_perm
))
3160 /* Pick up pre-computed leaf values. */
3162 else if (!vect_slp_perms_eq (perms
, perm
, n_perm
[idx
]))
3165 /* Make sure we eventually converge. */
3166 gcc_checking_assert (perm
== 0);
3169 bitmap_clear_bit (n_materialize
, idx
);
3176 /* Elide pruning at materialization points in the first
3177 iteration so every node was visited once at least. */
3181 /* Decide on permute materialization. Look whether there's
3182 a use (pred) edge that is permuted differently than us.
3183 In that case mark ourselves so the permutation is applied.
3184 For VEC_PERM_EXPRs the permutation doesn't carry along
3185 from children to parents so force materialization at the
3186 point of the VEC_PERM_EXPR. In principle VEC_PERM_EXPRs
3187 are a source of an arbitrary permutation again, similar
3188 to constants/externals - that's something we do not yet
3189 optimally handle. */
3190 bool all_preds_permuted
= (SLP_TREE_CODE (node
) != VEC_PERM_EXPR
3191 && slpg
->vertices
[idx
].pred
!= NULL
);
3192 if (all_preds_permuted
)
3193 for (graph_edge
*pred
= slpg
->vertices
[idx
].pred
;
3194 pred
; pred
= pred
->pred_next
)
3196 gcc_checking_assert (bitmap_bit_p (n_visited
, pred
->src
));
3197 int pred_perm
= n_perm
[pred
->src
];
3198 if (!vect_slp_perms_eq (perms
, perm
, pred_perm
))
3200 all_preds_permuted
= false;
3204 if (!all_preds_permuted
)
3206 if (!bitmap_bit_p (n_materialize
, idx
))
3208 bitmap_set_bit (n_materialize
, idx
);
3212 while (changed
|| iteration
== 1);
3215 for (i
= 0; i
< vertices
.length (); ++i
)
3217 int perm
= n_perm
[i
];
3221 slp_tree node
= vertices
[i
];
3223 /* First permute invariant/external original successors. */
3226 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
3228 if (!child
|| SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3231 /* If the vector is uniform there's nothing to do. */
3232 if (vect_slp_tree_uniform_p (child
))
3235 /* We can end up sharing some externals via two_operator
3236 handling. Be prepared to unshare those. */
3237 if (child
->refcnt
!= 1)
3239 gcc_assert (slpg
->vertices
[child
->vertex
].pred
->pred_next
);
3240 SLP_TREE_CHILDREN (node
)[j
] = child
3241 = vect_create_new_slp_node
3242 (SLP_TREE_SCALAR_OPS (child
).copy ());
3244 vect_slp_permute (perms
[perm
],
3245 SLP_TREE_SCALAR_OPS (child
), true);
3248 if (bitmap_bit_p (n_materialize
, i
))
3250 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3251 /* For loads simply drop the permutation, the load permutation
3252 already performs the desired permutation. */
3254 else if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
3256 /* If the node is already a permute node we can apply
3257 the permutation to the lane selection, effectively
3258 materializing it on the incoming vectors. */
3259 if (dump_enabled_p ())
3260 dump_printf_loc (MSG_NOTE
, vect_location
,
3261 "simplifying permute node %p\n",
3264 for (unsigned k
= 0;
3265 k
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++k
)
3266 SLP_TREE_LANE_PERMUTATION (node
)[k
].second
3267 = perms
[perm
][SLP_TREE_LANE_PERMUTATION (node
)[k
].second
];
3271 if (dump_enabled_p ())
3272 dump_printf_loc (MSG_NOTE
, vect_location
,
3273 "inserting permute node in place of %p\n",
3276 /* Make a copy of NODE and in-place change it to a
3277 VEC_PERM node to permute the lanes of the copy. */
3278 slp_tree copy
= new _slp_tree
;
3279 SLP_TREE_CHILDREN (copy
) = SLP_TREE_CHILDREN (node
);
3280 SLP_TREE_CHILDREN (node
) = vNULL
;
3281 SLP_TREE_SCALAR_STMTS (copy
)
3282 = SLP_TREE_SCALAR_STMTS (node
).copy ();
3283 vect_slp_permute (perms
[perm
],
3284 SLP_TREE_SCALAR_STMTS (copy
), true);
3285 gcc_assert (!SLP_TREE_SCALAR_OPS (node
).exists ());
3286 SLP_TREE_REPRESENTATIVE (copy
) = SLP_TREE_REPRESENTATIVE (node
);
3287 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node
).exists ());
3288 SLP_TREE_LANE_PERMUTATION (copy
)
3289 = SLP_TREE_LANE_PERMUTATION (node
);
3290 SLP_TREE_LANE_PERMUTATION (node
) = vNULL
;
3291 SLP_TREE_VECTYPE (copy
) = SLP_TREE_VECTYPE (node
);
3293 copy
->max_nunits
= node
->max_nunits
;
3294 SLP_TREE_DEF_TYPE (copy
) = SLP_TREE_DEF_TYPE (node
);
3295 SLP_TREE_LANES (copy
) = SLP_TREE_LANES (node
);
3296 SLP_TREE_CODE (copy
) = SLP_TREE_CODE (node
);
3298 /* Now turn NODE into a VEC_PERM. */
3299 SLP_TREE_CHILDREN (node
).safe_push (copy
);
3300 SLP_TREE_LANE_PERMUTATION (node
).create (SLP_TREE_LANES (node
));
3301 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3302 SLP_TREE_LANE_PERMUTATION (node
)
3303 .quick_push (std::make_pair (0, perms
[perm
][j
]));
3304 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
3309 /* Apply the reverse permutation to our stmts. */
3310 vect_slp_permute (perms
[perm
],
3311 SLP_TREE_SCALAR_STMTS (node
), true);
3312 /* And to the load permutation, which we can simply
3313 make regular by design. */
3314 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3316 /* ??? When we handle non-bijective permutes the idea
3317 is that we can force the load-permutation to be
3318 { min, min + 1, min + 2, ... max }. But then the
3319 scalar defs might no longer match the lane content
3320 which means wrong-code with live lane vectorization.
3321 So we possibly have to have NULL entries for those. */
3322 vect_slp_permute (perms
[perm
],
3323 SLP_TREE_LOAD_PERMUTATION (node
), true);
3328 /* Free the perms vector used for propagation. */
3329 while (!perms
.is_empty ())
3330 perms
.pop ().release ();
3334 /* Now elide load permutations that are not necessary. */
3335 for (i
= 0; i
< leafs
.length (); ++i
)
3337 node
= vertices
[leafs
[i
]];
3338 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3341 /* In basic block vectorization we allow any subchain of an interleaving
3343 FORNOW: not in loop SLP because of realignment complications. */
3344 if (is_a
<bb_vec_info
> (vinfo
))
3346 bool subchain_p
= true;
3347 stmt_vec_info next_load_info
= NULL
;
3348 stmt_vec_info load_info
;
3350 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
3353 && (next_load_info
!= load_info
3354 || DR_GROUP_GAP (load_info
) != 1))
3359 next_load_info
= DR_GROUP_NEXT_ELEMENT (load_info
);
3363 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3369 stmt_vec_info load_info
;
3370 bool this_load_permuted
= false;
3372 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
3373 if (SLP_TREE_LOAD_PERMUTATION (node
)[j
] != j
)
3375 this_load_permuted
= true;
3378 stmt_vec_info first_stmt_info
3379 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
3380 if (!this_load_permuted
3381 /* The load requires permutation when unrolling exposes
3382 a gap either because the group is larger than the SLP
3383 group-size or because there is a gap between the groups. */
3384 && (known_eq (LOOP_VINFO_VECT_FACTOR
3385 (as_a
<loop_vec_info
> (vinfo
)), 1U)
3386 || ((SLP_TREE_LANES (node
) == DR_GROUP_SIZE (first_stmt_info
))
3387 && DR_GROUP_GAP (first_stmt_info
) == 0)))
3389 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3396 /* Gather loads reachable from the individual SLP graph entries. */
3399 vect_gather_slp_loads (vec_info
*vinfo
)
3402 slp_instance instance
;
3403 FOR_EACH_VEC_ELT (vinfo
->slp_instances
, i
, instance
)
3405 hash_set
<slp_tree
> visited
;
3406 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance
),
3407 SLP_INSTANCE_TREE (instance
), visited
);
3412 /* For each possible SLP instance decide whether to SLP it and calculate overall
3413 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3414 least one instance. */
3417 vect_make_slp_decision (loop_vec_info loop_vinfo
)
3420 poly_uint64 unrolling_factor
= 1;
3421 vec
<slp_instance
> slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
3422 slp_instance instance
;
3423 int decided_to_slp
= 0;
3425 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3427 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
3429 /* FORNOW: SLP if you can. */
3430 /* All unroll factors have the form:
3432 GET_MODE_SIZE (vinfo->vector_mode) * X
3434 for some rational X, so they must have a common multiple. */
3436 = force_common_multiple (unrolling_factor
,
3437 SLP_INSTANCE_UNROLLING_FACTOR (instance
));
3439 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3440 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3441 loop-based vectorization. Such stmts will be marked as HYBRID. */
3442 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
3446 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
) = unrolling_factor
;
3448 if (decided_to_slp
&& dump_enabled_p ())
3450 dump_printf_loc (MSG_NOTE
, vect_location
,
3451 "Decided to SLP %d instances. Unrolling factor ",
3453 dump_dec (MSG_NOTE
, unrolling_factor
);
3454 dump_printf (MSG_NOTE
, "\n");
3457 return (decided_to_slp
> 0);
3460 /* Private data for vect_detect_hybrid_slp. */
3463 loop_vec_info loop_vinfo
;
3464 vec
<stmt_vec_info
> *worklist
;
3467 /* Walker for walk_gimple_op. */
3470 vect_detect_hybrid_slp (tree
*tp
, int *, void *data
)
3472 walk_stmt_info
*wi
= (walk_stmt_info
*)data
;
3473 vdhs_data
*dat
= (vdhs_data
*)wi
->info
;
3478 stmt_vec_info def_stmt_info
= dat
->loop_vinfo
->lookup_def (*tp
);
3481 def_stmt_info
= vect_stmt_to_vectorize (def_stmt_info
);
3482 if (PURE_SLP_STMT (def_stmt_info
))
3484 if (dump_enabled_p ())
3485 dump_printf_loc (MSG_NOTE
, vect_location
, "marking hybrid: %G",
3486 def_stmt_info
->stmt
);
3487 STMT_SLP_TYPE (def_stmt_info
) = hybrid
;
3488 dat
->worklist
->safe_push (def_stmt_info
);
3494 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
3495 if so, otherwise pushing it to WORKLIST. */
3498 maybe_push_to_hybrid_worklist (vec_info
*vinfo
,
3499 vec
<stmt_vec_info
> &worklist
,
3500 stmt_vec_info stmt_info
)
3502 if (dump_enabled_p ())
3503 dump_printf_loc (MSG_NOTE
, vect_location
,
3504 "Processing hybrid candidate : %G", stmt_info
->stmt
);
3505 stmt_vec_info orig_info
= vect_orig_stmt (stmt_info
);
3506 imm_use_iterator iter2
;
3508 use_operand_p use_p
;
3509 def_operand_p def_p
;
3510 bool any_def
= false;
3511 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_info
->stmt
, iter1
, SSA_OP_DEF
)
3514 FOR_EACH_IMM_USE_FAST (use_p
, iter2
, DEF_FROM_PTR (def_p
))
3516 if (is_gimple_debug (USE_STMT (use_p
)))
3518 stmt_vec_info use_info
= vinfo
->lookup_stmt (USE_STMT (use_p
));
3519 /* An out-of loop use means this is a loop_vect sink. */
3522 if (dump_enabled_p ())
3523 dump_printf_loc (MSG_NOTE
, vect_location
,
3524 "Found loop_vect sink: %G", stmt_info
->stmt
);
3525 worklist
.safe_push (stmt_info
);
3528 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info
)))
3530 if (dump_enabled_p ())
3531 dump_printf_loc (MSG_NOTE
, vect_location
,
3532 "Found loop_vect use: %G", use_info
->stmt
);
3533 worklist
.safe_push (stmt_info
);
3538 /* No def means this is a loo_vect sink. */
3541 if (dump_enabled_p ())
3542 dump_printf_loc (MSG_NOTE
, vect_location
,
3543 "Found loop_vect sink: %G", stmt_info
->stmt
);
3544 worklist
.safe_push (stmt_info
);
3547 if (dump_enabled_p ())
3548 dump_printf_loc (MSG_NOTE
, vect_location
,
3549 "Marked SLP consumed stmt pure: %G", stmt_info
->stmt
);
3550 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
3553 /* Find stmts that must be both vectorized and SLPed. */
3556 vect_detect_hybrid_slp (loop_vec_info loop_vinfo
)
3558 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3560 /* All stmts participating in SLP are marked pure_slp, all other
3561 stmts are loop_vect.
3562 First collect all loop_vect stmts into a worklist.
3563 SLP patterns cause not all original scalar stmts to appear in
3564 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
3565 Rectify this here and do a backward walk over the IL only considering
3566 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
3567 mark them as pure_slp. */
3568 auto_vec
<stmt_vec_info
> worklist
;
3569 for (int i
= LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
- 1; i
>= 0; --i
)
3571 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
3572 for (gphi_iterator gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
);
3575 gphi
*phi
= gsi
.phi ();
3576 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (phi
);
3577 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
3578 maybe_push_to_hybrid_worklist (loop_vinfo
,
3579 worklist
, stmt_info
);
3581 for (gimple_stmt_iterator gsi
= gsi_last_bb (bb
); !gsi_end_p (gsi
);
3584 gimple
*stmt
= gsi_stmt (gsi
);
3585 if (is_gimple_debug (stmt
))
3587 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
3588 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
3590 for (gimple_stmt_iterator gsi2
3591 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
));
3592 !gsi_end_p (gsi2
); gsi_next (&gsi2
))
3594 stmt_vec_info patt_info
3595 = loop_vinfo
->lookup_stmt (gsi_stmt (gsi2
));
3596 if (!STMT_SLP_TYPE (patt_info
)
3597 && STMT_VINFO_RELEVANT (patt_info
))
3598 maybe_push_to_hybrid_worklist (loop_vinfo
,
3599 worklist
, patt_info
);
3601 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
3603 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
3604 maybe_push_to_hybrid_worklist (loop_vinfo
,
3605 worklist
, stmt_info
);
3609 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3610 mark any SLP vectorized stmt as hybrid.
3611 ??? We're visiting def stmts N times (once for each non-SLP and
3612 once for each hybrid-SLP use). */
3615 dat
.worklist
= &worklist
;
3616 dat
.loop_vinfo
= loop_vinfo
;
3617 memset (&wi
, 0, sizeof (wi
));
3618 wi
.info
= (void *)&dat
;
3619 while (!worklist
.is_empty ())
3621 stmt_vec_info stmt_info
= worklist
.pop ();
3622 /* Since SSA operands are not set up for pattern stmts we need
3623 to use walk_gimple_op. */
3625 walk_gimple_op (stmt_info
->stmt
, vect_detect_hybrid_slp
, &wi
);
3630 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3632 _bb_vec_info::_bb_vec_info (vec
<basic_block
> _bbs
, vec_info_shared
*shared
)
3633 : vec_info (vec_info::bb
, init_cost (NULL
), shared
), bbs (_bbs
), roots (vNULL
)
3635 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
3638 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
3641 gphi
*phi
= si
.phi ();
3642 gimple_set_uid (phi
, 0);
3645 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3646 !gsi_end_p (gsi
); gsi_next (&gsi
))
3648 gimple
*stmt
= gsi_stmt (gsi
);
3649 gimple_set_uid (stmt
, 0);
3650 if (is_gimple_debug (stmt
))
3658 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3659 stmts in the basic block. */
3661 _bb_vec_info::~_bb_vec_info ()
3663 /* Reset region marker. */
3664 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
3667 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
3670 gphi
*phi
= si
.phi ();
3671 gimple_set_uid (phi
, -1);
3673 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3674 !gsi_end_p (gsi
); gsi_next (&gsi
))
3676 gimple
*stmt
= gsi_stmt (gsi
);
3677 gimple_set_uid (stmt
, -1);
3681 for (unsigned i
= 0; i
< roots
.length (); ++i
)
3682 roots
[i
].stmts
.release ();
3686 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3687 given then that child nodes have already been processed, and that
3688 their def types currently match their SLP node's def type. */
3691 vect_slp_analyze_node_operations_1 (vec_info
*vinfo
, slp_tree node
,
3692 slp_instance node_instance
,
3693 stmt_vector_for_cost
*cost_vec
)
3695 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
3696 gcc_assert (STMT_SLP_TYPE (stmt_info
) != loop_vect
);
3698 /* Calculate the number of vector statements to be created for the
3699 scalar stmts in this node. For SLP reductions it is equal to the
3700 number of vector statements in the children (which has already been
3701 calculated by the recursive call). Otherwise it is the number of
3702 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3703 VF divided by the number of elements in a vector. */
3704 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
3705 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
3707 for (unsigned i
= 0; i
< SLP_TREE_CHILDREN (node
).length (); ++i
)
3708 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node
)[i
]) == vect_internal_def
)
3710 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
3711 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node
)[i
]);
3718 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3719 vf
= loop_vinfo
->vectorization_factor
;
3722 unsigned int group_size
= SLP_TREE_LANES (node
);
3723 tree vectype
= SLP_TREE_VECTYPE (node
);
3724 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
3725 = vect_get_num_vectors (vf
* group_size
, vectype
);
3728 /* Handle purely internal nodes. */
3729 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
3730 return vectorizable_slp_permutation (vinfo
, NULL
, node
, cost_vec
);
3732 if (is_a
<bb_vec_info
> (vinfo
)
3733 && !vect_update_shared_vectype (stmt_info
, SLP_TREE_VECTYPE (node
)))
3735 if (dump_enabled_p ())
3736 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3737 "desired vector type conflicts with earlier one "
3738 "for %G", stmt_info
->stmt
);
3743 return vect_analyze_stmt (vinfo
, stmt_info
, &dummy
,
3744 node
, node_instance
, cost_vec
);
3747 /* Try to build NODE from scalars, returning true on success.
3748 NODE_INSTANCE is the SLP instance that contains NODE. */
3751 vect_slp_convert_to_external (vec_info
*vinfo
, slp_tree node
,
3752 slp_instance node_instance
)
3754 stmt_vec_info stmt_info
;
3757 if (!is_a
<bb_vec_info
> (vinfo
)
3758 || node
== SLP_INSTANCE_TREE (node_instance
)
3759 || !SLP_TREE_SCALAR_STMTS (node
).exists ()
3760 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node
)))
3763 if (dump_enabled_p ())
3764 dump_printf_loc (MSG_NOTE
, vect_location
,
3765 "Building vector operands of %p from scalars instead\n", node
);
3767 /* Don't remove and free the child nodes here, since they could be
3768 referenced by other structures. The analysis and scheduling phases
3769 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3770 unsigned int group_size
= SLP_TREE_LANES (node
);
3771 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
3772 SLP_TREE_SCALAR_OPS (node
).safe_grow (group_size
, true);
3773 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3774 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3776 tree lhs
= gimple_get_lhs (vect_orig_stmt (stmt_info
)->stmt
);
3777 SLP_TREE_SCALAR_OPS (node
)[i
] = lhs
;
3782 /* Compute the prologue cost for invariant or constant operands represented
3786 vect_prologue_cost_for_slp (slp_tree node
,
3787 stmt_vector_for_cost
*cost_vec
)
3789 /* There's a special case of an existing vector, that costs nothing. */
3790 if (SLP_TREE_SCALAR_OPS (node
).length () == 0
3791 && !SLP_TREE_VEC_DEFS (node
).is_empty ())
3793 /* Without looking at the actual initializer a vector of
3794 constants can be implemented as load from the constant pool.
3795 When all elements are the same we can use a splat. */
3796 tree vectype
= SLP_TREE_VECTYPE (node
);
3797 unsigned group_size
= SLP_TREE_SCALAR_OPS (node
).length ();
3798 unsigned num_vects_to_check
;
3799 unsigned HOST_WIDE_INT const_nunits
;
3800 unsigned nelt_limit
;
3801 if (TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
)
3802 && ! multiple_p (const_nunits
, group_size
))
3804 num_vects_to_check
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
3805 nelt_limit
= const_nunits
;
3809 /* If either the vector has variable length or the vectors
3810 are composed of repeated whole groups we only need to
3811 cost construction once. All vectors will be the same. */
3812 num_vects_to_check
= 1;
3813 nelt_limit
= group_size
;
3815 tree elt
= NULL_TREE
;
3817 for (unsigned j
= 0; j
< num_vects_to_check
* nelt_limit
; ++j
)
3819 unsigned si
= j
% group_size
;
3821 elt
= SLP_TREE_SCALAR_OPS (node
)[si
];
3822 /* ??? We're just tracking whether all operands of a single
3823 vector initializer are the same, ideally we'd check if
3824 we emitted the same one already. */
3825 else if (elt
!= SLP_TREE_SCALAR_OPS (node
)[si
])
3828 if (nelt
== nelt_limit
)
3830 record_stmt_cost (cost_vec
, 1,
3831 SLP_TREE_DEF_TYPE (node
) == vect_external_def
3832 ? (elt
? scalar_to_vec
: vec_construct
)
3834 NULL
, vectype
, 0, vect_prologue
);
3840 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3841 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3843 Return true if the operations are supported. */
3846 vect_slp_analyze_node_operations (vec_info
*vinfo
, slp_tree node
,
3847 slp_instance node_instance
,
3848 hash_set
<slp_tree
> &visited_set
,
3849 vec
<slp_tree
> &visited_vec
,
3850 stmt_vector_for_cost
*cost_vec
)
3855 /* Assume we can code-generate all invariants. */
3857 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
3858 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
3861 if (SLP_TREE_DEF_TYPE (node
) == vect_uninitialized_def
)
3863 if (dump_enabled_p ())
3864 dump_printf_loc (MSG_NOTE
, vect_location
,
3865 "Failed cyclic SLP reference in %p", node
);
3868 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
);
3870 /* If we already analyzed the exact same set of scalar stmts we're done.
3871 We share the generated vector stmts for those. */
3872 if (visited_set
.add (node
))
3874 visited_vec
.safe_push (node
);
3877 unsigned visited_rec_start
= visited_vec
.length ();
3878 unsigned cost_vec_rec_start
= cost_vec
->length ();
3879 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3881 res
= vect_slp_analyze_node_operations (vinfo
, child
, node_instance
,
3882 visited_set
, visited_vec
,
3889 res
= vect_slp_analyze_node_operations_1 (vinfo
, node
, node_instance
,
3891 /* If analysis failed we have to pop all recursive visited nodes
3895 while (visited_vec
.length () >= visited_rec_start
)
3896 visited_set
.remove (visited_vec
.pop ());
3897 cost_vec
->truncate (cost_vec_rec_start
);
3900 /* When the node can be vectorized cost invariant nodes it references.
3901 This is not done in DFS order to allow the refering node
3902 vectorizable_* calls to nail down the invariant nodes vector type
3903 and possibly unshare it if it needs a different vector type than
3906 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
3908 && (SLP_TREE_DEF_TYPE (child
) == vect_constant_def
3909 || SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
3910 /* Perform usual caching, note code-generation still
3911 code-gens these nodes multiple times but we expect
3912 to CSE them later. */
3913 && !visited_set
.add (child
))
3915 visited_vec
.safe_push (child
);
3916 /* ??? After auditing more code paths make a "default"
3917 and push the vector type from NODE to all children
3918 if it is not already set. */
3919 /* Compute the number of vectors to be generated. */
3920 tree vector_type
= SLP_TREE_VECTYPE (child
);
3923 /* For shifts with a scalar argument we don't need
3924 to cost or code-generate anything.
3925 ??? Represent this more explicitely. */
3926 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node
))
3927 == shift_vec_info_type
)
3931 unsigned group_size
= SLP_TREE_LANES (child
);
3933 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3934 vf
= loop_vinfo
->vectorization_factor
;
3935 SLP_TREE_NUMBER_OF_VEC_STMTS (child
)
3936 = vect_get_num_vectors (vf
* group_size
, vector_type
);
3937 /* And cost them. */
3938 vect_prologue_cost_for_slp (child
, cost_vec
);
3941 /* If this node or any of its children can't be vectorized, try pruning
3942 the tree here rather than felling the whole thing. */
3943 if (!res
&& vect_slp_convert_to_external (vinfo
, node
, node_instance
))
3945 /* We'll need to revisit this for invariant costing and number
3946 of vectorized stmt setting. */
3954 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3955 region and that can be vectorized using vectorizable_live_operation
3956 with STMT_VINFO_LIVE_P. Not handled live operations will cause the
3957 scalar code computing it to be retained. */
3960 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo
, slp_tree node
,
3961 slp_instance instance
,
3962 stmt_vector_for_cost
*cost_vec
,
3963 hash_set
<stmt_vec_info
> &svisited
,
3964 hash_set
<slp_tree
> &visited
)
3966 if (visited
.add (node
))
3970 stmt_vec_info stmt_info
;
3971 stmt_vec_info last_stmt
= vect_find_last_scalar_stmt_in_slp (node
);
3972 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3974 if (svisited
.contains (stmt_info
))
3976 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
3977 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
)
3978 && STMT_VINFO_RELATED_STMT (orig_stmt_info
) != stmt_info
)
3979 /* Only the pattern root stmt computes the original scalar value. */
3981 bool mark_visited
= true;
3982 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
3983 ssa_op_iter op_iter
;
3984 def_operand_p def_p
;
3985 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
3987 imm_use_iterator use_iter
;
3989 stmt_vec_info use_stmt_info
;
3990 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3991 if (!is_gimple_debug (use_stmt
))
3993 use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
);
3995 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
3997 STMT_VINFO_LIVE_P (stmt_info
) = true;
3998 if (vectorizable_live_operation (bb_vinfo
, stmt_info
,
3999 NULL
, node
, instance
, i
,
4001 /* ??? So we know we can vectorize the live stmt
4002 from one SLP node. If we cannot do so from all
4003 or none consistently we'd have to record which
4004 SLP node (and lane) we want to use for the live
4005 operation. So make sure we can code-generate
4007 mark_visited
= false;
4009 STMT_VINFO_LIVE_P (stmt_info
) = false;
4013 /* We have to verify whether we can insert the lane extract
4014 before all uses. The following is a conservative approximation.
4015 We cannot put this into vectorizable_live_operation because
4016 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4018 Note that while the fact that we emit code for loads at the
4019 first load should make this a non-problem leafs we construct
4020 from scalars are vectorized after the last scalar def.
4021 ??? If we'd actually compute the insert location during
4022 analysis we could use sth less conservative than the last
4023 scalar stmt in the node for the dominance check. */
4024 /* ??? What remains is "live" uses in vector CTORs in the same
4025 SLP graph which is where those uses can end up code-generated
4026 right after their definition instead of close to their original
4027 use. But that would restrict us to code-generate lane-extracts
4028 from the latest stmt in a node. So we compensate for this
4029 during code-generation, simply not replacing uses for those
4030 hopefully rare cases. */
4031 if (STMT_VINFO_LIVE_P (stmt_info
))
4032 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
4033 if (!is_gimple_debug (use_stmt
)
4034 && (!(use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
))
4035 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
4036 && !vect_stmt_dominates_stmt_p (last_stmt
->stmt
, use_stmt
))
4038 if (dump_enabled_p ())
4039 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4040 "Cannot determine insertion place for "
4042 STMT_VINFO_LIVE_P (stmt_info
) = false;
4043 mark_visited
= true;
4047 svisited
.add (stmt_info
);
4051 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4052 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
4053 vect_bb_slp_mark_live_stmts (bb_vinfo
, child
, instance
,
4054 cost_vec
, svisited
, visited
);
4057 /* Analyze statements in SLP instances of VINFO. Return true if the
4058 operations are supported. */
4061 vect_slp_analyze_operations (vec_info
*vinfo
)
4063 slp_instance instance
;
4066 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4068 hash_set
<slp_tree
> visited
;
4069 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); )
4071 auto_vec
<slp_tree
> visited_vec
;
4072 stmt_vector_for_cost cost_vec
;
4073 cost_vec
.create (2);
4074 if (is_a
<bb_vec_info
> (vinfo
))
4075 vect_location
= instance
->location ();
4076 if (!vect_slp_analyze_node_operations (vinfo
,
4077 SLP_INSTANCE_TREE (instance
),
4078 instance
, visited
, visited_vec
,
4080 /* Instances with a root stmt require vectorized defs for the
4082 || (SLP_INSTANCE_ROOT_STMT (instance
)
4083 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance
))
4084 != vect_internal_def
)))
4086 slp_tree node
= SLP_INSTANCE_TREE (instance
);
4087 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
4088 if (dump_enabled_p ())
4089 dump_printf_loc (MSG_NOTE
, vect_location
,
4090 "removing SLP instance operations starting from: %G",
4092 vect_free_slp_instance (instance
);
4093 vinfo
->slp_instances
.ordered_remove (i
);
4094 cost_vec
.release ();
4095 while (!visited_vec
.is_empty ())
4096 visited
.remove (visited_vec
.pop ());
4102 /* For BB vectorization remember the SLP graph entry
4104 if (is_a
<bb_vec_info
> (vinfo
))
4105 instance
->cost_vec
= cost_vec
;
4108 add_stmt_costs (vinfo
, vinfo
->target_cost_data
, &cost_vec
);
4109 cost_vec
.release ();
4114 /* Compute vectorizable live stmts. */
4115 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
4117 hash_set
<stmt_vec_info
> svisited
;
4118 hash_set
<slp_tree
> visited
;
4119 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
4121 vect_location
= instance
->location ();
4122 vect_bb_slp_mark_live_stmts (bb_vinfo
, SLP_INSTANCE_TREE (instance
),
4123 instance
, &instance
->cost_vec
, svisited
,
4128 return !vinfo
->slp_instances
.is_empty ();
4131 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
4132 closing the eventual chain. */
4135 get_ultimate_leader (slp_instance instance
,
4136 hash_map
<slp_instance
, slp_instance
> &instance_leader
)
4138 auto_vec
<slp_instance
*, 8> chain
;
4140 while (*(tem
= instance_leader
.get (instance
)) != instance
)
4142 chain
.safe_push (tem
);
4145 while (!chain
.is_empty ())
4146 *chain
.pop () = instance
;
4150 /* Worker of vect_bb_partition_graph, recurse on NODE. */
4153 vect_bb_partition_graph_r (bb_vec_info bb_vinfo
,
4154 slp_instance instance
, slp_tree node
,
4155 hash_map
<stmt_vec_info
, slp_instance
> &stmt_to_instance
,
4156 hash_map
<slp_instance
, slp_instance
> &instance_leader
,
4157 hash_set
<slp_tree
> &visited
)
4159 stmt_vec_info stmt_info
;
4162 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
4165 slp_instance
&stmt_instance
4166 = stmt_to_instance
.get_or_insert (stmt_info
, &existed_p
);
4169 else if (stmt_instance
!= instance
)
4171 /* If we're running into a previously marked stmt make us the
4172 leader of the current ultimate leader. This keeps the
4173 leader chain acyclic and works even when the current instance
4174 connects two previously independent graph parts. */
4175 slp_instance stmt_leader
4176 = get_ultimate_leader (stmt_instance
, instance_leader
);
4177 if (stmt_leader
!= instance
)
4178 instance_leader
.put (stmt_leader
, instance
);
4180 stmt_instance
= instance
;
4183 if (visited
.add (node
))
4187 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4188 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
4189 vect_bb_partition_graph_r (bb_vinfo
, instance
, child
, stmt_to_instance
,
4190 instance_leader
, visited
);
4193 /* Partition the SLP graph into pieces that can be costed independently. */
4196 vect_bb_partition_graph (bb_vec_info bb_vinfo
)
4198 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
4200 /* First walk the SLP graph assigning each involved scalar stmt a
4201 corresponding SLP graph entry and upon visiting a previously
4202 marked stmt, make the stmts leader the current SLP graph entry. */
4203 hash_map
<stmt_vec_info
, slp_instance
> stmt_to_instance
;
4204 hash_map
<slp_instance
, slp_instance
> instance_leader
;
4205 hash_set
<slp_tree
> visited
;
4206 slp_instance instance
;
4207 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
4209 instance_leader
.put (instance
, instance
);
4210 vect_bb_partition_graph_r (bb_vinfo
,
4211 instance
, SLP_INSTANCE_TREE (instance
),
4212 stmt_to_instance
, instance_leader
,
4216 /* Then collect entries to each independent subgraph. */
4217 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
4219 slp_instance leader
= get_ultimate_leader (instance
, instance_leader
);
4220 leader
->subgraph_entries
.safe_push (instance
);
4221 if (dump_enabled_p ()
4222 && leader
!= instance
)
4223 dump_printf_loc (MSG_NOTE
, vect_location
,
4224 "instance %p is leader of %p\n",
4229 /* Compute the scalar cost of the SLP node NODE and its children
4230 and return it. Do not account defs that are marked in LIFE and
4231 update LIFE according to uses of NODE. */
4234 vect_bb_slp_scalar_cost (vec_info
*vinfo
,
4235 slp_tree node
, vec
<bool, va_heap
> *life
,
4236 stmt_vector_for_cost
*cost_vec
,
4237 hash_set
<slp_tree
> &visited
)
4240 stmt_vec_info stmt_info
;
4243 if (visited
.add (node
))
4246 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
4248 ssa_op_iter op_iter
;
4249 def_operand_p def_p
;
4254 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
4255 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
4257 /* If there is a non-vectorized use of the defs then the scalar
4258 stmt is kept live in which case we do not account it or any
4259 required defs in the SLP children in the scalar cost. This
4260 way we make the vectorization more costly when compared to
4262 if (!STMT_VINFO_LIVE_P (stmt_info
))
4264 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
4266 imm_use_iterator use_iter
;
4268 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
4269 if (!is_gimple_debug (use_stmt
))
4271 stmt_vec_info use_stmt_info
= vinfo
->lookup_stmt (use_stmt
);
4274 (vect_stmt_to_vectorize (use_stmt_info
)))
4285 /* Count scalar stmts only once. */
4286 if (gimple_visited_p (orig_stmt
))
4288 gimple_set_visited (orig_stmt
, true);
4290 vect_cost_for_stmt kind
;
4291 if (STMT_VINFO_DATA_REF (orig_stmt_info
))
4293 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info
)))
4296 kind
= scalar_store
;
4298 else if (vect_nop_conversion_p (orig_stmt_info
))
4300 /* For single-argument PHIs assume coalescing which means zero cost
4301 for the scalar and the vector PHIs. This avoids artificially
4302 favoring the vector path (but may pessimize it in some cases). */
4303 else if (is_a
<gphi
*> (orig_stmt_info
->stmt
)
4304 && gimple_phi_num_args
4305 (as_a
<gphi
*> (orig_stmt_info
->stmt
)) == 1)
4309 record_stmt_cost (cost_vec
, 1, kind
, orig_stmt_info
,
4310 SLP_TREE_VECTYPE (node
), 0, vect_body
);
4313 auto_vec
<bool, 20> subtree_life
;
4314 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4316 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
4318 /* Do not directly pass LIFE to the recursive call, copy it to
4319 confine changes in the callee to the current child/subtree. */
4320 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
4322 subtree_life
.safe_grow_cleared (SLP_TREE_LANES (child
), true);
4323 for (unsigned j
= 0;
4324 j
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++j
)
4326 auto perm
= SLP_TREE_LANE_PERMUTATION (node
)[j
];
4327 if (perm
.first
== i
)
4328 subtree_life
[perm
.second
] = (*life
)[j
];
4333 gcc_assert (SLP_TREE_LANES (node
) == SLP_TREE_LANES (child
));
4334 subtree_life
.safe_splice (*life
);
4336 vect_bb_slp_scalar_cost (vinfo
, child
, &subtree_life
, cost_vec
,
4338 subtree_life
.truncate (0);
4343 /* Comparator for the loop-index sorted cost vectors. */
4346 li_cost_vec_cmp (const void *a_
, const void *b_
)
4348 auto *a
= (const std::pair
<unsigned, stmt_info_for_cost
*> *)a_
;
4349 auto *b
= (const std::pair
<unsigned, stmt_info_for_cost
*> *)b_
;
4350 if (a
->first
< b
->first
)
4352 else if (a
->first
== b
->first
)
4357 /* Check if vectorization of the basic block is profitable for the
4358 subgraph denoted by SLP_INSTANCES. */
4361 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo
,
4362 vec
<slp_instance
> slp_instances
)
4364 slp_instance instance
;
4366 unsigned int vec_inside_cost
= 0, vec_outside_cost
= 0, scalar_cost
= 0;
4367 unsigned int vec_prologue_cost
= 0, vec_epilogue_cost
= 0;
4369 if (dump_enabled_p ())
4371 dump_printf_loc (MSG_NOTE
, vect_location
, "Costing subgraph: \n");
4372 hash_set
<slp_tree
> visited
;
4373 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
4374 vect_print_slp_graph (MSG_NOTE
, vect_location
,
4375 SLP_INSTANCE_TREE (instance
), visited
);
4378 /* Calculate scalar cost and sum the cost for the vector stmts
4379 previously collected. */
4380 stmt_vector_for_cost scalar_costs
= vNULL
;
4381 stmt_vector_for_cost vector_costs
= vNULL
;
4382 hash_set
<slp_tree
> visited
;
4383 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
4385 auto_vec
<bool, 20> life
;
4386 life
.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance
)),
4388 if (SLP_INSTANCE_ROOT_STMT (instance
))
4389 record_stmt_cost (&scalar_costs
, 1, scalar_stmt
,
4390 SLP_INSTANCE_ROOT_STMT (instance
), 0, vect_body
);
4391 vect_bb_slp_scalar_cost (bb_vinfo
,
4392 SLP_INSTANCE_TREE (instance
),
4393 &life
, &scalar_costs
, visited
);
4394 vector_costs
.safe_splice (instance
->cost_vec
);
4395 instance
->cost_vec
.release ();
4397 /* Unset visited flag. */
4398 stmt_info_for_cost
*cost
;
4399 FOR_EACH_VEC_ELT (scalar_costs
, i
, cost
)
4400 gimple_set_visited (cost
->stmt_info
->stmt
, false);
4402 if (dump_enabled_p ())
4403 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
4405 /* When costing non-loop vectorization we need to consider each covered
4406 loop independently and make sure vectorization is profitable. For
4407 now we assume a loop may be not entered or executed an arbitrary
4408 number of iterations (??? static information can provide more
4409 precise info here) which means we can simply cost each containing
4410 loops stmts separately. */
4412 /* First produce cost vectors sorted by loop index. */
4413 auto_vec
<std::pair
<unsigned, stmt_info_for_cost
*> >
4414 li_scalar_costs (scalar_costs
.length ());
4415 auto_vec
<std::pair
<unsigned, stmt_info_for_cost
*> >
4416 li_vector_costs (vector_costs
.length ());
4417 FOR_EACH_VEC_ELT (scalar_costs
, i
, cost
)
4419 unsigned l
= gimple_bb (cost
->stmt_info
->stmt
)->loop_father
->num
;
4420 li_scalar_costs
.quick_push (std::make_pair (l
, cost
));
4422 /* Use a random used loop as fallback in case the first vector_costs
4423 entry does not have a stmt_info associated with it. */
4424 unsigned l
= li_scalar_costs
[0].first
;
4425 FOR_EACH_VEC_ELT (vector_costs
, i
, cost
)
4427 /* We inherit from the previous COST, invariants, externals and
4428 extracts immediately follow the cost for the related stmt. */
4429 if (cost
->stmt_info
)
4430 l
= gimple_bb (cost
->stmt_info
->stmt
)->loop_father
->num
;
4431 li_vector_costs
.quick_push (std::make_pair (l
, cost
));
4433 li_scalar_costs
.qsort (li_cost_vec_cmp
);
4434 li_vector_costs
.qsort (li_cost_vec_cmp
);
4436 /* Now cost the portions individually. */
4439 while (si
< li_scalar_costs
.length ()
4440 && vi
< li_vector_costs
.length ())
4442 unsigned sl
= li_scalar_costs
[si
].first
;
4443 unsigned vl
= li_vector_costs
[vi
].first
;
4446 if (dump_enabled_p ())
4447 dump_printf_loc (MSG_NOTE
, vect_location
,
4448 "Scalar %d and vector %d loop part do not "
4449 "match up, skipping scalar part\n", sl
, vl
);
4450 /* Skip the scalar part, assuming zero cost on the vector side. */
4455 while (si
< li_scalar_costs
.length ()
4456 && li_scalar_costs
[si
].first
== sl
);
4460 void *scalar_target_cost_data
= init_cost (NULL
);
4463 add_stmt_cost (bb_vinfo
, scalar_target_cost_data
,
4464 li_scalar_costs
[si
].second
);
4467 while (si
< li_scalar_costs
.length ()
4468 && li_scalar_costs
[si
].first
== sl
);
4470 finish_cost (scalar_target_cost_data
, &dummy
, &scalar_cost
, &dummy
);
4471 destroy_cost_data (scalar_target_cost_data
);
4473 /* Complete the target-specific vector cost calculation. */
4474 void *vect_target_cost_data
= init_cost (NULL
);
4477 add_stmt_cost (bb_vinfo
, vect_target_cost_data
,
4478 li_vector_costs
[vi
].second
);
4481 while (vi
< li_vector_costs
.length ()
4482 && li_vector_costs
[vi
].first
== vl
);
4483 finish_cost (vect_target_cost_data
, &vec_prologue_cost
,
4484 &vec_inside_cost
, &vec_epilogue_cost
);
4485 destroy_cost_data (vect_target_cost_data
);
4487 vec_outside_cost
= vec_prologue_cost
+ vec_epilogue_cost
;
4489 if (dump_enabled_p ())
4491 dump_printf_loc (MSG_NOTE
, vect_location
,
4492 "Cost model analysis for part in loop %d:\n", sl
);
4493 dump_printf (MSG_NOTE
, " Vector cost: %d\n",
4494 vec_inside_cost
+ vec_outside_cost
);
4495 dump_printf (MSG_NOTE
, " Scalar cost: %d\n", scalar_cost
);
4498 /* Vectorization is profitable if its cost is more than the cost of scalar
4499 version. Note that we err on the vector side for equal cost because
4500 the cost estimate is otherwise quite pessimistic (constant uses are
4501 free on the scalar side but cost a load on the vector side for
4503 if (vec_outside_cost
+ vec_inside_cost
> scalar_cost
)
4505 scalar_costs
.release ();
4506 vector_costs
.release ();
4510 if (vi
< li_vector_costs
.length ())
4512 if (dump_enabled_p ())
4513 dump_printf_loc (MSG_NOTE
, vect_location
,
4514 "Excess vector cost for part in loop %d:\n",
4515 li_vector_costs
[vi
].first
);
4516 scalar_costs
.release ();
4517 vector_costs
.release ();
4521 scalar_costs
.release ();
4522 vector_costs
.release ();
4526 /* qsort comparator for lane defs. */
4529 vld_cmp (const void *a_
, const void *b_
)
4531 auto *a
= (const std::pair
<unsigned, tree
> *)a_
;
4532 auto *b
= (const std::pair
<unsigned, tree
> *)b_
;
4533 return a
->first
- b
->first
;
4536 /* Return true if USE_STMT is a vector lane insert into VEC and set
4537 *THIS_LANE to the lane number that is set. */
4540 vect_slp_is_lane_insert (gimple
*use_stmt
, tree vec
, unsigned *this_lane
)
4542 gassign
*use_ass
= dyn_cast
<gassign
*> (use_stmt
);
4544 || gimple_assign_rhs_code (use_ass
) != BIT_INSERT_EXPR
4546 ? gimple_assign_rhs1 (use_ass
) != vec
4547 : ((vec
= gimple_assign_rhs1 (use_ass
)), false))
4548 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec
)),
4549 TREE_TYPE (gimple_assign_rhs2 (use_ass
)))
4550 || !constant_multiple_p
4551 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass
)),
4552 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec
)))),
4558 /* Find any vectorizable constructors and add them to the grouped_store
4562 vect_slp_check_for_constructors (bb_vec_info bb_vinfo
)
4564 for (unsigned i
= 0; i
< bb_vinfo
->bbs
.length (); ++i
)
4565 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb_vinfo
->bbs
[i
]);
4566 !gsi_end_p (gsi
); gsi_next (&gsi
))
4568 gassign
*assign
= dyn_cast
<gassign
*> (gsi_stmt (gsi
));
4572 tree rhs
= gimple_assign_rhs1 (assign
);
4573 if (gimple_assign_rhs_code (assign
) == CONSTRUCTOR
)
4575 if (!VECTOR_TYPE_P (TREE_TYPE (rhs
))
4576 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)),
4577 CONSTRUCTOR_NELTS (rhs
))
4578 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs
, 0)->value
))
4579 || uniform_vector_p (rhs
))
4584 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), j
, val
)
4585 if (TREE_CODE (val
) != SSA_NAME
4586 || !bb_vinfo
->lookup_def (val
))
4588 if (j
!= CONSTRUCTOR_NELTS (rhs
))
4591 stmt_vec_info stmt_info
= bb_vinfo
->lookup_stmt (assign
);
4592 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
4594 else if (gimple_assign_rhs_code (assign
) == BIT_INSERT_EXPR
4595 && VECTOR_TYPE_P (TREE_TYPE (rhs
))
4596 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)).is_constant ()
4597 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)).to_constant () > 1
4598 && integer_zerop (gimple_assign_rhs3 (assign
))
4599 && useless_type_conversion_p
4600 (TREE_TYPE (TREE_TYPE (rhs
)),
4601 TREE_TYPE (gimple_assign_rhs2 (assign
)))
4602 && bb_vinfo
->lookup_def (gimple_assign_rhs2 (assign
)))
4604 /* We start to match on insert to lane zero but since the
4605 inserts need not be ordered we'd have to search both
4606 the def and the use chains. */
4607 tree vectype
= TREE_TYPE (rhs
);
4608 unsigned nlanes
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
4609 auto_vec
<std::pair
<unsigned, tree
> > lane_defs (nlanes
);
4610 auto_sbitmap
lanes (nlanes
);
4611 bitmap_clear (lanes
);
4612 bitmap_set_bit (lanes
, 0);
4613 tree def
= gimple_assign_lhs (assign
);
4614 lane_defs
.quick_push
4615 (std::make_pair (0, gimple_assign_rhs2 (assign
)));
4616 unsigned lanes_found
= 1;
4617 /* Start with the use chains, the last stmt will be the root. */
4618 stmt_vec_info last
= bb_vinfo
->lookup_stmt (assign
);
4621 use_operand_p use_p
;
4623 if (!single_imm_use (def
, &use_p
, &use_stmt
))
4626 if (!bb_vinfo
->lookup_stmt (use_stmt
)
4627 || !vect_slp_is_lane_insert (use_stmt
, def
, &this_lane
)
4628 || !bb_vinfo
->lookup_def (gimple_assign_rhs2 (use_stmt
)))
4630 if (bitmap_bit_p (lanes
, this_lane
))
4633 bitmap_set_bit (lanes
, this_lane
);
4634 gassign
*use_ass
= as_a
<gassign
*> (use_stmt
);
4635 lane_defs
.quick_push (std::make_pair
4636 (this_lane
, gimple_assign_rhs2 (use_ass
)));
4637 last
= bb_vinfo
->lookup_stmt (use_ass
);
4638 def
= gimple_assign_lhs (use_ass
);
4640 while (lanes_found
< nlanes
);
4641 if (lanes_found
< nlanes
)
4643 /* Now search the def chain. */
4644 def
= gimple_assign_rhs1 (assign
);
4647 if (TREE_CODE (def
) != SSA_NAME
4648 || !has_single_use (def
))
4650 gimple
*def_stmt
= SSA_NAME_DEF_STMT (def
);
4652 if (!bb_vinfo
->lookup_stmt (def_stmt
)
4653 || !vect_slp_is_lane_insert (def_stmt
,
4654 NULL_TREE
, &this_lane
)
4655 || !bb_vinfo
->lookup_def (gimple_assign_rhs2 (def_stmt
)))
4657 if (bitmap_bit_p (lanes
, this_lane
))
4660 bitmap_set_bit (lanes
, this_lane
);
4661 lane_defs
.quick_push (std::make_pair
4663 gimple_assign_rhs2 (def_stmt
)));
4664 def
= gimple_assign_rhs1 (def_stmt
);
4666 while (lanes_found
< nlanes
);
4668 if (lanes_found
== nlanes
)
4670 /* Sort lane_defs after the lane index and register the root. */
4671 lane_defs
.qsort (vld_cmp
);
4672 vec
<stmt_vec_info
> stmts
;
4673 stmts
.create (nlanes
);
4674 for (unsigned i
= 0; i
< nlanes
; ++i
)
4675 stmts
.quick_push (bb_vinfo
->lookup_def (lane_defs
[i
].second
));
4676 bb_vinfo
->roots
.safe_push (slp_root (slp_inst_kind_ctor
,
4683 /* Walk the grouped store chains and replace entries with their
4684 pattern variant if any. */
4687 vect_fixup_store_groups_with_patterns (vec_info
*vinfo
)
4689 stmt_vec_info first_element
;
4692 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
4694 /* We also have CTORs in this array. */
4695 if (!STMT_VINFO_GROUPED_ACCESS (first_element
))
4697 if (STMT_VINFO_IN_PATTERN_P (first_element
))
4699 stmt_vec_info orig
= first_element
;
4700 first_element
= STMT_VINFO_RELATED_STMT (first_element
);
4701 DR_GROUP_FIRST_ELEMENT (first_element
) = first_element
;
4702 DR_GROUP_SIZE (first_element
) = DR_GROUP_SIZE (orig
);
4703 DR_GROUP_GAP (first_element
) = DR_GROUP_GAP (orig
);
4704 DR_GROUP_NEXT_ELEMENT (first_element
) = DR_GROUP_NEXT_ELEMENT (orig
);
4705 vinfo
->grouped_stores
[i
] = first_element
;
4707 stmt_vec_info prev
= first_element
;
4708 while (DR_GROUP_NEXT_ELEMENT (prev
))
4710 stmt_vec_info elt
= DR_GROUP_NEXT_ELEMENT (prev
);
4711 if (STMT_VINFO_IN_PATTERN_P (elt
))
4713 stmt_vec_info orig
= elt
;
4714 elt
= STMT_VINFO_RELATED_STMT (elt
);
4715 DR_GROUP_NEXT_ELEMENT (prev
) = elt
;
4716 DR_GROUP_GAP (elt
) = DR_GROUP_GAP (orig
);
4717 DR_GROUP_NEXT_ELEMENT (elt
) = DR_GROUP_NEXT_ELEMENT (orig
);
4719 DR_GROUP_FIRST_ELEMENT (elt
) = first_element
;
4725 /* Check if the region described by BB_VINFO can be vectorized, returning
4726 true if so. When returning false, set FATAL to true if the same failure
4727 would prevent vectorization at other vector sizes, false if it is still
4728 worth trying other sizes. N_STMTS is the number of statements in the
4732 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo
, int n_stmts
, bool &fatal
,
4733 vec
<int> *dataref_groups
)
4735 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4737 slp_instance instance
;
4739 poly_uint64 min_vf
= 2;
4741 /* The first group of checks is independent of the vector size. */
4744 /* Analyze the data references. */
4746 if (!vect_analyze_data_refs (bb_vinfo
, &min_vf
, NULL
))
4748 if (dump_enabled_p ())
4749 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4750 "not vectorized: unhandled data-ref in basic "
4755 if (!vect_analyze_data_ref_accesses (bb_vinfo
, dataref_groups
))
4757 if (dump_enabled_p ())
4758 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4759 "not vectorized: unhandled data access in "
4764 vect_slp_check_for_constructors (bb_vinfo
);
4766 /* If there are no grouped stores and no constructors in the region
4767 there is no need to continue with pattern recog as vect_analyze_slp
4768 will fail anyway. */
4769 if (bb_vinfo
->grouped_stores
.is_empty ()
4770 && bb_vinfo
->roots
.is_empty ())
4772 if (dump_enabled_p ())
4773 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4774 "not vectorized: no grouped stores in "
4779 /* While the rest of the analysis below depends on it in some way. */
4782 vect_pattern_recog (bb_vinfo
);
4784 /* Update store groups from pattern processing. */
4785 vect_fixup_store_groups_with_patterns (bb_vinfo
);
4787 /* Check the SLP opportunities in the basic block, analyze and build SLP
4789 if (!vect_analyze_slp (bb_vinfo
, n_stmts
))
4791 if (dump_enabled_p ())
4793 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4794 "Failed to SLP the basic block.\n");
4795 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4796 "not vectorized: failed to find SLP opportunities "
4797 "in basic block.\n");
4802 /* Optimize permutations. */
4803 vect_optimize_slp (bb_vinfo
);
4805 /* Gather the loads reachable from the SLP graph entries. */
4806 vect_gather_slp_loads (bb_vinfo
);
4808 vect_record_base_alignments (bb_vinfo
);
4810 /* Analyze and verify the alignment of data references and the
4811 dependence in the SLP instances. */
4812 for (i
= 0; BB_VINFO_SLP_INSTANCES (bb_vinfo
).iterate (i
, &instance
); )
4814 vect_location
= instance
->location ();
4815 if (! vect_slp_analyze_instance_alignment (bb_vinfo
, instance
)
4816 || ! vect_slp_analyze_instance_dependence (bb_vinfo
, instance
))
4818 slp_tree node
= SLP_INSTANCE_TREE (instance
);
4819 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
4820 if (dump_enabled_p ())
4821 dump_printf_loc (MSG_NOTE
, vect_location
,
4822 "removing SLP instance operations starting from: %G",
4824 vect_free_slp_instance (instance
);
4825 BB_VINFO_SLP_INSTANCES (bb_vinfo
).ordered_remove (i
);
4829 /* Mark all the statements that we want to vectorize as pure SLP and
4831 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
4832 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance
));
4833 if (stmt_vec_info root
= SLP_INSTANCE_ROOT_STMT (instance
))
4835 STMT_SLP_TYPE (root
) = pure_slp
;
4836 if (is_gimple_assign (root
->stmt
)
4837 && gimple_assign_rhs_code (root
->stmt
) == BIT_INSERT_EXPR
)
4839 /* ??? We should probably record the whole vector of
4840 root stmts so we do not have to back-track here... */
4841 for (unsigned n
= SLP_TREE_LANES (SLP_INSTANCE_TREE (instance
));
4844 root
= bb_vinfo
->lookup_def (gimple_assign_rhs1 (root
->stmt
));
4845 STMT_SLP_TYPE (root
) = pure_slp
;
4852 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo
).length ())
4855 if (!vect_slp_analyze_operations (bb_vinfo
))
4857 if (dump_enabled_p ())
4858 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4859 "not vectorized: bad operation in basic block.\n");
4863 vect_bb_partition_graph (bb_vinfo
);
4868 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4869 basic blocks in BBS, returning true on success.
4870 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4873 vect_slp_region (vec
<basic_block
> bbs
, vec
<data_reference_p
> datarefs
,
4874 vec
<int> *dataref_groups
, unsigned int n_stmts
)
4876 bb_vec_info bb_vinfo
;
4877 auto_vector_modes vector_modes
;
4879 /* Autodetect first vector size we try. */
4880 machine_mode next_vector_mode
= VOIDmode
;
4881 targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
, false);
4882 unsigned int mode_i
= 0;
4884 vec_info_shared shared
;
4886 machine_mode autodetected_vector_mode
= VOIDmode
;
4889 bool vectorized
= false;
4891 bb_vinfo
= new _bb_vec_info (bbs
, &shared
);
4893 bool first_time_p
= shared
.datarefs
.is_empty ();
4894 BB_VINFO_DATAREFS (bb_vinfo
) = datarefs
;
4896 bb_vinfo
->shared
->save_datarefs ();
4898 bb_vinfo
->shared
->check_datarefs ();
4899 bb_vinfo
->vector_mode
= next_vector_mode
;
4901 if (vect_slp_analyze_bb_1 (bb_vinfo
, n_stmts
, fatal
, dataref_groups
))
4903 if (dump_enabled_p ())
4905 dump_printf_loc (MSG_NOTE
, vect_location
,
4906 "***** Analysis succeeded with vector mode"
4907 " %s\n", GET_MODE_NAME (bb_vinfo
->vector_mode
));
4908 dump_printf_loc (MSG_NOTE
, vect_location
, "SLPing BB part\n");
4911 bb_vinfo
->shared
->check_datarefs ();
4914 slp_instance instance
;
4915 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo
), i
, instance
)
4917 if (instance
->subgraph_entries
.is_empty ())
4920 vect_location
= instance
->location ();
4921 if (!unlimited_cost_model (NULL
)
4922 && !vect_bb_vectorization_profitable_p
4923 (bb_vinfo
, instance
->subgraph_entries
))
4925 if (dump_enabled_p ())
4926 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4927 "not vectorized: vectorization is not "
4932 if (!dbg_cnt (vect_slp
))
4935 if (!vectorized
&& dump_enabled_p ())
4936 dump_printf_loc (MSG_NOTE
, vect_location
,
4937 "Basic block will be vectorized "
4941 vect_schedule_slp (bb_vinfo
, instance
->subgraph_entries
);
4943 unsigned HOST_WIDE_INT bytes
;
4944 if (dump_enabled_p ())
4947 (bb_vinfo
->vector_mode
).is_constant (&bytes
))
4948 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
4949 "basic block part vectorized using %wu "
4950 "byte vectors\n", bytes
);
4952 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
4953 "basic block part vectorized using "
4954 "variable length vectors\n");
4960 if (dump_enabled_p ())
4961 dump_printf_loc (MSG_NOTE
, vect_location
,
4962 "***** Analysis failed with vector mode %s\n",
4963 GET_MODE_NAME (bb_vinfo
->vector_mode
));
4967 autodetected_vector_mode
= bb_vinfo
->vector_mode
;
4970 while (mode_i
< vector_modes
.length ()
4971 && vect_chooses_same_modes_p (bb_vinfo
, vector_modes
[mode_i
]))
4973 if (dump_enabled_p ())
4974 dump_printf_loc (MSG_NOTE
, vect_location
,
4975 "***** The result for vector mode %s would"
4977 GET_MODE_NAME (vector_modes
[mode_i
]));
4983 if (mode_i
< vector_modes
.length ()
4984 && VECTOR_MODE_P (autodetected_vector_mode
)
4985 && (related_vector_mode (vector_modes
[mode_i
],
4986 GET_MODE_INNER (autodetected_vector_mode
))
4987 == autodetected_vector_mode
)
4988 && (related_vector_mode (autodetected_vector_mode
,
4989 GET_MODE_INNER (vector_modes
[mode_i
]))
4990 == vector_modes
[mode_i
]))
4992 if (dump_enabled_p ())
4993 dump_printf_loc (MSG_NOTE
, vect_location
,
4994 "***** Skipping vector mode %s, which would"
4995 " repeat the analysis for %s\n",
4996 GET_MODE_NAME (vector_modes
[mode_i
]),
4997 GET_MODE_NAME (autodetected_vector_mode
));
5002 || mode_i
== vector_modes
.length ()
5003 || autodetected_vector_mode
== VOIDmode
5004 /* If vect_slp_analyze_bb_1 signaled that analysis for all
5005 vector sizes will fail do not bother iterating. */
5009 /* Try the next biggest vector size. */
5010 next_vector_mode
= vector_modes
[mode_i
++];
5011 if (dump_enabled_p ())
5012 dump_printf_loc (MSG_NOTE
, vect_location
,
5013 "***** Re-trying analysis with vector mode %s\n",
5014 GET_MODE_NAME (next_vector_mode
));
5019 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
5020 true if anything in the basic-block was vectorized. */
5023 vect_slp_bbs (vec
<basic_block
> bbs
)
5025 vec
<data_reference_p
> datarefs
= vNULL
;
5026 auto_vec
<int> dataref_groups
;
5028 int current_group
= 0;
5030 for (unsigned i
= 0; i
< bbs
.length (); i
++)
5032 basic_block bb
= bbs
[i
];
5033 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
5036 gimple
*stmt
= gsi_stmt (gsi
);
5037 if (is_gimple_debug (stmt
))
5042 if (gimple_location (stmt
) != UNKNOWN_LOCATION
)
5043 vect_location
= stmt
;
5045 if (!vect_find_stmt_data_reference (NULL
, stmt
, &datarefs
,
5046 &dataref_groups
, current_group
))
5051 return vect_slp_region (bbs
, datarefs
, &dataref_groups
, insns
);
5054 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5055 true if anything in the basic-block was vectorized. */
5058 vect_slp_bb (basic_block bb
)
5060 auto_vec
<basic_block
> bbs
;
5062 return vect_slp_bbs (bbs
);
5065 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5066 true if anything in the basic-block was vectorized. */
5069 vect_slp_function (function
*fun
)
5072 int *rpo
= XNEWVEC (int, n_basic_blocks_for_fn (fun
));
5073 unsigned n
= pre_and_rev_post_order_compute_fn (fun
, NULL
, rpo
, false);
5075 /* For the moment split the function into pieces to avoid making
5076 the iteration on the vector mode moot. Split at points we know
5077 to not handle well which is CFG merges (SLP discovery doesn't
5078 handle non-loop-header PHIs) and loop exits. Since pattern
5079 recog requires reverse iteration to visit uses before defs
5080 simply chop RPO into pieces. */
5081 auto_vec
<basic_block
> bbs
;
5082 for (unsigned i
= 0; i
< n
; i
++)
5084 basic_block bb
= BASIC_BLOCK_FOR_FN (fun
, rpo
[i
]);
5087 /* Split when a BB is not dominated by the first block. */
5088 if (!bbs
.is_empty ()
5089 && !dominated_by_p (CDI_DOMINATORS
, bb
, bbs
[0]))
5091 if (dump_enabled_p ())
5092 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5093 "splitting region at dominance boundary bb%d\n",
5097 /* Split when the loop determined by the first block
5098 is exited. This is because we eventually insert
5099 invariants at region begin. */
5100 else if (!bbs
.is_empty ()
5101 && bbs
[0]->loop_father
!= bb
->loop_father
5102 && !flow_loop_nested_p (bbs
[0]->loop_father
, bb
->loop_father
))
5104 if (dump_enabled_p ())
5105 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5106 "splitting region at loop %d exit at bb%d\n",
5107 bbs
[0]->loop_father
->num
, bb
->index
);
5111 if (split
&& !bbs
.is_empty ())
5113 r
|= vect_slp_bbs (bbs
);
5115 bbs
.quick_push (bb
);
5120 /* When we have a stmt ending this block and defining a
5121 value we have to insert on edges when inserting after it for
5122 a vector containing its definition. Avoid this for now. */
5123 if (gimple
*last
= last_stmt (bb
))
5124 if (gimple_get_lhs (last
)
5125 && is_ctrl_altering_stmt (last
))
5127 if (dump_enabled_p ())
5128 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5129 "splitting region at control altering "
5130 "definition %G", last
);
5131 r
|= vect_slp_bbs (bbs
);
5136 if (!bbs
.is_empty ())
5137 r
|= vect_slp_bbs (bbs
);
5144 /* Build a variable-length vector in which the elements in ELTS are repeated
5145 to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
5146 RESULTS and add any new instructions to SEQ.
5148 The approach we use is:
5150 (1) Find a vector mode VM with integer elements of mode IM.
5152 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5153 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
5154 from small vectors to IM.
5156 (3) Duplicate each ELTS'[I] into a vector of mode VM.
5158 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
5159 correct byte contents.
5161 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
5163 We try to find the largest IM for which this sequence works, in order
5164 to cut down on the number of interleaves. */
5167 duplicate_and_interleave (vec_info
*vinfo
, gimple_seq
*seq
, tree vector_type
,
5168 vec
<tree
> elts
, unsigned int nresults
,
5171 unsigned int nelts
= elts
.length ();
5172 tree element_type
= TREE_TYPE (vector_type
);
5174 /* (1) Find a vector mode VM with integer elements of mode IM. */
5175 unsigned int nvectors
= 1;
5176 tree new_vector_type
;
5178 if (!can_duplicate_and_interleave_p (vinfo
, nelts
, element_type
,
5179 &nvectors
, &new_vector_type
,
5183 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
5184 unsigned int partial_nelts
= nelts
/ nvectors
;
5185 tree partial_vector_type
= build_vector_type (element_type
, partial_nelts
);
5187 tree_vector_builder partial_elts
;
5188 auto_vec
<tree
, 32> pieces (nvectors
* 2);
5189 pieces
.quick_grow_cleared (nvectors
* 2);
5190 for (unsigned int i
= 0; i
< nvectors
; ++i
)
5192 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5193 ELTS' has mode IM. */
5194 partial_elts
.new_vector (partial_vector_type
, partial_nelts
, 1);
5195 for (unsigned int j
= 0; j
< partial_nelts
; ++j
)
5196 partial_elts
.quick_push (elts
[i
* partial_nelts
+ j
]);
5197 tree t
= gimple_build_vector (seq
, &partial_elts
);
5198 t
= gimple_build (seq
, VIEW_CONVERT_EXPR
,
5199 TREE_TYPE (new_vector_type
), t
);
5201 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
5202 pieces
[i
] = gimple_build_vector_from_val (seq
, new_vector_type
, t
);
5205 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
5206 correct byte contents.
5208 Conceptually, we need to repeat the following operation log2(nvectors)
5209 times, where hi_start = nvectors / 2:
5211 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
5212 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
5214 However, if each input repeats every N elements and the VF is
5215 a multiple of N * 2, the HI result is the same as the LO result.
5216 This will be true for the first N1 iterations of the outer loop,
5217 followed by N2 iterations for which both the LO and HI results
5220 N1 + N2 = log2(nvectors)
5222 Each "N1 iteration" doubles the number of redundant vectors and the
5223 effect of the process as a whole is to have a sequence of nvectors/2**N1
5224 vectors that repeats 2**N1 times. Rather than generate these redundant
5225 vectors, we halve the number of vectors for each N1 iteration. */
5226 unsigned int in_start
= 0;
5227 unsigned int out_start
= nvectors
;
5228 unsigned int new_nvectors
= nvectors
;
5229 for (unsigned int in_repeat
= 1; in_repeat
< nvectors
; in_repeat
*= 2)
5231 unsigned int hi_start
= new_nvectors
/ 2;
5232 unsigned int out_i
= 0;
5233 for (unsigned int in_i
= 0; in_i
< new_nvectors
; ++in_i
)
5236 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type
),
5240 tree output
= make_ssa_name (new_vector_type
);
5241 tree input1
= pieces
[in_start
+ (in_i
/ 2)];
5242 tree input2
= pieces
[in_start
+ (in_i
/ 2) + hi_start
];
5243 gassign
*stmt
= gimple_build_assign (output
, VEC_PERM_EXPR
,
5245 permutes
[in_i
& 1]);
5246 gimple_seq_add_stmt (seq
, stmt
);
5247 pieces
[out_start
+ out_i
] = output
;
5250 std::swap (in_start
, out_start
);
5251 new_nvectors
= out_i
;
5254 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
5255 results
.reserve (nresults
);
5256 for (unsigned int i
= 0; i
< nresults
; ++i
)
5257 if (i
< new_nvectors
)
5258 results
.quick_push (gimple_build (seq
, VIEW_CONVERT_EXPR
, vector_type
,
5259 pieces
[in_start
+ i
]));
5261 results
.quick_push (results
[i
- new_nvectors
]);
5265 /* For constant and loop invariant defs in OP_NODE this function creates
5266 vector defs that will be used in the vectorized stmts and stores them
5267 to SLP_TREE_VEC_DEFS of OP_NODE. */
5270 vect_create_constant_vectors (vec_info
*vinfo
, slp_tree op_node
)
5272 unsigned HOST_WIDE_INT nunits
;
5274 unsigned j
, number_of_places_left_in_vector
;
5277 int group_size
= op_node
->ops
.length ();
5278 unsigned int vec_num
, i
;
5279 unsigned number_of_copies
= 1;
5281 gimple_seq ctor_seq
= NULL
;
5282 auto_vec
<tree
, 16> permute_results
;
5284 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
5285 vector_type
= SLP_TREE_VECTYPE (op_node
);
5287 unsigned int number_of_vectors
= SLP_TREE_NUMBER_OF_VEC_STMTS (op_node
);
5288 SLP_TREE_VEC_DEFS (op_node
).create (number_of_vectors
);
5289 auto_vec
<tree
> voprnds (number_of_vectors
);
5291 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5292 created vectors. It is greater than 1 if unrolling is performed.
5294 For example, we have two scalar operands, s1 and s2 (e.g., group of
5295 strided accesses of size two), while NUNITS is four (i.e., four scalars
5296 of this type can be packed in a vector). The output vector will contain
5297 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5300 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
5301 containing the operands.
5303 For example, NUNITS is four as before, and the group size is 8
5304 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5305 {s5, s6, s7, s8}. */
5307 /* When using duplicate_and_interleave, we just need one element for
5308 each scalar statement. */
5309 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
5310 nunits
= group_size
;
5312 number_of_copies
= nunits
* number_of_vectors
/ group_size
;
5314 number_of_places_left_in_vector
= nunits
;
5316 tree_vector_builder
elts (vector_type
, nunits
, 1);
5317 elts
.quick_grow (nunits
);
5318 stmt_vec_info insert_after
= NULL
;
5319 for (j
= 0; j
< number_of_copies
; j
++)
5322 for (i
= group_size
- 1; op_node
->ops
.iterate (i
, &op
); i
--)
5324 /* Create 'vect_ = {op0,op1,...,opn}'. */
5325 number_of_places_left_in_vector
--;
5327 if (!types_compatible_p (TREE_TYPE (vector_type
), TREE_TYPE (op
)))
5329 if (CONSTANT_CLASS_P (op
))
5331 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
5333 /* Can't use VIEW_CONVERT_EXPR for booleans because
5334 of possibly different sizes of scalar value and
5336 if (integer_zerop (op
))
5337 op
= build_int_cst (TREE_TYPE (vector_type
), 0);
5338 else if (integer_onep (op
))
5339 op
= build_all_ones_cst (TREE_TYPE (vector_type
));
5344 op
= fold_unary (VIEW_CONVERT_EXPR
,
5345 TREE_TYPE (vector_type
), op
);
5346 gcc_assert (op
&& CONSTANT_CLASS_P (op
));
5350 tree new_temp
= make_ssa_name (TREE_TYPE (vector_type
));
5352 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
5355 = build_all_ones_cst (TREE_TYPE (vector_type
));
5357 = build_zero_cst (TREE_TYPE (vector_type
));
5358 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op
)));
5359 init_stmt
= gimple_build_assign (new_temp
, COND_EXPR
,
5365 op
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (vector_type
),
5368 = gimple_build_assign (new_temp
, VIEW_CONVERT_EXPR
,
5371 gimple_seq_add_stmt (&ctor_seq
, init_stmt
);
5375 elts
[number_of_places_left_in_vector
] = op
;
5376 if (!CONSTANT_CLASS_P (op
))
5378 /* For BB vectorization we have to compute an insert location
5379 when a def is inside the analyzed region since we cannot
5380 simply insert at the BB start in this case. */
5381 stmt_vec_info opdef
;
5382 if (TREE_CODE (orig_op
) == SSA_NAME
5383 && !SSA_NAME_IS_DEFAULT_DEF (orig_op
)
5384 && is_a
<bb_vec_info
> (vinfo
)
5385 && (opdef
= vinfo
->lookup_def (orig_op
)))
5388 insert_after
= opdef
;
5390 insert_after
= get_later_stmt (insert_after
, opdef
);
5393 if (number_of_places_left_in_vector
== 0)
5396 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
5397 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
5398 vec_cst
= gimple_build_vector (&ctor_seq
, &elts
);
5401 if (permute_results
.is_empty ())
5402 duplicate_and_interleave (vinfo
, &ctor_seq
, vector_type
,
5403 elts
, number_of_vectors
,
5405 vec_cst
= permute_results
[number_of_vectors
- j
- 1];
5407 if (!gimple_seq_empty_p (ctor_seq
))
5411 gimple_stmt_iterator gsi
;
5412 if (gimple_code (insert_after
->stmt
) == GIMPLE_PHI
)
5414 gsi
= gsi_after_labels (gimple_bb (insert_after
->stmt
));
5415 gsi_insert_seq_before (&gsi
, ctor_seq
,
5416 GSI_CONTINUE_LINKING
);
5418 else if (!stmt_ends_bb_p (insert_after
->stmt
))
5420 gsi
= gsi_for_stmt (insert_after
->stmt
);
5421 gsi_insert_seq_after (&gsi
, ctor_seq
,
5422 GSI_CONTINUE_LINKING
);
5426 /* When we want to insert after a def where the
5427 defining stmt throws then insert on the fallthru
5429 edge e
= find_fallthru_edge
5430 (gimple_bb (insert_after
->stmt
)->succs
);
5432 = gsi_insert_seq_on_edge_immediate (e
, ctor_seq
);
5433 gcc_assert (!new_bb
);
5437 vinfo
->insert_seq_on_entry (NULL
, ctor_seq
);
5440 voprnds
.quick_push (vec_cst
);
5441 insert_after
= NULL
;
5442 number_of_places_left_in_vector
= nunits
;
5444 elts
.new_vector (vector_type
, nunits
, 1);
5445 elts
.quick_grow (nunits
);
5450 /* Since the vectors are created in the reverse order, we should invert
5452 vec_num
= voprnds
.length ();
5453 for (j
= vec_num
; j
!= 0; j
--)
5455 vop
= voprnds
[j
- 1];
5456 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
5459 /* In case that VF is greater than the unrolling factor needed for the SLP
5460 group of stmts, NUMBER_OF_VECTORS to be created is greater than
5461 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
5462 to replicate the vectors. */
5463 while (number_of_vectors
> SLP_TREE_VEC_DEFS (op_node
).length ())
5464 for (i
= 0; SLP_TREE_VEC_DEFS (op_node
).iterate (i
, &vop
) && i
< vec_num
;
5466 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
5469 /* Get the Ith vectorized definition from SLP_NODE. */
5472 vect_get_slp_vect_def (slp_tree slp_node
, unsigned i
)
5474 if (SLP_TREE_VEC_STMTS (slp_node
).exists ())
5475 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node
)[i
]);
5477 return SLP_TREE_VEC_DEFS (slp_node
)[i
];
5480 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
5483 vect_get_slp_defs (slp_tree slp_node
, vec
<tree
> *vec_defs
)
5485 vec_defs
->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
));
5486 if (SLP_TREE_DEF_TYPE (slp_node
) == vect_internal_def
)
5489 gimple
*vec_def_stmt
;
5490 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node
), j
, vec_def_stmt
)
5491 vec_defs
->quick_push (gimple_get_lhs (vec_def_stmt
));
5494 vec_defs
->splice (SLP_TREE_VEC_DEFS (slp_node
));
5497 /* Get N vectorized definitions for SLP_NODE. */
5500 vect_get_slp_defs (vec_info
*,
5501 slp_tree slp_node
, vec
<vec
<tree
> > *vec_oprnds
, unsigned n
)
5504 n
= SLP_TREE_CHILDREN (slp_node
).length ();
5506 for (unsigned i
= 0; i
< n
; ++i
)
5508 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
5509 vec
<tree
> vec_defs
= vNULL
;
5510 vect_get_slp_defs (child
, &vec_defs
);
5511 vec_oprnds
->quick_push (vec_defs
);
/* NOTE(review): the text of this function is a corrupted extraction --
   statements are split across physical lines, the original file's line
   numbers (e.g. "5522") are fused into the text, and many lines (braces,
   control flow, declarations) are missing entirely.  It cannot compile
   as-is; restore it from the upstream revision instead of editing this
   text in place.
   Purpose (from the surviving doc comment below): generate vector permute
   statements from the loads in DR_CHAIN for SLP node NODE; when
   ANALYZE_ONLY is true only feasibility is checked, and *N_PERMS /
   *N_LOADS receive the permute / load instruction counts.  */
5515 /* Generate vector permute statements from a list of loads in DR_CHAIN.
5516 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
5517 permute statements for the SLP node NODE. Store the number of vector
5518 permute instructions in *N_PERMS and the number of vector load
5519 instructions in *N_LOADS. */
5522 vect_transform_slp_perm_load (vec_info
*vinfo
,
5523 slp_tree node
, vec
<tree
> dr_chain
,
5524 gimple_stmt_iterator
*gsi
, poly_uint64 vf
,
5525 bool analyze_only
, unsigned *n_perms
,
5526 unsigned int *n_loads
)
5528 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
5530 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5531 unsigned int group_size
= SLP_TREE_SCALAR_STMTS (node
).length ();
5532 unsigned int mask_element
;
5535 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
5538 stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
5540 mode
= TYPE_MODE (vectype
);
5541 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
5543 /* Initialize the vect stmts of NODE to properly insert the generated
5546 for (unsigned i
= SLP_TREE_VEC_STMTS (node
).length ();
5547 i
< SLP_TREE_NUMBER_OF_VEC_STMTS (node
); i
++)
5548 SLP_TREE_VEC_STMTS (node
).quick_push (NULL
);
5550 /* Generate permutation masks for every NODE. Number of masks for each NODE
5551 is equal to GROUP_SIZE.
5552 E.g., we have a group of three nodes with three loads from the same
5553 location in each node, and the vector size is 4. I.e., we have a
5554 a0b0c0a1b1c1... sequence and we need to create the following vectors:
5555 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
5556 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
5559 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
5560 The last mask is illegal since we assume two operands for permute
5561 operation, and the mask element values can't be outside that range.
5562 Hence, the last mask must be converted into {2,5,5,5}.
5563 For the first two permutations we need the first and the second input
5564 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
5565 we need the second and the third vectors: {b1,c1,a2,b2} and
5568 int vect_stmts_counter
= 0;
5569 unsigned int index
= 0;
5570 int first_vec_index
= -1;
5571 int second_vec_index
= -1;
5575 vec_perm_builder mask
;
5576 unsigned int nelts_to_build
;
5577 unsigned int nvectors_per_build
;
5578 unsigned int in_nlanes
;
5579 bool repeating_p
= (group_size
== DR_GROUP_SIZE (stmt_info
)
5580 && multiple_p (nunits
, group_size
));
5583 /* A single vector contains a whole number of copies of the node, so:
5584 (a) all permutes can use the same mask; and
5585 (b) the permutes only need a single vector input. */
5586 mask
.new_vector (nunits
, group_size
, 3);
5587 nelts_to_build
= mask
.encoded_nelts ();
5588 nvectors_per_build
= SLP_TREE_VEC_STMTS (node
).length ();
5589 in_nlanes
= DR_GROUP_SIZE (stmt_info
) * 3;
5593 /* We need to construct a separate mask for each vector statement. */
5594 unsigned HOST_WIDE_INT const_nunits
, const_vf
;
5595 if (!nunits
.is_constant (&const_nunits
)
5596 || !vf
.is_constant (&const_vf
))
5598 mask
.new_vector (const_nunits
, const_nunits
, 1);
5599 nelts_to_build
= const_vf
* group_size
;
5600 nvectors_per_build
= 1;
5601 in_nlanes
= const_vf
* DR_GROUP_SIZE (stmt_info
);
5603 auto_sbitmap
used_in_lanes (in_nlanes
);
5604 bitmap_clear (used_in_lanes
);
5606 unsigned int count
= mask
.encoded_nelts ();
5607 mask
.quick_grow (count
);
5608 vec_perm_indices indices
;
5610 for (unsigned int j
= 0; j
< nelts_to_build
; j
++)
5612 unsigned int iter_num
= j
/ group_size
;
5613 unsigned int stmt_num
= j
% group_size
;
5614 unsigned int i
= (iter_num
* DR_GROUP_SIZE (stmt_info
)
5615 + SLP_TREE_LOAD_PERMUTATION (node
)[stmt_num
]);
5616 bitmap_set_bit (used_in_lanes
, i
);
5619 first_vec_index
= 0;
5624 /* Enforced before the loop when !repeating_p. */
5625 unsigned int const_nunits
= nunits
.to_constant ();
5626 vec_index
= i
/ const_nunits
;
5627 mask_element
= i
% const_nunits
;
5628 if (vec_index
== first_vec_index
5629 || first_vec_index
== -1)
5631 first_vec_index
= vec_index
;
5633 else if (vec_index
== second_vec_index
5634 || second_vec_index
== -1)
5636 second_vec_index
= vec_index
;
5637 mask_element
+= const_nunits
;
5641 if (dump_enabled_p ())
5642 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5643 "permutation requires at "
5644 "least three vectors %G",
5646 gcc_assert (analyze_only
);
5650 gcc_assert (mask_element
< 2 * const_nunits
);
5653 if (mask_element
!= index
)
5655 mask
[index
++] = mask_element
;
5657 if (index
== count
&& !noop_p
)
5659 indices
.new_vector (mask
, second_vec_index
== -1 ? 1 : 2, nunits
);
5660 if (!can_vec_perm_const_p (mode
, indices
))
5662 if (dump_enabled_p ())
5664 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
5666 "unsupported vect permute { ");
5667 for (i
= 0; i
< count
; ++i
)
5669 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
5670 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
5672 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
5674 gcc_assert (analyze_only
);
5685 tree mask_vec
= NULL_TREE
;
5688 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
5690 if (second_vec_index
== -1)
5691 second_vec_index
= first_vec_index
;
5693 for (unsigned int ri
= 0; ri
< nvectors_per_build
; ++ri
)
5695 /* Generate the permute statement if necessary. */
5696 tree first_vec
= dr_chain
[first_vec_index
+ ri
];
5697 tree second_vec
= dr_chain
[second_vec_index
+ ri
];
5701 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
5703 = vect_create_destination_var (gimple_assign_lhs (stmt
),
5705 perm_dest
= make_ssa_name (perm_dest
);
5707 = gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
5708 first_vec
, second_vec
,
5710 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
,
5714 /* If mask was NULL_TREE generate the requested
5715 identity transform. */
5716 perm_stmt
= SSA_NAME_DEF_STMT (first_vec
);
5718 /* Store the vector statement in NODE. */
5719 SLP_TREE_VEC_STMTS (node
)[vect_stmts_counter
++] = perm_stmt
;
5724 first_vec_index
= -1;
5725 second_vec_index
= -1;
5733 *n_loads
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
5736 /* Enforced above when !repeating_p. */
5737 unsigned int const_nunits
= nunits
.to_constant ();
5739 bool load_seen
= false;
5740 for (unsigned i
= 0; i
< in_nlanes
; ++i
)
5742 if (i
% const_nunits
== 0)
5748 if (bitmap_bit_p (used_in_lanes
, i
))
))
/* NOTE(review): corrupted extraction -- statements split across lines,
   original line numbers fused into the text, and structural lines
   (braces, `if (repeating_p)`-style conditions, `return` statements)
   missing.  Not compilable as-is; restore from the upstream revision.
   Purpose (from the surviving doc comment below): code-generate the lane
   permutation recorded in SLP_TREE_LANE_PERMUTATION of NODE as two-input
   VEC_PERM_EXPRs, or cost it via COST_VEC when only analyzing.  */
5760 /* Vectorize the SLP permutations in NODE as specified
5761 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5762 child number and lane number.
5763 Interleaving of two two-lane two-child SLP subtrees (not supported):
5764 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5765 A blend of two four-lane two-child SLP subtrees:
5766 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5767 Highpart of a four-lane one-child SLP subtree (not supported):
5768 [ { 0, 2 }, { 0, 3 } ]
5769 Where currently only a subset is supported by code generating below. */
5772 vectorizable_slp_permutation (vec_info
*vinfo
, gimple_stmt_iterator
*gsi
,
5773 slp_tree node
, stmt_vector_for_cost
*cost_vec
)
5775 tree vectype
= SLP_TREE_VECTYPE (node
);
5777 /* ??? We currently only support all same vector input and output types
5778 while the SLP IL should really do a concat + select and thus accept
5779 arbitrary mismatches. */
5782 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5783 if (!vect_maybe_update_slp_op_vectype (child
, vectype
)
5784 || !types_compatible_p (SLP_TREE_VECTYPE (child
), vectype
))
5786 if (dump_enabled_p ())
5787 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5788 "Unsupported lane permutation\n");
5792 vec
<std::pair
<unsigned, unsigned> > &perm
= SLP_TREE_LANE_PERMUTATION (node
);
5793 gcc_assert (perm
.length () == SLP_TREE_LANES (node
));
5794 if (dump_enabled_p ())
5796 dump_printf_loc (MSG_NOTE
, vect_location
,
5797 "vectorizing permutation");
5798 for (unsigned i
= 0; i
< perm
.length (); ++i
)
5799 dump_printf (MSG_NOTE
, " op%u[%u]", perm
[i
].first
, perm
[i
].second
);
5800 dump_printf (MSG_NOTE
, "\n");
5803 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
5804 if (!nunits
.is_constant ())
5806 unsigned HOST_WIDE_INT vf
= 1;
5807 if (loop_vec_info linfo
= dyn_cast
<loop_vec_info
> (vinfo
))
5808 if (!LOOP_VINFO_VECT_FACTOR (linfo
).is_constant (&vf
))
5810 unsigned olanes
= vf
* SLP_TREE_LANES (node
);
5811 gcc_assert (multiple_p (olanes
, nunits
));
5813 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
5814 from the { SLP operand, scalar lane } permutation as recorded in the
5815 SLP node as intermediate step. This part should already work
5816 with SLP children with arbitrary number of lanes. */
5817 auto_vec
<std::pair
<std::pair
<unsigned, unsigned>, unsigned> > vperm
;
5818 auto_vec
<unsigned> active_lane
;
5819 vperm
.create (olanes
);
5820 active_lane
.safe_grow_cleared (SLP_TREE_CHILDREN (node
).length (), true);
5821 for (unsigned i
= 0; i
< vf
; ++i
)
5823 for (unsigned pi
= 0; pi
< perm
.length (); ++pi
)
5825 std::pair
<unsigned, unsigned> p
= perm
[pi
];
5826 tree vtype
= SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node
)[p
.first
]);
5827 unsigned vnunits
= TYPE_VECTOR_SUBPARTS (vtype
).to_constant ();
5828 unsigned vi
= (active_lane
[p
.first
] + p
.second
) / vnunits
;
5829 unsigned vl
= (active_lane
[p
.first
] + p
.second
) % vnunits
;
5830 vperm
.quick_push (std::make_pair (std::make_pair (p
.first
, vi
), vl
));
5832 /* Advance to the next group. */
5833 for (unsigned j
= 0; j
< SLP_TREE_CHILDREN (node
).length (); ++j
)
5834 active_lane
[j
] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node
)[j
]);
5837 if (dump_enabled_p ())
5839 dump_printf_loc (MSG_NOTE
, vect_location
, "as");
5840 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
5842 if (i
!= 0 && multiple_p (i
, TYPE_VECTOR_SUBPARTS (vectype
)))
5843 dump_printf (MSG_NOTE
, ",");
5844 dump_printf (MSG_NOTE
, " vops%u[%u][%u]",
5845 vperm
[i
].first
.first
, vperm
[i
].first
.second
,
5848 dump_printf (MSG_NOTE
, "\n");
5851 /* We can only handle two-vector permutes, everything else should
5852 be lowered on the SLP level. The following is closely inspired
5853 by vect_transform_slp_perm_load and is supposed to eventually
5855 ??? As intermediate step do code-gen in the SLP tree representation
5857 std::pair
<unsigned, unsigned> first_vec
= std::make_pair (-1U, -1U);
5858 std::pair
<unsigned, unsigned> second_vec
= std::make_pair (-1U, -1U);
5859 unsigned int const_nunits
= nunits
.to_constant ();
5860 unsigned int index
= 0;
5861 unsigned int mask_element
;
5862 vec_perm_builder mask
;
5863 mask
.new_vector (const_nunits
, const_nunits
, 1);
5864 unsigned int count
= mask
.encoded_nelts ();
5865 mask
.quick_grow (count
);
5866 vec_perm_indices indices
;
5867 unsigned nperms
= 0;
5868 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
5870 mask_element
= vperm
[i
].second
;
5871 if (first_vec
.first
== -1U
5872 || first_vec
== vperm
[i
].first
)
5873 first_vec
= vperm
[i
].first
;
5874 else if (second_vec
.first
== -1U
5875 || second_vec
== vperm
[i
].first
)
5877 second_vec
= vperm
[i
].first
;
5878 mask_element
+= const_nunits
;
5882 if (dump_enabled_p ())
5883 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5884 "permutation requires at "
5885 "least three vectors\n");
5890 mask
[index
++] = mask_element
;
5894 indices
.new_vector (mask
, second_vec
.first
== -1U ? 1 : 2,
5896 bool identity_p
= indices
.series_p (0, 1, 0, 1);
5898 && !can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5900 if (dump_enabled_p ())
5902 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
5904 "unsupported vect permute { ");
5905 for (i
= 0; i
< count
; ++i
)
5907 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
5908 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
5910 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
5920 if (second_vec
.first
== -1U)
5921 second_vec
= first_vec
;
5923 /* Generate the permute statement if necessary. */
5924 slp_tree first_node
= SLP_TREE_CHILDREN (node
)[first_vec
.first
];
5926 = vect_get_slp_vect_def (first_node
, first_vec
.second
);
5927 /* ??? We SLP match existing vector element extracts but
5928 allow punning which we need to re-instantiate at uses
5929 but have no good way of explicitely representing. */
5930 if (!types_compatible_p (TREE_TYPE (first_def
), vectype
))
5933 conv_stmt
= gimple_build_assign (make_ssa_name (vectype
),
5934 build1 (VIEW_CONVERT_EXPR
,
5935 vectype
, first_def
));
5936 vect_finish_stmt_generation (vinfo
, NULL
, conv_stmt
, gsi
);
5937 first_def
= gimple_assign_lhs (conv_stmt
);
5940 tree perm_dest
= make_ssa_name (vectype
);
5943 slp_tree second_node
5944 = SLP_TREE_CHILDREN (node
)[second_vec
.first
];
5946 = vect_get_slp_vect_def (second_node
, second_vec
.second
);
5947 if (!types_compatible_p (TREE_TYPE (second_def
), vectype
))
5950 conv_stmt
= gimple_build_assign (make_ssa_name (vectype
),
5953 vectype
, second_def
));
5954 vect_finish_stmt_generation (vinfo
, NULL
, conv_stmt
, gsi
);
5955 second_def
= gimple_assign_lhs (conv_stmt
);
5957 tree mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
5958 perm_stmt
= gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
5959 first_def
, second_def
,
5963 /* We need a copy here in case the def was external. */
5964 perm_stmt
= gimple_build_assign (perm_dest
, first_def
);
5965 vect_finish_stmt_generation (vinfo
, NULL
, perm_stmt
, gsi
);
5966 /* Store the vector statement in NODE. */
5967 SLP_TREE_VEC_STMTS (node
).quick_push (perm_stmt
);
5971 first_vec
= std::make_pair (-1U, -1U);
5972 second_vec
= std::make_pair (-1U, -1U);
5977 record_stmt_cost (cost_vec
, nperms
, vec_perm
, NULL
, vectype
, 0, vect_body
);
/* NOTE(review): corrupted extraction -- statements split across lines,
   original line numbers fused into the text, `else` arms and braces
   missing.  Not compilable as-is; restore from the upstream revision.
   Purpose (from the surviving comments below): code-generate one SLP
   node: constants/externals get constant vectors built, internal nodes
   get an insertion point SI computed (before first scalar load for
   loads, before last scalar stmt for stores, after the latest child
   def otherwise) and are then transformed via
   vectorizable_slp_permutation or vect_transform_stmt.  */
5982 /* Vectorize SLP NODE. */
5985 vect_schedule_slp_node (vec_info
*vinfo
,
5986 slp_tree node
, slp_instance instance
)
5988 gimple_stmt_iterator si
;
5992 /* For existing vectors there's nothing to do. */
5993 if (SLP_TREE_VEC_DEFS (node
).exists ())
5996 gcc_assert (SLP_TREE_VEC_STMTS (node
).is_empty ());
5998 /* Vectorize externals and constants. */
5999 if (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
6000 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
6002 /* ??? vectorizable_shift can end up using a scalar operand which is
6003 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
6004 node in this case. */
6005 if (!SLP_TREE_VECTYPE (node
))
6008 vect_create_constant_vectors (vinfo
, node
);
6012 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
6014 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) != 0);
6015 SLP_TREE_VEC_STMTS (node
).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node
));
6017 if (dump_enabled_p ())
6018 dump_printf_loc (MSG_NOTE
, vect_location
,
6019 "------>vectorizing SLP node starting from: %G",
6022 if (STMT_VINFO_DATA_REF (stmt_info
)
6023 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
6025 /* Vectorized loads go before the first scalar load to make it
6026 ready early, vectorized stores go before the last scalar
6027 stmt which is where all uses are ready. */
6028 stmt_vec_info last_stmt_info
= NULL
;
6029 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
6030 last_stmt_info
= vect_find_first_scalar_stmt_in_slp (node
);
6031 else /* DR_IS_WRITE */
6032 last_stmt_info
= vect_find_last_scalar_stmt_in_slp (node
);
6033 si
= gsi_for_stmt (last_stmt_info
->stmt
);
6035 else if ((STMT_VINFO_TYPE (stmt_info
) == cycle_phi_info_type
6036 || STMT_VINFO_TYPE (stmt_info
) == induc_vec_info_type
6037 || STMT_VINFO_TYPE (stmt_info
) == phi_info_type
)
6038 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
6040 /* For PHI node vectorization we do not use the insertion iterator. */
6045 /* Emit other stmts after the children vectorized defs which is
6046 earliest possible. */
6047 gimple
*last_stmt
= NULL
;
6048 bool seen_vector_def
= false;
6049 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
6050 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
6052 /* For fold-left reductions we are retaining the scalar
6053 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
6054 set so the representation isn't perfect. Resort to the
6055 last scalar def here. */
6056 if (SLP_TREE_VEC_STMTS (child
).is_empty ())
6058 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child
))
6059 == cycle_phi_info_type
);
6060 gphi
*phi
= as_a
<gphi
*>
6061 (vect_find_last_scalar_stmt_in_slp (child
)->stmt
);
6063 || vect_stmt_dominates_stmt_p (last_stmt
, phi
))
6066 /* We are emitting all vectorized stmts in the same place and
6067 the last one is the last.
6068 ??? Unless we have a load permutation applied and that
6069 figures to re-use an earlier generated load. */
6072 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child
), j
, vstmt
)
6074 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
6077 else if (!SLP_TREE_VECTYPE (child
))
6079 /* For externals we use unvectorized at all scalar defs. */
6082 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child
), j
, def
)
6083 if (TREE_CODE (def
) == SSA_NAME
6084 && !SSA_NAME_IS_DEFAULT_DEF (def
))
6086 gimple
*stmt
= SSA_NAME_DEF_STMT (def
);
6088 || vect_stmt_dominates_stmt_p (last_stmt
, stmt
))
6094 /* For externals we have to look at all defs since their
6095 insertion place is decided per vector. But beware
6096 of pre-existing vectors where we need to make sure
6097 we do not insert before the region boundary. */
6098 if (SLP_TREE_SCALAR_OPS (child
).is_empty ()
6099 && !vinfo
->lookup_def (SLP_TREE_VEC_DEFS (child
)[0]))
6100 seen_vector_def
= true;
6105 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child
), j
, vdef
)
6106 if (TREE_CODE (vdef
) == SSA_NAME
6107 && !SSA_NAME_IS_DEFAULT_DEF (vdef
))
6109 gimple
*vstmt
= SSA_NAME_DEF_STMT (vdef
);
6111 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
6116 /* This can happen when all children are pre-existing vectors or
6119 last_stmt
= vect_find_first_scalar_stmt_in_slp (node
)->stmt
;
6122 gcc_assert (seen_vector_def
);
6123 si
= gsi_after_labels (as_a
<bb_vec_info
> (vinfo
)->bbs
[0]);
6125 else if (is_a
<gphi
*> (last_stmt
))
6126 si
= gsi_after_labels (gimple_bb (last_stmt
));
6129 si
= gsi_for_stmt (last_stmt
);
6134 bool done_p
= false;
6136 /* Handle purely internal nodes. */
6137 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
6139 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
6140 be shared with different SLP nodes (but usually it's the same
6141 operation apart from the case the stmt is only there for denoting
6142 the actual scalar lane defs ...). So do not call vect_transform_stmt
6143 but open-code it here (partly). */
6144 bool done
= vectorizable_slp_permutation (vinfo
, &si
, node
, NULL
);
6149 vect_transform_stmt (vinfo
, stmt_info
, &si
, node
, instance
);
6152 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
6153 For loop vectorization this is done in vectorizable_call, but for SLP
6154 it needs to be deferred until end of vect_schedule_slp, because multiple
6155 SLP instances may refer to the same scalar stmt. */
6158 vect_remove_slp_scalar_calls (vec_info
*vinfo
,
6159 slp_tree node
, hash_set
<slp_tree
> &visited
)
6162 gimple_stmt_iterator gsi
;
6166 stmt_vec_info stmt_info
;
6168 if (!node
|| SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
6171 if (visited
.add (node
))
6174 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
6175 vect_remove_slp_scalar_calls (vinfo
, child
, visited
);
6177 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
6179 gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
6180 if (!stmt
|| gimple_bb (stmt
) == NULL
)
6182 if (is_pattern_stmt_p (stmt_info
)
6183 || !PURE_SLP_STMT (stmt_info
))
6185 lhs
= gimple_call_lhs (stmt
);
6186 new_stmt
= gimple_build_assign (lhs
, build_zero_cst (TREE_TYPE (lhs
)));
6187 gsi
= gsi_for_stmt (stmt
);
6188 vinfo
->replace_stmt (&gsi
, stmt_info
, new_stmt
);
6189 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt
)) = new_stmt
;
6194 vect_remove_slp_scalar_calls (vec_info
*vinfo
, slp_tree node
)
6196 hash_set
<slp_tree
> visited
;
6197 vect_remove_slp_scalar_calls (vinfo
, node
, visited
);
6200 /* Vectorize the instance root. */
6203 vectorize_slp_instance_root_stmt (slp_tree node
, slp_instance instance
)
6205 gassign
*rstmt
= NULL
;
6207 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) == 1)
6212 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
6214 tree vect_lhs
= gimple_get_lhs (child_stmt
);
6215 tree root_lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
6216 if (!useless_type_conversion_p (TREE_TYPE (root_lhs
),
6217 TREE_TYPE (vect_lhs
)))
6218 vect_lhs
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (root_lhs
),
6220 rstmt
= gimple_build_assign (root_lhs
, vect_lhs
);
6224 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) > 1)
6226 int nelts
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
6229 vec
<constructor_elt
, va_gc
> *v
;
6230 vec_alloc (v
, nelts
);
6232 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
6234 CONSTRUCTOR_APPEND_ELT (v
,
6236 gimple_get_lhs (child_stmt
));
6238 tree lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
6239 tree rtype
= TREE_TYPE (gimple_assign_rhs1 (instance
->root_stmt
->stmt
));
6240 tree r_constructor
= build_constructor (rtype
, v
);
6241 rstmt
= gimple_build_assign (lhs
, r_constructor
);
6246 gimple_stmt_iterator rgsi
= gsi_for_stmt (instance
->root_stmt
->stmt
);
6247 gsi_replace (&rgsi
, rstmt
, true);
/* NOTE(review): corrupted extraction -- statements split across lines,
   original line numbers fused into the text, and key lines (the DFS
   counter updates, loop conditions around `todo`, several braces)
   missing.  Not compilable as-is; restore from the upstream revision.
   Purpose (from the surviving comments below): Tarjan-style DFS over the
   SLP graph collecting SCCs (tracking dfs/lowlink/on_stack in SCC_INFO),
   scheduling each node via vect_schedule_slp_node -- trivial SCCs
   directly, cyclic SCCs by iterating over the stacked members until all
   are ready -- and finally filling in the backedge PHI arguments of
   vectorized PHIs.  */
6257 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
6260 vect_schedule_scc (vec_info
*vinfo
, slp_tree node
, slp_instance instance
,
6261 hash_map
<slp_tree
, slp_scc_info
> &scc_info
,
6262 int &maxdfs
, vec
<slp_tree
> &stack
)
6265 slp_scc_info
*info
= &scc_info
.get_or_insert (node
, &existed_p
);
6266 gcc_assert (!existed_p
);
6268 info
->lowlink
= maxdfs
;
6272 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
6274 info
->on_stack
= false;
6275 vect_schedule_slp_node (vinfo
, node
, instance
);
6279 info
->on_stack
= true;
6280 stack
.safe_push (node
);
6285 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
6289 slp_scc_info
*child_info
= scc_info
.get (child
);
6292 vect_schedule_scc (vinfo
, child
, instance
, scc_info
, maxdfs
, stack
);
6293 /* Recursion might have re-allocated the node. */
6294 info
= scc_info
.get (node
);
6295 child_info
= scc_info
.get (child
);
6296 info
->lowlink
= MIN (info
->lowlink
, child_info
->lowlink
);
6298 else if (child_info
->on_stack
)
6299 info
->lowlink
= MIN (info
->lowlink
, child_info
->dfs
);
6301 if (info
->lowlink
!= info
->dfs
)
6304 auto_vec
<slp_tree
, 4> phis_to_fixup
;
6307 if (stack
.last () == node
)
6310 info
->on_stack
= false;
6311 vect_schedule_slp_node (vinfo
, node
, instance
);
6312 if (SLP_TREE_CODE (node
) != VEC_PERM_EXPR
6313 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (node
)->stmt
))
6314 phis_to_fixup
.quick_push (node
);
6319 int last_idx
= stack
.length () - 1;
6320 while (stack
[last_idx
] != node
)
6322 /* We can break the cycle at PHIs who have at least one child
6323 code generated. Then we could re-start the DFS walk until
6324 all nodes in the SCC are covered (we might have new entries
6325 for only back-reachable nodes). But it's simpler to just
6326 iterate and schedule those that are ready. */
6327 unsigned todo
= stack
.length () - last_idx
;
6330 for (int idx
= stack
.length () - 1; idx
>= last_idx
; --idx
)
6332 slp_tree entry
= stack
[idx
];
6335 bool phi
= (SLP_TREE_CODE (entry
) != VEC_PERM_EXPR
6336 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (entry
)->stmt
));
6338 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry
), i
, child
)
6345 else if (scc_info
.get (child
)->on_stack
)
6363 vect_schedule_slp_node (vinfo
, entry
, instance
);
6364 scc_info
.get (entry
)->on_stack
= false;
6368 phis_to_fixup
.safe_push (entry
);
6375 stack
.truncate (last_idx
);
6378 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
6380 FOR_EACH_VEC_ELT (phis_to_fixup
, i
, phi_node
)
6382 gphi
*phi
= as_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (phi_node
)->stmt
);
6385 FOR_EACH_EDGE (e
, ei
, gimple_bb (phi
)->preds
)
6387 unsigned dest_idx
= e
->dest_idx
;
6388 child
= SLP_TREE_CHILDREN (phi_node
)[dest_idx
];
6389 if (!child
|| SLP_TREE_DEF_TYPE (child
) != vect_internal_def
)
6391 /* Simply fill all args. */
6392 for (unsigned i
= 0; i
< SLP_TREE_VEC_STMTS (phi_node
).length (); ++i
)
6393 add_phi_arg (as_a
<gphi
*> (SLP_TREE_VEC_STMTS (phi_node
)[i
]),
6394 vect_get_slp_vect_def (child
, i
),
6395 e
, gimple_phi_arg_location (phi
, dest_idx
));
6400 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
6403 vect_schedule_slp (vec_info
*vinfo
, vec
<slp_instance
> slp_instances
)
6405 slp_instance instance
;
6408 hash_map
<slp_tree
, slp_scc_info
> scc_info
;
6410 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
6412 slp_tree node
= SLP_INSTANCE_TREE (instance
);
6413 if (dump_enabled_p ())
6415 dump_printf_loc (MSG_NOTE
, vect_location
,
6416 "Vectorizing SLP tree:\n");
6417 if (SLP_INSTANCE_ROOT_STMT (instance
))
6418 dump_printf_loc (MSG_NOTE
, vect_location
, "Root stmt: %G",
6419 SLP_INSTANCE_ROOT_STMT (instance
)->stmt
);
6420 vect_print_slp_graph (MSG_NOTE
, vect_location
,
6421 SLP_INSTANCE_TREE (instance
));
6423 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
6424 have a PHI be the node breaking the cycle. */
6425 auto_vec
<slp_tree
> stack
;
6426 if (!scc_info
.get (node
))
6427 vect_schedule_scc (vinfo
, node
, instance
, scc_info
, maxdfs
, stack
);
6429 if (SLP_INSTANCE_ROOT_STMT (instance
))
6430 vectorize_slp_instance_root_stmt (node
, instance
);
6432 if (dump_enabled_p ())
6433 dump_printf_loc (MSG_NOTE
, vect_location
,
6434 "vectorizing stmts using SLP.\n");
6437 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
6439 slp_tree root
= SLP_INSTANCE_TREE (instance
);
6440 stmt_vec_info store_info
;
6443 /* Remove scalar call stmts. Do not do this for basic-block
6444 vectorization as not all uses may be vectorized.
6445 ??? Why should this be necessary? DCE should be able to
6446 remove the stmts itself.
6447 ??? For BB vectorization we can as well remove scalar
6448 stmts starting from the SLP tree root if they have no
6450 if (is_a
<loop_vec_info
> (vinfo
))
6451 vect_remove_slp_scalar_calls (vinfo
, root
);
6453 /* Remove vectorized stores original scalar stmts. */
6454 for (j
= 0; SLP_TREE_SCALAR_STMTS (root
).iterate (j
, &store_info
); j
++)
6456 if (!STMT_VINFO_DATA_REF (store_info
)
6457 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info
)))
6460 store_info
= vect_orig_stmt (store_info
);
6461 /* Free the attached stmt_vec_info and remove the stmt. */
6462 vinfo
->remove_stmt (store_info
);