/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2021 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
24 #include "coretypes.h"
30 #include "tree-pass.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
51 #include "alloc-pool.h"
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
					  slp_tree, stmt_vector_for_cost *);
static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);

static object_allocator<_slp_tree> *slp_tree_pool;
static slp_tree slp_first_node;
void
vect_slp_init (void)
{
  slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
}

void
vect_slp_fini (void)
{
  while (slp_first_node)
    delete slp_first_node;
  delete slp_tree_pool;
  slp_tree_pool = NULL;
}

void *
_slp_tree::operator new (size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  return slp_tree_pool->allocate_raw ();
}

void
_slp_tree::operator delete (void *node, size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  slp_tree_pool->remove_raw (node);
}
/* Initialize a SLP node.  */

_slp_tree::_slp_tree ()
{
  /* Queue the node into the list of all nodes.  */
  this->prev_node = NULL;
  if (slp_first_node)
    slp_first_node->prev_node = this;
  this->next_node = slp_first_node;
  slp_first_node = this;
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_STMTS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  SLP_TREE_REF_COUNT (this) = 1;
  this->failed = NULL;
  this->max_nunits = 1;
  this->lanes = 0;
}
/* Tear down a SLP node.  */

_slp_tree::~_slp_tree ()
{
  /* Unlink the node from the list of all nodes.  */
  if (this->prev_node)
    this->prev_node->next_node = this->next_node;
  else
    slp_first_node = this->next_node;
  if (this->next_node)
    this->next_node->prev_node = this->prev_node;
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_STMTS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
}
/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_free_slp_tree (child);

  /* If the node defines any SLP only patterns then those patterns are no
     longer valid and should be removed.  */
  stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
  if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
    {
      stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
      STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
      STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
    }

  delete node;
}
/* Return a location suitable for dumps related to the SLP instance.  */

dump_user_location_t
_slp_instance::location () const
{
  if (!root_stmts.is_empty ())
    return root_stmts[0]->stmt;
  else
    return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
}
/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_ROOT_STMTS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (unsigned nops, tree_code code)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_CODE (node) = code;
  return node;
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node,
			  vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();
  return node;
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
}
/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node, vec<tree> ops)
{
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}
/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  return vect_create_new_slp_node (new _slp_tree, ops);
}
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} slp_oprnd_info;
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}
/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}
/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}
/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
    return false;

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}
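/* Editor's note: a worked example, not part of the original source.
   For a constant node with SLP_TREE_SCALAR_OPS {3, 3, 3, 3} the function
   above returns true and the node can be code-generated as a single
   vector splat; for {3, 1, 4, 1} it returns false and an element-wise
   constructor is required.  */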
/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
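/* Editor's note: a worked example, not part of the original source.
   For the grouped loads a[0], a[1], a[3] the interleaving chain is
   a[0] -> a[1] -> a[3], with DR_GROUP_GAP of 1 for a[1] and 2 for a[3]
   (it sits two elements after a[1]).  Summing the gaps while walking
   the chain yields the places 0, 1 and 3 respectively.  */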
/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode)))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}
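/* Editor's note: a worked example, not part of the original source.
   For 8-element vectors the two selectors built above encode the usual
   interleave permutes: SEL1 pushes { 0, 8, 1, 9, 2, 10, ... }, the
   interleaved low halves of the two inputs, and SEL2 pushes
   { 4, 12, 5, 13, 6, 14, ... }, the interleaved high halves.  The
   vec_perm_builder (nelts, 2, 3) encoding needs only the three leading
   pairs; the remainder of each series is implicit.  */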
/* Return true if DTA and DTB match.  */

static bool
vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
{
  return (dta == dtb
	  || ((dta == vect_external_def || dta == vect_constant_def)
	      && (dtb == vect_external_def || dtb == vect_constant_def)));
}
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
   If there is any operand swap in this function, *SWAP is set to a non-zero
   value.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the father node of this one, return 1; if everything
   is ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
			     bool *skip_args,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);

	  /* Masked load, only look at mask.  */
	  if (ifn == IFN_MASK_LOAD)
	    {
	      number_of_oprnds = 1;
	      /* Mask operand index.  */
	      first_op_idx = 5;
	    }
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
	 could result in different comparison code to the first stmt.  */
      if (code == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    number_of_oprnds = gimple_phi_num_args (stmt);
  else
    return -1;

  bool swapped = (swap != 0);
  bool backedge = false;
  gcc_assert (!swapped || first_op_cond);
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  for (i = 0; i < number_of_oprnds; i++)
    {
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
					     first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt_info->stmt, map[i]);
	}
      else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
	{
	  oprnd = gimple_phi_arg_def (stmt, i);
	  backedge = dominated_by_p (CDI_DOMINATORS,
				     gimple_phi_arg_edge (stmt, i)->src,
				     gimple_bb (stmt_info->stmt));
	}
      else
	oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (skip_args[i])
	{
	  oprnd_info->def_stmts.quick_push (NULL);
	  oprnd_info->ops.quick_push (NULL_TREE);
	  oprnd_info->first_dt = vect_uninitialized_def;
	  continue;
	}

      dt = dts[i];
      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

      if (def_stmt_info
	  && is_pattern_stmt_p (def_stmt_info))
	{
	  if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
	      != def_stmt_info)
	    oprnd_info->any_pattern = true;
	  else
	    /* If we promote this to external use the original stmt def.  */
	    oprnd_info->ops.last ()
	      = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
	}

      /* If there's an extern def on a backedge make sure we can
	 code-generate at the region start.
	 ??? This is another case that could be fixed by adjusting
	 how we split the function but at the moment we'd have conflicting
	 goals there.  */
      if (backedge
	  && dts[i] == vect_external_def
	  && is_a <bb_vec_info> (vinfo)
	  && TREE_CODE (oprnd) == SSA_NAME
	  && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
	  && !dominated_by_p (CDI_DOMINATORS,
			      as_a <bb_vec_info> (vinfo)->bbs[0],
			      gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: extern def %T only defined "
			     "on backedge\n", oprnd);
	  return -1;
	}

      if (first)
	{
	  tree type = TREE_TYPE (oprnd);
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
						      type)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }

	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in a SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dts[i] = dt = vect_reduction_def;

	  /* Check the types of the definition.  */
	  switch (dt)
	    {
	    case vect_external_def:
	    case vect_constant_def:
	    case vect_internal_def:
	    case vect_reduction_def:
	    case vect_induction_def:
	    case vect_nested_cycle:
	      break;

	    default:
	      /* FORNOW: Not supported.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: illegal type of def %T\n",
				 oprnd);
	      return -1;
	    }

	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = type;
	}
    }
  if (first)
    return 0;

  /* Now match the operand definition types to that of the first stmt.  */
  for (i = 0; i < number_of_oprnds;)
    {
      if (skip_args[i])
	{
	  ++i;
	  continue;
	}

      oprnd_info = (*oprnds_info)[i];
      dt = dts[i];
      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: different operand types\n");
	  return 1;
	}

      /* Not first stmt of the group, check that the def-stmt/s match
	 the def-stmt/s of the first stmt.  Allow different definition
	 types for reduction chains: the first stmt must be a
	 vect_reduction_def (a phi node), and the rest
	 end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
	   && !(oprnd_info->first_dt == vect_reduction_def
		&& !STMT_VINFO_DATA_REF (stmt_info)
		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		&& def_stmt_info
		&& !STMT_VINFO_DATA_REF (def_stmt_info)
		&& (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
	  || (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && ((!def_stmt_info
		   || STMT_VINFO_DATA_REF (def_stmt_info)
		   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		  != (oprnd_info->first_dt != vect_reduction_def))))
	{
	  /* Try swapping operands if we got a mismatch.  For BB
	     vectorization only in case it will clearly improve things.  */
	  if (i == commutative_op && !swapped
	      && (!is_a <bb_vec_info> (vinfo)
		  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
					     dts[i+1])
		      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
			  || vect_def_types_match
			       ((*oprnds_info)[i+1]->first_dt, dts[i])))))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "trying swapped operands\n");
	      std::swap (dts[i], dts[i+1]);
	      std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
			 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
	      std::swap ((*oprnds_info)[i]->ops[stmt_num],
			 (*oprnds_info)[i+1]->ops[stmt_num]);
	      swapped = true;
	      continue;
	    }

	  if (is_a <bb_vec_info> (vinfo)
	      && !oprnd_info->any_pattern)
	    {
	      /* Now for commutative ops we should see whether we can
		 make the other operand matching.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "treating operand as external\n");
	      oprnd_info->first_dt = dt = vect_external_def;
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");
	      return 1;
	    }
	}

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
	oprnd_info->first_dt = vect_external_def;
      /* For a SLP reduction chain we want to duplicate the reduction to
	 each of the chain members.  That gets us a sane SLP graph (still
	 the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
		|| dt == vect_reduction_def)
	       && oprnd_info->first_dt == vect_reduction_def
	       && !STMT_VINFO_DATA_REF (stmt_info)
	       && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	       && !STMT_VINFO_DATA_REF (def_stmt_info)
	       && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	{
	  oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
	  oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
	}

      ++i;
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  return 0;
}
/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
   Return true if we can, meaning that this choice doesn't conflict with
   existing SLP nodes that use STMT_INFO.  */

static bool
vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
{
  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
  if (old_vectype)
    return useless_type_conversion_p (vectype, old_vectype);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* We maintain the invariant that if any statement in the group is
	 used, all other members of the group have the same vector type.  */
      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      stmt_vec_info member_info = first_info;
      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	if (is_pattern_stmt_p (member_info)
	    && !useless_type_conversion_p (vectype,
					   STMT_VINFO_VECTYPE (member_info)))
	  break;
      if (!member_info)
	{
	  for (member_info = first_info; member_info;
	       member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	    STMT_VINFO_VECTYPE (member_info) = vectype;
	  return true;
	}
    }
  else if (!is_pattern_stmt_p (stmt_info))
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype;
      return true;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "Build SLP failed: incompatible vector"
		       " types for: %G", stmt_info->stmt);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  old vector type: %T\n", old_vectype);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  new vector type: %T\n", vectype);
    }
  return false;
}
/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }
  return true;
}
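/* Editor's note: an illustrative example, not part of the original
   source.  A group of calls x0 = sqrtf (a0); x1 = sqrtf (a1); passes
   this check (same combined_fn, compatible types), while mixing sqrtf
   with sqrt, or two indirect calls through different function
   pointers, fails it.  */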
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  if (is_a <bb_vec_info> (vinfo)
      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}
/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_load_p = false, load_p = false;
  bool first_stmt_phi_p = false, phi_p = false;
  bool maybe_soft_fail = false;
  tree soft_fail_nunits_vectype = NULL_TREE;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, throwing
	 or volatile.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
	  || stmt_can_throw_internal (cfun, stmt)
	  || gimple_has_volatile_ops (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* ??? For BB vectorization we want to commutate operands in a way
	     to shuffle all unvectorizable defs into one operand and have
	     the other still vectorized.  The following doesn't reliably
	     work for this though but it's the easiest we can do here.  */
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
					   &nunits_vectype, group_size))
	{
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}
      /* Record nunits required but continue analysis, producing matches[]
	 as if nunits was not an issue.  This allows splitting of groups
	 to happen.  */
      if (nunits_vectype
	  && !vect_record_max_nunits (vinfo, stmt_info, group_size,
				      nunits_vectype, max_nunits))
	{
	  gcc_assert (is_a <bb_vec_info> (vinfo));
	  maybe_soft_fail = true;
	  soft_fail_nunits_vectype = nunits_vectype;
	}

      gcc_assert (vectype);

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
	{
	  rhs_code = CALL_EXPR;

	  if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
	    load_p = true;
	  else if ((gimple_call_internal_p (call_stmt)
		    && (!vectorizable_internal_fn_p
			  (gimple_call_internal_fn (call_stmt))))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || !gimple_call_nothrow_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 call_stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  rhs_code = ERROR_MARK;
	  phi_p = true;
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  load_p = gimple_vuse (stmt);
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  *node_vectype = vectype;
	  first_stmt_code = rhs_code;
	  first_stmt_load_p = load_p;
	  first_stmt_phi_p = phi_p;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype, optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: no optab.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (!load_p
		   && rhs_code == BIT_FIELD_REF)
	    {
	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
	      if (!is_a <bb_vec_info> (vinfo)
		  || TREE_CODE (vec) != SSA_NAME
		  || !operand_equal_p (TYPE_SIZE (vectype),
				       TYPE_SIZE (TREE_TYPE (vec))))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: "
				     "BIT_FIELD_REF not supported\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}
	    }
	  else if (call_stmt
		   && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if ((first_stmt_code != rhs_code
	       && (first_stmt_code != IMAGPART_EXPR
		   || rhs_code != REALPART_EXPR)
	       && (first_stmt_code != REALPART_EXPR
		   || rhs_code != IMAGPART_EXPR)
	       /* Handle mismatches in plus/minus by computing both
		  and merging the results.  */
	       && !((first_stmt_code == PLUS_EXPR
		     || first_stmt_code == MINUS_EXPR)
		    && (alt_stmt_code == PLUS_EXPR
			|| alt_stmt_code == MINUS_EXPR)
		    && rhs_code == alt_stmt_code)
	       && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		    && (first_stmt_code == ARRAY_REF
			|| first_stmt_code == BIT_FIELD_REF
			|| first_stmt_code == INDIRECT_REF
			|| first_stmt_code == COMPONENT_REF
			|| first_stmt_code == MEM_REF)))
	      || first_stmt_load_p != load_p
	      || first_stmt_phi_p != phi_p)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (!load_p
	      && first_stmt_code == BIT_FIELD_REF
	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BIT_FIELD_REF "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (!load_p && rhs_code == CALL_EXPR)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       as_a <gcall *> (stmt)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
	      && (gimple_bb (first_stmt_info->stmt)
		  != gimple_bb (stmt_info->stmt)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BB for PHI "
				 "or possibly trapping operation in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = (call_stmt
				? gimple_call_arg (call_stmt, 1)
				: gimple_assign_rhs2 (stmt));
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (!types_compatible_p (vectype, *node_vectype))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different vector type "
				 "in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (load_p)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      /* FORNOW: Not grouped loads are not supported.  */
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (!phi_p
	      && TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR
	      && rhs_code != BIT_FIELD_REF)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    *two_operators = true;

  if (maybe_soft_fail)
    {
      unsigned HOST_WIDE_INT const_nunits;
      if (!TYPE_VECTOR_SUBPARTS
	    (soft_fail_nunits_vectype).is_constant (&const_nunits)
	  || const_nunits > group_size)
	matches[0] = false;
      else
	{
	  /* With constant vector elements simulate a mismatch at the
	     point we need to split.  */
	  unsigned tail = group_size & (const_nunits - 1);
	  memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
	}
      return false;
    }

  return true;
}
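/* Editor's note: a worked example, not part of the original source.
   The group
     c[0] = a[0] + b[0];
     c[1] = a[1] - b[1];
   is accepted above with *TWO_OPERATORS set: both the PLUS_EXPR and
   the MINUS_EXPR variant are code-generated for the whole vector and
   the wanted lanes are blended with a VEC_PERM_EXPR afterwards.  */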
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}
/* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
   but then vec::insert does memmove and that's not compatible with
   std::pair.  */
struct chain_op_t
{
  chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
      : code (code_), dt (dt_), op (op_) {}
  tree_code code;
  vect_def_type dt;
  tree op;
};
/* Comparator for sorting associatable chains.  */

static int
dt_sort_cmp (const void *op1_, const void *op2_, void *)
{
  auto *op1 = (const chain_op_t *) op1_;
  auto *op2 = (const chain_op_t *) op2_;
  if (op1->dt != op2->dt)
    return (int)op1->dt - (int)op2->dt;
  return (int)op1->code - (int)op2->code;
}
/* Linearize the associatable expression chain at START with the
   associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
   filling CHAIN with the result and using WORKLIST as intermediate storage.
   CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
   or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
   stmts, starting with START.  */

static void
vect_slp_linearize_chain (vec_info *vinfo,
			  vec<std::pair<tree_code, gimple *> > &worklist,
			  vec<chain_op_t> &chain,
			  enum tree_code code, gimple *start,
			  gimple *&code_stmt, gimple *&alt_code_stmt,
			  vec<gimple *> *chain_stmts)
{
  /* For each lane linearize the addition/subtraction (or other
     uniform associatable operation) expression tree.  */
  worklist.safe_push (std::make_pair (code, start));
  while (!worklist.is_empty ())
    {
      auto entry = worklist.pop ();
      gassign *stmt = as_a <gassign *> (entry.second);
      enum tree_code in_code = entry.first;
      enum tree_code this_code = gimple_assign_rhs_code (stmt);
      /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
      if (!code_stmt
	  && gimple_assign_rhs_code (stmt) == code)
	code_stmt = stmt;
      else if (!alt_code_stmt
	       && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
	alt_code_stmt = stmt;
      if (chain_stmts)
	chain_stmts->safe_push (stmt);
      for (unsigned opnum = 1; opnum <= 2; ++opnum)
	{
	  tree op = gimple_op (stmt, opnum);
	  vect_def_type dt;
	  stmt_vec_info def_stmt_info;
	  bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
	  gcc_assert (res);
	  if (dt == vect_internal_def
	      && is_pattern_stmt_p (def_stmt_info))
	    op = gimple_get_lhs (def_stmt_info->stmt);
	  gimple *use_stmt;
	  use_operand_p use_p;
	  if (dt == vect_internal_def
	      && single_imm_use (op, &use_p, &use_stmt)
	      && is_gimple_assign (def_stmt_info->stmt)
	      && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
		  || (code == PLUS_EXPR
		      && (gimple_assign_rhs_code (def_stmt_info->stmt)
			  == MINUS_EXPR))))
	    {
	      /* The def continues the chain; queue it with the sign
		 adjusted for the position it is used in.  */
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      worklist.safe_push (std::make_pair (op_def_code,
						  def_stmt_info->stmt));
	    }
	  else
	    {
	      /* A chain leaf; record it with its effective sign.  */
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      chain.safe_push (chain_op_t (op_def_code, dt, op));
	    }
	}
    }
}
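/* Editor's note: a worked example, not part of the original source.
   Linearizing
     t = a - b;
     x = t + c;
   with CODE == PLUS_EXPR yields the chain
     { (PLUS_EXPR, c), (PLUS_EXPR, a), (MINUS_EXPR, b) }
   since the subtraction feeding T contributes A positively and B
   negatively.  Signs flip again under the second operand of a
   subtraction: x = c - (a - b) linearizes to
     { (PLUS_EXPR, c), (MINUS_EXPR, a), (PLUS_EXPR, b) }.  */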
typedef hash_map <vec <stmt_vec_info>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *limit, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);
static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *limit, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 !(*leader)->failed ? "" : "failed ", *leader);
      if (!(*leader)->failed)
	{
	  SLP_TREE_REF_COUNT (*leader)++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	  return *leader;
	}
      memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
      return NULL;
    }

  /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
     so we can pick up backedge destinations during discovery.  */
  slp_tree res = new _slp_tree;
  SLP_TREE_DEF_TYPE (res) = vect_internal_def;
  SLP_TREE_SCALAR_STMTS (res) = stmts;
  bst_map->put (stmts.copy (), res);

  if (*limit == 0)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery limit exceeded\n");
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
      memset (res->failed, 0, sizeof (bool) * group_size);
      memset (matches, 0, sizeof (bool) * group_size);
      return NULL;
    }
  --*limit;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "starting SLP discovery for node %p\n", res);

  poly_uint64 this_max_nunits = 1;
  slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
					 &this_max_nunits,
					 matches, limit, tree_size, bst_map);
  if (!res_)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p failed\n", res);
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
      if (flag_checking)
	{
	  unsigned i;
	  for (i = 0; i < group_size; ++i)
	    if (!matches[i])
	      break;
	  gcc_assert (i < group_size);
	}
      memcpy (res->failed, matches, sizeof (bool) * group_size);
    }
  else
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p succeeded\n", res);
      gcc_assert (res_ == res);
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
    }
  return res_;
}
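/* Editor's note: an illustrative example, not part of the original
   source.  If the same stmt set is reached twice during discovery,
   e.g. as a shared subexpression of two SLP roots, the second call to
   vect_build_slp_tree finds the node in BST_MAP, increments
   SLP_TREE_REF_COUNT and returns the shared node instead of repeating
   discovery; the stub seeded above is likewise what a PHI backedge
   reaching the same stmt set picks up, terminating the recursion.  */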
/* Helper for building an associated SLP node chain.  */

static void
vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
				   slp_tree op0, slp_tree op1,
				   stmt_vec_info oper1, stmt_vec_info oper2,
				   vec<std::pair<unsigned, unsigned> > lperm)
{
  unsigned group_size = SLP_TREE_LANES (op1);

  slp_tree child1 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
  SLP_TREE_VECTYPE (child1) = vectype;
  SLP_TREE_LANES (child1) = group_size;
  SLP_TREE_CHILDREN (child1).create (2);
  SLP_TREE_CHILDREN (child1).quick_push (op0);
  SLP_TREE_CHILDREN (child1).quick_push (op1);
  SLP_TREE_REPRESENTATIVE (child1) = oper1;

  slp_tree child2 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
  SLP_TREE_VECTYPE (child2) = vectype;
  SLP_TREE_LANES (child2) = group_size;
  SLP_TREE_CHILDREN (child2).create (2);
  SLP_TREE_CHILDREN (child2).quick_push (op0);
  SLP_TREE_REF_COUNT (op0)++;
  SLP_TREE_CHILDREN (child2).quick_push (op1);
  SLP_TREE_REF_COUNT (op1)++;
  SLP_TREE_REPRESENTATIVE (child2) = oper2;

  SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
  SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
  SLP_TREE_VECTYPE (perm) = vectype;
  SLP_TREE_LANES (perm) = group_size;
  /* ??? We should set this NULL but that's not expected.  */
  SLP_TREE_REPRESENTATIVE (perm) = oper1;
  SLP_TREE_LANE_PERMUTATION (perm) = lperm;
  SLP_TREE_CHILDREN (perm).quick_push (child1);
  SLP_TREE_CHILDREN (perm).quick_push (child2);
}
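/* Editor's note: a worked example, not part of the original source.
   For the lanes { a0 + b0, a1 - b1 } (OPER1 the PLUS stmt, OPER2 the
   MINUS stmt) the helper above builds CHILD1 computing { a0 + b0,
   a1 + b1 } and CHILD2 computing { a0 - b0, a1 - b1 } from the same
   operands, and PERM blends lane 0 of CHILD1 with lane 1 of CHILD2 via
   SLP_TREE_LANE_PERMUTATION { (0, 0), (1, 1) }.  */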
1660 /* Recursively build an SLP tree starting from NODE.
1661 Fail (and return a value not equal to zero) if def-stmts are not
1662 isomorphic, require data permutation or are of unsupported types of
1663 operation. Otherwise, return 0.
1664 The value returned is the depth in the SLP tree where a mismatch
1668 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1669 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1670 poly_uint64
*max_nunits
,
1671 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1672 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1674 unsigned nops
, i
, this_tree_size
= 0;
1675 poly_uint64 this_max_nunits
= *max_nunits
;
1679 stmt_vec_info stmt_info
= stmts
[0];
1680 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1681 nops
= gimple_call_num_args (stmt
);
1682 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
1684 nops
= gimple_num_ops (stmt
) - 1;
1685 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
1688 else if (gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1689 nops
= gimple_phi_num_args (phi
);
1693 /* If the SLP node is a PHI (induction or reduction), terminate
1695 bool *skip_args
= XALLOCAVEC (bool, nops
);
1696 memset (skip_args
, 0, sizeof (bool) * nops
);
1697 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
1698 if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1700 tree scalar_type
= TREE_TYPE (PHI_RESULT (stmt
));
1701 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
1703 if (!vect_record_max_nunits (vinfo
, stmt_info
, group_size
, vectype
,
1707 vect_def_type def_type
= STMT_VINFO_DEF_TYPE (stmt_info
);
1708 if (def_type
== vect_induction_def
)
1710 /* Induction PHIs are not cycles but walk the initial
1711 value. Only for inner loops through, for outer loops
1712 we need to pick up the value from the actual PHIs
1713 to more easily support peeling and epilogue vectorization. */
1714 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1715 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1716 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1719 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1721 else if (def_type
== vect_reduction_def
1722 || def_type
== vect_double_reduction_def
1723 || def_type
== vect_nested_cycle
)
1725 /* Else def types have to match. */
1726 stmt_vec_info other_info
;
1727 bool all_same
= true;
1728 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1730 if (STMT_VINFO_DEF_TYPE (other_info
) != def_type
)
1732 if (other_info
!= stmt_info
)
1735 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1736 /* Reduction initial values are not explicitely represented. */
1737 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1738 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1739 /* Reduction chain backedge defs are filled manually.
1740 ??? Need a better way to identify a SLP reduction chain PHI.
1741 Or a better overall way to SLP match those. */
1742 if (all_same
&& def_type
== vect_reduction_def
)
1743 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1745 else if (def_type
!= vect_internal_def
)
1750 bool two_operators
= false;
1751 unsigned char *swap
= XALLOCAVEC (unsigned char, group_size
);
1752 tree vectype
= NULL_TREE
;
1753 if (!vect_build_slp_tree_1 (vinfo
, swap
, stmts
, group_size
,
1754 &this_max_nunits
, matches
, &two_operators
,
1758 /* If the SLP node is a load, terminate the recursion unless masked. */
1759 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1760 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1762 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1765 gcc_assert (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
));
1770 *max_nunits
= this_max_nunits
;
1772 node
= vect_create_new_slp_node (node
, stmts
, 0);
1773 SLP_TREE_VECTYPE (node
) = vectype
;
1774 /* And compute the load permutation. Whether it is actually
1775 a permutation depends on the unrolling factor which is
1777 vec
<unsigned> load_permutation
;
1779 stmt_vec_info load_info
;
1780 load_permutation
.create (group_size
);
1781 stmt_vec_info first_stmt_info
1782 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
1783 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
1785 int load_place
= vect_get_place_in_interleaving_chain
1786 (load_info
, first_stmt_info
);
1787 gcc_assert (load_place
!= -1);
1788 load_permutation
.safe_push (load_place
);
1790 SLP_TREE_LOAD_PERMUTATION (node
) = load_permutation
;
1794 else if (gimple_assign_single_p (stmt_info
->stmt
)
1795 && !gimple_vuse (stmt_info
->stmt
)
1796 && gimple_assign_rhs_code (stmt_info
->stmt
) == BIT_FIELD_REF
)
1798 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1799 the same SSA name vector of a compatible type to vectype. */
1800 vec
<std::pair
<unsigned, unsigned> > lperm
= vNULL
;
1801 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0);
1802 stmt_vec_info estmt_info
;
1803 FOR_EACH_VEC_ELT (stmts
, i
, estmt_info
)
1805 gassign
*estmt
= as_a
<gassign
*> (estmt_info
->stmt
);
1806 tree bfref
= gimple_assign_rhs1 (estmt
);
1808 if (!known_eq (bit_field_size (bfref
),
1809 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype
))))
1810 || !constant_multiple_p (bit_field_offset (bfref
),
1811 bit_field_size (bfref
), &lane
))
1816 lperm
.safe_push (std::make_pair (0, (unsigned)lane
));
1818 slp_tree vnode
= vect_create_new_slp_node (vNULL
);
1819 /* ??? We record vectype here but we hide eventually necessary
1820 punning and instead rely on code generation to materialize
1821 VIEW_CONVERT_EXPRs as necessary. We instead should make
1822 this explicit somehow. */
1823 SLP_TREE_VECTYPE (vnode
) = vectype
;
1824 SLP_TREE_VEC_DEFS (vnode
).safe_push (vec
);
1825 /* We are always building a permutation node even if it is an identity
1826 permute to shield the rest of the vectorizer from the odd node
1827 representing an actual vector without any scalar ops.
1828 ??? We could hide it completely with making the permute node
1830 node
= vect_create_new_slp_node (node
, stmts
, 1);
1831 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1832 SLP_TREE_LANE_PERMUTATION (node
) = lperm
;
1833 SLP_TREE_VECTYPE (node
) = vectype
;
1834 SLP_TREE_CHILDREN (node
).quick_push (vnode
);
1837 /* When discovery reaches an associatable operation see whether we can
1838 improve that to match up lanes in a way superior to the operand
1839 swapping code which at most looks at two defs.
1840 ??? For BB vectorization we cannot do the brute-force search
1841 for matching as we can succeed by means of builds from scalars
1842 and have no good way to "cost" one build against another. */
1843 else if (is_a
<loop_vec_info
> (vinfo
)
1844 /* ??? We don't handle !vect_internal_def defs below. */
1845 && STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
1846 && is_gimple_assign (stmt_info
->stmt
)
1847 && (associative_tree_code (gimple_assign_rhs_code (stmt_info
->stmt
))
1848 || gimple_assign_rhs_code (stmt_info
->stmt
) == MINUS_EXPR
)
1849 && ((FLOAT_TYPE_P (vectype
) && flag_associative_math
)
1850 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype
))
1851 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype
)))))
1853 /* See if we have a chain of (mixed) adds or subtracts or other
1854 associatable ops. */
1855 enum tree_code code
= gimple_assign_rhs_code (stmt_info
->stmt
);
1856 if (code
== MINUS_EXPR
)
1858 stmt_vec_info other_op_stmt_info
= NULL
;
1859 stmt_vec_info op_stmt_info
= NULL
;
1860 unsigned chain_len
= 0;
1861 auto_vec
<chain_op_t
> chain
;
1862 auto_vec
<std::pair
<tree_code
, gimple
*> > worklist
;
1863 auto_vec
<vec
<chain_op_t
> > chains (group_size
);
1864 auto_vec
<slp_tree
, 4> children
;
1865 bool hard_fail
= true;
1866 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
1868 /* For each lane linearize the addition/subtraction (or other
1869 uniform associatable operation) expression tree. */
1870 gimple
*op_stmt
= NULL
, *other_op_stmt
= NULL
;
1871 vect_slp_linearize_chain (vinfo
, worklist
, chain
, code
,
1872 stmts
[lane
]->stmt
, op_stmt
, other_op_stmt
,
1874 if (!op_stmt_info
&& op_stmt
)
1875 op_stmt_info
= vinfo
->lookup_stmt (op_stmt
);
1876 if (!other_op_stmt_info
&& other_op_stmt
)
1877 other_op_stmt_info
= vinfo
->lookup_stmt (other_op_stmt
);
1878 if (chain
.length () == 2)
1880 /* In a chain of just two elements resort to the regular
1881 operand swapping scheme. If we run into a length
1882 mismatch still hard-FAIL. */
1887 matches
[lane
] = false;
1888 /* ??? We might want to process the other lanes, but
1889 make sure to not give false matching hints to the
1890 caller for lanes we did not process. */
1891 if (lane
!= group_size
- 1)
1896 else if (chain_len
== 0)
1897 chain_len
= chain
.length ();
1898 else if (chain
.length () != chain_len
)
1900 /* ??? Here we could slip in magic to compensate with
1901 neutral operands. */
1902 matches
[lane
] = false;
1903 if (lane
!= group_size
- 1)
1907 chains
.quick_push (chain
.copy ());
1910 if (chains
.length () == group_size
)
1912 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
1918 /* Now we have a set of chains with the same length. */
1919 /* 1. pre-sort according to def_type and operation. */
1920 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
1921 chains
[lane
].stablesort (dt_sort_cmp
, vinfo
);
1922 if (dump_enabled_p ())
1924 dump_printf_loc (MSG_NOTE
, vect_location
,
1925 "pre-sorted chains of %s\n",
1926 get_tree_code_name (code
));
1927 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
1929 for (unsigned opnum
= 0; opnum
< chain_len
; ++opnum
)
1930 dump_printf (MSG_NOTE
, "%s %T ",
1931 get_tree_code_name (chains
[lane
][opnum
].code
),
1932 chains
[lane
][opnum
].op
);
1933 dump_printf (MSG_NOTE
, "\n");
	  /* 2. try to build children nodes, associating as necessary.  */
	  for (unsigned n = 0; n < chain_len; ++n)
	    {
	      vect_def_type dt = chains[0][n].dt;
	      unsigned lane;
	      for (lane = 0; lane < group_size; ++lane)
		if (chains[lane][n].dt != dt)
		  {
		    if (dt == vect_constant_def
			&& chains[lane][n].dt == vect_external_def)
		      dt = vect_external_def;
		    else if (dt == vect_external_def
			     && chains[lane][n].dt == vect_constant_def)
		      ;
		    else
		      break;
		  }
	      if (lane != group_size)
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "giving up on chain due to mismatched "
				     "def types\n");
		  matches[lane] = false;
		  if (lane != group_size - 1)
		    matches[0] = false;
		  goto out;
		}
	      if (dt == vect_constant_def
		  || dt == vect_external_def)
		{
		  /* We can always build those.  Might want to sort last
		     or defer building.  */
		  vec<tree> ops;
		  ops.create (group_size);
		  for (lane = 0; lane < group_size; ++lane)
		    ops.quick_push (chains[lane][n].op);
		  slp_tree child = vect_create_new_slp_node (ops);
		  SLP_TREE_DEF_TYPE (child) = dt;
		  children.safe_push (child);
		}
	      else if (dt != vect_internal_def)
		{
		  /* Not sure, we might need sth special.
		     gcc.dg/vect/pr96854.c,
		     gfortran.dg/vect/fast-math-pr37021.f90
		     and gfortran.dg/vect/pr61171.f trigger.  */
		  /* Soft-fail for now.  */
		  hard_fail = false;
		  goto out;
		}
	      else
		{
		  vec<stmt_vec_info> op_stmts;
		  op_stmts.create (group_size);
		  slp_tree child = NULL;
		  /* Brute-force our way.  We have to consider a lane
		     failing after fixing an earlier fail up in the
		     SLP discovery recursion.  So track the current
		     permute per lane.  */
		  unsigned *perms = XALLOCAVEC (unsigned, group_size);
		  memset (perms, 0, sizeof (unsigned) * group_size);
		  do
		    {
		      op_stmts.truncate (0);
		      for (lane = 0; lane < group_size; ++lane)
			op_stmts.quick_push
			  (vinfo->lookup_def (chains[lane][n].op));
		      child = vect_build_slp_tree (vinfo, op_stmts,
						   group_size, &this_max_nunits,
						   matches, limit,
						   &this_tree_size, bst_map);
		      /* ??? We're likely getting too many fatal mismatches
			 here so maybe we want to ignore them (but then we
			 have no idea which lanes fatally mismatched).  */
		      if (child || !matches[0])
			break;
		      /* Swap another lane we have not yet matched up into
			 lanes that did not match.  If we run out of
			 permute possibilities for a lane terminate the
			 search.  */
		      bool term = false;
		      for (lane = 1; lane < group_size; ++lane)
			if (!matches[lane])
			  {
			    if (n + perms[lane] + 1 == chain_len)
			      {
				term = true;
				break;
			      }
			    std::swap (chains[lane][n],
				       chains[lane][n + perms[lane] + 1]);
			    perms[lane]++;
			  }
		      if (term)
			break;
		    }
		  while (1);
		  if (!child)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_NOTE, vect_location,
					 "failed to match up op %d\n", n);
		      op_stmts.release ();
		      if (lane != group_size - 1)
			matches[0] = false;
		      else
			matches[lane] = false;
		      goto out;
		    }
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_NOTE, vect_location,
				       "matched up op %d to\n", n);
		      vect_print_slp_tree (MSG_NOTE, vect_location, child);
		    }
		  children.safe_push (child);
		}
	    }
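	  /* Illustrative note on the brute-force search above: with
	     chain_len == 3, a lane failing to match at op N first swaps in
	     its op N + 1, then its op N + 2; once N + perms[lane] + 1
	     reaches chain_len the possibilities for that lane are
	     exhausted and the search terminates.  */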
	  /* 3. build SLP nodes to combine the chain.  */
	  for (unsigned lane = 0; lane < group_size; ++lane)
	    if (chains[lane][0].code != code)
	      {
		/* See if there's any alternate all-PLUS entry.  */
		unsigned n;
		for (n = 1; n < chain_len; ++n)
		  {
		    for (lane = 0; lane < group_size; ++lane)
		      if (chains[lane][n].code != code)
			break;
		    if (lane == group_size)
		      break;
		  }
		if (n != chain_len)
		  {
		    /* Swap that in at first position.  */
		    std::swap (children[0], children[n]);
		    for (lane = 0; lane < group_size; ++lane)
		      std::swap (chains[lane][0], chains[lane][n]);
		  }
		else
		  {
		    /* ??? When this triggers and we end up with two
		       vect_constant/external_def up-front things break (ICE)
		       spectacularly finding an insertion place for the
		       all-constant op.  We should have a fully
		       vect_internal_def operand though(?) so we can swap
		       that into first place and then prepend the all-zero
		       constant.  */
		    if (dump_enabled_p ())
		      dump_printf_loc (MSG_NOTE, vect_location,
				       "inserting constant zero to compensate "
				       "for (partially) negated first "
				       "operand\n");
		    chain_len++;
		    for (lane = 0; lane < group_size; ++lane)
		      chains[lane].safe_insert
			(0, chain_op_t (code, vect_constant_def, NULL_TREE));
		    vec<tree> zero_ops;
		    zero_ops.create (group_size);
		    zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
		    for (lane = 1; lane < group_size; ++lane)
		      zero_ops.quick_push (zero_ops[0]);
		    slp_tree zero = vect_create_new_slp_node (zero_ops);
		    SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
		    children.safe_insert (0, zero);
		  }
		break;
	      }
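	  /* Illustrative note: for chains like { -a + b, -c + d } there is
	     no all-PLUS entry to move to the front, so a constant zero is
	     prepended above and the pairwise combination below computes
	     (0 - a) + b respectively (0 - c) + d.  */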
	  for (unsigned i = 1; i < children.length (); ++i)
	    {
	      slp_tree op0 = children[i - 1];
	      slp_tree op1 = children[i];
	      bool this_two_op = false;
	      for (unsigned lane = 0; lane < group_size; ++lane)
		if (chains[lane][i].code != chains[0][i].code)
		  {
		    this_two_op = true;
		    break;
		  }
	      slp_tree child;
	      if (i == children.length () - 1)
		child = vect_create_new_slp_node (node, stmts, 2);
	      else
		child = vect_create_new_slp_node (2, ERROR_MARK);
	      if (this_two_op)
		{
		  vec<std::pair<unsigned, unsigned> > lperm;
		  lperm.create (group_size);
		  for (unsigned lane = 0; lane < group_size; ++lane)
		    lperm.quick_push (std::make_pair
		      (chains[lane][i].code != chains[0][i].code, lane));
		  vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
						     (chains[0][i].code == code
						      ? op_stmt_info
						      : other_op_stmt_info),
						     (chains[0][i].code == code
						      ? other_op_stmt_info
						      : op_stmt_info),
						     lperm);
		}
	      else
		{
		  SLP_TREE_DEF_TYPE (child) = vect_internal_def;
		  SLP_TREE_VECTYPE (child) = vectype;
		  SLP_TREE_LANES (child) = group_size;
		  SLP_TREE_CHILDREN (child).quick_push (op0);
		  SLP_TREE_CHILDREN (child).quick_push (op1);
		  SLP_TREE_REPRESENTATIVE (child)
		    = (chains[0][i].code == code
		       ? op_stmt_info : other_op_stmt_info);
		}
	      children[i] = child;
	    }
	  *tree_size += this_tree_size + 1;
	  *max_nunits = this_max_nunits;
	  while (!chains.is_empty ())
	    chains.pop ().release ();
	  return node;
	}
out:
      while (!children.is_empty ())
	vect_free_slp_tree (children.pop ());
      while (!chains.is_empty ())
	chains.pop ().release ();
      /* Hard-fail, otherwise we might run into quadratic processing of the
	 chains starting one stmt into the chain again.  */
      if (hard_fail)
	return NULL;
      /* Fall thru to normal processing.  */
    }
  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
					     stmts, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }
  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned int j;

      /* We're skipping certain operands from processing, for example
	 outer loop reduction initial defs.  */
      if (skip_args[i])
	{
	  children.safe_push (NULL);
	  continue;
	}

      if (oprnd_info->first_dt == vect_uninitialized_def)
	{
	  /* COND_EXPR have one too many eventually if the condition
	     is constant.  */
	  gcc_assert (i == 3 && nops == 4);
	  continue;
	}

      if (is_a <bb_vec_info> (vinfo)
	  && oprnd_info->first_dt == vect_internal_def
	  && !oprnd_info->any_pattern)
	{
	  /* For BB vectorization, if all defs are the same do not
	     bother to continue the build along the single-lane
	     graph but use a splat of the scalar value.  */
	  stmt_vec_info first_def = oprnd_info->def_stmts[0];
	  for (j = 1; j < group_size; ++j)
	    if (oprnd_info->def_stmts[j] != first_def)
	      break;
	  if (j == group_size
	      /* But avoid doing this for loads where we may be
		 able to CSE things, unless the stmt is not
		 vectorizable.  */
	      && (!STMT_VINFO_VECTORIZABLE (first_def)
		  || !gimple_vuse (first_def->stmt)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Using a splat of the uniform operand\n");
	      oprnd_info->first_dt = vect_external_def;
	    }
	}

      if (oprnd_info->first_dt == vect_external_def
	  || oprnd_info->first_dt == vect_constant_def)
	{
	  slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
	  SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
	  oprnd_info->ops = vNULL;
	  children.safe_push (invnode);
	  continue;
	}

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					matches, limit,
					&this_tree_size, bst_map)) != NULL)
	{
	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}
      /* If the SLP build for operand zero failed and operand zero
	 and one can be commutated try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt_info->stmt)
	  /* Swapping operands for reductions breaks assumptions later on.  */
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
	{
	  /* See whether we can swap the matching or the non-matching
	     stmt operands.  */
	  bool swap_not_matching = true;
	  do
	    {
	      for (j = 0; j < group_size; ++j)
		{
		  if (matches[j] != !swap_not_matching)
		    continue;
		  stmt_vec_info stmt_info = stmts[j];
		  /* Verify if we can swap operands of this stmt.  */
		  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
		  if (!stmt
		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
		    {
		      if (!swap_not_matching)
			goto fail;
		      swap_not_matching = false;
		      break;
		    }
		}
	    }
	  while (j != group_size);

	  /* Swap mismatched definition stmts.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (matches[j] == !swap_not_matching)
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		std::swap (oprnds_info[0]->ops[j],
			   oprnds_info[1]->ops[j]);
		if (dump_enabled_p ())
		  dump_printf (MSG_NOTE, "%d ", j);
	      }
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "\n");
	  /* And try again with scratch 'matches' ...  */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    tem, limit,
					    &this_tree_size, bst_map)) != NULL)
	    {
	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }
	}
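      /* Illustrative note: for a group { x0 = a0 + b0, x1 = b1 + a1 }
	 where discovery of operand zero as { a0, b1 } failed, the retry
	 above swaps the operands of the mismatched lane and tries
	 { a0, a1 } and { b0, b1 } instead.  */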
fail:

      /* If the SLP build failed and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (stmt_info)
	  && !oprnd_info->any_pattern)
	{
	  /* But if there's a leading vector sized set of matching stmts
	     fail here so we can split the group.  This matches the condition
	     vect_analyze_slp_instance uses.  */
	  /* ??? We might want to split here and combine the results to support
	     multiple vector sizes better.  */
	  for (j = 0; j < group_size; ++j)
	    if (!matches[j])
	      break;
	  if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Building vector operands from scalars\n");
	      this_tree_size++;
	      child = vect_create_new_slp_node (oprnd_info->ops);
	      children.safe_push (child);
	      oprnd_info->ops = vNULL;
	      continue;
	    }
	}

      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	if (child)
	  vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);
  /* If all children of this node are built up from uniform scalars, or
     if it does more than one possibly expensive vector construction,
     then just throw that away, causing it to be built up from scalars.
     The exception is the SLP node for the vector store.  */
  if (is_a <bb_vec_info> (vinfo)
      && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      /* ??? Rejecting patterns this way doesn't work.  We'd have to
	 do extra work to cancel the pattern so the uses see the
	 scalar version.  */
      && !is_pattern_stmt_p (stmt_info))
    {
      slp_tree child;
      unsigned j;
      bool all_uniform_p = true;
      unsigned n_vector_builds = 0;
      FOR_EACH_VEC_ELT (children, j, child)
	{
	  if (!child)
	    ;
	  else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	    all_uniform_p = false;
	  else if (!vect_slp_tree_uniform_p (child))
	    {
	      all_uniform_p = false;
	      if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
		n_vector_builds++;
	    }
	}
      if (all_uniform_p
	  || n_vector_builds > 1
	  || (n_vector_builds == children.length ()
	      && is_a <gphi *> (stmt_info->stmt)))
	{
	  /* Roll back.  */
	  matches[0] = false;
	  FOR_EACH_VEC_ELT (children, j, child)
	    if (child)
	      vect_free_slp_tree (child);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building parent vector operands from "
			     "scalars instead\n");
	  return NULL;
	}
    }

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;
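  /* Illustrative note: a group like { a + b, a - b, c + d, c - d } takes
     the two_operators path below; it becomes one all-PLUS node and one
     all-MINUS node over the same children, blended by a VEC_PERM_EXPR
     with lane permutation { 0[0] 1[1] 0[2] 1[3] }.  */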
  if (two_operators)
    {
      /* ??? We'd likely want to either cache in bst_map sth like
	 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
	 the true { a+b, a+b, a+b, a+b } ... but there we don't have
	 explicit stmts to put in so the keying on 'stmts' doesn't
	 work (but we have the same issue with nodes that use 'ops').  */
      slp_tree one = new _slp_tree;
      slp_tree two = new _slp_tree;
      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
      SLP_TREE_VECTYPE (one) = vectype;
      SLP_TREE_VECTYPE (two) = vectype;
      SLP_TREE_CHILDREN (one).safe_splice (children);
      SLP_TREE_CHILDREN (two).safe_splice (children);
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
	SLP_TREE_REF_COUNT (child)++;

      /* Here we record the original defs since this
	 node represents the final lane configuration.  */
      node = vect_create_new_slp_node (node, stmts, 2);
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_CHILDREN (node).quick_push (one);
      SLP_TREE_CHILDREN (node).quick_push (two);
      gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      stmt_vec_info ostmt_info;
      unsigned j = 0;
      FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
	{
	  gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
	  if (gimple_assign_rhs_code (ostmt) != code0)
	    {
	      SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
	      ocode = gimple_assign_rhs_code (ostmt);
	      j = i;
	    }
	  else
	    SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
	}
      SLP_TREE_CODE (one) = code0;
      SLP_TREE_CODE (two) = ocode;
      SLP_TREE_LANES (one) = stmts.length ();
      SLP_TREE_LANES (two) = stmts.length ();
      SLP_TREE_REPRESENTATIVE (one) = stmts[0];
      SLP_TREE_REPRESENTATIVE (two) = stmts[j];
      return node;
    }

  node = vect_create_new_slp_node (node, stmts, nops);
  SLP_TREE_VECTYPE (node) = vectype;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}
/* Dump a single SLP tree NODE.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node)
{
  unsigned i, j;
  slp_tree child;
  stmt_vec_info stmt_info;
  tree op;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc,
		   "node%s %p (max_nunits=%u, refcnt=%u)\n",
		   SLP_TREE_DEF_TYPE (node) == vect_external_def
		   ? " (external)"
		   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
		      ? " (constant)"
		      : ""), node,
		   estimated_poly_value (node->max_nunits),
		   SLP_TREE_REF_COUNT (node));
  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    {
      if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
	dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
      else
	dump_printf_loc (metadata, user_loc, "op template: %G",
			 SLP_TREE_REPRESENTATIVE (node)->stmt);
    }
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	dump_printf (metadata, "%T%s ", op,
		     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tload permutation {");
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
	dump_printf (dump_kind, " %u", j);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tlane permutation {");
      for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
	dump_printf (dump_kind, " %u[%u]",
		     SLP_TREE_LANE_PERMUTATION (node)[i].first,
		     SLP_TREE_LANE_PERMUTATION (node)[i].second);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
}

DEBUG_FUNCTION void
debug (slp_tree node)
{
  debug_dump_context ctx;
  vect_print_slp_tree (MSG_NOTE,
		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
		       node);
}
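/* For reference, the dump produced above looks roughly like this
   (an illustrative example, composed from the format strings above):

     node 0x12345678 (max_nunits=4, refcnt=1)
     op template: _5 = _3 + _4;
	stmt 0 _5 = _3 + _4;
	stmt 1 _8 = _6 + _7;
	children 0x12345679 0x1234567a  */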
/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree node, hash_set<slp_tree> &visited)
{
  unsigned i;
  slp_tree child;

  if (visited.add (node))
    return;

  vect_print_slp_tree (dump_kind, loc, node);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_print_slp_graph (dump_kind, loc, child, visited);
}

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree entry)
{
  hash_set<slp_tree> visited;
  vect_print_slp_graph (dump_kind, loc, entry, visited);
}
/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_mark_slp_stmts (child, visited);
}

void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}
/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
		  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_mark_slp_stmts_relevant (child, visited);
}

void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}
/* Gather loads in the SLP graph NODE and populate the INST loads array.  */

static void
vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
		       hash_set<slp_tree> &visited)
{
  if (!node || visited.add (node))
    return;

  if (SLP_TREE_CHILDREN (node).length () == 0)
    {
      if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
	return;
      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	loads.safe_push (node);
    }
  else
    {
      unsigned i;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	vect_gather_slp_loads (loads, child, visited);
    }
}
/* Find the last store in SLP INSTANCE.  */

stmt_vec_info
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  stmt_vec_info last = NULL;
  stmt_vec_info stmt_vinfo;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    {
      stmt_vinfo = vect_orig_stmt (stmt_vinfo);
      last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
    }

  return last;
}

/* Find the first stmt in NODE.  */

stmt_vec_info
vect_find_first_scalar_stmt_in_slp (slp_tree node)
{
  stmt_vec_info first = NULL;
  stmt_vec_info stmt_vinfo;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    {
      stmt_vinfo = vect_orig_stmt (stmt_vinfo);
      if (!first
	  || get_later_stmt (stmt_vinfo, first) == first)
	first = stmt_vinfo;
    }

  return first;
}
/* Splits a group of stores, currently beginning at FIRST_VINFO, into
   two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
   (also containing the first GROUP1_SIZE stmts, since stores are
   consecutive), the second containing the remainder.
   Return the first stmt in the second group.  */

static stmt_vec_info
vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
{
  gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
  gcc_assert (group1_size > 0);
  int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
  gcc_assert (group2_size > 0);
  DR_GROUP_SIZE (first_vinfo) = group1_size;

  stmt_vec_info stmt_info = first_vinfo;
  for (unsigned i = group1_size; i > 1; i--)
    {
      stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }
  /* STMT is now the last element of the first group.  */
  stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
  DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;

  DR_GROUP_SIZE (group2) = group2_size;
  for (stmt_info = group2; stmt_info;
       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
    {
      DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }

  /* For the second group, the DR_GROUP_GAP is that before the original group,
     plus skipping over the first vector.  */
  DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;

  /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
  DR_GROUP_GAP (first_vinfo) += group2_size;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
		     group1_size, group2_size);

  return group2;
}
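/* Illustrative example: splitting a store group of size 8 with
   DR_GROUP_GAP 0 at GROUP1_SIZE 6 gives a first group of size 6 whose
   gap now skips the two stmts of the second group (DR_GROUP_GAP 2) and
   a second group of size 2 with DR_GROUP_GAP 6 skipping over the first
   group.  */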
/* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
   statements and a vector of NUNITS elements.  */

static poly_uint64
calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
{
  return exact_div (common_multiple (nunits, group_size), group_size);
}
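/* For example, with two-element vectors (NUNITS == 2) and GROUP_SIZE 3
   the least common multiple is 6, so the unrolling factor is 6 / 3 == 2,
   while NUNITS == 4 and GROUP_SIZE 4 yield a factor of 1.  */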
/* Helper that checks to see if a node is a load node.  */

static bool
vect_is_slp_load_node (slp_tree root)
{
  return SLP_TREE_DEF_TYPE (root) == vect_internal_def
	 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
	 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
}
/* Helper function of optimize_load_redistribution that performs the operation
   recursively.  */

static slp_tree
optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
				vec_info *vinfo, unsigned int group_size,
				hash_map<slp_tree, slp_tree> *load_map,
				slp_tree root)
{
  if (slp_tree *leader = load_map->get (root))
    return *leader;

  slp_tree node;
  unsigned i;

  /* For now, we don't know anything about externals so do not do anything.  */
  if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
    return NULL;
  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
    {
      /* First convert this node into a load node and add it to the leaves
	 list and flatten the permute from a lane to a load one.  If it's
	 unneeded it will be elided later.  */
      vec<stmt_vec_info> stmts;
      stmts.create (SLP_TREE_LANES (root));
      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
      for (unsigned j = 0; j < lane_perm.length (); j++)
	{
	  std::pair<unsigned, unsigned> perm = lane_perm[j];
	  node = SLP_TREE_CHILDREN (root)[perm.first];

	  if (!vect_is_slp_load_node (node)
	      || SLP_TREE_CHILDREN (node).exists ())
	    {
	      stmts.release ();
	      goto next;
	    }

	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "converting stmts on permute node %p\n", root);

      bool *matches = XALLOCAVEC (bool, group_size);
      poly_uint64 max_nunits = 1;
      unsigned tree_size = 0, limit = 1;
      node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
				  matches, &limit, &tree_size, bst_map);
      if (!node)
	stmts.release ();

      load_map->put (root, node);
      return node;
    }

next:
  load_map->put (root, NULL);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
    {
      slp_tree value
	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
					  node);
      if (value)
	{
	  SLP_TREE_REF_COUNT (value)++;
	  SLP_TREE_CHILDREN (root)[i] = value;
	  /* ??? We know the original leafs of the replaced nodes will
	     be referenced by bst_map, only the permutes created by
	     pattern matching are not.  */
	  if (SLP_TREE_REF_COUNT (node) == 1)
	    load_map->remove (node);
	  vect_free_slp_tree (node);
	}
    }

  return NULL;
}
/* Temporary workaround for loads not being CSEd during SLP build.  This
   function will traverse the SLP tree rooted in ROOT for INSTANCE and find
   VEC_PERM nodes that blend vectors from multiple nodes that all read from the
   same DR such that the final operation is equal to a permuted load.  Such
   NODES are then directly converted into LOADS themselves.  The nodes are
   CSEd using BST_MAP.  */

static void
optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
			      vec_info *vinfo, unsigned int group_size,
			      hash_map<slp_tree, slp_tree> *load_map,
			      slp_tree root)
{
  slp_tree node;
  unsigned i;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
    {
      slp_tree value
	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
					  node);
      if (value)
	{
	  SLP_TREE_REF_COUNT (value)++;
	  SLP_TREE_CHILDREN (root)[i] = value;
	  /* ??? We know the original leafs of the replaced nodes will
	     be referenced by bst_map, only the permutes created by
	     pattern matching are not.  */
	  if (SLP_TREE_REF_COUNT (node) == 1)
	    load_map->remove (node);
	  vect_free_slp_tree (node);
	}
    }
}
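/* Illustrative note: a VEC_PERM node blending lanes of two load nodes
   that read from the same data reference is re-discovered above as a
   single load node whose load permutation performs the blend, and that
   node is CSEd via BST_MAP.  */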
/* Helper function of vect_match_slp_patterns.

   Attempts to match patterns against the slp tree rooted in REF_NODE using
   VINFO.  Patterns are matched in post-order traversal.

   If matching is successful the value in REF_NODE is updated and returned, if
   not then it is returned unchanged.  */

static bool
vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
			   slp_tree_to_load_perm_map_t *perm_cache,
			   hash_set<slp_tree> *visited)
{
  unsigned i;
  slp_tree node = *ref_node;
  bool found_p = false;
  if (!node || visited->add (node))
    return false;

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
					  vinfo, perm_cache, visited);

  for (unsigned x = 0; x < num__slp_patterns; x++)
    {
      vect_pattern *pattern = slp_patterns[x] (perm_cache, ref_node);
      if (pattern)
	{
	  pattern->build (vinfo);
	  delete pattern;
	  found_p = true;
	}
    }

  return found_p;
}
/* Applies pattern matching to the given SLP tree rooted in REF_NODE using
   vec_info VINFO.

   The modified tree is returned.  Patterns are tried in order and multiple
   patterns may match.  */

static bool
vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
			 hash_set<slp_tree> *visited,
			 slp_tree_to_load_perm_map_t *perm_cache)
{
  DUMP_VECT_SCOPE ("vect_match_slp_patterns");
  slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Analyzing SLP tree %p for patterns\n",
		     SLP_INSTANCE_TREE (instance));

  return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
}
/* STMT_INFO is a store group of size GROUP_SIZE that we are considering
   splitting into two, with the first split group having size NEW_GROUP_SIZE.
   Return true if we could use IFN_STORE_LANES instead and if that appears
   to be the better approach.  */

static bool
vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
			       unsigned int group_size,
			       unsigned int new_group_size)
{
  tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
  if (!vectype)
    return false;
  /* Allow the split if one of the two new groups would operate on full
     vectors *within* rather than across one scalar loop iteration.
     This is purely a heuristic, but it should work well for group
     sizes of 3 and 4, where the possible splits are:

       3->2+1:  OK if the vector has exactly two elements
       4->2+2:  Likewise
       4->3+1:  Less clear-cut.  */
  if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
      || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    return false;
  return vect_store_lanes_supported (vectype, group_size, false);
}
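/* For example, with two-element vectors a 3->2+1 split leaves the first
   part operating on one full vector per scalar iteration, so the split
   is allowed (false is returned); for 4->3+1 neither part is a multiple
   of the vector size and store-lanes is preferred when the target
   supports it.  */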
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (vec_info *vinfo,
			   scalar_stmts_to_slp_tree_map_t *bst_map,
			   stmt_vec_info stmt_info, slp_instance_kind kind,
			   unsigned max_tree_size, unsigned *limit);
/* Analyze an SLP instance starting from SCALAR_STMTS which are a group
   of KIND.  Return true if successful.  */

static bool
vect_build_slp_instance (vec_info *vinfo,
			 slp_instance_kind kind,
			 vec<stmt_vec_info> &scalar_stmts,
			 vec<stmt_vec_info> &root_stmt_infos,
			 unsigned max_tree_size, unsigned *limit,
			 scalar_stmts_to_slp_tree_map_t *bst_map,
			 /* ??? We need stmt_info for group splitting.  */
			 stmt_vec_info stmt_info_)
{
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Starting SLP discovery for\n");
      for (unsigned i = 0; i < scalar_stmts.length (); ++i)
	dump_printf_loc (MSG_NOTE, vect_location,
			 "  %G", scalar_stmts[i]->stmt);
    }

  /* Build the tree for the SLP instance.  */
  unsigned int group_size = scalar_stmts.length ();
  bool *matches = XALLOCAVEC (bool, group_size);
  poly_uint64 max_nunits = 1;
  unsigned tree_size = 0;
  unsigned i;

  slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
				       &max_nunits, matches, limit,
				       &tree_size, bst_map);
  if (node != NULL)
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      poly_uint64 unrolling_factor
	= calculate_unrolling_factor (max_nunits, group_size);

      if (maybe_ne (unrolling_factor, 1U)
	  && is_a <bb_vec_info> (vinfo))
	{
	  unsigned HOST_WIDE_INT const_max_nunits;
	  if (!max_nunits.is_constant (&const_max_nunits)
	      || const_max_nunits > group_size)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: store group "
				 "size not a multiple of the vector size "
				 "in basic block SLP\n");
	      vect_free_slp_tree (node);
	      return false;
	    }
	  /* Fatal mismatch.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP discovery succeeded but node needs "
			     "splitting\n");
	  memset (matches, true, group_size);
	  matches[group_size / const_max_nunits * const_max_nunits] = false;
	  vect_free_slp_tree (node);
	}
      else
	{
	  /* Create a new SLP instance.  */
	  slp_instance new_instance = XNEW (class _slp_instance);
	  SLP_INSTANCE_TREE (new_instance) = node;
	  SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
	  SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
	  SLP_INSTANCE_KIND (new_instance) = kind;
	  new_instance->reduc_phis = NULL;
	  new_instance->cost_vec = vNULL;
	  new_instance->subgraph_entries = vNULL;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP size %u vs. limit %u.\n",
			     tree_size, max_tree_size);

	  /* Fixup SLP reduction chains.  */
	  if (kind == slp_inst_kind_reduc_chain)
	    {
	      /* If this is a reduction chain with a conversion in front
		 amend the SLP tree with a node for that.  */
	      gimple *scalar_def
		= vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
	      if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
		{
		  /* Get at the conversion stmt - we know it's the single use
		     of the last stmt of the reduction chain.  */
		  use_operand_p use_p;
		  bool r = single_imm_use (gimple_assign_lhs (scalar_def),
					   &use_p, &scalar_def);
		  gcc_assert (r);
		  stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
		  next_info = vect_stmt_to_vectorize (next_info);
		  scalar_stmts = vNULL;
		  scalar_stmts.create (group_size);
		  for (unsigned i = 0; i < group_size; ++i)
		    scalar_stmts.quick_push (next_info);
		  slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
		  SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
		  SLP_TREE_CHILDREN (conv).quick_push (node);
		  SLP_INSTANCE_TREE (new_instance) = conv;
		  /* We also have to fake this conversion stmt as SLP reduction
		     group so we don't have to mess with too much code
		     elsewhere.  */
		  REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
		  REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
		}
	      /* Fill the backedge child of the PHI SLP node.  The
		 general matching code cannot find it because the
		 scalar code does not reflect how we vectorize the
		 reduction.  */
	      use_operand_p use_p;
	      imm_use_iterator imm_iter;
	      class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
				     gimple_get_lhs (scalar_def))
		/* There are exactly two non-debug uses, the reduction
		   PHI and the loop-closed PHI node.  */
		if (!is_gimple_debug (USE_STMT (use_p))
		    && gimple_bb (USE_STMT (use_p)) == loop->header)
		  {
		    auto_vec<stmt_vec_info, 64> phis (group_size);
		    stmt_vec_info phi_info
		      = vinfo->lookup_stmt (USE_STMT (use_p));
		    for (unsigned i = 0; i < group_size; ++i)
		      phis.quick_push (phi_info);
		    slp_tree *phi_node = bst_map->get (phis);
		    unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
		    SLP_TREE_CHILDREN (*phi_node)[dest_idx]
		      = SLP_INSTANCE_TREE (new_instance);
		    SLP_INSTANCE_TREE (new_instance)->refcnt++;
		  }
	    }

	  vinfo->slp_instances.safe_push (new_instance);

	  /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
	     the number of scalar stmts in the root in a few places.
	     Verify that assumption holds.  */
	  gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
		      .length () == group_size);

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Final SLP tree for instance %p:\n",
			       new_instance);
	      vect_print_slp_graph (MSG_NOTE, vect_location,
				    SLP_INSTANCE_TREE (new_instance));
	    }

	  return true;
	}
    }
  else
    {
      /* Failed to SLP.  */
      /* Free the allocated memory.  */
      scalar_stmts.release ();
    }

  stmt_vec_info stmt_info = stmt_info_;
  /* Try to break the group up into pieces.  */
  if (kind == slp_inst_kind_store)
    {
      /* ??? We could delay all the actual splitting of store-groups
	 until after SLP discovery of the original group completed.
	 Then we can recurse to vect_build_slp_instance directly.  */
      for (i = 0; i < group_size; i++)
	if (!matches[i])
	  break;

      /* For basic block SLP, try to break the group up into multiples of
	 a vector size.  */
      if (is_a <bb_vec_info> (vinfo)
	  && (i > 1 && i < group_size))
	{
	  tree scalar_type
	    = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
	  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
						      1 << floor_log2 (i));
	  unsigned HOST_WIDE_INT const_nunits;
	  if (vectype
	      && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
	    {
	      /* Split into two groups at the first vector boundary.  */
	      gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
	      unsigned group1_size = i & ~(const_nunits - 1);

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Splitting SLP group at stmt %u\n", i);
	      stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
							       group1_size);
	      bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
						    kind, max_tree_size,
						    limit);
	      /* Split the rest at the failure point and possibly
		 re-analyze the remaining matching part if it has
		 at least two lanes.  */
	      if (group1_size < i
		  && (i + 1 < group_size
		      || i - group1_size > 1))
		{
		  stmt_vec_info rest2 = rest;
		  rest = vect_split_slp_store_group (rest, i - group1_size);
		  if (i - group1_size > 1)
		    res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
						      kind, max_tree_size,
						      limit);
		}
	      /* Re-analyze the non-matching tail if it has at least
		 two lanes.  */
	      if (i + 1 < group_size)
		res |= vect_analyze_slp_instance (vinfo, bst_map,
						  rest, kind, max_tree_size,
						  limit);
	      return res;
	    }
	}

      /* For loop vectorization split into arbitrary pieces of size > 1.  */
      if (is_a <loop_vec_info> (vinfo)
	  && (i > 1 && i < group_size)
	  && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
	{
	  unsigned group1_size = i;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Splitting SLP group at stmt %u\n", i);

	  stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
							   group1_size);
	  /* Loop vectorization cannot handle gaps in stores, make sure
	     the split group appears as strided.  */
	  STMT_VINFO_STRIDED_P (rest) = 1;
	  DR_GROUP_GAP (rest) = 0;
	  STMT_VINFO_STRIDED_P (stmt_info) = 1;
	  DR_GROUP_GAP (stmt_info) = 0;

	  bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
						kind, max_tree_size, limit);
	  if (i + 1 < group_size)
	    res |= vect_analyze_slp_instance (vinfo, bst_map,
					      rest, kind, max_tree_size, limit);

	  return res;
	}

      /* Even though the first vector did not all match, we might be able to SLP
	 (some) of the remainder.  FORNOW ignore this possibility.  */
    }

  /* Failed to SLP.  */
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
  return false;
}
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (vec_info *vinfo,
			   scalar_stmts_to_slp_tree_map_t *bst_map,
			   stmt_vec_info stmt_info,
			   slp_instance_kind kind,
			   unsigned max_tree_size, unsigned *limit)
{
  unsigned int i;
  vec<stmt_vec_info> scalar_stmts;

  if (is_a <bb_vec_info> (vinfo))
    vect_location = stmt_info->stmt;

  stmt_vec_info next_info = stmt_info;
  if (kind == slp_inst_kind_store)
    {
      /* Collect the stores and store them in scalar_stmts.  */
      scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
      while (next_info)
	{
	  scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
	  next_info = DR_GROUP_NEXT_ELEMENT (next_info);
	}
    }
  else if (kind == slp_inst_kind_reduc_chain)
    {
      /* Collect the reduction stmts and store them in scalar_stmts.  */
      scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
      while (next_info)
	{
	  scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
	  next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
	}
      /* Mark the first element of the reduction chain as reduction to properly
	 transform the node.  In the reduction analysis phase only the last
	 element of the chain is marked as reduction.  */
      STMT_VINFO_DEF_TYPE (stmt_info)
	= STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
      STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
	= STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
    }
  else if (kind == slp_inst_kind_ctor)
    {
      tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
      tree val;
      scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
      FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
	{
	  stmt_vec_info def_info = vinfo->lookup_def (val);
	  def_info = vect_stmt_to_vectorize (def_info);
	  scalar_stmts.quick_push (def_info);
	}
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Analyzing vectorizable constructor: %G\n",
			 stmt_info->stmt);
    }
  else if (kind == slp_inst_kind_reduc_group)
    {
      /* Collect reduction statements.  */
      vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
      scalar_stmts.create (reductions.length ());
      for (i = 0; reductions.iterate (i, &next_info); i++)
	if (STMT_VINFO_RELEVANT_P (next_info)
	    || STMT_VINFO_LIVE_P (next_info))
	  scalar_stmts.quick_push (next_info);
      /* If less than two were relevant/live there's nothing to SLP.  */
      if (scalar_stmts.length () < 2)
	return false;
    }
  else
    gcc_unreachable ();

  vec<stmt_vec_info> roots = vNULL;
  if (kind == slp_inst_kind_ctor)
    {
      roots.create (1);
      roots.quick_push (stmt_info);
    }
  /* Build the tree for the SLP instance.  */
  bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
				      roots,
				      max_tree_size, limit, bst_map,
				      kind == slp_inst_kind_store
				      ? stmt_info : NULL);
  if (!res)
    roots.release ();

  /* ??? If this is slp_inst_kind_store and the above succeeded here's
     where we should do store group splitting.  */

  return res;
}
/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */

opt_result
vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
{
  unsigned int i;
  stmt_vec_info first_element;
  slp_instance instance;

  DUMP_VECT_SCOPE ("vect_analyze_slp");

  unsigned limit = max_tree_size;

  scalar_stmts_to_slp_tree_map_t *bst_map
    = new scalar_stmts_to_slp_tree_map_t ();

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    vect_analyze_slp_instance (vinfo, bst_map, first_element,
			       STMT_VINFO_GROUPED_ACCESS (first_element)
			       ? slp_inst_kind_store : slp_inst_kind_ctor,
			       max_tree_size, &limit);

  if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
    {
      for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
	{
	  vect_location = bb_vinfo->roots[i].roots[0]->stmt;
	  if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
				       bb_vinfo->roots[i].stmts,
				       bb_vinfo->roots[i].roots,
				       max_tree_size, &limit, bst_map, NULL))
	    {
	      bb_vinfo->roots[i].stmts = vNULL;
	      bb_vinfo->roots[i].roots = vNULL;
	    }
	}
    }

  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    {
      /* Find SLP sequences starting from reduction chains.  */
      FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
	if (! STMT_VINFO_RELEVANT_P (first_element)
	    && ! STMT_VINFO_LIVE_P (first_element))
	  ;
	else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
					      slp_inst_kind_reduc_chain,
					      max_tree_size, &limit))
	  {
	    /* Dissolve reduction chain group.  */
	    stmt_vec_info vinfo = first_element;
	    stmt_vec_info last = NULL;
	    while (vinfo)
	      {
		stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
		REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
		REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
		last = vinfo;
		vinfo = next;
	      }
	    STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
	    /* It can be still vectorized as part of an SLP reduction.  */
	    loop_vinfo->reductions.safe_push (last);
	  }

      /* Find SLP sequences starting from groups of reductions.  */
      if (loop_vinfo->reductions.length () > 1)
	vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
				   slp_inst_kind_reduc_group, max_tree_size,
				   &limit);
    }

  hash_set<slp_tree> visited_patterns;
  slp_tree_to_load_perm_map_t perm_cache;

  /* See if any patterns can be found in the SLP tree.  */
  bool pattern_found = false;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    pattern_found |= vect_match_slp_patterns (instance, vinfo,
					      &visited_patterns, &perm_cache);

  /* If any were found optimize permutations of loads.  */
  if (pattern_found)
    {
      hash_map<slp_tree, slp_tree> load_map;
      FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
	{
	  slp_tree root = SLP_INSTANCE_TREE (instance);
	  optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
					&load_map, root);
	}
    }

  /* The map keeps a reference on SLP nodes built, release that.  */
  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
       it != bst_map->end (); ++it)
    if ((*it).second)
      vect_free_slp_tree ((*it).second);
  delete bst_map;

  if (pattern_found && dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Pattern matched SLP tree\n");
      hash_set<slp_tree> visited;
      FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
	vect_print_slp_graph (MSG_NOTE, vect_location,
			      SLP_INSTANCE_TREE (instance), visited);
    }

  return opt_result::success ();
}
struct slpg_vertex
{
  slpg_vertex (slp_tree node_)
    : node (node_), perm_out (-1), materialize (0) {}

  int get_perm_in () const { return materialize ? materialize : perm_out; }

  slp_tree node;
  /* The permutation on the outgoing lanes (towards SLP parents).  */
  int perm_out;
  /* The permutation that is applied by this node.  perm_out is
     relative to this.  */
  int materialize;
};
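/* Permutes are tracked as indices into the PERMS vector built below:
   -1 denotes "any permutation" (unvisited or fully flexible), 0 denotes
   "no permute" and positive values name a recorded permutation.  */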
/* Fill the vertices and leafs vector with all nodes in the SLP graph.  */

static void
vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
			 vec<slpg_vertex> &vertices, vec<int> &leafs)
{
  unsigned i;
  slp_tree child;

  if (visited.add (node))
    return;

  node->vertex = vertices.length ();
  vertices.safe_push (slpg_vertex (node));

  bool leaf = true;
  bool force_leaf = false;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      {
	leaf = false;
	vect_slp_build_vertices (visited, child, vertices, leafs);
      }
    else
      force_leaf = true;
  /* Since SLP discovery works along use-def edges all cycles have an
     entry - but there's the exception of cycles where we do not handle
     the entry explicitly (but with a NULL SLP node), like some reductions
     and inductions.  Force those SLP PHIs to act as leafs to make them
     backwards reachable.  */
  if (leaf || force_leaf)
    leafs.safe_push (node->vertex);
}
/* Fill the vertices and leafs vector with all nodes in the SLP graph.  */

static void
vect_slp_build_vertices (vec_info *info, vec<slpg_vertex> &vertices,
			 vec<int> &leafs)
{
  hash_set<slp_tree> visited;
  unsigned i;
  slp_instance instance;
  FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
    vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
			     leafs);
}
/* Apply (reverse) bijective PERM to VEC.  */

template <class T>
static void
vect_slp_permute (vec<unsigned> perm,
		  vec<T> &vec, bool reverse)
{
  auto_vec<T, 64> saved;
  saved.create (vec.length ());
  for (unsigned i = 0; i < vec.length (); ++i)
    saved.quick_push (vec[i]);

  if (reverse)
    {
      for (unsigned i = 0; i < vec.length (); ++i)
	vec[perm[i]] = saved[i];
      for (unsigned i = 0; i < vec.length (); ++i)
	gcc_assert (vec[perm[i]] == saved[i]);
    }
  else
    {
      for (unsigned i = 0; i < vec.length (); ++i)
	vec[i] = saved[perm[i]];
      for (unsigned i = 0; i < vec.length (); ++i)
	gcc_assert (vec[i] == saved[perm[i]]);
    }
}
/* Return whether permutations PERM_A and PERM_B as recorded in the
   PERMS vector are equal.  */

static bool
vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
		   int perm_a, int perm_b)
{
  return (perm_a == perm_b
	  || (perm_a != -1 && perm_b != -1
	      && perms[perm_a].length () == perms[perm_b].length ()
	      && memcmp (&perms[perm_a][0], &perms[perm_b][0],
			 sizeof (unsigned) * perms[perm_a].length ()) == 0));
}
/* Optimize the SLP graph of VINFO.  */

void
vect_optimize_slp (vec_info *vinfo)
{
  if (vinfo->slp_instances.is_empty ())
    return;

  slp_tree node;
  unsigned i;
  auto_vec<slpg_vertex> vertices;
  auto_vec<int> leafs;
  vect_slp_build_vertices (vinfo, vertices, leafs);

  struct graph *slpg = new_graph (vertices.length ());
  for (slpg_vertex &v : vertices)
    for (slp_tree child : SLP_TREE_CHILDREN (v.node))
      if (child)
	add_edge (slpg, v.node->vertex, child->vertex);

  /* Compute (reverse) postorder on the inverted graph.  */
  auto_vec<int> ipo;
  graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);

  auto_vec<vec<unsigned> > perms;
  perms.safe_push (vNULL); /* zero is no permute */
  /* Produce initial permutations.  */
  for (i = 0; i < leafs.length (); ++i)
    {
      int idx = leafs[i];
      slp_tree node = vertices[idx].node;

      /* Handle externals and constants optimistically throughout the
	 iteration.  */
      if (SLP_TREE_DEF_TYPE (node) == vect_external_def
	  || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
	continue;

      /* Leafs do not change across iterations.  Note leafs also double
	 as entries to the reverse graph.  */
      if (!slpg->vertices[idx].succ)
	vertices[idx].perm_out = 0;
      /* Loads are the only thing generating permutes.  */
      if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
	continue;

      /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
	 node unpermuted, record this permute.  */
      stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
      if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
	continue;
      dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
      unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
      bool any_permute = false;
      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
	{
	  unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
	  imin = MIN (imin, idx);
	  imax = MAX (imax, idx);
	  if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
	    any_permute = true;
	}
      /* If there's no permute no need to split one out.  */
      if (!any_permute)
	continue;
      /* If the span doesn't match we'd disrupt VF computation, avoid
	 that for now.  */
      if (imax - imin + 1 != SLP_TREE_LANES (node))
	continue;

      /* For now only handle true permutes, like
	 vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
	 when permuting constants and invariants keeping the permute
	 bijective.  */
      auto_sbitmap load_index (SLP_TREE_LANES (node));
      bitmap_clear (load_index);
      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
	bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
      unsigned j;
      for (j = 0; j < SLP_TREE_LANES (node); ++j)
	if (!bitmap_bit_p (load_index, j))
	  break;
      if (j != SLP_TREE_LANES (node))
	continue;

      vec<unsigned> perm = vNULL;
      perm.safe_grow (SLP_TREE_LANES (node), true);
      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
	perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
      perms.safe_push (perm);
      vertices[idx].perm_out = perms.length () - 1;
    }
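  /* Illustrative note: a load node with load permutation { 3, 2 } out of
     a group of size four gets imin 2 and records the permutation
     { 1, 0 }; a node with load permutation { 2, 3 } is unpermuted after
     subtracting imin and records nothing.  */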
  /* Propagate permutes along the graph and compute materialization points.  */
  bool changed;
  bool do_materialization = false;
  unsigned iteration = 0;
  do
    {
      changed = false;
      ++iteration;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP optimize iteration %d\n", iteration);

      for (i = vertices.length (); i > 0 ; --i)
	{
	  int idx = ipo[i-1];
	  slp_tree node = vertices[idx].node;

	  /* Handle externals and constants optimistically throughout the
	     iteration.  */
	  if (SLP_TREE_DEF_TYPE (node) == vect_external_def
	      || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
	    continue;

	  /* We still eventually have failed backedge SLP nodes in the
	     graph, those are only cancelled when analyzing operations.
	     Simply treat them as transparent ops, propagating permutes
	     through them.  */
	  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
	    {
	      /* We do not handle stores with a permutation, so all
		 incoming permutes must have been materialized.  */
	      stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
	      if (STMT_VINFO_DATA_REF (rep)
		  && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
		{
		  vertices[idx].perm_out = 0;
		  continue;
		}
	      /* We cannot move a permute across an operation that is
		 not independent on lanes.  Note this is an explicit
		 negative list since that's much shorter than the respective
		 positive one but it's critical to keep maintaining it.  */
	      if (is_gimple_call (STMT_VINFO_STMT (rep)))
		switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
		  {
		  case CFN_COMPLEX_ADD_ROT90:
		  case CFN_COMPLEX_ADD_ROT270:
		  case CFN_COMPLEX_MUL:
		  case CFN_COMPLEX_MUL_CONJ:
		  case CFN_VEC_ADDSUB:
		    vertices[idx].perm_out = 0;
		    continue;
		  default:;
		  }
	    }

	  int perm;
	  if (!slpg->vertices[idx].succ)
	    /* Pick up pre-computed leaf values.  */
	    perm = vertices[idx].perm_out;
	  else
	    {
	      perm = vertices[idx].get_perm_in ();
	      for (graph_edge *succ = slpg->vertices[idx].succ;
		   succ; succ = succ->succ_next)
		{
		  int succ_idx = succ->dest;
		  int succ_perm = vertices[succ_idx].perm_out;
		  /* Handle unvisited (and constant) nodes optimistically.  */
		  /* ??? But for constants once we want to handle
		     non-bijective permutes we have to verify the permute,
		     when unifying lanes, will not unify different constants.
		     For example see gcc.dg/vect/bb-slp-14.c for a case
		     that would break.  */
		  if (succ_perm == -1)
		    continue;
		  if (perm == -1)
		    perm = succ_perm;
		  else if (succ_perm == 0
			   || !vect_slp_perms_eq (perms, perm, succ_perm))
		    {
		      perm = 0;
		      break;
		    }
		}

	      /* If this is a node we do not want to eventually unshare
		 but it can be permuted at will, verify all users have
		 the same permutations registered and otherwise drop to
		 the unpermuted state.  */
	      if (perm == -1
		  && SLP_TREE_DEF_TYPE (node) != vect_external_def
		  && SLP_TREE_DEF_TYPE (node) != vect_constant_def)
		{
		  int preds_perm = -1;
		  for (graph_edge *pred = slpg->vertices[idx].pred;
		       pred; pred = pred->pred_next)
		    {
		      int pred_perm = vertices[pred->src].get_perm_in ();
		      if (preds_perm == -1)
			preds_perm = pred_perm;
		      else if (!vect_slp_perms_eq (perms,
						   pred_perm, preds_perm))
			{
			  perm = 0;
			  break;
			}
		    }
		}

	      if (!vect_slp_perms_eq (perms, perm,
				      vertices[idx].get_perm_in ()))
		{
		  /* Make sure we eventually converge.  */
		  gcc_checking_assert (vertices[idx].get_perm_in () == -1
				       || perm == 0);
		  changed = true;
		  if (perm == 0)
		    {
		      vertices[idx].perm_out = 0;
		      vertices[idx].materialize = 0;
		    }
		  if (!vertices[idx].materialize)
		    vertices[idx].perm_out = perm;
		}
	    }

	  /* Elide pruning at materialization points in the first
	     iteration phase.  */
	  if (!do_materialization)
	    continue;
	  if (perm == 0 || perm == -1)
	    continue;

	  /* Decide on permute materialization.  Look whether there's
	     a use (pred) edge that is permuted differently than us.
	     In that case mark ourselves so the permutation is applied.
	     For VEC_PERM_EXPRs the permutation doesn't carry along
	     from children to parents so force materialization at the
	     point of the VEC_PERM_EXPR.  In principle VEC_PERM_EXPRs
	     are a source of an arbitrary permutation again, similar
	     to constants/externals - that's something we do not yet
	     optimally handle.  */
	  bool all_preds_permuted = (SLP_TREE_CODE (node) != VEC_PERM_EXPR
				     && slpg->vertices[idx].pred != NULL);
	  if (all_preds_permuted)
	    for (graph_edge *pred = slpg->vertices[idx].pred;
		 pred; pred = pred->pred_next)
	      {
		int pred_perm = vertices[pred->src].get_perm_in ();
		gcc_checking_assert (pred_perm != -1);
		if (!vect_slp_perms_eq (perms, perm, pred_perm))
		  {
		    all_preds_permuted = false;
		    break;
		  }
	      }
	  if (!all_preds_permuted)
	    {
	      if (!vertices[idx].materialize)
		changed = true;
	      vertices[idx].materialize = perm;
	      vertices[idx].perm_out = 0;
	    }
	}

      /* If the initial propagation converged, switch on materialization
	 and re-propagate.  */
      if (!changed && !do_materialization)
	{
	  do_materialization = true;
	  changed = true;
	}
    }
  while (changed);
  statistics_counter_event (cfun, "SLP optimize perm iterations", iteration);
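  /* The iteration above runs in two phases: pure propagation until it
     reaches a fixed point, then a second round with do_materialization
     set which additionally picks materialization points where users
     disagree on the incoming permutation.  */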
  /* Compute pre-order.  */
  auto_vec<int> heads;
  heads.reserve (vinfo->slp_instances.length ());
  for (slp_instance inst : vinfo->slp_instances)
    heads.quick_push (SLP_INSTANCE_TREE (inst)->vertex);

  auto_vec<int> po;
  graphds_dfs (slpg, &heads[0], heads.length (), &po, true, NULL, NULL);

  /* Propagate materialized permutes to "any" permute nodes.  For heads
     ending up as "any" (reductions with just invariants), set them to
     no permute.  */
  for (int idx : heads)
    if (vertices[idx].perm_out == -1)
      vertices[idx].perm_out = 0;
  for (i = po.length (); i > 0; --i)
    {
      int idx = po[i-1];
      int perm_in = vertices[idx].get_perm_in ();
      slp_tree node = vertices[idx].node;
      if (SLP_TREE_DEF_TYPE (node) == vect_external_def
	  || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
	continue;
      gcc_assert (perm_in != -1);
      for (graph_edge *succ = slpg->vertices[idx].succ;
	   succ; succ = succ->succ_next)
	{
	  slp_tree succ_node = vertices[succ->dest].node;
	  if (SLP_TREE_DEF_TYPE (succ_node) == vect_external_def
	      || SLP_TREE_DEF_TYPE (succ_node) == vect_constant_def)
	    continue;
	  if (vertices[succ->dest].perm_out == -1)
	    vertices[succ->dest].perm_out = perm_in;
	  else
	    /* Propagation should have ensured that all preds have the same
	       permutation.  */
	    gcc_assert (vect_slp_perms_eq (perms, perm_in,
					   vertices[succ->dest].perm_out));
	}
    }
  /* Materialize the permutes.  */
  for (i = 0; i < vertices.length (); ++i)
    {
      int perm = vertices[i].get_perm_in ();
      if (perm <= 0)
	continue;

      slp_tree node = vertices[i].node;

      /* First permute invariant/external original successors.  */
      unsigned j;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
	{
	  if (!child
	      || (SLP_TREE_DEF_TYPE (child) != vect_constant_def
		  && SLP_TREE_DEF_TYPE (child) != vect_external_def))
	    continue;

	  /* If the vector is uniform there's nothing to do.  */
	  if (vect_slp_tree_uniform_p (child))
	    continue;

	  /* We can end up sharing some externals via two_operator
	     handling.  Be prepared to unshare those.  */
	  if (child->refcnt != 1)
	    {
	      gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
	      SLP_TREE_CHILDREN (node)[j] = child
		= vect_create_new_slp_node
		    (SLP_TREE_SCALAR_OPS (child).copy ());
	    }
	  vect_slp_permute (perms[perm],
			    SLP_TREE_SCALAR_OPS (child), true);
	}

      if (vertices[i].materialize)
	{
	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
	    /* For loads simply drop the permutation, the load permutation
	       already performs the desired permutation.  */
	    ;
	  else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
	    {
	      /* If the node is already a permute node we can apply
		 the permutation to the lane selection, effectively
		 materializing it on the incoming vectors.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "simplifying permute node %p\n",
				 node);
	      for (unsigned k = 0;
		   k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
		SLP_TREE_LANE_PERMUTATION (node)[k].second
		  = perms[perm][SLP_TREE_LANE_PERMUTATION (node)[k].second];
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "inserting permute node in place of %p\n",
				 node);

	      /* Make a copy of NODE and in-place change it to a
		 VEC_PERM node to permute the lanes of the copy.  */
	      slp_tree copy = new _slp_tree;
	      SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
	      SLP_TREE_CHILDREN (node) = vNULL;
	      SLP_TREE_SCALAR_STMTS (copy)
		= SLP_TREE_SCALAR_STMTS (node).copy ();
	      vect_slp_permute (perms[perm],
				SLP_TREE_SCALAR_STMTS (copy), true);
	      gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
	      SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
	      gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
	      SLP_TREE_LANE_PERMUTATION (copy)
		= SLP_TREE_LANE_PERMUTATION (node);
	      SLP_TREE_LANE_PERMUTATION (node) = vNULL;
	      SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
	      copy->refcnt = 1;
	      copy->max_nunits = node->max_nunits;
	      SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
	      SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
	      SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);

	      /* Now turn NODE into a VEC_PERM.  */
	      SLP_TREE_CHILDREN (node).safe_push (copy);
	      SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
	      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
		SLP_TREE_LANE_PERMUTATION (node)
		  .quick_push (std::make_pair (0, perms[perm][j]));
	      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
	    }
	}
      else
	{
	  /* Apply the reverse permutation to our stmts.  */
	  vect_slp_permute (perms[perm],
			    SLP_TREE_SCALAR_STMTS (node), true);
	  /* And to the load permutation, which we can simply
	     make regular by design.  */
	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
	    {
	      /* ??? When we handle non-bijective permutes the idea
		 is that we can force the load-permutation to be
		 { min, min + 1, min + 2, ... max }.  But then the
		 scalar defs might no longer match the lane content
		 which means wrong-code with live lane vectorization.
		 So we possibly have to have NULL entries for those.  */
	      vect_slp_permute (perms[perm],
				SLP_TREE_LOAD_PERMUTATION (node), true);
	    }
	}
    }
  /* Elide any permutations at BB reduction roots.  */
  if (is_a <bb_vec_info> (vinfo))
    {
      for (slp_instance instance : vinfo->slp_instances)
	{
	  if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
	    continue;
	  slp_tree old = SLP_INSTANCE_TREE (instance);
	  if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
	      && SLP_TREE_CHILDREN (old).length () == 1)
	    {
	      slp_tree child = SLP_TREE_CHILDREN (old)[0];
	      if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
		{
		  /* Preserve the special VEC_PERM we use to shield existing
		     vector defs from the rest.  But make it a no-op.  */
		  unsigned k = 0;
		  for (std::pair<unsigned, unsigned> &p
			 : SLP_TREE_LANE_PERMUTATION (old))
		    p.second = k++;
		}
	      else
		{
		  SLP_INSTANCE_TREE (instance) = child;
		  SLP_TREE_REF_COUNT (child)++;
		  vect_free_slp_tree (old);
		}
	    }
	  else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
		   && SLP_TREE_REF_COUNT (old) == 1
		   && vertices[old->vertex].materialize)
	    {
	      /* ???  For loads the situation is more complex since
		 we can't modify the permute in place in case the
		 node is used multiple times.  In fact for loads this
		 should be somehow handled in the propagation engine.  */
	      /* Apply the reverse permutation to our stmts.  */
	      int perm = vertices[old->vertex].get_perm_in ();
	      vect_slp_permute (perms[perm],
				SLP_TREE_SCALAR_STMTS (old), true);
	      vect_slp_permute (perms[perm],
				SLP_TREE_LOAD_PERMUTATION (old), true);
	    }
	}
    }

  /* Free the perms vector used for propagation.  */
  while (!perms.is_empty ())
    perms.pop ().release ();
  free_graph (slpg);
  /* Now elide load permutations that are not necessary.  */
  for (i = 0; i < leafs.length (); ++i)
    {
      node = vertices[leafs[i]].node;
      if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
	continue;

      /* In basic block vectorization we allow any subchain of an interleaving
	 chain.
	 FORNOW: not in loop SLP because of realignment complications.  */
      if (is_a <bb_vec_info> (vinfo))
	{
	  bool subchain_p = true;
	  stmt_vec_info next_load_info = NULL;
	  stmt_vec_info load_info;
	  unsigned j;
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      if (j != 0
		  && (next_load_info != load_info
		      || DR_GROUP_GAP (load_info) != 1))
		{
		  subchain_p = false;
		  break;
		}
	      next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
	    }
	  if (subchain_p)
	    {
	      SLP_TREE_LOAD_PERMUTATION (node).release ();
	      continue;
	    }
	}
      else
	{
	  stmt_vec_info load_info;
	  bool this_load_permuted = false;
	  unsigned j;
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
	      {
		this_load_permuted = true;
		break;
	      }
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  if (!this_load_permuted
	      /* The load requires permutation when unrolling exposes
		 a gap either because the group is larger than the SLP
		 group-size or because there is a gap between the groups.  */
	      && (known_eq (LOOP_VINFO_VECT_FACTOR
			      (as_a <loop_vec_info> (vinfo)), 1U)
		  || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
		      && DR_GROUP_GAP (first_stmt_info) == 0)))
	    {
	      SLP_TREE_LOAD_PERMUTATION (node).release ();
	      continue;
	    }
	}
    }
}

/* Gather loads reachable from the individual SLP graph entries.  */

void
vect_gather_slp_loads (vec_info *vinfo)
{
  unsigned i;
  slp_instance instance;
  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
    {
      hash_set<slp_tree> visited;
      vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
			     SLP_INSTANCE_TREE (instance), visited);
    }
}

/* For each possible SLP instance decide whether to SLP it and calculate overall
   unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
   least one instance.  */

bool
vect_make_slp_decision (loop_vec_info loop_vinfo)
{
  unsigned int i;
  poly_uint64 unrolling_factor = 1;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  int decided_to_slp = 0;

  DUMP_VECT_SCOPE ("vect_make_slp_decision");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* FORNOW: SLP if you can.  */
      /* All unroll factors have the form:

	   GET_MODE_SIZE (vinfo->vector_mode) * X

	 for some rational X, so they must have a common multiple.  */
      unrolling_factor
	= force_common_multiple (unrolling_factor,
				 SLP_INSTANCE_UNROLLING_FACTOR (instance));

      /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
	 loop-based vectorization.  Such stmts will be marked as HYBRID.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
      decided_to_slp++;
    }

  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;

  if (decided_to_slp && dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Decided to SLP %d instances. Unrolling factor ",
		       decided_to_slp);
      dump_dec (MSG_NOTE, unrolling_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  return (decided_to_slp > 0);
}

/* Private data for vect_detect_hybrid_slp.  */
struct vdhs_data
{
  loop_vec_info loop_vinfo;
  vec<stmt_vec_info> *worklist;
};

/* Walker for walk_gimple_op.  */

static tree
vect_detect_hybrid_slp (tree *tp, int *, void *data)
{
  walk_stmt_info *wi = (walk_stmt_info *)data;
  vdhs_data *dat = (vdhs_data *)wi->info;

  if (wi->is_lhs)
    return NULL_TREE;

  stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
  if (!def_stmt_info)
    return NULL_TREE;
  def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
  if (PURE_SLP_STMT (def_stmt_info))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
			 def_stmt_info->stmt);
      STMT_SLP_TYPE (def_stmt_info) = hybrid;
      dat->worklist->safe_push (def_stmt_info);
    }

  return NULL_TREE;
}

/* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
   if so, otherwise push it to WORKLIST.  */

static void
maybe_push_to_hybrid_worklist (vec_info *vinfo,
			       vec<stmt_vec_info> &worklist,
			       stmt_vec_info stmt_info)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Processing hybrid candidate : %G", stmt_info->stmt);
  stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
  imm_use_iterator iter2;
  ssa_op_iter iter1;
  use_operand_p use_p;
  def_operand_p def_p;
  bool any_def = false;
  FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
    {
      any_def = true;
      FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
	{
	  if (is_gimple_debug (USE_STMT (use_p)))
	    continue;
	  stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
	  /* An out-of loop use means this is a loop_vect sink.  */
	  if (!use_info)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Found loop_vect sink: %G", stmt_info->stmt);
	      worklist.safe_push (stmt_info);
	      return;
	    }
	  else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Found loop_vect use: %G", use_info->stmt);
	      worklist.safe_push (stmt_info);
	      return;
	    }
	}
    }
  /* No def means this is a loop_vect sink.  */
  if (!any_def)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Found loop_vect sink: %G", stmt_info->stmt);
      worklist.safe_push (stmt_info);
      return;
    }
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
  STMT_SLP_TYPE (stmt_info) = pure_slp;
}

/* Find stmts that must be both vectorized and SLPed.  */

void
vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
{
  DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");

  /* All stmts participating in SLP are marked pure_slp, all other
     stmts are loop_vect.
     First collect all loop_vect stmts into a worklist.
     SLP patterns cause not all original scalar stmts to appear in
     SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
     Rectify this here and do a backward walk over the IL only considering
     stmts as loop_vect when they are used by a loop_vect stmt and otherwise
     mark them as pure_slp.  */
  auto_vec<stmt_vec_info> worklist;
  for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
    {
      basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
	    maybe_push_to_hybrid_worklist (loop_vinfo,
					   worklist, stmt_info);
	}
      for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
	   gsi_prev (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (is_gimple_debug (stmt))
	    continue;
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
	    {
	      for (gimple_stmt_iterator gsi2
		     = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
		   !gsi_end_p (gsi2); gsi_next (&gsi2))
		{
		  stmt_vec_info patt_info
		    = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
		  if (!STMT_SLP_TYPE (patt_info)
		      && STMT_VINFO_RELEVANT (patt_info))
		    maybe_push_to_hybrid_worklist (loop_vinfo,
						   worklist, patt_info);
		}
	      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
	    }
	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
	    maybe_push_to_hybrid_worklist (loop_vinfo,
					   worklist, stmt_info);
	}
    }

  /* Now we have a worklist of non-SLP stmts, follow use->def chains and
     mark any SLP vectorized stmt as hybrid.
     ???  We're visiting def stmts N times (once for each non-SLP and
     once for each hybrid-SLP use).  */
  walk_stmt_info wi;
  vdhs_data dat;
  dat.worklist = &worklist;
  dat.loop_vinfo = loop_vinfo;
  memset (&wi, 0, sizeof (wi));
  wi.info = (void *)&dat;
  while (!worklist.is_empty ())
    {
      stmt_vec_info stmt_info = worklist.pop ();
      /* Since SSA operands are not set up for pattern stmts we need
	 to use walk_gimple_op.  */
      wi.is_lhs = 0;
      walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
    }
}

/* Initialize a bb_vec_info struct for the statements in BBS basic blocks.  */

_bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
  : vec_info (vec_info::bb, init_cost (NULL, false), shared),
    bbs (_bbs),
    roots (vNULL)
{
  for (unsigned i = 0; i < bbs.length (); ++i)
    {
      if (i != 0)
	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	     gsi_next (&si))
	  {
	    gphi *phi = si.phi ();
	    gimple_set_uid (phi, 0);
	    add_stmt (phi);
	  }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, 0);
	  if (is_gimple_debug (stmt))
	    continue;
	  add_stmt (stmt);
	}
    }
}

/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the basic block.  */

_bb_vec_info::~_bb_vec_info ()
{
  /* Reset region marker.  */
  for (unsigned i = 0; i < bbs.length (); ++i)
    {
      if (i != 0)
	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	     gsi_next (&si))
	  {
	    gphi *phi = si.phi ();
	    gimple_set_uid (phi, -1);
	  }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, -1);
	}
    }

  for (unsigned i = 0; i < roots.length (); ++i)
    {
      roots[i].stmts.release ();
      roots[i].roots.release ();
    }
  roots.release ();
}

/* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
   given that child nodes have already been processed, and that
   their def types currently match their SLP node's def type.  */

static bool
vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
				    slp_instance node_instance,
				    stmt_vector_for_cost *cost_vec)
{
  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);

  /* Calculate the number of vector statements to be created for the
     scalar stmts in this node.  For SLP reductions it is equal to the
     number of vector statements in the children (which has already been
     calculated by the recursive call).  Otherwise it is the number of
     scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
     VF divided by the number of elements in a vector.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
	if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
	  {
	    SLP_TREE_NUMBER_OF_VEC_STMTS (node)
	      = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
	    break;
	  }
    }
  else
    {
      poly_uint64 vf;
      if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
	vf = loop_vinfo->vectorization_factor;
      else
	vf = 1;
      unsigned int group_size = SLP_TREE_LANES (node);
      tree vectype = SLP_TREE_VECTYPE (node);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node)
	= vect_get_num_vectors (vf * group_size, vectype);
    }

  /* Handle purely internal nodes.  */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);

  gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
  if (is_a <bb_vec_info> (vinfo)
      && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "desired vector type conflicts with earlier one "
			 "for %G", stmt_info->stmt);
      return false;
    }

  bool dummy;
  return vect_analyze_stmt (vinfo, stmt_info, &dummy,
			    node, node_instance, cost_vec);
}

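/* As a numeric example of the computation above: with a vectorization
   factor of 4, two lanes per node and (assumed) V4SI vectors,
   vect_get_num_vectors yields (4 * 2) / 4 = 2 vector statements for
   the node; for basic-block vectorization VF is 1 and the lane count
   alone determines the result.  */
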
/* Try to build NODE from scalars, returning true on success.
   NODE_INSTANCE is the SLP instance that contains NODE.  */

static bool
vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
			      slp_instance node_instance)
{
  stmt_vec_info stmt_info;
  unsigned int i;

  if (!is_a <bb_vec_info> (vinfo)
      || node == SLP_INSTANCE_TREE (node_instance)
      || !SLP_TREE_SCALAR_STMTS (node).exists ()
      || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Building vector operands of %p from scalars instead\n",
		     node);

  /* Don't remove and free the child nodes here, since they could be
     referenced by other structures.  The analysis and scheduling phases
     (need to) ignore child nodes of anything that isn't vect_internal_def.  */
  unsigned int group_size = SLP_TREE_LANES (node);
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
  SLP_TREE_LOAD_PERMUTATION (node).release ();
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
      SLP_TREE_SCALAR_OPS (node)[i] = lhs;
    }
  return true;
}

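/* E.g. when a two-lane node computing

     _3 = a_1 + b_2;   _6 = c_4 + d_5;

   cannot be vectorized itself, recording { _3, _6 } as
   SLP_TREE_SCALAR_OPS and demoting the node to vect_external_def lets
   the parent build its vector operand from the scalar results
   instead of felling the whole tree.  */
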
/* Compute the prologue cost for invariant or constant operands represented
   by NODE.  */

static void
vect_prologue_cost_for_slp (slp_tree node,
			    stmt_vector_for_cost *cost_vec)
{
  /* There's a special case of an existing vector, that costs nothing.  */
  if (SLP_TREE_SCALAR_OPS (node).length () == 0
      && !SLP_TREE_VEC_DEFS (node).is_empty ())
    return;
  /* Without looking at the actual initializer a vector of
     constants can be implemented as load from the constant pool.
     When all elements are the same we can use a splat.  */
  tree vectype = SLP_TREE_VECTYPE (node);
  unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
  unsigned num_vects_to_check;
  unsigned HOST_WIDE_INT const_nunits;
  unsigned nelt_limit;
  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
      && ! multiple_p (const_nunits, group_size))
    {
      num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
      nelt_limit = const_nunits;
    }
  else
    {
      /* If either the vector has variable length or the vectors
	 are composed of repeated whole groups we only need to
	 cost construction once.  All vectors will be the same.  */
      num_vects_to_check = 1;
      nelt_limit = group_size;
    }
  tree elt = NULL_TREE;
  unsigned nelt = 0;
  for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
    {
      unsigned si = j % group_size;
      if (nelt == 0)
	elt = SLP_TREE_SCALAR_OPS (node)[si];
      /* ???  We're just tracking whether all operands of a single
	 vector initializer are the same, ideally we'd check if
	 we emitted the same one already.  */
      else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
	elt = NULL_TREE;
      nelt++;
      if (nelt == nelt_limit)
	{
	  record_stmt_cost (cost_vec, 1,
			    SLP_TREE_DEF_TYPE (node) == vect_external_def
			    ? (elt ? scalar_to_vec : vec_construct)
			    : vector_load,
			    NULL, vectype, 0, vect_prologue);
	  nelt = 0;
	}
    }
}

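/* Costing examples for the above: { x_1, x_1, x_1, x_1 } is a splat of
   a non-constant and is accounted as a single scalar_to_vec;
   { x_1, y_2, x_1, y_2 } needs a vec_construct; an all-constant
   initializer is assumed to become a load from the constant pool and
   is costed as a vector_load.  */
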
/* Analyze statements contained in SLP tree NODE after recursively analyzing
   the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.

   Return true if the operations are supported.  */

static bool
vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
				  slp_instance node_instance,
				  hash_set<slp_tree> &visited_set,
				  vec<slp_tree> &visited_vec,
				  stmt_vector_for_cost *cost_vec)
{
  int i, j;
  slp_tree child;

  /* Assume we can code-generate all invariants.  */
  if (!node
      || SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    return true;

  if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Failed cyclic SLP reference in %p\n", node);
      return false;
    }
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);

  /* If we already analyzed the exact same set of scalar stmts we're done.
     We share the generated vector stmts for those.  */
  if (visited_set.add (node))
    return true;
  visited_vec.safe_push (node);

  bool res = true;
  unsigned visited_rec_start = visited_vec.length ();
  unsigned cost_vec_rec_start = cost_vec->length ();
  bool seen_non_constant_child = false;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
					      visited_set, visited_vec,
					      cost_vec);
      if (!res)
	break;
      if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
	seen_non_constant_child = true;
    }
  /* We're having difficulties scheduling nodes with just constant
     operands and no scalar stmts since we then cannot compute a stmt
     insertion place.  */
  if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Cannot vectorize all-constant op node %p\n", node);
      res = false;
    }

  if (res)
    res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
					      cost_vec);
  /* If analysis failed we have to pop all recursive visited nodes
     plus ourselves.  */
  if (!res)
    {
      while (visited_vec.length () >= visited_rec_start)
	visited_set.remove (visited_vec.pop ());
      cost_vec->truncate (cost_vec_rec_start);
    }

  /* When the node can be vectorized cost invariant nodes it references.
     This is not done in DFS order to allow the referring node
     vectorizable_* calls to nail down the invariant nodes vector type
     and possibly unshare it if it needs a different vector type than
     consumers.  */
  if (res)
    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
      if (child
	  && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (child) == vect_external_def)
	  /* Perform usual caching, note code-generation still
	     code-gens these nodes multiple times but we expect
	     to CSE them later.  */
	  && !visited_set.add (child))
	{
	  visited_vec.safe_push (child);
	  /* ???  After auditing more code paths make a "default"
	     and push the vector type from NODE to all children
	     if it is not already set.  */
	  /* Compute the number of vectors to be generated.  */
	  tree vector_type = SLP_TREE_VECTYPE (child);
	  if (!vector_type)
	    {
	      /* For shifts with a scalar argument we don't need
		 to cost or code-generate anything.
		 ???  Represent this more explicitly.  */
	      gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
			   == shift_vec_info_type)
			  && j == 1);
	      continue;
	    }
	  unsigned group_size = SLP_TREE_LANES (child);
	  poly_uint64 vf = 1;
	  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
	    vf = loop_vinfo->vectorization_factor;
	  SLP_TREE_NUMBER_OF_VEC_STMTS (child)
	    = vect_get_num_vectors (vf * group_size, vector_type);
	  /* And cost them.  */
	  vect_prologue_cost_for_slp (child, cost_vec);
	}

  /* If this node or any of its children can't be vectorized, try pruning
     the tree here rather than felling the whole thing.  */
  if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
    {
      /* We'll need to revisit this for invariant costing and number
	 of vectorized stmt setting.  */
      res = true;
    }

  return res;
}

/* Mark lanes of NODE that are live outside of the basic-block vectorized
   region and that can be vectorized using vectorizable_live_operation
   with STMT_VINFO_LIVE_P.  Unhandled live operations will cause the
   scalar code computing them to be retained.  */

static void
vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
			     slp_instance instance,
			     stmt_vector_for_cost *cost_vec,
			     hash_set<stmt_vec_info> &svisited,
			     hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  unsigned i;
  stmt_vec_info stmt_info;
  stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      if (svisited.contains (stmt_info))
	continue;
      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
	  && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
	/* Only the pattern root stmt computes the original scalar value.  */
	continue;
      bool mark_visited = true;
      gimple *orig_stmt = orig_stmt_info->stmt;
      ssa_op_iter op_iter;
      def_operand_p def_p;
      FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
	{
	  imm_use_iterator use_iter;
	  gimple *use_stmt;
	  stmt_vec_info use_stmt_info;
	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
	    if (!is_gimple_debug (use_stmt))
	      {
		use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
		if (!use_stmt_info
		    || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
		  {
		    STMT_VINFO_LIVE_P (stmt_info) = true;
		    if (vectorizable_live_operation (bb_vinfo, stmt_info,
						     NULL, node, instance, i,
						     false, cost_vec))
		      /* ???  So we know we can vectorize the live stmt
			 from one SLP node.  If we cannot do so from all
			 or none consistently we'd have to record which
			 SLP node (and lane) we want to use for the live
			 operation.  So make sure we can code-generate
			 from all nodes.  */
		      mark_visited = false;
		    else
		      STMT_VINFO_LIVE_P (stmt_info) = false;
		    break;
		  }
	      }
	  /* We have to verify whether we can insert the lane extract
	     before all uses.  The following is a conservative approximation.
	     We cannot put this into vectorizable_live_operation because
	     iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
	     doesn't work.
	     Note that while the fact that we emit code for loads at the
	     first load should make this a non-problem leafs we construct
	     from scalars are vectorized after the last scalar def.
	     ???  If we'd actually compute the insert location during
	     analysis we could use sth less conservative than the last
	     scalar stmt in the node for the dominance check.  */
	  /* ???  What remains is "live" uses in vector CTORs in the same
	     SLP graph which is where those uses can end up code-generated
	     right after their definition instead of close to their original
	     use.  But that would restrict us to code-generate lane-extracts
	     from the latest stmt in a node.  So we compensate for this
	     during code-generation, simply not replacing uses for those
	     hopefully rare cases.  */
	  if (STMT_VINFO_LIVE_P (stmt_info))
	    FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
	      if (!is_gimple_debug (use_stmt)
		  && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
		      || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
		  && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Cannot determine insertion place for "
				     "lane extract\n");
		  STMT_VINFO_LIVE_P (stmt_info) = false;
		  mark_visited = true;
		}
	}
      if (mark_visited)
	svisited.add (stmt_info);
    }

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
				   cost_vec, svisited, visited);
}

/* Determine whether we can vectorize the reduction epilogue for INSTANCE.  */

static bool
vectorizable_bb_reduc_epilogue (slp_instance instance,
				stmt_vector_for_cost *cost_vec)
{
  enum tree_code reduc_code
    = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
  if (reduc_code == MINUS_EXPR)
    reduc_code = PLUS_EXPR;
  internal_fn reduc_fn;
  tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
  if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
      || reduc_fn == IFN_LAST
      || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH))
    return false;

  /* There's no way to cost a horizontal vector reduction via REDUC_FN so
     cost log2 vector operations plus shuffles.  */
  unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
  record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
		    vectype, 0, vect_body);
  record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
		    vectype, 0, vect_body);
  return true;
}

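/* For instance, for a V4SF PLUS reduction this charges
   floor_log2 (4) = 2 vector adds plus 2 shuffles, mirroring the usual
   log2 shift-and-add expansion of a horizontal reduction; the actual
   code generation may instead use the target's REDUC_PLUS pattern
   directly, so this is only a cost approximation.  */
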
/* Prune from ROOTS all stmts that are computed as part of lanes of NODE
   and recurse to children.  */

static void
vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
			      hash_set<slp_tree> &visited)
{
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
      || visited.add (node))
    return;

  stmt_vec_info stmt;
  unsigned i;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    roots.remove (vect_orig_stmt (stmt));

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_slp_prune_covered_roots (child, roots, visited);
}

/* Analyze statements in SLP instances of VINFO.  Return true if the
   operations are supported.  */

bool
vect_slp_analyze_operations (vec_info *vinfo)
{
  slp_instance instance;
  int i;

  DUMP_VECT_SCOPE ("vect_slp_analyze_operations");

  hash_set<slp_tree> visited;
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
    {
      auto_vec<slp_tree> visited_vec;
      stmt_vector_for_cost cost_vec;
      cost_vec.create (2);
      if (is_a <bb_vec_info> (vinfo))
	vect_location = instance->location ();
      if (!vect_slp_analyze_node_operations (vinfo,
					     SLP_INSTANCE_TREE (instance),
					     instance, visited, visited_vec,
					     &cost_vec)
	  /* CTOR instances require vectorized defs for the SLP tree root.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
	      && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
		  != vect_internal_def))
	  /* Check we can vectorize the reduction.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
	      && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
	{
	  slp_tree node = SLP_INSTANCE_TREE (instance);
	  stmt_vec_info stmt_info;
	  if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	    stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
	  else
	    stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "removing SLP instance operations starting from: %G",
			     stmt_info->stmt);
	  vect_free_slp_instance (instance);
	  vinfo->slp_instances.ordered_remove (i);
	  cost_vec.release ();
	  while (!visited_vec.is_empty ())
	    visited.remove (visited_vec.pop ());
	  continue;
	}

      i++;

      /* For BB vectorization remember the SLP graph entry
	 cost for later.  */
      if (is_a <bb_vec_info> (vinfo))
	instance->cost_vec = cost_vec;
      else
	{
	  add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
	  cost_vec.release ();
	}
    }

  /* Now look for SLP instances with a root that are covered by other
     instances and remove them.  */
  hash_set<stmt_vec_info> roots;
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
      roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
  if (!roots.is_empty ())
    {
      visited.empty ();
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
	vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
				      visited);
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
	if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
	    && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
	  {
	    stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "removing SLP instance operations starting "
			       "from: %G", root->stmt);
	    vect_free_slp_instance (instance);
	    vinfo->slp_instances.ordered_remove (i);
	  }
	else
	  ++i;
    }

  /* Compute vectorizable live stmts.  */
  if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
    {
      hash_set<stmt_vec_info> svisited;
      hash_set<slp_tree> visited;
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
	{
	  vect_location = instance->location ();
	  vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
				       instance, &instance->cost_vec, svisited,
				       visited);
	}
    }

  return !vinfo->slp_instances.is_empty ();
}

/* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
   closing the eventual chain.  */

static slp_instance
get_ultimate_leader (slp_instance instance,
		     hash_map<slp_instance, slp_instance> &instance_leader)
{
  auto_vec<slp_instance *, 8> chain;
  slp_instance *tem;
  while (*(tem = instance_leader.get (instance)) != instance)
    {
      chain.safe_push (tem);
      instance = *tem;
    }
  while (!chain.is_empty ())
    *chain.pop () = instance;
  return instance;
}

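/* Note the chain recorded above also performs path compression:
   resolving the leader of C in a chain C -> B -> A rewrites both
   intermediate entries to point at A directly, so subsequent lookups
   terminate immediately.  */
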
/* Worker of vect_bb_partition_graph, recurse on NODE.  */

static void
vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
			   slp_instance instance, slp_tree node,
			   hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
			   hash_map<slp_instance, slp_instance> &instance_leader,
			   hash_set<slp_tree> &visited)
{
  stmt_vec_info stmt_info;
  unsigned i;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      bool existed_p;
      slp_instance &stmt_instance
	= stmt_to_instance.get_or_insert (stmt_info, &existed_p);
      if (!existed_p)
	;
      else if (stmt_instance != instance)
	{
	  /* If we're running into a previously marked stmt make us the
	     leader of the current ultimate leader.  This keeps the
	     leader chain acyclic and works even when the current instance
	     connects two previously independent graph parts.  */
	  slp_instance stmt_leader
	    = get_ultimate_leader (stmt_instance, instance_leader);
	  if (stmt_leader != instance)
	    instance_leader.put (stmt_leader, instance);
	}
      stmt_instance = instance;
    }

  if (!SLP_TREE_SCALAR_STMTS (node).is_empty () && visited.add (node))
    return;

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
				 instance_leader, visited);
}

/* Partition the SLP graph into pieces that can be costed independently.  */

static void
vect_bb_partition_graph (bb_vec_info bb_vinfo)
{
  DUMP_VECT_SCOPE ("vect_bb_partition_graph");

  /* First walk the SLP graph assigning each involved scalar stmt a
     corresponding SLP graph entry and upon visiting a previously
     marked stmt, make the stmts leader the current SLP graph entry.  */
  hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
  hash_map<slp_instance, slp_instance> instance_leader;
  hash_set<slp_tree> visited;
  slp_instance instance;
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    {
      instance_leader.put (instance, instance);
      vect_bb_partition_graph_r (bb_vinfo,
				 instance, SLP_INSTANCE_TREE (instance),
				 stmt_to_instance, instance_leader,
				 visited);
    }

  /* Then collect entries to each independent subgraph.  */
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    {
      slp_instance leader = get_ultimate_leader (instance, instance_leader);
      leader->subgraph_entries.safe_push (instance);
      if (dump_enabled_p ()
	  && leader != instance)
	dump_printf_loc (MSG_NOTE, vect_location,
			 "instance %p is leader of %p\n",
			 leader, instance);
    }
}

/* Compute the scalar cost of the SLP node NODE and its children
   and return it.  Do not account defs that are marked in LIFE and
   update LIFE according to uses of NODE.  */

static void
vect_bb_slp_scalar_cost (vec_info *vinfo,
			 slp_tree node, vec<bool, va_heap> *life,
			 stmt_vector_for_cost *cost_vec,
			 hash_set<slp_tree> &visited)
{
  unsigned i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      ssa_op_iter op_iter;
      def_operand_p def_p;

      if ((*life)[i])
	continue;

      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      gimple *orig_stmt = orig_stmt_info->stmt;

      /* If there is a non-vectorized use of the defs then the scalar
	 stmt is kept live in which case we do not account it or any
	 required defs in the SLP children in the scalar cost.  This
	 way we make the vectorization more costly when compared to
	 the scalar cost.  */
      if (!STMT_VINFO_LIVE_P (stmt_info))
	{
	  FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
	    {
	      imm_use_iterator use_iter;
	      gimple *use_stmt;
	      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
		if (!is_gimple_debug (use_stmt))
		  {
		    stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
		    if (!use_stmt_info
			|| !PURE_SLP_STMT
			      (vect_stmt_to_vectorize (use_stmt_info)))
		      {
			(*life)[i] = true;
			break;
		      }
		  }
	      if ((*life)[i])
		break;
	    }
	  if ((*life)[i])
	    continue;
	}

      /* Count scalar stmts only once.  */
      if (gimple_visited_p (orig_stmt))
	continue;
      gimple_set_visited (orig_stmt, true);

      vect_cost_for_stmt kind;
      if (STMT_VINFO_DATA_REF (orig_stmt_info))
	{
	  if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
	    kind = scalar_load;
	  else
	    kind = scalar_store;
	}
      else if (vect_nop_conversion_p (orig_stmt_info))
	continue;
      /* For single-argument PHIs assume coalescing which means zero cost
	 for the scalar and the vector PHIs.  This avoids artificially
	 favoring the vector path (but may pessimize it in some cases).  */
      else if (is_a <gphi *> (orig_stmt_info->stmt)
	       && gimple_phi_num_args
		    (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
	continue;
      else
	kind = scalar_stmt;
      record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
			SLP_TREE_VECTYPE (node), 0, vect_body);
    }

  auto_vec<bool, 20> subtree_life;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	{
	  /* Do not directly pass LIFE to the recursive call, copy it to
	     confine changes in the callee to the current child/subtree.  */
	  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
	    {
	      subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
	      for (unsigned j = 0;
		   j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
		{
		  auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
		  if (perm.first == i)
		    subtree_life[perm.second] = (*life)[j];
		}
	    }
	  else
	    {
	      gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
	      subtree_life.safe_splice (*life);
	    }
	  vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
				   visited);
	  subtree_life.truncate (0);
	}
    }
}

/* Comparator for the loop-index sorted cost vectors.  */

static int
li_cost_vec_cmp (const void *a_, const void *b_)
{
  auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
  auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
  if (a->first < b->first)
    return -1;
  else if (a->first == b->first)
    return 0;
  return 1;
}

/* Check if vectorization of the basic block is profitable for the
   subgraph denoted by SLP_INSTANCES.  */

static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
				    vec<slp_instance> slp_instances)
{
  slp_instance instance;
  int i;
  unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
  unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
      hash_set<slp_tree> visited;
      FOR_EACH_VEC_ELT (slp_instances, i, instance)
	vect_print_slp_graph (MSG_NOTE, vect_location,
			      SLP_INSTANCE_TREE (instance), visited);
    }

  /* Calculate scalar cost and sum the cost for the vector stmts
     previously collected.  */
  stmt_vector_for_cost scalar_costs = vNULL;
  stmt_vector_for_cost vector_costs = vNULL;
  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      auto_vec<bool, 20> life;
      life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
			      true);
      if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	record_stmt_cost (&scalar_costs,
			  SLP_INSTANCE_ROOT_STMTS (instance).length (),
			  scalar_stmt,
			  SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
      vect_bb_slp_scalar_cost (bb_vinfo,
			       SLP_INSTANCE_TREE (instance),
			       &life, &scalar_costs, visited);
      vector_costs.safe_splice (instance->cost_vec);
      instance->cost_vec.release ();
    }
  /* Unset visited flag.  */
  stmt_info_for_cost *cost;
  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    gimple_set_visited (cost->stmt_info->stmt, false);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");

  /* When costing non-loop vectorization we need to consider each covered
     loop independently and make sure vectorization is profitable.  For
     now we assume a loop may be not entered or executed an arbitrary
     number of iterations (???  static information can provide more
     precise info here) which means we can simply cost each containing
     loops stmts separately.  */

  /* First produce cost vectors sorted by loop index.  */
  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    li_scalar_costs (scalar_costs.length ());
  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    li_vector_costs (vector_costs.length ());
  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    {
      unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_scalar_costs.quick_push (std::make_pair (l, cost));
    }
  /* Use a random used loop as fallback in case the first vector_costs
     entry does not have a stmt_info associated with it.  */
  unsigned l = li_scalar_costs[0].first;
  FOR_EACH_VEC_ELT (vector_costs, i, cost)
    {
      /* We inherit from the previous COST, invariants, externals and
	 extracts immediately follow the cost for the related stmt.  */
      if (cost->stmt_info)
	l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_vector_costs.quick_push (std::make_pair (l, cost));
    }
  li_scalar_costs.qsort (li_cost_vec_cmp);
  li_vector_costs.qsort (li_cost_vec_cmp);

  /* Now cost the portions individually.  */
  unsigned vi = 0;
  unsigned si = 0;
  while (si < li_scalar_costs.length ()
	 && vi < li_vector_costs.length ())
    {
      unsigned sl = li_scalar_costs[si].first;
      unsigned vl = li_vector_costs[vi].first;
      if (sl != vl)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Scalar %d and vector %d loop part do not "
			     "match up, skipping scalar part\n", sl, vl);
	  /* Skip the scalar part, assuming zero cost on the vector side.  */
	  do
	    si++;
	  while (si < li_scalar_costs.length ()
		 && li_scalar_costs[si].first == sl);
	  continue;
	}

      void *scalar_target_cost_data = init_cost (NULL, true);
      do
	{
	  add_stmt_cost (bb_vinfo, scalar_target_cost_data,
			 li_scalar_costs[si].second);
	  si++;
	}
      while (si < li_scalar_costs.length ()
	     && li_scalar_costs[si].first == sl);
      unsigned dummy;
      finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
      destroy_cost_data (scalar_target_cost_data);

      /* Complete the target-specific vector cost calculation.  */
      void *vect_target_cost_data = init_cost (NULL, false);
      do
	{
	  add_stmt_cost (bb_vinfo, vect_target_cost_data,
			 li_vector_costs[vi].second);
	  vi++;
	}
      while (vi < li_vector_costs.length ()
	     && li_vector_costs[vi].first == vl);
      finish_cost (vect_target_cost_data, &vec_prologue_cost,
		   &vec_inside_cost, &vec_epilogue_cost);
      destroy_cost_data (vect_target_cost_data);

      vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Cost model analysis for part in loop %d:\n", sl);
	  dump_printf (MSG_NOTE, "  Vector cost: %d\n",
		       vec_inside_cost + vec_outside_cost);
	  dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
	}

      /* Vectorization is profitable if its cost is no more than the cost
	 of the scalar version.  Note that we err on the vector side for
	 equal cost because the cost estimate is otherwise quite pessimistic
	 (constant uses are free on the scalar side but cost a load on the
	 vector side for example).  */
      if (vec_outside_cost + vec_inside_cost > scalar_cost)
	{
	  scalar_costs.release ();
	  vector_costs.release ();
	  return false;
	}
    }
  if (vi < li_vector_costs.length ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Excess vector cost for part in loop %d:\n",
			 li_vector_costs[vi].first);
      scalar_costs.release ();
      vector_costs.release ();
      return false;
    }

  scalar_costs.release ();
  vector_costs.release ();
  return true;
}

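/* A concrete reading of the decision above: a subgraph part with
   scalar cost 8 and vector cost 4 + 2 (inside plus outside) is kept;
   only a strictly higher vector cost rejects it (ties go to the
   vector side), and leftover vector cost for a loop part without a
   matching scalar part likewise rejects the whole subgraph.  */
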
/* qsort comparator for lane defs.  */

static int
vld_cmp (const void *a_, const void *b_)
{
  auto *a = (const std::pair<unsigned, tree> *)a_;
  auto *b = (const std::pair<unsigned, tree> *)b_;
  return a->first - b->first;
}

/* Return true if USE_STMT is a vector lane insert into VEC and set
   *THIS_LANE to the lane number that is set.  */

static bool
vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
{
  gassign *use_ass = dyn_cast <gassign *> (use_stmt);
  if (!use_ass
      || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
      || (vec
	  ? gimple_assign_rhs1 (use_ass) != vec
	  : ((vec = gimple_assign_rhs1 (use_ass)), false))
      || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
				     TREE_TYPE (gimple_assign_rhs2 (use_ass)))
      || !constant_multiple_p
	    (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
	     tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
	     this_lane))
    return false;
  return true;
}

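/* For example, assuming V4SI, the insert

     v_2 = BIT_INSERT_EXPR <v_1, s_3, 64>;

   writes bit offset 64 with 32-bit elements, so *THIS_LANE becomes
   64 / 32 = 2.  */
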
/* Find any vectorizable constructors and add them to the grouped_store
   array.  */

static void
vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
{
  for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
	 !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
	if (!assign)
	  continue;

	tree rhs = gimple_assign_rhs1 (assign);
	enum tree_code code = gimple_assign_rhs_code (assign);
	use_operand_p use_p;
	gimple *use_stmt;
	if (code == CONSTRUCTOR)
	  {
	    if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
		|| maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
			     CONSTRUCTOR_NELTS (rhs))
		|| VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
		|| uniform_vector_p (rhs))
	      continue;

	    unsigned j;
	    tree val;
	    FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
	      if (TREE_CODE (val) != SSA_NAME
		  || !bb_vinfo->lookup_def (val))
		break;
	    if (j != CONSTRUCTOR_NELTS (rhs))
	      continue;

	    stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
	    BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
	  }
	else if (code == BIT_INSERT_EXPR
		 && VECTOR_TYPE_P (TREE_TYPE (rhs))
		 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
		 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
		 && integer_zerop (gimple_assign_rhs3 (assign))
		 && useless_type_conversion_p
		      (TREE_TYPE (TREE_TYPE (rhs)),
		       TREE_TYPE (gimple_assign_rhs2 (assign)))
		 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
	  {
	    /* We start to match on insert to lane zero but since the
	       inserts need not be ordered we'd have to search both
	       the def and the use chains.  */
	    tree vectype = TREE_TYPE (rhs);
	    unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
	    auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
	    auto_sbitmap lanes (nlanes);
	    bitmap_clear (lanes);
	    bitmap_set_bit (lanes, 0);
	    tree def = gimple_assign_lhs (assign);
	    lane_defs.quick_push
		      (std::make_pair (0, gimple_assign_rhs2 (assign)));
	    unsigned lanes_found = 1;
	    /* Start with the use chains, the last stmt will be the root.  */
	    stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
	    vec<stmt_vec_info> roots = vNULL;
	    roots.safe_push (last);
	    do
	      {
		use_operand_p use_p;
		gimple *use_stmt;
		if (!single_imm_use (def, &use_p, &use_stmt))
		  break;
		unsigned this_lane;
		if (!bb_vinfo->lookup_stmt (use_stmt)
		    || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
		    || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
		  break;
		if (bitmap_bit_p (lanes, this_lane))
		  break;
		lanes_found++;
		bitmap_set_bit (lanes, this_lane);
		gassign *use_ass = as_a <gassign *> (use_stmt);
		lane_defs.quick_push (std::make_pair
					(this_lane,
					 gimple_assign_rhs2 (use_ass)));
		last = bb_vinfo->lookup_stmt (use_ass);
		roots.safe_push (last);
		def = gimple_assign_lhs (use_ass);
	      }
	    while (lanes_found < nlanes);
	    if (roots.length () > 1)
	      std::swap(roots[0], roots[roots.length () - 1]);
	    if (lanes_found < nlanes)
	      {
		/* Now search the def chain.  */
		def = gimple_assign_rhs1 (assign);
		do
		  {
		    if (TREE_CODE (def) != SSA_NAME
			|| !has_single_use (def))
		      break;
		    gimple *def_stmt = SSA_NAME_DEF_STMT (def);
		    unsigned this_lane;
		    if (!bb_vinfo->lookup_stmt (def_stmt)
			|| !vect_slp_is_lane_insert (def_stmt,
						     NULL_TREE, &this_lane)
			|| !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
		      break;
		    if (bitmap_bit_p (lanes, this_lane))
		      break;
		    lanes_found++;
		    bitmap_set_bit (lanes, this_lane);
		    lane_defs.quick_push (std::make_pair
					    (this_lane,
					     gimple_assign_rhs2 (def_stmt)));
		    roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
		    def = gimple_assign_rhs1 (def_stmt);
		  }
		while (lanes_found < nlanes);
	      }
	    if (lanes_found == nlanes)
	      {
		/* Sort lane_defs after the lane index and register the root.  */
		lane_defs.qsort (vld_cmp);
		vec<stmt_vec_info> stmts;
		stmts.create (nlanes);
		for (unsigned i = 0; i < nlanes; ++i)
		  stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
		bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
						     stmts, roots));
	      }
	    else
	      roots.release ();
	  }
	else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
		 && (associative_tree_code (code) || code == MINUS_EXPR)
		 /* ???  The flag_associative_math and TYPE_OVERFLOW_WRAPS
		    checks pessimize a two-element reduction.  PR54400.
		    ???  In-order reduction could be handled if we only
		    traverse one operand chain in vect_slp_linearize_chain.  */
		 && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
		     || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
			 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
		 /* Ops with constants at the tail can be stripped here.  */
		 && TREE_CODE (rhs) == SSA_NAME
		 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
		 /* Should be the chain end.  */
		 && (!single_imm_use (gimple_assign_lhs (assign),
				      &use_p, &use_stmt)
		     || !is_gimple_assign (use_stmt)
		     || (gimple_assign_rhs_code (use_stmt) != code
			 && ((code != PLUS_EXPR && code != MINUS_EXPR)
			     || (gimple_assign_rhs_code (use_stmt)
				 != (code == PLUS_EXPR
				     ? MINUS_EXPR : PLUS_EXPR))))))
	  {
	    /* We start the match at the end of a possible association
	       chain.  */
	    auto_vec<chain_op_t> chain;
	    auto_vec<std::pair<tree_code, gimple *> > worklist;
	    auto_vec<gimple *> chain_stmts;
	    gimple *code_stmt = NULL, *alt_code_stmt = NULL;
	    if (code == MINUS_EXPR)
	      code = PLUS_EXPR;
	    internal_fn reduc_fn;
	    if (!reduction_fn_for_scalar_code (code, &reduc_fn)
		|| reduc_fn == IFN_LAST)
	      continue;
	    vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
				      code_stmt, alt_code_stmt, &chain_stmts);
	    if (chain.length () > 1)
	      {
		/* Sort the chain according to def_type and operation.  */
		chain.sort (dt_sort_cmp, bb_vinfo);
		/* ???  Now we'd want to strip externals and constants
		   but record those to be handled in the epilogue.  */
		/* ???  For now do not allow mixing ops or externs/constants.  */
		bool invalid = false;
		for (unsigned i = 0; i < chain.length (); ++i)
		  if (chain[i].dt != vect_internal_def
		      || chain[i].code != code)
		    invalid = true;
		if (!invalid)
		  {
		    vec<stmt_vec_info> stmts;
		    stmts.create (chain.length ());
		    for (unsigned i = 0; i < chain.length (); ++i)
		      stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
		    vec<stmt_vec_info> roots;
		    roots.create (chain_stmts.length ());
		    for (unsigned i = 0; i < chain_stmts.length (); ++i)
		      roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
		    bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
							 stmts, roots));
		  }
	      }
	  }
      }
}

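/* Informally, the CONSTRUCTOR case above matches IL like

     x_5 = {a_1, b_2, c_3, d_4};

   and records it as an SLP root; the BIT_INSERT_EXPR case stitches a
   chain of lane inserts starting at lane zero into the same form; and
   the association-chain case records e.g. ((a + b) + c) + d as a
   bb reduction root when all chain elements use the same operation on
   internal defs.  */
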
/* Walk the grouped store chains and replace entries with their
   pattern variant if any.  */

static void
vect_fixup_store_groups_with_patterns (vec_info *vinfo)
{
  stmt_vec_info first_element;
  unsigned i;

  FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    {
      /* We also have CTORs in this array.  */
      if (!STMT_VINFO_GROUPED_ACCESS (first_element))
	continue;
      if (STMT_VINFO_IN_PATTERN_P (first_element))
	{
	  stmt_vec_info orig = first_element;
	  first_element = STMT_VINFO_RELATED_STMT (first_element);
	  DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
	  DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
	  DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
	  DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
	  vinfo->grouped_stores[i] = first_element;
	}
      stmt_vec_info prev = first_element;
      while (DR_GROUP_NEXT_ELEMENT (prev))
	{
	  stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
	  if (STMT_VINFO_IN_PATTERN_P (elt))
	    {
	      stmt_vec_info orig = elt;
	      elt = STMT_VINFO_RELATED_STMT (elt);
	      DR_GROUP_NEXT_ELEMENT (prev) = elt;
	      DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
	      DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
	    }
	  DR_GROUP_FIRST_ELEMENT (elt) = first_element;
	  prev = elt;
	}
    }
}

/* Check if the region described by BB_VINFO can be vectorized, returning
   true if so.  When returning false, set FATAL to true if the same failure
   would prevent vectorization at other vector sizes, false if it is still
   worth trying other sizes.  N_STMTS is the number of statements in the
   region.  */

static bool
vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
		       vec<int> *dataref_groups)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_bb");

  slp_instance instance;
  int i;
  poly_uint64 min_vf = 2;

  /* The first group of checks is independent of the vector size.  */
  fatal = true;

  /* Analyze the data references.  */

  if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: unhandled data-ref in basic "
			 "block.\n");
      return false;
    }

  if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: unhandled data access in "
			 "basic block.\n");
      return false;
    }

  vect_slp_check_for_constructors (bb_vinfo);

  /* If there are no grouped stores and no constructors in the region
     there is no need to continue with pattern recog as vect_analyze_slp
     will fail anyway.  */
  if (bb_vinfo->grouped_stores.is_empty ()
      && bb_vinfo->roots.is_empty ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: no grouped stores in "
			 "basic block.\n");
      return false;
    }

  /* While the rest of the analysis below depends on it in some way.  */
  fatal = false;

  vect_pattern_recog (bb_vinfo);

  /* Update store groups from pattern processing.  */
  vect_fixup_store_groups_with_patterns (bb_vinfo);

  /* Check the SLP opportunities in the basic block, analyze and build SLP
     trees.  */
  if (!vect_analyze_slp (bb_vinfo, n_stmts))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Failed to SLP the basic block.\n");
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "not vectorized: failed to find SLP opportunities "
			   "in basic block.\n");
	}
      return false;
    }

  /* Optimize permutations.  */
  vect_optimize_slp (bb_vinfo);

  /* Gather the loads reachable from the SLP graph entries.  */
  vect_gather_slp_loads (bb_vinfo);

  vect_record_base_alignments (bb_vinfo);

  /* Analyze and verify the alignment of data references and the
     dependence in the SLP instances.  */
  for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
    {
      vect_location = instance->location ();
      if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
	  || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
	{
	  slp_tree node = SLP_INSTANCE_TREE (instance);
	  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "removing SLP instance operations starting from: %G",
			     stmt_info->stmt);
	  vect_free_slp_instance (instance);
	  BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
	  continue;
	}

      /* Mark all the statements that we want to vectorize as pure SLP and
	 relevant.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
      vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
      unsigned j;
      stmt_vec_info root;
      /* Likewise consider instance root stmts as vectorized.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
	STMT_SLP_TYPE (root) = pure_slp;

      i++;
    }
  if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
    return false;

  if (!vect_slp_analyze_operations (bb_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: bad operation in basic block.\n");
      return false;
    }

  vect_bb_partition_graph (bb_vinfo);

  return true;
}

);
/* Subroutine of vect_slp_bb.  Try to vectorize the statements for all
   basic blocks in BBS, returning true on success.
   The region has N_STMTS statements and has the datarefs given by DATAREFS.  */

static bool
vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
                 vec<int> *dataref_groups, unsigned int n_stmts)
{
  bb_vec_info bb_vinfo;
  auto_vector_modes vector_modes;

  /* Autodetect first vector size we try.  */
  machine_mode next_vector_mode = VOIDmode;
  targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
  unsigned int mode_i = 0;

  vec_info_shared shared;

  machine_mode autodetected_vector_mode = VOIDmode;
  while (1)
    {
      bool vectorized = false;
      bool fatal = false;
      bb_vinfo = new _bb_vec_info (bbs, &shared);

      bool first_time_p = shared.datarefs.is_empty ();
      BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
      if (first_time_p)
        bb_vinfo->shared->save_datarefs ();
      else
        bb_vinfo->shared->check_datarefs ();
      bb_vinfo->vector_mode = next_vector_mode;

      if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "***** Analysis succeeded with vector mode"
                               " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
              dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
            }

          bb_vinfo->shared->check_datarefs ();

          unsigned i;
          slp_instance instance;
          FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
            {
              if (instance->subgraph_entries.is_empty ())
                continue;

              vect_location = instance->location ();
              if (!unlimited_cost_model (NULL)
                  && !vect_bb_vectorization_profitable_p
                        (bb_vinfo, instance->subgraph_entries))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "not vectorized: vectorization is not "
                                     "profitable.\n");
                  continue;
                }

              if (!dbg_cnt (vect_slp))
                continue;

              if (!vectorized && dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Basic block will be vectorized "
                                 "using SLP\n");
              vectorized = true;

              vect_schedule_slp (bb_vinfo, instance->subgraph_entries);

              unsigned HOST_WIDE_INT bytes;
              if (dump_enabled_p ())
                {
                  if (GET_MODE_SIZE
                        (bb_vinfo->vector_mode).is_constant (&bytes))
                    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
                                     "basic block part vectorized using %wu "
                                     "byte vectors\n", bytes);
                  else
                    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
                                     "basic block part vectorized using "
                                     "variable length vectors\n");
                }
            }
        }
      else
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "***** Analysis failed with vector mode %s\n",
                             GET_MODE_NAME (bb_vinfo->vector_mode));
        }

      if (mode_i == 0)
        autodetected_vector_mode = bb_vinfo->vector_mode;

      if (!fatal)
        while (mode_i < vector_modes.length ()
               && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
          {
            if (dump_enabled_p ())
              dump_printf_loc (MSG_NOTE, vect_location,
                               "***** The result for vector mode %s would"
                               " be the same\n",
                               GET_MODE_NAME (vector_modes[mode_i]));
            mode_i += 1;
          }

      delete bb_vinfo;

      if (mode_i < vector_modes.length ()
          && VECTOR_MODE_P (autodetected_vector_mode)
          && (related_vector_mode (vector_modes[mode_i],
                                   GET_MODE_INNER (autodetected_vector_mode))
              == autodetected_vector_mode)
          && (related_vector_mode (autodetected_vector_mode,
                                   GET_MODE_INNER (vector_modes[mode_i]))
              == vector_modes[mode_i]))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "***** Skipping vector mode %s, which would"
                             " repeat the analysis for %s\n",
                             GET_MODE_NAME (vector_modes[mode_i]),
                             GET_MODE_NAME (autodetected_vector_mode));
          mode_i += 1;
        }

      if (vectorized
          || mode_i == vector_modes.length ()
          || autodetected_vector_mode == VOIDmode
          /* If vect_slp_analyze_bb_1 signaled that analysis for all
             vector sizes will fail do not bother iterating.  */
          || fatal)
        return vectorized;

      /* Try the next biggest vector size.  */
      next_vector_mode = vector_modes[mode_i++];
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "***** Re-trying analysis with vector mode %s\n",
                         GET_MODE_NAME (next_vector_mode));
    }
}
/* Main entry for the BB vectorizer.  Analyze and transform BBS, returns
   true if anything in the basic-block was vectorized.  */

static bool
vect_slp_bbs (vec<basic_block> bbs)
{
  vec<data_reference_p> datarefs = vNULL;
  auto_vec<int> dataref_groups;
  int insns = 0;
  int current_group = 0;

  for (unsigned i = 0; i < bbs.length (); i++)
    {
      basic_block bb = bbs[i];
      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (is_gimple_debug (stmt))
            continue;

          insns++;

          if (gimple_location (stmt) != UNKNOWN_LOCATION)
            vect_location = stmt;

          if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
                                              &dataref_groups, current_group))
            ++current_group;
        }
    }

  return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
}
/* Main entry for the BB vectorizer.  Analyze and transform BB, returns
   true if anything in the basic-block was vectorized.  */

bool
vect_slp_bb (basic_block bb)
{
  auto_vec<basic_block> bbs;
  bbs.safe_push (bb);
  return vect_slp_bbs (bbs);
}
/* Main entry for the BB vectorizer.  Analyze and transform the whole
   function FUN, returning true if anything in it was vectorized.  */

bool
vect_slp_function (function *fun)
{
  bool r = false;
  int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
  unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);

  /* For the moment split the function into pieces to avoid making
     the iteration on the vector mode moot.  Split at points we know
     to not handle well which is CFG merges (SLP discovery doesn't
     handle non-loop-header PHIs) and loop exits.  Since pattern
     recog requires reverse iteration to visit uses before defs
     simply chop RPO into pieces.  */
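  /* For instance (an illustration, not an exhaustive list of split points):
     if the current region happens to start in one arm of a diamond

         bb2 (region start)    bb3
                 \             /
                       bb4

     then the merge block bb4 is not dominated by bb2, so the region
     accumulated so far is handed to vect_slp_bbs on its own and a new
     region is started at the merge block.  */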
  auto_vec<basic_block> bbs;
  for (unsigned i = 0; i < n; i++)
    {
      basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
      bool split = false;

      /* Split when a BB is not dominated by the first block.  */
      if (!bbs.is_empty ()
          && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "splitting region at dominance boundary bb%d\n",
                             bb->index);
          split = true;
        }
      /* Split when the loop determined by the first block
         is exited.  This is because we eventually insert
         invariants at region begin.  */
      else if (!bbs.is_empty ()
               && bbs[0]->loop_father != bb->loop_father
               && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "splitting region at loop %d exit at bb%d\n",
                             bbs[0]->loop_father->num, bb->index);
          split = true;
        }

      if (split && !bbs.is_empty ())
        {
          r |= vect_slp_bbs (bbs);
          bbs.truncate (0);
          bbs.quick_push (bb);
        }
      else
        bbs.safe_push (bb);

      /* When we have a stmt ending this block and defining a
         value we have to insert on edges when inserting after it for
         a vector containing its definition.  Avoid this for now.  */
      if (gimple *last = last_stmt (bb))
        if (gimple_get_lhs (last)
            && is_ctrl_altering_stmt (last))
          {
            if (dump_enabled_p ())
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "splitting region at control altering "
                               "definition %G", last);
            r |= vect_slp_bbs (bbs);
            bbs.truncate (0);
          }
    }

  if (!bbs.is_empty ())
    r |= vect_slp_bbs (bbs);

  free (rpo);
  return r;
}
/* Build a variable-length vector in which the elements in ELTS are repeated
   to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   RESULTS and add any new instructions to SEQ.

   The approach we use is:

   (1) Find a vector mode VM with integer elements of mode IM.

   (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
       ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
       from small vectors to IM.

   (3) Duplicate each ELTS'[I] into a vector of mode VM.

   (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
       correct byte contents.

   (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.

   We try to find the largest IM for which this sequence works, in order
   to cut down on the number of interleaves.  */
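/* A hedged illustration (the exact modes depend on the target; VNx4SI and
   VNx2DI are SVE-style names): to repeat the four ints { a, b, c, d } across
   VNx4SI results we can pick IM = DImode and VM = VNx2DI.  Two
   VIEW_CONVERT_EXPRs produce the DImode values ab and cd, each is duplicated
   into a VNx2DI splat, one interleaving VEC_PERM_EXPR of the two splats
   yields { ab, cd, ab, cd, ... }, and a final VIEW_CONVERT_EXPR turns that
   into the VNx4SI { a, b, c, d, a, b, c, d, ... }.  */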
void
duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
                          vec<tree> elts, unsigned int nresults,
                          vec<tree> &results)
{
  unsigned int nelts = elts.length ();
  tree element_type = TREE_TYPE (vector_type);

  /* (1) Find a vector mode VM with integer elements of mode IM.  */
  unsigned int nvectors = 1;
  tree new_vector_type;
  tree permutes[2];
  if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
                                       &nvectors, &new_vector_type,
                                       permutes))
    gcc_unreachable ();

  /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
  unsigned int partial_nelts = nelts / nvectors;
  tree partial_vector_type = build_vector_type (element_type, partial_nelts);

  tree_vector_builder partial_elts;
  auto_vec<tree, 32> pieces (nvectors * 2);
  pieces.quick_grow_cleared (nvectors * 2);
  for (unsigned int i = 0; i < nvectors; ++i)
    {
      /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
             ELTS' has mode IM.  */
      partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
      for (unsigned int j = 0; j < partial_nelts; ++j)
        partial_elts.quick_push (elts[i * partial_nelts + j]);
      tree t = gimple_build_vector (seq, &partial_elts);
      t = gimple_build (seq, VIEW_CONVERT_EXPR,
                        TREE_TYPE (new_vector_type), t);

      /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
      pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
    }

  /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
         correct byte contents.

     Conceptually, we need to repeat the following operation log2(nvectors)
     times, where hi_start = nvectors / 2:

        out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
        out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);

     However, if each input repeats every N elements and the VF is
     a multiple of N * 2, the HI result is the same as the LO result.
     This will be true for the first N1 iterations of the outer loop,
     followed by N2 iterations for which both the LO and HI results
     are needed, so that:

        N1 + N2 = log2(nvectors)

     Each "N1 iteration" doubles the number of redundant vectors and the
     effect of the process as a whole is to have a sequence of nvectors/2**N1
     vectors that repeats 2**N1 times.  Rather than generate these redundant
     vectors, we halve the number of vectors for each N1 iteration.  */
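  /* A worked example with illustrative numbers: for nvectors == 4 we would
     conceptually run log2(4) == 2 interleave rounds over in[0..3].  If the
     inputs repeat frequently enough that the first round's HI outputs equal
     its LO outputs (an "N1 iteration"), only the two LO vectors are kept,
     the second round interleaves just those two, and the resulting sequence
     of two vectors implicitly repeats twice instead of materializing four
     distinct vectors.  */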
  unsigned int in_start = 0;
  unsigned int out_start = nvectors;
  unsigned int new_nvectors = nvectors;
  for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
    {
      unsigned int hi_start = new_nvectors / 2;
      unsigned int out_i = 0;
      for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
        {
          if ((in_i & 1) != 0
              && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
                             2 * in_repeat))
            continue;

          tree output = make_ssa_name (new_vector_type);
          tree input1 = pieces[in_start + (in_i / 2)];
          tree input2 = pieces[in_start + (in_i / 2) + hi_start];
          gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
                                               input1, input2,
                                               permutes[in_i & 1]);
          gimple_seq_add_stmt (seq, stmt);
          pieces[out_start + out_i] = output;
          out_i += 1;
        }
      std::swap (in_start, out_start);
      new_nvectors = out_i;
    }

  /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
  results.reserve (nresults);
  for (unsigned int i = 0; i < nresults; ++i)
    if (i < new_nvectors)
      results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
                                        pieces[in_start + i]));
    else
      results.quick_push (results[i - new_nvectors]);
}
/* For constant and loop invariant defs in OP_NODE this function creates
   vector defs that will be used in the vectorized stmts and stores them
   to SLP_TREE_VEC_DEFS of OP_NODE.  */

static void
vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
{
  unsigned HOST_WIDE_INT nunits;
  tree vec_cst;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type;
  tree vop;
  int group_size = op_node->ops.length ();
  unsigned int vec_num, i;
  unsigned number_of_copies = 1;
  bool constant_p;
  gimple_seq ctor_seq = NULL;
  auto_vec<tree, 16> permute_results;

  /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
  vector_type = SLP_TREE_VECTYPE (op_node);

  unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
  SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
  auto_vec<tree> voprnds (number_of_vectors);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  /* When using duplicate_and_interleave, we just need one element for
     each scalar statement.  */
  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_copies = nunits * number_of_vectors / group_size;
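  /* E.g. with the numbers from the comment above: GROUP_SIZE == 2,
     NUNITS == 4 and NUMBER_OF_VECTORS == 1 gives 4 * 1 / 2 == 2 copies
     ({s1, s2, s1, s2}), while GROUP_SIZE == 8, NUNITS == 4 and
     NUMBER_OF_VECTORS == 2 gives 4 * 2 / 8 == 1 copy per vector.  */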
  number_of_places_left_in_vector = nunits;
  constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  stmt_vec_info insert_after = NULL;
  for (j = 0; j < number_of_copies; j++)
    {
      tree op;
      for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
        {
          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          number_of_places_left_in_vector--;
          tree orig_op = op;
          if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
            {
              if (CONSTANT_CLASS_P (op))
                {
                  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
                    {
                      /* Can't use VIEW_CONVERT_EXPR for booleans because
                         of possibly different sizes of scalar value and
                         vector element.  */
                      if (integer_zerop (op))
                        op = build_int_cst (TREE_TYPE (vector_type), 0);
                      else if (integer_onep (op))
                        op = build_all_ones_cst (TREE_TYPE (vector_type));
                      else
                        gcc_unreachable ();
                    }
                  else
                    op = fold_unary (VIEW_CONVERT_EXPR,
                                     TREE_TYPE (vector_type), op);
                  gcc_assert (op && CONSTANT_CLASS_P (op));
                }
              else
                {
                  tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
                  gimple *init_stmt;
                  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
                    {
                      tree true_val
                        = build_all_ones_cst (TREE_TYPE (vector_type));
                      tree false_val
                        = build_zero_cst (TREE_TYPE (vector_type));
                      gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
                      init_stmt = gimple_build_assign (new_temp, COND_EXPR,
                                                       op, true_val,
                                                       false_val);
                    }
                  else
                    {
                      op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
                                   op);
                      init_stmt
                        = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
                                               op);
                    }
                  gimple_seq_add_stmt (&ctor_seq, init_stmt);
                  op = new_temp;
                }
            }
          elts[number_of_places_left_in_vector] = op;
          if (!CONSTANT_CLASS_P (op))
            constant_p = false;
          /* For BB vectorization we have to compute an insert location
             when a def is inside the analyzed region since we cannot
             simply insert at the BB start in this case.  */
          stmt_vec_info opdef;
          if (TREE_CODE (orig_op) == SSA_NAME
              && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
              && is_a <bb_vec_info> (vinfo)
              && (opdef = vinfo->lookup_def (orig_op)))
            {
              if (!insert_after)
                insert_after = opdef;
              else
                insert_after = get_later_stmt (insert_after, opdef);
            }

          if (number_of_places_left_in_vector == 0)
            {
              if (constant_p
                  ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
                  : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
                vec_cst = gimple_build_vector (&ctor_seq, &elts);
              else
                {
                  if (permute_results.is_empty ())
                    duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
                                              elts, number_of_vectors,
                                              permute_results);
                  vec_cst = permute_results[number_of_vectors - j - 1];
                }
              if (!gimple_seq_empty_p (ctor_seq))
                {
                  if (insert_after)
                    {
                      gimple_stmt_iterator gsi;
                      if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
                        {
                          gsi = gsi_after_labels
                                  (gimple_bb (insert_after->stmt));
                          gsi_insert_seq_before (&gsi, ctor_seq,
                                                 GSI_CONTINUE_LINKING);
                        }
                      else if (!stmt_ends_bb_p (insert_after->stmt))
                        {
                          gsi = gsi_for_stmt (insert_after->stmt);
                          gsi_insert_seq_after (&gsi, ctor_seq,
                                                GSI_CONTINUE_LINKING);
                        }
                      else
                        {
                          /* When we want to insert after a def where the
                             defining stmt throws then insert on the fallthru
                             edge.  */
                          edge e = find_fallthru_edge
                                     (gimple_bb (insert_after->stmt)->succs);
                          basic_block new_bb
                            = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
                          gcc_assert (!new_bb);
                        }
                    }
                  else
                    vinfo->insert_seq_on_entry (NULL, ctor_seq);
                  ctor_seq = NULL;
                }
              voprnds.quick_push (vec_cst);
              insert_after = NULL;
              number_of_places_left_in_vector = nunits;
              constant_p = true;
              elts.new_vector (vector_type, nunits, 1);
              elts.quick_grow (nunits);
            }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = voprnds.length ();
  for (j = vec_num; j != 0; j--)
    {
      vop = voprnds[j - 1];
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
    }

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
    for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
         i++)
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
}
/* Get the Ith vectorized definition from SLP_NODE.  */

tree
vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
{
  if (SLP_TREE_VEC_STMTS (slp_node).exists ())
    return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
  else
    return SLP_TREE_VEC_DEFS (slp_node)[i];
}
/* Get the vectorized definitions of SLP_NODE in *VEC_DEFS.  */

void
vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
{
  vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
  if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
    {
      unsigned j;
      gimple *vec_def_stmt;
      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
        vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
    }
  else
    vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
}
/* Get N vectorized definitions for SLP_NODE.  */

void
vect_get_slp_defs (vec_info *,
                   slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
{
  if (n == -1U)
    n = SLP_TREE_CHILDREN (slp_node).length ();

  for (unsigned i = 0; i < n; ++i)
    {
      slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
      vec<tree> vec_defs = vNULL;
      vect_get_slp_defs (child, &vec_defs);
      vec_oprnds->quick_push (vec_defs);
    }
}
/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE.  Store the number of vector
   permute instructions in *N_PERMS and the number of vector load
   instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
   that were not needed.  */

bool
vect_transform_slp_perm_load (vec_info *vinfo,
                              slp_tree node, vec<tree> dr_chain,
                              gimple_stmt_iterator *gsi, poly_uint64 vf,
                              bool analyze_only, unsigned *n_perms,
                              unsigned int *n_loads, bool dce_chain)
{
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
  int vec_index = 0;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
  unsigned int mask_element;
  machine_mode mode;

  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);

  mode = TYPE_MODE (vectype);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  if (! analyze_only)
    for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
         i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
      SLP_TREE_VEC_STMTS (node).quick_push (NULL);

  /* Generate permutation masks for every NODE.  Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4.  I.e., we have a
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

  int vect_stmts_counter = 0;
  unsigned int index = 0;
  int first_vec_index = -1;
  int second_vec_index = -1;
  bool noop_p = true;
  *n_perms = 0;

  vec_perm_builder mask;
  unsigned int nelts_to_build;
  unsigned int nvectors_per_build;
  unsigned int in_nlanes;
  bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
                      && multiple_p (nunits, group_size));
  if (repeating_p)
    {
      /* A single vector contains a whole number of copies of the node, so:
         (a) all permutes can use the same mask; and
         (b) the permutes only need a single vector input.  */
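      /* An illustrative example: for a two-element group with load
         permutation { 1, 0 } and V8SI vectors the single mask is
         { 1, 0, 3, 2, 5, 4, ... }, encoded below as group_size == 2
         patterns of three elements each, and each output vector is
         permuted from its single corresponding input vector.  */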
      mask.new_vector (nunits, group_size, 3);
      nelts_to_build = mask.encoded_nelts ();
      nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
      in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
    }
  else
    {
      /* We need to construct a separate mask for each vector statement.  */
      unsigned HOST_WIDE_INT const_nunits, const_vf;
      if (!nunits.is_constant (&const_nunits)
          || !vf.is_constant (&const_vf))
        return false;
      mask.new_vector (const_nunits, const_nunits, 1);
      nelts_to_build = const_vf * group_size;
      nvectors_per_build = 1;
      in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
    }
  auto_sbitmap used_in_lanes (in_nlanes);
  bitmap_clear (used_in_lanes);
  auto_bitmap used_defs;

  unsigned int count = mask.encoded_nelts ();
  mask.quick_grow (count);
  vec_perm_indices indices;

  for (unsigned int j = 0; j < nelts_to_build; j++)
    {
      unsigned int iter_num = j / group_size;
      unsigned int stmt_num = j % group_size;
      unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
                        + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
      bitmap_set_bit (used_in_lanes, i);
      if (repeating_p)
        {
          first_vec_index = 0;
          mask_element = i;
        }
      else
        {
          /* Enforced before the loop when !repeating_p.  */
          unsigned int const_nunits = nunits.to_constant ();
          vec_index = i / const_nunits;
          mask_element = i % const_nunits;
          if (vec_index == first_vec_index
              || first_vec_index == -1)
            {
              first_vec_index = vec_index;
            }
          else if (vec_index == second_vec_index
                   || second_vec_index == -1)
            {
              second_vec_index = vec_index;
              mask_element += const_nunits;
            }
          else
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "permutation requires at "
                                 "least three vectors %G",
                                 stmt_info->stmt);
              gcc_assert (analyze_only);
              return false;
            }

          gcc_assert (mask_element < 2 * const_nunits);
        }

      if (mask_element != index)
        noop_p = false;
      mask[index++] = mask_element;

      if (index == count && !noop_p)
        {
          indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
          if (!can_vec_perm_const_p (mode, indices))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                   vect_location,
                                   "unsupported vect permute { ");
                  for (i = 0; i < count; ++i)
                    {
                      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
                      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
                    }
                  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                }
              gcc_assert (analyze_only);
              return false;
            }

          ++*n_perms;
        }

      if (index == count)
        {
          if (!analyze_only)
            {
              tree mask_vec = NULL_TREE;

              if (! noop_p)
                mask_vec = vect_gen_perm_mask_checked (vectype, indices);

              if (second_vec_index == -1)
                second_vec_index = first_vec_index;

              for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
                {
                  /* Generate the permute statement if necessary.  */
                  tree first_vec = dr_chain[first_vec_index + ri];
                  tree second_vec = dr_chain[second_vec_index + ri];
                  gimple *perm_stmt;
                  if (! noop_p)
                    {
                      gassign *stmt = as_a <gassign *> (stmt_info->stmt);
                      tree perm_dest
                        = vect_create_destination_var (gimple_assign_lhs (stmt),
                                                       vectype);
                      perm_dest = make_ssa_name (perm_dest);
                      perm_stmt
                        = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
                                               first_vec, second_vec,
                                               mask_vec);
                      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
                                                   gsi);
                      if (dce_chain)
                        {
                          bitmap_set_bit (used_defs, first_vec_index + ri);
                          bitmap_set_bit (used_defs, second_vec_index + ri);
                        }
                    }
                  else
                    {
                      /* If mask was NULL_TREE generate the requested
                         identity transform.  */
                      perm_stmt = SSA_NAME_DEF_STMT (first_vec);
                      if (dce_chain)
                        bitmap_set_bit (used_defs, first_vec_index + ri);
                    }

                  /* Store the vector statement in NODE.  */
                  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
                }
            }

          index = 0;
          first_vec_index = -1;
          second_vec_index = -1;
          noop_p = true;
        }
    }

  if (n_loads)
    {
      if (repeating_p)
        *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
      else
        {
          /* Enforced above when !repeating_p.  */
          unsigned int const_nunits = nunits.to_constant ();
          *n_loads = 0;
          bool load_seen = false;
          for (unsigned i = 0; i < in_nlanes; ++i)
            {
              if (i % const_nunits == 0)
                {
                  if (load_seen)
                    *n_loads += 1;
                  load_seen = false;
                }
              if (bitmap_bit_p (used_in_lanes, i))
                load_seen = true;
            }
          if (load_seen)
            *n_loads += 1;
        }
    }

  if (dce_chain)
    for (unsigned i = 0; i < dr_chain.length (); ++i)
      if (!bitmap_bit_p (used_defs, i))
        {
          gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
          gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
          gsi_remove (&rgsi, true);
          release_defs (stmt);
        }

  return true;
}
/* Produce the next vector result for SLP permutation NODE by adding a vector
   statement at GSI.  If MASK_VEC is nonnull, add:

      <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>

   otherwise add:

      <new SSA name> = FIRST_DEF.  */

static void
vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
                          slp_tree node, tree first_def, tree second_def,
                          tree mask_vec)
{
  tree vectype = SLP_TREE_VECTYPE (node);

  /* ??? We SLP match existing vector element extracts but
     allow punning which we need to re-instantiate at uses
     but have no good way of explicitly representing.  */
  if (!types_compatible_p (TREE_TYPE (first_def), vectype))
    {
      gassign *conv_stmt
        = gimple_build_assign (make_ssa_name (vectype),
                               build1 (VIEW_CONVERT_EXPR, vectype, first_def));
      vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
      first_def = gimple_assign_lhs (conv_stmt);
    }
  gassign *perm_stmt;
  tree perm_dest = make_ssa_name (vectype);
  if (mask_vec)
    {
      if (!types_compatible_p (TREE_TYPE (second_def), vectype))
        {
          gassign *conv_stmt
            = gimple_build_assign (make_ssa_name (vectype),
                                   build1 (VIEW_CONVERT_EXPR,
                                           vectype, second_def));
          vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
          second_def = gimple_assign_lhs (conv_stmt);
        }
      perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
                                       first_def, second_def,
                                       mask_vec);
    }
  else
    /* We need a copy here in case the def was external.  */
    perm_stmt = gimple_build_assign (perm_dest, first_def);
  vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
  /* Store the vector statement in NODE.  */
  SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
}
/* Vectorize the SLP permutations in NODE as specified
   in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
   child number and lane number.
   Interleaving of two two-lane two-child SLP subtrees (not supported):
     [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
   A blend of two four-lane two-child SLP subtrees:
     [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   Highpart of a four-lane one-child SLP subtree (not supported):
     [ { 0, 2 }, { 0, 3 } ]
   Where currently only a subset is supported by code generating below.  */

static bool
vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
                              slp_tree node, stmt_vector_for_cost *cost_vec)
{
  tree vectype = SLP_TREE_VECTYPE (node);

  /* ??? We currently only support all same vector input and output types
     while the SLP IL should really do a concat + select and thus accept
     arbitrary mismatches.  */
  slp_tree child;
  unsigned i;
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (!vect_maybe_update_slp_op_vectype (child, vectype)
          || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Unsupported lane permutation\n");
          return false;
        }
      if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
        repeating_p = false;
    }

  vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
  gcc_assert (perm.length () == SLP_TREE_LANES (node));
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "vectorizing permutation");
      for (unsigned i = 0; i < perm.length (); ++i)
        dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
      if (repeating_p)
        dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
      dump_printf (MSG_NOTE, "\n");
    }

  /* REPEATING_P is true if every output vector is guaranteed to use the
     same permute vector.  We can handle that case for both variable-length
     and constant-length vectors, but we only handle other cases for
     constant-length vectors.

     Set:

     - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
       mask vector that we want to build.

     - NCOPIES to the number of copies of PERM that we need in order
       to build the necessary permute mask vectors.

     - NOUTPUTS_PER_MASK to the number of output vectors we want to create
       for each permute mask vector.  This is only relevant when GSI is
       nonnull.  */
  unsigned npatterns;
  unsigned nelts_per_pattern;
  unsigned ncopies;
  unsigned noutputs_per_mask;
  if (repeating_p)
    {
      /* We need a single permute mask vector that has the form:

           { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }

         In other words, the original n-element permute in PERM is
         "unrolled" to fill a full vector.  The stepped vector encoding
         that we use for permutes requires 3n elements.  */
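      /* E.g. (illustrative): a two-lane swap PERM of { op0[1], op0[0] }
         yields the mask { 1, 0, 3, 2, 5, 4 } in this encoding, i.e.
         n == 2 patterns of three elements each, extended as needed to
         fill however many lanes the vector mode provides.  */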
      npatterns = SLP_TREE_LANES (node);
      nelts_per_pattern = ncopies = 3;
      noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
    }
  else
    {
      /* Calculate every element of every permute mask vector explicitly,
         instead of relying on the pattern described above.  */
      if (!nunits.is_constant (&npatterns))
        return false;
      nelts_per_pattern = ncopies = 1;
      if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
        if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
          return false;
      noutputs_per_mask = 1;
    }
  unsigned olanes = ncopies * SLP_TREE_LANES (node);
  gcc_assert (repeating_p || multiple_p (olanes, nunits));

  /* Compute the { { SLP operand, vector index}, lane } permutation sequence
     from the { SLP operand, scalar lane } permutation as recorded in the
     SLP node as intermediate step.  This part should already work
     with SLP children with arbitrary number of lanes.  */
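  /* E.g. (illustrative): with V4SI children, scalar lane 5 of operand 0
     maps to { { 0, 1 }, 1 }, i.e. vector index 5 / 4 == 1 and vector
     lane 5 % 4 == 1.  In the repeating case the vector index stays 0 and
     the scalar lane is kept as-is, since one mask serves every vector.  */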
  auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
  auto_vec<unsigned> active_lane;
  vperm.create (olanes);
  active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
  for (unsigned i = 0; i < ncopies; ++i)
    {
      for (unsigned pi = 0; pi < perm.length (); ++pi)
        {
          std::pair<unsigned, unsigned> p = perm[pi];
          tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
          if (repeating_p)
            vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
          else
            {
              /* We checked above that the vectors are constant-length.  */
              unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
              unsigned vi = (active_lane[p.first] + p.second) / vnunits;
              unsigned vl = (active_lane[p.first] + p.second) % vnunits;
              vperm.quick_push ({{p.first, vi}, vl});
            }
        }
      /* Advance to the next group.  */
      for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
        active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "as");
      for (unsigned i = 0; i < vperm.length (); ++i)
        {
          if (i != 0
              && (repeating_p
                  ? multiple_p (i, npatterns)
                  : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
            dump_printf (MSG_NOTE, ",");
          dump_printf (MSG_NOTE, " vops%u[%u][%u]",
                       vperm[i].first.first, vperm[i].first.second,
                       vperm[i].second);
        }
      dump_printf (MSG_NOTE, "\n");
    }

  /* We can only handle two-vector permutes, everything else should
     be lowered on the SLP level.  The following is closely inspired
     by vect_transform_slp_perm_load and is supposed to eventually
     replace it.
     ???  As intermediate step do code-gen in the SLP tree representation
     somehow?  */
  std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
  std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
  unsigned int index = 0;
  poly_uint64 mask_element;
  vec_perm_builder mask;
  mask.new_vector (nunits, npatterns, nelts_per_pattern);
  unsigned int count = mask.encoded_nelts ();
  mask.quick_grow (count);
  vec_perm_indices indices;
  unsigned nperms = 0;
  for (unsigned i = 0; i < vperm.length (); ++i)
    {
      mask_element = vperm[i].second;
      if (first_vec.first == -1U
          || first_vec == vperm[i].first)
        first_vec = vperm[i].first;
      else if (second_vec.first == -1U
               || second_vec == vperm[i].first)
        {
          second_vec = vperm[i].first;
          mask_element += nunits;
        }
      else
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "permutation requires at "
                             "least three vectors\n");
          gcc_assert (!gsi);
          return false;
        }

      mask[index++] = mask_element;

      if (index == count)
        {
          indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits);
          bool identity_p = indices.series_p (0, 1, 0, 1);
          if (!identity_p
              && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                   vect_location,
                                   "unsupported vect permute { ");
                  for (i = 0; i < count; ++i)
                    {
                      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
                      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
                    }
                  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                }
              gcc_assert (!gsi);
              return false;
            }

          if (!identity_p)
            nperms++;
          if (gsi)
            {
              if (second_vec.first == -1U)
                second_vec = first_vec;

              slp_tree
                first_node = SLP_TREE_CHILDREN (node)[first_vec.first],
                second_node = SLP_TREE_CHILDREN (node)[second_vec.first];

              tree mask_vec = NULL_TREE;
              if (!identity_p)
                mask_vec = vect_gen_perm_mask_checked (vectype, indices);

              for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
                {
                  tree first_def
                    = vect_get_slp_vect_def (first_node,
                                             first_vec.second + vi);
                  tree second_def
                    = vect_get_slp_vect_def (second_node,
                                             second_vec.second + vi);
                  vect_add_slp_permutation (vinfo, gsi, node, first_def,
                                            second_def, mask_vec);
                }
            }

          index = 0;
          first_vec = std::make_pair (-1U, -1U);
          second_vec = std::make_pair (-1U, -1U);
        }
    }

  if (!gsi)
    record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);

  return true;
}
/* Vectorize SLP NODE.  */

static void
vect_schedule_slp_node (vec_info *vinfo,
                        slp_tree node, slp_instance instance)
{
  gimple_stmt_iterator si;
  int i;
  slp_tree child;

  /* For existing vectors there's nothing to do.  */
  if (SLP_TREE_VEC_DEFS (node).exists ())
    return;

  gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());

  /* Vectorize externals and constants.  */
  if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    {
      /* ??? vectorizable_shift can end up using a scalar operand which is
         currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
         node in this case.  */
      if (!SLP_TREE_VECTYPE (node))
        return;

      vect_create_constant_vectors (vinfo, node);
      return;
    }

  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);

  gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
  SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "------>vectorizing SLP node starting from: %G",
                     stmt_info->stmt);

  if (STMT_VINFO_DATA_REF (stmt_info)
      && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
    {
      /* Vectorized loads go before the first scalar load to make it
         ready early, vectorized stores go before the last scalar
         stmt which is where all uses are ready.  */
      stmt_vec_info last_stmt_info = NULL;
      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
        last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
      else /* DR_IS_WRITE */
        last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
      si = gsi_for_stmt (last_stmt_info->stmt);
    }
  else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
            || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
            || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
           && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
    {
      /* For PHI node vectorization we do not use the insertion iterator.  */
      si = gsi_none ();
    }
  else
    {
      /* Emit other stmts after the children vectorized defs which is
         earliest possible.  */
      gimple *last_stmt = NULL;
      bool seen_vector_def = false;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
        if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
          {
            /* For fold-left reductions we are retaining the scalar
               reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
               set so the representation isn't perfect.  Resort to the
               last scalar def here.  */
            if (SLP_TREE_VEC_STMTS (child).is_empty ())
              {
                gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
                            == cycle_phi_info_type);
                gphi *phi = as_a <gphi *>
                              (vect_find_last_scalar_stmt_in_slp (child)->stmt);
                if (!last_stmt
                    || vect_stmt_dominates_stmt_p (last_stmt, phi))
                  last_stmt = phi;
              }
            /* We are emitting all vectorized stmts in the same place and
               the last one is the last.
               ???  Unless we have a load permutation applied and that
               figures to re-use an earlier generated load.  */
            unsigned j;
            gimple *vstmt;
            FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
              if (!last_stmt
                  || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
                last_stmt = vstmt;
          }
        else if (!SLP_TREE_VECTYPE (child))
          {
            /* For externals we use unvectorized at all scalar defs.  */
            unsigned j;
            tree def;
            FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
              if (TREE_CODE (def) == SSA_NAME
                  && !SSA_NAME_IS_DEFAULT_DEF (def))
                {
                  gimple *stmt = SSA_NAME_DEF_STMT (def);
                  if (!last_stmt
                      || vect_stmt_dominates_stmt_p (last_stmt, stmt))
                    last_stmt = stmt;
                }
          }
        else
          {
            /* For externals we have to look at all defs since their
               insertion place is decided per vector.  But beware
               of pre-existing vectors where we need to make sure
               we do not insert before the region boundary.  */
            if (SLP_TREE_SCALAR_OPS (child).is_empty ()
                && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
              seen_vector_def = true;
            else
              {
                unsigned j;
                tree vdef;
                FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
                  if (TREE_CODE (vdef) == SSA_NAME
                      && !SSA_NAME_IS_DEFAULT_DEF (vdef))
                    {
                      gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
                      if (!last_stmt
                          || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
                        last_stmt = vstmt;
                    }
              }
          }
      /* This can happen when all children are pre-existing vectors or
         constants.  */
      if (!last_stmt && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
        last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
      if (!last_stmt)
        {
          gcc_assert (seen_vector_def);
          si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
        }
      else if (is_a <gphi *> (last_stmt))
        si = gsi_after_labels (gimple_bb (last_stmt));
      else
        {
          si = gsi_for_stmt (last_stmt);
          gsi_next (&si);
        }
    }

  bool done_p = false;

  /* Handle purely internal nodes.  */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    {
      /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
         be shared with different SLP nodes (but usually it's the same
         operation apart from the case the stmt is only there for denoting
         the actual scalar lane defs ...).  So do not call vect_transform_stmt
         but open-code it here (partly).  */
      bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
      gcc_assert (done);
      done_p = true;
    }
  if (!done_p)
    vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
}
/* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   For loop vectorization this is done in vectorizable_call, but for SLP
   it needs to be deferred until end of vect_schedule_slp, because multiple
   SLP instances may refer to the same scalar stmt.  */

static void
vect_remove_slp_scalar_calls (vec_info *vinfo,
                              slp_tree node, hash_set<slp_tree> &visited)
{
  gimple *new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (vinfo, child, visited);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
      if (!stmt || gimple_bb (stmt) == NULL)
        continue;
      if (is_pattern_stmt_p (stmt_info)
          || !PURE_SLP_STMT (stmt_info))
        continue;
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      gsi = gsi_for_stmt (stmt);
      vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}

static void
vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_remove_slp_scalar_calls (vinfo, node, visited);
}
/* Vectorize the instance root.  */

static void
vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
{
  gassign *rstmt = NULL;

  if (instance->kind == slp_inst_kind_ctor)
    {
      if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
        {
          gimple *child_stmt;
          int j;

          FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
            {
              tree vect_lhs = gimple_get_lhs (child_stmt);
              tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
              if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
                                              TREE_TYPE (vect_lhs)))
                vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
                                   vect_lhs);
              rstmt = gimple_build_assign (root_lhs, vect_lhs);
              break;
            }
        }
      else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
        {
          int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
          gimple *child_stmt;
          int j;
          vec<constructor_elt, va_gc> *v;
          vec_alloc (v, nelts);

          FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
            CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
                                    gimple_get_lhs (child_stmt));
          tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
          tree rtype
            = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
          tree r_constructor = build_constructor (rtype, v);
          rstmt = gimple_build_assign (lhs, r_constructor);
        }
    }
  else if (instance->kind == slp_inst_kind_bb_reduc)
    {
      /* Largely inspired by reduction chain epilogue handling in
         vect_create_epilog_for_reduction.  */
      vec<tree> vec_defs = vNULL;
      vect_get_slp_defs (node, &vec_defs);
      enum tree_code reduc_code
        = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
      /* ??? We actually have to reflect signs somewhere.  */
      if (reduc_code == MINUS_EXPR)
        reduc_code = PLUS_EXPR;
      gimple_seq epilogue = NULL;
      /* We may end up with more than one vector result, reduce them
         to one vector.  */
      tree vec_def = vec_defs[0];
      for (unsigned i = 1; i < vec_defs.length (); ++i)
        vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
                                vec_def, vec_defs[i]);
      vec_defs.release ();
      /* ??? Support other schemes than direct internal fn.  */
      internal_fn reduc_fn;
      if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
          || reduc_fn == IFN_LAST)
        gcc_unreachable ();
      tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
                                      TREE_TYPE (TREE_TYPE (vec_def)),
                                      vec_def);

      gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
      gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
      gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
      update_stmt (gsi_stmt (rgsi));
      return;
    }
  else
    gcc_unreachable ();

  gcc_assert (rstmt);

  gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
  gsi_replace (&rgsi, rstmt, true);
}
/* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.  */

static void
vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
                   hash_map<slp_tree, slp_scc_info> &scc_info,
                   int &maxdfs, vec<slp_tree> &stack)
{
  bool existed_p;
  slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
  gcc_assert (!existed_p);
  info->dfs = maxdfs;
  info->lowlink = maxdfs;
  maxdfs++;

  /* Leaf.  */
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    {
      info->on_stack = false;
      vect_schedule_slp_node (vinfo, node, instance);
      return;
    }

  info->on_stack = true;
  stack.safe_push (node);

  unsigned i;
  slp_tree child;
  /* DFS recurse.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (!child)
        continue;
      slp_scc_info *child_info = scc_info.get (child);
      if (!child_info)
        {
          vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
          /* Recursion might have re-allocated the node.  */
          info = scc_info.get (node);
          child_info = scc_info.get (child);
          info->lowlink = MIN (info->lowlink, child_info->lowlink);
        }
      else if (child_info->on_stack)
        info->lowlink = MIN (info->lowlink, child_info->dfs);
    }
  if (info->lowlink != info->dfs)
    return;

  auto_vec<slp_tree, 4> phis_to_fixup;

  /* Singleton.  */
  if (stack.last () == node)
    {
      stack.pop ();
      info->on_stack = false;
      vect_schedule_slp_node (vinfo, node, instance);
      if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
          && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
        phis_to_fixup.quick_push (node);
    }
  else
    {
      /* SCC.  */
      int last_idx = stack.length () - 1;
      while (stack[last_idx] != node)
        last_idx--;
      /* We can break the cycle at PHIs who have at least one child
         code generated.  Then we could re-start the DFS walk until
         all nodes in the SCC are covered (we might have new entries
         for only back-reachable nodes).  But it's simpler to just
         iterate and schedule those that are ready.  */
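      /* E.g. (illustrative): for a vectorized induction the SCC is
         PHI -> increment -> PHI.  The PHI counts as ready even though
         the increment is still on the stack, so it is emitted first
         without its backedge argument; that makes the increment ready,
         and the missing backedge def is filled in by the PHI fixup
         below.  */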
      unsigned todo = stack.length () - last_idx;
      do
        {
          for (int idx = stack.length () - 1; idx >= last_idx; --idx)
            {
              slp_tree entry = stack[idx];
              if (!entry)
                continue;
              bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
                          && is_a <gphi *>
                               (SLP_TREE_REPRESENTATIVE (entry)->stmt));
              bool ready = !phi;
              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
                if (!child)
                  {
                    gcc_assert (phi);
                    ready = true;
                    break;
                  }
                else if (scc_info.get (child)->on_stack)
                  {
                    if (!phi)
                      {
                        ready = false;
                        break;
                      }
                  }
                else
                  {
                    if (phi)
                      {
                        ready = true;
                        break;
                      }
                  }
              if (ready)
                {
                  vect_schedule_slp_node (vinfo, entry, instance);
                  scc_info.get (entry)->on_stack = false;
                  stack[idx] = NULL;
                  todo--;
                  if (phi)
                    phis_to_fixup.safe_push (entry);
                }
            }
        }
      while (todo != 0);

      /* Pop the SCC.  */
      stack.truncate (last_idx);
    }

  /* Now fixup the backedge def of the vectorized PHIs in this SCC.  */
  slp_tree phi_node;
  FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
    {
      gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
      edge_iterator ei;
      edge e;
      FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
        {
          unsigned dest_idx = e->dest_idx;
          child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
          if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
            continue;
          /* Simply fill all args.  */
          for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length ();
               ++i)
            add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
                         vect_get_slp_vect_def (child, i),
                         e, gimple_phi_arg_location (phi, dest_idx));
        }
    }
}
/* Generate vector code for SLP_INSTANCES in the loop/basic block.  */

void
vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
{
  slp_instance instance;
  unsigned int i;

  hash_map<slp_tree, slp_scc_info> scc_info;
  int maxdfs = 0;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree node = SLP_INSTANCE_TREE (instance);
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Vectorizing SLP tree:\n");
          if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
            dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
                             SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
          vect_print_slp_graph (MSG_NOTE, vect_location,
                                SLP_INSTANCE_TREE (instance));
        }
      /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
         have a PHI be the node breaking the cycle.  */
      auto_vec<slp_tree> stack;
      if (!scc_info.get (node))
        vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);

      if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
        vectorize_slp_instance_root_stmt (node, instance);

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      stmt_vec_info store_info;
      unsigned int j;

      /* Remove scalar call stmts.  Do not do this for basic-block
         vectorization as not all uses may be vectorized.
         ???  Why should this be necessary?  DCE should be able to
         remove the stmts itself.
         ???  For BB vectorization we can as well remove scalar
         stmts starting from the SLP tree root if they have no
         uses.  */
      if (is_a <loop_vec_info> (vinfo))
        vect_remove_slp_scalar_calls (vinfo, root);

      /* Remove the original scalar stmts of vectorized stores.  */
      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
        {
          if (!STMT_VINFO_DATA_REF (store_info)
              || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
            break;

          store_info = vect_orig_stmt (store_info);
          /* Free the attached stmt_vec_info and remove the stmt.  */
          vinfo->remove_stmt (store_info);

          /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
             to not crash in vect_free_slp_tree later.  */
          if (SLP_TREE_REPRESENTATIVE (root) == store_info)
            SLP_TREE_REPRESENTATIVE (root) = NULL;