1 /* Cost model implementation for RISC-V 'V' Extension for GNU compiler.
2 Copyright (C) 2023-2023 Free Software Foundation, Inc.
3 Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "target.h"
#include "function.h"
#include "tree.h"
#include "basic-block.h"
#include "rtl.h"
#include "gimple.h"
#include "targhooks.h"
#include "cfgloop.h"
#include "fold-const.h"
#include "tm_p.h"
#include "tree-vectorizer.h"
#include "gimple-iterator.h"
#include "bitmap.h"
#include "ssa.h"
#include "backend.h"
#include "tree-data-ref.h"
#include "tree-ssa-loop-niter.h"

/* This file should be included last.  */
#include "riscv-vector-costs.h"
47 namespace riscv_vector
{
49 /* Dynamic LMUL philosophy - Local linear-scan SSA live range based analysis
52 - Collect all vectorize STMTs locally for each loop block.
53 - Build program point based graph, ignore non-vectorize STMTs:
55 vectorize STMT 0 - point 0
56 scalar STMT 0 - ignore.
57 vectorize STMT 1 - point 1
59 - Compute the number of live V_REGs live at each program point
60 - Determine LMUL in VECTOR COST model according to the program point
61 which has maximum live V_REGs.
65 - BIGGEST_MODE is the biggest LMUL auto-vectorization element mode.
66 It's important for mixed size auto-vectorization (Conversions, ... etc).
67 E.g. For a loop that is vectorizing conversion of INT32 -> INT64.
68 The biggest mode is DImode and LMUL = 8, LMUL = 4 for SImode.
69 We compute the number live V_REGs at each program point according to
71 - We only compute program points and live ranges locally (within a block)
72 since we just need to compute the number of live V_REGs at each program
73 point and we are not really allocating the registers for each SSA.
74 We can make the variable has another local live range in another block
if it live out/live in to another block.  Such approach doesn't affect
our accurate live range analysis.
77 - Current analysis didn't consider any instruction scheduling which
78 may improve the register pressure. So we are conservatively doing the
79 analysis which may end up with smaller LMUL.
80 TODO: Maybe we could support a reasonable live range shrink algorithm
81 which take advantage of instruction scheduling.
82 - We may have these following possible autovec modes analysis:
84 1. M8 -> M4 -> M2 -> M1 (stop analysis here) -> MF2 -> MF4 -> MF8
85 2. M8 -> M1(M4) -> MF2(M2) -> MF4(M1) (stop analysis here) -> MF8(MF2)
86 3. M1(M8) -> MF2(M4) -> MF4(M2) -> MF8(M1)
88 static hash_map
<class loop
*, autovec_info
> loop_autovec_infos
;
90 /* Collect all STMTs that are vectorized and compute their program points.
91 Note that we don't care about the STMTs that are not vectorized and
92 we only build the local graph (within a block) of program points.
96 STMT 1 (be vectorized) -- point 0
97 STMT 2 (not be vectorized) -- ignored
98 STMT 3 (be vectorized) -- point 1
99 STMT 4 (be vectorized) -- point 2
100 STMT 5 (be vectorized) -- point 3
103 STMT 1 (be vectorized) -- point 0
104 STMT 2 (be vectorized) -- point 1
105 STMT 3 (not be vectorized) -- ignored
106 STMT 4 (not be vectorized) -- ignored
107 STMT 5 (be vectorized) -- point 2
111 compute_local_program_points (
113 hash_map
<basic_block
, vec
<stmt_point
>> &program_points_per_bb
)
115 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
117 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
118 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
119 unsigned int nbbs
= loop
->num_nodes
;
120 gimple_stmt_iterator si
;
122 /* Collect the stmts that is vectorized and mark their program point. */
123 for (i
= 0; i
< nbbs
; i
++)
126 basic_block bb
= bbs
[i
];
127 vec
<stmt_point
> program_points
= vNULL
;
128 if (dump_enabled_p ())
129 dump_printf_loc (MSG_NOTE
, vect_location
,
130 "Compute local program points for bb %d:\n",
132 for (si
= gsi_start_bb (bbs
[i
]); !gsi_end_p (si
); gsi_next (&si
))
134 if (!(is_gimple_assign (gsi_stmt (si
))
135 || is_gimple_call (gsi_stmt (si
))))
137 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (gsi_stmt (si
));
138 if (STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info
))
139 != undef_vec_info_type
)
141 stmt_point info
= {point
, gsi_stmt (si
)};
142 program_points
.safe_push (info
);
144 if (dump_enabled_p ())
145 dump_printf_loc (MSG_NOTE
, vect_location
,
146 "program point %d: %G", info
.point
,
150 program_points_per_bb
.put (bb
, program_points
);
155 /* Compute local live ranges of each vectorized variable.
156 Note that we only compute local live ranges (within a block) since
157 local live ranges information is accurate enough for us to determine
158 the LMUL/vectorization factor of the loop.
163 STMT 2 (def SSA 1) -- point 1
164 STMT 3 (use SSA 1) -- point 2
170 STMT 4 (use SSA 2) -- point 3
172 The live range of SSA 1 is [1, 3] in bb 2.
173 The live range of SSA 2 is [0, 4] in bb 3. */
175 compute_local_live_ranges (
176 const hash_map
<basic_block
, vec
<stmt_point
>> &program_points_per_bb
,
177 hash_map
<basic_block
, hash_map
<tree
, pair
>> &live_ranges_per_bb
)
179 machine_mode biggest_mode
= QImode
;
180 if (!program_points_per_bb
.is_empty ())
182 auto_vec
<tree
> visited_vars
;
184 for (hash_map
<basic_block
, vec
<stmt_point
>>::iterator iter
185 = program_points_per_bb
.begin ();
186 iter
!= program_points_per_bb
.end (); ++iter
)
188 basic_block bb
= (*iter
).first
;
189 vec
<stmt_point
> program_points
= (*iter
).second
;
190 bool existed_p
= false;
191 hash_map
<tree
, pair
> *live_ranges
192 = &live_ranges_per_bb
.get_or_insert (bb
, &existed_p
);
193 gcc_assert (!existed_p
);
194 if (dump_enabled_p ())
195 dump_printf_loc (MSG_NOTE
, vect_location
,
196 "Compute local live ranges for bb %d:\n",
198 for (const auto program_point
: program_points
)
200 unsigned int point
= program_point
.point
;
201 gimple
*stmt
= program_point
.stmt
;
202 machine_mode mode
= biggest_mode
;
203 tree lhs
= gimple_get_lhs (stmt
);
204 if (lhs
!= NULL_TREE
&& is_gimple_reg (lhs
)
205 && !POINTER_TYPE_P (TREE_TYPE (lhs
)))
207 mode
= TYPE_MODE (TREE_TYPE (lhs
));
208 bool existed_p
= false;
210 = live_ranges
->get_or_insert (lhs
, &existed_p
);
211 gcc_assert (!existed_p
);
212 live_range
= pair (point
, point
);
214 for (i
= 0; i
< gimple_num_args (stmt
); i
++)
216 tree var
= gimple_arg (stmt
, i
);
217 /* Both IMM and REG are included since a VECTOR_CST may be
218 potentially held in a vector register. However, it's not
219 accurate, since a PLUS_EXPR can be vectorized into vadd.vi
222 TODO: We may elide the cases that the unnecessary IMM in
224 if (is_gimple_val (var
) && !POINTER_TYPE_P (TREE_TYPE (var
)))
226 mode
= TYPE_MODE (TREE_TYPE (var
));
227 bool existed_p
= false;
229 = live_ranges
->get_or_insert (var
, &existed_p
);
231 /* We will grow the live range for each use. */
232 live_range
= pair (live_range
.first
, point
);
234 /* We assume the variable is live from the start of
236 live_range
= pair (0, point
);
239 if (GET_MODE_SIZE (mode
).to_constant ()
240 > GET_MODE_SIZE (biggest_mode
).to_constant ())
243 if (dump_enabled_p ())
244 for (hash_map
<tree
, pair
>::iterator iter
= live_ranges
->begin ();
245 iter
!= live_ranges
->end (); ++iter
)
246 dump_printf_loc (MSG_NOTE
, vect_location
,
247 "%T: type = %T, start = %d, end = %d\n",
248 (*iter
).first
, TREE_TYPE ((*iter
).first
),
249 (*iter
).second
.first
, (*iter
).second
.second
);
252 if (dump_enabled_p ())
253 dump_printf_loc (MSG_NOTE
, vect_location
, "Biggest mode = %s\n",
254 GET_MODE_NAME (biggest_mode
));
258 /* Compute the mode for MODE, BIGGEST_MODE and LMUL.
260 E.g. If mode = SImode, biggest_mode = DImode, LMUL = M4.
261 Then return RVVM4SImode (LMUL = 4, element mode = SImode). */
263 compute_nregs_for_mode (machine_mode mode
, machine_mode biggest_mode
, int lmul
)
265 unsigned int mode_size
= GET_MODE_SIZE (mode
).to_constant ();
266 unsigned int biggest_size
= GET_MODE_SIZE (biggest_mode
).to_constant ();
267 gcc_assert (biggest_size
>= mode_size
);
268 unsigned int ratio
= biggest_size
/ mode_size
;
272 /* This function helps to determine whether current LMUL will cause
273 potential vector register (V_REG) spillings according to live range
276 - First, compute how many variable are alive of each program point
277 in each bb of the loop.
278 - Second, compute how many V_REGs are alive of each program point
279 in each bb of the loop according the BIGGEST_MODE and the variable
281 - Third, Return the maximum V_REGs are alive of the loop. */
283 max_number_of_live_regs (const basic_block bb
,
284 const hash_map
<tree
, pair
> &live_ranges
,
285 unsigned int max_point
, machine_mode biggest_mode
,
288 unsigned int max_nregs
= 0;
290 unsigned int live_point
= 0;
291 auto_vec
<unsigned int> live_vars_vec
;
292 live_vars_vec
.safe_grow (max_point
+ 1, true);
293 for (i
= 0; i
< live_vars_vec
.length (); ++i
)
294 live_vars_vec
[i
] = 0;
295 for (hash_map
<tree
, pair
>::iterator iter
= live_ranges
.begin ();
296 iter
!= live_ranges
.end (); ++iter
)
298 tree var
= (*iter
).first
;
299 pair live_range
= (*iter
).second
;
300 for (i
= live_range
.first
; i
<= live_range
.second
; i
++)
302 machine_mode mode
= TYPE_MODE (TREE_TYPE (var
));
304 = compute_nregs_for_mode (mode
, biggest_mode
, lmul
);
305 live_vars_vec
[i
] += nregs
;
306 if (live_vars_vec
[i
] > max_nregs
)
307 max_nregs
= live_vars_vec
[i
];
311 /* Collect user explicit RVV type. */
312 auto_vec
<basic_block
> all_preds
313 = get_all_dominated_blocks (CDI_POST_DOMINATORS
, bb
);
315 FOR_EACH_SSA_NAME (i
, t
, cfun
)
317 machine_mode mode
= TYPE_MODE (TREE_TYPE (t
));
318 if (!lookup_vector_type_attribute (TREE_TYPE (t
))
319 && !riscv_v_ext_vls_mode_p (mode
))
322 gimple
*def
= SSA_NAME_DEF_STMT (t
);
323 if (gimple_bb (def
) && !all_preds
.contains (gimple_bb (def
)))
326 imm_use_iterator iterator
;
328 FOR_EACH_IMM_USE_FAST (use_p
, iterator
, t
)
330 if (!USE_STMT (use_p
) || is_gimple_debug (USE_STMT (use_p
))
331 || !dominated_by_p (CDI_POST_DOMINATORS
, bb
,
332 gimple_bb (USE_STMT (use_p
))))
335 int regno_alignment
= riscv_get_v_regno_alignment (mode
);
336 max_nregs
+= regno_alignment
;
337 if (dump_enabled_p ())
339 MSG_NOTE
, vect_location
,
340 "Explicit used SSA %T, vectype = %T, mode = %s, cause %d "
341 "V_REG live in bb %d at program point %d\n",
342 t
, TREE_TYPE (t
), GET_MODE_NAME (mode
), regno_alignment
,
343 bb
->index
, live_point
);
348 if (dump_enabled_p ())
349 dump_printf_loc (MSG_NOTE
, vect_location
,
350 "Maximum lmul = %d, %d number of live V_REG at program "
351 "point %d for bb %d\n",
352 lmul
, max_nregs
, live_point
, bb
->index
);
356 /* Return the LMUL of the current analysis. */
358 get_current_lmul (class loop
*loop
)
360 return loop_autovec_infos
.get (loop
)->current_lmul
;
363 /* Update the live ranges according PHI.
368 STMT 2 (def SSA 1) -- point 1
369 STMT 3 (use SSA 1) -- point 2
375 STMT 3 (use SSA 2) -- point 2
378 Before this function, the SSA 1 live range is [2, 3] in bb 2
379 and SSA 2 is [0, 3] in bb 3.
381 Then, after this function, we update SSA 1 live range in bb 2
382 into [2, 4] since SSA 1 is live out into bb 3. */
384 update_local_live_ranges (
386 hash_map
<basic_block
, vec
<stmt_point
>> &program_points_per_bb
,
387 hash_map
<basic_block
, hash_map
<tree
, pair
>> &live_ranges_per_bb
)
389 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
393 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
394 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
395 unsigned int nbbs
= loop
->num_nodes
;
398 for (i
= 0; i
< nbbs
; i
++)
400 basic_block bb
= bbs
[i
];
401 if (dump_enabled_p ())
402 dump_printf_loc (MSG_NOTE
, vect_location
,
403 "Update local program points for bb %d:\n", bb
->index
);
404 for (psi
= gsi_start_phis (bbs
[i
]); !gsi_end_p (psi
); gsi_next (&psi
))
406 gphi
*phi
= psi
.phi ();
407 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (phi
);
408 if (STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info
))
409 == undef_vec_info_type
)
412 for (j
= 0; j
< gimple_phi_num_args (phi
); j
++)
414 edge e
= gimple_phi_arg_edge (phi
, j
);
415 tree def
= gimple_phi_arg_def (phi
, j
);
416 auto *live_ranges
= live_ranges_per_bb
.get (e
->src
);
417 if (!program_points_per_bb
.get (e
->src
))
419 unsigned int max_point
420 = (*program_points_per_bb
.get (e
->src
)).length () - 1;
421 auto *live_range
= live_ranges
->get (def
);
425 unsigned int end
= (*live_range
).second
;
426 (*live_range
).second
= max_point
;
427 if (dump_enabled_p ())
428 dump_printf_loc (MSG_NOTE
, vect_location
,
429 "Update %T end point from %d to %d:\n", def
,
430 end
, (*live_range
).second
);
436 costs::costs (vec_info
*vinfo
, bool costing_for_scalar
)
437 : vector_costs (vinfo
, costing_for_scalar
)
440 /* Return true that the LMUL of new COST model is preferred. */
442 costs::preferred_new_lmul_p (const vector_costs
*uncast_other
) const
444 auto other
= static_cast<const costs
*> (uncast_other
);
445 auto this_loop_vinfo
= as_a
<loop_vec_info
> (this->m_vinfo
);
446 auto other_loop_vinfo
= as_a
<loop_vec_info
> (other
->m_vinfo
);
447 class loop
*loop
= LOOP_VINFO_LOOP (this_loop_vinfo
);
449 if (loop_autovec_infos
.get (loop
) && loop_autovec_infos
.get (loop
)->end_p
)
451 else if (loop_autovec_infos
.get (loop
))
452 loop_autovec_infos
.get (loop
)->current_lmul
453 = loop_autovec_infos
.get (loop
)->current_lmul
/ 2;
457 = riscv_get_v_regno_alignment (other_loop_vinfo
->vector_mode
);
458 if (known_eq (LOOP_VINFO_SLP_UNROLLING_FACTOR (other_loop_vinfo
), 1U))
459 regno_alignment
= RVV_M8
;
460 loop_autovec_infos
.put (loop
, {regno_alignment
, regno_alignment
, false});
463 int lmul
= get_current_lmul (loop
);
464 if (dump_enabled_p ())
465 dump_printf_loc (MSG_NOTE
, vect_location
,
466 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
467 GET_MODE_NAME (this_loop_vinfo
->vector_mode
),
468 vect_vf_for_cost (this_loop_vinfo
),
469 GET_MODE_NAME (other_loop_vinfo
->vector_mode
),
470 vect_vf_for_cost (other_loop_vinfo
));
472 /* Compute local program points.
473 It's a fast and effective computation. */
474 hash_map
<basic_block
, vec
<stmt_point
>> program_points_per_bb
;
475 compute_local_program_points (other
->m_vinfo
, program_points_per_bb
);
477 /* Compute local live ranges. */
478 hash_map
<basic_block
, hash_map
<tree
, pair
>> live_ranges_per_bb
;
479 machine_mode biggest_mode
480 = compute_local_live_ranges (program_points_per_bb
, live_ranges_per_bb
);
482 /* If we can use simple VLS modes to handle NITERS element.
483 We don't need to use VLA modes with partial vector auto-vectorization. */
484 if (LOOP_VINFO_NITERS_KNOWN_P (this_loop_vinfo
)
485 && known_le (tree_to_poly_int64 (LOOP_VINFO_NITERS (this_loop_vinfo
))
486 * GET_MODE_SIZE (biggest_mode
).to_constant (),
487 (int) RVV_M8
* BYTES_PER_RISCV_VECTOR
)
488 && pow2p_hwi (LOOP_VINFO_INT_NITERS (this_loop_vinfo
)))
489 return vector_costs::better_main_loop_than_p (other
);
491 /* Update live ranges according to PHI. */
492 update_local_live_ranges (other
->m_vinfo
, program_points_per_bb
,
495 /* TODO: We calculate the maximum live vars base on current STMTS
496 sequence. We can support live range shrink if it can give us
497 big improvement in the future. */
498 if (!live_ranges_per_bb
.is_empty ())
500 unsigned int max_nregs
= 0;
501 for (hash_map
<basic_block
, hash_map
<tree
, pair
>>::iterator iter
502 = live_ranges_per_bb
.begin ();
503 iter
!= live_ranges_per_bb
.end (); ++iter
)
505 basic_block bb
= (*iter
).first
;
506 unsigned int max_point
507 = (*program_points_per_bb
.get (bb
)).length () - 1;
508 if ((*iter
).second
.is_empty ())
510 /* We prefer larger LMUL unless it causes register spillings. */
512 = max_number_of_live_regs (bb
, (*iter
).second
, max_point
,
514 if (nregs
> max_nregs
)
516 live_ranges_per_bb
.empty ();
518 live_ranges_per_bb
.empty ();
519 if (loop_autovec_infos
.get (loop
)->current_lmul
== RVV_M1
520 || max_nregs
<= V_REG_NUM
)
521 loop_autovec_infos
.get (loop
)->end_p
= true;
522 if (loop_autovec_infos
.get (loop
)->current_lmul
> RVV_M1
)
523 return max_nregs
> V_REG_NUM
;
526 if (!program_points_per_bb
.is_empty ())
528 for (hash_map
<basic_block
, vec
<stmt_point
>>::iterator iter
529 = program_points_per_bb
.begin ();
530 iter
!= program_points_per_bb
.end (); ++iter
)
532 vec
<stmt_point
> program_points
= (*iter
).second
;
533 if (!program_points
.is_empty ())
534 program_points
.release ();
536 program_points_per_bb
.empty ();
538 return lmul
> RVV_M1
;
542 costs::better_main_loop_than_p (const vector_costs
*uncast_other
) const
544 auto other
= static_cast<const costs
*> (uncast_other
);
546 if (!flag_vect_cost_model
)
547 return vector_costs::better_main_loop_than_p (other
);
549 if (riscv_autovec_lmul
== RVV_DYNAMIC
)
551 bool post_dom_available_p
= dom_info_available_p (CDI_POST_DOMINATORS
);
552 if (!post_dom_available_p
)
553 calculate_dominance_info (CDI_POST_DOMINATORS
);
554 bool preferred_p
= preferred_new_lmul_p (uncast_other
);
555 if (!post_dom_available_p
)
556 free_dominance_info (CDI_POST_DOMINATORS
);
560 return vector_costs::better_main_loop_than_p (other
);
564 costs::add_stmt_cost (int count
, vect_cost_for_stmt kind
,
565 stmt_vec_info stmt_info
, slp_tree
, tree vectype
,
566 int misalign
, vect_cost_model_location where
)
568 /* TODO: Use default STMT cost model.
569 We will support more accurate STMT cost model later. */
570 int stmt_cost
= default_builtin_vectorization_cost (kind
, vectype
, misalign
);
571 return record_stmt_cost (stmt_info
, where
, count
* stmt_cost
);
575 costs::finish_cost (const vector_costs
*scalar_costs
)
577 vector_costs::finish_cost (scalar_costs
);
580 } // namespace riscv_vector