1 /* Cost model implementation for RISC-V 'V' Extension for GNU compiler.
2 Copyright (C) 2023-2023 Free Software Foundation, Inc.
3 Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #define INCLUDE_STRING
26 #include "coretypes.h"
31 #include "basic-block.h"
34 #include "targhooks.h"
36 #include "fold-const.h"
38 #include "tree-vectorizer.h"
39 #include "gimple-iterator.h"
43 #include "tree-data-ref.h"
44 #include "tree-ssa-loop-niter.h"
46 /* This file should be included last. */
47 #include "riscv-vector-costs.h"
49 namespace riscv_vector
{
51 /* Dynamic LMUL philosophy - Local linear-scan SSA live range based analysis
54 - Collect all vectorize STMTs locally for each loop block.
55 - Build program point based graph, ignore non-vectorize STMTs:
57 vectorize STMT 0 - point 0
58 scalar STMT 0 - ignore.
59 vectorize STMT 1 - point 1
61 - Compute the number of live V_REGs live at each program point
62 - Determine LMUL in VECTOR COST model according to the program point
63 which has maximum live V_REGs.
67 - BIGGEST_MODE is the biggest LMUL auto-vectorization element mode.
68 It's important for mixed size auto-vectorization (Conversions, ... etc).
69 E.g. For a loop that is vectorizing conversion of INT32 -> INT64.
70 The biggest mode is DImode and LMUL = 8, LMUL = 4 for SImode.
71 We compute the number live V_REGs at each program point according to
73 - We only compute program points and live ranges locally (within a block)
74 since we just need to compute the number of live V_REGs at each program
75 point and we are not really allocating the registers for each SSA.
76 We can make the variable has another local live range in another block
77 if it live out/live in to another block. Such approach doesn't affect
78 our accurate live range analysis.
79 - Current analysis didn't consider any instruction scheduling which
80 may improve the register pressure. So we are conservatively doing the
81 analysis which may end up with smaller LMUL.
82 TODO: Maybe we could support a reasonable live range shrink algorithm
83 which take advantage of instruction scheduling.
84 - We may have these following possible autovec modes analysis:
86 1. M8 -> M4 -> M2 -> M1 (stop analysis here) -> MF2 -> MF4 -> MF8
87 2. M8 -> M1(M4) -> MF2(M2) -> MF4(M1) (stop analysis here) -> MF8(MF2)
88 3. M1(M8) -> MF2(M4) -> MF4(M2) -> MF8(M1)
91 /* Collect all STMTs that are vectorized and compute their program points.
92 Note that we don't care about the STMTs that are not vectorized and
93 we only build the local graph (within a block) of program points.
97 STMT 1 (be vectorized) -- point 0
98 STMT 2 (not be vectorized) -- ignored
99 STMT 3 (be vectorized) -- point 1
100 STMT 4 (be vectorized) -- point 2
101 STMT 5 (be vectorized) -- point 3
104 STMT 1 (be vectorized) -- point 0
105 STMT 2 (be vectorized) -- point 1
106 STMT 3 (not be vectorized) -- ignored
107 STMT 4 (not be vectorized) -- ignored
108 STMT 5 (be vectorized) -- point 2
112 compute_local_program_points (
114 hash_map
<basic_block
, vec
<stmt_point
>> &program_points_per_bb
)
116 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
118 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
119 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
120 unsigned int nbbs
= loop
->num_nodes
;
121 gimple_stmt_iterator si
;
123 /* Collect the stmts that is vectorized and mark their program point. */
124 for (i
= 0; i
< nbbs
; i
++)
127 basic_block bb
= bbs
[i
];
128 vec
<stmt_point
> program_points
= vNULL
;
129 if (dump_enabled_p ())
130 dump_printf_loc (MSG_NOTE
, vect_location
,
131 "Compute local program points for bb %d:\n",
133 for (si
= gsi_start_bb (bbs
[i
]); !gsi_end_p (si
); gsi_next (&si
))
135 if (!(is_gimple_assign (gsi_stmt (si
))
136 || is_gimple_call (gsi_stmt (si
))))
138 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (gsi_stmt (si
));
139 enum stmt_vec_info_type type
140 = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info
));
141 if (type
!= undef_vec_info_type
)
143 stmt_point info
= {point
, gsi_stmt (si
)};
144 program_points
.safe_push (info
);
146 if (dump_enabled_p ())
147 dump_printf_loc (MSG_NOTE
, vect_location
,
148 "program point %d: %G", info
.point
,
152 program_points_per_bb
.put (bb
, program_points
);
158 get_biggest_mode (machine_mode mode1
, machine_mode mode2
)
160 unsigned int mode1_size
= GET_MODE_BITSIZE (mode1
).to_constant ();
161 unsigned int mode2_size
= GET_MODE_BITSIZE (mode2
).to_constant ();
162 return mode1_size
>= mode2_size
? mode1
: mode2
;
165 /* Compute local live ranges of each vectorized variable.
166 Note that we only compute local live ranges (within a block) since
167 local live ranges information is accurate enough for us to determine
168 the LMUL/vectorization factor of the loop.
173 STMT 2 (def SSA 1) -- point 1
174 STMT 3 (use SSA 1) -- point 2
180 STMT 4 (use SSA 2) -- point 3
182 The live range of SSA 1 is [1, 3] in bb 2.
183 The live range of SSA 2 is [0, 4] in bb 3. */
185 compute_local_live_ranges (
186 const hash_map
<basic_block
, vec
<stmt_point
>> &program_points_per_bb
,
187 hash_map
<basic_block
, hash_map
<tree
, pair
>> &live_ranges_per_bb
)
189 machine_mode biggest_mode
= QImode
;
190 if (!program_points_per_bb
.is_empty ())
192 auto_vec
<tree
> visited_vars
;
194 for (hash_map
<basic_block
, vec
<stmt_point
>>::iterator iter
195 = program_points_per_bb
.begin ();
196 iter
!= program_points_per_bb
.end (); ++iter
)
198 basic_block bb
= (*iter
).first
;
199 vec
<stmt_point
> program_points
= (*iter
).second
;
200 bool existed_p
= false;
201 hash_map
<tree
, pair
> *live_ranges
202 = &live_ranges_per_bb
.get_or_insert (bb
, &existed_p
);
203 gcc_assert (!existed_p
);
204 if (dump_enabled_p ())
205 dump_printf_loc (MSG_NOTE
, vect_location
,
206 "Compute local live ranges for bb %d:\n",
208 for (const auto program_point
: program_points
)
210 unsigned int point
= program_point
.point
;
211 gimple
*stmt
= program_point
.stmt
;
212 tree lhs
= gimple_get_lhs (stmt
);
213 if (lhs
!= NULL_TREE
&& is_gimple_reg (lhs
)
214 && !POINTER_TYPE_P (TREE_TYPE (lhs
)))
216 biggest_mode
= get_biggest_mode (biggest_mode
,
217 TYPE_MODE (TREE_TYPE (lhs
)));
218 bool existed_p
= false;
220 = live_ranges
->get_or_insert (lhs
, &existed_p
);
221 gcc_assert (!existed_p
);
222 live_range
= pair (point
, point
);
224 for (i
= 0; i
< gimple_num_args (stmt
); i
++)
226 tree var
= gimple_arg (stmt
, i
);
227 /* Both IMM and REG are included since a VECTOR_CST may be
228 potentially held in a vector register. However, it's not
229 accurate, since a PLUS_EXPR can be vectorized into vadd.vi
232 TODO: We may elide the cases that the unnecessary IMM in
234 if (poly_int_tree_p (var
)
235 || (is_gimple_val (var
)
236 && !POINTER_TYPE_P (TREE_TYPE (var
))))
239 = get_biggest_mode (biggest_mode
,
240 TYPE_MODE (TREE_TYPE (var
)));
241 bool existed_p
= false;
243 = live_ranges
->get_or_insert (var
, &existed_p
);
245 /* We will grow the live range for each use. */
246 live_range
= pair (live_range
.first
, point
);
248 /* We assume the variable is live from the start of
250 live_range
= pair (0, point
);
254 if (dump_enabled_p ())
255 for (hash_map
<tree
, pair
>::iterator iter
= live_ranges
->begin ();
256 iter
!= live_ranges
->end (); ++iter
)
257 dump_printf_loc (MSG_NOTE
, vect_location
,
258 "%T: type = %T, start = %d, end = %d\n",
259 (*iter
).first
, TREE_TYPE ((*iter
).first
),
260 (*iter
).second
.first
, (*iter
).second
.second
);
263 if (dump_enabled_p ())
264 dump_printf_loc (MSG_NOTE
, vect_location
, "Biggest mode = %s\n",
265 GET_MODE_NAME (biggest_mode
));
269 /* Compute the mode for MODE, BIGGEST_MODE and LMUL.
271 E.g. If mode = SImode, biggest_mode = DImode, LMUL = M4.
272 Then return RVVM4SImode (LMUL = 4, element mode = SImode). */
274 compute_nregs_for_mode (machine_mode mode
, machine_mode biggest_mode
, int lmul
)
276 unsigned int mode_size
= GET_MODE_SIZE (mode
).to_constant ();
277 unsigned int biggest_size
= GET_MODE_SIZE (biggest_mode
).to_constant ();
278 gcc_assert (biggest_size
>= mode_size
);
279 unsigned int ratio
= biggest_size
/ mode_size
;
283 /* This function helps to determine whether current LMUL will cause
284 potential vector register (V_REG) spillings according to live range
287 - First, compute how many variable are alive of each program point
288 in each bb of the loop.
289 - Second, compute how many V_REGs are alive of each program point
290 in each bb of the loop according the BIGGEST_MODE and the variable
292 - Third, Return the maximum V_REGs are alive of the loop. */
294 max_number_of_live_regs (const basic_block bb
,
295 const hash_map
<tree
, pair
> &live_ranges
,
296 unsigned int max_point
, machine_mode biggest_mode
,
299 unsigned int max_nregs
= 0;
301 unsigned int live_point
= 0;
302 auto_vec
<unsigned int> live_vars_vec
;
303 live_vars_vec
.safe_grow_cleared (max_point
, true);
304 for (hash_map
<tree
, pair
>::iterator iter
= live_ranges
.begin ();
305 iter
!= live_ranges
.end (); ++iter
)
307 tree var
= (*iter
).first
;
308 pair live_range
= (*iter
).second
;
309 for (i
= live_range
.first
+ 1; i
<= live_range
.second
; i
++)
311 machine_mode mode
= TYPE_MODE (TREE_TYPE (var
));
313 = compute_nregs_for_mode (mode
, biggest_mode
, lmul
);
314 live_vars_vec
[i
] += nregs
;
315 if (live_vars_vec
[i
] > max_nregs
)
316 max_nregs
= live_vars_vec
[i
];
320 /* Collect user explicit RVV type. */
321 auto_vec
<basic_block
> all_preds
322 = get_all_dominated_blocks (CDI_POST_DOMINATORS
, bb
);
324 FOR_EACH_SSA_NAME (i
, t
, cfun
)
326 machine_mode mode
= TYPE_MODE (TREE_TYPE (t
));
327 if (!lookup_vector_type_attribute (TREE_TYPE (t
))
328 && !riscv_v_ext_vls_mode_p (mode
))
331 gimple
*def
= SSA_NAME_DEF_STMT (t
);
332 if (gimple_bb (def
) && !all_preds
.contains (gimple_bb (def
)))
335 imm_use_iterator iterator
;
337 FOR_EACH_IMM_USE_FAST (use_p
, iterator
, t
)
339 if (!USE_STMT (use_p
) || is_gimple_debug (USE_STMT (use_p
))
340 || !dominated_by_p (CDI_POST_DOMINATORS
, bb
,
341 gimple_bb (USE_STMT (use_p
))))
344 int regno_alignment
= riscv_get_v_regno_alignment (mode
);
345 max_nregs
+= regno_alignment
;
346 if (dump_enabled_p ())
348 MSG_NOTE
, vect_location
,
349 "Explicit used SSA %T, vectype = %T, mode = %s, cause %d "
350 "V_REG live in bb %d at program point %d\n",
351 t
, TREE_TYPE (t
), GET_MODE_NAME (mode
), regno_alignment
,
352 bb
->index
, live_point
);
357 if (dump_enabled_p ())
359 MSG_NOTE
, vect_location
,
360 "Maximum lmul = %d, At most %d number of live V_REG at program "
361 "point %d for bb %d\n",
362 lmul
, max_nregs
, live_point
, bb
->index
);
366 /* Get STORE value. */
368 get_store_value (gimple
*stmt
)
370 if (is_gimple_call (stmt
) && gimple_call_internal_p (stmt
))
372 if (gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
373 return gimple_call_arg (stmt
, 3);
378 return gimple_assign_rhs1 (stmt
);
381 /* Return true if it is non-contiguous load/store. */
383 non_contiguous_memory_access_p (stmt_vec_info stmt_info
)
385 enum stmt_vec_info_type type
386 = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info
));
387 return ((type
== load_vec_info_type
|| type
== store_vec_info_type
)
388 && !adjacent_dr_p (STMT_VINFO_DATA_REF (stmt_info
)));
391 /* Return the LMUL of the current analysis. */
393 compute_estimated_lmul (loop_vec_info loop_vinfo
, machine_mode mode
)
395 gcc_assert (GET_MODE_BITSIZE (mode
).is_constant ());
396 int regno_alignment
= riscv_get_v_regno_alignment (loop_vinfo
->vector_mode
);
397 if (riscv_v_ext_vls_mode_p (loop_vinfo
->vector_mode
))
398 return regno_alignment
;
399 else if (known_eq (LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
), 1U)
400 || LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
).is_constant ())
402 int estimated_vf
= vect_vf_for_cost (loop_vinfo
);
403 return estimated_vf
* GET_MODE_BITSIZE (mode
).to_constant ()
408 /* Estimate the VLA SLP LMUL. */
409 if (regno_alignment
> RVV_M1
)
410 return regno_alignment
;
411 else if (mode
!= QImode
)
414 if (can_div_trunc_p (BYTES_PER_RISCV_VECTOR
,
415 GET_MODE_SIZE (loop_vinfo
->vector_mode
), &ratio
))
427 /* Update the live ranges according PHI.
432 STMT 2 (def SSA 1) -- point 1
433 STMT 3 (use SSA 1) -- point 2
439 STMT 3 (use SSA 2) -- point 2
442 Before this function, the SSA 1 live range is [2, 3] in bb 2
443 and SSA 2 is [0, 3] in bb 3.
445 Then, after this function, we update SSA 1 live range in bb 2
446 into [2, 4] since SSA 1 is live out into bb 3. */
448 update_local_live_ranges (
450 hash_map
<basic_block
, vec
<stmt_point
>> &program_points_per_bb
,
451 hash_map
<basic_block
, hash_map
<tree
, pair
>> &live_ranges_per_bb
,
452 machine_mode
*biggest_mode
)
454 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
458 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
459 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
460 unsigned int nbbs
= loop
->num_nodes
;
463 gimple_stmt_iterator si
;
464 for (i
= 0; i
< nbbs
; i
++)
466 basic_block bb
= bbs
[i
];
467 if (dump_enabled_p ())
468 dump_printf_loc (MSG_NOTE
, vect_location
,
469 "Update local program points for bb %d:\n",
471 for (psi
= gsi_start_phis (bb
); !gsi_end_p (psi
); gsi_next (&psi
))
473 gphi
*phi
= psi
.phi ();
474 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (phi
);
475 if (STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info
))
476 == undef_vec_info_type
)
479 for (j
= 0; j
< gimple_phi_num_args (phi
); j
++)
481 edge e
= gimple_phi_arg_edge (phi
, j
);
482 tree def
= gimple_phi_arg_def (phi
, j
);
483 auto *live_ranges
= live_ranges_per_bb
.get (bb
);
484 auto *live_range
= live_ranges
->get (def
);
485 if (poly_int_tree_p (def
))
487 /* Insert live range of INTEGER_CST or POLY_CST since we will
488 need to allocate a vector register for it.
490 E.g. # j_17 = PHI <j_11(9), 0(5)> will be transformed
491 into # vect_vec_iv_.8_24 = PHI <_25(9), { 0, ... }(5)>
493 The live range for such value is short which only lives
494 from program point 0 to 1. */
497 unsigned int start
= (*live_range
).first
;
498 (*live_range
).first
= 0;
499 if (dump_enabled_p ())
501 MSG_NOTE
, vect_location
,
502 "Update %T start point from %d to 0:\n", def
, start
);
506 live_ranges
->put (def
, pair (0, 1));
507 auto &program_points
= (*program_points_per_bb
.get (bb
));
508 if (program_points
.is_empty ())
510 stmt_point info
= {1, phi
};
511 program_points
.safe_push (info
);
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE
, vect_location
,
515 "Add %T start point from 0 to 1:\n",
520 if (live_range
&& flow_bb_inside_loop_p (loop
, e
->src
))
522 unsigned int start
= (*live_range
).first
;
523 (*live_range
).first
= 0;
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE
, vect_location
,
526 "Update %T start point from %d to %d:\n",
527 def
, start
, (*live_range
).first
);
529 live_ranges
= live_ranges_per_bb
.get (e
->src
);
530 if (!program_points_per_bb
.get (e
->src
))
532 unsigned int max_point
533 = (*program_points_per_bb
.get (e
->src
)).length ();
534 live_range
= live_ranges
->get (def
);
538 unsigned int end
= (*live_range
).second
;
539 (*live_range
).second
= max_point
;
540 if (dump_enabled_p ())
541 dump_printf_loc (MSG_NOTE
, vect_location
,
542 "Update %T end point from %d to %d:\n", def
,
543 end
, (*live_range
).second
);
546 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
548 if (!(is_gimple_assign (gsi_stmt (si
))
549 || is_gimple_call (gsi_stmt (si
))))
551 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (gsi_stmt (si
));
552 enum stmt_vec_info_type type
553 = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info
));
554 if (non_contiguous_memory_access_p (stmt_info
)
555 /* LOAD_LANES/STORE_LANES doesn't need a perm indice. */
556 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info
)
557 != VMAT_LOAD_STORE_LANES
)
559 /* For non-adjacent load/store STMT, we will potentially
562 1. MASK_LEN_GATHER_LOAD (..., perm indice).
563 2. Continguous load/store + VEC_PERM (..., perm indice)
565 We will be likely using one more vector variable. */
566 unsigned int max_point
567 = (*program_points_per_bb
.get (bb
)).length () - 1;
568 auto *live_ranges
= live_ranges_per_bb
.get (bb
);
569 bool existed_p
= false;
570 tree var
= type
== load_vec_info_type
571 ? gimple_get_lhs (gsi_stmt (si
))
572 : get_store_value (gsi_stmt (si
));
573 tree sel_type
= build_nonstandard_integer_type (
574 TYPE_PRECISION (TREE_TYPE (var
)), 1);
576 = get_biggest_mode (*biggest_mode
, TYPE_MODE (sel_type
));
577 tree sel
= build_decl (UNKNOWN_LOCATION
, VAR_DECL
,
578 get_identifier ("vect_perm"), sel_type
);
579 pair
&live_range
= live_ranges
->get_or_insert (sel
, &existed_p
);
580 gcc_assert (!existed_p
);
581 live_range
= pair (0, max_point
);
582 if (dump_enabled_p ())
583 dump_printf_loc (MSG_NOTE
, vect_location
,
584 "Add perm indice %T, start = 0, end = %d\n",
591 /* Compute the maximum live V_REGS. */
593 has_unexpected_spills_p (loop_vec_info loop_vinfo
)
595 /* Compute local program points.
596 It's a fast and effective computation. */
597 hash_map
<basic_block
, vec
<stmt_point
>> program_points_per_bb
;
598 compute_local_program_points (loop_vinfo
, program_points_per_bb
);
600 /* Compute local live ranges. */
601 hash_map
<basic_block
, hash_map
<tree
, pair
>> live_ranges_per_bb
;
602 machine_mode biggest_mode
603 = compute_local_live_ranges (program_points_per_bb
, live_ranges_per_bb
);
605 /* Update live ranges according to PHI. */
606 update_local_live_ranges (loop_vinfo
, program_points_per_bb
,
607 live_ranges_per_bb
, &biggest_mode
);
609 int lmul
= compute_estimated_lmul (loop_vinfo
, biggest_mode
);
610 /* TODO: We calculate the maximum live vars base on current STMTS
611 sequence. We can support live range shrink if it can give us
612 big improvement in the future. */
615 if (!live_ranges_per_bb
.is_empty ())
617 unsigned int max_nregs
= 0;
618 for (hash_map
<basic_block
, hash_map
<tree
, pair
>>::iterator iter
619 = live_ranges_per_bb
.begin ();
620 iter
!= live_ranges_per_bb
.end (); ++iter
)
622 basic_block bb
= (*iter
).first
;
623 unsigned int max_point
624 = (*program_points_per_bb
.get (bb
)).length () + 1;
625 if ((*iter
).second
.is_empty ())
627 /* We prefer larger LMUL unless it causes register spillings. */
629 = max_number_of_live_regs (bb
, (*iter
).second
, max_point
,
631 if (nregs
> max_nregs
)
634 live_ranges_per_bb
.empty ();
635 if (max_nregs
> V_REG_NUM
)
639 if (!program_points_per_bb
.is_empty ())
641 for (hash_map
<basic_block
, vec
<stmt_point
>>::iterator iter
642 = program_points_per_bb
.begin ();
643 iter
!= program_points_per_bb
.end (); ++iter
)
645 vec
<stmt_point
> program_points
= (*iter
).second
;
646 if (!program_points
.is_empty ())
647 program_points
.release ();
649 program_points_per_bb
.empty ();
654 costs::costs (vec_info
*vinfo
, bool costing_for_scalar
)
655 : vector_costs (vinfo
, costing_for_scalar
)
657 if (costing_for_scalar
)
658 m_cost_type
= SCALAR_COST
;
659 else if (riscv_v_ext_vector_mode_p (vinfo
->vector_mode
))
660 m_cost_type
= VLA_VECTOR_COST
;
662 m_cost_type
= VLS_VECTOR_COST
;
665 /* Do one-time initialization of the costs given that we're
666 costing the loop vectorization described by LOOP_VINFO. */
668 costs::analyze_loop_vinfo (loop_vec_info loop_vinfo
)
670 /* Record the number of times that the vector loop would execute,
672 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
673 auto scalar_niters
= max_stmt_executions_int (loop
);
674 if (scalar_niters
>= 0)
676 unsigned int vf
= vect_vf_for_cost (loop_vinfo
);
677 if (LOOP_VINFO_LENS (loop_vinfo
).is_empty ())
678 m_num_vector_iterations
= scalar_niters
/ vf
;
680 m_num_vector_iterations
= CEIL (scalar_niters
, vf
);
683 /* Detect whether we're vectorizing for VLA and should apply the unrolling
684 heuristic described above m_unrolled_vls_niters. */
685 record_potential_vls_unrolling (loop_vinfo
);
687 /* Detect whether the LOOP has unexpected spills. */
688 record_potential_unexpected_spills (loop_vinfo
);
691 /* Analyze the vectorized program stataments and use dynamic LMUL
692 heuristic to detect whether the loop has unexpected spills. */
694 costs::record_potential_unexpected_spills (loop_vec_info loop_vinfo
)
696 /* We only want to apply the heuristic if LOOP_VINFO is being
697 vectorized for VLA and known NITERS VLS loop. */
698 if (riscv_autovec_lmul
== RVV_DYNAMIC
699 && (m_cost_type
== VLA_VECTOR_COST
700 || (m_cost_type
== VLS_VECTOR_COST
701 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))))
703 bool post_dom_available_p
= dom_info_available_p (CDI_POST_DOMINATORS
);
704 if (!post_dom_available_p
)
705 calculate_dominance_info (CDI_POST_DOMINATORS
);
706 m_has_unexpected_spills_p
= has_unexpected_spills_p (loop_vinfo
);
707 if (!post_dom_available_p
)
708 free_dominance_info (CDI_POST_DOMINATORS
);
712 /* Decide whether to use the unrolling heuristic described above
713 m_unrolled_vls_niters, updating that field if so. LOOP_VINFO
714 describes the loop that we're vectorizing. */
716 costs::record_potential_vls_unrolling (loop_vec_info loop_vinfo
)
718 /* We only want to apply the heuristic if LOOP_VINFO is being
719 vectorized for VLA. */
720 if (m_cost_type
!= VLA_VECTOR_COST
)
723 /* We don't want to apply the heuristic to outer loops, since it's
724 harder to track two levels of unrolling. */
725 if (LOOP_VINFO_LOOP (loop_vinfo
)->inner
)
728 /* Only handle cases in which the number of VLS iterations
729 would be known at compile time but the number of SVE iterations
731 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
732 || BYTES_PER_RISCV_VECTOR
.is_constant ())
735 /* Guess how many times the VLS loop would iterate and make
736 sure that it is within the complete unrolling limit. Even if the
737 number of iterations is small enough, the number of statements might
738 not be, which is why we need to estimate the number of statements too. */
739 unsigned int vls_vf
= vect_vf_for_cost (loop_vinfo
);
740 unsigned HOST_WIDE_INT unrolled_vls_niters
741 = LOOP_VINFO_INT_NITERS (loop_vinfo
) / vls_vf
;
742 if (unrolled_vls_niters
> (unsigned int) param_max_completely_peel_times
)
745 /* Record that we're applying the heuristic and should try to estimate
746 the number of statements in the VLS loop. */
747 m_unrolled_vls_niters
= unrolled_vls_niters
;
750 /* Return true if (a) we're applying the VLS vs. VLA unrolling
751 heuristic described above m_unrolled_vls_niters and (b) the heuristic
752 says that we should prefer the VLS loop. */
754 costs::prefer_unrolled_loop () const
756 if (!m_unrolled_vls_stmts
)
759 if (dump_enabled_p ())
760 dump_printf_loc (MSG_NOTE
, vect_location
,
762 " unrolled VLS loop = " HOST_WIDE_INT_PRINT_UNSIGNED
"\n",
763 m_unrolled_vls_stmts
);
765 /* The balance here is tricky. On the one hand, we can't be sure whether
766 the code is vectorizable with VLS or not. However, even if
767 it isn't vectorizable with VLS, there's a possibility that
768 the scalar code could also be unrolled. Some of the code might then
769 benefit from SLP, or from using LDP and STP. We therefore apply
770 the heuristic regardless of can_use_vls_p. */
771 return (m_unrolled_vls_stmts
772 && (m_unrolled_vls_stmts
773 <= (unsigned int) param_max_completely_peeled_insns
));
777 costs::better_main_loop_than_p (const vector_costs
*uncast_other
) const
779 auto other
= static_cast<const costs
*> (uncast_other
);
780 auto this_loop_vinfo
= as_a
<loop_vec_info
> (this->m_vinfo
);
781 auto other_loop_vinfo
= as_a
<loop_vec_info
> (other
->m_vinfo
);
783 if (dump_enabled_p ())
784 dump_printf_loc (MSG_NOTE
, vect_location
,
785 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
786 GET_MODE_NAME (this_loop_vinfo
->vector_mode
),
787 vect_vf_for_cost (this_loop_vinfo
),
788 GET_MODE_NAME (other_loop_vinfo
->vector_mode
),
789 vect_vf_for_cost (other_loop_vinfo
));
791 /* Apply the unrolling heuristic described above m_unrolled_vls_niters. */
792 if (bool (m_unrolled_vls_stmts
) != bool (other
->m_unrolled_vls_stmts
))
794 bool this_prefer_unrolled
= this->prefer_unrolled_loop ();
795 bool other_prefer_unrolled
= other
->prefer_unrolled_loop ();
796 if (this_prefer_unrolled
!= other_prefer_unrolled
)
798 if (dump_enabled_p ())
799 dump_printf_loc (MSG_NOTE
, vect_location
,
800 "Preferring VLS loop because"
801 " it can be unrolled\n");
802 return other_prefer_unrolled
;
805 else if (riscv_autovec_lmul
== RVV_DYNAMIC
806 && !LOOP_VINFO_NITERS_KNOWN_P (other_loop_vinfo
))
808 if (other
->m_has_unexpected_spills_p
)
810 if (dump_enabled_p ())
811 dump_printf_loc (MSG_NOTE
, vect_location
,
812 "Preferring smaller LMUL loop because"
813 " it has unexpected spills\n");
820 return vector_costs::better_main_loop_than_p (other
);
824 costs::add_stmt_cost (int count
, vect_cost_for_stmt kind
,
825 stmt_vec_info stmt_info
, slp_tree
, tree vectype
,
826 int misalign
, vect_cost_model_location where
)
829 = targetm
.vectorize
.builtin_vectorization_cost (kind
, vectype
, misalign
);
831 /* Do one-time initialization based on the vinfo. */
832 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (m_vinfo
);
833 if (!m_analyzed_vinfo
)
836 analyze_loop_vinfo (loop_vinfo
);
838 m_analyzed_vinfo
= true;
843 /* If we're applying the VLA vs. VLS unrolling heuristic,
844 estimate the number of statements in the unrolled VLS
845 loop. For simplicitly, we assume that one iteration of the
846 VLS loop would need the same number of statements
847 as one iteration of the VLA loop. */
848 if (where
== vect_body
&& m_unrolled_vls_niters
)
849 m_unrolled_vls_stmts
+= count
* m_unrolled_vls_niters
;
852 return record_stmt_cost (stmt_info
, where
, count
* stmt_cost
);
856 costs::finish_cost (const vector_costs
*scalar_costs
)
858 vector_costs::finish_cost (scalar_costs
);
861 } // namespace riscv_vector