/* Cost model implementation for RISC-V 'V' Extension for GNU compiler.
   Copyright (C) 2023 Free Software Foundation, Inc.
   Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "target.h"
#include "function.h"
#include "tree.h"
#include "basic-block.h"
#include "rtl.h"
#include "gimple.h"
#include "targhooks.h"
#include "cfgloop.h"
#include "fold-const.h"
#include "tm_p.h"
#include "tree-vectorizer.h"
#include "gimple-iterator.h"
#include "bitmap.h"
#include "ssa.h"
#include "backend.h"
#include "tree-data-ref.h"
#include "tree-ssa-loop-niter.h"

/* This file should be included last.  */
#include "riscv-vector-costs.h"

namespace riscv_vector {

/* Dynamic LMUL philosophy - local linear-scan SSA live range based analysis
   to determine LMUL:

     - Collect all vectorized STMTs locally for each loop block.
     - Build a program-point-based graph, ignoring non-vectorized STMTs:

	 vectorized STMT 0 - point 0
	 scalar STMT 0     - ignore.
	 vectorized STMT 1 - point 1
	 ...
     - Compute the number of V_REGs live at each program point.
     - Determine LMUL in the VECTOR COST model according to the program
       point which has the maximum number of live V_REGs.

   Note:

     - BIGGEST_MODE is the biggest LMUL auto-vectorization element mode.
       It's important for mixed-size auto-vectorization (conversions, etc.).
       E.g. for a loop vectorizing a conversion of INT32 -> INT64, the
       biggest mode is DImode with LMUL = 8, while SImode uses LMUL = 4.
       We compute the number of live V_REGs at each program point according
       to this information.
     - We only compute program points and live ranges locally (within a
       block) since we just need the number of live V_REGs at each program
       point and we are not really allocating the registers for each SSA.
       We can give a variable another local live range in another block if
       it is live out of/into another block.  Such an approach doesn't
       affect the accuracy of our live range analysis.
     - The current analysis doesn't consider instruction scheduling, which
       may improve the register pressure, so we do the analysis
       conservatively, which may end up with a smaller LMUL.
       TODO: Maybe we could support a reasonable live range shrinking
       algorithm which takes advantage of instruction scheduling.
     - The analysis may step through the following possible sequences of
       autovec modes:

	 1. M8 -> M4 -> M2 -> M1 (stop analysis here) -> MF2 -> MF4 -> MF8
	 2. M8 -> M1(M4) -> MF2(M2) -> MF4(M1) (stop analysis here) -> MF8(MF2)
	 3. M1(M8) -> MF2(M4) -> MF4(M2) -> MF8(M1)
*/

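/* A worked example of the analysis above (an illustrative sketch under
   assumed numbers, not compiler output): RVV provides 32 V_REGs.  Suppose
   the biggest element mode of a loop is DImode and we attempt LMUL = 8.
   A live DImode vector value then occupies 8 V_REGs, while a live SImode
   value (half the element size) occupies 8 / 2 = 4 V_REGs.  If some
   program point has three DImode values and two SImode values live
   simultaneously, the pressure there is 3 * 8 + 2 * 4 = 32 V_REGs - the
   entire register file - so LMUL = 8 risks spilling and the cost model
   would prefer a smaller-LMUL candidate.  */
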
/* Collect all STMTs that are vectorized and compute their program points.
   Note that we don't care about the STMTs that are not vectorized and
   we only build the local graph (within a block) of program points.

   Loop:
     bb 2:
       STMT 1 (vectorized)      -- point 0
       STMT 2 (not vectorized)  -- ignored
       STMT 3 (vectorized)      -- point 1
       STMT 4 (vectorized)      -- point 2
       STMT 5 (vectorized)      -- point 3
       ...
     bb 3:
       STMT 1 (vectorized)      -- point 0
       STMT 2 (vectorized)      -- point 1
       STMT 3 (not vectorized)  -- ignored
       STMT 4 (not vectorized)  -- ignored
       STMT 5 (vectorized)      -- point 2
       ...
*/
static void
compute_local_program_points (
  vec_info *vinfo,
  hash_map<basic_block, vec<stmt_point>> &program_points_per_bb)
{
  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
    {
      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
      basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
      unsigned int nbbs = loop->num_nodes;
      gimple_stmt_iterator si;
      unsigned int i;
      /* Collect the stmts that are vectorized and mark their program
	 points.  */
      for (i = 0; i < nbbs; i++)
	{
	  int point = 1;
	  basic_block bb = bbs[i];
	  vec<stmt_point> program_points = vNULL;
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Compute local program points for bb %d:\n",
			     bb->index);
	  for (si = gsi_start_bb (bbs[i]); !gsi_end_p (si); gsi_next (&si))
	    {
	      if (!(is_gimple_assign (gsi_stmt (si))
		    || is_gimple_call (gsi_stmt (si))))
		continue;
	      stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
	      enum stmt_vec_info_type type
		= STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
	      if (type != undef_vec_info_type)
		{
		  stmt_point info = {point, gsi_stmt (si)};
		  program_points.safe_push (info);
		  point++;
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "program point %d: %G", info.point,
				     gsi_stmt (si));
		}
	    }
	  program_points_per_bb.put (bb, program_points);
	}
    }
}

/* Return the wider of MODE1 and MODE2.  */
static machine_mode
get_biggest_mode (machine_mode mode1, machine_mode mode2)
{
  unsigned int mode1_size = GET_MODE_BITSIZE (mode1).to_constant ();
  unsigned int mode2_size = GET_MODE_BITSIZE (mode2).to_constant ();
  return mode1_size >= mode2_size ? mode1 : mode2;
}

/* Compute local live ranges of each vectorized variable.
   Note that we only compute local live ranges (within a block) since
   local live range information is accurate enough for us to determine
   the LMUL/vectorization factor of the loop.

   Loop:
     bb 2:
       STMT 1               -- point 0
       STMT 2 (def SSA 1)   -- point 1
       STMT 3 (use SSA 1)   -- point 2
       STMT 4               -- point 3
     bb 3:
       STMT 1               -- point 0
       STMT 2               -- point 1
       STMT 3               -- point 2
       STMT 4 (use SSA 2)   -- point 3

   The live range of SSA 1 is [1, 3] in bb 2.
   The live range of SSA 2 is [0, 4] in bb 3.  */
static machine_mode
compute_local_live_ranges (
  const hash_map<basic_block, vec<stmt_point>> &program_points_per_bb,
  hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb)
{
  machine_mode biggest_mode = QImode;
  if (!program_points_per_bb.is_empty ())
    {
      auto_vec<tree> visited_vars;
      unsigned int i;
      for (hash_map<basic_block, vec<stmt_point>>::iterator iter
	   = program_points_per_bb.begin ();
	   iter != program_points_per_bb.end (); ++iter)
	{
	  basic_block bb = (*iter).first;
	  vec<stmt_point> program_points = (*iter).second;
	  bool existed_p = false;
	  hash_map<tree, pair> *live_ranges
	    = &live_ranges_per_bb.get_or_insert (bb, &existed_p);
	  gcc_assert (!existed_p);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Compute local live ranges for bb %d:\n",
			     bb->index);
	  for (const auto program_point : program_points)
	    {
	      unsigned int point = program_point.point;
	      gimple *stmt = program_point.stmt;
	      tree lhs = gimple_get_lhs (stmt);
	      if (lhs != NULL_TREE && is_gimple_reg (lhs)
		  && !POINTER_TYPE_P (TREE_TYPE (lhs)))
		{
		  biggest_mode
		    = get_biggest_mode (biggest_mode,
					TYPE_MODE (TREE_TYPE (lhs)));
		  bool existed_p = false;
		  pair &live_range
		    = live_ranges->get_or_insert (lhs, &existed_p);
		  gcc_assert (!existed_p);
		  live_range = pair (point, point);
		}
	      for (i = 0; i < gimple_num_args (stmt); i++)
		{
		  tree var = gimple_arg (stmt, i);
		  /* Both IMM and REG are included since a VECTOR_CST may
		     potentially be held in a vector register.  However,
		     this is not accurate, since a PLUS_EXPR can be
		     vectorized into vadd.vi if the IMM is within -16 ~ 15.

		     TODO: We may elide the unnecessary IMM cases in the
		     future.  */
		  if (poly_int_tree_p (var)
		      || (is_gimple_val (var)
			  && !POINTER_TYPE_P (TREE_TYPE (var))))
		    {
		      biggest_mode
			= get_biggest_mode (biggest_mode,
					    TYPE_MODE (TREE_TYPE (var)));
		      bool existed_p = false;
		      pair &live_range
			= live_ranges->get_or_insert (var, &existed_p);
		      if (existed_p)
			/* We will grow the live range for each use.  */
			live_range = pair (live_range.first, point);
		      else
			/* We assume the variable is live from the start of
			   this block.  */
			live_range = pair (0, point);
		    }
		}
	    }
	  if (dump_enabled_p ())
	    for (hash_map<tree, pair>::iterator iter = live_ranges->begin ();
		 iter != live_ranges->end (); ++iter)
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "%T: type = %T, start = %d, end = %d\n",
			       (*iter).first, TREE_TYPE ((*iter).first),
			       (*iter).second.first, (*iter).second.second);
	}
    }
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Biggest mode = %s\n",
		     GET_MODE_NAME (biggest_mode));
  return biggest_mode;
}

/* Compute the number of V_REGs that a value of MODE occupies, given
   BIGGEST_MODE and the LMUL chosen for BIGGEST_MODE.  The LMUL of MODE
   is scaled down by the size ratio between BIGGEST_MODE and MODE.  */
static unsigned int
compute_nregs_for_mode (machine_mode mode, machine_mode biggest_mode, int lmul)
{
  unsigned int mode_size = GET_MODE_SIZE (mode).to_constant ();
  unsigned int biggest_size = GET_MODE_SIZE (biggest_mode).to_constant ();
  gcc_assert (biggest_size >= mode_size);
  unsigned int ratio = biggest_size / mode_size;
  return lmul / ratio;
}
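
/* A worked example for compute_nregs_for_mode (an illustrative sketch,
   not compiler output): with mode = SImode (4 bytes), biggest_mode
   = DImode (8 bytes) and lmul = 4, the ratio is 8 / 4 = 2, so a live
   SImode value is costed as 4 / 2 = 2 V_REGs, matching the intuition
   that if DImode data is vectorized at LMUL = 4, the corresponding
   SImode data only needs LMUL = 2.  */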

/* This function helps to determine whether the current LMUL will cause
   potential vector register (V_REG) spilling according to live range
   information.

     - First, compute how many variables are live at each program point
       in each bb of the loop.
     - Second, compute how many V_REGs are live at each program point
       in each bb of the loop according to BIGGEST_MODE and the variable
       modes.
     - Third, return the maximum number of live V_REGs of the loop.  */
static unsigned int
max_number_of_live_regs (const basic_block bb,
			 const hash_map<tree, pair> &live_ranges,
			 unsigned int max_point, machine_mode biggest_mode,
			 int lmul)
{
  unsigned int max_nregs = 0;
  unsigned int i;
  unsigned int live_point = 0;
  auto_vec<unsigned int> live_vars_vec;
  live_vars_vec.safe_grow_cleared (max_point, true);
  for (hash_map<tree, pair>::iterator iter = live_ranges.begin ();
       iter != live_ranges.end (); ++iter)
    {
      tree var = (*iter).first;
      pair live_range = (*iter).second;
      for (i = live_range.first + 1; i <= live_range.second; i++)
	{
	  machine_mode mode = TYPE_MODE (TREE_TYPE (var));
	  unsigned int nregs
	    = compute_nregs_for_mode (mode, biggest_mode, lmul);
	  live_vars_vec[i] += nregs;
	  if (live_vars_vec[i] > max_nregs)
	    max_nregs = live_vars_vec[i];
	}
    }

  /* Collect SSA names with user-declared RVV types.  */
  auto_vec<basic_block> all_preds
    = get_all_dominated_blocks (CDI_POST_DOMINATORS, bb);
  tree t;
  FOR_EACH_SSA_NAME (i, t, cfun)
    {
      machine_mode mode = TYPE_MODE (TREE_TYPE (t));
      if (!lookup_vector_type_attribute (TREE_TYPE (t))
	  && !riscv_v_ext_vls_mode_p (mode))
	continue;

      gimple *def = SSA_NAME_DEF_STMT (t);
      if (gimple_bb (def) && !all_preds.contains (gimple_bb (def)))
	continue;
      use_operand_p use_p;
      imm_use_iterator iterator;

      FOR_EACH_IMM_USE_FAST (use_p, iterator, t)
	{
	  if (!USE_STMT (use_p) || is_gimple_debug (USE_STMT (use_p))
	      || !dominated_by_p (CDI_POST_DOMINATORS, bb,
				  gimple_bb (USE_STMT (use_p))))
	    continue;

	  int regno_alignment = riscv_get_v_regno_alignment (mode);
	  max_nregs += regno_alignment;
	  if (dump_enabled_p ())
	    dump_printf_loc (
	      MSG_NOTE, vect_location,
	      "Explicitly used SSA %T, vectype = %T, mode = %s, causes %d "
	      "V_REGs live in bb %d at program point %d\n",
	      t, TREE_TYPE (t), GET_MODE_NAME (mode), regno_alignment,
	      bb->index, live_point);
	  break;
	}
    }

  if (dump_enabled_p ())
    dump_printf_loc (
      MSG_NOTE, vect_location,
      "Maximum lmul = %d, at most %d live V_REGs at program "
      "point %d for bb %d\n",
      lmul, max_nregs, live_point, bb->index);
  return max_nregs;
}
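
/* A note on the explicit-RVV-type accounting above (an assumption-laden
   sketch): a user-written vector object, e.g. a vint32m2_t local that is
   defined before BB and used after it, stays live across the whole block,
   so it permanently reserves riscv_get_v_regno_alignment worth of V_REGs
   (2 for an M2 type) on top of the pressure computed from the vectorized
   STMTs.  */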

/* Get STORE value.  */
static tree
get_store_value (gimple *stmt)
{
  if (is_gimple_call (stmt) && gimple_call_internal_p (stmt))
    {
      if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
	return gimple_call_arg (stmt, 3);
      else
	gcc_unreachable ();
    }
  else
    return gimple_assign_rhs1 (stmt);
}

/* Return true if STMT_INFO is a non-contiguous load/store.  */
static bool
non_contiguous_memory_access_p (stmt_vec_info stmt_info)
{
  enum stmt_vec_info_type type
    = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
  return ((type == load_vec_info_type || type == store_vec_info_type)
	  && !adjacent_dr_p (STMT_VINFO_DATA_REF (stmt_info)));
}

/* Return the LMUL of the current analysis.  */
static int
compute_estimated_lmul (loop_vec_info loop_vinfo, machine_mode mode)
{
  gcc_assert (GET_MODE_BITSIZE (mode).is_constant ());
  int regno_alignment = riscv_get_v_regno_alignment (loop_vinfo->vector_mode);
  if (riscv_v_ext_vls_mode_p (loop_vinfo->vector_mode))
    return regno_alignment;
  else if (known_eq (LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo), 1U)
	   || LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo).is_constant ())
    {
      int estimated_vf = vect_vf_for_cost (loop_vinfo);
      return estimated_vf * GET_MODE_BITSIZE (mode).to_constant ()
	     / TARGET_MIN_VLEN;
    }
  else
    {
      /* Estimate the VLA SLP LMUL.  */
      if (regno_alignment > RVV_M1)
	return regno_alignment;
      else if (mode != QImode)
	{
	  int ratio;
	  if (can_div_trunc_p (BYTES_PER_RISCV_VECTOR,
			       GET_MODE_SIZE (loop_vinfo->vector_mode),
			       &ratio))
	    {
	      if (ratio == 1)
		return RVV_M4;
	      else if (ratio == 2)
		return RVV_M2;
	    }
	}
    }
  return 0;
}
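
/* A worked example for compute_estimated_lmul (a sketch under assumed
   parameters, not compiler output): with TARGET_MIN_VLEN = 128, a
   constant SLP unrolling factor, biggest mode = SImode (32 bits) and an
   estimated VF of 16 elements, the estimated LMUL is
   16 * 32 / 128 = 4.  */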

/* Update the live ranges according to PHIs.

   Loop:
     bb 2:
       STMT 1               -- point 0
       STMT 2 (def SSA 1)   -- point 1
       STMT 3 (use SSA 1)   -- point 2
       STMT 4               -- point 3
     bb 3:
       SSA 2 = PHI<SSA 1>
       STMT 1               -- point 0
       STMT 2               -- point 1
       STMT 3 (use SSA 2)   -- point 2
       STMT 4               -- point 3

   Before this function, the live range of SSA 1 is [2, 3] in bb 2
   and that of SSA 2 is [0, 3] in bb 3.

   Then, after this function, we update the live range of SSA 1 in bb 2
   to [2, 4] since SSA 1 is live out into bb 3.  */
static void
update_local_live_ranges (
  vec_info *vinfo,
  hash_map<basic_block, vec<stmt_point>> &program_points_per_bb,
  hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb,
  machine_mode *biggest_mode)
{
  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
  if (!loop_vinfo)
    return;

  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned int nbbs = loop->num_nodes;
  unsigned int i, j;
  gphi_iterator psi;
  gimple_stmt_iterator si;
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Update local program points for bb %d:\n",
			 bbs[i]->index);
      for (psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next (&psi))
	{
	  gphi *phi = psi.phi ();
	  stmt_vec_info stmt_info = vinfo->lookup_stmt (phi);
	  if (STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info))
	      == undef_vec_info_type)
	    continue;

	  for (j = 0; j < gimple_phi_num_args (phi); j++)
	    {
	      edge e = gimple_phi_arg_edge (phi, j);
	      tree def = gimple_phi_arg_def (phi, j);
	      auto *live_ranges = live_ranges_per_bb.get (bb);
	      auto *live_range = live_ranges->get (def);
	      if (poly_int_tree_p (def))
		{
		  /* Insert a live range for an INTEGER_CST or POLY_CST,
		     since we will need to allocate a vector register
		     for it.

		     E.g. # j_17 = PHI <j_11(9), 0(5)> will be transformed
		     into # vect_vec_iv_.8_24 = PHI <_25(9), { 0, ... }(5)>

		     The live range for such a value is short, only living
		     from program point 0 to 1.  */
		  if (live_range)
		    {
		      unsigned int start = (*live_range).first;
		      (*live_range).first = 0;
		      if (dump_enabled_p ())
			dump_printf_loc (
			  MSG_NOTE, vect_location,
			  "Update %T start point from %d to 0:\n", def, start);
		    }
		  else
		    {
		      live_ranges->put (def, pair (0, 1));
		      auto &program_points = (*program_points_per_bb.get (bb));
		      if (program_points.is_empty ())
			{
			  stmt_point info = {1, phi};
			  program_points.safe_push (info);
			}
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_NOTE, vect_location,
					 "Add %T start point from 0 to 1:\n",
					 def);
		    }
		  continue;
		}
	      if (live_range && flow_bb_inside_loop_p (loop, e->src))
		{
		  unsigned int start = (*live_range).first;
		  (*live_range).first = 0;
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Update %T start point from %d to %d:\n",
				     def, start, (*live_range).first);
		}
	      live_ranges = live_ranges_per_bb.get (e->src);
	      if (!program_points_per_bb.get (e->src))
		continue;
	      unsigned int max_point
		= (*program_points_per_bb.get (e->src)).length ();
	      live_range = live_ranges->get (def);
	      if (!live_range)
		continue;

	      unsigned int end = (*live_range).second;
	      (*live_range).second = max_point;
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Update %T end point from %d to %d:\n", def,
				 end, (*live_range).second);
	    }
	}
      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  if (!(is_gimple_assign (gsi_stmt (si))
		|| is_gimple_call (gsi_stmt (si))))
	    continue;
	  stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
	  enum stmt_vec_info_type type
	    = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
	  if (non_contiguous_memory_access_p (stmt_info)
	      /* LOAD_LANES/STORE_LANES don't need a perm index.  */
	      && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)
		   != VMAT_LOAD_STORE_LANES)
	    {
	      /* For a non-adjacent load/store STMT, we will potentially
		 convert it into:

		   1. MASK_LEN_GATHER_LOAD (..., perm index).
		   2. Contiguous load/store + VEC_PERM (..., perm index)

		 We are likely to use one more vector variable.  */
	      unsigned int max_point
		= (*program_points_per_bb.get (bb)).length () - 1;
	      auto *live_ranges = live_ranges_per_bb.get (bb);
	      bool existed_p = false;
	      tree var = type == load_vec_info_type
			   ? gimple_get_lhs (gsi_stmt (si))
			   : get_store_value (gsi_stmt (si));
	      tree sel_type = build_nonstandard_integer_type (
		TYPE_PRECISION (TREE_TYPE (var)), 1);
	      *biggest_mode
		= get_biggest_mode (*biggest_mode, TYPE_MODE (sel_type));
	      tree sel = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				     get_identifier ("vect_perm"), sel_type);
	      pair &live_range = live_ranges->get_or_insert (sel, &existed_p);
	      gcc_assert (!existed_p);
	      live_range = pair (0, max_point);
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Add perm index %T, start = 0, end = %d\n",
				 sel, max_point);
	    }
	}
    }
}
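
/* A sketch of the perm-index accounting above (illustrative assumptions,
   not compiler output): a strided access such as a stride-2 load is not
   adjacent, so besides the loaded/stored vector value itself the
   vectorized loop also materializes an index/permutation vector (the
   "vect_perm" decl above) whose element width matches the accessed
   value.  That vector is modeled as live across the whole block, raising
   the register pressure seen by max_number_of_live_regs.  */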

/* Return true if the maximum number of live V_REGs of the loop at the
   estimated LMUL exceeds V_REG_NUM, i.e. we expect register spilling.  */
static bool
has_unexpected_spills_p (loop_vec_info loop_vinfo)
{
  /* Compute local program points.
     It's a fast and effective computation.  */
  hash_map<basic_block, vec<stmt_point>> program_points_per_bb;
  compute_local_program_points (loop_vinfo, program_points_per_bb);

  /* Compute local live ranges.  */
  hash_map<basic_block, hash_map<tree, pair>> live_ranges_per_bb;
  machine_mode biggest_mode
    = compute_local_live_ranges (program_points_per_bb, live_ranges_per_bb);

  /* Update live ranges according to PHIs.  */
  update_local_live_ranges (loop_vinfo, program_points_per_bb,
			    live_ranges_per_bb, &biggest_mode);

  int lmul = compute_estimated_lmul (loop_vinfo, biggest_mode);
  /* TODO: We calculate the maximum live vars based on the current STMT
     sequence.  We could support live range shrinking in the future if it
     gives a big improvement.  */
  if (lmul > RVV_M1)
    {
      if (!live_ranges_per_bb.is_empty ())
	{
	  unsigned int max_nregs = 0;
	  for (hash_map<basic_block, hash_map<tree, pair>>::iterator iter
	       = live_ranges_per_bb.begin ();
	       iter != live_ranges_per_bb.end (); ++iter)
	    {
	      basic_block bb = (*iter).first;
	      unsigned int max_point
		= (*program_points_per_bb.get (bb)).length () + 1;
	      if ((*iter).second.is_empty ())
		continue;
	      /* We prefer larger LMUL unless it causes register
		 spilling.  */
	      unsigned int nregs
		= max_number_of_live_regs (bb, (*iter).second, max_point,
					   biggest_mode, lmul);
	      if (nregs > max_nregs)
		max_nregs = nregs;
	    }
	  live_ranges_per_bb.empty ();
	  if (max_nregs > V_REG_NUM)
	    return true;
	}
    }
  if (!program_points_per_bb.is_empty ())
    {
      for (hash_map<basic_block, vec<stmt_point>>::iterator iter
	   = program_points_per_bb.begin ();
	   iter != program_points_per_bb.end (); ++iter)
	{
	  vec<stmt_point> program_points = (*iter).second;
	  if (!program_points.is_empty ())
	    program_points.release ();
	}
      program_points_per_bb.empty ();
    }
  return false;
}
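
/* A worked end-to-end example (hypothetical numbers): if the estimated
   LMUL is 8 and some block's peak pressure computed above is 40 V_REGs,
   then 40 > V_REG_NUM (the 32 RVV vector registers), so the loop is
   flagged as having unexpected spills and better_main_loop_than_p below
   will prefer a smaller-LMUL candidate.  */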

costs::costs (vec_info *vinfo, bool costing_for_scalar)
  : vector_costs (vinfo, costing_for_scalar)
{
  if (costing_for_scalar)
    m_cost_type = SCALAR_COST;
  else if (riscv_v_ext_vector_mode_p (vinfo->vector_mode))
    m_cost_type = VLA_VECTOR_COST;
  else
    m_cost_type = VLS_VECTOR_COST;
}

/* Do one-time initialization of the costs given that we're
   costing the loop vectorization described by LOOP_VINFO.  */
void
costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
{
  /* Record the number of times that the vector loop would execute,
     if known.  */
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  auto scalar_niters = max_stmt_executions_int (loop);
  if (scalar_niters >= 0)
    {
      unsigned int vf = vect_vf_for_cost (loop_vinfo);
      if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
	m_num_vector_iterations = scalar_niters / vf;
      else
	m_num_vector_iterations = CEIL (scalar_niters, vf);
    }

  /* Detect whether we're vectorizing for VLA and should apply the unrolling
     heuristic described above m_unrolled_vls_niters.  */
  record_potential_vls_unrolling (loop_vinfo);

  /* Detect whether the LOOP has unexpected spills.  */
  record_potential_unexpected_spills (loop_vinfo);
}

/* Analyze the vectorized program statements and use the dynamic LMUL
   heuristic to detect whether the loop has unexpected spills.  */
void
costs::record_potential_unexpected_spills (loop_vec_info loop_vinfo)
{
  /* We only want to apply the heuristic if LOOP_VINFO is being
     vectorized for VLA, or for VLS with a known number of iterations.  */
  if (riscv_autovec_lmul == RVV_DYNAMIC
      && (m_cost_type == VLA_VECTOR_COST
	  || (m_cost_type == VLS_VECTOR_COST
	      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))))
    {
      bool post_dom_available_p = dom_info_available_p (CDI_POST_DOMINATORS);
      if (!post_dom_available_p)
	calculate_dominance_info (CDI_POST_DOMINATORS);
      m_has_unexpected_spills_p = has_unexpected_spills_p (loop_vinfo);
      if (!post_dom_available_p)
	free_dominance_info (CDI_POST_DOMINATORS);
    }
}

/* Decide whether to use the unrolling heuristic described above
   m_unrolled_vls_niters, updating that field if so.  LOOP_VINFO
   describes the loop that we're vectorizing.  */
void
costs::record_potential_vls_unrolling (loop_vec_info loop_vinfo)
{
  /* We only want to apply the heuristic if LOOP_VINFO is being
     vectorized for VLA.  */
  if (m_cost_type != VLA_VECTOR_COST)
    return;

  /* We don't want to apply the heuristic to outer loops, since it's
     harder to track two levels of unrolling.  */
  if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
    return;

  /* Only handle cases in which the number of VLS iterations
     would be known at compile time but the number of VLA iterations
     would not.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || BYTES_PER_RISCV_VECTOR.is_constant ())
    return;

  /* Guess how many times the VLS loop would iterate and make
     sure that it is within the complete unrolling limit.  Even if the
     number of iterations is small enough, the number of statements might
     not be, which is why we need to estimate the number of statements too.  */
  unsigned int vls_vf = vect_vf_for_cost (loop_vinfo);
  unsigned HOST_WIDE_INT unrolled_vls_niters
    = LOOP_VINFO_INT_NITERS (loop_vinfo) / vls_vf;
  if (unrolled_vls_niters > (unsigned int) param_max_completely_peel_times)
    return;

  /* Record that we're applying the heuristic and should try to estimate
     the number of statements in the VLS loop.  */
  m_unrolled_vls_niters = unrolled_vls_niters;
}
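
/* A worked example for the heuristic above (hypothetical numbers): for a
   loop with a compile-time trip count of 64 and a VLS VF of 16, the
   unrolled VLS loop would run 64 / 16 = 4 iterations.  If 4 is within
   the --param max-completely-peel-times limit, m_unrolled_vls_niters is
   recorded and add_stmt_cost below accumulates the estimated unrolled
   statement count.  */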

/* Return true if (a) we're applying the VLS vs. VLA unrolling
   heuristic described above m_unrolled_vls_niters and (b) the heuristic
   says that we should prefer the VLS loop.  */
bool
costs::prefer_unrolled_loop () const
{
  if (!m_unrolled_vls_stmts)
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Number of insns in"
		     " unrolled VLS loop = " HOST_WIDE_INT_PRINT_UNSIGNED "\n",
		     m_unrolled_vls_stmts);

  /* The balance here is tricky.  On the one hand, we can't be sure whether
     the code is vectorizable with VLS or not.  However, even if
     it isn't vectorizable with VLS, there's a possibility that
     the scalar code could also be unrolled.  Some of the code might then
     benefit from SLP, or from merging adjacent loads and stores.  We
     therefore apply the heuristic regardless of can_use_vls_p.  */
  return (m_unrolled_vls_stmts
	  && (m_unrolled_vls_stmts
	      <= (unsigned int) param_max_completely_peeled_insns));
}

bool
costs::better_main_loop_than_p (const vector_costs *uncast_other) const
{
  auto other = static_cast<const costs *> (uncast_other);
  auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
  auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
		     GET_MODE_NAME (this_loop_vinfo->vector_mode),
		     vect_vf_for_cost (this_loop_vinfo),
		     GET_MODE_NAME (other_loop_vinfo->vector_mode),
		     vect_vf_for_cost (other_loop_vinfo));

  /* Apply the unrolling heuristic described above m_unrolled_vls_niters.  */
  if (bool (m_unrolled_vls_stmts) != bool (other->m_unrolled_vls_stmts))
    {
      bool this_prefer_unrolled = this->prefer_unrolled_loop ();
      bool other_prefer_unrolled = other->prefer_unrolled_loop ();
      if (this_prefer_unrolled != other_prefer_unrolled)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Preferring VLS loop because"
			     " it can be unrolled\n");
	  return other_prefer_unrolled;
	}
    }
  else if (riscv_autovec_lmul == RVV_DYNAMIC
	   && !LOOP_VINFO_NITERS_KNOWN_P (other_loop_vinfo))
    {
      if (other->m_has_unexpected_spills_p)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Preferring smaller LMUL loop because"
			     " it has unexpected spills\n");
	  return true;
	}
      else
	return false;
    }

  return vector_costs::better_main_loop_than_p (other);
}

unsigned
costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
		      stmt_vec_info stmt_info, slp_tree, tree vectype,
		      int misalign, vect_cost_model_location where)
{
  int stmt_cost
    = targetm.vectorize.builtin_vectorization_cost (kind, vectype, misalign);

  /* Do one-time initialization based on the vinfo.  */
  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
  if (!m_analyzed_vinfo)
    {
      if (loop_vinfo)
	analyze_loop_vinfo (loop_vinfo);

      m_analyzed_vinfo = true;
    }

  if (stmt_info)
    {
      /* If we're applying the VLA vs. VLS unrolling heuristic,
	 estimate the number of statements in the unrolled VLS
	 loop.  For simplicity, we assume that one iteration of the
	 VLS loop would need the same number of statements
	 as one iteration of the VLA loop.  */
      if (where == vect_body && m_unrolled_vls_niters)
	m_unrolled_vls_stmts += count * m_unrolled_vls_niters;
    }

  return record_stmt_cost (stmt_info, where, count * stmt_cost);
}

void
costs::finish_cost (const vector_costs *scalar_costs)
{
  vector_costs::finish_cost (scalar_costs);
}

} // namespace riscv_vector