/* Cost model implementation for RISC-V 'V' Extension for GNU compiler.
   Copyright (C) 2023-2024 Free Software Foundation, Inc.
   Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "target.h"
#include "function.h"
#include "tree.h"
#include "basic-block.h"
#include "rtl.h"
#include "gimple.h"
#include "targhooks.h"
#include "cfgloop.h"
#include "fold-const.h"
#include "tm_p.h"
#include "tree-vectorizer.h"
#include "gimple-iterator.h"
#include "bitmap.h"
#include "ssa.h"
#include "backend.h"
#include "tree-data-ref.h"
#include "tree-ssa-loop-niter.h"

/* This file should be included last.  */
#include "riscv-vector-costs.h"

namespace riscv_vector {

/* Dynamic LMUL philosophy - Local linear-scan SSA live range based analysis
   to determine LMUL.

     - Collect all vectorized STMTs locally for each loop block.
     - Build a program point based graph, ignoring non-vectorized STMTs:

	   vectorized STMT 0 - point 0
	   scalar STMT 0 - ignore.
	   vectorized STMT 1 - point 1
	   ...
     - Compute the number of V_REGs live at each program point.
     - Determine LMUL in the VECTOR COST model according to the program
       point which has the maximum number of live V_REGs.

   Note:

     - BIGGEST_MODE is the biggest LMUL auto-vectorization element mode.
       It's important for mixed-size auto-vectorization (conversions, etc.).
       E.g. for a loop vectorizing an INT32 -> INT64 conversion, the biggest
       mode is DImode with LMUL = 8, while SImode uses LMUL = 4.  We compute
       the number of live V_REGs at each program point according to this
       information.
     - We only compute program points and live ranges locally (within a
       block) since we just need the number of live V_REGs at each program
       point and we are not actually allocating registers for each SSA name.
       We can give a variable another local live range in another block if
       it is live out of / live into that block.  Such an approach doesn't
       affect the accuracy of our live range analysis.
     - The current analysis doesn't take instruction scheduling, which may
       reduce register pressure, into account.  So the analysis is
       conservative and may end up with a smaller LMUL.
       TODO: Maybe we could support a reasonable live range shrinking
       algorithm that takes advantage of instruction scheduling.
     - The analysis may step through these sequences of autovec modes:

       1. M8 -> M4 -> M2 -> M1 (stop analysis here) -> MF2 -> MF4 -> MF8
       2. M8 -> M1(M4) -> MF2(M2) -> MF4(M1) (stop analysis here) -> MF8(MF2)
       3. M1(M8) -> MF2(M4) -> MF4(M2) -> MF8(M1)
*/

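/* A hypothetical worked example of the analysis above (numbers assume
   TARGET_MIN_VLEN = 128, i.e. one V_REG holds at least 128 bits, and 32
   V_REGs in total):

     Loop converting INT32 -> INT64, analyzed at LMUL = 8:
       vectorized load (SImode, LMUL = 4)        - point 0
       vectorized conversion (DImode, LMUL = 8)  - point 1
       vectorized store (DImode, LMUL = 8)       - point 2

   At point 1 both the SImode input (4 V_REGs) and the DImode result
   (8 V_REGs) are live, i.e. 12 live V_REGs.  12 <= 32, so LMUL = 8 does
   not spill here; if the maximum count exceeded 32, the cost model would
   prefer a smaller LMUL.  */
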
static bool
is_gimple_assign_or_call (gimple *stmt)
{
  return is_gimple_assign (stmt) || is_gimple_call (stmt);
}
/* Return the program point of the first vectorized lanes statement.  */
static unsigned int
get_first_lane_point (const vec<stmt_point> program_points,
		      stmt_vec_info stmt_info)
{
  for (const auto program_point : program_points)
    if (program_point.stmt_info == DR_GROUP_FIRST_ELEMENT (stmt_info))
      return program_point.point;
  return 0;
}

/* Return the program point of the last vectorized lanes statement.  */
static unsigned int
get_last_lane_point (const vec<stmt_point> program_points,
		     stmt_vec_info stmt_info)
{
  unsigned int max_point = 0;
  for (auto s = DR_GROUP_FIRST_ELEMENT (stmt_info); s != NULL;
       s = DR_GROUP_NEXT_ELEMENT (s))
    {
      for (const auto program_point : program_points)
	if (program_point.stmt_info == s && program_point.point > max_point)
	  max_point = program_point.point;
    }
  return max_point;
}

/* Return the live range of ARG, looking through conversion statements;
   insert an empty (0, 0) range if none exists yet.  */
static pair *
get_live_range (hash_map<tree, pair> *live_ranges, tree arg)
{
  auto *r = live_ranges->get (arg);
  if (r)
    return r;
  else
    {
      tree t = arg;
      gimple *def_stmt = NULL;
      while (t && TREE_CODE (t) == SSA_NAME && !r
	     && (def_stmt = SSA_NAME_DEF_STMT (t)))
	{
	  if (gimple_assign_cast_p (def_stmt))
	    {
	      t = gimple_assign_rhs1 (def_stmt);
	      r = live_ranges->get (t);
	      def_stmt = NULL;
	    }
	  else
	    /* FIXME: Currently we don't see any fold for
	       non-conversion statements.  */
	    t = NULL_TREE;
	}
      if (r)
	return r;
      else
	{
	  bool insert_p = live_ranges->put (arg, pair (0, 0));
	  gcc_assert (!insert_p);
	  return live_ranges->get (arg);
	}
    }
}

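/* A hypothetical illustration of the look-through: given

     _2 = (long int) _1;
     _3 = _2 + _4;

   if the live range list already has an entry for _1 but none for _2,
   looking up _2 walks through the conversion statement and returns the
   range recorded for _1, so a use of _2 extends _1's live range.  */
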
/* Collect all STMTs that are vectorized and compute their program points.
   Note that we don't care about the STMTs that are not vectorized and
   we only build the local graph (within a block) of program points.

   Loop:
     bb 2:
       STMT 1 (vectorized)      -- point 0
       STMT 2 (not vectorized)  -- ignored
       STMT 3 (vectorized)      -- point 1
       STMT 4 (vectorized)      -- point 2
       STMT 5 (vectorized)      -- point 3
       ...
     bb 3:
       STMT 1 (vectorized)      -- point 0
       STMT 2 (vectorized)      -- point 1
       STMT 3 (not vectorized)  -- ignored
       STMT 4 (not vectorized)  -- ignored
       STMT 5 (vectorized)      -- point 2
       ...
*/
static void
compute_local_program_points (
  vec_info *vinfo,
  hash_map<basic_block, vec<stmt_point>> &program_points_per_bb)
{
  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
    {
      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
      basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
      unsigned int nbbs = loop->num_nodes;
      gimple_stmt_iterator si;
      unsigned int i;
      /* Collect the stmts that are vectorized and mark their program
	 points.  */
      for (i = 0; i < nbbs; i++)
	{
	  int point = 1;
	  basic_block bb = bbs[i];
	  vec<stmt_point> program_points = vNULL;
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Compute local program points for bb %d:\n",
			     bb->index);
	  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	    {
	      if (!is_gimple_assign_or_call (gsi_stmt (si)))
		continue;
	      stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
	      enum stmt_vec_info_type type
		= STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
	      if (type != undef_vec_info_type)
		{
		  stmt_point info = {point, gsi_stmt (si), stmt_info};
		  program_points.safe_push (info);
		  point++;
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "program point %d: %G", info.point,
				     gsi_stmt (si));
		}
	    }
	  program_points_per_bb.put (bb, program_points);
	}
    }
}

static machine_mode
get_biggest_mode (machine_mode mode1, machine_mode mode2)
{
  unsigned int mode1_size = GET_MODE_BITSIZE (mode1).to_constant ();
  unsigned int mode2_size = GET_MODE_BITSIZE (mode2).to_constant ();
  return mode1_size >= mode2_size ? mode1 : mode2;
}

/* Compute local live ranges of each vectorized variable.
   Note that we only compute local live ranges (within a block) since
   local live range information is accurate enough for us to determine
   the LMUL/vectorization factor of the loop.

   Loop:
     bb 2:
       STMT 1              -- point 0
       STMT 2 (def SSA 1)  -- point 1
       STMT 3 (use SSA 1)  -- point 2
       STMT 4              -- point 3
     bb 3:
       STMT 1              -- point 0
       STMT 2              -- point 1
       STMT 3              -- point 2
       STMT 4 (use SSA 2)  -- point 3

   The live range of SSA 1 is [1, 3] in bb 2.
   The live range of SSA 2 is [0, 4] in bb 3.  */
static machine_mode
compute_local_live_ranges (
  const hash_map<basic_block, vec<stmt_point>> &program_points_per_bb,
  hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb)
{
  machine_mode biggest_mode = QImode;
  if (!program_points_per_bb.is_empty ())
    {
      auto_vec<tree> visited_vars;
      unsigned int i;
      for (hash_map<basic_block, vec<stmt_point>>::iterator iter
	   = program_points_per_bb.begin ();
	   iter != program_points_per_bb.end (); ++iter)
	{
	  basic_block bb = (*iter).first;
	  vec<stmt_point> program_points = (*iter).second;
	  bool existed_p = false;
	  hash_map<tree, pair> *live_ranges
	    = &live_ranges_per_bb.get_or_insert (bb, &existed_p);
	  gcc_assert (!existed_p);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Compute local live ranges for bb %d:\n",
			     bb->index);
	  for (const auto program_point : program_points)
	    {
	      unsigned int point = program_point.point;
	      gimple *stmt = program_point.stmt;
	      stmt_vec_info stmt_info = program_point.stmt_info;
	      tree lhs = gimple_get_lhs (stmt);
	      enum stmt_vec_info_type type
		= STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
	      if (lhs != NULL_TREE && is_gimple_reg (lhs)
		  && (!POINTER_TYPE_P (TREE_TYPE (lhs))
		      || type != store_vec_info_type))
		{
		  biggest_mode
		    = get_biggest_mode (biggest_mode,
					TYPE_MODE (TREE_TYPE (lhs)));
		  bool existed_p = false;
		  pair &live_range
		    = live_ranges->get_or_insert (lhs, &existed_p);
		  gcc_assert (!existed_p);
		  if (STMT_VINFO_MEMORY_ACCESS_TYPE (program_point.stmt_info)
		      == VMAT_LOAD_STORE_LANES)
		    point = get_first_lane_point (program_points,
						  program_point.stmt_info);
		  live_range = pair (point, point);
		}
	      for (i = 0; i < gimple_num_args (stmt); i++)
		{
		  tree var = gimple_arg (stmt, i);
		  /* Both IMM and REG are included since a VECTOR_CST may
		     potentially be held in a vector register.  However,
		     this is not accurate, since a PLUS_EXPR can be
		     vectorized into vadd.vi if the IMM is in -16 ~ 15.

		     TODO: We may be able to elide the unnecessary IMM
		     cases in the future.  */
		  if (poly_int_tree_p (var)
		      || (is_gimple_val (var)
			  && (!POINTER_TYPE_P (TREE_TYPE (var))
			      || type != load_vec_info_type)))
		    {
		      biggest_mode
			= get_biggest_mode (biggest_mode,
					    TYPE_MODE (TREE_TYPE (var)));
		      bool existed_p = false;
		      pair &live_range
			= live_ranges->get_or_insert (var, &existed_p);
		      if (STMT_VINFO_MEMORY_ACCESS_TYPE (
			    program_point.stmt_info)
			  == VMAT_LOAD_STORE_LANES)
			point = get_last_lane_point (program_points,
						     program_point.stmt_info);
		      else if (existed_p)
			point = MAX (live_range.second, point);
		      if (existed_p)
			/* We will grow the live range for each use.  */
			live_range = pair (live_range.first, point);
		      else
			{
			  gimple *def_stmt;
			  if (TREE_CODE (var) == SSA_NAME
			      && (def_stmt = SSA_NAME_DEF_STMT (var))
			      && gimple_bb (def_stmt) == bb
			      && is_gimple_assign_or_call (def_stmt))
			    {
			      live_ranges->remove (var);
			      for (unsigned int j = 0;
				   j < gimple_num_args (def_stmt); j++)
				{
				  tree arg = gimple_arg (def_stmt, j);
				  auto *r = get_live_range (live_ranges, arg);
				  gcc_assert (r);
				  (*r).second = MAX (point, (*r).second);
				}
			    }
			  else
			    /* The splat vector lives through the whole
			       block.  */
			    live_range = pair (0, program_points.length ());
			}
		    }
		}
	    }
	  if (dump_enabled_p ())
	    for (hash_map<tree, pair>::iterator iter = live_ranges->begin ();
		 iter != live_ranges->end (); ++iter)
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "%T: type = %T, start = %d, end = %d\n",
			       (*iter).first, TREE_TYPE ((*iter).first),
			       (*iter).second.first, (*iter).second.second);
	}
    }
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Biggest mode = %s\n",
		     GET_MODE_NAME (biggest_mode));
  return biggest_mode;
}

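/* A hypothetical walk-through of the rules above (not a real testcase):

     point 1: _5 = _4 + 1;
     point 2: _6 = _5 * _5;
     point 3: _7 = _6 - _4;

   _5 gets the range (1, 1) at its definition and grows to (1, 2) at its
   use; _6 gets (2, 2) and grows to (2, 3); _4, which has no definition
   inside the block, is treated like a splat and gets the whole-block
   range (0, 3).  */
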
/* Compute the number of V_REGs that a variable of MODE occupies, given
   BIGGEST_MODE and LMUL, scaled by the number of rgroups.

   E.g. if mode = SImode, biggest_mode = DImode and LMUL = 4, the SImode
   variable is vectorized with half the LMUL of the DImode one, so it
   occupies 2 V_REGs.  */
static unsigned int
compute_nregs_for_mode (loop_vec_info loop_vinfo, machine_mode mode,
			machine_mode biggest_mode, int lmul)
{
  unsigned int rgroup_size = LOOP_VINFO_LENS (loop_vinfo).is_empty ()
			       ? 1
			       : LOOP_VINFO_LENS (loop_vinfo).length ();
  unsigned int mode_size = GET_MODE_SIZE (mode).to_constant ();
  unsigned int biggest_size = GET_MODE_SIZE (biggest_mode).to_constant ();
  gcc_assert (biggest_size >= mode_size);
  unsigned int ratio = biggest_size / mode_size;
  return MAX (lmul / ratio, 1) * rgroup_size;
}

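/* Worked example of the formula above (hypothetical values): for
   mode = SImode (4 bytes), biggest_mode = DImode (8 bytes), lmul = 4 and
   a single rgroup:

     ratio = 8 / 4 = 2
     nregs = MAX (4 / 2, 1) * 1 = 2

   i.e. each live SImode variable is assumed to occupy 2 V_REGs.  With two
   rgroups the result doubles to 4.  */
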
/* This function helps to determine whether the current LMUL will cause
   potential vector register (V_REG) spills according to live range
   information.

     - First, compute how many variables are live at each program point
       in each bb of the loop.
     - Second, compute how many V_REGs are live at each program point
       in each bb of the loop according to BIGGEST_MODE and each
       variable's mode.
     - Third, return the maximum number of live V_REGs in the loop.  */
static unsigned int
max_number_of_live_regs (loop_vec_info loop_vinfo, const basic_block bb,
			 const hash_map<tree, pair> &live_ranges,
			 unsigned int max_point, machine_mode biggest_mode,
			 int lmul)
{
  unsigned int max_nregs = 0;
  unsigned int i;
  unsigned int live_point = 0;
  auto_vec<unsigned int> live_vars_vec;
  live_vars_vec.safe_grow_cleared (max_point, true);
  for (hash_map<tree, pair>::iterator iter = live_ranges.begin ();
       iter != live_ranges.end (); ++iter)
    {
      tree var = (*iter).first;
      pair live_range = (*iter).second;
      for (i = live_range.first + 1; i <= live_range.second; i++)
	{
	  machine_mode mode = TYPE_MODE (TREE_TYPE (var));
	  unsigned int nregs
	    = compute_nregs_for_mode (loop_vinfo, mode, biggest_mode, lmul);
	  live_vars_vec[i] += nregs;
	  if (live_vars_vec[i] > max_nregs)
	    {
	      max_nregs = live_vars_vec[i];
	      live_point = i;
	    }
	}
    }

  /* Account for explicit user-written RVV types.  */
  auto_vec<basic_block> all_preds
    = get_all_dominated_blocks (CDI_POST_DOMINATORS, bb);
  tree t;
  FOR_EACH_SSA_NAME (i, t, cfun)
    {
      machine_mode mode = TYPE_MODE (TREE_TYPE (t));
      if (!lookup_vector_type_attribute (TREE_TYPE (t))
	  && !riscv_v_ext_vls_mode_p (mode))
	continue;

      gimple *def = SSA_NAME_DEF_STMT (t);
      if (gimple_bb (def) && !all_preds.contains (gimple_bb (def)))
	continue;
      use_operand_p use_p;
      imm_use_iterator iterator;

      FOR_EACH_IMM_USE_FAST (use_p, iterator, t)
	{
	  if (!USE_STMT (use_p) || is_gimple_debug (USE_STMT (use_p))
	      || !dominated_by_p (CDI_POST_DOMINATORS, bb,
				  gimple_bb (USE_STMT (use_p))))
	    continue;

	  int regno_alignment = riscv_get_v_regno_alignment (mode);
	  max_nregs += regno_alignment;
	  if (dump_enabled_p ())
	    dump_printf_loc (
	      MSG_NOTE, vect_location,
	      "Explicit used SSA %T, vectype = %T, mode = %s, cause %d "
	      "V_REG live in bb %d at program point %d\n",
	      t, TREE_TYPE (t), GET_MODE_NAME (mode), regno_alignment,
	      bb->index, live_point);
	  break;
	}
    }

  if (dump_enabled_p ())
    dump_printf_loc (
      MSG_NOTE, vect_location,
      "Maximum lmul = %d, At most %d number of live V_REG at program "
      "point %d for bb %d\n",
      lmul, max_nregs, live_point, bb->index);
  return max_nregs;
}

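/* A small worked accumulation example (hypothetical ranges): suppose bb 2
   has two live ranges, _1: (1, 3) and _2: (2, 4), and each variable costs
   2 V_REGs at the chosen LMUL.  The per-point counts are:

     point 2: _1 live         -> 2 V_REGs
     point 3: _1 and _2 live  -> 4 V_REGs
     point 4: _2 live         -> 2 V_REGs

   so max_nregs = 4 at live_point = 3.  (Counting starts at first + 1,
   which treats a definition as not overlapping a range that ends at the
   same point.)  */
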
/* Return the value stored by a STORE statement.  */
static tree
get_store_value (gimple *stmt)
{
  if (is_gimple_call (stmt) && gimple_call_internal_p (stmt))
    {
      if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
	return gimple_call_arg (stmt, 3);
      else
	gcc_unreachable ();
    }
  else
    return gimple_assign_rhs1 (stmt);
}

/* Return true if STMT_INFO is a non-contiguous load/store.  */
static bool
non_contiguous_memory_access_p (stmt_vec_info stmt_info)
{
  enum stmt_vec_info_type type
    = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
  return ((type == load_vec_info_type || type == store_vec_info_type)
	  && !adjacent_dr_p (STMT_VINFO_DATA_REF (stmt_info)));
}

/* Return the LMUL of the current analysis.  */
static int
compute_estimated_lmul (loop_vec_info loop_vinfo, machine_mode mode)
{
  gcc_assert (GET_MODE_BITSIZE (mode).is_constant ());
  int regno_alignment = riscv_get_v_regno_alignment (loop_vinfo->vector_mode);
  if (riscv_v_ext_vls_mode_p (loop_vinfo->vector_mode))
    return regno_alignment;
  else if (known_eq (LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo), 1U))
    {
      int estimated_vf = vect_vf_for_cost (loop_vinfo);
      return estimated_vf * GET_MODE_BITSIZE (mode).to_constant ()
	     / TARGET_MIN_VLEN;
    }
  else
    {
      /* Estimate the VLA SLP LMUL.  */
      if (regno_alignment > RVV_M1)
	return regno_alignment;
      else if (mode != QImode
	       || LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo).is_constant ())
	{
	  int ratio;
	  if (can_div_trunc_p (BYTES_PER_RISCV_VECTOR,
			       GET_MODE_SIZE (loop_vinfo->vector_mode),
			       &ratio))
	    {
	      if (ratio == 1)
		return RVV_M4;
	      else if (ratio == 2)
		return RVV_M2;
	    }
	}
    }
  return 0;
}

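/* Worked example of the non-SLP estimate above (hypothetical numbers):
   with TARGET_MIN_VLEN = 128, a biggest mode of DImode (64 bits) and an
   estimated VF of 8:

     lmul = 8 * 64 / 128 = 4

   so the register pressure check runs as if the loop operated at
   LMUL = 4.  */
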
/* Update the live ranges according to PHIs.

   Loop:
     bb 2:
       STMT 1              -- point 0
       STMT 2 (def SSA 1)  -- point 1
       STMT 3 (use SSA 1)  -- point 2
       STMT 4              -- point 3
     bb 3:
       SSA 2 = PHI<SSA 1>
       STMT 1              -- point 0
       STMT 2              -- point 1
       STMT 3 (use SSA 2)  -- point 2
       STMT 4              -- point 3

   Before this function, the live range of SSA 1 is [2, 3] in bb 2
   and that of SSA 2 is [0, 3] in bb 3.

   Then, after this function, we update the live range of SSA 1 in bb 2
   to [2, 4] since SSA 1 is live out into bb 3.  */
static void
update_local_live_ranges (
  vec_info *vinfo,
  hash_map<basic_block, vec<stmt_point>> &program_points_per_bb,
  hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb,
  machine_mode *biggest_mode)
{
  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
  if (!loop_vinfo)
    return;

  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned int nbbs = loop->num_nodes;
  unsigned int i, j;
  gphi_iterator psi;
  gimple_stmt_iterator si;
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Update local program points for bb %d:\n",
			 bbs[i]->index);
      for (psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next (&psi))
	{
	  gphi *phi = psi.phi ();
	  stmt_vec_info stmt_info = vinfo->lookup_stmt (phi);
	  if (STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info))
	      == undef_vec_info_type)
	    continue;

	  for (j = 0; j < gimple_phi_num_args (phi); j++)
	    {
	      edge e = gimple_phi_arg_edge (phi, j);
	      tree def = gimple_phi_arg_def (phi, j);
	      auto *live_ranges = live_ranges_per_bb.get (bb);
	      auto *live_range = live_ranges->get (def);
	      if (poly_int_tree_p (def))
		{
		  /* Insert a live range for an INTEGER_CST or POLY_CST
		     since we will need to allocate a vector register
		     for it.

		     E.g. # j_17 = PHI <j_11(9), 0(5)> will be transformed
		     into # vect_vec_iv_.8_24 = PHI <_25(9), { 0, ... }(5)>

		     The live range for such a value is short and only
		     lives from program point 0 to 1.  */
		  if (live_range)
		    {
		      unsigned int start = (*live_range).first;
		      (*live_range).first = 0;
		      if (dump_enabled_p ())
			dump_printf_loc (
			  MSG_NOTE, vect_location,
			  "Update %T start point from %d to 0:\n", def,
			  start);
		    }
		  else
		    {
		      live_ranges->put (def, pair (0, 1));
		      auto &program_points
			= (*program_points_per_bb.get (bb));
		      if (program_points.is_empty ())
			{
			  stmt_point info = {1, phi, stmt_info};
			  program_points.safe_push (info);
			}
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_NOTE, vect_location,
					 "Add %T start point from 0 to 1:\n",
					 def);
		    }
		  continue;
		}
	      if (live_range && flow_bb_inside_loop_p (loop, e->src))
		{
		  unsigned int start = (*live_range).first;
		  (*live_range).first = 0;
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Update %T start point from %d to %d:\n",
				     def, start, (*live_range).first);
		}
	      live_ranges = live_ranges_per_bb.get (e->src);
	      if (!program_points_per_bb.get (e->src))
		continue;
	      unsigned int max_point
		= (*program_points_per_bb.get (e->src)).length ();
	      live_range = live_ranges->get (def);
	      if (!live_range)
		continue;

	      unsigned int end = (*live_range).second;
	      (*live_range).second = max_point;
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Update %T end point from %d to %d:\n", def,
				 end, (*live_range).second);
	    }
	}
      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  if (!is_gimple_assign_or_call (gsi_stmt (si)))
	    continue;
	  stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
	  enum stmt_vec_info_type type
	    = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
	  if (non_contiguous_memory_access_p (stmt_info)
	      /* LOAD_LANES/STORE_LANES don't need a permute index.  */
	      && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)
		   != VMAT_LOAD_STORE_LANES)
	    {
	      /* For a non-adjacent load/store STMT, we will potentially
		 convert it into:

		   1. MASK_LEN_GATHER_LOAD (..., perm index).
		   2. Contiguous load/store + VEC_PERM (..., perm index)

		 so we will likely be using one more vector variable.  */
	      unsigned int max_point
		= (*program_points_per_bb.get (bb)).length () - 1;
	      auto *live_ranges = live_ranges_per_bb.get (bb);
	      bool existed_p = false;
	      tree var = type == load_vec_info_type
			   ? gimple_get_lhs (gsi_stmt (si))
			   : get_store_value (gsi_stmt (si));
	      tree sel_type = build_nonstandard_integer_type (
		TYPE_PRECISION (TREE_TYPE (var)), 1);
	      *biggest_mode
		= get_biggest_mode (*biggest_mode, TYPE_MODE (sel_type));
	      tree sel = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				     get_identifier ("vect_perm"), sel_type);
	      pair &live_range = live_ranges->get_or_insert (sel, &existed_p);
	      gcc_assert (!existed_p);
	      live_range = pair (0, max_point);
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Add perm indice %T, start = 0, end = %d\n",
				 sel, max_point);
	      if (!LOOP_VINFO_LENS (loop_vinfo).is_empty ()
		  && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
		{
		  /* If we are vectorizing a permutation when the rgroup
		     number > 1, we will need an additional mask to shuffle
		     the second vector.  */
		  tree mask = build_decl (UNKNOWN_LOCATION, VAR_DECL,
					  get_identifier ("vect_perm_mask"),
					  boolean_type_node);
		  pair &live_range
		    = live_ranges->get_or_insert (mask, &existed_p);
		  gcc_assert (!existed_p);
		  live_range = pair (0, max_point);
		  if (dump_enabled_p ())
		    dump_printf_loc (
		      MSG_NOTE, vect_location,
		      "Add perm mask %T, start = 0, end = %d\n",
		      mask, max_point);
		}
	    }
	}
    }
}

/* Return true if the current loop is likely to cause V_REG spills, based
   on the maximum number of live V_REGs.  */
static bool
has_unexpected_spills_p (loop_vec_info loop_vinfo)
{
  /* Compute local program points.
     It's a fast and effective computation.  */
  hash_map<basic_block, vec<stmt_point>> program_points_per_bb;
  compute_local_program_points (loop_vinfo, program_points_per_bb);

  /* Compute local live ranges.  */
  hash_map<basic_block, hash_map<tree, pair>> live_ranges_per_bb;
  machine_mode biggest_mode
    = compute_local_live_ranges (program_points_per_bb, live_ranges_per_bb);

  /* Update live ranges according to PHIs.  */
  update_local_live_ranges (loop_vinfo, program_points_per_bb,
			    live_ranges_per_bb, &biggest_mode);

  int lmul = compute_estimated_lmul (loop_vinfo, biggest_mode);
  /* TODO: We calculate the maximum live vars based on the current STMT
     sequence.  We could support live range shrinking in the future if it
     turns out to give a big improvement.  */
  if (lmul > RVV_M1)
    {
      if (!live_ranges_per_bb.is_empty ())
	{
	  unsigned int max_nregs = 0;
	  for (hash_map<basic_block, hash_map<tree, pair>>::iterator iter
	       = live_ranges_per_bb.begin ();
	       iter != live_ranges_per_bb.end (); ++iter)
	    {
	      basic_block bb = (*iter).first;
	      unsigned int max_point
		= (*program_points_per_bb.get (bb)).length () + 1;
	      if ((*iter).second.is_empty ())
		continue;
	      /* We prefer a larger LMUL unless it causes register
		 spills.  */
	      unsigned int nregs
		= max_number_of_live_regs (loop_vinfo, bb, (*iter).second,
					   max_point, biggest_mode, lmul);
	      if (nregs > max_nregs)
		max_nregs = nregs;
	    }
	  live_ranges_per_bb.empty ();
	  if (max_nregs > V_REG_NUM)
	    return true;
	}
    }
  if (!program_points_per_bb.is_empty ())
    {
      for (hash_map<basic_block, vec<stmt_point>>::iterator iter
	   = program_points_per_bb.begin ();
	   iter != program_points_per_bb.end (); ++iter)
	{
	  vec<stmt_point> program_points = (*iter).second;
	  if (!program_points.is_empty ())
	    program_points.release ();
	}
      program_points_per_bb.empty ();
    }
  return false;
}

costs::costs (vec_info *vinfo, bool costing_for_scalar)
  : vector_costs (vinfo, costing_for_scalar)
{
  if (costing_for_scalar)
    m_cost_type = SCALAR_COST;
  else if (riscv_v_ext_vector_mode_p (vinfo->vector_mode))
    m_cost_type = VLA_VECTOR_COST;
  else
    m_cost_type = VLS_VECTOR_COST;
}

/* Do one-time initialization of the costs given that we're
   costing the loop vectorization described by LOOP_VINFO.  */
void
costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
{
  /* Record the number of times that the vector loop would execute,
     if known.  */
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  auto scalar_niters = max_stmt_executions_int (loop);
  if (scalar_niters >= 0)
    {
      unsigned int vf = vect_vf_for_cost (loop_vinfo);
      if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
	m_num_vector_iterations = scalar_niters / vf;
      else
	m_num_vector_iterations = CEIL (scalar_niters, vf);
    }

  /* Detect whether we're vectorizing for VLA and should apply the unrolling
     heuristic described above m_unrolled_vls_niters.  */
  record_potential_vls_unrolling (loop_vinfo);

  /* Detect whether the LOOP has unexpected spills.  */
  record_potential_unexpected_spills (loop_vinfo);
}

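/* For example (hypothetical numbers): with scalar_niters = 100 and
   vf = 8, a loop without length control (LOOP_VINFO_LENS empty) records
   100 / 8 = 12 vector iterations, while a length-controlled loop records
   CEIL (100, 8) = 13, since its last iteration runs with a shorter
   length.  */
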
/* Analyze the vectorized program statements and use the dynamic LMUL
   heuristic to detect whether the loop has unexpected spills.  */
void
costs::record_potential_unexpected_spills (loop_vec_info loop_vinfo)
{
  /* We only want to apply the heuristic if LOOP_VINFO is being vectorized
     for VLA, or for VLS with known NITERS.  */
  if (riscv_autovec_lmul == RVV_DYNAMIC
      && (m_cost_type == VLA_VECTOR_COST
	  || (m_cost_type == VLS_VECTOR_COST
	      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))))
    {
      bool post_dom_available_p = dom_info_available_p (CDI_POST_DOMINATORS);
      if (!post_dom_available_p)
	calculate_dominance_info (CDI_POST_DOMINATORS);
      m_has_unexpected_spills_p = has_unexpected_spills_p (loop_vinfo);
      if (!post_dom_available_p)
	free_dominance_info (CDI_POST_DOMINATORS);
    }
}

/* Decide whether to use the unrolling heuristic described above
   m_unrolled_vls_niters, updating that field if so.  LOOP_VINFO
   describes the loop that we're vectorizing.  */
void
costs::record_potential_vls_unrolling (loop_vec_info loop_vinfo)
{
  /* We only want to apply the heuristic if LOOP_VINFO is being
     vectorized for VLA.  */
  if (m_cost_type != VLA_VECTOR_COST)
    return;

  /* We don't want to apply the heuristic to outer loops, since it's
     harder to track two levels of unrolling.  */
  if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
    return;

  /* Only handle cases in which the number of VLS iterations
     would be known at compile time but the number of VLA iterations
     would not.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || BYTES_PER_RISCV_VECTOR.is_constant ())
    return;

  /* Guess how many times the VLS loop would iterate and make
     sure that it is within the complete unrolling limit.  Even if the
     number of iterations is small enough, the number of statements might
     not be, which is why we need to estimate the number of statements
     too.  */
  unsigned int vls_vf = vect_vf_for_cost (loop_vinfo);
  unsigned HOST_WIDE_INT unrolled_vls_niters
    = LOOP_VINFO_INT_NITERS (loop_vinfo) / vls_vf;
  if (unrolled_vls_niters > (unsigned int) param_max_completely_peel_times)
    return;

  /* Record that we're applying the heuristic and should try to estimate
     the number of statements in the VLS loop.  */
  m_unrolled_vls_niters = unrolled_vls_niters;
}

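/* Hypothetical example of the check above: with known NITERS = 64 and a
   VLS VF of 16, the fully unrolled VLS loop would run 64 / 16 = 4
   iterations, which is accepted if 4 <= param_max_completely_peel_times;
   the statement-count limit is checked later in prefer_unrolled_loop.  */
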
/* Return true if (a) we're applying the VLS vs. VLA unrolling
   heuristic described above m_unrolled_vls_niters and (b) the heuristic
   says that we should prefer the VLS loop.  */
bool
costs::prefer_unrolled_loop () const
{
  if (!m_unrolled_vls_stmts)
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Number of insns in"
		     " unrolled VLS loop = " HOST_WIDE_INT_PRINT_UNSIGNED "\n",
		     m_unrolled_vls_stmts);

  /* The balance here is tricky.  On the one hand, we can't be sure whether
     the code is vectorizable with VLS or not.  However, even if
     it isn't vectorizable with VLS, there's a possibility that
     the scalar code could also be unrolled.  Some of the code might then
     benefit from SLP, or from using paired memory access instructions.
     We therefore apply the heuristic regardless of can_use_vls_p.  */
  return (m_unrolled_vls_stmts
	  && (m_unrolled_vls_stmts
	      <= (unsigned int) param_max_completely_peeled_insns));
}

bool
costs::better_main_loop_than_p (const vector_costs *uncast_other) const
{
  auto other = static_cast<const costs *> (uncast_other);
  auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
  auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
		     GET_MODE_NAME (this_loop_vinfo->vector_mode),
		     vect_vf_for_cost (this_loop_vinfo),
		     GET_MODE_NAME (other_loop_vinfo->vector_mode),
		     vect_vf_for_cost (other_loop_vinfo));

  /* Apply the unrolling heuristic described above m_unrolled_vls_niters.  */
  if (bool (m_unrolled_vls_stmts) != bool (other->m_unrolled_vls_stmts))
    {
      bool this_prefer_unrolled = this->prefer_unrolled_loop ();
      bool other_prefer_unrolled = other->prefer_unrolled_loop ();
      if (this_prefer_unrolled != other_prefer_unrolled)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Preferring VLS loop because"
			     " it can be unrolled\n");
	  return other_prefer_unrolled;
	}
    }
  else if (riscv_autovec_lmul == RVV_DYNAMIC)
    {
      if (other->m_has_unexpected_spills_p)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Preferring smaller LMUL loop because"
			     " it has unexpected spills\n");
	  return true;
	}
      else if (riscv_v_ext_vector_mode_p (other_loop_vinfo->vector_mode))
	{
	  if (LOOP_VINFO_NITERS_KNOWN_P (other_loop_vinfo))
	    {
	      if (maybe_gt (LOOP_VINFO_INT_NITERS (this_loop_vinfo),
			    LOOP_VINFO_VECT_FACTOR (this_loop_vinfo)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Keep current LMUL loop because"
				     " known NITERS exceed the new VF\n");
		  return false;
		}
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Keep current LMUL loop because"
				 " it is unknown NITERS\n");
	      return false;
	    }
	}
    }

  return vector_costs::better_main_loop_than_p (other);
}

unsigned
costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
		      stmt_vec_info stmt_info, slp_tree, tree vectype,
		      int misalign, vect_cost_model_location where)
{
  int stmt_cost
    = targetm.vectorize.builtin_vectorization_cost (kind, vectype, misalign);

  /* Do one-time initialization based on the vinfo.  */
  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
  if (!m_analyzed_vinfo)
    {
      if (loop_vinfo)
	analyze_loop_vinfo (loop_vinfo);

      m_analyzed_vinfo = true;
    }

  if (stmt_info)
    {
      /* If we're applying the VLA vs. VLS unrolling heuristic,
	 estimate the number of statements in the unrolled VLS
	 loop.  For simplicity, we assume that one iteration of the
	 VLS loop would need the same number of statements
	 as one iteration of the VLA loop.  */
      if (where == vect_body && m_unrolled_vls_niters)
	m_unrolled_vls_stmts += count * m_unrolled_vls_niters;
    }

  return record_stmt_cost (stmt_info, where, count * stmt_cost);
}

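/* Hypothetical illustration of the accumulation above: if
   m_unrolled_vls_niters is 4 and a vect_body statement is costed with
   count = 2, m_unrolled_vls_stmts grows by 2 * 4 = 8, and the total is
   later compared against param_max_completely_peeled_insns in
   prefer_unrolled_loop.  */
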
void
costs::finish_cost (const vector_costs *scalar_costs)
{
  vector_costs::finish_cost (scalar_costs);
}

} // namespace riscv_vector