From: Jeff Law Date: Sat, 1 Nov 2025 22:27:05 +0000 (-0600) Subject: [RISC-V] Reorder ready queue slightly to avoid unnecessary vsetvl instructions X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=r16-4924-g63632889651f31;p=thirdparty%2Fgcc.git [RISC-V] Reorder ready queue slightly to avoid unnecessary vsetvl instructions As I've touched on before, particularly in the patchwork meeting, we can get a modest reduction in the number of vsetvl instructions we emit by being somewhat smarter in how we pull instructions out of the ready queue during scheduling. Each insn in the scheduler's ready queue has a priority which reflects the how that insn plays in a region's critical path. The higher the priority, the more important it is for that instruction to issue. When we have multiple insns with the same priority in the ready queue, we can roughly expect that issuing any insn from that set is equally good. Yes there are secondary sort keys that incorporate register lifetime and such, but those are just that -- secondary concerns. Given some set of insns with the same priority, we can select whichever one we want, so select the insn with the same vector configuration as whatever vector instruction was last issued from the ready queue. This will naturally tend to group vector instructions with the same vector configuration together, thus reducing the ping-ponging of vector configurations that we sometimes see. When I initially cobbled this together (about a year ago) Robin reported low single digit improvements on the BPI for x264. A lot has changed since then and it may not be as big a win now, but I think it still has value. This did expose that one of the move patterns in vector.md didn't have the proper vl_op/vtype_op attributes on it. Trivially fixed. Tested for riscv32-elf, riscv64-elf and on the Pioneer with no regressions (of course the Pioneer won't really exercise this code). BPI is in flight, but not due to complete for ~24hrs. We've also been running this internally for roughly a year 🙂 * config/riscv/riscv-protos.h (has_vtype_op): Add prototype. (mask_agnostic_p, get_avl, vsetvl_insn_p): Likewise. * config/riscv/riscv-vsetvl.cc (has_vtype_op): No longer static. (vsetvl_insn_p, get_avl_mask_agnostic_p): Likewise. * config/riscv/riscv.cc (struct last_vcofnig): New structure. (clear_vconfig): New function. (compatible_with_last_vconfig, riscv_sched_init): Likewise. (riscv_sched_reorder): Likewise. (TARGET_SCHED_INIT, TARGET_SCHED_REORDER): Define. * config/riscv/vector.md ("*mov"): Set has_vtype_op, has_vl_op attributes. --- diff --git a/gcc/.simplify-rtx.cc.swo b/gcc/.simplify-rtx.cc.swo new file mode 100644 index 00000000000..c2a8e8151b7 Binary files /dev/null and b/gcc/.simplify-rtx.cc.swo differ diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index cdb706ab82a..570acb14f58 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -209,6 +209,11 @@ rtl_opt_pass * make_pass_insert_landing_pad (gcc::context *ctxt); rtl_opt_pass * make_pass_vector_permconst (gcc::context *ctxt); rtl_opt_pass * make_pass_bclr_lowest_set_bit (gcc::context *ctxt); +/* Routines implemented in riscv-vsetvl.cc. */ +extern bool has_vtype_op (rtx_insn *); +extern bool mask_agnostic_p (rtx_insn *); +extern rtx get_avl (rtx_insn *); +extern bool vsetvl_insn_p (rtx_insn *); /* Routines implemented in riscv-string.c. */ extern bool riscv_expand_block_compare (rtx, rtx, rtx, rtx); diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 3586d0cdcc2..580ac9cbe8e 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -258,7 +258,7 @@ policy_to_str (bool agnostic_p) /* Return true if it is an RVV instruction depends on VTYPE global status register. */ -static bool +bool has_vtype_op (rtx_insn *rinsn) { return recog_memoized (rinsn) >= 0 && get_attr_has_vtype_op (rinsn); @@ -306,7 +306,7 @@ vector_config_insn_p (rtx_insn *rinsn) } /* Return true if it is vsetvldi or vsetvlsi. */ -static bool +bool vsetvl_insn_p (rtx_insn *rinsn) { if (!rinsn || !vector_config_insn_p (rinsn)) @@ -386,7 +386,7 @@ get_vl (rtx_insn *rinsn) } /* Helper function to get AVL operand. */ -static rtx +rtx get_avl (rtx_insn *rinsn) { if (vsetvl_insn_p (rinsn) || vsetvl_discard_result_insn_p (rinsn)) @@ -411,7 +411,7 @@ get_default_ma () } /* Helper function to get MA operand. */ -static bool +bool mask_agnostic_p (rtx_insn *rinsn) { /* If it doesn't have MA, we return agnostic by default. */ diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 63404d3d514..e978f9209ed 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -10576,6 +10576,71 @@ riscv_issue_rate (void) return tune_param->issue_rate; } +/* Structure for very basic vector configuration tracking in the scheduler. */ +struct last_vconfig +{ + bool valid; + bool ta; + bool ma; + uint8_t sew; + uint8_t vlmul; + rtx avl; +} last_vconfig; + +/* Clear LAST_VCONFIG so we have no known state. */ +static void +clear_vconfig (void) +{ + memset (&last_vconfig, 0, sizeof (last_vconfig)); +} + +/* Return TRUE if INSN is a vector insn needing a particular + vector configuration that is trivially equal to the last + vector insn issued. Return FALSE otherwise. */ +static bool +compatible_with_last_vconfig (rtx_insn *insn) +{ + /* We might be able to extract the data from a preexisting vsetvl. */ + if (vsetvl_insn_p (insn)) + return false; + + /* Nothing to do for these cases. */ + if (!NONDEBUG_INSN_P (insn) || !has_vtype_op (insn)) + return false; + + extract_insn_cached (insn); + + rtx avl = get_avl (insn); + if (avl != last_vconfig.avl) + return false; + + if (get_sew (insn) != last_vconfig.sew) + return false; + + if (get_vlmul (insn) != last_vconfig.vlmul) + return false; + + if (tail_agnostic_p (insn) != last_vconfig.ta) + return false; + + if (mask_agnostic_p (insn) != last_vconfig.ma) + return false; + + /* No differences found, they're trivially compatible. */ + return true; +} + +/* Implement TARGET_SCHED_INIT, we use this to track the vector configuration + of the last issued vector instruction. We can then use that information + to potentially adjust the ready queue to issue instructions of a compatible + vector configuration instead of a conflicting configuration. That will + reduce the number of vsetvl instructions we ultimately emit. */ +static void +riscv_sched_init (FILE *, int, int) +{ + clear_vconfig (); +} + /* Implement TARGET_SCHED_VARIABLE_ISSUE. */ static int riscv_sched_variable_issue (FILE *, int, rtx_insn *insn, int more) @@ -10600,9 +10665,88 @@ riscv_sched_variable_issue (FILE *, int, rtx_insn *insn, int more) an assert so we can find and fix this problem. */ gcc_assert (insn_has_dfa_reservation_p (insn)); + /* If this is a vector insn with vl/vtype info, then record the last + vector configuration. */ + if (vsetvl_insn_p (insn)) + clear_vconfig (); + else if (NONDEBUG_INSN_P (insn) && has_vtype_op (insn)) + { + extract_insn_cached (insn); + + rtx avl = get_avl (insn); + if (avl == RVV_VLMAX) + avl = const0_rtx; + + if (!avl || !CONST_INT_P (avl)) + clear_vconfig (); + else + { + last_vconfig.valid = true; + last_vconfig.avl = avl; + last_vconfig.sew = get_sew (insn); + last_vconfig.vlmul = get_vlmul (insn); + last_vconfig.ta = tail_agnostic_p (insn); + last_vconfig.ma = mask_agnostic_p (insn); + } + } + return more - 1; } +/* Implement TARGET_SCHED_REORDER. The goal here is to look at the ready + queue and reorder it ever so slightly to encourage issing an insn with + the same vector configuration as the most recently issued vector + instruction. That will reduce vsetvl instructions. */ +static int +riscv_sched_reorder (FILE *, int, rtx_insn **ready, int *nreadyp, int) +{ + /* If we don't have a valid prior vector configuration, then there is + no point in reordering the ready queue, similarly if there is + just one entry in the queue. */ + if (!last_vconfig.valid || *nreadyp == 1) + return riscv_issue_rate (); + + return riscv_issue_rate (); + int nready = *nreadyp; + int priority = INSN_PRIORITY (ready[nready - 1]); + for (int i = nready - 1; i >= 0; i--) + { + rtx_insn *insn = ready[i]; + + /* On a high performance core, vsetvl instructions should be + inexpensive. Removing them is very much a secondary concern, so + be extremely conservative with reordering, essentially only + allowing reordering within the highest priority value. + + Lower end cores may benefit from more flexibility here. That + tuning is left to those who understand their core's behavior + and can thoroughly benchmark the result. Assuming such + designs appear, we can probably put an entry in the tuning + structure to indicate how much difference in priority to allow. */ + if (INSN_PRIORITY (insn) < priority) + break; + + if (compatible_with_last_vconfig (insn)) + { + /* This entry is compatible with the last vconfig and has + the same priority as the most important insn. So swap + it so that we keep the vector configuration as-is and + ultimately eliminate a vsetvl. + + Note no need to swap if this is the first entry in the + queue. */ + if (i == nready - 1) + break; + + std::swap (ready[i], ready[nready - 1]); + break; + } + } + + return riscv_issue_rate (); +} + + /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports instruction fusion of some sort. */ @@ -16011,9 +16155,15 @@ riscv_prefetch_offset_address_p (rtx x, machine_mode mode) #undef TARGET_SCHED_MACRO_FUSION_PAIR_P #define TARGET_SCHED_MACRO_FUSION_PAIR_P riscv_macro_fusion_pair_p +#undef TARGET_SCHED_INIT +#define TARGET_SCHED_INIT riscv_sched_init + #undef TARGET_SCHED_VARIABLE_ISSUE #define TARGET_SCHED_VARIABLE_ISSUE riscv_sched_variable_issue +#undef TARGET_SCHED_REORDER +#define TARGET_SCHED_REORDER riscv_sched_reorder + #undef TARGET_SCHED_ADJUST_COST #define TARGET_SCHED_ADJUST_COST riscv_sched_adjust_cost diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index 3cb87bf4eae..9d34725a757 100644 --- a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -1437,6 +1437,8 @@ [(set_attr "type" "vlde,vste,vmov") (set_attr "mode" "") (set (attr "merge_op_idx") (const_int INVALID_ATTRIBUTE)) + (set (attr "has_vl_op") (const_string "false")) + (set (attr "has_vtype_op") (const_string "false")) (set (attr "avl_type_idx") (const_int INVALID_ATTRIBUTE)) (set (attr "mode_idx") (const_int INVALID_ATTRIBUTE))] )