git.ipfire.org Git - thirdparty/iproute2.git/commitdiff
devlink: Introduce burst period for health reporter
author Shahar Shitrit <shshitrit@nvidia.com>
Wed, 8 Oct 2025 06:57:18 +0000 (09:57 +0300)
committer David Ahern <dsahern@kernel.org>
Thu, 16 Oct 2025 15:26:34 +0000 (09:26 -0600)
Add a new devlink health set option to configure the health
reporter’s burst period. The burst period defines a time window
during which recovery attempts for reported errors are allowed.
Once this period expires, the configured grace period begins.

This feature addresses cases where multiple errors occur
simultaneously due to a common root cause. Without a burst period,
the grace period starts immediately after the first error recovery
attempt finishes. This means that only the first error might be
recovered, while subsequent errors are blocked during the grace period.
With the burst period, the reporter initiates a recovery attempt for
every error reported within this time window before the grace period
starts.

Example:
$ devlink health set pci/0000:00:09.0 reporter tx burst_period 500

Signed-off-by: Shahar Shitrit <shshitrit@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: David Ahern <dsahern@kernel.org>
bash-completion/devlink
devlink/devlink.c
man/man8/devlink-health.8

index 52dc82b37ca5a476515046d67f79623541f3764b..c053d3d08009fb1e97b1b5ac7626b72fdc54b27e 100644 (file)
@@ -792,12 +792,12 @@ _devlink_health()
             if [[ $command == "set" ]]; then
                 case $cword in
                     6)
-                        COMPREPLY=( $( compgen -W "grace_period auto_recover" \
+                        COMPREPLY=( $( compgen -W "grace_period burst_period auto_recover" \
                                    -- "$cur" ) )
                         ;;
                     7)
                         case $prev in
-                            grace_period)
+                            grace_period|burst_period)
                                 # Integer argument- msec
                                 ;;
                             auto_recover)
index 171b85327be34b9d90dc3c9c49cc98f8f34e61af..f77b4449e8c5fcd70b9e6e46c8b5d0ade0079292 100644 (file)
@@ -311,6 +311,7 @@ static int ifname_map_update(struct ifname_map *ifname_map, const char *ifname)
 #define DL_OPT_PORT_FN_CAPS    BIT(57)
 #define DL_OPT_PORT_FN_MAX_IO_EQS      BIT(58)
 #define DL_OPT_PORT_FN_RATE_TC_BWS     BIT(59)
+#define DL_OPT_HEALTH_REPORTER_BURST_PERIOD    BIT(60)
 
 struct dl_opts {
        uint64_t present; /* flags of present items */
@@ -346,6 +347,7 @@ struct dl_opts {
        const char *flash_component;
        const char *reporter_name;
        __u64 reporter_graceful_period;
+       __u64 reporter_burst_period;
        bool reporter_auto_recover;
        bool reporter_auto_dump;
        const char *trap_name;
@@ -697,6 +699,7 @@ static const enum mnl_attr_data_type devlink_policy[DEVLINK_ATTR_MAX + 1] = {
        [DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT] = MNL_TYPE_U64,
        [DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS] = MNL_TYPE_U64,
        [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = MNL_TYPE_U64,
+       [DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD] = MNL_TYPE_U64,
        [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = MNL_TYPE_STRING,
        [DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG] = MNL_TYPE_STRING,
        [DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE] = MNL_TYPE_U64,
@@ -2101,6 +2104,13 @@ static int dl_argv_parse(struct dl *dl, uint64_t o_required,
                        if (err)
                                return err;
                        o_found |= DL_OPT_HEALTH_REPORTER_GRACEFUL_PERIOD;
+               } else if (dl_argv_match(dl, "burst_period") &&
+                          (o_all & DL_OPT_HEALTH_REPORTER_BURST_PERIOD)) {
+                       dl_arg_inc(dl);
+                       err = dl_argv_uint64_t(dl, &opts->reporter_burst_period);
+                       if (err)
+                               return err;
+                       o_found |= DL_OPT_HEALTH_REPORTER_BURST_PERIOD;
                } else if (dl_argv_match(dl, "auto_recover") &&
                        (o_all & DL_OPT_HEALTH_REPORTER_AUTO_RECOVER)) {
                        dl_arg_inc(dl);
@@ -2701,6 +2711,10 @@ static void dl_opts_put(struct nlmsghdr *nlh, struct dl *dl)
                mnl_attr_put_u64(nlh,
                                 DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,
                                 opts->reporter_graceful_period);
+       if (opts->present & DL_OPT_HEALTH_REPORTER_BURST_PERIOD)
+               mnl_attr_put_u64(nlh,
+                                DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD,
+                                opts->reporter_burst_period);
        if (opts->present & DL_OPT_HEALTH_REPORTER_AUTO_RECOVER)
                mnl_attr_put_u8(nlh, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,
                                opts->reporter_auto_recover);
@@ -9309,6 +9323,7 @@ static int cmd_health_set_params(struct dl *dl)
                               NLM_F_REQUEST | NLM_F_ACK);
        err = dl_argv_parse(dl, DL_OPT_HANDLE | DL_OPT_HANDLEP | DL_OPT_HEALTH_REPORTER_NAME,
                            DL_OPT_HEALTH_REPORTER_GRACEFUL_PERIOD |
+                           DL_OPT_HEALTH_REPORTER_BURST_PERIOD |
                            DL_OPT_HEALTH_REPORTER_AUTO_RECOVER |
                            DL_OPT_HEALTH_REPORTER_AUTO_DUMP);
        if (err)
@@ -9753,6 +9768,9 @@ static void pr_out_health(struct dl *dl, struct nlattr **tb_health,
        if (tb[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD])
                pr_out_u64(dl, "grace_period",
                           mnl_attr_get_u64(tb[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]));
+       if (tb[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD])
+               pr_out_u64(dl, "burst_period",
+                          mnl_attr_get_u64(tb[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]));
        if (tb[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])
                print_bool(PRINT_ANY, "auto_recover", " auto_recover %s",
                           mnl_attr_get_u8(tb[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]));
@@ -9827,6 +9845,7 @@ static void cmd_health_help(void)
        pr_err("       devlink health dump clear { DEV | DEV/PORT_INDEX } reporter REPORTER_NAME\n");
        pr_err("       devlink health set { DEV | DEV/PORT_INDEX } reporter REPORTER_NAME\n");
        pr_err("                          [ grace_period MSEC ]\n");
+       pr_err("                          [ burst_period MSEC ]\n");
        pr_err("                          [ auto_recover { true | false } ]\n");
        pr_err("                          [ auto_dump    { true | false } ]\n");
 }
index 975b8c75d79818dbcb977218ab304547fd044b13..fd6818dfadaa556b872375225e7012a7531d8175 100644 (file)
@@ -61,6 +61,8 @@ devlink-health \- devlink health reporting and recovery
 [
 .BI "grace_period " MSEC "
 ] [
+.BI "burst_period " MSEC "
+] [
 .BR auto_recover " { " true " | " false " } "
 ] [
 .BR auto_dump " { " true " | " false " } "
@@ -182,6 +184,11 @@ doesn't support a recovery or dump method.
 .BI grace_period " MSEC "
 Time interval between consecutive auto recoveries.
 
+.TP
+.BI burst_period " MSEC "
+Time window for error recoveries before starting the grace period.
+Configuring burst_period is invalid when the grace period is disabled.
+
 .TP
 .BR auto_recover " { " true " | " false " } "
 Indicates whether the devlink should execute automatic recover on error.
@@ -242,6 +249,19 @@ the specified port and reporter.
 devlink health set pci/0000:00:09.0 reporter fw_fatal auto_recover false
 .RS 4
 Turn off auto recovery on the specified device and reporter.
+.RE
+.PP
+devlink health set pci/0000:00:09.0 reporter tx burst_period 5000
+.RS 4
+Set the burst period to 5000 milliseconds on the specified
+device and reporter, prior to initiating the grace period.
+.RE
+.PP
+devlink health set pci/0000:00:09.0 reporter tx grace_period 0
+.RS 4
+Disable grace period on the specified device and reporter. Disabling the grace
+period also deactivates the burst period.
+.RE
 
 .RE
 .SH SEE ALSO