git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
devlink: Make health reporter burst period configurable
author Shahar Shitrit <shshitrit@nvidia.com>
Sun, 24 Aug 2025 08:43:53 +0000 (11:43 +0300)
committer Jakub Kicinski <kuba@kernel.org>
Wed, 27 Aug 2025 00:24:16 +0000 (17:24 -0700)
Enable configuration of the burst period — a time window starting
from the first error recovery, during which the reporter allows
recovery attempts for each reported error.

This feature is helpful when a single underlying issue causes multiple
errors, as it delays the start of the grace period to allow sufficient
time for recovering all related errors. For example, if multiple TX
queues time out simultaneously, a sufficient burst period could allow
all affected TX queues to be recovered within that window. Without this
period, only the first TX queue that reports a timeout will undergo
recovery, while the remaining TX queues will be blocked once the grace
period begins.

Configuration example:
$ devlink health set pci/0000:00:09.0 reporter tx burst_period 500

Configuration example with ynl:
./tools/net/ynl/pyynl/cli.py \
 --spec Documentation/netlink/specs/devlink.yaml \
 --do health-reporter-set --json '{
  "bus-name": "auxiliary",
  "dev-name": "mlx5_core.eth.0",
  "port-index": 65535,
  "health-reporter-name": "tx",
  "health-reporter-burst-period": 500
}'

Signed-off-by: Shahar Shitrit <shshitrit@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250824084354.533182-5-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Documentation/netlink/specs/devlink.yaml
Documentation/networking/devlink/devlink-health.rst
include/uapi/linux/devlink.h
net/devlink/health.c
net/devlink/netlink_gen.c

index bb87111d5e16cac8e3742a543b73b62e32d33a18..3db59c9658694b3e11a98692f4584b6f36b00dec 100644 (file)
@@ -853,6 +853,10 @@ attribute-sets:
         type: nest
         multi-attr: true
         nested-attributes: dl-rate-tc-bws
+      -
+        name: health-reporter-burst-period
+        type: u64
+        doc: Time (in msec) for recoveries before starting the grace period.
   -
     name: dl-dev-stats
     subset-of: devlink
@@ -1216,6 +1220,8 @@ attribute-sets:
         name: health-reporter-dump-ts-ns
       -
         name: health-reporter-auto-dump
+      -
+        name: health-reporter-burst-period
 
   -
     name: dl-attr-stats
@@ -1961,6 +1967,7 @@ operations:
             - health-reporter-graceful-period
             - health-reporter-auto-recover
             - health-reporter-auto-dump
+            - health-reporter-burst-period
 
     -
       name: health-reporter-recover
index e0b8cfed610a7a6d88ffab0dfd963b144c25ef54..4d10536377ab70732373035657a1189a92393ecf 100644 (file)
@@ -50,7 +50,7 @@ Once an error is reported, devlink health will perform the following actions:
   * Auto recovery attempt is being done. Depends on:
 
     - Auto-recovery configuration
-    - Grace period vs. time passed since last recover
+    - Grace period (and burst period) vs. time passed since last recover
 
 Devlink formatted message
 =========================
index 9fcb25a0f447b4b7983324b9ece5d9618725ac48..bcad11a787a55b753d32346823f77ec303eaff1d 100644 (file)
@@ -636,6 +636,8 @@ enum devlink_attr {
 
        DEVLINK_ATTR_RATE_TC_BWS,               /* nested */
 
+       DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD,      /* u64 */
+
        /* Add new attributes above here, update the spec in
         * Documentation/netlink/specs/devlink.yaml and re-generate
         * net/devlink/netlink_gen.c.
index 94ab77f77addfade5a1437a4d4b6c45d629727e4..136a67c36a20dd03fe1a103f2e728567b0256598 100644 (file)
@@ -116,6 +116,9 @@ __devlink_health_reporter_create(struct devlink *devlink,
        if (WARN_ON(ops->default_graceful_period && !ops->recover))
                return ERR_PTR(-EINVAL);
 
+       if (WARN_ON(ops->default_burst_period && !ops->default_graceful_period))
+               return ERR_PTR(-EINVAL);
+
        reporter = kzalloc(sizeof(*reporter), GFP_KERNEL);
        if (!reporter)
                return ERR_PTR(-ENOMEM);
@@ -293,6 +296,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg,
            devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,
                               reporter->graceful_period))
                goto reporter_nest_cancel;
+       if (reporter->ops->recover &&
+           devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD,
+                              reporter->burst_period))
+               goto reporter_nest_cancel;
        if (reporter->ops->recover &&
            nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,
                       reporter->auto_recover))
@@ -458,16 +465,33 @@ int devlink_nl_health_reporter_set_doit(struct sk_buff *skb,
 
        if (!reporter->ops->recover &&
            (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] ||
-            info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]))
+            info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] ||
+            info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]))
                return -EOPNOTSUPP;
 
        if (!reporter->ops->dump &&
            info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP])
                return -EOPNOTSUPP;
 
-       if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD])
+       if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) {
                reporter->graceful_period =
                        nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]);
+               if (!reporter->graceful_period)
+                       reporter->burst_period = 0;
+       }
+
+       if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]) {
+               u64 burst_period =
+                       nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]);
+
+               if (!reporter->graceful_period && burst_period) {
+                       NL_SET_ERR_MSG_MOD(info->extack,
+                                          "Cannot set burst period without a grace period.");
+                       return -EINVAL;
+               }
+
+               reporter->burst_period = burst_period;
+       }
 
        if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])
                reporter->auto_recover =
index d97c326a9045b739c453d5b15710a174fa9de536..9fd00977d59e3c330d8a6296a2f1744ce9e1075f 100644 (file)
@@ -389,7 +389,7 @@ static const struct nla_policy devlink_health_reporter_get_dump_nl_policy[DEVLIN
 };
 
 /* DEVLINK_CMD_HEALTH_REPORTER_SET - do */
-static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP + 1] = {
+static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD + 1] = {
        [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
        [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
        [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
@@ -397,6 +397,7 @@ static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATT
        [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64, },
        [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8, },
        [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8, },
+       [DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD] = { .type = NLA_U64, },
 };
 
 /* DEVLINK_CMD_HEALTH_REPORTER_RECOVER - do */
@@ -1032,7 +1033,7 @@ const struct genl_split_ops devlink_nl_ops[74] = {
                .doit           = devlink_nl_health_reporter_set_doit,
                .post_doit      = devlink_nl_post_doit,
                .policy         = devlink_health_reporter_set_nl_policy,
-               .maxattr        = DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP,
+               .maxattr        = DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD,
                .flags          = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
        },
        {