git.ipfire.org Git - thirdparty/linux.git/commitdiff
eth: fbnic: add FW health reporter
author: Jakub Kicinski <kuba@kernel.org>
Tue, 16 Sep 2025 23:14:18 +0000 (16:14 -0700)
committer: Paolo Abeni <pabeni@redhat.com>
Thu, 18 Sep 2025 09:37:23 +0000 (11:37 +0200)
Add a health reporter to catch FW crashes. Dumping the reporter
if FW has not crashed will create a snapshot of FW memory.

Reviewed-by: Lee Trager <lee@trager.us>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20250916231420.1693955-8-kuba@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Documentation/networking/device_drivers/ethernet/meta/fbnic.rst
drivers/net/ethernet/meta/fbnic/fbnic.h
drivers/net/ethernet/meta/fbnic/fbnic_devlink.c
drivers/net/ethernet/meta/fbnic/fbnic_pci.c

index fb6559fa4be4b2a96a17c53d249557fefc955650..62693566ff1f2ce02b96f3ae99aa5ddd9e4f35f9 100644 (file)
@@ -69,6 +69,16 @@ On host boot the latest UEFI driver is always used, no explicit activation
 is required. Firmware activation is required to run new control firmware. cmrt
 firmware can only be activated by power cycling the NIC.
 
+Health reporters
+----------------
+
+fw reporter
+~~~~~~~~~~~
+
+The ``fw`` health reporter tracks FW crashes. Dumping the reporter will
+show the core dump of the most recent FW crash or, if no FW crash has
+happened since the last power cycle, a snapshot of the FW memory.
+
 Statistics
 ----------
 
index b364c2f0724b277eb65614dd332ecc7fb001591b..5f99976de0bbce2748dd7d2500316df97371cb39 100644 (file)
@@ -27,6 +27,7 @@ struct fbnic_dev {
        struct net_device *netdev;
        struct dentry *dbg_fbd;
        struct device *hwmon;
+       struct devlink_health_reporter *fw_reporter;
 
        u32 __iomem *uc_addr0;
        u32 __iomem *uc_addr4;
@@ -159,8 +160,12 @@ extern char fbnic_driver_name[];
 
 void fbnic_devlink_free(struct fbnic_dev *fbd);
 struct fbnic_dev *fbnic_devlink_alloc(struct pci_dev *pdev);
+int fbnic_devlink_health_create(struct fbnic_dev *fbd);
+void fbnic_devlink_health_destroy(struct fbnic_dev *fbd);
 void fbnic_devlink_register(struct fbnic_dev *fbd);
 void fbnic_devlink_unregister(struct fbnic_dev *fbd);
+void __printf(2, 3)
+fbnic_devlink_fw_report(struct fbnic_dev *fbd, const char *format, ...);
 
 int fbnic_fw_request_mbx(struct fbnic_dev *fbd);
 void fbnic_fw_free_mbx(struct fbnic_dev *fbd);
index c5f81f139e7edaf326fb363314f17f276b3b5356..195245fb1a9689796a39c8d1668f2b8e5b5bcb34 100644 (file)
@@ -8,6 +8,7 @@
 #include <net/devlink.h>
 
 #include "fbnic.h"
+#include "fbnic_fw.h"
 #include "fbnic_tlv.h"
 
 #define FBNIC_SN_STR_LEN       24
@@ -369,6 +370,160 @@ static const struct devlink_ops fbnic_devlink_ops = {
        .flash_update   = fbnic_devlink_flash_update,
 };
 
+static int fbnic_fw_reporter_dump(struct devlink_health_reporter *reporter,
+                                 struct devlink_fmsg *fmsg, void *priv_ctx,
+                                 struct netlink_ext_ack *extack)
+{ /* .dump callback of the "fw" health reporter: ask FW for the core dump
+   * size, read the dump in chunks of at most TLV_MAX_DATA bytes and emit
+   * it to @fmsg as the binary pair "FW coredump". @priv_ctx is not used.
+   * Most failures also set a message in @extack. Returns 0 on success,
+   * -ENOMEM, -ETIMEDOUT, -EIO, or the error reported by the xmit
+   * helpers / FW.
+   */
+       struct fbnic_dev *fbd = devlink_health_reporter_priv(reporter);
+       u32 offset, index, index_count, length, size;
+       struct fbnic_fw_completion *fw_cmpl;
+       u8 *dump_data, **data;
+       int err;
+
+       fw_cmpl = fbnic_fw_alloc_cmpl(FBNIC_TLV_MSG_ID_COREDUMP_GET_INFO_RESP);
+       if (!fw_cmpl)
+               return -ENOMEM;
+
+       err = fbnic_fw_xmit_coredump_info_msg(fbd, fw_cmpl, true);
+       if (err) {
+               NL_SET_ERR_MSG_MOD(extack,
+                                  "Failed to transmit core dump info msg");
+               goto cmpl_free; /* skips fbnic_mbx_clear_cmpl() */
+       }
+       if (!wait_for_completion_timeout(&fw_cmpl->done, 2 * HZ)) {
+               NL_SET_ERR_MSG_MOD(extack,
+                                  "Timed out waiting on core dump info");
+               err = -ETIMEDOUT;
+               goto cmpl_cleanup;
+       }
+
+       size = fw_cmpl->u.coredump_info.size;
+       err = fw_cmpl->result;
+
+       fbnic_mbx_clear_cmpl(fbd, fw_cmpl);
+       fbnic_fw_put_cmpl(fw_cmpl);
+
+       /* Handle error returned by firmware (fw_cmpl->result, saved in err
+        * above). The info completion is already released at this point, so
+        * these paths return directly instead of using the cleanup labels.
+        */
+       if (err) {
+               NL_SET_ERR_MSG_MOD(extack, "Firmware core dump returned error");
+               return err;
+       }
+       if (!size) {
+               NL_SET_ERR_MSG_MOD(extack,
+                                  "Firmware core dump returned size 0");
+               return -EIO;
+       }
+
+       /* Read the dump, we can only transfer TLV_MAX_DATA at a time, so
+        * the dump is split into index_count chunks.
+        */
+       index_count = DIV_ROUND_UP(size, TLV_MAX_DATA);
+
+       fw_cmpl = __fbnic_fw_alloc_cmpl(FBNIC_TLV_MSG_ID_COREDUMP_READ_RESP,
+                                       sizeof(void *) * index_count + size);
+       if (!fw_cmpl)
+               return -ENOMEM;
+
+       /* Populate pointer table w/ pointer offsets. The extra space
+        * requested above holds index_count pointers followed by size bytes
+        * of dump data; dump_data points just past the pointer table.
+        */
+       dump_data = (void *)&fw_cmpl->u.coredump.data[index_count];
+       data = fw_cmpl->u.coredump.data;
+       fw_cmpl->u.coredump.size = size;
+       fw_cmpl->u.coredump.stride = TLV_MAX_DATA;
+
+       for (index = 0; index < index_count; index++) {
+               /* First iteration installs completion, every later request
+                * passes NULL instead.
+                */
+               struct fbnic_fw_completion *cmpl_arg = index ? NULL : fw_cmpl;
+
+               offset = index * TLV_MAX_DATA;
+               length = min(size - offset, TLV_MAX_DATA);
+
+               data[index] = dump_data + offset;
+               err = fbnic_fw_xmit_coredump_read_msg(fbd, cmpl_arg,
+                                                     offset, length);
+               if (err) {
+                       NL_SET_ERR_MSG_MOD(extack,
+                                          "Failed to transmit core dump msg");
+                       if (cmpl_arg)
+                               goto cmpl_free; /* first request failed */
+                       else
+                               goto cmpl_cleanup;
+               }
+
+               if (wait_for_completion_timeout(&fw_cmpl->done, 2 * HZ)) {
+                       reinit_completion(&fw_cmpl->done); /* re-arm for next chunk */
+               } else {
+                       NL_SET_ERR_MSG_FMT_MOD(extack,
+                                              "Timed out waiting on core dump (%d/%d)",
+                                              index + 1, index_count);
+                       err = -ETIMEDOUT;
+                       goto cmpl_cleanup;
+               }
+
+               /* If we didn't see the reply record as incomplete.
+                * NOTE(review): presumably the response handler clears
+                * data[index] once the chunk has been stored, so a non-NULL
+                * entry means the chunk is missing - the handler is not
+                * visible here, confirm.
+                */
+               if (fw_cmpl->u.coredump.data[index]) {
+                       NL_SET_ERR_MSG_FMT_MOD(extack,
+                                              "No data for core dump chunk (%d/%d)",
+                                              index + 1, index_count);
+                       err = -EIO;
+                       goto cmpl_cleanup;
+               }
+       }
+
+       devlink_fmsg_binary_pair_nest_start(fmsg, "FW coredump");
+
+       for (offset = 0; offset < size; offset += length) {
+               length = min_t(u32, size - offset, TLV_MAX_DATA);
+
+               devlink_fmsg_binary_put(fmsg, dump_data + offset, length);
+       }
+
+       devlink_fmsg_binary_pair_nest_end(fmsg);
+
+cmpl_cleanup: /* also reached on success, err is 0 in that case */
+       fbnic_mbx_clear_cmpl(fbd, fw_cmpl);
+cmpl_free:
+       fbnic_fw_put_cmpl(fw_cmpl);
+
+       return err;
+}
+
+void __printf(2, 3)
+fbnic_devlink_fw_report(struct fbnic_dev *fbd, const char *format, ...)
+{ /* Format a printf-style message and report it through fbd->fw_reporter;
+   * the same text is also written to the FW log when
+   * fbnic_fw_log_ready() returns true for @fbd.
+   */
+       char msg[FBNIC_FW_LOG_MAX_SIZE];
+       va_list args;
+
+       va_start(args, format);
+       vsnprintf(msg, FBNIC_FW_LOG_MAX_SIZE, format, args); /* output limited to sizeof(msg) */
+       va_end(args);
+
+       devlink_health_report(fbd->fw_reporter, msg, fbd); /* return value is not checked */
+       if (fbnic_fw_log_ready(fbd))
+               fbnic_fw_log_write(fbd, 0, fbd->firmware_time, msg);
+}
+
+static const struct devlink_health_reporter_ops fbnic_fw_ops = {
+       .name = "fw", /* reporter name */
+       .dump = fbnic_fw_reporter_dump, /* the only callback set in this table */
+};
+
+int fbnic_devlink_health_create(struct fbnic_dev *fbd)
+{ /* Create the health reporter described by fbnic_fw_ops on the devlink
+   * instance of @fbd and store it in fbd->fw_reporter, with @fbd as the
+   * reporter's private data. Returns 0 on success or the PTR_ERR() value
+   * of the failed creation.
+   */
+       fbd->fw_reporter = devlink_health_reporter_create(priv_to_devlink(fbd),
+                                                         &fbnic_fw_ops, fbd);
+       if (IS_ERR(fbd->fw_reporter)) {
+               dev_warn(fbd->dev,
+                        "Failed to create FW fault reporter: %pe\n",
+                        fbd->fw_reporter);
+               return PTR_ERR(fbd->fw_reporter); /* fw_reporter still holds the error pointer */
+       }
+
+       return 0;
+}
+
+void fbnic_devlink_health_destroy(struct fbnic_dev *fbd)
+{ /* Destroy the reporter stored in fbd->fw_reporter. */
+       devlink_health_reporter_destroy(fbd->fw_reporter);
+}
+
 void fbnic_devlink_free(struct fbnic_dev *fbd)
 {
        struct devlink *devlink = priv_to_devlink(fbd);
index 7d9b93f8ebd85108bb3fe38c6da8bb29b0c85306..576fc89f8704929ea875b5fc73cca38fe8549305 100644 (file)
@@ -196,6 +196,8 @@ static void fbnic_health_check(struct fbnic_dev *fbd)
        if (tx_mbx->head != tx_mbx->tail)
                return;
 
+       fbnic_devlink_fw_report(fbd, "Firmware crashed detected!");
+
        if (fbnic_fw_config_after_crash(fbd))
                dev_err(fbd->dev, "Firmware recovery failed after crash\n");
 }
@@ -278,6 +280,10 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
                return -ENOMEM;
        }
 
+       err = fbnic_devlink_health_create(fbd);
+       if (err)
+               goto free_fbd;
+
        /* Populate driver with hardware-specific info and handlers */
        fbd->max_num_queues = info->max_num_queues;
 
@@ -288,7 +294,7 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
        err = fbnic_alloc_irqs(fbd);
        if (err)
-               goto free_fbd;
+               goto err_destroy_health;
 
        err = fbnic_mac_init(fbd);
        if (err) {
@@ -357,6 +363,8 @@ init_failure_mode:
        return 0;
 free_irqs:
        fbnic_free_irqs(fbd);
+err_destroy_health:
+       fbnic_devlink_health_destroy(fbd);
 free_fbd:
        fbnic_devlink_free(fbd);
 
@@ -391,6 +399,7 @@ static void fbnic_remove(struct pci_dev *pdev)
        fbnic_fw_free_mbx(fbd);
        fbnic_free_irqs(fbd);
 
+       fbnic_devlink_health_destroy(fbd);
        fbnic_devlink_free(fbd);
 }