git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
net/mlx5: Fix lockdep assertion on sync reset unload event
author: Moshe Shemesh <moshe@nvidia.com>
Mon, 25 Aug 2025 14:34:29 +0000 (17:34 +0300)
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 4 Sep 2025 13:31:49 +0000 (15:31 +0200)
[ Upstream commit 902a8bc23a24882200f57cadc270e15a2cfaf2bb ]

Fix a lockdep assertion triggered during the sync reset unload event. When
the sync reset flow is initiated using the devlink reload fw_activate
option, the PF already holds the devlink lock while handling the unload
event. In this case, delegate the sync reset unload event handling back to
the devlink callback process to avoid double-locking and resolve the
lockdep warning.

Kernel log:
WARNING: CPU: 9 PID: 1578 at devl_assert_locked+0x31/0x40
[...]
Call Trace:
<TASK>
 mlx5_unload_one_devl_locked+0x2c/0xc0 [mlx5_core]
 mlx5_sync_reset_unload_event+0xaf/0x2f0 [mlx5_core]
 process_one_work+0x222/0x640
 worker_thread+0x199/0x350
 kthread+0x10b/0x230
 ? __pfx_worker_thread+0x10/0x10
 ? __pfx_kthread+0x10/0x10
 ret_from_fork+0x8e/0x100
 ? __pfx_kthread+0x10/0x10
 ret_from_fork_asm+0x1a/0x30
</TASK>

Fixes: 7a9770f1bfea ("net/mlx5: Handle sync reset unload event")
Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250825143435.598584-7-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
drivers/net/ethernet/mellanox/mlx5/core/devlink.c
drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h

index 7211e65ad2dcc74bfaaf8a6a85571ad810dea665..511b3ba245420728e9569ccf117417b462609d69 100644 (file)
@@ -107,7 +107,7 @@ static int mlx5_devlink_reload_fw_activate(struct devlink *devlink, struct netli
        if (err)
                return err;
 
-       mlx5_unload_one_devl_locked(dev, false);
+       mlx5_sync_reset_unload_flow(dev, true);
        err = mlx5_health_wait_pci_up(dev);
        if (err)
                NL_SET_ERR_MSG_MOD(extack, "FW activate aborted, PCI reads fail after reset");
index 4f55e55ecb5513591236a942056202678e5d66fd..0829912157c978f26f9e4b1e12f0eadc48121eba 100644 (file)
@@ -12,7 +12,8 @@ enum {
        MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST,
        MLX5_FW_RESET_FLAGS_PENDING_COMP,
        MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS,
-       MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED
+       MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED,
+       MLX5_FW_RESET_FLAGS_UNLOAD_EVENT,
 };
 
 struct mlx5_fw_reset {
@@ -218,7 +219,7 @@ int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev)
        return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL0, 0, 0, false);
 }
 
-static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev, bool unloaded)
+static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev)
 {
        struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
        struct devlink *devlink = priv_to_devlink(dev);
@@ -227,8 +228,7 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev, bool unload
        if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) {
                complete(&fw_reset->done);
        } else {
-               if (!unloaded)
-                       mlx5_unload_one(dev, false);
+               mlx5_sync_reset_unload_flow(dev, false);
                if (mlx5_health_wait_pci_up(dev))
                        mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n");
                else
@@ -271,7 +271,7 @@ static void mlx5_sync_reset_reload_work(struct work_struct *work)
 
        mlx5_sync_reset_clear_reset_requested(dev, false);
        mlx5_enter_error_state(dev, true);
-       mlx5_fw_reset_complete_reload(dev, false);
+       mlx5_fw_reset_complete_reload(dev);
 }
 
 #define MLX5_RESET_POLL_INTERVAL       (HZ / 10)
@@ -581,6 +581,59 @@ static int mlx5_sync_pci_reset(struct mlx5_core_dev *dev, u8 reset_method)
        return err;
 }
 
+void mlx5_sync_reset_unload_flow(struct mlx5_core_dev *dev, bool locked)
+{
+       struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
+       unsigned long timeout;
+       bool reset_action;
+       u8 rst_state;
+       int err;
+
+       if (locked)
+               mlx5_unload_one_devl_locked(dev, false);
+       else
+               mlx5_unload_one(dev, false);
+
+       if (!test_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags))
+               return;
+
+       mlx5_set_fw_rst_ack(dev);
+       mlx5_core_warn(dev, "Sync Reset Unload done, device reset expected\n");
+
+       reset_action = false;
+       timeout = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, RESET_UNLOAD));
+       do {
+               rst_state = mlx5_get_fw_rst_state(dev);
+               if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ ||
+                   rst_state == MLX5_FW_RST_STATE_IDLE) {
+                       reset_action = true;
+                       break;
+               }
+               msleep(20);
+       } while (!time_after(jiffies, timeout));
+
+       if (!reset_action) {
+               mlx5_core_err(dev, "Got timeout waiting for sync reset action, state = %u\n",
+                             rst_state);
+               fw_reset->ret = -ETIMEDOUT;
+               goto done;
+       }
+
+       mlx5_core_warn(dev, "Sync Reset, got reset action. rst_state = %u\n",
+                      rst_state);
+       if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ) {
+               err = mlx5_sync_pci_reset(dev, fw_reset->reset_method);
+               if (err) {
+                       mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, err %d\n",
+                                      err);
+                       fw_reset->ret = err;
+               }
+       }
+
+done:
+       clear_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags);
+}
+
 static void mlx5_sync_reset_now_event(struct work_struct *work)
 {
        struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
@@ -608,16 +661,13 @@ static void mlx5_sync_reset_now_event(struct work_struct *work)
        mlx5_enter_error_state(dev, true);
 done:
        fw_reset->ret = err;
-       mlx5_fw_reset_complete_reload(dev, false);
+       mlx5_fw_reset_complete_reload(dev);
 }
 
 static void mlx5_sync_reset_unload_event(struct work_struct *work)
 {
        struct mlx5_fw_reset *fw_reset;
        struct mlx5_core_dev *dev;
-       unsigned long timeout;
-       bool reset_action;
-       u8 rst_state;
        int err;
 
        fw_reset = container_of(work, struct mlx5_fw_reset, reset_unload_work);
@@ -626,6 +676,7 @@ static void mlx5_sync_reset_unload_event(struct work_struct *work)
        if (mlx5_sync_reset_clear_reset_requested(dev, false))
                return;
 
+       set_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags);
        mlx5_core_warn(dev, "Sync Reset Unload. Function is forced down.\n");
 
        err = mlx5_cmd_fast_teardown_hca(dev);
@@ -634,44 +685,7 @@ static void mlx5_sync_reset_unload_event(struct work_struct *work)
        else
                mlx5_enter_error_state(dev, true);
 
-       if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags))
-               mlx5_unload_one_devl_locked(dev, false);
-       else
-               mlx5_unload_one(dev, false);
-
-       mlx5_set_fw_rst_ack(dev);
-       mlx5_core_warn(dev, "Sync Reset Unload done, device reset expected\n");
-
-       reset_action = false;
-       timeout = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, RESET_UNLOAD));
-       do {
-               rst_state = mlx5_get_fw_rst_state(dev);
-               if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ ||
-                   rst_state == MLX5_FW_RST_STATE_IDLE) {
-                       reset_action = true;
-                       break;
-               }
-               msleep(20);
-       } while (!time_after(jiffies, timeout));
-
-       if (!reset_action) {
-               mlx5_core_err(dev, "Got timeout waiting for sync reset action, state = %u\n",
-                             rst_state);
-               fw_reset->ret = -ETIMEDOUT;
-               goto done;
-       }
-
-       mlx5_core_warn(dev, "Sync Reset, got reset action. rst_state = %u\n", rst_state);
-       if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ) {
-               err = mlx5_sync_pci_reset(dev, fw_reset->reset_method);
-               if (err) {
-                       mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, err %d\n", err);
-                       fw_reset->ret = err;
-               }
-       }
-
-done:
-       mlx5_fw_reset_complete_reload(dev, true);
+       mlx5_fw_reset_complete_reload(dev);
 }
 
 static void mlx5_sync_reset_abort_event(struct work_struct *work)
index ea527d06a85f07c2406d69a90a9cc023363799df..d5b28525c960dcbd8c8ea24c9f6f80d7933d5650 100644 (file)
@@ -12,6 +12,7 @@ int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel,
 int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev);
 
 int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev);
+void mlx5_sync_reset_unload_flow(struct mlx5_core_dev *dev, bool locked);
 int mlx5_fw_reset_verify_fw_complete(struct mlx5_core_dev *dev,
                                     struct netlink_ext_ack *extack);
 void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev);