In the mentioned "Fixes" commit, various work tasks triggering devlink
health reporter recovery were switched to use netdev_trylock to protect
against concurrent tear down of the channels being recovered. But this
had the side effect of introducing potential deadlocks because of
incorrect lock ordering.
The correct lock order is described by the init flow:
probe_one -> mlx5_init_one (acquires devlink lock)
-> mlx5_init_one_devl_locked -> mlx5_register_device
-> mlx5_rescan_drivers_locked -...-> mlx5e_probe -> _mlx5e_probe
-> register_netdev (acquires rtnl lock)
-> register_netdevice (acquires netdev lock)
=> devlink lock -> rtnl lock -> netdev lock.
But in the current recovery flow, the order is wrong:
mlx5e_tx_err_cqe_work (acquires netdev lock)
-> mlx5e_reporter_tx_err_cqe -> mlx5e_health_report
-> devlink_health_report (acquires devlink lock => boom!)
-> devlink_health_reporter_recover
-> mlx5e_tx_reporter_recover -> mlx5e_tx_reporter_recover_from_ctx
-> mlx5e_tx_reporter_err_cqe_recover
The same pattern exists in:
mlx5e_reporter_rx_timeout
mlx5e_reporter_tx_ptpsq_unhealthy
mlx5e_reporter_tx_timeout
Fix these by moving the netdev_trylock calls from the work handlers
lower in the call stack, in the respective recovery functions, where
they are actually necessary.
Fixes: 8f7b00307bf1 ("net/mlx5e: Convert mlx5 netdevs to instance locking")
Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Jacob Keller <Jacob.e.keller@intel.com>
Link: https://patch.msgid.link/20260218072904.1764634-6-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
{
struct mlx5e_ptpsq *ptpsq =
container_of(work, struct mlx5e_ptpsq, report_unhealthy_work);
- struct mlx5e_txqsq *sq = &ptpsq->txqsq;
-
- /* Recovering the PTP SQ means re-enabling NAPI, which requires the
- * netdev instance lock. However, SQ closing has to wait for this work
- * task to finish while also holding the same lock. So either get the
- * lock or find that the SQ is no longer enabled and thus this work is
- * not relevant anymore.
- */
- while (!netdev_trylock(sq->netdev)) {
- if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))
- return;
- msleep(20);
- }
mlx5e_reporter_tx_ptpsq_unhealthy(ptpsq);
- netdev_unlock(sq->netdev);
}
static int mlx5e_ptp_open_txqsq(struct mlx5e_ptp *c, u32 tisn,
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2019 Mellanox Technologies.
+#include <net/netdev_lock.h>
+
#include "health.h"
#include "params.h"
#include "txrx.h"
rq = ctx;
priv = rq->priv;
+ /* Acquire netdev instance lock to synchronize with channel close and
+ * reopen flows. Either successfully obtain the lock, or detect that
+ * channels are closing for another reason, making this work no longer
+ * necessary.
+ */
+ while (!netdev_trylock(rq->netdev)) {
+ if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &rq->priv->state))
+ return 0;
+ msleep(20);
+ }
mutex_lock(&priv->state_lock);
eq = rq->cq.mcq.eq;
clear_bit(MLX5E_SQ_STATE_ENABLED, &rq->icosq->state);
mutex_unlock(&priv->state_lock);
+ netdev_unlock(rq->netdev);
return err;
}
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2019 Mellanox Technologies. */
+#include <net/netdev_lock.h>
+
#include "health.h"
#include "en/ptp.h"
#include "en/devlink.h"
if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
return 0;
+ /* Recovering queues means re-enabling NAPI, which requires the netdev
+ * instance lock. However, SQ closing flows have to wait for work tasks
+ * to finish while also holding the netdev instance lock. So either get
+ * the lock or find that the SQ is no longer enabled and thus this work
+ * is not relevant anymore.
+ */
+ while (!netdev_trylock(dev)) {
+ if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))
+ return 0;
+ msleep(20);
+ }
+
err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
if (err) {
netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
else
mlx5e_trigger_napi_sched(sq->cq.napi);
+ netdev_unlock(dev);
return 0;
out:
clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
+ netdev_unlock(dev);
return err;
}
sq = to_ctx->sq;
eq = sq->cq.mcq.eq;
priv = sq->priv;
+
+ /* Recovering the TX queues implies re-enabling NAPI, which requires
+ * the netdev instance lock.
+ * However, channel closing flows have to wait for this work to finish
+ * while holding the same lock. So either get the lock or find that
+ * channels are being closed for other reason and this work is not
+ * relevant anymore.
+ */
+ while (!netdev_trylock(sq->netdev)) {
+ if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state))
+ return 0;
+ msleep(20);
+ }
+
err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats);
if (!err) {
to_ctx->status = 0; /* this sq recovered */
- return err;
+ goto out;
}
mutex_lock(&priv->state_lock);
mutex_unlock(&priv->state_lock);
if (!err) {
to_ctx->status = 1; /* all channels recovered */
- return err;
+ goto out;
}
to_ctx->status = err;
netdev_err(priv->netdev,
"mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n",
err);
-
+out:
+ netdev_unlock(sq->netdev);
return err;
}
return 0;
priv = ptpsq->txqsq.priv;
+ netdev = priv->netdev;
+
+ /* Recovering the PTP SQ means re-enabling NAPI, which requires the
+ * netdev instance lock. However, SQ closing has to wait for this work
+ * task to finish while also holding the same lock. So either get the
+ * lock or find that the SQ is no longer enabled and thus this work is
+ * not relevant anymore.
+ */
+ while (!netdev_trylock(netdev)) {
+ if (!test_bit(MLX5E_SQ_STATE_ENABLED, &ptpsq->txqsq.state))
+ return 0;
+ msleep(20);
+ }
mutex_lock(&priv->state_lock);
chs = &priv->channels;
- netdev = priv->netdev;
carrier_ok = netif_carrier_ok(netdev);
netif_carrier_off(netdev);
netif_carrier_on(netdev);
mutex_unlock(&priv->state_lock);
+ netdev_unlock(netdev);
return err;
}
struct mlx5e_rq,
rx_timeout_work);
- /* Acquire netdev instance lock to synchronize with channel close and
- * reopen flows. Either successfully obtain the lock, or detect that
- * channels are closing for another reason, making this work no longer
- * necessary.
- */
- while (!netdev_trylock(rq->netdev)) {
- if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &rq->priv->state))
- return;
- msleep(20);
- }
-
mlx5e_reporter_rx_timeout(rq);
- netdev_unlock(rq->netdev);
}
static int mlx5e_alloc_mpwqe_rq_drop_page(struct mlx5e_rq *rq)
struct mlx5e_txqsq *sq = container_of(recover_work, struct mlx5e_txqsq,
recover_work);
- /* Recovering queues means re-enabling NAPI, which requires the netdev
- * instance lock. However, SQ closing flows have to wait for work tasks
- * to finish while also holding the netdev instance lock. So either get
- * the lock or find that the SQ is no longer enabled and thus this work
- * is not relevant anymore.
- */
- while (!netdev_trylock(sq->netdev)) {
- if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))
- return;
- msleep(20);
- }
-
mlx5e_reporter_tx_err_cqe(sq);
- netdev_unlock(sq->netdev);
}
static struct dim_cq_moder mlx5e_get_def_tx_moderation(u8 cq_period_mode)
struct net_device *netdev = priv->netdev;
int i;
- /* Recovering the TX queues implies re-enabling NAPI, which requires
- * the netdev instance lock.
- * However, channel closing flows have to wait for this work to finish
- * while holding the same lock. So either get the lock or find that
- * channels are being closed for other reason and this work is not
- * relevant anymore.
- */
- while (!netdev_trylock(netdev)) {
- if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state))
- return;
- msleep(20);
- }
-
for (i = 0; i < netdev->real_num_tx_queues; i++) {
struct netdev_queue *dev_queue =
netdev_get_tx_queue(netdev, i);
/* break if tried to reopened channels */
break;
}
-
- netdev_unlock(netdev);
}
static void mlx5e_tx_timeout(struct net_device *dev, unsigned int txqueue)