git.ipfire.org Git - thirdparty/linux.git/commitdiff
net/mlx5: HWS, Check if device is down while polling for completion
author: Yevgeny Kliteynik <kliteyn@nvidia.com>
Thu, 7 May 2026 17:34:41 +0000 (20:34 +0300)
committer: Jakub Kicinski <kuba@kernel.org>
Sun, 10 May 2026 17:22:25 +0000 (10:22 -0700)
If the device is down for any reason (e.g. FLR), the HW will no
longer generate completions, so there is no point in polling and
waiting for a timeout.

Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Erez Shitrit <erezsh@nvidia.com>
Reviewed-by: Shay Drori <shayd@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20260507173443.320465-2-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c

index 6dcd9c2a78aa868f845fea6dc80ea924f4d0447e..eae02bc74221881b0c8689d18d730d2fdec44e48 100644 (file)
@@ -422,6 +422,18 @@ int mlx5hws_bwc_queue_poll(struct mlx5hws_context *ctx,
        if (!got_comp && !drain)
                return 0;
 
+       if (unlikely(ctx->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)) {
+               /* If the device is down for any reason (e.g. FLR), the HW will
+                * no longer generate completions.
+                * Note that ETIMEDOUT is returned here because the BWC layer
+                * already has a special handling for timeouts - it breaks the
+                * rehash / resize / shrink loops to avoid chain of timeouts.
+                */
+               mlx5_core_warn_once(ctx->mdev,
+                                   "BWC poll: device is down, polling for completion aborted\n");
+               return -ETIMEDOUT;
+       }
+
        queue_full = mlx5hws_send_engine_full(&ctx->send_queue[queue_id]);
        while (queue_full || ((got_comp || drain) && *pending_rules)) {
                ret = mlx5hws_send_queue_poll(ctx, queue_id, comp, burst_th);