net/mlx5: HWS, Check if device is down while polling for completion

author Yevgeny Kliteynik <kliteyn@nvidia.com>

Thu, 7 May 2026 17:34:41 +0000 (20:34 +0300)

committer Jakub Kicinski <kuba@kernel.org>

Sun, 10 May 2026 17:22:25 +0000 (10:22 -0700)
author Yevgeny Kliteynik <kliteyn@nvidia.com>
Thu, 7 May 2026 17:34:41 +0000 (20:34 +0300)
committer Jakub Kicinski <kuba@kernel.org>
Sun, 10 May 2026 17:22:25 +0000 (10:22 -0700)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c

index 6dcd9c2a78aa868f845fea6dc80ea924f4d0447e..eae02bc74221881b0c8689d18d730d2fdec44e48 100644 (file)
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
@@ -422,6 +422,18 @@ int mlx5hws_bwc_queue_poll(struct mlx5hws_context *ctx,
         if (!got_comp && !drain)
                 return 0;
  
+       if (unlikely(ctx->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)) {
+               /* If the device is down for any reason (e.g. FLR), the HW will
+                * no longer generate completions.
+                * Note that ETIMEDOUT is returned here because the BWC layer
+                * already has special handling for timeouts - it breaks the
+                * rehash / resize / shrink loops to avoid a chain of timeouts.
+                */
+               mlx5_core_warn_once(ctx->mdev,
+                                   "BWC poll: device is down, polling for completion aborted\n");
+               return -ETIMEDOUT;
+       }
+
         queue_full = mlx5hws_send_engine_full(&ctx->send_queue[queue_id]);
         while (queue_full || ((got_comp || drain) && *pending_rules)) {
                 ret = mlx5hws_send_queue_poll(ctx, queue_id, comp, burst_th);
author	Yevgeny Kliteynik <kliteyn@nvidia.com>
	Thu, 7 May 2026 17:34:41 +0000 (20:34 +0300)
committer	Jakub Kicinski <kuba@kernel.org>
	Sun, 10 May 2026 17:22:25 +0000 (10:22 -0700)