+++ /dev/null
-From 91adf137c885ef6e01eb88f38c2f113930228995 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Tue, 13 Jan 2026 11:08:13 -0500
-Subject: scsi: core: Wake up the error handler when final completions race
- against each other
-
-From: David Jeffery <djeffery@redhat.com>
-
-[ Upstream commit fe2f8ad6f0999db3b318359a01ee0108c703a8c3 ]
-
-The fragile ordering between marking commands completed or failed so
-that the error handler only wakes when the last running command
-completes or times out has race conditions. These race conditions can
-cause the SCSI layer to fail to wake the error handler, leaving I/O
-through the SCSI host stuck as the error state cannot advance.
-
-First, there is an memory ordering issue within scsi_dec_host_busy().
-The write which clears SCMD_STATE_INFLIGHT may be reordered with reads
-counting in scsi_host_busy(). While the local CPU will see its own
-write, reordering can allow other CPUs in scsi_dec_host_busy() or
-scsi_eh_inc_host_failed() to see a raised busy count, causing no CPU to
-see a host busy equal to the host_failed count.
-
-This race condition can be prevented with a memory barrier on the error
-path to force the write to be visible before counting host busy
-commands.
-
-Second, there is a general ordering issue with scsi_eh_inc_host_failed(). By
-counting busy commands before incrementing host_failed, it can race with a
-final command in scsi_dec_host_busy(), such that scsi_dec_host_busy() does
-not see host_failed incremented but scsi_eh_inc_host_failed() counts busy
-commands before SCMD_STATE_INFLIGHT is cleared by scsi_dec_host_busy(),
-resulting in neither waking the error handler task.
-
-This needs the call to scsi_host_busy() to be moved after host_failed is
-incremented to close the race condition.
-
-Fixes: 6eb045e092ef ("scsi: core: avoid host-wide host_busy counter for scsi_mq")
-Signed-off-by: David Jeffery <djeffery@redhat.com>
-Reviewed-by: Bart Van Assche <bvanassche@acm.org>
-Link: https://patch.msgid.link/20260113161036.6730-1-djeffery@redhat.com
-Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- drivers/scsi/scsi_error.c | 11 ++++++++++-
- drivers/scsi/scsi_lib.c | 8 ++++++++
- 2 files changed, 18 insertions(+), 1 deletion(-)
-
-diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
-index 2d5dc488f5117..08ad3c0af12ea 100644
---- a/drivers/scsi/scsi_error.c
-+++ b/drivers/scsi/scsi_error.c
-@@ -276,11 +276,20 @@ static void scsi_eh_inc_host_failed(struct rcu_head *head)
- {
- struct scsi_cmnd *scmd = container_of(head, typeof(*scmd), rcu);
- struct Scsi_Host *shost = scmd->device->host;
-- unsigned int busy = scsi_host_busy(shost);
-+ unsigned int busy;
- unsigned long flags;
-
- spin_lock_irqsave(shost->host_lock, flags);
- shost->host_failed++;
-+ spin_unlock_irqrestore(shost->host_lock, flags);
-+ /*
-+ * The counting of busy requests needs to occur after adding to
-+ * host_failed or after the lock acquire for adding to host_failed
-+ * to prevent a race with host unbusy and missing an eh wakeup.
-+ */
-+ busy = scsi_host_busy(shost);
-+
-+ spin_lock_irqsave(shost->host_lock, flags);
- scsi_eh_wakeup(shost, busy);
- spin_unlock_irqrestore(shost->host_lock, flags);
- }
-diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
-index c8be41d8eb247..576e0eb8f8f80 100644
---- a/drivers/scsi/scsi_lib.c
-+++ b/drivers/scsi/scsi_lib.c
-@@ -281,6 +281,14 @@ static void scsi_dec_host_busy(struct Scsi_Host *shost, struct scsi_cmnd *cmd)
- rcu_read_lock();
- __clear_bit(SCMD_STATE_INFLIGHT, &cmd->state);
- if (unlikely(scsi_host_in_recovery(shost))) {
-+ /*
-+ * Ensure the clear of SCMD_STATE_INFLIGHT is visible to
-+ * other CPUs before counting busy requests. Otherwise,
-+ * reordering can cause CPUs to race and miss an eh wakeup
-+ * when no CPU sees all busy requests as done or timed out.
-+ */
-+ smp_mb();
-+
- unsigned int busy = scsi_host_busy(shost);
-
- spin_lock_irqsave(shost->host_lock, flags);
---
-2.51.0
-