]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
scsi: ufs: core: Handle PM commands timeout before SCSI EH
author Hongjie Fang <hongjiefang@asrmicro.com>
Fri, 5 Jun 2026 11:20:34 +0000 (19:20 +0800)
committer Martin K. Petersen <martin.petersen@oracle.com>
Mon, 8 Jun 2026 21:41:45 +0000 (17:41 -0400)
A PM START STOP sent from the UFS well-known LU resume path can race
with SCSI EH:

The "wl resume" task flow is:
  __ufshcd_wl_resume()
    ufshcd_set_dev_pwr_mode(UFS_ACTIVE_PWR_MODE)
      ufshcd_execute_start_stop()
        scsi_execute_cmd()
          blk_execute_rq           <-- wait
          scsi_check_passthrough() <-- may retry START STOP

If the first START STOP times out, SCSI EH may already have recovered the
link and reset the device before scsi_execute_cmd() returns:

  scsi_timeout()
    scsi_eh_scmd_add()
      scsi_error_handler()
        scsi_unjam_host()
          scsi_eh_ready_devs()
            scsi_eh_host_reset()
              ufshcd_eh_host_reset_handler()
                if (hba->pm_op_in_progress)
                  ufshcd_link_recovery()
                    ufshcd_device_reset()
                    ufshcd_host_reset_and_restore()
          ...
          scsi_eh_flush_done_q()   <-- wakeup "wl resume" task
        ...                        <-- host still in SHOST_RECOVERY
        scsi_restart_operations()

A later passthrough retry can then run while the host is still in
SHOST_RECOVERY and hit the SCMD_FAIL_IF_RECOVERING path:

  scsi_queue_rq()
    if (scsi_host_in_recovery(shost) &&
        cmd->flags & SCMD_FAIL_IF_RECOVERING)
      return BLK_STS_OFFLINE

That retry completes with DID_ERROR or DID_NO_CONNECT even though EH may
already have restored the device to an operational ACTIVE state.

Handle these PM timeouts directly from ufshcd_eh_timed_out() instead.
After ufshcd_link_recovery(), complete the timed-out command immediately
if it has not been completed already.

For regular SCSI commands, complete them with DID_REQUEUE to match the
existing MCQ force-completion semantics and allow scsi_execute_cmd() to
retry if needed. For reserved internal device-management commands,
finish the request with DID_TIME_OUT without calling
ufshcd_release_scsi_cmd() since those commands use different resource
lifetime rules.

The system_suspending flag is no longer needed because PM command
timeout handling now uses pm_op_in_progress.

Fixes: b8c3a7bac9b6 ("scsi: ufs: Have midlayer retry start stop errors")
Signed-off-by: Hongjie Fang <hongjiefang@asrmicro.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Peter Wang <peter.wang@mediatek.com>
Link: https://patch.msgid.link/20260605112034.3802540-1-hongjiefang@asrmicro.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
drivers/ufs/core/ufshcd.c
include/ufs/ufshcd.h

index 60ec2c63c2d87ebc6c551e8c99a2d3fdb7b708ca..2bbab3b2f6564a685428d1ce3c5f82d17dcbd166 100644 (file)
@@ -9484,22 +9484,44 @@ static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd)
 {
        struct ufs_hba *hba = shost_priv(scmd->device->host);
 
-       if (!hba->system_suspending) {
+       if (!hba->pm_op_in_progress) {
                /* Activate the error handler in the SCSI core. */
                return SCSI_EH_NOT_HANDLED;
        }
 
        /*
-        * If we get here we know that no TMFs are outstanding and also that
-        * the only pending command is a START STOP UNIT command. Handle the
-        * timeout of that command directly to prevent a deadlock between
+        * Handle the timeout directly to prevent a deadlock between
         * ufshcd_set_dev_pwr_mode() and ufshcd_err_handler().
         */
        ufshcd_link_recovery(hba);
        dev_info(hba->dev, "%s() finished; outstanding_tasks = %#lx.\n",
                 __func__, hba->outstanding_tasks);
 
-       return scsi_host_busy(hba->host) ? SCSI_EH_RESET_TIMER : SCSI_EH_DONE;
+       /*
+        * ufshcd_link_recovery() may already have completed @scmd, e.g. via
+        * the existing MCQ force-completion path.
+        */
+       if (!test_bit(SCMD_STATE_COMPLETE, &scmd->state)) {
+               if (!hba->mcq_enabled) {
+                       unsigned long flags;
+                       struct request *rq = scsi_cmd_to_rq(scmd);
+
+                       spin_lock_irqsave(&hba->outstanding_lock, flags);
+                       __clear_bit(rq->tag, &hba->outstanding_reqs);
+                       spin_unlock_irqrestore(&hba->outstanding_lock, flags);
+               }
+
+               if (ufshcd_is_scsi_cmd(scmd)) {
+                       set_host_byte(scmd, DID_REQUEUE);
+                       ufshcd_release_scsi_cmd(hba, scmd);
+               } else {
+                       set_host_byte(scmd, DID_TIME_OUT);
+               }
+
+               scsi_done(scmd);
+       }
+
+       return SCSI_EH_DONE;
 }
 
 static const struct attribute_group *ufshcd_driver_groups[] = {
@@ -10523,7 +10545,6 @@ static int ufshcd_wl_suspend(struct device *dev)
 
        hba = shost_priv(sdev->host);
        down(&hba->host_sem);
-       hba->system_suspending = true;
 
        if (pm_runtime_suspended(dev))
                goto out;
@@ -10565,7 +10586,6 @@ out:
                hba->curr_dev_pwr_mode, hba->uic_link_state);
        if (!ret)
                hba->is_sys_suspended = false;
-       hba->system_suspending = false;
        up(&hba->host_sem);
        return ret;
 }
index 3eaae082329c95d4b70dda9924f768af1732ff6d..248d0a5bef4077672c0c11baccc7ce0f88ec9bfd 100644 (file)
@@ -1029,8 +1029,6 @@ enum ufshcd_mcq_opr {
  * @caps: bitmask with information about UFS controller capabilities
  * @devfreq: frequency scaling information owned by the devfreq core
  * @clk_scaling: frequency scaling information owned by the UFS driver
- * @system_suspending: system suspend has been started and system resume has
- *     not yet finished.
  * @is_sys_suspended: UFS device has been suspended because of system suspend
  * @urgent_bkops_lvl: keeps track of urgent bkops level for device
  * @is_urgent_bkops_lvl_checked: keeps track if the urgent bkops level for
@@ -1206,7 +1204,6 @@ struct ufs_hba {
 
        struct devfreq *devfreq;
        struct ufs_clk_scaling clk_scaling;
-       bool system_suspending;
        bool is_sys_suspended;
 
        enum bkops_status urgent_bkops_lvl;