From: Hongjie Fang Date: Fri, 5 Jun 2026 11:20:34 +0000 (+0800) Subject: scsi: ufs: core: Handle PM commands timeout before SCSI EH X-Git-Url: http://git.ipfire.org/gitweb/?a=commitdiff_plain;h=01d5e237b33931b970dd190dd6a19c5ef32c105d;p=thirdparty%2Fkernel%2Flinux.git scsi: ufs: core: Handle PM commands timeout before SCSI EH A PM START STOP sent from the UFS well-known LU resume path can race with SCSI EH: The "wl resume" task flow is: __ufshcd_wl_resume() ufshcd_set_dev_pwr_mode(UFS_ACTIVE_PWR_MODE) ufshcd_execute_start_stop() scsi_execute_cmd() blk_execute_rq <-- wait scsi_check_passthrough() <-- may retry START STOP If the first START STOP times out, SCSI EH may already have recovered the link and reset the device before scsi_execute_cmd() returns: scsi_timeout() scsi_eh_scmd_add() scsi_error_handler() scsi_unjam_host() scsi_eh_ready_devs() scsi_eh_host_reset() ufshcd_eh_host_reset_handler() if (hba->pm_op_in_progress) ufshcd_link_recovery() ufshcd_device_reset() ufshcd_host_reset_and_restore() ... scsi_eh_flush_done_q() <-- wakeup "wl resume" task ... <-- host still in SHOST_RECOVERY scsi_restart_operations() A later passthrough retry can then run while the host is still in SHOST_RECOVERY and hit the SCMD_FAIL_IF_RECOVERING path: scsi_queue_rq() if (scsi_host_in_recovery(shost) && cmd->flags & SCMD_FAIL_IF_RECOVERING) return BLK_STS_OFFLINE That retry completes with DID_ERROR or DID_NO_CONNECT even though EH may already have restored the device to an operational ACTIVE state. Handle these PM timeouts directly from ufshcd_eh_timed_out() instead. After ufshcd_link_recovery(), complete the timed-out command immediately if it has not been completed already. For regular SCSI commands, complete them with DID_REQUEUE to match the existing MCQ force-completion semantics and allow scsi_execute_cmd() to retry if needed. 
For reserved internal device-management commands, finish the request with DID_TIME_OUT without calling ufshcd_release_scsi_cmd() since those commands use different resource lifetime rules. The system_suspending flag is no longer needed because PM command timeout handling now uses pm_op_in_progress. Fixes: b8c3a7bac9b6 ("scsi: ufs: Have midlayer retry start stop errors") Signed-off-by: Hongjie Fang Reviewed-by: Bart Van Assche Reviewed-by: Peter Wang Link: https://patch.msgid.link/20260605112034.3802540-1-hongjiefang@asrmicro.com Signed-off-by: Martin K. Petersen --- diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 60ec2c63c2d8..2bbab3b2f656 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -9484,22 +9484,44 @@ static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd) { struct ufs_hba *hba = shost_priv(scmd->device->host); - if (!hba->system_suspending) { + if (!hba->pm_op_in_progress) { /* Activate the error handler in the SCSI core. */ return SCSI_EH_NOT_HANDLED; } /* - * If we get here we know that no TMFs are outstanding and also that - * the only pending command is a START STOP UNIT command. Handle the - * timeout of that command directly to prevent a deadlock between + * Handle the timeout directly to prevent a deadlock between * ufshcd_set_dev_pwr_mode() and ufshcd_err_handler(). */ ufshcd_link_recovery(hba); dev_info(hba->dev, "%s() finished; outstanding_tasks = %#lx.\n", __func__, hba->outstanding_tasks); - return scsi_host_busy(hba->host) ? SCSI_EH_RESET_TIMER : SCSI_EH_DONE; + /* + * ufshcd_link_recovery() may already have completed @scmd, e.g. via + * the existing MCQ force-completion path. 
+ */ + if (!test_bit(SCMD_STATE_COMPLETE, &scmd->state)) { + if (!hba->mcq_enabled) { + unsigned long flags; + struct request *rq = scsi_cmd_to_rq(scmd); + + spin_lock_irqsave(&hba->outstanding_lock, flags); + __clear_bit(rq->tag, &hba->outstanding_reqs); + spin_unlock_irqrestore(&hba->outstanding_lock, flags); + } + + if (ufshcd_is_scsi_cmd(scmd)) { + set_host_byte(scmd, DID_REQUEUE); + ufshcd_release_scsi_cmd(hba, scmd); + } else { + set_host_byte(scmd, DID_TIME_OUT); + } + + scsi_done(scmd); + } + + return SCSI_EH_DONE; } static const struct attribute_group *ufshcd_driver_groups[] = { @@ -10523,7 +10545,6 @@ static int ufshcd_wl_suspend(struct device *dev) hba = shost_priv(sdev->host); down(&hba->host_sem); - hba->system_suspending = true; if (pm_runtime_suspended(dev)) goto out; @@ -10565,7 +10586,6 @@ out: hba->curr_dev_pwr_mode, hba->uic_link_state); if (!ret) hba->is_sys_suspended = false; - hba->system_suspending = false; up(&hba->host_sem); return ret; } diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index 3eaae082329c..248d0a5bef40 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -1029,8 +1029,6 @@ enum ufshcd_mcq_opr { * @caps: bitmask with information about UFS controller capabilities * @devfreq: frequency scaling information owned by the devfreq core * @clk_scaling: frequency scaling information owned by the UFS driver - * @system_suspending: system suspend has been started and system resume has - * not yet finished. * @is_sys_suspended: UFS device has been suspended because of system suspend * @urgent_bkops_lvl: keeps track of urgent bkops level for device * @is_urgent_bkops_lvl_checked: keeps track if the urgent bkops level for @@ -1206,7 +1204,6 @@ struct ufs_hba { struct devfreq *devfreq; struct ufs_clk_scaling clk_scaling; - bool system_suspending; bool is_sys_suspended; enum bkops_status urgent_bkops_lvl;