]> git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
scsi: ufs: core: Fix EH failure after W-LUN resume error
authorBrian Kao <powenkao@google.com>
Wed, 12 Nov 2025 06:32:02 +0000 (06:32 +0000)
committerMartin K. Petersen <martin.petersen@oracle.com>
Sat, 29 Nov 2025 19:58:38 +0000 (14:58 -0500)
When a W-LUN resume fails, its parent devices in the SCSI hierarchy,
including the scsi_target, may be runtime suspended. Subsequently, the
error handler in ufshcd_recover_pm_error() fails to set the W-LUN device
back to active because the parent target is not active.  This results in
the following errors:

  google-ufshcd 3c2d0000.ufs: ufshcd_err_handler started; HBA state eh_fatal; ...
  ufs_device_wlun 0:0:0:49488: START_STOP failed for power mode: 1, result 40000
  ufs_device_wlun 0:0:0:49488: ufshcd_wl_runtime_resume failed: -5
  ...
  ufs_device_wlun 0:0:0:49488: runtime PM trying to activate child device 0:0:0:49488 but parent (target0:0:0) is not active

Address this by:

 1. Ensuring the W-LUN's parent scsi_target is runtime resumed before
    attempting to set the W-LUN to active within
    ufshcd_recover_pm_error().

 2. Explicitly checking for power.runtime_error on the HBA and W-LUN
    devices before calling pm_runtime_set_active() to clear the error
    state.

 3. Adding pm_runtime_get_sync(hba->dev) in
    ufshcd_err_handling_prepare() to ensure the HBA itself is active
    during error recovery, even if a child device resume failed.

These changes ensure the device power states are managed correctly
during error recovery.

Signed-off-by: Brian Kao <powenkao@google.com>
Tested-by: Brian Kao <powenkao@google.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Link: https://patch.msgid.link/20251112063214.1195761-1-powenkao@google.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
drivers/ufs/core/ufshcd.c

index 040a0ceb170a7f0019bb1c63f3dd1326dfdfea0f..1b3fe1d8655e6dfa67b4086811317832bc236fbe 100644 (file)
@@ -6504,6 +6504,11 @@ static void ufshcd_clk_scaling_suspend(struct ufs_hba *hba, bool suspend)
 
 static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
 {
+       /*
+        * A WLUN resume failure could potentially lead to the HBA being
+        * runtime suspended, so take an extra reference on hba->dev.
+        */
+       pm_runtime_get_sync(hba->dev);
        ufshcd_rpm_get_sync(hba);
        if (pm_runtime_status_suspended(&hba->ufs_device_wlun->sdev_gendev) ||
            hba->is_sys_suspended) {
@@ -6543,6 +6548,7 @@ static void ufshcd_err_handling_unprepare(struct ufs_hba *hba)
        if (ufshcd_is_clkscaling_supported(hba))
                ufshcd_clk_scaling_suspend(hba, false);
        ufshcd_rpm_put(hba);
+       pm_runtime_put(hba->dev);
 }
 
 static inline bool ufshcd_err_handling_should_stop(struct ufs_hba *hba)
@@ -6557,28 +6563,42 @@ static inline bool ufshcd_err_handling_should_stop(struct ufs_hba *hba)
 #ifdef CONFIG_PM
 static void ufshcd_recover_pm_error(struct ufs_hba *hba)
 {
+       struct scsi_target *starget = hba->ufs_device_wlun->sdev_target;
        struct Scsi_Host *shost = hba->host;
        struct scsi_device *sdev;
        struct request_queue *q;
-       int ret;
+       bool resume_sdev_queues = false;
 
        hba->is_sys_suspended = false;
+
        /*
-        * Set RPM status of wlun device to RPM_ACTIVE,
-        * this also clears its runtime error.
+        * Ensure the parent's error status is cleared before proceeding
+        * to the child, as the parent must be active to activate the child.
         */
-       ret = pm_runtime_set_active(&hba->ufs_device_wlun->sdev_gendev);
+       if (hba->dev->power.runtime_error) {
+               /* hba->dev has no functional parent thus simplily set RPM_ACTIVE */
+               pm_runtime_set_active(hba->dev);
+               resume_sdev_queues = true;
+       }
+
+       if (hba->ufs_device_wlun->sdev_gendev.power.runtime_error) {
+               /*
+                * starget, parent of wlun, might be suspended if wlun resume failed.
+                * Make sure parent is resumed before set child (wlun) active.
+                */
+               pm_runtime_get_sync(&starget->dev);
+               pm_runtime_set_active(&hba->ufs_device_wlun->sdev_gendev);
+               pm_runtime_put_sync(&starget->dev);
+               resume_sdev_queues = true;
+       }
 
-       /* hba device might have a runtime error otherwise */
-       if (ret)
-               ret = pm_runtime_set_active(hba->dev);
        /*
         * If wlun device had runtime error, we also need to resume those
         * consumer scsi devices in case any of them has failed to be
         * resumed due to supplier runtime resume failure. This is to unblock
         * blk_queue_enter in case there are bios waiting inside it.
         */
-       if (!ret) {
+       if (resume_sdev_queues) {
                shost_for_each_device(sdev, shost) {
                        q = sdev->request_queue;
                        if (q->dev && (q->rpm_status == RPM_SUSPENDED ||