From: Boris Brezillon Date: Wed, 11 Dec 2024 07:54:18 +0000 (+0100) Subject: drm/panthor: Be robust against resume failures X-Git-Tag: v6.14-rc1~174^2~13^2~82 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=303e9e981db6c9f0ccd8067f0971416d929be426;p=thirdparty%2Fkernel%2Fstable.git drm/panthor: Be robust against resume failures When the runtime PM resume callback returns an error, it puts the device in a state where it can't be resumed anymore. Make sure we can recover from such transient failures by calling pm_runtime_set_suspended() explicitly after a pm_runtime_resume_and_get() failure. v3: - Add R-b/A-b v2: - Add a comment explaining potential races in panthor_device_resume_and_get() Signed-off-by: Boris Brezillon Reviewed-by: Adrian Larumbe Acked-by: Steven Price Link: https://patchwork.freedesktop.org/patch/msgid/20241211075419.2333731-5-boris.brezillon@collabora.com --- diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c index 4faca2684324c..5f9602bddc539 100644 --- a/drivers/gpu/drm/panthor/panthor_device.c +++ b/drivers/gpu/drm/panthor/panthor_device.c @@ -504,6 +504,7 @@ err_disable_core_clk: err_set_suspended: atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_SUSPENDED); + atomic_set(&ptdev->pm.recovery_needed, 1); return ret; } diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h index 0e68f5a70d206..b6c4f25a5d6e5 100644 --- a/drivers/gpu/drm/panthor/panthor_device.h +++ b/drivers/gpu/drm/panthor/panthor_device.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -180,6 +181,9 @@ struct panthor_device { * is suspended. */ struct page *dummy_latest_flush; + + /** @recovery_needed: True when a resume attempt failed. */ + atomic_t recovery_needed; } pm; /** @profile_mask: User-set profiling flags for job accounting. */ @@ -243,6 +247,28 @@ int panthor_device_mmap_io(struct panthor_device *ptdev, int panthor_device_resume(struct device *dev); int panthor_device_suspend(struct device *dev); +static inline int panthor_device_resume_and_get(struct panthor_device *ptdev) +{ + int ret = pm_runtime_resume_and_get(ptdev->base.dev); + + /* If the resume failed, we need to clear the runtime_error, which + * can done by forcing the RPM state to suspended. If multiple + * threads called panthor_device_resume_and_get(), we only want + * one of them to update the state, hence the cmpxchg. Note that a + * thread might enter panthor_device_resume_and_get() and call + * pm_runtime_resume_and_get() after another thread had attempted + * to resume and failed. This means we will end up with an error + * without even attempting a resume ourselves. The only risk here + * is to report an error when the second resume attempt might have + * succeeded. Given resume errors are not expected, this is probably + * something we can live with. + */ + if (ret && atomic_cmpxchg(&ptdev->pm.recovery_needed, 1, 0) == 1) + pm_runtime_set_suspended(ptdev->base.dev); + + return ret; +} + enum drm_panthor_exception_type { DRM_PANTHOR_EXCEPTION_OK = 0x00, DRM_PANTHOR_EXCEPTION_TERMINATED = 0x04, diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c index 32421bad3097d..33cfba8775254 100644 --- a/drivers/gpu/drm/panthor/panthor_drv.c +++ b/drivers/gpu/drm/panthor/panthor_drv.c @@ -763,7 +763,7 @@ static int panthor_query_timestamp_info(struct panthor_device *ptdev, { int ret; - ret = pm_runtime_resume_and_get(ptdev->base.dev); + ret = panthor_device_resume_and_get(ptdev); if (ret) return ret; diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c index ef4bec7ff9c71..58200d299fbb7 100644 --- a/drivers/gpu/drm/panthor/panthor_sched.c +++ b/drivers/gpu/drm/panthor/panthor_sched.c @@ -2354,7 +2354,7 @@ static void tick_work(struct work_struct *work) if (!drm_dev_enter(&ptdev->base, &cookie)) return; - ret = pm_runtime_resume_and_get(ptdev->base.dev); + ret = panthor_device_resume_and_get(ptdev); if (drm_WARN_ON(&ptdev->base, ret)) goto out_dev_exit; @@ -3115,7 +3115,7 @@ queue_run_job(struct drm_sched_job *sched_job) return dma_fence_get(job->done_fence); } - ret = pm_runtime_resume_and_get(ptdev->base.dev); + ret = panthor_device_resume_and_get(ptdev); if (drm_WARN_ON(&ptdev->base, ret)) return ERR_PTR(ret);