From 6a4d287a1ae6e49f8ef57fcb2a512c2b0bbef966 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sun, 29 Jun 2025 13:13:06 -0700 Subject: [PATCH] drm/msm: Mark VM as unusable on GPU hangs If userspace has opted-in to VM_BIND, then GPU hangs and VM_BIND errors will mark the VM as unusable. Signed-off-by: Rob Clark Signed-off-by: Rob Clark Tested-by: Antonino Maniscalco Reviewed-by: Antonino Maniscalco Patchwork: https://patchwork.freedesktop.org/patch/661499/ --- drivers/gpu/drm/msm/msm_gem.h | 17 +++++++++++++++++ drivers/gpu/drm/msm/msm_gem_submit.c | 3 +++ drivers/gpu/drm/msm/msm_gpu.c | 16 ++++++++++++++-- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index b5bf21f62f9d4..f2631a8c62b95 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -76,6 +76,23 @@ struct msm_gem_vm { /** @managed: is this a kernel managed VM? */ bool managed; + + /** + * @unusable: True if the VM has turned unusable because something + * bad happened during an asynchronous request. + * + * We don't try to recover from such failures, because this implies + * informing userspace about the specific operation that failed, and + * hoping the userspace driver can replay things from there. This all + * sounds very complicated for little gain. + * + * Instead, we should just flag the VM as unusable, and fail any + * further request targeting this VM. + * + * As an analogy, this would be mapped to a VK_ERROR_DEVICE_LOST + * situation, where the logical device needs to be re-created. + */ + bool unusable; }; #define to_msm_vm(x) container_of(x, struct msm_gem_vm, base) diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index daf05c380e840..50fafcacf035d 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -681,6 +681,9 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, if (args->pad) return -EINVAL; + if (to_msm_vm(ctx->vm)->unusable) + return UERR(EPIPE, dev, "context is unusable"); + /* for now, we just have 3d pipe.. eventually this would need to * be more clever to dispatch to appropriate gpu module: */ diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index c08c942d85a07..0846f6c5169f7 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -389,8 +389,20 @@ static void recover_worker(struct kthread_work *work) /* Increment the fault counts */ submit->queue->faults++; - if (submit->vm) - to_msm_vm(submit->vm)->faults++; + if (submit->vm) { + struct msm_gem_vm *vm = to_msm_vm(submit->vm); + + vm->faults++; + + /* + * If userspace has opted-in to VM_BIND (and therefore userspace + * management of the VM), faults mark the VM as unusuable. This + * matches vulkan expectations (vulkan is the main target for + * VM_BIND) + */ + if (!vm->managed) + vm->unusable = true; + } get_comm_cmdline(submit, &comm, &cmd); -- 2.47.2