drm/sched: Avoid memory leaks with cancel_job() callback

author Philipp Stanner <phasta@kernel.org>
Thu, 10 Jul 2025 12:54:06 +0000 (14:54 +0200)
committer Philipp Stanner <phasta@kernel.org>
Thu, 10 Jul 2025 15:07:08 +0000 (17:07 +0200)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c

index 81ad40d9582bce1e2e901ff8bd4c70862008878a..a971f0c9e6e08af05f7cc27aee749cd746bd3cd0 100644 (file)
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -1352,6 +1352,18 @@ Out_check_own:
  }
  EXPORT_SYMBOL(drm_sched_init);
  
+static void drm_sched_cancel_remaining_jobs(struct drm_gpu_scheduler *sched)
+{
+       struct drm_sched_job *job, *tmp;
+
+       /* Caller checked ops->cancel_job. All other accessors are stopped, so no locking is necessary. */
+       list_for_each_entry_safe_reverse(job, tmp, &sched->pending_list, list) {
+               sched->ops->cancel_job(job); /* driver signals the job's fence with an error; must not free */
+               list_del(&job->list); /* unlink before handing the job to free_job() */
+               sched->ops->free_job(job); /* driver cleans up the job */
+       }
+}
+
  /**
   * drm_sched_fini - Destroy a gpu scheduler
   *
@@ -1359,19 +1371,11 @@ EXPORT_SYMBOL(drm_sched_init);
   *
   * Tears down and cleans up the scheduler.
   *
- * This stops submission of new jobs to the hardware through
- * drm_sched_backend_ops.run_job(). Consequently, drm_sched_backend_ops.free_job()
- * will not be called for all jobs still in drm_gpu_scheduler.pending_list.
- * There is no solution for this currently. Thus, it is up to the driver to make
- * sure that:
- *
- *  a) drm_sched_fini() is only called after for all submitted jobs
- *     drm_sched_backend_ops.free_job() has been called or that
- *  b) the jobs for which drm_sched_backend_ops.free_job() has not been called
- *     after drm_sched_fini() ran are freed manually.
- *
- * FIXME: Take care of the above problem and prevent this function from leaking
- * the jobs in drm_gpu_scheduler.pending_list under any circumstances.
+ * This stops submission of new jobs to the hardware through &struct
+ * drm_sched_backend_ops.run_job. If &struct drm_sched_backend_ops.cancel_job
+ * is implemented, all jobs will be canceled through it and afterwards cleaned
+ * up through &struct drm_sched_backend_ops.free_job. If cancel_job is not
+ * implemented, memory could leak.
   */
  void drm_sched_fini(struct drm_gpu_scheduler *sched)
  {
@@ -1401,6 +1405,10 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
         /* Confirm no work left behind accessing device structures */
         cancel_delayed_work_sync(&sched->work_tdr);
  
+       /* Avoid memory leaks if supported by the driver. */
+       if (sched->ops->cancel_job)
+               drm_sched_cancel_remaining_jobs(sched);
+
         if (sched->own_submit_wq)
                 destroy_workqueue(sched->submit_wq);
         sched->ready = false;
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h

index e62a7214e05217d72de5c6e5168544d47099090a..190844370f48aee6ac9734ddfad65c8718f0a52a 100644 (file)
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -512,6 +512,24 @@ struct drm_sched_backend_ops {
           * and it's time to clean it up.
          */
         void (*free_job)(struct drm_sched_job *sched_job);
+
+       /**
+        * @cancel_job: Used by the scheduler to guarantee remaining jobs' fences
+        * get signaled in drm_sched_fini().
+        *
+        * Used by the scheduler to cancel all jobs that have not been executed
+        * with &struct drm_sched_backend_ops.run_job by the time
+        * drm_sched_fini() gets invoked.
+        *
+        * Drivers need to signal the passed job's hardware fence with an
+        * appropriate error code (e.g., -ECANCELED) in this callback. They
+        * must not free the job.
+        *
+        * The scheduler will only call this callback once it stopped calling
+        * all other callbacks forever, with the exception of &struct
+        * drm_sched_backend_ops.free_job.
+        */
+       void (*cancel_job)(struct drm_sched_job *sched_job);
  };
  
  /**
Files changed:
drivers/gpu/drm/scheduler/sched_main.c
include/drm/gpu_scheduler.h