drm/sched: add optional errno to drm_sched_start()
author     Christian König <ckoenig.leichtzumerken@gmail.com>
           Mon, 26 Aug 2024 12:25:38 +0000 (14:25 +0200)
committer  Christian König <christian.koenig@amd.com>
           Fri, 6 Sep 2024 16:05:52 +0000 (18:05 +0200)
The current implementation of drm_sched_start uses a hardcoded
-ECANCELED to dispose of a job when the parent/hw fence is NULL.
This results in drm_sched_job_done being called with -ECANCELED for
each job with a NULL parent in the pending list, making it difficult
to tell which recovery method was used, a queue reset or a full
GPU reset.

To improve this, we first try a soft recovery for timed-out jobs and
use the error code -ENODATA. If soft recovery fails, we proceed with
a queue reset, where the error code remains -ENODATA for the job.
Finally, for a full GPU reset, we use the error codes -ECANCELED or
-ETIME. This patch adds an error code parameter to drm_sched_start,
allowing callers to differentiate between queue reset and GPU reset
failures. This enables user mode and test applications to validate
that the requested recovery operation behaved as expected. After a
successful queue reset, the only way to continue normal operation is
to call drm_sched_job_done with the specific error code -ENODATA.
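
As an illustration of the new parameter (a minimal sketch, not part of
this patch; my_try_queue_reset() and my_full_gpu_reset() are
hypothetical driver helpers), a driver's timeout handler could pick the
errno according to which recovery path succeeded:

	static enum drm_gpu_sched_stat my_timedout_job(struct drm_sched_job *s_job)
	{
		struct drm_gpu_scheduler *sched = s_job->sched;

		/* Park the scheduler and mark s_job as the offending job. */
		drm_sched_stop(sched, s_job);

		if (my_try_queue_reset(sched) == 0) {
			/* Queue reset worked: pending jobs complete with -ENODATA. */
			drm_sched_start(sched, -ENODATA);
		} else {
			/* Fall back to a full GPU reset and dispose of the jobs. */
			my_full_gpu_reset(sched);
			drm_sched_start(sched, -ECANCELED);
		}

		return DRM_GPU_SCHED_STAT_NOMINAL;
	}

Passing 0 keeps the old behaviour: jobs without a parent fence are
still disposed of with -ECANCELED, as all existing callers below do.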

v1: Initial implementation by Jesse utilized amdgpu_device_lock_reset_domain
    and amdgpu_device_unlock_reset_domain to allow user mode to track
    the queue reset status and distinguish between queue reset and
    GPU reset.
v2: Christian suggested using the error codes -ENODATA for queue reset
    and -ECANCELED or -ETIME for GPU reset, returned to
    amdgpu_cs_wait_ioctl.
v3: To meet the requirements, we introduce a new function
    drm_sched_start_ex with an additional parameter to set
    dma_fence_set_error, allowing us to handle the specific error
    codes appropriately and dispose of bad jobs with the selected
    error code depending on whether it was a queue reset or GPU reset.
v4: Alex suggested using a new name, drm_sched_start_with_recovery_error,
    which more accurately describes the function's purpose.
    Additionally, it was recommended to add documentation details
    about the new method.
v5: Fixed the declaration of the new function
    drm_sched_start_with_recovery_error. (Alex)
v6 (chk): rebase on upstream changes, cleanup the commit message,
          drop the new function again and update all callers,
          apply the errno also to scheduler fences with hw fences
v7 (chk): rebased

Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
Signed-off-by: Vitaly Prosyak <vitaly.prosyak@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240826122541.85663-1-christian.koenig@amd.com
13 files changed:
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
drivers/gpu/drm/etnaviv/etnaviv_sched.c
drivers/gpu/drm/imagination/pvr_queue.c
drivers/gpu/drm/lima/lima_sched.c
drivers/gpu/drm/nouveau/nouveau_sched.c
drivers/gpu/drm/panfrost/panfrost_job.c
drivers/gpu/drm/panthor/panthor_mmu.c
drivers/gpu/drm/panthor/panthor_sched.c
drivers/gpu/drm/scheduler/sched_main.c
drivers/gpu/drm/v3d/v3d_sched.c
include/drm/gpu_scheduler.h

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index 73b2b401b450db73b9eec7a276c38e240dafaa4b..30c5172d208ed7193fa65f621b075767452cf555 100644
@@ -300,7 +300,7 @@ static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool sus
                        if (r)
                                goto out;
                } else {
-                       drm_sched_start(&ring->sched);
+                       drm_sched_start(&ring->sched, 0);
                }
        }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 49ef22dcf7fb59df80a165e4a721cf0bcdda97cc..0aad05b4f32eef0f36b8c03a4380952c0ead38aa 100644
@@ -5907,7 +5907,7 @@ skip_hw_reset:
                        if (!amdgpu_ring_sched_ready(ring))
                                continue;
 
-                       drm_sched_start(&ring->sched);
+                       drm_sched_start(&ring->sched, 0);
                }
 
                if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
@@ -6414,7 +6414,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
                if (!amdgpu_ring_sched_ready(ring))
                        continue;
 
-               drm_sched_start(&ring->sched);
+               drm_sched_start(&ring->sched, 0);
        }
 
        amdgpu_device_unset_mp1_state(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 597489dea114e0cf8820d0f34e606450f8376f12..3bb9d1ca74b8abdcd6d702db41d2150ebecaf131 100644
@@ -87,7 +87,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
                        atomic_inc(&ring->adev->gpu_reset_counter);
                        amdgpu_fence_driver_force_completion(ring);
                        if (amdgpu_ring_sched_ready(ring))
-                               drm_sched_start(&ring->sched);
+                               drm_sched_start(&ring->sched, 0);
                        goto exit;
                }
        }
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
index ab9ca4824b62e130f0e4f8c8bf1935ce9a689d87..23ced5896c7cd83d375313aa811ff0c4d52f2afa 100644
@@ -72,7 +72,7 @@ static enum drm_gpu_sched_stat etnaviv_sched_timedout_job(struct drm_sched_job
 
        drm_sched_resubmit_jobs(&gpu->sched);
 
-       drm_sched_start(&gpu->sched);
+       drm_sched_start(&gpu->sched, 0);
        return DRM_GPU_SCHED_STAT_NOMINAL;
 
 out_no_timeout:
diff --git a/drivers/gpu/drm/imagination/pvr_queue.c b/drivers/gpu/drm/imagination/pvr_queue.c
index 20cb4601208214da59a7c29d065a735401ffe2c9..c4f08432882b12f5cdfeb7fc991fd941f0946676 100644
@@ -782,7 +782,7 @@ static void pvr_queue_start(struct pvr_queue *queue)
                }
        }
 
-       drm_sched_start(&queue->scheduler);
+       drm_sched_start(&queue->scheduler, 0);
 }
 
 /**
@@ -842,7 +842,7 @@ pvr_queue_timedout_job(struct drm_sched_job *s_job)
        }
        mutex_unlock(&pvr_dev->queues.lock);
 
-       drm_sched_start(sched);
+       drm_sched_start(sched, 0);
 
        return DRM_GPU_SCHED_STAT_NOMINAL;
 }
diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
index 1a944edb6ddc6cdd90b1130d62d6ecb68455e0ef..b40c90e97d7eb3b3c43670bc0cd1bd217a7ce1af 100644
@@ -463,7 +463,7 @@ static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job
        lima_pm_idle(ldev);
 
        drm_sched_resubmit_jobs(&pipe->base);
-       drm_sched_start(&pipe->base);
+       drm_sched_start(&pipe->base, 0);
 
        return DRM_GPU_SCHED_STAT_NOMINAL;
 }
diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c b/drivers/gpu/drm/nouveau/nouveau_sched.c
index eb6c3f9a01f54a48dd18d23a3bc29ca9982c29b9..4412f2711fb5557838ea2e590533cfbbdbb9145a 100644
@@ -379,7 +379,7 @@ nouveau_sched_timedout_job(struct drm_sched_job *sched_job)
        else
                NV_PRINTK(warn, job->cli, "Generic job timeout.\n");
 
-       drm_sched_start(sched);
+       drm_sched_start(sched, 0);
 
        return stat;
 }
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index 3ad131eb66577bd17ab528b4a5595dedc0d2b759..9b8e82fb8bc4cea30821e3ed8bb7007937ac0b53 100644
@@ -733,7 +733,7 @@ panfrost_reset(struct panfrost_device *pfdev,
 
        /* Restart the schedulers */
        for (i = 0; i < NUM_JOB_SLOTS; i++)
-               drm_sched_start(&pfdev->js->queue[i].sched);
+               drm_sched_start(&pfdev->js->queue[i].sched, 0);
 
        /* Re-enable job interrupts now that everything has been restarted. */
        job_write(pfdev, JOB_INT_MASK,
diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index cd2bac54e761123a513336ef63a6e398b2d7e462..7d0a90559182fd723c16abc9766d7ed713c49e11 100644
@@ -827,7 +827,7 @@ static void panthor_vm_stop(struct panthor_vm *vm)
 
 static void panthor_vm_start(struct panthor_vm *vm)
 {
-       drm_sched_start(&vm->sched);
+       drm_sched_start(&vm->sched, 0);
 }
 
 /**
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index c426a392b08117c00524b068b087d49bf1cd97f5..d3246f7d95917d2b13c5164b79c9be7faeade55e 100644
@@ -2538,7 +2538,7 @@ static void queue_start(struct panthor_queue *queue)
        list_for_each_entry(job, &queue->scheduler.pending_list, base.list)
                job->base.s_fence->parent = dma_fence_get(job->done_fence);
 
-       drm_sched_start(&queue->scheduler);
+       drm_sched_start(&queue->scheduler, 0);
 }
 
 static void panthor_group_stop(struct panthor_group *group)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index ab53ab486fe69daca7e9537afd8bd0445dd4498e..f093616fe53c1f99e10c997b94f21f7be1a265a2 100644
@@ -674,9 +674,10 @@ EXPORT_SYMBOL(drm_sched_stop);
  * drm_sched_start - recover jobs after a reset
  *
  * @sched: scheduler instance
+ * @errno: error to set on the pending fences
  *
  */
-void drm_sched_start(struct drm_gpu_scheduler *sched)
+void drm_sched_start(struct drm_gpu_scheduler *sched, int errno)
 {
        struct drm_sched_job *s_job, *tmp;
 
@@ -691,13 +692,13 @@ void drm_sched_start(struct drm_gpu_scheduler *sched)
                atomic_add(s_job->credits, &sched->credit_count);
 
                if (!fence) {
-                       drm_sched_job_done(s_job, -ECANCELED);
+                       drm_sched_job_done(s_job, errno ?: -ECANCELED);
                        continue;
                }
 
                if (dma_fence_add_callback(fence, &s_job->cb,
                                           drm_sched_job_done_cb))
-                       drm_sched_job_done(s_job, fence->error);
+                       drm_sched_job_done(s_job, fence->error ?: errno);
        }
 
        drm_sched_start_timeout_unlocked(sched);
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index fa6859dd8368463a8e13d8d24c169da4272f1a95..090bbaebb4961a9bb4fd7f54caec8f8557d3167b 100644
@@ -661,7 +661,7 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
 
        /* Unblock schedulers and restart their jobs. */
        for (q = 0; q < V3D_MAX_QUEUES; q++) {
-               drm_sched_start(&v3d->queue[q].sched);
+               drm_sched_start(&v3d->queue[q].sched, 0);
        }
 
        mutex_unlock(&v3d->reset_lock);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index fe8edb917360793f3645e3399115fb7a15278f3c..a8d19b10f9b86e90f5b1d487006e55c94ad21987 100644
@@ -579,7 +579,7 @@ bool drm_sched_wqueue_ready(struct drm_gpu_scheduler *sched);
 void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched);
 void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched);
 void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
-void drm_sched_start(struct drm_gpu_scheduler *sched);
+void drm_sched_start(struct drm_gpu_scheduler *sched, int errno);
 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
 void drm_sched_increase_karma(struct drm_sched_job *bad);
 void drm_sched_reset_karma(struct drm_sched_job *bad);