git.ipfire.org Git - thirdparty/openwrt.git/blob

   1 From 59a7f3aa7c3045b92bfde6fd342017053d2d304c Mon Sep 17 00:00:00 2001
   2 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= <mcanal@igalia.com>
   3 Date: Sun, 27 Apr 2025 17:28:21 -0300
   4 Subject: [PATCH] drm/v3d: Add job to pending list if the reset was skipped
   5 MIME-Version: 1.0
   6 Content-Type: text/plain; charset=UTF-8
   7 Content-Transfer-Encoding: 8bit
   8
   9 When a CL/CSD job times out, we check if the GPU has made any progress
  10 since the last timeout. If so, instead of resetting the hardware, we skip
  11 the reset and let the timer get rearmed. This gives long-running jobs a
  12 chance to complete.
  13
  14 However, when `timedout_job()` is called, the job in question is removed
  15 from the pending list, which means it won't be automatically freed through
  16 `free_job()`. Consequently, when we skip the reset and keep the job
  17 running, the job won't be freed when it finally completes.
  18
  19 This situation leads to a memory leak, as exposed in [1].
  20
  21 Similarly to commit 704d3d60fec4 ("drm/etnaviv: don't block scheduler when
  22 GPU is still active"), this patch ensures the job is put back on the
  23 pending list when extending the timeout.
  24
  25 Cc: stable@vger.kernel.org # 6.0
  26 Link: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12227 [1]
  27 Reported-by: Daivik Bhatia <dtgs1208@gmail.com>
  28 Signed-off-by: Maíra Canal <mcanal@igalia.com>
  29 ---
  30  drivers/gpu/drm/v3d/v3d_sched.c | 18 +++++++++++-------
  31  1 file changed, 11 insertions(+), 7 deletions(-)
  32
  33 --- a/drivers/gpu/drm/v3d/v3d_sched.c
  34 +++ b/drivers/gpu/drm/v3d/v3d_sched.c
  35 @@ -744,11 +744,6 @@ v3d_gpu_reset_for_timeout(struct v3d_dev
  36         return DRM_GPU_SCHED_STAT_NOMINAL;
  37  }
  38
  39 -/* If the current address or return address have changed, then the GPU
  40 - * has probably made progress and we should delay the reset.  This
  41 - * could fail if the GPU got in an infinite loop in the CL, but that
  42 - * is pretty unlikely outside of an i-g-t testcase.
  43 - */
  44  static enum drm_gpu_sched_stat
  45  v3d_cl_job_timedout(struct drm_sched_job *sched_job, enum v3d_queue q,
  46                     u32 *timedout_ctca, u32 *timedout_ctra)
  47 @@ -758,9 +753,16 @@ v3d_cl_job_timedout(struct drm_sched_job
  48         u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(q));
  49         u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(q));
  50
   51 +       /* If the current address or return address has changed, then the GPU
  52 +        * has probably made progress and we should delay the reset. This
  53 +        * could fail if the GPU got in an infinite loop in the CL, but that
  54 +        * is pretty unlikely outside of an i-g-t testcase.
  55 +        */
  56         if (*timedout_ctca != ctca || *timedout_ctra != ctra) {
  57                 *timedout_ctca = ctca;
  58                 *timedout_ctra = ctra;
  59 +
  60 +               list_add(&sched_job->list, &sched_job->sched->pending_list);
  61                 return DRM_GPU_SCHED_STAT_NOMINAL;
  62         }
  63
  64 @@ -800,11 +802,13 @@ v3d_csd_job_timedout(struct drm_sched_jo
  65         struct v3d_dev *v3d = job->base.v3d;
  66         u32 batches = V3D_CORE_READ(0, V3D_CSD_CURRENT_CFG4(v3d->ver));
  67
  68 -       /* If we've made progress, skip reset and let the timer get
  69 -        * rearmed.
  70 +       /* If we've made progress, skip reset, add the job to the pending
  71 +        * list, and let the timer get rearmed.
  72          */
  73         if (job->timedout_batches != batches) {
  74                 job->timedout_batches = batches;
  75 +
  76 +               list_add(&sched_job->list, &sched_job->sched->pending_list);
  77                 return DRM_GPU_SCHED_STAT_NOMINAL;
  78         }
  79