From: Christian König Date: Wed, 28 Jan 2026 12:58:14 +0000 (+0100) Subject: drm/amdgpu: completely rework eviction fence handling v2 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2cd7284ba54b22869c301fba7aff9ec96a12f8c0;p=thirdparty%2Flinux.git drm/amdgpu: completely rework eviction fence handling v2 Well that was broken on multiple levels. First of all a lot of checks were placed at incorrect locations, especially if the resume worker should run or not. Then a bunch of code was just mid-layering because of incorrect assignment who should do what. And finally comments explaining what happens instead of why. Just re-write it from scratch, that should at least fix some of the hangs we are seeing. Use RCU for the eviction fence pointer in the manager, the spinlock usage was mostly incorrect as well. Then finally remove all the nonsense checks and actually add them in the correct locations. v2: some typo fixes and cleanups suggested by Sunil Signed-off-by: Christian König Reviewed-by: Sunil Khatri Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 9c50f054f1c10..bc0c62c312ff6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2952,9 +2952,9 @@ static int amdgpu_drm_release(struct inode *inode, struct file *filp) int idx; if (fpriv && drm_dev_enter(dev, &idx)) { - fpriv->evf_mgr.fd_closing = true; - amdgpu_eviction_fence_destroy(&fpriv->evf_mgr); + amdgpu_evf_mgr_shutdown(&fpriv->evf_mgr); amdgpu_userq_mgr_fini(&fpriv->userq_mgr); + amdgpu_evf_mgr_fini(&fpriv->evf_mgr); drm_dev_exit(idx); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c index 3b588c7740ec8..78737dda3f6ba 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c @@ -25,9 +25,6 @@ #include #include "amdgpu.h" -#define work_to_evf_mgr(w, name) container_of(w, struct amdgpu_eviction_fence_mgr, name) -#define evf_mgr_to_fpriv(e) container_of(e, struct amdgpu_fpriv, evf_mgr) - static const char * amdgpu_eviction_fence_get_driver_name(struct dma_fence *fence) { @@ -43,127 +40,66 @@ amdgpu_eviction_fence_get_timeline_name(struct dma_fence *f) return ef->timeline_name; } -int -amdgpu_eviction_fence_replace_fence(struct amdgpu_eviction_fence_mgr *evf_mgr, - struct drm_exec *exec) +static bool amdgpu_eviction_fence_enable_signaling(struct dma_fence *f) { - struct amdgpu_eviction_fence *old_ef, *new_ef; - struct drm_gem_object *obj; - unsigned long index; - int ret; - - if (evf_mgr->ev_fence && - !dma_fence_is_signaled(&evf_mgr->ev_fence->base)) - return 0; - /* - * Steps to replace eviction fence: - * * lock all objects in exec (caller) - * * create a new eviction fence - * * update new eviction fence in evf_mgr - * * attach the new eviction fence to BOs - * * release the old fence - * * unlock the objects (caller) - */ - new_ef = amdgpu_eviction_fence_create(evf_mgr); - if (!new_ef) { - DRM_ERROR("Failed to create new eviction fence\n"); - return -ENOMEM; - } - - /* Update the eviction fence now */ - spin_lock(&evf_mgr->ev_fence_lock); - old_ef = evf_mgr->ev_fence; - evf_mgr->ev_fence = new_ef; - spin_unlock(&evf_mgr->ev_fence_lock); + struct amdgpu_eviction_fence *ev_fence = to_ev_fence(f); - /* Attach the new fence */ - drm_exec_for_each_locked_object(exec, index, obj) { - struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); - - if (!bo) - continue; - ret = amdgpu_eviction_fence_attach(evf_mgr, bo); - if (ret) { - DRM_ERROR("Failed to attch new eviction fence\n"); - goto free_err; - } - } - - /* Free old fence */ - if (old_ef) - dma_fence_put(&old_ef->base); - return 0; - -free_err: - kfree(new_ef); - return ret; + schedule_work(&ev_fence->evf_mgr->suspend_work); + return true; } +static const struct dma_fence_ops amdgpu_eviction_fence_ops = { + .get_driver_name = amdgpu_eviction_fence_get_driver_name, + .get_timeline_name = amdgpu_eviction_fence_get_timeline_name, + .enable_signaling = amdgpu_eviction_fence_enable_signaling, +}; + static void amdgpu_eviction_fence_suspend_worker(struct work_struct *work) { - struct amdgpu_eviction_fence_mgr *evf_mgr = work_to_evf_mgr(work, suspend_work.work); - struct amdgpu_fpriv *fpriv = evf_mgr_to_fpriv(evf_mgr); + struct amdgpu_eviction_fence_mgr *evf_mgr = + container_of(work, struct amdgpu_eviction_fence_mgr, + suspend_work); + struct amdgpu_fpriv *fpriv = + container_of(evf_mgr, struct amdgpu_fpriv, evf_mgr); struct amdgpu_userq_mgr *uq_mgr = &fpriv->userq_mgr; - struct amdgpu_eviction_fence *ev_fence; + struct dma_fence *ev_fence; mutex_lock(&uq_mgr->userq_mutex); - spin_lock(&evf_mgr->ev_fence_lock); - ev_fence = evf_mgr->ev_fence; - if (ev_fence) - dma_fence_get(&ev_fence->base); - else - goto unlock; - spin_unlock(&evf_mgr->ev_fence_lock); - - amdgpu_userq_evict(uq_mgr, ev_fence); + ev_fence = amdgpu_evf_mgr_get_fence(evf_mgr); + amdgpu_userq_evict(uq_mgr, !evf_mgr->shutdown); - mutex_unlock(&uq_mgr->userq_mutex); - dma_fence_put(&ev_fence->base); - return; - -unlock: - spin_unlock(&evf_mgr->ev_fence_lock); + /* + * Signaling the eviction fence must be done while holding the + * userq_mutex. Otherwise we won't resume the queues before issuing the + * next fence. + */ + dma_fence_signal(ev_fence); + dma_fence_put(ev_fence); mutex_unlock(&uq_mgr->userq_mutex); } -static bool amdgpu_eviction_fence_enable_signaling(struct dma_fence *f) +void amdgpu_evf_mgr_attach_fence(struct amdgpu_eviction_fence_mgr *evf_mgr, + struct amdgpu_bo *bo) { - struct amdgpu_eviction_fence_mgr *evf_mgr; - struct amdgpu_eviction_fence *ev_fence; - - if (!f) - return true; - - ev_fence = to_ev_fence(f); - evf_mgr = ev_fence->evf_mgr; - - schedule_delayed_work(&evf_mgr->suspend_work, 0); - return true; -} - -static const struct dma_fence_ops amdgpu_eviction_fence_ops = { - .get_driver_name = amdgpu_eviction_fence_get_driver_name, - .get_timeline_name = amdgpu_eviction_fence_get_timeline_name, - .enable_signaling = amdgpu_eviction_fence_enable_signaling, -}; + struct dma_fence *ev_fence = amdgpu_evf_mgr_get_fence(evf_mgr); + struct dma_resv *resv = bo->tbo.base.resv; -void amdgpu_eviction_fence_signal(struct amdgpu_eviction_fence_mgr *evf_mgr, - struct amdgpu_eviction_fence *ev_fence) -{ - spin_lock(&evf_mgr->ev_fence_lock); - dma_fence_signal(&ev_fence->base); - spin_unlock(&evf_mgr->ev_fence_lock); + dma_resv_add_fence(resv, ev_fence, DMA_RESV_USAGE_BOOKKEEP); + dma_fence_put(ev_fence); } -struct amdgpu_eviction_fence * -amdgpu_eviction_fence_create(struct amdgpu_eviction_fence_mgr *evf_mgr) +int amdgpu_evf_mgr_rearm(struct amdgpu_eviction_fence_mgr *evf_mgr, + struct drm_exec *exec) { struct amdgpu_eviction_fence *ev_fence; + struct drm_gem_object *obj; + unsigned long index; + /* Create and initialize a new eviction fence */ ev_fence = kzalloc_obj(*ev_fence); if (!ev_fence) - return NULL; + return -ENOMEM; ev_fence->evf_mgr = evf_mgr; get_task_comm(ev_fence->timeline_name, current); @@ -171,56 +107,22 @@ amdgpu_eviction_fence_create(struct amdgpu_eviction_fence_mgr *evf_mgr) dma_fence_init64(&ev_fence->base, &amdgpu_eviction_fence_ops, &ev_fence->lock, evf_mgr->ev_fence_ctx, atomic_inc_return(&evf_mgr->ev_fence_seq)); - return ev_fence; -} - -void amdgpu_eviction_fence_destroy(struct amdgpu_eviction_fence_mgr *evf_mgr) -{ - struct amdgpu_eviction_fence *ev_fence; - - /* Wait for any pending work to execute */ - flush_delayed_work(&evf_mgr->suspend_work); - - spin_lock(&evf_mgr->ev_fence_lock); - ev_fence = evf_mgr->ev_fence; - spin_unlock(&evf_mgr->ev_fence_lock); - - if (!ev_fence) - return; - dma_fence_wait(&ev_fence->base, false); + /* Remember it for newly added BOs */ + dma_fence_put(evf_mgr->ev_fence); + evf_mgr->ev_fence = &ev_fence->base; - /* Last unref of ev_fence */ - dma_fence_put(&ev_fence->base); -} - -int amdgpu_eviction_fence_attach(struct amdgpu_eviction_fence_mgr *evf_mgr, - struct amdgpu_bo *bo) -{ - struct amdgpu_eviction_fence *ev_fence; - struct dma_resv *resv = bo->tbo.base.resv; - int ret; - - if (!resv) - return 0; + /* And add it to all existing BOs */ + drm_exec_for_each_locked_object(exec, index, obj) { + struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); - ret = dma_resv_reserve_fences(resv, 1); - if (ret) { - DRM_DEBUG_DRIVER("Failed to resv fence space\n"); - return ret; + amdgpu_evf_mgr_attach_fence(evf_mgr, bo); } - - spin_lock(&evf_mgr->ev_fence_lock); - ev_fence = evf_mgr->ev_fence; - if (ev_fence) - dma_resv_add_fence(resv, &ev_fence->base, DMA_RESV_USAGE_BOOKKEEP); - spin_unlock(&evf_mgr->ev_fence_lock); - return 0; } -void amdgpu_eviction_fence_detach(struct amdgpu_eviction_fence_mgr *evf_mgr, - struct amdgpu_bo *bo) +void amdgpu_evf_mgr_detach_fence(struct amdgpu_eviction_fence_mgr *evf_mgr, + struct amdgpu_bo *bo) { struct dma_fence *stub = dma_fence_get_stub(); @@ -229,13 +131,25 @@ void amdgpu_eviction_fence_detach(struct amdgpu_eviction_fence_mgr *evf_mgr, dma_fence_put(stub); } -int amdgpu_eviction_fence_init(struct amdgpu_eviction_fence_mgr *evf_mgr) +void amdgpu_evf_mgr_init(struct amdgpu_eviction_fence_mgr *evf_mgr) { - /* This needs to be done one time per open */ atomic_set(&evf_mgr->ev_fence_seq, 0); evf_mgr->ev_fence_ctx = dma_fence_context_alloc(1); - spin_lock_init(&evf_mgr->ev_fence_lock); + evf_mgr->ev_fence = dma_fence_get_stub(); - INIT_DELAYED_WORK(&evf_mgr->suspend_work, amdgpu_eviction_fence_suspend_worker); - return 0; + INIT_WORK(&evf_mgr->suspend_work, amdgpu_eviction_fence_suspend_worker); +} + +void amdgpu_evf_mgr_shutdown(struct amdgpu_eviction_fence_mgr *evf_mgr) +{ + evf_mgr->shutdown = true; + flush_work(&evf_mgr->suspend_work); +} + +void amdgpu_evf_mgr_fini(struct amdgpu_eviction_fence_mgr *evf_mgr) +{ + dma_fence_wait(rcu_dereference_protected(evf_mgr->ev_fence, true), + false); + flush_work(&evf_mgr->suspend_work); + dma_fence_put(evf_mgr->ev_fence); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.h index fcd867b7147dd..527de3a235837 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.h @@ -25,6 +25,8 @@ #ifndef AMDGPU_EV_FENCE_H_ #define AMDGPU_EV_FENCE_H_ +#include + struct amdgpu_eviction_fence { struct dma_fence base; spinlock_t lock; @@ -35,35 +37,35 @@ struct amdgpu_eviction_fence { struct amdgpu_eviction_fence_mgr { u64 ev_fence_ctx; atomic_t ev_fence_seq; - spinlock_t ev_fence_lock; - struct amdgpu_eviction_fence *ev_fence; - struct delayed_work suspend_work; - uint8_t fd_closing; -}; - -/* Eviction fence helper functions */ -struct amdgpu_eviction_fence * -amdgpu_eviction_fence_create(struct amdgpu_eviction_fence_mgr *evf_mgr); -void -amdgpu_eviction_fence_destroy(struct amdgpu_eviction_fence_mgr *evf_mgr); - -int -amdgpu_eviction_fence_attach(struct amdgpu_eviction_fence_mgr *evf_mgr, - struct amdgpu_bo *bo); + /* + * Only updated while holding the VM resv lock. + * Only signaled while holding the userq mutex. + */ + struct dma_fence __rcu *ev_fence; + struct work_struct suspend_work; + bool shutdown; +}; -void -amdgpu_eviction_fence_detach(struct amdgpu_eviction_fence_mgr *evf_mgr, - struct amdgpu_bo *bo); +static inline struct dma_fence * +amdgpu_evf_mgr_get_fence(struct amdgpu_eviction_fence_mgr *evf_mgr) +{ + struct dma_fence *ev_fence; -int -amdgpu_eviction_fence_init(struct amdgpu_eviction_fence_mgr *evf_mgr); + rcu_read_lock(); + ev_fence = dma_fence_get_rcu_safe(&evf_mgr->ev_fence); + rcu_read_unlock(); + return ev_fence; +} -void -amdgpu_eviction_fence_signal(struct amdgpu_eviction_fence_mgr *evf_mgr, - struct amdgpu_eviction_fence *ev_fence); +void amdgpu_evf_mgr_attach_fence(struct amdgpu_eviction_fence_mgr *evf_mgr, + struct amdgpu_bo *bo); +int amdgpu_evf_mgr_rearm(struct amdgpu_eviction_fence_mgr *evf_mgr, + struct drm_exec *exec); +void amdgpu_evf_mgr_detach_fence(struct amdgpu_eviction_fence_mgr *evf_mgr, + struct amdgpu_bo *bo); +void amdgpu_evf_mgr_init(struct amdgpu_eviction_fence_mgr *evf_mgr); +void amdgpu_evf_mgr_shutdown(struct amdgpu_eviction_fence_mgr *evf_mgr); +void amdgpu_evf_mgr_fini(struct amdgpu_eviction_fence_mgr *evf_mgr); -int -amdgpu_eviction_fence_replace_fence(struct amdgpu_eviction_fence_mgr *evf_mgr, - struct drm_exec *exec); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index a6107109a2b86..ae5c4b355359d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -263,13 +263,7 @@ static int amdgpu_gem_object_open(struct drm_gem_object *obj, else ++bo_va->ref_count; - /* attach gfx eviction fence */ - r = amdgpu_eviction_fence_attach(&fpriv->evf_mgr, abo); - if (r) { - DRM_DEBUG_DRIVER("Failed to attach eviction fence to BO\n"); - amdgpu_bo_unreserve(abo); - return r; - } + amdgpu_evf_mgr_attach_fence(&fpriv->evf_mgr, abo); drm_exec_fini(&exec); /* Validate and add eviction fence to DMABuf imports with dynamic @@ -337,7 +331,7 @@ static void amdgpu_gem_object_close(struct drm_gem_object *obj, } if (!amdgpu_vm_is_bo_always_valid(vm, bo)) - amdgpu_eviction_fence_detach(&fpriv->evf_mgr, bo); + amdgpu_evf_mgr_detach_fence(&fpriv->evf_mgr, bo); bo_va = amdgpu_vm_bo_find(vm, bo); if (!bo_va || --bo_va->ref_count) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 7f19554b9ad11..06efce38f3238 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1522,10 +1522,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) "Failed to init usermode queue manager (%d), use legacy workload submission only\n", r); - r = amdgpu_eviction_fence_init(&fpriv->evf_mgr); - if (r) - goto error_vm; - + amdgpu_evf_mgr_init(&fpriv->evf_mgr); amdgpu_ctx_mgr_init(&fpriv->ctx_mgr, adev); file_priv->driver_priv = fpriv; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 3cc6e8da48a26..8f8e7c9253171 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -472,17 +472,16 @@ void amdgpu_userq_ensure_ev_fence(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_eviction_fence_mgr *evf_mgr) { - struct amdgpu_eviction_fence *ev_fence; + struct dma_fence *ev_fence; retry: /* Flush any pending resume work to create ev_fence */ flush_delayed_work(&uq_mgr->resume_work); mutex_lock(&uq_mgr->userq_mutex); - spin_lock(&evf_mgr->ev_fence_lock); - ev_fence = evf_mgr->ev_fence; - spin_unlock(&evf_mgr->ev_fence_lock); - if (!ev_fence || dma_fence_is_signaled(&ev_fence->base)) { + ev_fence = amdgpu_evf_mgr_get_fence(evf_mgr); + if (dma_fence_is_signaled(ev_fence)) { + dma_fence_put(ev_fence); mutex_unlock(&uq_mgr->userq_mutex); /* * Looks like there was no pending resume work, @@ -491,6 +490,7 @@ retry: schedule_delayed_work(&uq_mgr->resume_work, 0); goto retry; } + dma_fence_put(ev_fence); } int amdgpu_userq_create_object(struct amdgpu_userq_mgr *uq_mgr, @@ -1197,7 +1197,7 @@ retry_lock: dma_fence_wait(bo_va->last_pt_update, false); dma_fence_wait(vm->last_update, false); - ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec); + ret = amdgpu_evf_mgr_rearm(&fpriv->evf_mgr, &exec); if (ret) drm_file_err(uq_mgr->file, "Failed to replace eviction fence\n"); @@ -1217,11 +1217,13 @@ static void amdgpu_userq_restore_worker(struct work_struct *work) { struct amdgpu_userq_mgr *uq_mgr = work_to_uq_mgr(work, resume_work.work); struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); + struct dma_fence *ev_fence; int ret; - flush_delayed_work(&fpriv->evf_mgr.suspend_work); - mutex_lock(&uq_mgr->userq_mutex); + ev_fence = amdgpu_evf_mgr_get_fence(&fpriv->evf_mgr); + if (!dma_fence_is_signaled(ev_fence)) + goto unlock; ret = amdgpu_userq_vm_validate(uq_mgr); if (ret) { @@ -1237,6 +1239,7 @@ static void amdgpu_userq_restore_worker(struct work_struct *work) unlock: mutex_unlock(&uq_mgr->userq_mutex); + dma_fence_put(ev_fence); } static int @@ -1312,11 +1315,8 @@ amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr) } void -amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr, - struct amdgpu_eviction_fence *ev_fence) +amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr, bool schedule_resume) { - struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); - struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr; struct amdgpu_device *adev = uq_mgr->adev; int ret; @@ -1329,10 +1329,7 @@ amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr, if (ret) dev_err(adev->dev, "Failed to evict userqueue\n"); - /* Signal current eviction fence */ - amdgpu_eviction_fence_signal(evf_mgr, ev_fence); - - if (!evf_mgr->fd_closing) + if (schedule_resume) schedule_delayed_work(&uq_mgr->resume_work, 0); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h index 54e1997b3cc03..82306d4890646 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h @@ -133,7 +133,7 @@ void amdgpu_userq_destroy_object(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_userq_obj *userq_obj); void amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr, - struct amdgpu_eviction_fence *ev_fence); + bool schedule_resume); void amdgpu_userq_ensure_ev_fence(struct amdgpu_userq_mgr *userq_mgr, struct amdgpu_eviction_fence_mgr *evf_mgr);