drm/amdgpu: resume gfx userqueues
author    Shashank Sharma <shashank.sharma@amd.com>
          Wed, 20 Nov 2024 18:02:26 +0000 (19:02 +0100)
committer Alex Deucher <alexander.deucher@amd.com>
          Tue, 8 Apr 2025 20:48:18 +0000 (16:48 -0400)
This patch adds support for userqueue resume. At a high level it does the
following (a minimal sketch of the flow follows the list below):
- adds a new delayed work for resuming all the queues.
- schedules this delayed work from the suspend work.
- validates the BOs and replaces the eviction fence before resuming all
  the queues running under this instance of userq manager.
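For illustration, the suspend-to-resume hand-off roughly follows the pattern
below. This is a minimal sketch, not the driver code: it assumes the
resume_work, userq_mutex and fd_closing fields added by this patch, reduces
the worker body to comments, and the helper name
amdgpu_userqueue_suspend_tail is made up for the example.

    #include <linux/workqueue.h>
    #include "amdgpu.h"             /* driver headers providing            */
    #include "amdgpu_userqueue.h"   /* amdgpu_userq_mgr, uq_mgr_to_fpriv() */

    /*
     * Minimal sketch of the resume worker, assuming the resume_work,
     * userq_mutex and fd_closing fields introduced by this patch.
     */
    static void amdgpu_userqueue_resume_worker(struct work_struct *work)
    {
            struct amdgpu_userq_mgr *uq_mgr =
                    container_of(work, struct amdgpu_userq_mgr, resume_work.work);

            mutex_lock(&uq_mgr->userq_mutex);
            /* 1) re-validate the process BOs and attach a new eviction fence */
            /* 2) call the IP ->resume() hook for every queue in userq_idr */
            mutex_unlock(&uq_mgr->userq_mutex);
    }

    /* hypothetical helper name; stands in for the tail of the suspend work */
    static void amdgpu_userqueue_suspend_tail(struct amdgpu_userq_mgr *uq_mgr)
    {
            struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);

            /* queues are already unmapped, old eviction fence is signalled */
            if (fpriv->evf_mgr.fd_closing)
                    return; /* the fd is being closed, nothing to resume */

            /* kick the resume worker immediately (no delay) */
            schedule_delayed_work(&uq_mgr->resume_work, 0);
    }

The zero-delay schedule matches the V2 feedback below; the real worker
validates the BOs first and only then resumes the queues, as the diff shows.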

V2: Addressed Christian's review comments:
    - declare local variables like ret at the bottom.
    - lock all the objects first, then start attaching the new fence.
    - don't replace the old eviction fence, just attach the new eviction
      fence (see the sketch after this list).
    - no error logs for drm_exec lock failures
    - no need to reserve BOs after the drm_exec lock
    - schedule the resume worker immediately (not after 100 ms)
    - check for NULL BO (Arvind)
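For the fence-handling items above, the lock-everything-first ordering could
look roughly like the sketch below. amdgpu_eviction_fence_replace_fence() is
called but not implemented in this diff, so this is only an assumed shape
built from the generic drm_exec/dma_resv helpers; the function name
attach_new_eviction_fence and both parameters are illustrative.

    #include <drm/drm_exec.h>
    #include <drm/drm_gem.h>
    #include <linux/dma-resv.h>

    /*
     * Hypothetical sketch: every BO is already locked through the drm_exec
     * context, the old eviction fence is left in place (it only gets
     * signalled), and the new fence is attached alongside it.
     */
    static int attach_new_eviction_fence(struct drm_exec *exec,
                                         struct dma_fence *new_fence)
    {
            struct drm_gem_object *obj;
            unsigned long index;
            int ret;

            drm_exec_for_each_locked_object(exec, index, obj) {
                    /* make room for one more fence slot before adding */
                    ret = dma_resv_reserve_fences(obj->resv, 1);
                    if (ret)
                            return ret;
                    dma_resv_add_fence(obj->resv, new_fence,
                                       DMA_RESV_USAGE_BOOKKEEP);
            }
            return 0;
    }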

V5: Rebased w.r.t. the changes in the suspend patch
    - moved amdgpu_userqueue_validate_vm_bo into this patch
    - initialized ret in resume_all

V6: Rebase
V7: Addressed review comments from Christian
    - Do not use list_for_each_safe() with vm->invalidated; it's not the
      correct way
V8: Fixed the race condition between suspend/close/fence

Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Christian Koenig <christian.koenig@amd.com>
Acked-by: Christian Koenig <christian.koenig@amd.com>
Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
Signed-off-by: Arvind Yadav <arvind.yadav@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.h
drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
drivers/gpu/drm/amd/include/amdgpu_userqueue.h

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.h
index 12f168ec3ef003cfd89c8f902a8b5d370950de20..787182bd1069dd7f76fdeb8e485efa1569cd856a 100644
@@ -38,6 +38,7 @@ struct amdgpu_eviction_fence_mgr {
        spinlock_t              ev_fence_lock;
        struct amdgpu_eviction_fence *ev_fence;
        struct delayed_work     suspend_work;
+       uint8_t fd_closing;
 };
 
 /* Eviction fence helper functions */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
index 43fff8869ac9ab143ef89dca5ce11cd5aef7fc22..57e502d014a12532fd7310e935e2588e20ebadd6 100644
@@ -22,6 +22,7 @@
  *
  */
 
+#include <drm/drm_exec.h>
 #include "amdgpu.h"
 #include "amdgpu_vm.h"
 #include "amdgpu_userqueue.h"
@@ -216,6 +217,7 @@ amdgpu_userqueue_destroy(struct drm_file *filp, int queue_id)
        struct amdgpu_userq_mgr *uq_mgr = &fpriv->userq_mgr;
        struct amdgpu_usermode_queue *queue;
 
+       cancel_delayed_work(&uq_mgr->resume_work);
        mutex_lock(&uq_mgr->userq_mutex);
 
        queue = amdgpu_userqueue_find(uq_mgr, queue_id);
@@ -228,7 +230,6 @@ amdgpu_userqueue_destroy(struct drm_file *filp, int queue_id)
        amdgpu_bo_unpin(queue->db_obj.obj);
        amdgpu_bo_unref(&queue->db_obj.obj);
        amdgpu_userqueue_cleanup(uq_mgr, queue, queue_id);
-       uq_mgr->num_userqs--;
        mutex_unlock(&uq_mgr->userq_mutex);
        return 0;
 }
@@ -304,10 +305,18 @@ amdgpu_userqueue_create(struct drm_file *filp, union drm_amdgpu_userq *args)
                goto unlock;
        }
        args->out.queue_id = qid;
-       uq_mgr->num_userqs++;
 
 unlock:
        mutex_unlock(&uq_mgr->userq_mutex);
+       if (!r) {
+               /*
+                * There could be a situation where we are creating a new queue
+                * while the other queues under this uq_mgr are suspended. So if
+                * there is any pending resume work, wait for it to complete.
+                */
+               flush_delayed_work(&uq_mgr->resume_work);
+       }
+
        return r;
 }
 
@@ -345,6 +354,161 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data,
 }
 #endif
 
+static int
+amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr)
+{
+       struct amdgpu_device *adev = uq_mgr->adev;
+       const struct amdgpu_userq_funcs *userq_funcs;
+       struct amdgpu_usermode_queue *queue;
+       int queue_id;
+       int ret = 0;
+
+       userq_funcs = adev->userq_funcs[AMDGPU_HW_IP_GFX];
+
+       /* Resume all the queues for this process */
+       idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id)
+               ret = userq_funcs->resume(uq_mgr, queue);
+
+       if (ret)
+               DRM_ERROR("Failed to resume all the queues\n");
+       return ret;
+}
+
+static int
+amdgpu_userqueue_validate_vm_bo(void *_unused, struct amdgpu_bo *bo)
+{
+       struct ttm_operation_ctx ctx = { false, false };
+       int ret;
+
+       amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
+
+       ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+       if (ret)
+               DRM_ERROR("Failed to validate\n");
+
+       return ret;
+}
+
+static int
+amdgpu_userqueue_validate_bos(struct amdgpu_userq_mgr *uq_mgr)
+{
+       struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
+       struct amdgpu_vm *vm = &fpriv->vm;
+       struct amdgpu_device *adev = uq_mgr->adev;
+       struct amdgpu_bo_va *bo_va;
+       struct ww_acquire_ctx *ticket;
+       struct drm_exec exec;
+       struct amdgpu_bo *bo;
+       struct dma_resv *resv;
+       bool clear, unlock;
+       int ret = 0;
+
+       drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES | DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
+       drm_exec_until_all_locked(&exec) {
+               ret = amdgpu_vm_lock_pd(vm, &exec, 2);
+               drm_exec_retry_on_contention(&exec);
+               if (unlikely(ret)) {
+                       DRM_ERROR("Failed to lock PD\n");
+                       goto unlock_all;
+               }
+
+               /* Lock the done list */
+               list_for_each_entry(bo_va, &vm->done, base.vm_status) {
+                       bo = bo_va->base.bo;
+                       if (!bo)
+                               continue;
+
+                       ret = drm_exec_lock_obj(&exec, &bo->tbo.base);
+                       drm_exec_retry_on_contention(&exec);
+                       if (unlikely(ret))
+                               goto unlock_all;
+               }
+       }
+
+       spin_lock(&vm->status_lock);
+       while (!list_empty(&vm->moved)) {
+               bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va,
+                                        base.vm_status);
+               spin_unlock(&vm->status_lock);
+
+               /* Per VM BOs never need to be cleared in the page tables */
+               ret = amdgpu_vm_bo_update(adev, bo_va, false);
+               if (ret)
+                       goto unlock_all;
+               spin_lock(&vm->status_lock);
+       }
+
+       ticket = &exec.ticket;
+       while (!list_empty(&vm->invalidated)) {
+               bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va,
+                                        base.vm_status);
+               resv = bo_va->base.bo->tbo.base.resv;
+               spin_unlock(&vm->status_lock);
+
+               bo = bo_va->base.bo;
+               ret = amdgpu_userqueue_validate_vm_bo(NULL, bo);
+               if (ret) {
+                       DRM_ERROR("Failed to validate BO\n");
+                       goto unlock_all;
+               }
+
+               /* Try to reserve the BO to avoid clearing its ptes */
+               if (!adev->debug_vm && dma_resv_trylock(resv)) {
+                       clear = false;
+                       unlock = true;
+               /* The caller is already holding the reservation lock */
+               } else if (ticket && dma_resv_locking_ctx(resv) == ticket) {
+                       clear = false;
+                       unlock = false;
+               /* Somebody else is using the BO right now */
+               } else {
+                       clear = true;
+                       unlock = false;
+               }
+
+               ret = amdgpu_vm_bo_update(adev, bo_va, clear);
+
+               if (unlock)
+                       dma_resv_unlock(resv);
+               if (ret)
+                       goto unlock_all;
+
+               spin_lock(&vm->status_lock);
+       }
+       spin_unlock(&vm->status_lock);
+
+       ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec);
+       if (ret)
+               DRM_ERROR("Failed to replace eviction fence\n");
+
+unlock_all:
+       drm_exec_fini(&exec);
+       return ret;
+}
+
+static void amdgpu_userqueue_resume_worker(struct work_struct *work)
+{
+       struct amdgpu_userq_mgr *uq_mgr = work_to_uq_mgr(work, resume_work.work);
+       int ret;
+
+       mutex_lock(&uq_mgr->userq_mutex);
+
+       ret = amdgpu_userqueue_validate_bos(uq_mgr);
+       if (ret) {
+               DRM_ERROR("Failed to validate BOs to restore\n");
+               goto unlock;
+       }
+
+       ret = amdgpu_userqueue_resume_all(uq_mgr);
+       if (ret) {
+               DRM_ERROR("Failed to resume all queues\n");
+               goto unlock;
+       }
+
+unlock:
+       mutex_unlock(&uq_mgr->userq_mutex);
+}
+
 static int
 amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr)
 {
@@ -393,6 +557,7 @@ amdgpu_userqueue_suspend(struct amdgpu_userq_mgr *uq_mgr)
        struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
        struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
 
+
        mutex_lock(&uq_mgr->userq_mutex);
 
        /* Wait for any pending userqueue fence to signal */
@@ -411,8 +576,14 @@ amdgpu_userqueue_suspend(struct amdgpu_userq_mgr *uq_mgr)
        /* Signal current eviction fence */
        amdgpu_eviction_fence_signal(evf_mgr);
 
-       /* Cleanup old eviction fence entry */
-       amdgpu_eviction_fence_destroy(evf_mgr);
+       if (evf_mgr->fd_closing) {
+               mutex_unlock(&uq_mgr->userq_mutex);
+               cancel_delayed_work(&uq_mgr->resume_work);
+               return;
+       }
+
+       /* Schedule a resume work */
+       schedule_delayed_work(&uq_mgr->resume_work, 0);
 
 unlock:
        mutex_unlock(&uq_mgr->userq_mutex);
@@ -425,7 +596,6 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct amdgpu_devi
        mutex_init(&userq_mgr->userq_mutex);
        idr_init_base(&userq_mgr->userq_idr, 1);
        userq_mgr->adev = adev;
-       userq_mgr->num_userqs = 0;
 
        fpriv = uq_mgr_to_fpriv(userq_mgr);
        if (!fpriv->evf_mgr.ev_fence) {
@@ -433,6 +603,7 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct amdgpu_devi
                return -EINVAL;
        }
 
+       INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userqueue_resume_worker);
        return 0;
 }
 
@@ -441,6 +612,8 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
        uint32_t queue_id;
        struct amdgpu_usermode_queue *queue;
 
+       cancel_delayed_work(&userq_mgr->resume_work);
+
        idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id)
                amdgpu_userqueue_cleanup(userq_mgr, queue, queue_id);
 
diff --git a/drivers/gpu/drm/amd/include/amdgpu_userqueue.h b/drivers/gpu/drm/amd/include/amdgpu_userqueue.h
index 7781ee59653b96667996ce6df5f87fee86850309..2bf28f3454cb4a6f3be0f34c41297403f88a3874 100644
@@ -29,6 +29,7 @@
 
 #define to_ev_fence(f) container_of(f, struct amdgpu_eviction_fence, base)
 #define uq_mgr_to_fpriv(u) container_of(u, struct amdgpu_fpriv, userq_mgr)
+#define work_to_uq_mgr(w, name) container_of(w, struct amdgpu_userq_mgr, name)
 
 struct amdgpu_mqd_prop;
 
@@ -73,7 +74,7 @@ struct amdgpu_userq_mgr {
        struct idr                      userq_idr;
        struct mutex                    userq_mutex;
        struct amdgpu_device            *adev;
-       int num_userqs;
+       struct delayed_work             resume_work;
 };
 
 int amdgpu_userq_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);