drm/amdgpu/vcn: add a helper framework for engine resets

author Alex Deucher <alexander.deucher@amd.com>

Mon, 16 Jun 2025 20:01:25 +0000 (16:01 -0400)

committer Alex Deucher <alexander.deucher@amd.com>

Wed, 16 Jul 2025 20:16:58 +0000 (16:16 -0400)
author Alex Deucher <alexander.deucher@amd.com>
Mon, 16 Jun 2025 20:01:25 +0000 (16:01 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 16 Jul 2025 20:16:58 +0000 (16:16 -0400)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c

index c8885c3d54b332fc777fc3d16d40c76eee1cfeaf..d799bc74936c07ac09db15f474627e8669cfcccc 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -134,6 +134,7 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev, int i)
  
         mutex_init(&adev->vcn.inst[i].vcn1_jpeg1_workaround);
         mutex_init(&adev->vcn.inst[i].vcn_pg_lock);
+       mutex_init(&adev->vcn.inst[i].engine_reset_mutex);
         atomic_set(&adev->vcn.inst[i].total_submission_cnt, 0);
         INIT_DELAYED_WORK(&adev->vcn.inst[i].idle_work, amdgpu_vcn_idle_work_handler);
         atomic_set(&adev->vcn.inst[i].dpg_enc_submission_cnt, 0);
@@ -1451,3 +1452,81 @@ int vcn_set_powergating_state(struct amdgpu_ip_block *ip_block,
  
         return ret;
  }
+
+/**
+ * amdgpu_vcn_reset_engine - Reset a specific VCN engine
+ * @adev: Pointer to the AMDGPU device
+ * @instance_id: Index into adev->vcn.inst[] of the VCN instance to reset
+ *
+ * Returns: 0 on success, or the nonzero result of the failed reset or ring test.
+ */
+static int amdgpu_vcn_reset_engine(struct amdgpu_device *adev,
+                                  uint32_t instance_id)
+{
+       struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[instance_id];
+       int r, i;
+
+       mutex_lock(&vinst->engine_reset_mutex);
+       /* Stop the scheduler work queues of the dec ring and every enc ring so
+        * that no new tasks are submitted to this instance while the reset is
+        * in progress.  engine_reset_mutex is held for the whole sequence.
+        */
+       drm_sched_wqueue_stop(&vinst->ring_dec.sched);
+       for (i = 0; i < vinst->num_enc_rings; i++)
+               drm_sched_wqueue_stop(&vinst->ring_enc[i].sched);
+
+       /* Invoke the per-instance reset callback, then test every ring */
+       r = vinst->reset(vinst);
+       if (r)
+               goto unlock;
+       r = amdgpu_ring_test_ring(&vinst->ring_dec);
+       if (r)
+               goto unlock;
+       for (i = 0; i < vinst->num_enc_rings; i++) {
+               r = amdgpu_ring_test_ring(&vinst->ring_enc[i]);
+               if (r)
+                       goto unlock;
+       }
+       amdgpu_fence_driver_force_completion(&vinst->ring_dec);
+       for (i = 0; i < vinst->num_enc_rings; i++)
+               amdgpu_fence_driver_force_completion(&vinst->ring_enc[i]);
+
+       /* Reset succeeded: restart the dec and enc scheduler work queues
+        * stopped above so that new tasks can be submitted again.  The
+        * error paths jump past this and leave the work queues stopped.
+        */
+       drm_sched_wqueue_start(&vinst->ring_dec.sched);
+       for (i = 0; i < vinst->num_enc_rings; i++)
+               drm_sched_wqueue_start(&vinst->ring_enc[i].sched);
+
+unlock:
+       mutex_unlock(&vinst->engine_reset_mutex);
+
+       return r;
+}
+
+/**
+ * amdgpu_vcn_ring_reset - Reset a VCN ring
+ * @ring: ring to reset; ring->me selects the VCN instance
+ * @vmid: vmid of guilty job (not used by this helper)
+ * @timedout_fence: fence of timed out job (not used by this helper)
+ *
+ * This helper is for VCN blocks without unified queues because
+ * resetting the engine resets all queues in that case.  With
+ * unified queues we have one queue per engine.
+ * Returns: 0 on success, -EOPNOTSUPP/-EINVAL if unsupported, or the reset error.
+ */
+int amdgpu_vcn_ring_reset(struct amdgpu_ring *ring,
+                         unsigned int vmid,
+                         struct amdgpu_fence *timedout_fence)
+{
+       struct amdgpu_device *adev = ring->adev;
+
+       if (!(adev->vcn.supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE))
+               return -EOPNOTSUPP;
+
+       if (adev->vcn.inst[ring->me].using_unified_queue)
+               return -EINVAL;
+
+       return amdgpu_vcn_reset_engine(adev, ring->me);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h

index 83adf81defc7114ce3301ab9e695862c6c4c5d16..0bc0a94d7cf0fb0a3a3b2584e07b2c1f4326bbd1 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -330,7 +330,9 @@ struct amdgpu_vcn_inst {
                               struct dpg_pause_state *new_state);
         int (*set_pg_state)(struct amdgpu_vcn_inst *vinst,
                             enum amd_powergating_state state);
+       int (*reset)(struct amdgpu_vcn_inst *vinst);
         bool using_unified_queue;
+       struct mutex            engine_reset_mutex;
  };
  
  struct amdgpu_vcn_ras {
@@ -552,5 +554,7 @@ void amdgpu_debugfs_vcn_sched_mask_init(struct amdgpu_device *adev);
  
  int vcn_set_powergating_state(struct amdgpu_ip_block *ip_block,
                               enum amd_powergating_state state);
-
+int amdgpu_vcn_ring_reset(struct amdgpu_ring *ring,
+                         unsigned int vmid,
+                         struct amdgpu_fence *timedout_fence);
  #endif
author	Alex Deucher <alexander.deucher@amd.com>
	Mon, 16 Jun 2025 20:01:25 +0000 (16:01 -0400)
committer	Alex Deucher <alexander.deucher@amd.com>
	Wed, 16 Jul 2025 20:16:58 +0000 (16:16 -0400)
drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c		patch \| blob \| blame \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h		patch \| blob \| blame \| history