drm/amdkfd: allow compute partition mode switch with cgroup exclusions
Author:     Jonathan Kim <jonathan.kim@amd.com>
AuthorDate: Wed, 14 May 2025 21:00:46 +0000 (17:00 -0400)
Commit:     Alex Deucher <alexander.deucher@amd.com>
CommitDate: Wed, 18 Jun 2025 16:19:17 +0000 (12:19 -0400)
The KFD currently bars a compute partition mode switch while a KFD
process exists.

Since cgroup-excluded devices remain excluded for the lifetime of a KFD
process, and user space is able to mode switch individual devices, allow
users to mode switch a device even while processes are running, provided
every running process has been cgroup-excluded from that device.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Harish Kasiviswanathan <harish.kasiviswanathan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
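
For context, below is a minimal sketch of how the amdgpu side is expected to
use these entry points around a compute partition mode switch. The caller and
helper names (example_switch_compute_partition, do_partition_mode_switch) are
hypothetical placeholders and not part of this patch; only
amdgpu_amdkfd_check_and_lock_kfd() and amdgpu_amdkfd_unlock_kfd() come from
the change below.

    #include "amdgpu.h"
    #include "amdgpu_amdkfd.h"

    /*
     * Hypothetical caller sketch: take the per-device KFD lock, perform the
     * partition mode switch, then release the lock.  With this patch the
     * lock is refused (-EBUSY) only if a running KFD process actually has
     * access to this device, i.e. it was not cgroup-excluded from it.
     */
    static int example_switch_compute_partition(struct amdgpu_device *adev,
                                                 int new_mode)
    {
            int r;

            r = amdgpu_amdkfd_check_and_lock_kfd(adev);
            if (r)
                    return r;

            r = do_partition_mode_switch(adev, new_mode); /* hypothetical helper */

            amdgpu_amdkfd_unlock_kfd(adev);
            return r;
    }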
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
drivers/gpu/drm/amd/amdkfd/kfd_device.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/amdkfd/kfd_process.c

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index d8ac4b1051a81cc4d9a38bbc959f896f6deaea3f..652389d0d4e665f663d097ec675d52e2a2a42dbd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -749,12 +749,12 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
 
 int amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev)
 {
-       return kgd2kfd_check_and_lock_kfd();
+       return kgd2kfd_check_and_lock_kfd(adev->kfd.dev);
 }
 
 void amdgpu_amdkfd_unlock_kfd(struct amdgpu_device *adev)
 {
-       kgd2kfd_unlock_kfd();
+       kgd2kfd_unlock_kfd(adev->kfd.dev);
 }
 
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index b6ca41859b53676a37ca8aa07a61e4c8c9279eae..3d5812269ea0d28b985850679e22c50d0ede9816 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -419,8 +419,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd);
 void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
 void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
 void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask);
-int kgd2kfd_check_and_lock_kfd(void);
-void kgd2kfd_unlock_kfd(void);
+int kgd2kfd_check_and_lock_kfd(struct kfd_dev *kfd);
+void kgd2kfd_unlock_kfd(struct kfd_dev *kfd);
 int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id);
 int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id);
 bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id);
@@ -489,12 +489,12 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask)
 {
 }
 
-static inline int kgd2kfd_check_and_lock_kfd(void)
+static inline int kgd2kfd_check_and_lock_kfd(struct kfd_dev *kfd)
 {
        return 0;
 }
 
-static inline void kgd2kfd_unlock_kfd(void)
+static inline void kgd2kfd_unlock_kfd(struct kfd_dev *kfd)
 {
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index bf0854bd55551bd01ba23dea28fca21e319c6ac6..a12e1433943d11cc9489d07cf4a5235e670d4372 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -1013,10 +1013,30 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
        return 0;
 }
 
-bool kfd_is_locked(void)
+bool kfd_is_locked(struct kfd_dev *kfd)
 {
+       uint8_t id  = 0;
+       struct kfd_node *dev;
+
        lockdep_assert_held(&kfd_processes_mutex);
-       return  (kfd_locked > 0);
+
+       /* check reset/suspend lock */
+       if (kfd_locked > 0)
+               return true;
+
+       if (kfd)
+               return kfd->kfd_dev_lock > 0;
+
+       /* check lock on all cgroup accessible devices */
+       while (kfd_topology_enum_kfd_devices(id++, &dev) == 0) {
+               if (!dev || kfd_devcgroup_check_permission(dev))
+                       continue;
+
+               if (dev->kfd->kfd_dev_lock > 0)
+                       return true;
+       }
+
+       return false;
 }
 
 void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
@@ -1442,24 +1462,53 @@ unsigned int kfd_get_num_xgmi_sdma_engines(struct kfd_node *node)
                kfd_get_num_sdma_engines(node);
 }
 
-int kgd2kfd_check_and_lock_kfd(void)
+int kgd2kfd_check_and_lock_kfd(struct kfd_dev *kfd)
 {
+       struct kfd_process *p;
+       int r = 0, temp, idx;
+
        mutex_lock(&kfd_processes_mutex);
-       if (!hash_empty(kfd_processes_table) || kfd_is_locked()) {
-               mutex_unlock(&kfd_processes_mutex);
-               return -EBUSY;
+
+       if (hash_empty(kfd_processes_table) && !kfd_is_locked(kfd))
+               goto out;
+
+       /* fail under system reset/resume or kfd device is partition switching. */
+       if (kfd_is_locked(kfd)) {
+               r = -EBUSY;
+               goto out;
+       }
+
+       /*
+        * ensure all running processes are cgroup excluded from device before mode switch.
+        * i.e. no pdd was created on the process socket.
+        */
+       idx = srcu_read_lock(&kfd_processes_srcu);
+       hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+               int i;
+
+               for (i = 0; i < p->n_pdds; i++) {
+                       if (p->pdds[i]->dev->kfd != kfd)
+                               continue;
+
+                       r = -EBUSY;
+                       goto proc_check_unlock;
+               }
        }
 
-       ++kfd_locked;
+proc_check_unlock:
+       srcu_read_unlock(&kfd_processes_srcu, idx);
+out:
+       if (!r)
+               ++kfd->kfd_dev_lock;
        mutex_unlock(&kfd_processes_mutex);
 
-       return 0;
+       return r;
 }
 
-void kgd2kfd_unlock_kfd(void)
+void kgd2kfd_unlock_kfd(struct kfd_dev *kfd)
 {
        mutex_lock(&kfd_processes_mutex);
-       --kfd_locked;
+       --kfd->kfd_dev_lock;
        mutex_unlock(&kfd_processes_mutex);
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index d221c58dccc3ccaa650ab535a5f97192fce5fef1..67694bcd9464653fdf4ce8d0d6f070766d25b048 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -372,6 +372,9 @@ struct kfd_dev {
 
        /* bitmap for dynamic doorbell allocation from doorbell object */
        unsigned long *doorbell_bitmap;
+
+       /* for dynamic partitioning */
+       int kfd_dev_lock;
 };
 
 enum kfd_mempool {
@@ -1536,7 +1539,7 @@ static inline bool kfd_flush_tlb_after_unmap(struct kfd_dev *dev)
 int kfd_send_exception_to_runtime(struct kfd_process *p,
                                unsigned int queue_id,
                                uint64_t error_reason);
-bool kfd_is_locked(void);
+bool kfd_is_locked(struct kfd_dev *kfd);
 
 /* Compute profile */
 void kfd_inc_compute_active(struct kfd_node *dev);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 722ac1662bdc19be73382fc822b5b9bc62e3f5f0..5be28c6c4f6aa592b4b188c6e477651fff2c4d09 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -854,7 +854,7 @@ struct kfd_process *kfd_create_process(struct task_struct *thread)
         */
        mutex_lock(&kfd_processes_mutex);
 
-       if (kfd_is_locked()) {
+       if (kfd_is_locked(NULL)) {
                pr_debug("KFD is locked! Cannot create process");
                process = ERR_PTR(-EINVAL);
                goto out;