git.ipfire.org Git - thirdparty/linux.git/commitdiff
amd/amdkfd: Fix profiler lock init order
author: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Fri, 29 May 2026 09:23:22 +0000 (10:23 +0100)
committer: Alex Deucher <alexander.deucher@amd.com>
Wed, 3 Jun 2026 17:59:33 +0000 (13:59 -0400)
A call chain exists at driver probe time in which the profiler lock is used
before it is initialized:

[   12.131440] kfd kfd: Allocated 3969056 bytes on gart
[   12.131561] kfd kfd: Total number of KFD nodes to be created: 1
[   12.132691] ------------[ cut here ]------------
[   12.132703] DEBUG_LOCKS_WARN_ON(lock->magic != lock)
[   12.132705] WARNING: kernel/locking/mutex.c:625 at __mutex_lock+0x616/0x1150, CPU#0: (udev-worker)/569
...
[   12.133051] Call Trace:
[   12.133055]  <TASK>
[   12.133059]  ? mark_held_locks+0x40/0x70
[   12.133068]  ? init_mqd+0xe1/0x1b0 [amdgpu 5154987db73e842b9b4f761e2bd86e17c7ada65c]
[   12.133671]  ? _raw_spin_unlock_irqrestore+0x4c/0x60
[   12.133683]  ? init_mqd+0xe1/0x1b0 [amdgpu 5154987db73e842b9b4f761e2bd86e17c7ada65c]
[   12.134235]  init_mqd+0xe1/0x1b0 [amdgpu 5154987db73e842b9b4f761e2bd86e17c7ada65c]
[   12.134781]  init_mqd_hiq+0x12/0x30 [amdgpu 5154987db73e842b9b4f761e2bd86e17c7ada65c]
[   12.135340]  kq_initialize.constprop.0+0x309/0x400 [amdgpu 5154987db73e842b9b4f761e2bd86e17c7ada65c]
[   12.135898]  kernel_queue_init+0x44/0x80 [amdgpu 5154987db73e842b9b4f761e2bd86e17c7ada65c]
[   12.136439]  pm_init+0x70/0x100 [amdgpu 5154987db73e842b9b4f761e2bd86e17c7ada65c]
[   12.136984]  start_cpsch+0x1dc/0x280 [amdgpu 5154987db73e842b9b4f761e2bd86e17c7ada65c]
[   12.137525]  kgd2kfd_device_init+0x70f/0xd10 [amdgpu 5154987db73e842b9b4f761e2bd86e17c7ada65c]
[   12.138070]  amdgpu_amdkfd_device_init+0x172/0x230 [amdgpu 5154987db73e842b9b4f761e2bd86e17c7ada65c]
[   12.138618]  amdgpu_device_init+0x246a/0x2960 [amdgpu 5154987db73e842b9b4f761e2bd86e17c7ada65c]

The human readable call chain is:

kgd2kfd_device_init
  kfd_init_node
    kfd_resume
      node->dqm->ops.start

Where start can be start_cpsch, which calls pm_init, etc, which ends up
calling kq->mqd_mgr->init_mqd, which takes the profiler lock:

init_mqd()
{
...
mutex_lock(&mm->dev->kfd->profiler_lock);
...

Fix it by initializing the mutex at the top of kgd2kfd_device_init().

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Fixes: a789761de305 ("amd/amdkfd: Add kfd_ioctl_profiler to contain profiler kernel driver changes")
Cc: Benjamin Welton <benjamin.welton@amd.com>
Cc: Perry Yuan <perry.yuan@amd.com>
Cc: Kent Russell <kent.russell@amd.com>
Cc: Yifan Zhang <yifan1.zhang@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdkfd/kfd_device.c

index f48eb0f739afdfc77adb03d70a9362efb8e600dd..5eb863dec8f4d6975ffe3429a6831d19c6299b76 100644 (file)
@@ -739,6 +739,9 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
        int partition_mode;
        int xcp_idx;
 
+       kfd->profiler_process = NULL;
+       mutex_init(&kfd->profiler_lock);
+
        kfd->mec_fw_version = amdgpu_amdkfd_get_fw_version(kfd->adev,
                        KGD_ENGINE_MEC1);
        kfd->mec2_fw_version = amdgpu_amdkfd_get_fw_version(kfd->adev,
@@ -939,9 +942,6 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 
        svm_range_set_max_pages(kfd->adev);
 
-       kfd->profiler_process = NULL;
-       mutex_init(&kfd->profiler_lock);
-
        kfd->init_complete = true;
        dev_info(kfd_device, "added device %x:%x\n", kfd->adev->pdev->vendor,
                 kfd->adev->pdev->device);