]> git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
drm/amdgpu: Replace Mutex with Spinlock for RLCG register access to avoid Priority...
authorSrinivasan Shanmugam <srinivasan.shanmugam@amd.com>
Thu, 29 Jan 2026 09:27:02 +0000 (17:27 +0800)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 6 Feb 2026 15:44:24 +0000 (16:44 +0100)
[ Upstream commit dc0297f3198bd60108ccbd167ee5d9fa4af31ed0 ]

RLCG Register Access is a way for virtual functions to safely access GPU
registers in a virtualized environment., including TLB flushes and
register reads. When multiple threads or VFs try to access the same
registers simultaneously, it can lead to race conditions. By using the
RLCG interface, the driver can serialize access to the registers. This
means that only one thread can access the registers at a time,
preventing conflicts and ensuring that operations are performed
correctly. Additionally, when a low-priority task holds a mutex that a
high-priority task needs, ie., If a thread holding a spinlock tries to
acquire a mutex, it can lead to priority inversion. register access in
amdgpu_virt_rlcg_reg_rw especially in a fast code path is critical.

The call stack shows that the function amdgpu_virt_rlcg_reg_rw is being
called, which attempts to acquire the mutex. This function is invoked
from amdgpu_sriov_wreg, which in turn is called from
gmc_v11_0_flush_gpu_tlb.

The [ BUG: Invalid wait context ] indicates that a thread is trying to
acquire a mutex while it is in a context that does not allow it to sleep
(like holding a spinlock).

Fixes the below:

[  253.013423] =============================
[  253.013434] [ BUG: Invalid wait context ]
[  253.013446] 6.12.0-amdstaging-drm-next-lol-050225 #14 Tainted: G     U     OE
[  253.013464] -----------------------------
[  253.013475] kworker/0:1/10 is trying to lock:
[  253.013487] ffff9f30542e3cf8 (&adev->virt.rlcg_reg_lock){+.+.}-{3:3}, at: amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu]
[  253.013815] other info that might help us debug this:
[  253.013827] context-{4:4}
[  253.013835] 3 locks held by kworker/0:1/10:
[  253.013847]  #0: ffff9f3040050f58 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x3f5/0x680
[  253.013877]  #1: ffffb789c008be40 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_one_work+0x1d6/0x680
[  253.013905]  #2: ffff9f3054281838 (&adev->gmc.invalidate_lock){+.+.}-{2:2}, at: gmc_v11_0_flush_gpu_tlb+0x198/0x4f0 [amdgpu]
[  253.014154] stack backtrace:
[  253.014164] CPU: 0 UID: 0 PID: 10 Comm: kworker/0:1 Tainted: G     U     OE      6.12.0-amdstaging-drm-next-lol-050225 #14
[  253.014189] Tainted: [U]=USER, [O]=OOT_MODULE, [E]=UNSIGNED_MODULE
[  253.014203] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 11/18/2024
[  253.014224] Workqueue: events work_for_cpu_fn
[  253.014241] Call Trace:
[  253.014250]  <TASK>
[  253.014260]  dump_stack_lvl+0x9b/0xf0
[  253.014275]  dump_stack+0x10/0x20
[  253.014287]  __lock_acquire+0xa47/0x2810
[  253.014303]  ? srso_alias_return_thunk+0x5/0xfbef5
[  253.014321]  lock_acquire+0xd1/0x300
[  253.014333]  ? amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu]
[  253.014562]  ? __lock_acquire+0xa6b/0x2810
[  253.014578]  __mutex_lock+0x85/0xe20
[  253.014591]  ? amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu]
[  253.014782]  ? sched_clock_noinstr+0x9/0x10
[  253.014795]  ? srso_alias_return_thunk+0x5/0xfbef5
[  253.014808]  ? local_clock_noinstr+0xe/0xc0
[  253.014822]  ? amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu]
[  253.015012]  ? srso_alias_return_thunk+0x5/0xfbef5
[  253.015029]  mutex_lock_nested+0x1b/0x30
[  253.015044]  ? mutex_lock_nested+0x1b/0x30
[  253.015057]  amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu]
[  253.015249]  amdgpu_sriov_wreg+0xc5/0xd0 [amdgpu]
[  253.015435]  gmc_v11_0_flush_gpu_tlb+0x44b/0x4f0 [amdgpu]
[  253.015667]  gfx_v11_0_hw_init+0x499/0x29c0 [amdgpu]
[  253.015901]  ? __pfx_smu_v13_0_update_pcie_parameters+0x10/0x10 [amdgpu]
[  253.016159]  ? srso_alias_return_thunk+0x5/0xfbef5
[  253.016173]  ? smu_hw_init+0x18d/0x300 [amdgpu]
[  253.016403]  amdgpu_device_init+0x29ad/0x36a0 [amdgpu]
[  253.016614]  amdgpu_driver_load_kms+0x1a/0xc0 [amdgpu]
[  253.017057]  amdgpu_pci_probe+0x1c2/0x660 [amdgpu]
[  253.017493]  local_pci_probe+0x4b/0xb0
[  253.017746]  work_for_cpu_fn+0x1a/0x30
[  253.017995]  process_one_work+0x21e/0x680
[  253.018248]  worker_thread+0x190/0x330
[  253.018500]  ? __pfx_worker_thread+0x10/0x10
[  253.018746]  kthread+0xe7/0x120
[  253.018988]  ? __pfx_kthread+0x10/0x10
[  253.019231]  ret_from_fork+0x3c/0x60
[  253.019468]  ? __pfx_kthread+0x10/0x10
[  253.019701]  ret_from_fork_asm+0x1a/0x30
[  253.019939]  </TASK>

v2: s/spin_trylock/spin_lock_irqsave to be safe (Christian).

Fixes: e864180ee49b ("drm/amdgpu: Add lock around VF RLCG interface")
Cc: lin cao <lin.cao@amd.com>
Cc: Jingwen Chen <Jingwen.Chen2@amd.com>
Cc: Victor Skvortsov <victor.skvortsov@amd.com>
Cc: Zhigang Luo <zhigang.luo@amd.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
Suggested-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
[ Minor conflict resolved. ]
Signed-off-by: Li hongliang <1468888505@139.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

index 38b81ae236cb3d9a2c2fc8546edbb2be7fc9af66..535cc74c5880166a751cede6b17bfa10eaa1eda2 100644 (file)
@@ -3652,7 +3652,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        mutex_init(&adev->grbm_idx_mutex);
        mutex_init(&adev->mn_lock);
        mutex_init(&adev->virt.vf_errors.lock);
-       mutex_init(&adev->virt.rlcg_reg_lock);
        hash_init(adev->mn_hash);
        mutex_init(&adev->psp.mutex);
        mutex_init(&adev->notifier_lock);
@@ -3674,6 +3673,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        spin_lock_init(&adev->se_cac_idx_lock);
        spin_lock_init(&adev->audio_endpt_idx_lock);
        spin_lock_init(&adev->mm_stats.lock);
+       spin_lock_init(&adev->virt.rlcg_reg_lock);
 
        INIT_LIST_HEAD(&adev->shadow_list);
        mutex_init(&adev->shadow_list_lock);
index c626ed88e6420db49ba0538204358ab231571a72..d300f7710ebb3e86d0eb18d37c2ee45e8d906365 100644 (file)
@@ -965,6 +965,7 @@ static u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v
        void *scratch_reg2;
        void *scratch_reg3;
        void *spare_int;
+       unsigned long flags;
 
        if (!adev->gfx.rlc.rlcg_reg_access_supported) {
                dev_err(adev->dev,
@@ -978,7 +979,7 @@ static u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v
        scratch_reg2 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg2;
        scratch_reg3 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg3;
 
-       mutex_lock(&adev->virt.rlcg_reg_lock);
+       spin_lock_irqsave(&adev->virt.rlcg_reg_lock, flags);
 
        if (reg_access_ctrl->spare_int)
                spare_int = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->spare_int;
@@ -1034,7 +1035,7 @@ static u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v
 
        ret = readl(scratch_reg0);
 
-       mutex_unlock(&adev->virt.rlcg_reg_lock);
+       spin_unlock_irqrestore(&adev->virt.rlcg_reg_lock, flags);
 
        return ret;
 }
index fc2859726f0a691d7c5c6fbff6f740a0a3c7df67..9267e792c4cf8abcd8ad04a1074faeb3a450a70d 100644 (file)
@@ -264,7 +264,8 @@ struct amdgpu_virt {
        /* the ucode id to signal the autoload */
        uint32_t autoload_ucode_id;
 
-       struct mutex rlcg_reg_lock;
+       /* Spinlock to protect access to the RLCG register interface */
+       spinlock_t rlcg_reg_lock;
 };
 
 struct amdgpu_video_codec_info;