1 From dc0297f3198bd60108ccbd167ee5d9fa4af31ed0 Mon Sep 17 00:00:00 2001
2 From: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
3 Date: Fri, 14 Feb 2025 14:22:17 +0530
4 Subject: drm/amdgpu: Replace Mutex with Spinlock for RLCG register access to avoid Priority Inversion in SRIOV
6 Content-Type: text/plain; charset=UTF-8
7 Content-Transfer-Encoding: 8bit
9 From: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
11 commit dc0297f3198bd60108ccbd167ee5d9fa4af31ed0 upstream.
13 RLCG Register Access is a way for virtual functions to safely access GPU
14 registers in a virtualized environment, including TLB flushes and
15 register reads. When multiple threads or VFs try to access the same
16 registers simultaneously, it can lead to race conditions. By using the
17 RLCG interface, the driver can serialize access to the registers. This
18 means that only one thread can access the registers at a time,
19 preventing conflicts and ensuring that operations are performed
20 correctly. Additionally, when a low-priority task holds a mutex that a
21 high-priority task needs, i.e., if a thread holding a spinlock tries to
22 acquire a mutex, it can lead to priority inversion. Register access in
23 amdgpu_virt_rlcg_reg_rw, especially in a fast code path, is critical.
25 The call stack shows that the function amdgpu_virt_rlcg_reg_rw is being
26 called, which attempts to acquire the mutex. This function is invoked
27 from amdgpu_sriov_wreg, which in turn is called from
28 gmc_v11_0_flush_gpu_tlb.
30 The [ BUG: Invalid wait context ] indicates that a thread is trying to
31 acquire a mutex while it is in a context that does not allow it to sleep
32 (like holding a spinlock).
36 [ 253.013423] =============================
37 [ 253.013434] [ BUG: Invalid wait context ]
38 [ 253.013446] 6.12.0-amdstaging-drm-next-lol-050225 #14 Tainted: G U OE
39 [ 253.013464] -----------------------------
40 [ 253.013475] kworker/0:1/10 is trying to lock:
41 [ 253.013487] ffff9f30542e3cf8 (&adev->virt.rlcg_reg_lock){+.+.}-{3:3}, at: amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu]
42 [ 253.013815] other info that might help us debug this:
43 [ 253.013827] context-{4:4}
44 [ 253.013835] 3 locks held by kworker/0:1/10:
45 [ 253.013847] #0: ffff9f3040050f58 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x3f5/0x680
46 [ 253.013877] #1: ffffb789c008be40 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_one_work+0x1d6/0x680
47 [ 253.013905] #2: ffff9f3054281838 (&adev->gmc.invalidate_lock){+.+.}-{2:2}, at: gmc_v11_0_flush_gpu_tlb+0x198/0x4f0 [amdgpu]
48 [ 253.014154] stack backtrace:
49 [ 253.014164] CPU: 0 UID: 0 PID: 10 Comm: kworker/0:1 Tainted: G U OE 6.12.0-amdstaging-drm-next-lol-050225 #14
50 [ 253.014189] Tainted: [U]=USER, [O]=OOT_MODULE, [E]=UNSIGNED_MODULE
51 [ 253.014203] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 11/18/2024
52 [ 253.014224] Workqueue: events work_for_cpu_fn
53 [ 253.014241] Call Trace:
55 [ 253.014260] dump_stack_lvl+0x9b/0xf0
56 [ 253.014275] dump_stack+0x10/0x20
57 [ 253.014287] __lock_acquire+0xa47/0x2810
58 [ 253.014303] ? srso_alias_return_thunk+0x5/0xfbef5
59 [ 253.014321] lock_acquire+0xd1/0x300
60 [ 253.014333] ? amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu]
61 [ 253.014562] ? __lock_acquire+0xa6b/0x2810
62 [ 253.014578] __mutex_lock+0x85/0xe20
63 [ 253.014591] ? amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu]
64 [ 253.014782] ? sched_clock_noinstr+0x9/0x10
65 [ 253.014795] ? srso_alias_return_thunk+0x5/0xfbef5
66 [ 253.014808] ? local_clock_noinstr+0xe/0xc0
67 [ 253.014822] ? amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu]
68 [ 253.015012] ? srso_alias_return_thunk+0x5/0xfbef5
69 [ 253.015029] mutex_lock_nested+0x1b/0x30
70 [ 253.015044] ? mutex_lock_nested+0x1b/0x30
71 [ 253.015057] amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu]
72 [ 253.015249] amdgpu_sriov_wreg+0xc5/0xd0 [amdgpu]
73 [ 253.015435] gmc_v11_0_flush_gpu_tlb+0x44b/0x4f0 [amdgpu]
74 [ 253.015667] gfx_v11_0_hw_init+0x499/0x29c0 [amdgpu]
75 [ 253.015901] ? __pfx_smu_v13_0_update_pcie_parameters+0x10/0x10 [amdgpu]
76 [ 253.016159] ? srso_alias_return_thunk+0x5/0xfbef5
77 [ 253.016173] ? smu_hw_init+0x18d/0x300 [amdgpu]
78 [ 253.016403] amdgpu_device_init+0x29ad/0x36a0 [amdgpu]
79 [ 253.016614] amdgpu_driver_load_kms+0x1a/0xc0 [amdgpu]
80 [ 253.017057] amdgpu_pci_probe+0x1c2/0x660 [amdgpu]
81 [ 253.017493] local_pci_probe+0x4b/0xb0
82 [ 253.017746] work_for_cpu_fn+0x1a/0x30
83 [ 253.017995] process_one_work+0x21e/0x680
84 [ 253.018248] worker_thread+0x190/0x330
85 [ 253.018500] ? __pfx_worker_thread+0x10/0x10
86 [ 253.018746] kthread+0xe7/0x120
87 [ 253.018988] ? __pfx_kthread+0x10/0x10
88 [ 253.019231] ret_from_fork+0x3c/0x60
89 [ 253.019468] ? __pfx_kthread+0x10/0x10
90 [ 253.019701] ret_from_fork_asm+0x1a/0x30
93 v2: s/spin_trylock/spin_lock_irqsave to be safe (Christian).
95 Fixes: e864180ee49b ("drm/amdgpu: Add lock around VF RLCG interface")
96 Cc: lin cao <lin.cao@amd.com>
97 Cc: Jingwen Chen <Jingwen.Chen2@amd.com>
98 Cc: Victor Skvortsov <victor.skvortsov@amd.com>
99 Cc: Zhigang Luo <zhigang.luo@amd.com>
100 Cc: Christian König <christian.koenig@amd.com>
101 Cc: Alex Deucher <alexander.deucher@amd.com>
102 Signed-off-by: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
103 Suggested-by: Alex Deucher <alexander.deucher@amd.com>
104 Reviewed-by: Christian König <christian.koenig@amd.com>
105 Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
106 [ Minor context change fixed. ]
107 Signed-off-by: Wenshan Lan <jetlan9@163.com>
108 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
110 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
111 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 5 +++--
112 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 ++-
113 3 files changed, 6 insertions(+), 4 deletions(-)
115 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
116 +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
117 @@ -4144,7 +4144,6 @@ int amdgpu_device_init(struct amdgpu_dev
118 mutex_init(&adev->grbm_idx_mutex);
119 mutex_init(&adev->mn_lock);
120 mutex_init(&adev->virt.vf_errors.lock);
121 - mutex_init(&adev->virt.rlcg_reg_lock);
122 hash_init(adev->mn_hash);
123 mutex_init(&adev->psp.mutex);
124 mutex_init(&adev->notifier_lock);
125 @@ -4170,6 +4169,7 @@ int amdgpu_device_init(struct amdgpu_dev
126 spin_lock_init(&adev->se_cac_idx_lock);
127 spin_lock_init(&adev->audio_endpt_idx_lock);
128 spin_lock_init(&adev->mm_stats.lock);
129 + spin_lock_init(&adev->virt.rlcg_reg_lock);
130 spin_lock_init(&adev->wb.lock);
132 INIT_LIST_HEAD(&adev->reset_list);
133 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
134 +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
135 @@ -1010,6 +1010,7 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgp
139 + unsigned long flags;
141 if (!adev->gfx.rlc.rlcg_reg_access_supported) {
143 @@ -1031,7 +1032,7 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgp
144 scratch_reg2 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg2;
145 scratch_reg3 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg3;
147 - mutex_lock(&adev->virt.rlcg_reg_lock);
148 + spin_lock_irqsave(&adev->virt.rlcg_reg_lock, flags);
150 if (reg_access_ctrl->spare_int)
151 spare_int = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->spare_int;
152 @@ -1090,7 +1091,7 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgp
154 ret = readl(scratch_reg0);
156 - mutex_unlock(&adev->virt.rlcg_reg_lock);
157 + spin_unlock_irqrestore(&adev->virt.rlcg_reg_lock, flags);
161 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
162 +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
163 @@ -275,7 +275,8 @@ struct amdgpu_virt {
164 /* the ucode id to signal the autoload */
165 uint32_t autoload_ucode_id;
167 - struct mutex rlcg_reg_lock;
168 + /* Spinlock to protect access to the RLCG register interface */
169 + spinlock_t rlcg_reg_lock;
172 struct amdgpu_video_codec_info;