From 3e1a12754d4df5804bfca5dedf09d2ba291bdc2a Mon Sep 17 00:00:00 2001
From: Lyude Paul <lyude@redhat.com>
Date: Wed, 15 Aug 2018 15:00:15 -0400
Subject: drm/nouveau: Fix deadlocks in nouveau_connector_detect()

From: Lyude Paul <lyude@redhat.com>

commit 3e1a12754d4df5804bfca5dedf09d2ba291bdc2a upstream.

When we disable hotplugging on the GPU, we need to be able to
synchronize with each connector's hotplug interrupt handler before the
interrupt is finally disabled. This can be a problem however, since
nouveau_connector_detect() currently grabs a runtime power reference
when handling connector probing. This will deadlock the runtime suspend
handler like so:

[ 861.480896] INFO: task kworker/0:2:61 blocked for more than 120 seconds.
[ 861.483290] Tainted: G O 4.18.0-rc6Lyude-Test+ #1
[ 861.485158] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 861.486332] kworker/0:2 D 0 61 2 0x80000000
[ 861.487044] Workqueue: events nouveau_display_hpd_work [nouveau]
[ 861.487737] Call Trace:
[ 861.488394] __schedule+0x322/0xaf0
[ 861.489070] schedule+0x33/0x90
[ 861.489744] rpm_resume+0x19c/0x850
[ 861.490392] ? finish_wait+0x90/0x90
[ 861.491068] __pm_runtime_resume+0x4e/0x90
[ 861.491753] nouveau_display_hpd_work+0x22/0x60 [nouveau]
[ 861.492416] process_one_work+0x231/0x620
[ 861.493068] worker_thread+0x44/0x3a0
[ 861.493722] kthread+0x12b/0x150
[ 861.494342] ? wq_pool_ids_show+0x140/0x140
[ 861.494991] ? kthread_create_worker_on_cpu+0x70/0x70
[ 861.495648] ret_from_fork+0x3a/0x50
[ 861.496304] INFO: task kworker/6:2:320 blocked for more than 120 seconds.
[ 861.496968] Tainted: G O 4.18.0-rc6Lyude-Test+ #1
[ 861.497654] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 861.498341] kworker/6:2 D 0 320 2 0x80000080
[ 861.499045] Workqueue: pm pm_runtime_work
[ 861.499739] Call Trace:
[ 861.500428] __schedule+0x322/0xaf0
[ 861.501134] ? wait_for_completion+0x104/0x190
[ 861.501851] schedule+0x33/0x90
[ 861.502564] schedule_timeout+0x3a5/0x590
[ 861.503284] ? mark_held_locks+0x58/0x80
[ 861.503988] ? _raw_spin_unlock_irq+0x2c/0x40
[ 861.504710] ? wait_for_completion+0x104/0x190
[ 861.505417] ? trace_hardirqs_on_caller+0xf4/0x190
[ 861.506136] ? wait_for_completion+0x104/0x190
[ 861.506845] wait_for_completion+0x12c/0x190
[ 861.507555] ? wake_up_q+0x80/0x80
[ 861.508268] flush_work+0x1c9/0x280
[ 861.508990] ? flush_workqueue_prep_pwqs+0x1b0/0x1b0
[ 861.509735] nvif_notify_put+0xb1/0xc0 [nouveau]
[ 861.510482] nouveau_display_fini+0xbd/0x170 [nouveau]
[ 861.511241] nouveau_display_suspend+0x67/0x120 [nouveau]
[ 861.511969] nouveau_do_suspend+0x5e/0x2d0 [nouveau]
[ 861.512715] nouveau_pmops_runtime_suspend+0x47/0xb0 [nouveau]
[ 861.513435] pci_pm_runtime_suspend+0x6b/0x180
[ 861.514165] ? pci_has_legacy_pm_support+0x70/0x70
[ 861.514897] __rpm_callback+0x7a/0x1d0
[ 861.515618] ? pci_has_legacy_pm_support+0x70/0x70
[ 861.516313] rpm_callback+0x24/0x80
[ 861.517027] ? pci_has_legacy_pm_support+0x70/0x70
[ 861.517741] rpm_suspend+0x142/0x6b0
[ 861.518449] pm_runtime_work+0x97/0xc0
[ 861.519144] process_one_work+0x231/0x620
[ 861.519831] worker_thread+0x44/0x3a0
[ 861.520522] kthread+0x12b/0x150
[ 861.521220] ? wq_pool_ids_show+0x140/0x140
[ 861.521925] ? kthread_create_worker_on_cpu+0x70/0x70
[ 861.522622] ret_from_fork+0x3a/0x50
[ 861.523299] INFO: task kworker/6:0:1329 blocked for more than 120 seconds.
[ 861.523977] Tainted: G O 4.18.0-rc6Lyude-Test+ #1
[ 861.524644] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 861.525349] kworker/6:0 D 0 1329 2 0x80000000
[ 861.526073] Workqueue: events nvif_notify_work [nouveau]
[ 861.526751] Call Trace:
[ 861.527411] __schedule+0x322/0xaf0
[ 861.528089] schedule+0x33/0x90
[ 861.528758] rpm_resume+0x19c/0x850
[ 861.529399] ? finish_wait+0x90/0x90
[ 861.530073] __pm_runtime_resume+0x4e/0x90
[ 861.530798] nouveau_connector_detect+0x7e/0x510 [nouveau]
[ 861.531459] ? ww_mutex_lock+0x47/0x80
[ 861.532097] ? ww_mutex_lock+0x47/0x80
[ 861.532819] ? drm_modeset_lock+0x88/0x130 [drm]
[ 861.533481] drm_helper_probe_detect_ctx+0xa0/0x100 [drm_kms_helper]
[ 861.534127] drm_helper_hpd_irq_event+0xa4/0x120 [drm_kms_helper]
[ 861.534940] nouveau_connector_hotplug+0x98/0x120 [nouveau]
[ 861.535556] nvif_notify_work+0x2d/0xb0 [nouveau]
[ 861.536221] process_one_work+0x231/0x620
[ 861.536994] worker_thread+0x44/0x3a0
[ 861.537757] kthread+0x12b/0x150
[ 861.538463] ? wq_pool_ids_show+0x140/0x140
[ 861.539102] ? kthread_create_worker_on_cpu+0x70/0x70
[ 861.539815] ret_from_fork+0x3a/0x50
[ 861.540521]
Showing all locks held in the system:
[ 861.541696] 2 locks held by kworker/0:2/61:
[ 861.542406] #0: 000000002dbf8af5 ((wq_completion)"events"){+.+.}, at: process_one_work+0x1b3/0x620
[ 861.543071] #1: 0000000076868126 ((work_completion)(&drm->hpd_work)){+.+.}, at: process_one_work+0x1b3/0x620
[ 861.543814] 1 lock held by khungtaskd/64:
[ 861.544535] #0: 0000000059db4b53 (rcu_read_lock){....}, at: debug_show_all_locks+0x23/0x185
[ 861.545160] 3 locks held by kworker/6:2/320:
[ 861.545896] #0: 00000000d9e1bc59 ((wq_completion)"pm"){+.+.}, at: process_one_work+0x1b3/0x620
[ 861.546702] #1: 00000000c9f92d84 ((work_completion)(&dev->power.work)){+.+.}, at: process_one_work+0x1b3/0x620
[ 861.547443] #2: 000000004afc5de1 (drm_connector_list_iter){.+.+}, at: nouveau_display_fini+0x96/0x170 [nouveau]
[ 861.548146] 1 lock held by dmesg/983:
[ 861.548889] 2 locks held by zsh/1250:
[ 861.549605] #0: 00000000348e3cf6 (&tty->ldisc_sem){++++}, at: ldsem_down_read+0x37/0x40
[ 861.550393] #1: 000000007009a7a8 (&ldata->atomic_read_lock){+.+.}, at: n_tty_read+0xc1/0x870
[ 861.551122] 6 locks held by kworker/6:0/1329:
[ 861.551957] #0: 000000002dbf8af5 ((wq_completion)"events"){+.+.}, at: process_one_work+0x1b3/0x620
[ 861.552765] #1: 00000000ddb499ad ((work_completion)(&notify->work)#2){+.+.}, at: process_one_work+0x1b3/0x620
[ 861.553582] #2: 000000006e013cbe (&dev->mode_config.mutex){+.+.}, at: drm_helper_hpd_irq_event+0x6c/0x120 [drm_kms_helper]
[ 861.554357] #3: 000000004afc5de1 (drm_connector_list_iter){.+.+}, at: drm_helper_hpd_irq_event+0x78/0x120 [drm_kms_helper]
[ 861.555227] #4: 0000000044f294d9 (crtc_ww_class_acquire){+.+.}, at: drm_helper_probe_detect_ctx+0x3d/0x100 [drm_kms_helper]
[ 861.556133] #5: 00000000db193642 (crtc_ww_class_mutex){+.+.}, at: drm_modeset_lock+0x4b/0x130 [drm]

[ 861.557864] =============================================

[ 861.559507] NMI backtrace for cpu 2
[ 861.560363] CPU: 2 PID: 64 Comm: khungtaskd Tainted: G O 4.18.0-rc6Lyude-Test+ #1
[ 861.561197] Hardware name: LENOVO 20EQS64N0B/20EQS64N0B, BIOS N1EET78W (1.51 ) 05/18/2018
[ 861.561948] Call Trace:
[ 861.562757] dump_stack+0x8e/0xd3
[ 861.563516] nmi_cpu_backtrace.cold.3+0x14/0x5a
[ 861.564269] ? lapic_can_unplug_cpu.cold.27+0x42/0x42
[ 861.565029] nmi_trigger_cpumask_backtrace+0xa1/0xae
[ 861.565789] arch_trigger_cpumask_backtrace+0x19/0x20
[ 861.566558] watchdog+0x316/0x580
[ 861.567355] kthread+0x12b/0x150
[ 861.568114] ? reset_hung_task_detector+0x20/0x20
[ 861.568863] ? kthread_create_worker_on_cpu+0x70/0x70
[ 861.569598] ret_from_fork+0x3a/0x50
[ 861.570370] Sending NMI from CPU 2 to CPUs 0-1,3-7:
[ 861.571426] NMI backtrace for cpu 6 skipped: idling at intel_idle+0x7f/0x120
[ 861.571429] NMI backtrace for cpu 7 skipped: idling at intel_idle+0x7f/0x120
[ 861.571432] NMI backtrace for cpu 3 skipped: idling at intel_idle+0x7f/0x120
[ 861.571464] NMI backtrace for cpu 5 skipped: idling at intel_idle+0x7f/0x120
[ 861.571467] NMI backtrace for cpu 0 skipped: idling at intel_idle+0x7f/0x120
[ 861.571469] NMI backtrace for cpu 4 skipped: idling at intel_idle+0x7f/0x120
[ 861.571472] NMI backtrace for cpu 1 skipped: idling at intel_idle+0x7f/0x120
[ 861.572428] Kernel panic - not syncing: hung_task: blocked tasks

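In short, the two stuck workers above are waiting on each other: the
runtime-suspend worker is flushing the hotplug notifier's work item, while
that work item is blocked in a synchronous runtime-PM resume that cannot
proceed until the suspend finishes. Condensed call paths (a paraphrase of
the backtraces above, not literal source):

  /* Path 1 - "pm" workqueue, runtime suspend in progress */
  nouveau_pmops_runtime_suspend()
    nouveau_display_fini()
      nvif_notify_put()            /* flush_work() on the hotplug notify work;
                                    * waits for path 2 to return */

  /* Path 2 - "events" workqueue, the hotplug notify work */
  nvif_notify_work()
    nouveau_connector_hotplug()
      drm_helper_hpd_irq_event()
        nouveau_connector_detect()
          pm_runtime_get_sync()    /* blocks until the suspend in path 1
                                    * completes: neither side can finish */
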
So: fix this by making it so that normal hotplug handling /only/ happens
so long as the GPU is currently awake without any pending runtime PM
requests. In the event that a hotplug occurs while the device is
suspending or resuming, we can simply defer our response until the GPU
is fully runtime resumed again.

Changes since v4:
- Use a new trick I came up with using pm_runtime_get() instead of the
  hackish junk we had before

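The fix below relies on the return value of pm_runtime_get(), the
non-blocking reference-taking helper: roughly, it returns 1 when the device
is already runtime-active, 0 when the device is not active yet and the
resume will only happen asynchronously, and a negative error otherwise
(e.g. -EACCES when runtime PM is disabled for the device). A minimal sketch
of that dispatch, with handle_hpd()/defer_hpd() as hypothetical placeholders
rather than real nouveau functions:

  ret = pm_runtime_get(dev);                /* takes a usage reference, does
                                             * not wait for the resume */
  if (ret == 1) {
          handle_hpd();                     /* already powered: safe to touch hw */
          pm_runtime_put_autosuspend(dev);
  } else if (ret == 0) {
          defer_hpd();                      /* resume only queued: don't block here */
          pm_runtime_put_noidle(dev);       /* drop the ref without an idle check */
  } else if (ret == -EACCES) {
          handle_hpd();                     /* runtime PM disabled: device stays on */
          pm_runtime_put_autosuspend(dev);
  } else {
          pm_runtime_put_noidle(dev);       /* genuine error: drop the event */
  }
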
Signed-off-by: Lyude Paul <lyude@redhat.com>
Reviewed-by: Karol Herbst <kherbst@redhat.com>
Acked-by: Daniel Vetter <daniel@ffwll.ch>
Cc: stable@vger.kernel.org
Cc: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 drivers/gpu/drm/nouveau/nouveau_connector.c |   22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

--- a/drivers/gpu/drm/nouveau/nouveau_connector.c
+++ b/drivers/gpu/drm/nouveau/nouveau_connector.c
@@ -1120,6 +1120,26 @@ nouveau_connector_hotplug(struct nvif_no
 	const struct nvif_notify_conn_rep_v0 *rep = notify->data;
 	const char *name = connector->name;
 	struct nouveau_encoder *nv_encoder;
+	int ret;
+
+	ret = pm_runtime_get(drm->dev->dev);
+	if (ret == 0) {
+		/* We can't block here if there's a pending PM request
+		 * running, as we'll deadlock nouveau_display_fini() when it
+		 * calls nvif_put() on our nvif_notify struct. So, simply
+		 * defer the hotplug event until the device finishes resuming
+		 */
+		NV_DEBUG(drm, "Deferring HPD on %s until runtime resume\n",
+			 name);
+		schedule_work(&drm->hpd_work);
+
+		pm_runtime_put_noidle(drm->dev->dev);
+		return NVIF_NOTIFY_KEEP;
+	} else if (ret != 1 && ret != -EACCES) {
+		NV_WARN(drm, "HPD on %s dropped due to RPM failure: %d\n",
+			name, ret);
+		return NVIF_NOTIFY_DROP;
+	}
 
 	if (rep->mask & NVIF_NOTIFY_CONN_V0_IRQ) {
 		NV_DEBUG(drm, "service %s\n", name);
@@ -1137,6 +1157,8 @@ nouveau_connector_hotplug(struct nvif_no
 		drm_helper_hpd_irq_event(connector->dev);
 	}
 
+	pm_runtime_mark_last_busy(drm->dev->dev);
+	pm_runtime_put_autosuspend(drm->dev->dev);
 	return NVIF_NOTIFY_KEEP;
 }
 
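For completeness: the deferral works because schedule_work(&drm->hpd_work)
re-queues nouveau's existing hotplug worker, the same nouveau_display_hpd_work
seen in the first backtrace, which takes a synchronous runtime-PM reference
before re-running connector probing. Roughly (paraphrased, not the literal
nouveau_display.c source):

  static void nouveau_display_hpd_work(struct work_struct *work)
  {
          struct nouveau_drm *drm = container_of(work, typeof(*drm), hpd_work);

          pm_runtime_get_sync(drm->dev->dev);     /* waits for the resume to finish */
          drm_helper_hpd_irq_event(drm->dev);     /* re-probe connectors */
          pm_runtime_mark_last_busy(drm->dev->dev);
          pm_runtime_put_autosuspend(drm->dev->dev);
  }

So a hotplug event that arrives mid-transition is simply replayed once the
GPU is fully awake again.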