drivers/gpu/drm/i915/i915_reset.c
1 /*
2 * SPDX-License-Identifier: MIT
3 *
4 * Copyright © 2008-2018 Intel Corporation
5 */
6
7 #include <linux/sched/mm.h>
8 #include <linux/stop_machine.h>
9
10 #include "i915_drv.h"
11 #include "i915_gpu_error.h"
12 #include "i915_reset.h"
13
14 #include "intel_guc.h"
15
16 #define RESET_MAX_RETRIES 3
17
18 /* XXX How to handle concurrent GGTT updates using tiling registers? */
19 #define RESET_UNDER_STOP_MACHINE 0
20
21 static void rmw_set(struct intel_uncore *uncore, i915_reg_t reg, u32 set)
22 {
23 intel_uncore_rmw(uncore, reg, 0, set);
24 }
25
26 static void rmw_clear(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
27 {
28 intel_uncore_rmw(uncore, reg, clr, 0);
29 }
30
31 static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set)
32 {
33 intel_uncore_rmw_fw(uncore, reg, 0, set);
34 }
35
36 static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
37 {
38 intel_uncore_rmw_fw(uncore, reg, clr, 0);
39 }
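/*
 * The *_fw variants use the raw register accessors and leave forcewake
 * handling to the caller; the reset paths below hold FORCEWAKE_ALL around
 * the actual reset (see intel_gpu_reset()) or touch registers that live
 * outside the GT power well.
 */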
40
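/*
 * Cancel the remaining submitted requests belonging to the hung context.
 * list_for_each_entry_continue() starts at the request *after* @rq; the
 * guilty request itself has already been skipped by the caller.
 */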
41 static void engine_skip_context(struct i915_request *rq)
42 {
43 struct intel_engine_cs *engine = rq->engine;
44 struct i915_gem_context *hung_ctx = rq->gem_context;
45
46 lockdep_assert_held(&engine->timeline.lock);
47
48 if (!i915_request_is_active(rq))
49 return;
50
51 list_for_each_entry_continue(rq, &engine->timeline.requests, link)
52 if (rq->gem_context == hung_ctx)
53 i915_request_skip(rq, -EIO);
54 }
55
56 static void client_mark_guilty(struct drm_i915_file_private *file_priv,
57 const struct i915_gem_context *ctx)
58 {
59 unsigned int score;
60 unsigned long prev_hang;
61
62 if (i915_gem_context_is_banned(ctx))
63 score = I915_CLIENT_SCORE_CONTEXT_BAN;
64 else
65 score = 0;
66
67 prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
68 if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
69 score += I915_CLIENT_SCORE_HANG_FAST;
70
71 if (score) {
72 atomic_add(score, &file_priv->ban_score);
73
74 DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
75 ctx->name, score,
76 atomic_read(&file_priv->ban_score));
77 }
78 }
79
80 static bool context_mark_guilty(struct i915_gem_context *ctx)
81 {
82 unsigned long prev_hang;
83 bool banned;
84 int i;
85
86 atomic_inc(&ctx->guilty_count);
87
88 /* Cool contexts are too cool to be banned! (Used for reset testing.) */
89 if (!i915_gem_context_is_bannable(ctx))
90 return false;
91
92 /* Record the timestamp for the last N hangs */
93 prev_hang = ctx->hang_timestamp[0];
94 for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
95 ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
96 ctx->hang_timestamp[i] = jiffies;
97
98 /* If we have hung N+1 times in rapid succession, we ban the context! */
99 banned = !i915_gem_context_is_recoverable(ctx);
100 if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
101 banned = true;
102 if (banned) {
103 DRM_DEBUG_DRIVER("context %s: guilty %d, banned\n",
104 ctx->name, atomic_read(&ctx->guilty_count));
105 i915_gem_context_set_banned(ctx);
106 }
107
108 if (!IS_ERR_OR_NULL(ctx->file_priv))
109 client_mark_guilty(ctx->file_priv, ctx);
110
111 return banned;
112 }
113
114 static void context_mark_innocent(struct i915_gem_context *ctx)
115 {
116 atomic_inc(&ctx->active_count);
117 }
118
119 void i915_reset_request(struct i915_request *rq, bool guilty)
120 {
121 GEM_TRACE("%s rq=%llx:%lld, guilty? %s\n",
122 rq->engine->name,
123 rq->fence.context,
124 rq->fence.seqno,
125 yesno(guilty));
126
127 lockdep_assert_held(&rq->engine->timeline.lock);
128 GEM_BUG_ON(i915_request_completed(rq));
129
130 if (guilty) {
131 i915_request_skip(rq, -EIO);
132 if (context_mark_guilty(rq->gem_context))
133 engine_skip_context(rq);
134 } else {
135 dma_fence_set_error(&rq->fence, -EAGAIN);
136 context_mark_innocent(rq->gem_context);
137 }
138 }
139
140 static void gen3_stop_engine(struct intel_engine_cs *engine)
141 {
142 struct intel_uncore *uncore = engine->uncore;
143 const u32 base = engine->mmio_base;
144
145 GEM_TRACE("%s\n", engine->name);
146
147 if (intel_engine_stop_cs(engine))
148 GEM_TRACE("%s: timed out on STOP_RING\n", engine->name);
149
150 intel_uncore_write_fw(uncore,
151 RING_HEAD(base),
152 intel_uncore_read_fw(uncore, RING_TAIL(base)));
153 intel_uncore_posting_read_fw(uncore, RING_HEAD(base)); /* paranoia */
154
155 intel_uncore_write_fw(uncore, RING_HEAD(base), 0);
156 intel_uncore_write_fw(uncore, RING_TAIL(base), 0);
157 intel_uncore_posting_read_fw(uncore, RING_TAIL(base));
158
159 /* The ring must be empty before it is disabled */
160 intel_uncore_write_fw(uncore, RING_CTL(base), 0);
161
162 /* Check acts as a post */
163 if (intel_uncore_read_fw(uncore, RING_HEAD(base)))
164 GEM_TRACE("%s: ring head [%x] not parked\n",
165 engine->name,
166 intel_uncore_read_fw(uncore, RING_HEAD(base)));
167 }
168
169 static void i915_stop_engines(struct drm_i915_private *i915,
170 intel_engine_mask_t engine_mask)
171 {
172 struct intel_engine_cs *engine;
173 intel_engine_mask_t tmp;
174
175 if (INTEL_GEN(i915) < 3)
176 return;
177
178 for_each_engine_masked(engine, i915, engine_mask, tmp)
179 gen3_stop_engine(engine);
180 }
181
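/*
 * On the pre-gen5 platforms handled below, the GDRST reset control lives
 * in PCI config space (I915_GDRST) rather than MMIO, hence the
 * pci_{read,write}_config_byte() accesses.
 */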
182 static bool i915_in_reset(struct pci_dev *pdev)
183 {
184 u8 gdrst;
185
186 pci_read_config_byte(pdev, I915_GDRST, &gdrst);
187 return gdrst & GRDOM_RESET_STATUS;
188 }
189
190 static int i915_do_reset(struct drm_i915_private *i915,
191 intel_engine_mask_t engine_mask,
192 unsigned int retry)
193 {
194 struct pci_dev *pdev = i915->drm.pdev;
195 int err;
196
197 /* Assert reset for at least 20 usec, and wait for acknowledgement. */
198 pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
199 udelay(50);
200 err = wait_for_atomic(i915_in_reset(pdev), 50);
201
202 /* Clear the reset request. */
203 pci_write_config_byte(pdev, I915_GDRST, 0);
204 udelay(50);
205 if (!err)
206 err = wait_for_atomic(!i915_in_reset(pdev), 50);
207
208 return err;
209 }
210
211 static bool g4x_reset_complete(struct pci_dev *pdev)
212 {
213 u8 gdrst;
214
215 pci_read_config_byte(pdev, I915_GDRST, &gdrst);
216 return (gdrst & GRDOM_RESET_ENABLE) == 0;
217 }
218
219 static int g33_do_reset(struct drm_i915_private *i915,
220 intel_engine_mask_t engine_mask,
221 unsigned int retry)
222 {
223 struct pci_dev *pdev = i915->drm.pdev;
224
225 pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
226 return wait_for_atomic(g4x_reset_complete(pdev), 50);
227 }
228
229 static int g4x_do_reset(struct drm_i915_private *i915,
230 intel_engine_mask_t engine_mask,
231 unsigned int retry)
232 {
233 struct pci_dev *pdev = i915->drm.pdev;
234 struct intel_uncore *uncore = &i915->uncore;
235 int ret;
236
237 /* WaVcpClkGateDisableForMediaReset:ctg,elk */
238 rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
239 intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
240
241 pci_write_config_byte(pdev, I915_GDRST,
242 GRDOM_MEDIA | GRDOM_RESET_ENABLE);
243 ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
244 if (ret) {
245 DRM_DEBUG_DRIVER("Wait for media reset failed\n");
246 goto out;
247 }
248
249 pci_write_config_byte(pdev, I915_GDRST,
250 GRDOM_RENDER | GRDOM_RESET_ENABLE);
251 ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
252 if (ret) {
253 DRM_DEBUG_DRIVER("Wait for render reset failed\n");
254 goto out;
255 }
256
257 out:
258 pci_write_config_byte(pdev, I915_GDRST, 0);
259
260 rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
261 intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
262
263 return ret;
264 }
265
266 static int ironlake_do_reset(struct drm_i915_private *i915,
267 intel_engine_mask_t engine_mask,
268 unsigned int retry)
269 {
270 struct intel_uncore *uncore = &i915->uncore;
271 int ret;
272
273 intel_uncore_write_fw(uncore, ILK_GDSR,
274 ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
275 ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
276 ILK_GRDOM_RESET_ENABLE, 0,
277 5000, 0,
278 NULL);
279 if (ret) {
280 DRM_DEBUG_DRIVER("Wait for render reset failed\n");
281 goto out;
282 }
283
284 intel_uncore_write_fw(uncore, ILK_GDSR,
285 ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
286 ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
287 ILK_GRDOM_RESET_ENABLE, 0,
288 5000, 0,
289 NULL);
290 if (ret) {
291 DRM_DEBUG_DRIVER("Wait for media reset failed\n");
292 goto out;
293 }
294
295 out:
296 intel_uncore_write_fw(uncore, ILK_GDSR, 0);
297 intel_uncore_posting_read_fw(uncore, ILK_GDSR);
298 return ret;
299 }
300
301 /* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
302 static int gen6_hw_domain_reset(struct drm_i915_private *i915,
303 u32 hw_domain_mask)
304 {
305 struct intel_uncore *uncore = &i915->uncore;
306 int err;
307
308 /*
309 * GEN6_GDRST is not in the gt power well, no need to check
310 * for fifo space for the write or forcewake the chip for
311 * the read
312 */
313 intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask);
314
315 /* Wait for the device to ack the reset requests */
316 err = __intel_wait_for_register_fw(uncore,
317 GEN6_GDRST, hw_domain_mask, 0,
318 500, 0,
319 NULL);
320 if (err)
321 DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
322 hw_domain_mask);
323
324 return err;
325 }
326
327 static int gen6_reset_engines(struct drm_i915_private *i915,
328 intel_engine_mask_t engine_mask,
329 unsigned int retry)
330 {
331 struct intel_engine_cs *engine;
332 const u32 hw_engine_mask[] = {
333 [RCS0] = GEN6_GRDOM_RENDER,
334 [BCS0] = GEN6_GRDOM_BLT,
335 [VCS0] = GEN6_GRDOM_MEDIA,
336 [VCS1] = GEN8_GRDOM_MEDIA2,
337 [VECS0] = GEN6_GRDOM_VECS,
338 };
339 u32 hw_mask;
340
341 if (engine_mask == ALL_ENGINES) {
342 hw_mask = GEN6_GRDOM_FULL;
343 } else {
344 intel_engine_mask_t tmp;
345
346 hw_mask = 0;
347 for_each_engine_masked(engine, i915, engine_mask, tmp) {
348 GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
349 hw_mask |= hw_engine_mask[engine->id];
350 }
351 }
352
353 return gen6_hw_domain_reset(i915, hw_mask);
354 }
355
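/*
 * Returns the SFC reset bit to be ORed into the GDRST mask if the shared
 * SFC unit is currently in use by @engine (so that it is reset together
 * with the engine), or 0 if there is nothing extra to reset.
 */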
356 static u32 gen11_lock_sfc(struct intel_engine_cs *engine)
357 {
358 struct intel_uncore *uncore = engine->uncore;
359 u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access;
360 i915_reg_t sfc_forced_lock, sfc_forced_lock_ack;
361 u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit;
362 i915_reg_t sfc_usage;
363 u32 sfc_usage_bit;
364 u32 sfc_reset_bit;
365
366 switch (engine->class) {
367 case VIDEO_DECODE_CLASS:
368 if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
369 return 0;
370
371 sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
372 sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
373
374 sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine);
375 sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT;
376
377 sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine);
378 sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT;
379 sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
380 break;
381
382 case VIDEO_ENHANCEMENT_CLASS:
383 sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
384 sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
385
386 sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine);
387 sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT;
388
389 sfc_usage = GEN11_VECS_SFC_USAGE(engine);
390 sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT;
391 sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
392 break;
393
394 default:
395 return 0;
396 }
397
398 /*
399 * Tell the engine that a software reset is going to happen. The engine
400 * will then try to force lock the SFC (if currently locked, it will
401 * remain so until we tell the engine it is safe to unlock; if currently
402 * unlocked, it will ignore this and all new lock requests). If SFC
403 * ends up being locked to the engine we want to reset, we have to reset
404 * it as well (we will unlock it once the reset sequence is completed).
405 */
406 rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
407
408 if (__intel_wait_for_register_fw(uncore,
409 sfc_forced_lock_ack,
410 sfc_forced_lock_ack_bit,
411 sfc_forced_lock_ack_bit,
412 1000, 0, NULL)) {
413 DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
414 return 0;
415 }
416
417 if (intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit)
418 return sfc_reset_bit;
419
420 return 0;
421 }
422
423 static void gen11_unlock_sfc(struct intel_engine_cs *engine)
424 {
425 struct intel_uncore *uncore = engine->uncore;
426 u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access;
427 i915_reg_t sfc_forced_lock;
428 u32 sfc_forced_lock_bit;
429
430 switch (engine->class) {
431 case VIDEO_DECODE_CLASS:
432 if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
433 return;
434
435 sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
436 sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
437 break;
438
439 case VIDEO_ENHANCEMENT_CLASS:
440 sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
441 sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
442 break;
443
444 default:
445 return;
446 }
447
448 rmw_clear_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
449 }
450
451 static int gen11_reset_engines(struct drm_i915_private *i915,
452 intel_engine_mask_t engine_mask,
453 unsigned int retry)
454 {
455 const u32 hw_engine_mask[] = {
456 [RCS0] = GEN11_GRDOM_RENDER,
457 [BCS0] = GEN11_GRDOM_BLT,
458 [VCS0] = GEN11_GRDOM_MEDIA,
459 [VCS1] = GEN11_GRDOM_MEDIA2,
460 [VCS2] = GEN11_GRDOM_MEDIA3,
461 [VCS3] = GEN11_GRDOM_MEDIA4,
462 [VECS0] = GEN11_GRDOM_VECS,
463 [VECS1] = GEN11_GRDOM_VECS2,
464 };
465 struct intel_engine_cs *engine;
466 intel_engine_mask_t tmp;
467 u32 hw_mask;
468 int ret;
469
470 if (engine_mask == ALL_ENGINES) {
471 hw_mask = GEN11_GRDOM_FULL;
472 } else {
473 hw_mask = 0;
474 for_each_engine_masked(engine, i915, engine_mask, tmp) {
475 GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
476 hw_mask |= hw_engine_mask[engine->id];
477 hw_mask |= gen11_lock_sfc(engine);
478 }
479 }
480
481 ret = gen6_hw_domain_reset(i915, hw_mask);
482
483 if (engine_mask != ALL_ENGINES)
484 for_each_engine_masked(engine, i915, engine_mask, tmp)
485 gen11_unlock_sfc(engine);
486
487 return ret;
488 }
489
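/*
 * RING_RESET_CTL is a masked register: _MASKED_BIT_ENABLE()/_DISABLE()
 * write the requested bit together with its write-enable mask in the
 * upper 16 bits, so only that bit is modified by the write.
 */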
490 static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
491 {
492 struct intel_uncore *uncore = engine->uncore;
493 const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base);
494 u32 request, mask, ack;
495 int ret;
496
497 ack = intel_uncore_read_fw(uncore, reg);
498 if (ack & RESET_CTL_CAT_ERROR) {
499 /*
500 * For catastrophic errors, ready-for-reset sequence
501 * needs to be bypassed: HAS#396813
502 */
503 request = RESET_CTL_CAT_ERROR;
504 mask = RESET_CTL_CAT_ERROR;
505
506 /* Catastrophic errors need to be cleared by HW */
507 ack = 0;
508 } else if (!(ack & RESET_CTL_READY_TO_RESET)) {
509 request = RESET_CTL_REQUEST_RESET;
510 mask = RESET_CTL_READY_TO_RESET;
511 ack = RESET_CTL_READY_TO_RESET;
512 } else {
513 return 0;
514 }
515
516 intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request));
517 ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
518 700, 0, NULL);
519 if (ret)
520 DRM_ERROR("%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n",
521 engine->name, request,
522 intel_uncore_read_fw(uncore, reg));
523
524 return ret;
525 }
526
527 static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
528 {
529 intel_uncore_write_fw(engine->uncore,
530 RING_RESET_CTL(engine->mmio_base),
531 _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
532 }
533
534 static int gen8_reset_engines(struct drm_i915_private *i915,
535 intel_engine_mask_t engine_mask,
536 unsigned int retry)
537 {
538 struct intel_engine_cs *engine;
539 const bool reset_non_ready = retry >= 1;
540 intel_engine_mask_t tmp;
541 int ret;
542
543 for_each_engine_masked(engine, i915, engine_mask, tmp) {
544 ret = gen8_engine_reset_prepare(engine);
545 if (ret && !reset_non_ready)
546 goto skip_reset;
547
548 /*
549 * If this is not the first failed attempt to prepare,
550 * we decide to proceed anyway.
551 *
552 * By doing so we risk context corruption and, with some gens
553 * (kbl), a possible system hang if the reset happens during
554 * active bb execution.
555 *
556 * We would rather take context corruption than a failed reset
557 * with a wedged driver/gpu. The active bb execution case
558 * should be covered by the i915_stop_engines() call we make
559 * before the reset.
560 */
561 }
562
563 if (INTEL_GEN(i915) >= 11)
564 ret = gen11_reset_engines(i915, engine_mask, retry);
565 else
566 ret = gen6_reset_engines(i915, engine_mask, retry);
567
568 skip_reset:
569 for_each_engine_masked(engine, i915, engine_mask, tmp)
570 gen8_engine_reset_cancel(engine);
571
572 return ret;
573 }
574
575 typedef int (*reset_func)(struct drm_i915_private *,
576 intel_engine_mask_t engine_mask,
577 unsigned int retry);
578
579 static reset_func intel_get_gpu_reset(struct drm_i915_private *i915)
580 {
581 if (INTEL_GEN(i915) >= 8)
582 return gen8_reset_engines;
583 else if (INTEL_GEN(i915) >= 6)
584 return gen6_reset_engines;
585 else if (INTEL_GEN(i915) >= 5)
586 return ironlake_do_reset;
587 else if (IS_G4X(i915))
588 return g4x_do_reset;
589 else if (IS_G33(i915) || IS_PINEVIEW(i915))
590 return g33_do_reset;
591 else if (INTEL_GEN(i915) >= 3)
592 return i915_do_reset;
593 else
594 return NULL;
595 }
596
597 int intel_gpu_reset(struct drm_i915_private *i915,
598 intel_engine_mask_t engine_mask)
599 {
600 const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
601 reset_func reset;
602 int ret = -ETIMEDOUT;
603 int retry;
604
605 reset = intel_get_gpu_reset(i915);
606 if (!reset)
607 return -ENODEV;
608
609 /*
610 * If the power well sleeps during the reset, the reset
611 * request may be dropped and never complete (causing -EIO).
612 */
613 intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL);
614 for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
615 /*
616 * We stop the engines, otherwise we might get a failed reset
617 * and a dead gpu (on elk). Also, a gpu as modern as kbl can
618 * suffer a system hang if a batchbuffer is progressing when the
619 * reset is issued, regardless of the READY_TO_RESET ack. Thus
620 * assume it is best to stop the engines on all gens where we
621 * have a gpu reset.
622 *
623 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
624 *
625 * WaMediaResetMainRingCleanup:ctg,elk (presumably)
626 *
627 * FIXME: Wa for more modern gens needs to be validated
628 */
629 if (retry)
630 i915_stop_engines(i915, engine_mask);
631
632 GEM_TRACE("engine_mask=%x\n", engine_mask);
633 preempt_disable();
634 ret = reset(i915, engine_mask, retry);
635 preempt_enable();
636 }
637 intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL);
638
639 return ret;
640 }
641
642 bool intel_has_gpu_reset(struct drm_i915_private *i915)
643 {
644 if (USES_GUC(i915))
645 return false;
646
647 if (!i915_modparams.reset)
648 return false;
649
650 return intel_get_gpu_reset(i915);
651 }
652
653 bool intel_has_reset_engine(struct drm_i915_private *i915)
654 {
655 return INTEL_INFO(i915)->has_reset_engine && i915_modparams.reset >= 2;
656 }
657
658 int intel_reset_guc(struct drm_i915_private *i915)
659 {
660 u32 guc_domain =
661 INTEL_GEN(i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
662 int ret;
663
664 GEM_BUG_ON(!HAS_GUC(i915));
665
666 intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL);
667 ret = gen6_hw_domain_reset(i915, guc_domain);
668 intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL);
669
670 return ret;
671 }
672
673 /*
674 * Ensure the irq handler finishes, and is not run again.
675 * Also return the active request so that we only search for it once.
676 */
677 static void reset_prepare_engine(struct intel_engine_cs *engine)
678 {
679 /*
680 * During the reset sequence, we must prevent the engine from
681 * entering RC6. As the context state is undefined until we restart
682 * the engine, if it does enter RC6 during the reset, the state
683 * written to the powercontext is undefined and so we may lose
684 * GPU state upon resume, i.e. fail to restart after a reset.
685 */
686 intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
687 engine->reset.prepare(engine);
688 }
689
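/*
 * Zap the userspace GGTT mmaps that go through a fence register: the reset
 * clobbers the fences, so userspace must take a fresh fault (which
 * serialises against the reset via i915_reset_trylock()) once the fences
 * have been restored.
 */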
690 static void revoke_mmaps(struct drm_i915_private *i915)
691 {
692 int i;
693
694 for (i = 0; i < i915->num_fence_regs; i++) {
695 struct drm_vma_offset_node *node;
696 struct i915_vma *vma;
697 u64 vma_offset;
698
699 vma = READ_ONCE(i915->fence_regs[i].vma);
700 if (!vma)
701 continue;
702
703 if (!i915_vma_has_userfault(vma))
704 continue;
705
706 GEM_BUG_ON(vma->fence != &i915->fence_regs[i]);
707 node = &vma->obj->base.vma_node;
708 vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;
709 unmap_mapping_range(i915->drm.anon_inode->i_mapping,
710 drm_vma_node_offset_addr(node) + vma_offset,
711 vma->size,
712 1);
713 }
714 }
715
716 static void reset_prepare(struct drm_i915_private *i915)
717 {
718 struct intel_engine_cs *engine;
719 enum intel_engine_id id;
720
721 for_each_engine(engine, i915, id)
722 reset_prepare_engine(engine);
723
724 intel_uc_reset_prepare(i915);
725 }
726
727 static void gt_revoke(struct drm_i915_private *i915)
728 {
729 revoke_mmaps(i915);
730 }
731
732 static int gt_reset(struct drm_i915_private *i915,
733 intel_engine_mask_t stalled_mask)
734 {
735 struct intel_engine_cs *engine;
736 enum intel_engine_id id;
737 int err;
738
739 /*
740 * Everything depends on having the GTT running, so we need to start
741 * there.
742 */
743 err = i915_ggtt_enable_hw(i915);
744 if (err)
745 return err;
746
747 for_each_engine(engine, i915, id)
748 intel_engine_reset(engine, stalled_mask & engine->mask);
749
750 i915_gem_restore_fences(i915);
751
752 return err;
753 }
754
755 static void reset_finish_engine(struct intel_engine_cs *engine)
756 {
757 engine->reset.finish(engine);
758 intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
759 }
760
761 struct i915_gpu_restart {
762 struct work_struct work;
763 struct drm_i915_private *i915;
764 };
765
766 static void restart_work(struct work_struct *work)
767 {
768 struct i915_gpu_restart *arg = container_of(work, typeof(*arg), work);
769 struct drm_i915_private *i915 = arg->i915;
770 struct intel_engine_cs *engine;
771 enum intel_engine_id id;
772 intel_wakeref_t wakeref;
773
774 wakeref = intel_runtime_pm_get(i915);
775 mutex_lock(&i915->drm.struct_mutex);
776 WRITE_ONCE(i915->gpu_error.restart, NULL);
777
778 for_each_engine(engine, i915, id) {
779 struct i915_request *rq;
780
781 /*
782 * Ostensibly, we always want a context loaded for powersaving,
783 * so if the engine is idle after the reset, send a request
784 * to load our scratch kernel_context.
785 */
786 if (!intel_engine_is_idle(engine))
787 continue;
788
789 rq = i915_request_alloc(engine, i915->kernel_context);
790 if (!IS_ERR(rq))
791 i915_request_add(rq);
792 }
793
794 mutex_unlock(&i915->drm.struct_mutex);
795 intel_runtime_pm_put(i915, wakeref);
796
797 kfree(arg);
798 }
799
800 static void reset_finish(struct drm_i915_private *i915)
801 {
802 struct intel_engine_cs *engine;
803 enum intel_engine_id id;
804
805 for_each_engine(engine, i915, id) {
806 reset_finish_engine(engine);
807 intel_engine_signal_breadcrumbs(engine);
808 }
809 }
810
811 static void reset_restart(struct drm_i915_private *i915)
812 {
813 struct i915_gpu_restart *arg;
814
815 /*
816 * Following the reset, ensure that we always reload context for
817 * powersaving, and to correct engine->last_retired_context. Since
818 * this requires us to submit a request, queue a worker to do that
819 * task for us to evade any locking here.
820 */
821 if (READ_ONCE(i915->gpu_error.restart))
822 return;
823
824 arg = kmalloc(sizeof(*arg), GFP_KERNEL);
825 if (arg) {
826 arg->i915 = i915;
827 INIT_WORK(&arg->work, restart_work);
828
829 WRITE_ONCE(i915->gpu_error.restart, arg);
830 queue_work(i915->wq, &arg->work);
831 }
832 }
833
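/*
 * Fake submission used once the device is wedged: each request is
 * completed immediately with fence error -EIO so that nothing is left
 * waiting on a dead GPU.
 */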
834 static void nop_submit_request(struct i915_request *request)
835 {
836 struct intel_engine_cs *engine = request->engine;
837 unsigned long flags;
838
839 GEM_TRACE("%s fence %llx:%lld -> -EIO\n",
840 engine->name, request->fence.context, request->fence.seqno);
841 dma_fence_set_error(&request->fence, -EIO);
842
843 spin_lock_irqsave(&engine->timeline.lock, flags);
844 __i915_request_submit(request);
845 i915_request_mark_complete(request);
846 spin_unlock_irqrestore(&engine->timeline.lock, flags);
847
848 intel_engine_queue_breadcrumbs(engine);
849 }
850
851 static void __i915_gem_set_wedged(struct drm_i915_private *i915)
852 {
853 struct i915_gpu_error *error = &i915->gpu_error;
854 struct intel_engine_cs *engine;
855 enum intel_engine_id id;
856
857 if (test_bit(I915_WEDGED, &error->flags))
858 return;
859
860 if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(i915)) {
861 struct drm_printer p = drm_debug_printer(__func__);
862
863 for_each_engine(engine, i915, id)
864 intel_engine_dump(engine, &p, "%s\n", engine->name);
865 }
866
867 GEM_TRACE("start\n");
868
869 /*
870 * First, stop submission to hw, but do not yet complete requests by
871 * rolling the global seqno forward (since this would complete requests
872 * for which we haven't set the fence error to EIO yet).
873 */
874 reset_prepare(i915);
875
876 /* Even if the GPU reset fails, it should still stop the engines */
877 if (!INTEL_INFO(i915)->gpu_reset_clobbers_display)
878 intel_gpu_reset(i915, ALL_ENGINES);
879
880 for_each_engine(engine, i915, id) {
881 engine->submit_request = nop_submit_request;
882 engine->schedule = NULL;
883 }
884 i915->caps.scheduler = 0;
885
886 /*
887 * Make sure no request can slip through without getting completed by
888 * either this call here to intel_engine_write_global_seqno, or the one
889 * in nop_submit_request.
890 */
891 synchronize_rcu_expedited();
892
893 /* Mark all executing requests as skipped */
894 for_each_engine(engine, i915, id)
895 engine->cancel_requests(engine);
896
897 reset_finish(i915);
898
899 smp_mb__before_atomic();
900 set_bit(I915_WEDGED, &error->flags);
901
902 GEM_TRACE("end\n");
903 }
904
905 void i915_gem_set_wedged(struct drm_i915_private *i915)
906 {
907 struct i915_gpu_error *error = &i915->gpu_error;
908 intel_wakeref_t wakeref;
909
910 mutex_lock(&error->wedge_mutex);
911 with_intel_runtime_pm(i915, wakeref)
912 __i915_gem_set_wedged(i915);
913 mutex_unlock(&error->wedge_mutex);
914 }
915
916 static bool __i915_gem_unset_wedged(struct drm_i915_private *i915)
917 {
918 struct i915_gpu_error *error = &i915->gpu_error;
919 struct i915_timeline *tl;
920
921 if (!test_bit(I915_WEDGED, &error->flags))
922 return true;
923
924 if (!i915->gt.scratch) /* Never fully initialised, recovery impossible */
925 return false;
926
927 GEM_TRACE("start\n");
928
929 /*
930 * Before unwedging, make sure that all pending operations
931 * are flushed and errored out - we may have requests waiting upon
932 * third party fences. We marked all inflight requests as -EIO, and
933 * every execbuf since then has returned -EIO; for consistency we want
934 * all the currently pending requests to also be marked as -EIO, which
935 * is done inside our nop_submit_request - and so we must wait.
936 *
937 * No more can be submitted until we reset the wedged bit.
938 */
939 mutex_lock(&i915->gt.timelines.mutex);
940 list_for_each_entry(tl, &i915->gt.timelines.active_list, link) {
941 struct i915_request *rq;
942
943 rq = i915_active_request_get_unlocked(&tl->last_request);
944 if (!rq)
945 continue;
946
947 /*
948 * All internal dependencies (i915_requests) will have
949 * been flushed by the set-wedge, but we may be stuck waiting
950 * for external fences. These should all be capped to 10s
951 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
952 * in the worst case.
953 */
954 dma_fence_default_wait(&rq->fence, false, MAX_SCHEDULE_TIMEOUT);
955 i915_request_put(rq);
956 }
957 mutex_unlock(&i915->gt.timelines.mutex);
958
959 intel_engines_sanitize(i915, false);
960
961 /*
962 * Undo nop_submit_request. We prevent all new i915 requests from
963 * being queued (by disallowing execbuf whilst wedged) so having
964 * waited for all active requests above, we know the system is idle
965 * and do not have to worry about a thread being inside
966 * engine->submit_request() as we swap over. So unlike installing
967 * the nop_submit_request on reset, we can do this from normal
968 * context and do not require stop_machine().
969 */
970 intel_engines_reset_default_submission(i915);
971
972 GEM_TRACE("end\n");
973
974 smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
975 clear_bit(I915_WEDGED, &i915->gpu_error.flags);
976
977 return true;
978 }
979
980 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
981 {
982 struct i915_gpu_error *error = &i915->gpu_error;
983 bool result;
984
985 mutex_lock(&error->wedge_mutex);
986 result = __i915_gem_unset_wedged(i915);
987 mutex_unlock(&error->wedge_mutex);
988
989 return result;
990 }
991
992 static int do_reset(struct drm_i915_private *i915,
993 intel_engine_mask_t stalled_mask)
994 {
995 int err, i;
996
997 gt_revoke(i915);
998
999 err = intel_gpu_reset(i915, ALL_ENGINES);
1000 for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
1001 msleep(10 * (i + 1));
1002 err = intel_gpu_reset(i915, ALL_ENGINES);
1003 }
1004 if (err)
1005 return err;
1006
1007 return gt_reset(i915, stalled_mask);
1008 }
1009
1010 /**
1011 * i915_reset - reset chip after a hang
1012 * @i915: #drm_i915_private to reset
1013 * @stalled_mask: mask of the stalled engines with the guilty requests
1014 * @reason: user error message for why we are resetting
1015 *
1016 * Reset the chip. Useful if a hang is detected. Marks the device as wedged
1017 * on failure.
1018 *
1019 * Procedure is fairly simple:
1020 * - reset the chip using the reset reg
1021 * - re-init context state
1022 * - re-init hardware status page
1023 * - re-init ring buffer
1024 * - re-init interrupt state
1025 * - re-init display
1026 */
1027 void i915_reset(struct drm_i915_private *i915,
1028 intel_engine_mask_t stalled_mask,
1029 const char *reason)
1030 {
1031 struct i915_gpu_error *error = &i915->gpu_error;
1032 int ret;
1033
1034 GEM_TRACE("flags=%lx\n", error->flags);
1035
1036 might_sleep();
1037 assert_rpm_wakelock_held(i915);
1038 GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
1039
1040 /* Clear any previous failed attempts at recovery. Time to try again. */
1041 if (!__i915_gem_unset_wedged(i915))
1042 return;
1043
1044 if (reason)
1045 dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
1046 error->reset_count++;
1047
1048 reset_prepare(i915);
1049
1050 if (!intel_has_gpu_reset(i915)) {
1051 if (i915_modparams.reset)
1052 dev_err(i915->drm.dev, "GPU reset not supported\n");
1053 else
1054 DRM_DEBUG_DRIVER("GPU reset disabled\n");
1055 goto error;
1056 }
1057
1058 if (INTEL_INFO(i915)->gpu_reset_clobbers_display)
1059 intel_runtime_pm_disable_interrupts(i915);
1060
1061 if (do_reset(i915, stalled_mask)) {
1062 dev_err(i915->drm.dev, "Failed to reset chip\n");
1063 goto taint;
1064 }
1065
1066 if (INTEL_INFO(i915)->gpu_reset_clobbers_display)
1067 intel_runtime_pm_enable_interrupts(i915);
1068
1069 intel_overlay_reset(i915);
1070
1071 /*
1072 * Next we need to restore the context, but we don't use those
1073 * yet either...
1074 *
1075 * Ring buffer needs to be re-initialized in the KMS case, or if X
1076 * was running at the time of the reset (i.e. we weren't VT
1077 * switched away).
1078 */
1079 ret = i915_gem_init_hw(i915);
1080 if (ret) {
1081 DRM_ERROR("Failed to initialise HW following reset (%d)\n",
1082 ret);
1083 goto error;
1084 }
1085
1086 i915_queue_hangcheck(i915);
1087
1088 finish:
1089 reset_finish(i915);
1090 if (!__i915_wedged(error))
1091 reset_restart(i915);
1092 return;
1093
1094 taint:
1095 /*
1096 * History tells us that if we cannot reset the GPU now, we
1097 * never will. This then impacts everything that is run
1098 * subsequently. On failing the reset, we mark the driver
1099 * as wedged, preventing further execution on the GPU.
1100 * We also want to go one step further and add a taint to the
1101 * kernel so that any subsequent faults can be traced back to
1102 * this failure. This is important for CI, where if the
1103 * GPU/driver fails we would like to reboot and restart testing
1104 * rather than continue on into oblivion. For everyone else,
1105 * the system should still plod along, but they have been warned!
1106 */
1107 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
1108 error:
1109 __i915_gem_set_wedged(i915);
1110 goto finish;
1111 }
1112
1113 static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
1114 struct intel_engine_cs *engine)
1115 {
1116 return intel_gpu_reset(i915, engine->mask);
1117 }
1118
1119 /**
1120 * i915_reset_engine - reset GPU engine to recover from a hang
1121 * @engine: engine to reset
1122 * @msg: reason for GPU reset; or NULL for no dev_notice()
1123 *
1124 * Reset a specific GPU engine. Useful if a hang is detected.
1125 * Returns zero on successful reset or otherwise an error code.
1126 *
1127 * Procedure is:
1128 *  - identify the request that caused the hang and drop it
1129 *  - reset the engine (which will force the engine to idle)
1130 *  - re-init/configure the engine
1131 */
1132 int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
1133 {
1134 struct i915_gpu_error *error = &engine->i915->gpu_error;
1135 int ret;
1136
1137 GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
1138 GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
1139
1140 reset_prepare_engine(engine);
1141
1142 if (msg)
1143 dev_notice(engine->i915->drm.dev,
1144 "Resetting %s for %s\n", engine->name, msg);
1145 error->reset_engine_count[engine->id]++;
1146
1147 if (!engine->i915->guc.execbuf_client)
1148 ret = intel_gt_reset_engine(engine->i915, engine);
1149 else
1150 ret = intel_guc_reset_engine(&engine->i915->guc, engine);
1151 if (ret) {
1152 /* If we fail here, we expect to fall back to a global reset */
1153 DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
1154 engine->i915->guc.execbuf_client ? "GuC " : "",
1155 engine->name, ret);
1156 goto out;
1157 }
1158
1159 /*
1160 * The request that caused the hang is stuck on elsp; we know the
1161 * active request and can drop it, adjusting HEAD to skip past the
1162 * offending request and resume the remaining requests in the queue.
1163 */
1164 intel_engine_reset(engine, true);
1165
1166 /*
1167 * The engine and its registers (and workarounds in case of render)
1168 * have been reset to their default values. Follow the init_ring
1169 * process to program RING_MODE, HWSP and re-enable submission.
1170 */
1171 ret = engine->init_hw(engine);
1172 if (ret)
1173 goto out;
1174
1175 out:
1176 intel_engine_cancel_stop_cs(engine);
1177 reset_finish_engine(engine);
1178 return ret;
1179 }
1180
1181 static void i915_reset_device(struct drm_i915_private *i915,
1182 u32 engine_mask,
1183 const char *reason)
1184 {
1185 struct i915_gpu_error *error = &i915->gpu_error;
1186 struct kobject *kobj = &i915->drm.primary->kdev->kobj;
1187 char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
1188 char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
1189 char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
1190 struct i915_wedge_me w;
1191
1192 kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
1193
1194 DRM_DEBUG_DRIVER("resetting chip\n");
1195 kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
1196
1197 /* Use a watchdog to ensure that our reset completes */
1198 i915_wedge_on_timeout(&w, i915, 5 * HZ) {
1199 intel_prepare_reset(i915);
1200
1201 /* Flush everyone using a resource about to be clobbered */
1202 synchronize_srcu_expedited(&error->reset_backoff_srcu);
1203
1204 mutex_lock(&error->wedge_mutex);
1205 i915_reset(i915, engine_mask, reason);
1206 mutex_unlock(&error->wedge_mutex);
1207
1208 intel_finish_reset(i915);
1209 }
1210
1211 if (!test_bit(I915_WEDGED, &error->flags))
1212 kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
1213 }
1214
1215 static void clear_register(struct intel_uncore *uncore, i915_reg_t reg)
1216 {
1217 intel_uncore_rmw(uncore, reg, 0, 0);
1218 }
1219
1220 void i915_clear_error_registers(struct drm_i915_private *i915)
1221 {
1222 struct intel_uncore *uncore = &i915->uncore;
1223 u32 eir;
1224
1225 if (!IS_GEN(i915, 2))
1226 clear_register(uncore, PGTBL_ER);
1227
1228 if (INTEL_GEN(i915) < 4)
1229 clear_register(uncore, IPEIR(RENDER_RING_BASE));
1230 else
1231 clear_register(uncore, IPEIR_I965);
1232
1233 clear_register(uncore, EIR);
1234 eir = intel_uncore_read(uncore, EIR);
1235 if (eir) {
1236 /*
1237 * some errors might have become stuck,
1238 * mask them.
1239 */
1240 DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir);
1241 rmw_set(uncore, EMR, eir);
1242 intel_uncore_write(uncore, GEN2_IIR,
1243 I915_MASTER_ERROR_INTERRUPT);
1244 }
1245
1246 if (INTEL_GEN(i915) >= 8) {
1247 rmw_clear(uncore, GEN8_RING_FAULT_REG, RING_FAULT_VALID);
1248 intel_uncore_posting_read(uncore, GEN8_RING_FAULT_REG);
1249 } else if (INTEL_GEN(i915) >= 6) {
1250 struct intel_engine_cs *engine;
1251 enum intel_engine_id id;
1252
1253 for_each_engine(engine, i915, id) {
1254 rmw_clear(uncore,
1255 RING_FAULT_REG(engine), RING_FAULT_VALID);
1256 intel_uncore_posting_read(uncore,
1257 RING_FAULT_REG(engine));
1258 }
1259 }
1260 }
1261
1262 /**
1263 * i915_handle_error - handle a gpu error
1264 * @i915: i915 device private
1265 * @engine_mask: mask representing engines that are hung
1266 * @flags: control flags
1267 * @fmt: Error message format string
1268 *
1269 * Do some basic checking of register state at error time and
1270 * dump it to the syslog. Also call i915_capture_error_state() to make
1271 * sure we get a record and make it available in debugfs. Fire a uevent
1272 * so userspace knows something bad happened (should trigger collection
1273 * of a ring dump etc.).
1274 */
1275 void i915_handle_error(struct drm_i915_private *i915,
1276 intel_engine_mask_t engine_mask,
1277 unsigned long flags,
1278 const char *fmt, ...)
1279 {
1280 struct i915_gpu_error *error = &i915->gpu_error;
1281 struct intel_engine_cs *engine;
1282 intel_wakeref_t wakeref;
1283 intel_engine_mask_t tmp;
1284 char error_msg[80];
1285 char *msg = NULL;
1286
1287 if (fmt) {
1288 va_list args;
1289
1290 va_start(args, fmt);
1291 vscnprintf(error_msg, sizeof(error_msg), fmt, args);
1292 va_end(args);
1293
1294 msg = error_msg;
1295 }
1296
1297 /*
1298 * In most cases it's guaranteed that we get here with an RPM
1299 * reference held, for example because there is a pending GPU
1300 * request that won't finish until the reset is done. This
1301 * isn't the case at least when we get here by doing a
1302 * simulated reset via debugfs, so get an RPM reference.
1303 */
1304 wakeref = intel_runtime_pm_get(i915);
1305
1306 engine_mask &= INTEL_INFO(i915)->engine_mask;
1307
1308 if (flags & I915_ERROR_CAPTURE) {
1309 i915_capture_error_state(i915, engine_mask, msg);
1310 i915_clear_error_registers(i915);
1311 }
1312
1313 /*
1314 * Try engine reset when available. We fall back to full reset if
1315 * single reset fails.
1316 */
1317 if (intel_has_reset_engine(i915) && !__i915_wedged(error)) {
1318 for_each_engine_masked(engine, i915, engine_mask, tmp) {
1319 BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
1320 if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1321 &error->flags))
1322 continue;
1323
1324 if (i915_reset_engine(engine, msg) == 0)
1325 engine_mask &= ~engine->mask;
1326
1327 clear_bit(I915_RESET_ENGINE + engine->id,
1328 &error->flags);
1329 wake_up_bit(&error->flags,
1330 I915_RESET_ENGINE + engine->id);
1331 }
1332 }
1333
1334 if (!engine_mask)
1335 goto out;
1336
1337 /* Full reset needs the mutex, stop any other user trying to do so. */
1338 if (test_and_set_bit(I915_RESET_BACKOFF, &error->flags)) {
1339 wait_event(error->reset_queue,
1340 !test_bit(I915_RESET_BACKOFF, &error->flags));
1341 goto out; /* piggy-back on the other reset */
1342 }
1343
1344 /* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */
1345 synchronize_rcu_expedited();
1346
1347 /* Prevent any other reset-engine attempt. */
1348 for_each_engine(engine, i915, tmp) {
1349 while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1350 &error->flags))
1351 wait_on_bit(&error->flags,
1352 I915_RESET_ENGINE + engine->id,
1353 TASK_UNINTERRUPTIBLE);
1354 }
1355
1356 i915_reset_device(i915, engine_mask, msg);
1357
1358 for_each_engine(engine, i915, tmp) {
1359 clear_bit(I915_RESET_ENGINE + engine->id,
1360 &error->flags);
1361 }
1362
1363 clear_bit(I915_RESET_BACKOFF, &error->flags);
1364 wake_up_all(&error->reset_queue);
1365
1366 out:
1367 intel_runtime_pm_put(i915, wakeref);
1368 }
1369
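/*
 * Typical usage of the reset-backoff SRCU (sketch):
 *
 *	srcu = i915_reset_trylock(i915);
 *	if (srcu < 0)
 *		return srcu;
 *	... access state that must not race a full GPU reset ...
 *	i915_reset_unlock(i915, srcu);
 */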
1370 int i915_reset_trylock(struct drm_i915_private *i915)
1371 {
1372 struct i915_gpu_error *error = &i915->gpu_error;
1373 int srcu;
1374
1375 might_lock(&error->reset_backoff_srcu);
1376 might_sleep();
1377
1378 rcu_read_lock();
1379 while (test_bit(I915_RESET_BACKOFF, &error->flags)) {
1380 rcu_read_unlock();
1381
1382 if (wait_event_interruptible(error->reset_queue,
1383 !test_bit(I915_RESET_BACKOFF,
1384 &error->flags)))
1385 return -EINTR;
1386
1387 rcu_read_lock();
1388 }
1389 srcu = srcu_read_lock(&error->reset_backoff_srcu);
1390 rcu_read_unlock();
1391
1392 return srcu;
1393 }
1394
1395 void i915_reset_unlock(struct drm_i915_private *i915, int tag)
1396 __releases(&i915->gpu_error.reset_backoff_srcu)
1397 {
1398 struct i915_gpu_error *error = &i915->gpu_error;
1399
1400 srcu_read_unlock(&error->reset_backoff_srcu, tag);
1401 }
1402
1403 int i915_terminally_wedged(struct drm_i915_private *i915)
1404 {
1405 struct i915_gpu_error *error = &i915->gpu_error;
1406
1407 might_sleep();
1408
1409 if (!__i915_wedged(error))
1410 return 0;
1411
1412 /* Reset still in progress? Maybe we will recover? */
1413 if (!test_bit(I915_RESET_BACKOFF, &error->flags))
1414 return -EIO;
1415
1416 /* XXX intel_reset_finish() still takes struct_mutex!!! */
1417 if (mutex_is_locked(&i915->drm.struct_mutex))
1418 return -EAGAIN;
1419
1420 if (wait_event_interruptible(error->reset_queue,
1421 !test_bit(I915_RESET_BACKOFF,
1422 &error->flags)))
1423 return -EINTR;
1424
1425 return __i915_wedged(error) ? -EIO : 0;
1426 }
1427
1428 bool i915_reset_flush(struct drm_i915_private *i915)
1429 {
1430 int err;
1431
1432 cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
1433
1434 flush_workqueue(i915->wq);
1435 GEM_BUG_ON(READ_ONCE(i915->gpu_error.restart));
1436
1437 mutex_lock(&i915->drm.struct_mutex);
1438 err = i915_gem_wait_for_idle(i915,
1439 I915_WAIT_LOCKED |
1440 I915_WAIT_FOR_IDLE_BOOST,
1441 MAX_SCHEDULE_TIMEOUT);
1442 mutex_unlock(&i915->drm.struct_mutex);
1443
1444 return !err;
1445 }
1446
1447 static void i915_wedge_me(struct work_struct *work)
1448 {
1449 struct i915_wedge_me *w = container_of(work, typeof(*w), work.work);
1450
1451 dev_err(w->i915->drm.dev,
1452 "%s timed out, cancelling all in-flight rendering.\n",
1453 w->name);
1454 i915_gem_set_wedged(w->i915);
1455 }
1456
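/*
 * Arm a watchdog (normally via the i915_wedge_on_timeout() macro) that
 * declares the GPU wedged if the protected section has not completed
 * within @timeout; __i915_fini_wedge() disarms it again.
 */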
1457 void __i915_init_wedge(struct i915_wedge_me *w,
1458 struct drm_i915_private *i915,
1459 long timeout,
1460 const char *name)
1461 {
1462 w->i915 = i915;
1463 w->name = name;
1464
1465 INIT_DELAYED_WORK_ONSTACK(&w->work, i915_wedge_me);
1466 schedule_delayed_work(&w->work, timeout);
1467 }
1468
1469 void __i915_fini_wedge(struct i915_wedge_me *w)
1470 {
1471 cancel_delayed_work_sync(&w->work);
1472 destroy_delayed_work_on_stack(&w->work);
1473 w->i915 = NULL;
1474 }