/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2008-2018 Intel Corporation
 */

#include <linux/sched/mm.h>
#include <linux/stop_machine.h>

#include "i915_drv.h"
#include "i915_gpu_error.h"
#include "i915_reset.h"

#include "intel_guc.h"

#define RESET_MAX_RETRIES 3

/* XXX How to handle concurrent GGTT updates using tiling registers? */
#define RESET_UNDER_STOP_MACHINE 0

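/*
 * Small read-modify-write helpers: the _fw variants assume the caller
 * already holds the required forcewake, while the plain variants go
 * through intel_uncore_rmw() which manages forcewake itself.
 */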
static void rmw_set(struct intel_uncore *uncore, i915_reg_t reg, u32 set)
{
	intel_uncore_rmw(uncore, reg, 0, set);
}

static void rmw_clear(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
{
	intel_uncore_rmw(uncore, reg, clr, 0);
}

static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set)
{
	intel_uncore_rmw_fw(uncore, reg, 0, set);
}

static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
{
	intel_uncore_rmw_fw(uncore, reg, clr, 0);
}

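/*
 * Mark every later request of the hung context on this engine's timeline
 * as skipped (-EIO), continuing from the request that follows @rq.
 */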
static void engine_skip_context(struct i915_request *rq)
{
	struct intel_engine_cs *engine = rq->engine;
	struct i915_gem_context *hung_ctx = rq->gem_context;

	lockdep_assert_held(&engine->timeline.lock);

	if (!i915_request_is_active(rq))
		return;

	list_for_each_entry_continue(rq, &engine->timeline.requests, link)
		if (rq->gem_context == hung_ctx)
			i915_request_skip(rq, -EIO);
}

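/*
 * Accumulate a ban score against the client (file) that owns the hung
 * context; hangs in rapid succession score higher.
 */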
static void client_mark_guilty(struct drm_i915_file_private *file_priv,
			       const struct i915_gem_context *ctx)
{
	unsigned int score;
	unsigned long prev_hang;

	if (i915_gem_context_is_banned(ctx))
		score = I915_CLIENT_SCORE_CONTEXT_BAN;
	else
		score = 0;

	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
		score += I915_CLIENT_SCORE_HANG_FAST;

	if (score) {
		atomic_add(score, &file_priv->ban_score);

		DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
				 ctx->name, score,
				 atomic_read(&file_priv->ban_score));
	}
}

static bool context_mark_guilty(struct i915_gem_context *ctx)
{
	unsigned long prev_hang;
	bool banned;
	int i;

	atomic_inc(&ctx->guilty_count);

	/* Cool contexts are too cool to be banned! (Used for reset testing.) */
	if (!i915_gem_context_is_bannable(ctx))
		return false;

	/* Record the timestamp for the last N hangs */
	prev_hang = ctx->hang_timestamp[0];
	for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
		ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
	ctx->hang_timestamp[i] = jiffies;

	/* If we have hung N+1 times in rapid succession, we ban the context! */
	banned = !i915_gem_context_is_recoverable(ctx);
	if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
		banned = true;
	if (banned) {
		DRM_DEBUG_DRIVER("context %s: guilty %d, banned\n",
				 ctx->name, atomic_read(&ctx->guilty_count));
		i915_gem_context_set_banned(ctx);
	}

	if (!IS_ERR_OR_NULL(ctx->file_priv))
		client_mark_guilty(ctx->file_priv, ctx);

	return banned;
}

static void context_mark_innocent(struct i915_gem_context *ctx)
{
	atomic_inc(&ctx->active_count);
}

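/*
 * Called under the engine timeline lock while resetting: a guilty request
 * is cancelled with -EIO (and the rest of its context skipped), while an
 * innocent one is flagged -EAGAIN so that it may be replayed after the
 * reset.
 */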
void i915_reset_request(struct i915_request *rq, bool guilty)
{
	GEM_TRACE("%s rq=%llx:%lld, guilty? %s\n",
		  rq->engine->name, rq->fence.context, rq->fence.seqno,
		  yesno(guilty));

	lockdep_assert_held(&rq->engine->timeline.lock);
	GEM_BUG_ON(i915_request_completed(rq));

	if (guilty) {
		i915_request_skip(rq, -EIO);
		if (context_mark_guilty(rq->gem_context))
			engine_skip_context(rq);
	} else {
		dma_fence_set_error(&rq->fence, -EAGAIN);
		context_mark_innocent(rq->gem_context);
	}
}

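/*
 * Quiesce a single engine before reset on gen3+: stop the command
 * streamer, clear RING_HEAD/RING_TAIL, disable the ring via RING_CTL and
 * double-check that the ring really is parked.
 */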
static void gen3_stop_engine(struct intel_engine_cs *engine)
{
	struct intel_uncore *uncore = engine->uncore;
	const u32 base = engine->mmio_base;

	GEM_TRACE("%s\n", engine->name);

	if (intel_engine_stop_cs(engine))
		GEM_TRACE("%s: timed out on STOP_RING\n", engine->name);

	intel_uncore_write_fw(uncore,
			      RING_HEAD(base),
			      intel_uncore_read_fw(uncore, RING_TAIL(base)));
	intel_uncore_posting_read_fw(uncore, RING_HEAD(base)); /* paranoia */

	intel_uncore_write_fw(uncore, RING_HEAD(base), 0);
	intel_uncore_write_fw(uncore, RING_TAIL(base), 0);
	intel_uncore_posting_read_fw(uncore, RING_TAIL(base));

	/* The ring must be empty before it is disabled */
	intel_uncore_write_fw(uncore, RING_CTL(base), 0);

	/* Check acts as a post */
	if (intel_uncore_read_fw(uncore, RING_HEAD(base)))
		GEM_TRACE("%s: ring head [%x] not parked\n",
			  engine->name,
			  intel_uncore_read_fw(uncore, RING_HEAD(base)));
}

static void i915_stop_engines(struct drm_i915_private *i915,
			      intel_engine_mask_t engine_mask)
{
	struct intel_engine_cs *engine;
	intel_engine_mask_t tmp;

	if (INTEL_GEN(i915) < 3)
		return;

	for_each_engine_masked(engine, i915, engine_mask, tmp)
		gen3_stop_engine(engine);
}

static bool i915_in_reset(struct pci_dev *pdev)
{
	u8 gdrst;

	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
	return gdrst & GRDOM_RESET_STATUS;
}

static int i915_do_reset(struct drm_i915_private *i915,
			 intel_engine_mask_t engine_mask,
			 unsigned int retry)
{
	struct pci_dev *pdev = i915->drm.pdev;
	int err;

	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	udelay(50);
	err = wait_for_atomic(i915_in_reset(pdev), 50);

	/* Clear the reset request. */
	pci_write_config_byte(pdev, I915_GDRST, 0);
	udelay(50);
	if (!err)
		err = wait_for_atomic(!i915_in_reset(pdev), 50);

	return err;
}

static bool g4x_reset_complete(struct pci_dev *pdev)
{
	u8 gdrst;

	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
	return (gdrst & GRDOM_RESET_ENABLE) == 0;
}

static int g33_do_reset(struct drm_i915_private *i915,
			intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = i915->drm.pdev;

	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	return wait_for_atomic(g4x_reset_complete(pdev), 50);
}

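/*
 * g4x resets the media and render domains separately through the GDRST
 * register in PCI config space, with VCP unit clock gating disabled
 * around the sequence (WaVcpClkGateDisableForMediaReset).
 */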
static int g4x_do_reset(struct drm_i915_private *i915,
			intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = i915->drm.pdev;
	struct intel_uncore *uncore = &i915->uncore;
	int ret;

	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
	rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
	intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
		goto out;
	}

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
		goto out;
	}

out:
	pci_write_config_byte(pdev, I915_GDRST, 0);

	rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
	intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);

	return ret;
}

static int ironlake_do_reset(struct drm_i915_private *i915,
			     intel_engine_mask_t engine_mask,
			     unsigned int retry)
{
	struct intel_uncore *uncore = &i915->uncore;
	int ret;

	intel_uncore_write_fw(uncore, ILK_GDSR,
			      ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
	ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
		goto out;
	}

	intel_uncore_write_fw(uncore, ILK_GDSR,
			      ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
	ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
		goto out;
	}

out:
	intel_uncore_write_fw(uncore, ILK_GDSR, 0);
	intel_uncore_posting_read_fw(uncore, ILK_GDSR);
	return ret;
}

/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
static int gen6_hw_domain_reset(struct drm_i915_private *i915,
				u32 hw_domain_mask)
{
	struct intel_uncore *uncore = &i915->uncore;
	int err;

	/*
	 * GEN6_GDRST is not in the gt power well, no need to check
	 * for fifo space for the write or forcewake the chip for
	 * the read
	 */
	intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask);

	/* Wait for the device to ack the reset requests */
	err = __intel_wait_for_register_fw(uncore,
					   GEN6_GDRST, hw_domain_mask, 0,
					   500, 0,
					   NULL);
	if (err)
		DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
				 hw_domain_mask);

	return err;
}

static int gen6_reset_engines(struct drm_i915_private *i915,
			      intel_engine_mask_t engine_mask,
			      unsigned int retry)
{
	struct intel_engine_cs *engine;
	const u32 hw_engine_mask[] = {
		[RCS0]  = GEN6_GRDOM_RENDER,
		[BCS0]  = GEN6_GRDOM_BLT,
		[VCS0]  = GEN6_GRDOM_MEDIA,
		[VCS1]  = GEN8_GRDOM_MEDIA2,
		[VECS0] = GEN6_GRDOM_VECS,
	};
	u32 hw_mask;

	if (engine_mask == ALL_ENGINES) {
		hw_mask = GEN6_GRDOM_FULL;
	} else {
		intel_engine_mask_t tmp;

		hw_mask = 0;
		for_each_engine_masked(engine, i915, engine_mask, tmp) {
			GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
			hw_mask |= hw_engine_mask[engine->id];
		}
	}

	return gen6_hw_domain_reset(i915, hw_mask);
}

static u32 gen11_lock_sfc(struct intel_engine_cs *engine)
{
	struct intel_uncore *uncore = engine->uncore;
	u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access;
	i915_reg_t sfc_forced_lock, sfc_forced_lock_ack;
	u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit;
	i915_reg_t sfc_usage;
	u32 sfc_usage_bit;
	u32 sfc_reset_bit;

	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return 0;

		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;

		sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine);
		sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT;

		sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine);
		sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT;
		sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
		break;

	case VIDEO_ENHANCEMENT_CLASS:
		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;

		sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine);
		sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT;

		sfc_usage = GEN11_VECS_SFC_USAGE(engine);
		sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT;
		sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
		break;

	default:
		return 0;
	}

	/*
	 * Tell the engine that a software reset is going to happen. The engine
	 * will then try to force lock the SFC (if currently locked, it will
	 * remain so until we tell the engine it is safe to unlock; if currently
	 * unlocked, it will ignore this and all new lock requests). If SFC
	 * ends up being locked to the engine we want to reset, we have to reset
	 * it as well (we will unlock it once the reset sequence is completed).
	 */
	rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);

	if (__intel_wait_for_register_fw(uncore,
					 sfc_forced_lock_ack,
					 sfc_forced_lock_ack_bit,
					 sfc_forced_lock_ack_bit,
					 1000, 0, NULL)) {
		DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
		return 0;
	}

	if (intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit)
		return sfc_reset_bit;

	return 0;
}

static void gen11_unlock_sfc(struct intel_engine_cs *engine)
{
	struct intel_uncore *uncore = engine->uncore;
	u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access;
	i915_reg_t sfc_forced_lock;
	u32 sfc_forced_lock_bit;

	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return;

		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
		break;

	case VIDEO_ENHANCEMENT_CLASS:
		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
		break;

	default:
		return;
	}

	rmw_clear_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
}

static int gen11_reset_engines(struct drm_i915_private *i915,
			       intel_engine_mask_t engine_mask,
			       unsigned int retry)
{
	const u32 hw_engine_mask[] = {
		[RCS0]  = GEN11_GRDOM_RENDER,
		[BCS0]  = GEN11_GRDOM_BLT,
		[VCS0]  = GEN11_GRDOM_MEDIA,
		[VCS1]  = GEN11_GRDOM_MEDIA2,
		[VCS2]  = GEN11_GRDOM_MEDIA3,
		[VCS3]  = GEN11_GRDOM_MEDIA4,
		[VECS0] = GEN11_GRDOM_VECS,
		[VECS1] = GEN11_GRDOM_VECS2,
	};
	struct intel_engine_cs *engine;
	intel_engine_mask_t tmp;
	u32 hw_mask;
	int ret;

	if (engine_mask == ALL_ENGINES) {
		hw_mask = GEN11_GRDOM_FULL;
	} else {
		hw_mask = 0;
		for_each_engine_masked(engine, i915, engine_mask, tmp) {
			GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
			hw_mask |= hw_engine_mask[engine->id];
			hw_mask |= gen11_lock_sfc(engine);
		}
	}

	ret = gen6_hw_domain_reset(i915, hw_mask);

	if (engine_mask != ALL_ENGINES)
		for_each_engine_masked(engine, i915, engine_mask, tmp)
			gen11_unlock_sfc(engine);

	return ret;
}

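/*
 * gen8+ engines have a per-engine RESET_CTL handshake: request a reset and
 * wait for the engine to report it is ready (or, for catastrophic errors,
 * wait for the error bit to be cleared) before the reset itself is issued.
 */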
static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
{
	struct intel_uncore *uncore = engine->uncore;
	const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base);
	u32 request, mask, ack;
	int ret;

	ack = intel_uncore_read_fw(uncore, reg);
	if (ack & RESET_CTL_CAT_ERROR) {
		/*
		 * For catastrophic errors, ready-for-reset sequence
		 * needs to be bypassed: HAS#396813
		 */
		request = RESET_CTL_CAT_ERROR;
		mask = RESET_CTL_CAT_ERROR;

		/* Catastrophic errors need to be cleared by HW */
		ack = 0;
	} else if (!(ack & RESET_CTL_READY_TO_RESET)) {
		request = RESET_CTL_REQUEST_RESET;
		mask = RESET_CTL_READY_TO_RESET;
		ack = RESET_CTL_READY_TO_RESET;
	} else {
		return 0;
	}

	intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request));
	ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
					   700, 0, NULL);
	if (ret)
		DRM_ERROR("%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n",
			  engine->name, request,
			  intel_uncore_read_fw(uncore, reg));

	return ret;
}

static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
{
	intel_uncore_write_fw(engine->uncore,
			      RING_RESET_CTL(engine->mmio_base),
			      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
}

static int gen8_reset_engines(struct drm_i915_private *i915,
			      intel_engine_mask_t engine_mask,
			      unsigned int retry)
{
	struct intel_engine_cs *engine;
	const bool reset_non_ready = retry >= 1;
	intel_engine_mask_t tmp;
	int ret;

	for_each_engine_masked(engine, i915, engine_mask, tmp) {
		ret = gen8_engine_reset_prepare(engine);
		if (ret && !reset_non_ready)
			goto skip_reset;

		/*
		 * If this is not the first failed attempt to prepare,
		 * we decide to proceed anyway.
		 *
		 * By doing so we risk context corruption and with
		 * some gens (kbl), possible system hang if reset
		 * happens during active bb execution.
		 *
		 * We rather take context corruption instead of
		 * failed reset with a wedged driver/gpu. And
		 * active bb execution case should be covered by
		 * i915_stop_engines we have before the reset.
		 */
	}

	if (INTEL_GEN(i915) >= 11)
		ret = gen11_reset_engines(i915, engine_mask, retry);
	else
		ret = gen6_reset_engines(i915, engine_mask, retry);

skip_reset:
	for_each_engine_masked(engine, i915, engine_mask, tmp)
		gen8_engine_reset_cancel(engine);

	return ret;
}

typedef int (*reset_func)(struct drm_i915_private *,
			  intel_engine_mask_t engine_mask,
			  unsigned int retry);

static reset_func intel_get_gpu_reset(struct drm_i915_private *i915)
{
	if (INTEL_GEN(i915) >= 8)
		return gen8_reset_engines;
	else if (INTEL_GEN(i915) >= 6)
		return gen6_reset_engines;
	else if (INTEL_GEN(i915) >= 5)
		return ironlake_do_reset;
	else if (IS_G4X(i915))
		return g4x_do_reset;
	else if (IS_G33(i915) || IS_PINEVIEW(i915))
		return g33_do_reset;
	else if (INTEL_GEN(i915) >= 3)
		return i915_do_reset;
	else
		return NULL;
}

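/*
 * Perform the platform-appropriate reset, holding forcewake across the
 * whole sequence and retrying a few times (for ALL_ENGINES) if the
 * hardware does not acknowledge in time.
 */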
int intel_gpu_reset(struct drm_i915_private *i915,
		    intel_engine_mask_t engine_mask)
{
	const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
	reset_func reset;
	int ret = -ETIMEDOUT;
	int retry;

	reset = intel_get_gpu_reset(i915);
	if (!reset)
		return -ENODEV;

	/*
	 * If the power well sleeps during the reset, the reset
	 * request may be dropped and never completes (causing -EIO).
	 */
	intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL);
	for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
		/*
		 * We stop engines, otherwise we might get failed reset and a
		 * dead gpu (on elk). Also as modern gpu as kbl can suffer
		 * from system hang if batchbuffer is progressing when
		 * the reset is issued, regardless of READY_TO_RESET ack.
		 * Thus assume it is best to stop engines on all gens
		 * where we have a gpu reset.
		 *
		 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
		 *
		 * WaMediaResetMainRingCleanup:ctg,elk (presumably)
		 *
		 * FIXME: Wa for more modern gens needs to be validated
		 */
		i915_stop_engines(i915, engine_mask);

		GEM_TRACE("engine_mask=%x\n", engine_mask);

		ret = reset(i915, engine_mask, retry);
	}
	intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL);

	return ret;
}

bool intel_has_gpu_reset(struct drm_i915_private *i915)
{
	if (!i915_modparams.reset)
		return false;

	return intel_get_gpu_reset(i915);
}

bool intel_has_reset_engine(struct drm_i915_private *i915)
{
	return INTEL_INFO(i915)->has_reset_engine && i915_modparams.reset >= 2;
}

int intel_reset_guc(struct drm_i915_private *i915)
{
	u32 guc_domain =
		INTEL_GEN(i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
	int ret;

	GEM_BUG_ON(!HAS_GUC(i915));

	intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL);
	ret = gen6_hw_domain_reset(i915, guc_domain);
	intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL);

	return ret;
}

/*
 * Ensure irq handler finishes, and not run again.
 * Also return the active request so that we only search for it once.
 */
static void reset_prepare_engine(struct intel_engine_cs *engine)
{
	/*
	 * During the reset sequence, we must prevent the engine from
	 * entering RC6. As the context state is undefined until we restart
	 * the engine, if it does enter RC6 during the reset, the state
	 * written to the powercontext is undefined and so we may lose
	 * GPU state upon resume, i.e. fail to restart after a reset.
	 */
	intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
	engine->reset.prepare(engine);
}

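/*
 * Zap all userspace mmaps of fenced GGTT views so that any access after
 * the reset must fault back in through the driver rather than touch the
 * (reset-clobbered) fence registers directly.
 */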
static void revoke_mmaps(struct drm_i915_private *i915)
{
	int i;

	for (i = 0; i < i915->num_fence_regs; i++) {
		struct drm_vma_offset_node *node;
		struct i915_vma *vma;
		u64 vma_offset;

		vma = READ_ONCE(i915->fence_regs[i].vma);
		if (!vma)
			continue;

		if (!i915_vma_has_userfault(vma))
			continue;

		GEM_BUG_ON(vma->fence != &i915->fence_regs[i]);
		node = &vma->obj->base.vma_node;
		vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;
		unmap_mapping_range(i915->drm.anon_inode->i_mapping,
				    drm_vma_node_offset_addr(node) + vma_offset,
				    vma->size,
				    1);
	}
}

static void reset_prepare(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		reset_prepare_engine(engine);

	intel_uc_reset_prepare(i915);
}

static void gt_revoke(struct drm_i915_private *i915)
{
	revoke_mmaps(i915);
}

static int gt_reset(struct drm_i915_private *i915,
		    intel_engine_mask_t stalled_mask)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err;

	/*
	 * Everything depends on having the GTT running, so we need to start
	 * there.
	 */
	err = i915_ggtt_enable_hw(i915);
	if (err)
		return err;

	for_each_engine(engine, i915, id)
		intel_engine_reset(engine, stalled_mask & engine->mask);

	i915_gem_restore_fences(i915);

	return err;
}

static void reset_finish_engine(struct intel_engine_cs *engine)
{
	engine->reset.finish(engine);
	intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
}

struct i915_gpu_restart {
	struct work_struct work;
	struct drm_i915_private *i915;
};

static void restart_work(struct work_struct *work)
{
	struct i915_gpu_restart *arg = container_of(work, typeof(*arg), work);
	struct drm_i915_private *i915 = arg->i915;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	intel_wakeref_t wakeref;

	wakeref = intel_runtime_pm_get(i915);
	mutex_lock(&i915->drm.struct_mutex);
	WRITE_ONCE(i915->gpu_error.restart, NULL);

	for_each_engine(engine, i915, id) {
		struct i915_request *rq;

		/*
		 * Ostensibily, we always want a context loaded for powersaving,
		 * so if the engine is idle after the reset, send a request
		 * to load our scratch kernel_context.
		 */
		if (!intel_engine_is_idle(engine))
			continue;

		rq = i915_request_alloc(engine, i915->kernel_context);
		if (!IS_ERR(rq))
			i915_request_add(rq);
	}

	mutex_unlock(&i915->drm.struct_mutex);
	intel_runtime_pm_put(i915, wakeref);

	kfree(arg);
}

static void reset_finish(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id) {
		reset_finish_engine(engine);
		intel_engine_signal_breadcrumbs(engine);
	}
}

static void reset_restart(struct drm_i915_private *i915)
{
	struct i915_gpu_restart *arg;

	/*
	 * Following the reset, ensure that we always reload context for
	 * powersaving, and to correct engine->last_retired_context. Since
	 * this requires us to submit a request, queue a worker to do that
	 * task for us to evade any locking here.
	 */
	if (READ_ONCE(i915->gpu_error.restart))
		return;

	arg = kmalloc(sizeof(*arg), GFP_KERNEL);
	if (arg) {
		arg->i915 = i915;
		INIT_WORK(&arg->work, restart_work);

		WRITE_ONCE(i915->gpu_error.restart, arg);
		queue_work(i915->wq, &arg->work);
	}
}

static void nop_submit_request(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	GEM_TRACE("%s fence %llx:%lld -> -EIO\n",
		  engine->name, request->fence.context, request->fence.seqno);
	dma_fence_set_error(&request->fence, -EIO);

	spin_lock_irqsave(&engine->timeline.lock, flags);
	__i915_request_submit(request);
	i915_request_mark_complete(request);
	spin_unlock_irqrestore(&engine->timeline.lock, flags);

	intel_engine_queue_breadcrumbs(engine);
}

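/*
 * Declare the GPU wedged: stop the hardware, point every engine's
 * submit_request at nop_submit_request() so new requests complete
 * immediately with -EIO, and cancel whatever was already in flight.
 */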
static void __i915_gem_set_wedged(struct drm_i915_private *i915)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	if (test_bit(I915_WEDGED, &error->flags))
		return;

	if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(i915)) {
		struct drm_printer p = drm_debug_printer(__func__);

		for_each_engine(engine, i915, id)
			intel_engine_dump(engine, &p, "%s\n", engine->name);
	}

	GEM_TRACE("start\n");

	/*
	 * First, stop submission to hw, but do not yet complete requests by
	 * rolling the global seqno forward (since this would complete requests
	 * for which we haven't set the fence error to EIO yet).
	 */
	reset_prepare(i915);

	/* Even if the GPU reset fails, it should still stop the engines */
	if (!INTEL_INFO(i915)->gpu_reset_clobbers_display)
		intel_gpu_reset(i915, ALL_ENGINES);

	for_each_engine(engine, i915, id) {
		engine->submit_request = nop_submit_request;
		engine->schedule = NULL;
	}
	i915->caps.scheduler = 0;

	/*
	 * Make sure no request can slip through without getting completed by
	 * either this call here to intel_engine_write_global_seqno, or the one
	 * in nop_submit_request.
	 */
	synchronize_rcu_expedited();

	/* Mark all executing requests as skipped */
	for_each_engine(engine, i915, id)
		engine->cancel_requests(engine);

	reset_finish(i915);

	smp_mb__before_atomic();
	set_bit(I915_WEDGED, &error->flags);

	GEM_TRACE("end\n");
}

void i915_gem_set_wedged(struct drm_i915_private *i915)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	intel_wakeref_t wakeref;

	mutex_lock(&error->wedge_mutex);
	with_intel_runtime_pm(i915, wakeref)
		__i915_gem_set_wedged(i915);
	mutex_unlock(&error->wedge_mutex);
}

static bool __i915_gem_unset_wedged(struct drm_i915_private *i915)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	struct i915_timeline *tl;

	if (!test_bit(I915_WEDGED, &error->flags))
		return true;

	if (!i915->gt.scratch) /* Never full initialised, recovery impossible */
		return false;

	GEM_TRACE("start\n");

	/*
	 * Before unwedging, make sure that all pending operations
	 * are flushed and errored out - we may have requests waiting upon
	 * third party fences. We marked all inflight requests as EIO, and
	 * every execbuf since returned EIO, for consistency we want all
	 * the currently pending requests to also be marked as EIO, which
	 * is done inside our nop_submit_request - and so we must wait.
	 *
	 * No more can be submitted until we reset the wedged bit.
	 */
	mutex_lock(&i915->gt.timelines.mutex);
	list_for_each_entry(tl, &i915->gt.timelines.active_list, link) {
		struct i915_request *rq;

		rq = i915_active_request_get_unlocked(&tl->last_request);
		if (!rq)
			continue;

		/*
		 * All internal dependencies (i915_requests) will have
		 * been flushed by the set-wedge, but we may be stuck waiting
		 * for external fences. These should all be capped to 10s
		 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
		 * in the worst case.
		 */
		dma_fence_default_wait(&rq->fence, false, MAX_SCHEDULE_TIMEOUT);
		i915_request_put(rq);
	}
	mutex_unlock(&i915->gt.timelines.mutex);

	intel_engines_sanitize(i915, false);

	/*
	 * Undo nop_submit_request. We prevent all new i915 requests from
	 * being queued (by disallowing execbuf whilst wedged) so having
	 * waited for all active requests above, we know the system is idle
	 * and do not have to worry about a thread being inside
	 * engine->submit_request() as we swap over. So unlike installing
	 * the nop_submit_request on reset, we can do this from normal
	 * context and do not require stop_machine().
	 */
	intel_engines_reset_default_submission(i915);

	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
	clear_bit(I915_WEDGED, &i915->gpu_error.flags);

	return true;
}

bool i915_gem_unset_wedged(struct drm_i915_private *i915)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	bool result;

	mutex_lock(&error->wedge_mutex);
	result = __i915_gem_unset_wedged(i915);
	mutex_unlock(&error->wedge_mutex);

	return result;
}

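/*
 * Perform the global reset, retrying with a short back-off if the
 * hardware refuses, and then bring the GTT, engines and fences back up.
 */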
static int do_reset(struct drm_i915_private *i915,
		    intel_engine_mask_t stalled_mask)
{
	int err, i;

	gt_revoke(i915);

	err = intel_gpu_reset(i915, ALL_ENGINES);
	for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
		msleep(10 * (i + 1));
		err = intel_gpu_reset(i915, ALL_ENGINES);
	}
	if (err)
		return err;

	return gt_reset(i915, stalled_mask);
}

/**
 * i915_reset - reset chip after a hang
 * @i915: #drm_i915_private to reset
 * @stalled_mask: mask of the stalled engines with the guilty requests
 * @reason: user error message for why we are resetting
 *
 * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
 * on failure.
 *
 * Procedure is fairly simple:
 *   - reset the chip using the reset reg
 *   - re-init context state
 *   - re-init hardware status page
 *   - re-init ring buffer
 *   - re-init interrupt state
 *   - re-init display
 */
void i915_reset(struct drm_i915_private *i915,
		intel_engine_mask_t stalled_mask,
		const char *reason)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	int ret;

	GEM_TRACE("flags=%lx\n", error->flags);

	assert_rpm_wakelock_held(i915);
	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));

	/* Clear any previous failed attempts at recovery. Time to try again. */
	if (!__i915_gem_unset_wedged(i915))
		return;

	if (reason)
		dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
	error->reset_count++;

	reset_prepare(i915);

	if (!intel_has_gpu_reset(i915)) {
		if (i915_modparams.reset)
			dev_err(i915->drm.dev, "GPU reset not supported\n");
		else
			DRM_DEBUG_DRIVER("GPU reset disabled\n");
		goto error;
	}

	if (INTEL_INFO(i915)->gpu_reset_clobbers_display)
		intel_runtime_pm_disable_interrupts(i915);

	if (do_reset(i915, stalled_mask)) {
		dev_err(i915->drm.dev, "Failed to reset chip\n");
		goto taint;
	}

	if (INTEL_INFO(i915)->gpu_reset_clobbers_display)
		intel_runtime_pm_enable_interrupts(i915);

	intel_overlay_reset(i915);

	/*
	 * Next we need to restore the context, but we don't use those
	 * yet either...
	 *
	 * Ring buffer needs to be re-initialized in the KMS case, or if X
	 * was running at the time of the reset (i.e. we weren't VT
	 * switched away).
	 */
	ret = i915_gem_init_hw(i915);
	if (ret) {
		DRM_ERROR("Failed to initialise HW following reset (%d)\n",
			  ret);
		goto error;
	}

	i915_queue_hangcheck(i915);

finish:
	reset_finish(i915);
	if (!__i915_wedged(error))
		reset_restart(i915);
	return;

taint:
	/*
	 * History tells us that if we cannot reset the GPU now, we
	 * never will. This then impacts everything that is run
	 * subsequently. On failing the reset, we mark the driver
	 * as wedged, preventing further execution on the GPU.
	 * We also want to go one step further and add a taint to the
	 * kernel so that any subsequent faults can be traced back to
	 * this failure. This is important for CI, where if the
	 * GPU/driver fails we would like to reboot and restart testing
	 * rather than continue on into oblivion. For everyone else,
	 * the system should still plod along, but they have been warned!
	 */
	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
error:
	__i915_gem_set_wedged(i915);
	goto finish;
}

static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
					struct intel_engine_cs *engine)
{
	return intel_gpu_reset(i915, engine->mask);
}

/**
 * i915_reset_engine - reset GPU engine to recover from a hang
 * @engine: engine to reset
 * @msg: reason for GPU reset; or NULL for no dev_notice()
 *
 * Reset a specific GPU engine. Useful if a hang is detected.
 * Returns zero on successful reset or otherwise an error code.
 *
 * Procedure is:
 *  - identifies the request that caused the hang and it is dropped
 *  - reset engine (which will force the engine to idle)
 *  - re-init/configure engine
 */
int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
{
	struct i915_gpu_error *error = &engine->i915->gpu_error;
	int ret;

	GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));

	reset_prepare_engine(engine);

	if (msg)
		dev_notice(engine->i915->drm.dev,
			   "Resetting %s for %s\n", engine->name, msg);
	error->reset_engine_count[engine->id]++;

	if (!engine->i915->guc.execbuf_client)
		ret = intel_gt_reset_engine(engine->i915, engine);
	else
		ret = intel_guc_reset_engine(&engine->i915->guc, engine);
	if (ret) {
		/* If we fail here, we expect to fallback to a global reset */
		DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
				 engine->i915->guc.execbuf_client ? "GuC " : "",
				 engine->name, ret);
		goto out;
	}

	/*
	 * The request that caused the hang is stuck on elsp, we know the
	 * active request and can drop it, adjust head to skip the offending
	 * request to resume executing remaining requests in the queue.
	 */
	intel_engine_reset(engine, true);

	/*
	 * The engine and its registers (and workarounds in case of render)
	 * have been reset to their default values. Follow the init_ring
	 * process to program RING_MODE, HWSP and re-enable submission.
	 */
	ret = engine->init_hw(engine);
	if (ret)
		goto out;

out:
	intel_engine_cancel_stop_cs(engine);
	reset_finish_engine(engine);
	return ret;
}

static void i915_reset_device(struct drm_i915_private *i915,
			      intel_engine_mask_t engine_mask,
			      const char *reason)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	struct kobject *kobj = &i915->drm.primary->kdev->kobj;
	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
	struct i915_wedge_me w;

	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);

	DRM_DEBUG_DRIVER("resetting chip\n");
	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);

	/* Use a watchdog to ensure that our reset completes */
	i915_wedge_on_timeout(&w, i915, 5 * HZ) {
		intel_prepare_reset(i915);

		/* Flush everyone using a resource about to be clobbered */
		synchronize_srcu_expedited(&error->reset_backoff_srcu);

		mutex_lock(&error->wedge_mutex);
		i915_reset(i915, engine_mask, reason);
		mutex_unlock(&error->wedge_mutex);

		intel_finish_reset(i915);
	}

	if (!test_bit(I915_WEDGED, &error->flags))
		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
}

static void clear_register(struct intel_uncore *uncore, i915_reg_t reg)
{
	intel_uncore_rmw(uncore, reg, 0, 0);
}

void i915_clear_error_registers(struct drm_i915_private *i915)
{
	struct intel_uncore *uncore = &i915->uncore;
	u32 eir;

	if (!IS_GEN(i915, 2))
		clear_register(uncore, PGTBL_ER);

	if (INTEL_GEN(i915) < 4)
		clear_register(uncore, IPEIR(RENDER_RING_BASE));
	else
		clear_register(uncore, IPEIR_I965);

	clear_register(uncore, EIR);
	eir = intel_uncore_read(uncore, EIR);
	if (eir) {
		/*
		 * some errors might have become stuck,
		 * mask them.
		 */
		DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir);
		rmw_set(uncore, EMR, eir);
		intel_uncore_write(uncore, GEN2_IIR,
				   I915_MASTER_ERROR_INTERRUPT);
	}

	if (INTEL_GEN(i915) >= 8) {
		rmw_clear(uncore, GEN8_RING_FAULT_REG, RING_FAULT_VALID);
		intel_uncore_posting_read(uncore, GEN8_RING_FAULT_REG);
	} else if (INTEL_GEN(i915) >= 6) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, i915, id) {
			rmw_clear(uncore,
				  RING_FAULT_REG(engine), RING_FAULT_VALID);
			intel_uncore_posting_read(uncore,
						  RING_FAULT_REG(engine));
		}
	}
}

/**
 * i915_handle_error - handle a gpu error
 * @i915: i915 device private
 * @engine_mask: mask representing engines that are hung
 * @flags: control flags
 * @fmt: Error message format string
 *
 * Do some basic checking of register state at error time and
 * dump it to the syslog.  Also call i915_capture_error_state() to make
 * sure we get a record and make it available in debugfs. Fire a uevent
 * so userspace knows something bad happened (should trigger collection
 * of a ring dump etc.).
 */
void i915_handle_error(struct drm_i915_private *i915,
		       intel_engine_mask_t engine_mask,
		       unsigned long flags,
		       const char *fmt, ...)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	struct intel_engine_cs *engine;
	intel_wakeref_t wakeref;
	intel_engine_mask_t tmp;
	char error_msg[80];
	char *msg = NULL;

	if (fmt) {
		va_list args;

		va_start(args, fmt);
		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
		va_end(args);

		msg = error_msg;
	}

	/*
	 * In most cases it's guaranteed that we get here with an RPM
	 * reference held, for example because there is a pending GPU
	 * request that won't finish until the reset is done. This
	 * isn't the case at least when we get here by doing a
	 * simulated reset via debugfs, so get an RPM reference.
	 */
	wakeref = intel_runtime_pm_get(i915);

	engine_mask &= INTEL_INFO(i915)->engine_mask;

	if (flags & I915_ERROR_CAPTURE) {
		i915_capture_error_state(i915, engine_mask, msg);
		i915_clear_error_registers(i915);
	}

	/*
	 * Try engine reset when available. We fall back to full reset if
	 * single reset fails.
	 */
	if (intel_has_reset_engine(i915) && !__i915_wedged(error)) {
		for_each_engine_masked(engine, i915, engine_mask, tmp) {
			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					     &error->flags))
				continue;

			if (i915_reset_engine(engine, msg) == 0)
				engine_mask &= ~engine->mask;

			clear_bit(I915_RESET_ENGINE + engine->id,
				  &error->flags);
			wake_up_bit(&error->flags,
				    I915_RESET_ENGINE + engine->id);
		}
	}

	if (!engine_mask)
		goto out;

	/* Full reset needs the mutex, stop any other user trying to do so. */
	if (test_and_set_bit(I915_RESET_BACKOFF, &error->flags)) {
		wait_event(error->reset_queue,
			   !test_bit(I915_RESET_BACKOFF, &error->flags));
		goto out; /* piggy-back on the other reset */
	}

	/* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */
	synchronize_rcu_expedited();

	/* Prevent any other reset-engine attempt. */
	for_each_engine(engine, i915, tmp) {
		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					&error->flags))
			wait_on_bit(&error->flags,
				    I915_RESET_ENGINE + engine->id,
				    TASK_UNINTERRUPTIBLE);
	}

	i915_reset_device(i915, engine_mask, msg);

	for_each_engine(engine, i915, tmp) {
		clear_bit(I915_RESET_ENGINE + engine->id,
			  &error->flags);
	}

	clear_bit(I915_RESET_BACKOFF, &error->flags);
	wake_up_all(&error->reset_queue);

out:
	intel_runtime_pm_put(i915, wakeref);
}

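/*
 * SRCU guard for reset: readers take reset_backoff_srcu (waiting out any
 * reset already marked by I915_RESET_BACKOFF), and the reset path uses
 * synchronize_srcu_expedited() to flush them before clobbering the HW.
 */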
int i915_reset_trylock(struct drm_i915_private *i915)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	int srcu;

	might_lock(&error->reset_backoff_srcu);

	while (test_bit(I915_RESET_BACKOFF, &error->flags)) {
		if (wait_event_interruptible(error->reset_queue,
					     !test_bit(I915_RESET_BACKOFF,
						       &error->flags)))
			return -EINTR;
	}

	srcu = srcu_read_lock(&error->reset_backoff_srcu);

	return srcu;
}

void i915_reset_unlock(struct drm_i915_private *i915, int tag)
__releases(&i915->gpu_error.reset_backoff_srcu)
{
	struct i915_gpu_error *error = &i915->gpu_error;

	srcu_read_unlock(&error->reset_backoff_srcu, tag);
}

int i915_terminally_wedged(struct drm_i915_private *i915)
{
	struct i915_gpu_error *error = &i915->gpu_error;

	if (!__i915_wedged(error))
		return 0;

	/* Reset still in progress? Maybe we will recover? */
	if (!test_bit(I915_RESET_BACKOFF, &error->flags))
		return -EIO;

	/* XXX intel_reset_finish() still takes struct_mutex!!! */
	if (mutex_is_locked(&i915->drm.struct_mutex))
		return -EAGAIN;

	if (wait_event_interruptible(error->reset_queue,
				     !test_bit(I915_RESET_BACKOFF,
					       &error->flags)))
		return -EINTR;

	return __i915_wedged(error) ? -EIO : 0;
}

bool i915_reset_flush(struct drm_i915_private *i915)
{
	int err;

	cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);

	flush_workqueue(i915->wq);
	GEM_BUG_ON(READ_ONCE(i915->gpu_error.restart));

	mutex_lock(&i915->drm.struct_mutex);
	err = i915_gem_wait_for_idle(i915,
				     I915_WAIT_LOCKED |
				     I915_WAIT_FOR_IDLE_BOOST,
				     MAX_SCHEDULE_TIMEOUT);
	mutex_unlock(&i915->drm.struct_mutex);

	return !err;
}

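/*
 * i915_wedge_me: a small on-stack watchdog; if the guarded section does
 * not call __i915_fini_wedge() before the timeout, the delayed work fires
 * and declares the device wedged.
 */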
static void i915_wedge_me(struct work_struct *work)
{
	struct i915_wedge_me *w = container_of(work, typeof(*w), work.work);

	dev_err(w->i915->drm.dev,
		"%s timed out, cancelling all in-flight rendering.\n",
		w->name);
	i915_gem_set_wedged(w->i915);
}

void __i915_init_wedge(struct i915_wedge_me *w,
		       struct drm_i915_private *i915,
		       long timeout,
		       const char *name)
{
	w->i915 = i915;
	w->name = name;

	INIT_DELAYED_WORK_ONSTACK(&w->work, i915_wedge_me);
	schedule_delayed_work(&w->work, timeout);
}

void __i915_fini_wedge(struct i915_wedge_me *w)
{
	cancel_delayed_work_sync(&w->work);
	destroy_delayed_work_on_stack(&w->work);
	w->i915 = NULL;
}