p = drm_coredump_printer(&iter);
drm_puts(&p, "**** Xe Device Coredump ****\n");
+ drm_printf(&p, "Reason: %s\n", ss->reason);
drm_puts(&p, "kernel: " UTS_RELEASE "\n");
drm_puts(&p, "module: " KBUILD_MODNAME "\n");
drm_printf(&p, "Snapshot time: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
ts = ktime_to_timespec64(ss->boot_time);
drm_printf(&p, "Uptime: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
- drm_printf(&p, "Process: %s\n", ss->process_name);
+ drm_printf(&p, "Process: %s [%d]\n", ss->process_name, ss->pid);
xe_device_snapshot_print(xe, &p);
drm_printf(&p, "\n**** GT #%d ****\n", ss->gt->info.id);
{
int i;
+ kfree(ss->reason);
+ ss->reason = NULL;
+
xe_guc_log_snapshot_free(ss->guc.log);
ss->guc.log = NULL;
ss->snapshot_time = ktime_get_real();
ss->boot_time = ktime_get_boottime();
- if (q->vm && q->vm->xef)
+ if (q->vm && q->vm->xef) {
process_name = q->vm->xef->process_name;
+ ss->pid = q->vm->xef->pid;
+ }
+
strscpy(ss->process_name, process_name);
ss->gt = q->gt;
* xe_devcoredump - Take the required snapshots and initialize coredump device.
* @q: The faulty xe_exec_queue, where the issue was detected.
* @job: The faulty xe_sched_job, where the issue was detected.
+ * @fmt: Printf format + args to describe the reason for the core dump
*
* This function should be called at the crash time within the serialized
* gt_reset. It is skipped if we still have the core dump device available
* with the information of the 'first' snapshot.
*/
-void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job)
+__printf(3, 4)
+void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job, const char *fmt, ...)
{
struct xe_device *xe = gt_to_xe(q->gt);
struct xe_devcoredump *coredump = &xe->devcoredump;
+ va_list varg;
if (coredump->captured) {
drm_dbg(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n");
}
coredump->captured = true;
+
+ va_start(varg, fmt);
+ coredump->snapshot.reason = kvasprintf(GFP_ATOMIC, fmt, varg);
+ va_end(varg);
+
devcoredump_snapshot(coredump, q, job);
drm_info(&xe->drm, "Xe device coredump has been created\n");
if (!ret) {
xe_gt_warn(q->gt, "Schedule disable failed to respond, guc_id=%d\n",
q->guc->id);
- xe_devcoredump(q, NULL);
+ xe_devcoredump(q, NULL, "Schedule disable failed to respond, guc_id=%d\n",
+ q->guc->id);
xe_sched_submission_start(sched);
xe_gt_reset_async(q->gt);
return;
}
if (!exec_queue_killed(q) && !xe_lrc_ring_is_idle(q->lrc[0]))
- xe_devcoredump(q, NULL);
+ xe_devcoredump(q, NULL, "LR job cleanup, guc_id=%d", q->guc->id);
xe_sched_submission_start(sched);
}
xe_gt_warn(guc_to_gt(guc),
"Schedule disable failed to respond, guc_id=%d",
q->guc->id);
- xe_devcoredump(q, job);
+ xe_devcoredump(q, job,
+ "Schedule disable failed to respond, guc_id=%d, ret=%d, guc_read=%d",
+ q->guc->id, ret, xe_guc_read_stopped(guc));
set_exec_queue_extra_ref(q);
xe_exec_queue_get(q); /* GT reset owns this */
set_exec_queue_banned(q);
trace_xe_sched_job_timedout(job);
if (!exec_queue_killed(q))
- xe_devcoredump(q, job);
+ xe_devcoredump(q, job,
+ "Timedout job - seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx",
+ xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
+ q->guc->id, q->flags);
/*
* Kernel jobs should never fail, nor should VM jobs if they do