git.ipfire.org Git - thirdparty/linux.git/commitdiff
drm: Create a task info option for wedge events
author: André Almeida <andrealmeid@igalia.com>
Tue, 17 Jun 2025 12:49:46 +0000 (09:49 -0300)
committer: André Almeida <andrealmeid@igalia.com>
Tue, 17 Jun 2025 14:32:47 +0000 (11:32 -0300)
When a device gets wedged, it might be caused by a guilty application.
For userspace, knowing which task was involved can be useful for some
situations, like for implementing a policy, logs or for giving a chance
for the compositor to let the user know what task was involved in the
problem.  This is an optional argument: when the task info is not
available, the PID and TASK strings won't appear in the event string.

Sometimes just the PID isn't enough, given that the task might already be
dead by the time userspace tries to check what this PID's name was,
so to make life easier also report the task's name in the user
event.

Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Krzysztof Karas <krzysztof.karas@intel.com>
Reviewed-by: Raag Jadav <raag.jadav@intel.com>
Acked-by: Christian König <christian.koenig@amd.com>
Link: https://lore.kernel.org/r/20250617124949.2151549-4-andrealmeid@igalia.com
Signed-off-by: André Almeida <andrealmeid@igalia.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
drivers/gpu/drm/drm_drv.c
drivers/gpu/drm/i915/gt/intel_reset.c
drivers/gpu/drm/xe/xe_device.c
include/drm/drm_device.h
include/drm/drm_drv.h

index e1bab6a96cb67df049d52ca6601fea565783c27c..8a0f36f33f137c012aef6f6ceb2591c01327ce2a 100644 (file)
@@ -6364,7 +6364,7 @@ end_reset:
        atomic_set(&adev->reset_domain->reset_res, r);
 
        if (!r)
-               drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
+               drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
 
        return r;
 }
index 3d887428ca2b5d25a67460843854131ba49cc82e..0c1381b527fea16a45222ba43f8329006ed1d4a7 100644 (file)
@@ -164,7 +164,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
                        if (amdgpu_ring_sched_ready(ring))
                                drm_sched_start(&ring->sched, 0);
                        dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name);
-                       drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
+                       drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
                        goto exit;
                }
                dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name);
index bb291c4ac4d5720b78e10cba081061c56e3c6e31..02556363e918c291a5085e6dcba5e6d40c9d3cb1 100644 (file)
@@ -35,6 +35,7 @@
 #include <linux/moduleparam.h>
 #include <linux/mount.h>
 #include <linux/pseudo_fs.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/sprintf.h>
 #include <linux/srcu.h>
@@ -539,10 +540,15 @@ static const char *drm_get_wedge_recovery(unsigned int opt)
        }
 }
 
+#define WEDGE_STR_LEN  32
+#define PID_STR_LEN    15
+#define COMM_STR_LEN   (TASK_COMM_LEN + 5)
+
 /**
  * drm_dev_wedged_event - generate a device wedged uevent
  * @dev: DRM device
  * @method: method(s) to be used for recovery
+ * @info: optional information about the guilty task
  *
  * This generates a device wedged uevent for the DRM device specified by @dev.
  * Recovery @method\(s) of choice will be sent in the uevent environment as
@@ -555,13 +561,13 @@ static const char *drm_get_wedge_recovery(unsigned int opt)
  *
  * Returns: 0 on success, negative error code otherwise.
  */
-int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
+int drm_dev_wedged_event(struct drm_device *dev, unsigned long method,
+                        struct drm_wedge_task_info *info)
 {
+       char event_string[WEDGE_STR_LEN], pid_string[PID_STR_LEN], comm_string[COMM_STR_LEN];
+       char *envp[] = { event_string, NULL, NULL, NULL };
        const char *recovery = NULL;
        unsigned int len, opt;
-       /* Event string length up to 28+ characters with available methods */
-       char event_string[32];
-       char *envp[] = { event_string, NULL };
 
        len = scnprintf(event_string, sizeof(event_string), "%s", "WEDGED=");
 
@@ -583,6 +589,13 @@ int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
        drm_info(dev, "device wedged, %s\n", method == DRM_WEDGE_RECOVERY_NONE ?
                 "but recovered through reset" : "needs recovery");
 
+       if (info && (info->comm[0] != '\0') && (info->pid >= 0)) {
+               snprintf(pid_string, sizeof(pid_string), "PID=%u", info->pid);
+               snprintf(comm_string, sizeof(comm_string), "TASK=%s", info->comm);
+               envp[1] = pid_string;
+               envp[2] = comm_string;
+       }
+
        return kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
 }
 EXPORT_SYMBOL(drm_dev_wedged_event);
index dbdcfe130ad41fd465ce28a0a31d761f69a5e7a3..ba1d8fdc3c7b57f432ab8bf788c79429447b8a89 100644 (file)
@@ -1448,7 +1448,8 @@ static void intel_gt_reset_global(struct intel_gt *gt,
                kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
        else
                drm_dev_wedged_event(&gt->i915->drm,
-                                    DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET);
+                                    DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET,
+                                    NULL);
 }
 
 /**
index c02c4c4e941286a13384b3a1d71649911c49a27a..f329613e061f20a00f79ee8f0f0ee38fbe816d1f 100644 (file)
@@ -1168,7 +1168,8 @@ void xe_device_declare_wedged(struct xe_device *xe)
 
                /* Notify userspace of wedged device */
                drm_dev_wedged_event(&xe->drm,
-                                    DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET);
+                                    DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET,
+                                    NULL);
        }
 
        for_each_gt(gt, xe, id)
index e2f894f1b90a7b86ef5755966334a80b0f44361b..08b3b2467c4c1515d0740ba933856bcd8183c126 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/kref.h>
 #include <linux/mutex.h>
 #include <linux/idr.h>
+#include <linux/sched.h>
 
 #include <drm/drm_mode_config.h>
 
@@ -30,6 +31,14 @@ struct pci_controller;
 #define DRM_WEDGE_RECOVERY_REBIND      BIT(1)  /* unbind + bind driver */
 #define DRM_WEDGE_RECOVERY_BUS_RESET   BIT(2)  /* unbind + reset bus device + bind */
 
+/**
+ * struct drm_wedge_task_info - information about the guilty task of a wedged device
+ */
+struct drm_wedge_task_info {
+       pid_t pid;
+       char comm[TASK_COMM_LEN];
+};
+
 /**
  * enum switch_power_state - power state of drm device
  */
index 63b51942d60645dc20e46171aef883247f01834a..3f76a32d6b84592dec6895cc973bcdf4dffb93ba 100644 (file)
@@ -487,7 +487,8 @@ void drm_put_dev(struct drm_device *dev);
 bool drm_dev_enter(struct drm_device *dev, int *idx);
 void drm_dev_exit(int idx);
 void drm_dev_unplug(struct drm_device *dev);
-int drm_dev_wedged_event(struct drm_device *dev, unsigned long method);
+int drm_dev_wedged_event(struct drm_device *dev, unsigned long method,
+                        struct drm_wedge_task_info *info);
 
 /**
  * drm_dev_is_unplugged - is a DRM device unplugged