]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
drm/amdgpu: Avoid NULL dereference in discovery topology coredump path v3
authorSrinivasan Shanmugam <srinivasan.shanmugam@amd.com>
Wed, 18 Mar 2026 04:45:09 +0000 (10:15 +0530)
committerAlex Deucher <alexander.deucher@amd.com>
Mon, 23 Mar 2026 18:04:43 +0000 (14:04 -0400)
When a GPU fault or timeout happens, the driver creates a devcoredump
to collect debug information.

During this, amdgpu_devcoredump_format() calls
amdgpu_discovery_dump() to print IP discovery data.

amdgpu_discovery_dump() uses:
  adev->discovery.ip_top

and then accesses:
  ip_top->die_kset

amdgpu_discovery_dump() uses adev->discovery.ip_top. However,
ip_top may be NULL if the discovery topology was never initialized.

The current code does not check for this before using ip_top. As a
result, when ip_top is NULL, the coredump worker crashes while taking
the spinlock for ip_top->die_kset.

Fix this by checking for a missing ip_top before walking the discovery
topology. If it is unavailable, print a short message in the dump and
return safely.

- If ip_top is NULL, print a message and skip the dump
- Also add the same check in the cleanup path

This makes the coredump and cleanup paths safe even when the
discovery topology is not available.

KASAN trace:
[  522.228252] [IGT] amd_deadlock: starting subtest amdgpu-deadlock-sdma
[  522.240681] [IGT] amd_deadlock: starting dynamic subtest amdgpu-deadlock-sdma

...

[  522.952317] Write of size 4 at addr 0000000000000050 by task kworker/u129:5/5434
[  522.937526] BUG: KASAN: null-ptr-deref in _raw_spin_lock+0x66/0xc0
[  522.967659] Workqueue: events_unbound amdgpu_devcoredump_deferred_work [amdgpu]

...

[  522.969445] Call Trace:
[  522.969508]  _raw_spin_lock+0x66/0xc0
[  522.969518]  ? __pfx__raw_spin_lock+0x10/0x10
[  522.969534]  amdgpu_discovery_dump+0x61/0x530 [amdgpu]
[  522.971346]  ? pick_next_task_fair+0x3f6/0x1c60
[  522.971363]  amdgpu_devcoredump_format+0x84f/0x26f0 [amdgpu]
[  522.973188]  ? __pfx_amdgpu_devcoredump_format+0x10/0x10 [amdgpu]
[  522.975012]  ? psi_task_switch+0x2b5/0x9b0
[  522.975027]  ? __pfx___drm_printfn_coredump+0x10/0x10 [drm]
[  522.975198]  ? __pfx___drm_puts_coredump+0x10/0x10 [drm]
[  522.975366]  ? __schedule+0x113c/0x38d0
[  522.975381]  amdgpu_devcoredump_deferred_work+0x4c/0x1f0 [amdgpu]

v2: Updated commit message - Clarified that ip_top is not freed, it can
    just be NULL if discovery was not initialized. (Christian/Lijo)

v3: Removed the extra drm_warn() for sysfs init failure as sysfs already
    reports errors. (Christian)

Fixes: e81eff80aad6 ("drm/amdgpu: include ip discovery data in devcoredump")
Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c

index 0eb0c62d2f4f2f9db7d6c97bc99873cc27a1e777..d42a9ff2f78e69a58ea436c3a8c1918c874cec78 100644 (file)
@@ -1394,6 +1394,9 @@ static void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev)
        struct list_head *el, *tmp;
        struct kset *die_kset;
 
+       if (!ip_top)
+               return;
+
        die_kset = &ip_top->die_kset;
        spin_lock(&die_kset->list_lock);
        list_for_each_prev_safe(el, tmp, &die_kset->list) {
@@ -1418,9 +1421,15 @@ void amdgpu_discovery_dump(struct amdgpu_device *adev, struct drm_printer *p)
        struct ip_hw_instance *ip_inst;
        int i = 0, j;
 
+       drm_printf(p, "\nHW IP Discovery\n");
+
+       if (!ip_top) {
+               drm_printf(p, "ip discovery topology unavailable\n");
+               return;
+       }
+
        die_kset = &ip_top->die_kset;
 
-       drm_printf(p, "\nHW IP Discovery\n");
        spin_lock(&die_kset->list_lock);
        list_for_each(el_die, &die_kset->list) {
                drm_printf(p, "die %d\n", i++);