From: Vitaly Prosyak Date: Wed, 13 May 2026 20:08:30 +0000 (-0400) Subject: drm/amdgpu: Add lockdep annotations for lock ordering validation X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=1d0f5838b1268de871e762bbeb408989dd96b449;p=thirdparty%2Fkernel%2Flinux.git drm/amdgpu: Add lockdep annotations for lock ordering validation Add lockdep annotations to teach lockdep the correct lock hierarchy and catch ordering violations during development. This follows the pattern established by dma-resv in drivers/dma-buf/dma-resv.c. Lock ordering hierarchy (outermost to innermost): 1. userq_sch_mutex - Global userq scheduler (enforce_isolation) 2. userq_mutex - Per-context userq (held across queue create/destroy) 3. notifier_lock - MMU notifier synchronization 4. vram_lock - VRAM memory allocator 5. reset_domain->sem - GPU reset synchronization 6. reset_lock - Reset control mutex 7. srbm_mutex - SRBM register access 8. grbm_idx_mutex - GRBM index register access 9. mmio_idx_lock - MMIO index access (spinlock) The implementation provides: - Lock ordering training at module init (amdgpu_lockdep_init) - Lock class association for real driver locks (amdgpu_lockdep_set_class) Dummy locks are associated with the same class keys as real driver locks via lockdep_set_class(), ensuring lockdep connects the training ordering with actual runtime locks. Testing: Build the kernel with CONFIG_PROVE_LOCKING=y (enables CONFIG_LOCKDEP): scripts/config --enable PROVE_LOCKING scripts/config --enable DEBUG_LOCKDEP make -j$(nproc) On boot, dmesg should show: AMDGPU: Lockdep annotations initialized (9 lock levels) The companion IGT test (tests/amdgpu/amd_lockdep) exercises lock-heavy GPU code paths concurrently to trigger lockdep warnings on violations: sudo ./build/tests/amdgpu/amd_lockdep sudo dmesg | grep -A 50 "circular locking dependency" IGT subtests: concurrent-reset-and-submit - reset_sem vs submission locks concurrent-mmap-and-evict - mmap_lock vs vram_lock concurrent-userptr-and-reset - notifier_lock vs reset_sem stress-all-paths - all of the above simultaneously A clean dmesg (no "circular locking dependency" or "possible recursive locking detected" messages) confirms no lock ordering violations. For CI integration, the test should be run on kernels compiled with CONFIG_LOCKDEP=y; dmesg is scanned post-run for lockdep splats. v2: (Christian) - Move notifier_lock and vram_lock before reset locks in hierarchy. HMM invalidation holds notifier_lock and can wait for GPU reset completion, so notifier_lock must be outer to reset_domain->sem. - Associate dummy locks with lock class keys via lockdep_set_class() so lockdep connects training with real driver locks. - Update commit message to list all 9 lock levels. Requires CONFIG_PROVE_LOCKING=y to activate. Cc: Christian Konig Cc: Alex Deucher Signed-off-by: Vitaly Prosyak Reviewed-by: Christian Konig Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index ee3574797bc28..ba80542ead9d9 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -69,7 +69,7 @@ amdgpu-y += amdgpu_device.o amdgpu_reg_access.o amdgpu_doorbell_mgr.o amdgpu_kms amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \ amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \ amdgpu_fw_attestation.o amdgpu_securedisplay.o \ - amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \ + amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o amdgpu_lockdep.o \ amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o amdgpu_dev_coredump.o \ amdgpu_cper.o amdgpu_userq_fence.o amdgpu_eviction_fence.o amdgpu_ip.o diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 5d7bfa59424a4..7b09410d6d8f0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -105,6 +105,7 @@ #include "amdgpu_mca.h" #include "amdgpu_aca.h" #include "amdgpu_ras.h" +#include "amdgpu_lockdep.h" #include "amdgpu_cper.h" #include "amdgpu_xcp.h" #include "amdgpu_seq64.h" diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1dddfde91c497..f18e465028298 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3757,6 +3757,9 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(&adev->pm.stable_pstate_ctx_lock); mutex_init(&adev->benchmark_mutex); mutex_init(&adev->gfx.reset_sem_mutex); + + /* Associate locks with lockdep classes for ordering validation */ + amdgpu_lockdep_set_class(adev); /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ mutex_init(&adev->enforce_isolation_mutex); for (i = 0; i < MAX_XCP; ++i) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 1781c0c3d010f..bf42602696813 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -3158,6 +3158,9 @@ static int __init amdgpu_init(void) { int r; + /* Train lockdep on correct lock ordering */ + amdgpu_lockdep_init(); + r = amdgpu_sync_init(); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c new file mode 100644 index 0000000000000..d5d71fd7c70d0 --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2024 Advanced Micro Devices, Inc. + * + * Lockdep annotation for AMDGPU lock ordering + * + * This module teaches lockdep the correct lock ordering to catch + * potential deadlocks at development time rather than runtime. + * + * Based on dma-resv lockdep approach from: + * drivers/dma-buf/dma-resv.c:dma_resv_lockdep() + */ + +#include "amdgpu.h" +#include "amdgpu_reset.h" + +#ifdef CONFIG_LOCKDEP + +/* Lock class keys for associating with real driver locks */ +static struct lock_class_key amdgpu_userq_sch_mutex_key; +static struct lock_class_key amdgpu_userq_mutex_key; +static struct lock_class_key amdgpu_notifier_lock_key; +static struct lock_class_key amdgpu_vram_lock_key; +static struct lock_class_key amdgpu_reset_sem_key; +static struct lock_class_key amdgpu_reset_lock_key; +static struct lock_class_key amdgpu_srbm_lock_key; +static struct lock_class_key amdgpu_grbm_lock_key; +static struct lock_class_key amdgpu_mmio_lock_key; + +/** + * amdgpu_lockdep_set_class - Associate lock class keys with real locks + * @adev: AMDGPU device + * + * Call during device init to associate lock classes with actual locks + * so lockdep can track them properly. + */ +void amdgpu_lockdep_set_class(struct amdgpu_device *adev) +{ + lockdep_set_class(&adev->gfx.userq_sch_mutex, + &amdgpu_userq_sch_mutex_key); + lockdep_set_class(&adev->notifier_lock, &amdgpu_notifier_lock_key); + lockdep_set_class(&adev->srbm_mutex, &amdgpu_srbm_lock_key); + lockdep_set_class(&adev->grbm_idx_mutex, &amdgpu_grbm_lock_key); + lockdep_set_class(&adev->mmio_idx_lock, &amdgpu_mmio_lock_key); + + if (adev->reset_domain) + lockdep_set_class(&adev->reset_domain->sem, + &amdgpu_reset_sem_key); +} + +/** + * amdgpu_lockdep_init - Teach lockdep the correct lock ordering + * + * Instantiates dummy objects and takes locks in the correct order to + * train lockdep. This helps catch lock ordering violations during + * development. + * + * Lock ordering hierarchy (outermost to innermost): + * + * 1. userq_sch_mutex - Global userq scheduler (enforce_isolation) + * 2. userq_mutex - Per-context userq (held across queue create/destroy) + * 3. notifier_lock - MMU notifier lock + * 4. vram_lock - VRAM allocator lock + * 5. reset_domain->sem - GPU reset synchronization + * 6. reset_lock - Reset control lock + * 7. srbm_mutex - SRBM register access + * 8. grbm_idx_mutex - GRBM index access + * 9. mmio_idx_lock - MMIO index access (spinlock) + * + * Evidence: + * - userq_sch_mutex -> userq_mutex: amdgpu_gfx_kfd_sch_ctrl() calls + * amdgpu_userq_stop_sched_for_enforce_isolation() which takes userq_mutex + * - userq_mutex -> notifier_lock: userq paths may trigger MMU notifier + * invalidation which acquires notifier_lock + * - notifier_lock -> reset_domain->sem: HMM invalidation callback holds + * notifier_lock and can wait for GPU reset completion, so notifier_lock + * must be outer to reset_domain->sem + * - vram_lock -> reset_domain->sem: VRAM management paths may need to + * wait for ongoing reset to complete + * + * Note: mmap_lock ordering relative to GPU locks is already taught + * by dma-resv (drivers/dma-buf/dma-resv.c). + */ +int amdgpu_lockdep_init(void) +{ + struct amdgpu_reset_domain *reset_domain = NULL; + struct amdgpu_reset_control reset_ctl; + struct mutex userq_sch_mutex; + struct mutex userq_mutex; + struct mutex notifier_lock; + struct mutex vram_lock; + struct mutex srbm_mutex; + struct mutex grbm_idx_mutex; + spinlock_t mmio_idx_lock; + unsigned long flags; + + /* + * Initialize dummy reset domain + */ + reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, + "lockdep_test"); + if (!reset_domain) + return -ENOMEM; + + /* Initialize dummy locks */ + mutex_init(&userq_sch_mutex); + mutex_init(&userq_mutex); + mutex_init(¬ifier_lock); + mutex_init(&vram_lock); + mutex_init(&reset_ctl.reset_lock); + mutex_init(&srbm_mutex); + mutex_init(&grbm_idx_mutex); + spin_lock_init(&mmio_idx_lock); + + /* + * Associate dummy locks with the same class keys used for real + * driver locks. This ensures lockdep connects the ordering learned + * here with the actual locks used at runtime. + */ + lockdep_set_class(&userq_sch_mutex, &amdgpu_userq_sch_mutex_key); + lockdep_set_class(&userq_mutex, &amdgpu_userq_mutex_key); + lockdep_set_class(¬ifier_lock, &amdgpu_notifier_lock_key); + lockdep_set_class(&vram_lock, &amdgpu_vram_lock_key); + lockdep_set_class(&reset_domain->sem, &amdgpu_reset_sem_key); + lockdep_set_class(&reset_ctl.reset_lock, &amdgpu_reset_lock_key); + lockdep_set_class(&srbm_mutex, &amdgpu_srbm_lock_key); + lockdep_set_class(&grbm_idx_mutex, &amdgpu_grbm_lock_key); + lockdep_set_class(&mmio_idx_lock, &amdgpu_mmio_lock_key); + + /* + * Take locks in the correct order to train lockdep. + * This establishes the dependency chain. + */ + + /* Level 1: Global userq scheduler mutex (outermost) */ + mutex_lock(&userq_sch_mutex); + + /* Level 2: Per-context userq mutex */ + mutex_lock(&userq_mutex); + + /* Level 3: MMU notifier lock */ + mutex_lock(¬ifier_lock); + + /* Level 4: VRAM allocator lock */ + mutex_lock(&vram_lock); + + /* Level 5: Reset domain semaphore */ + down_read(&reset_domain->sem); + + /* Level 6: Reset control lock */ + mutex_lock(&reset_ctl.reset_lock); + + /* + * Mark potential memory reclaim boundary. + * GPU operations might trigger memory allocation/reclaim. + */ + fs_reclaim_acquire(GFP_KERNEL); + + /* Level 7: SRBM register access */ + mutex_lock(&srbm_mutex); + + /* Level 8: GRBM index access */ + mutex_lock(&grbm_idx_mutex); + + /* Level 9: MMIO index access (innermost lock, spinlock) */ + spin_lock_irqsave(&mmio_idx_lock, flags); + + /* + * All locks acquired in order. + * Lockdep has now learned the valid dependency chain. + */ + + /* Release in reverse order */ + spin_unlock_irqrestore(&mmio_idx_lock, flags); + mutex_unlock(&grbm_idx_mutex); + mutex_unlock(&srbm_mutex); + + fs_reclaim_release(GFP_KERNEL); + + mutex_unlock(&reset_ctl.reset_lock); + up_read(&reset_domain->sem); + mutex_unlock(&vram_lock); + mutex_unlock(¬ifier_lock); + mutex_unlock(&userq_mutex); + mutex_unlock(&userq_sch_mutex); + + /* Cleanup */ + amdgpu_reset_put_reset_domain(reset_domain); + + pr_info("AMDGPU: Lockdep annotations initialized (9 lock levels)\n"); + + return 0; +} + +#endif /* CONFIG_LOCKDEP */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.h new file mode 100644 index 0000000000000..04adb58665bf5 --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2024 Advanced Micro Devices, Inc. + * + * Lockdep annotation interface for AMDGPU + */ + +#ifndef __AMDGPU_LOCKDEP_H__ +#define __AMDGPU_LOCKDEP_H__ + +#include + +struct amdgpu_device; + +#ifdef CONFIG_LOCKDEP + +/** + * amdgpu_lockdep_init - Train lockdep on correct lock ordering + * + * Call once during module init to establish the lock dependency chain. + */ +int amdgpu_lockdep_init(void); + +/** + * amdgpu_lockdep_set_class - Associate lock class keys with real locks + * @adev: AMDGPU device + * + * Call during device init to associate lock classes with actual locks. + */ +void amdgpu_lockdep_set_class(struct amdgpu_device *adev); + +#else /* !CONFIG_LOCKDEP */ + +static inline int amdgpu_lockdep_init(void) { return 0; } +static inline void amdgpu_lockdep_set_class(struct amdgpu_device *adev) {} + +#endif /* CONFIG_LOCKDEP */ + +#endif /* __AMDGPU_LOCKDEP_H__ */