With PROVE_LOCKING on an Snapdragon X1 and VM reclaim pressure, we see:
======================================================
WARNING: possible circular locking dependency detected
7.0.0-debug+ #43 Tainted: G W
------------------------------------------------------
kswapd0/82 is trying to acquire lock:
ffff800080ec3870 (reservation_ww_class_acquire){+.+.}-{0:0}, at: msm_gem_shrinker_scan+0x17c/0x400 [msm]
but task is already holding lock:
ffffc31709b263b8 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x88/0x988
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #2 (fs_reclaim){+.+.}-{0:0}:
__lock_acquire+0x4d0/0xad0
lock_acquire.part.0+0xc4/0x248
lock_acquire+0x8c/0x248
fs_reclaim_acquire+0xd0/0xf0
dma_resv_lockdep+0x224/0x348
do_one_initcall+0x84/0x5d0
do_initcalls+0x194/0x1d8
kernel_init_freeable+0x128/0x180
kernel_init+0x2c/0x160
ret_from_fork+0x10/0x20
-> #1 (reservation_ww_class_mutex){+.+.}-{4:4}:
__lock_acquire+0x4d0/0xad0
lock_acquire.part.0+0xc4/0x248
lock_acquire+0x8c/0x248
dma_resv_lockdep+0x1a8/0x348
do_one_initcall+0x84/0x5d0
do_initcalls+0x194/0x1d8
kernel_init_freeable+0x128/0x180
kernel_init+0x2c/0x160
ret_from_fork+0x10/0x20
-> #0 (reservation_ww_class_acquire){+.+.}-{0:0}:
check_prev_add+0x114/0x790
validate_chain+0x594/0x6f0
__lock_acquire+0x4d0/0xad0
lock_acquire.part.0+0xc4/0x248
lock_acquire+0x8c/0x248
drm_gem_lru_scan+0x1ac/0x440
msm_gem_shrinker_scan+0x17c/0x400 [msm]
do_shrink_slab+0x150/0x4a0
shrink_slab+0x144/0x460
shrink_one+0x9c/0x1b0
shrink_many+0x27c/0x5c0
shrink_node+0x344/0x550
balance_pgdat+0x2c0/0x988
kswapd+0x11c/0x318
kthread+0x10c/0x128
ret_from_fork+0x10/0x20
other info that might help us debug this:
Chain exists of:
reservation_ww_class_acquire --> reservation_ww_class_mutex --> fs_reclaim
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(fs_reclaim);
lock(reservation_ww_class_mutex);
lock(fs_reclaim);
lock(reservation_ww_class_acquire);
*** DEADLOCK ***
1 lock held by kswapd0/82:
#0:
ffffc31709b263b8 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x88/0x988
stack backtrace:
CPU: 4 UID: 0 PID: 82 Comm: kswapd0 Tainted: G W 7.0.0-debug+ #43 PREEMPT(full)
Tainted: [W]=WARN
Hardware name: LENOVO 21BX0016US/21BX0016US, BIOS N3HET94W (1.66 ) 09/15/2025
Call trace:
show_stack+0x20/0x40 (C)
dump_stack_lvl+0x9c/0xd0
dump_stack+0x18/0x30
print_circular_bug+0x114/0x120
check_noncircular+0x178/0x198
check_prev_add+0x114/0x790
validate_chain+0x594/0x6f0
__lock_acquire+0x4d0/0xad0
lock_acquire.part.0+0xc4/0x248
lock_acquire+0x8c/0x248
drm_gem_lru_scan+0x1ac/0x440
msm_gem_shrinker_scan+0x17c/0x400 [msm]
do_shrink_slab+0x150/0x4a0
shrink_slab+0x144/0x460
shrink_one+0x9c/0x1b0
shrink_many+0x27c/0x5c0
shrink_node+0x344/0x550
balance_pgdat+0x2c0/0x988
kswapd+0x11c/0x318
kthread+0x10c/0x128
ret_from_fork+0x10/0x20
kswapd0 holding fs_reclaim calls the MSM shrinker, which calls
dma_resv_lock. This in turn acquires fs_reclaim.
Fix this deadlock by using dma_resv_trylock() instead, dropping the
subsequently unused passed wait-wound lock 'ticket'.
Cc: stable@vger.kernel.org
Signed-off-by: Daniel J Blueman <daniel@quora.org>
Fixes: fe4952b5f27c ("drm/msm: Convert vm locking")
Patchwork: https://patchwork.freedesktop.org/patch/723564/
Message-ID: <
20260508065722.18785-1-daniel@quora.org>
[rob: fixup compile errors, replace lockdep splat with something legible]
Signed-off-by: Rob Clark <robin.clark@oss.qualcomm.com>
}
static bool
-with_vm_locks(struct ww_acquire_ctx *ticket,
- void (*fn)(struct drm_gem_object *obj),
+with_vm_locks(void (*fn)(struct drm_gem_object *obj),
struct drm_gem_object *obj)
{
/*
* success paths
*/
struct drm_gpuvm_bo *vm_bo, *last_locked = NULL;
- int ret = 0;
+ bool locked = true;
drm_gem_for_each_gpuvm_bo (vm_bo, obj) {
struct dma_resv *resv = drm_gpuvm_resv(vm_bo->vm);
if (resv == obj->resv)
continue;
- ret = dma_resv_lock(resv, ticket);
-
- /*
- * Since we already skip the case when the VM and obj
- * share a resv (ie. _NO_SHARE objs), we don't expect
- * to hit a double-locking scenario... which the lock
- * unwinding cannot really cope with.
- */
- WARN_ON(ret == -EALREADY);
-
/*
- * Don't bother with slow-lock / backoff / retry sequence,
- * if we can't get the lock just give up and move on to
- * the next object.
+ * dma_resv_lock can't be used due to acquiring 'ticket' before the
+ * fs_reclaim lock, which is held in shrinker context
*/
- if (ret)
+ if (!dma_resv_trylock(resv)) {
+ locked = false;
goto out_unlock;
+ }
/*
* Hold a ref to prevent the vm_bo from being freed
}
}
- return ret == 0;
+ return locked;
}
static bool
-purge(struct drm_gem_object *obj, struct ww_acquire_ctx *ticket)
+purge(struct drm_gem_object *obj, struct ww_acquire_ctx *)
{
if (!is_purgeable(to_msm_bo(obj)))
return false;
if (msm_gem_active(obj))
return false;
- return with_vm_locks(ticket, msm_gem_purge, obj);
+ return with_vm_locks(msm_gem_purge, obj);
}
static bool
-evict(struct drm_gem_object *obj, struct ww_acquire_ctx *ticket)
+evict(struct drm_gem_object *obj, struct ww_acquire_ctx *)
{
if (is_unevictable(to_msm_bo(obj)))
return false;
if (msm_gem_active(obj))
return false;
- return with_vm_locks(ticket, msm_gem_evict, obj);
+ return with_vm_locks(msm_gem_evict, obj);
}
static bool
msm_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc)
{
struct msm_drm_private *priv = shrinker->private_data;
- struct ww_acquire_ctx ticket;
struct {
struct drm_gem_lru *lru;
bool (*shrink)(struct drm_gem_object *obj, struct ww_acquire_ctx *ticket);
for (unsigned i = 0; (nr > 0) && (i < ARRAY_SIZE(stages)); i++) {
if (!stages[i].cond)
continue;
+ /*
+ * 'ticket' not needed on trylock paths
+ */
stages[i].freed =
drm_gem_lru_scan(stages[i].lru, nr,
&stages[i].remaining,
stages[i].shrink,
- &ticket);
+ NULL);
nr -= stages[i].freed;
freed += stages[i].freed;
remaining += stages[i].remaining;