]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
rhashtable: drop ht->mutex in rhashtable_free_and_destroy()
authorMikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
Wed, 22 Apr 2026 21:33:49 +0000 (02:33 +0500)
committerHerbert Xu <herbert@gondor.apana.org.au>
Tue, 5 May 2026 08:12:06 +0000 (16:12 +0800)
rhashtable_free_and_destroy() is a single-shot teardown routine:
cancel_work_sync() has already quiesced the deferred rehash worker, and
the function's documented contract requires the caller to guarantee no
other concurrent access to the rhashtable. Under those conditions
ht->mutex is not protecting anything -- taking it is a leftover from
the original teardown path.

That leftover is actively harmful: it closes a circular lock-class
dependency with fs_reclaim. The deferred rehash worker takes ht->mutex
and then allocates GFP_KERNEL memory in bucket_table_alloc(),
establishing

    &ht->mutex  ->  fs_reclaim

After commit b32c4a213698 ("xattr: add rhashtable-based simple_xattr
infrastructure") introduced simple_xattr_ht_free(), which calls
rhashtable_free_and_destroy(), the simple_xattrs teardown became
reachable from evict() under the dcache shrinker. The subsequent
per-subsystem adaptations made the reverse edge concrete in three
independent code paths:

  * commit 52b364fed6e1 ("shmem: adapt to rhashtable-based simple_xattrs with lazy allocation")
  * commit 5bd97f5c5f24 ("kernfs: adapt to rhashtable-based simple_xattrs with lazy allocation")
  * commit 50704c391fbf ("pidfs: adapt to rhashtable-based simple_xattrs")

Any of the three closes the cycle

    fs_reclaim  ->  &ht->mutex

which lockdep reports as follows. This particular splat was observed
organically on a workstation kernel built from vfs-7.1-rc1.xattr at
~35h uptime under normal mixed workload, with CONFIG_PROVE_LOCKING=y.
The path happens to go through kernfs:

  WARNING: possible circular locking dependency detected
  7.0.0-faeab166167f-with-fixes-v1+ #191 Tainted: G     U
  kswapd0/243 is trying to acquire lock:
  ffff8882e475c0f8 (&ht->mutex){+.+.}-{4:4},
    at: rhashtable_free_and_destroy+0x36/0x740
  but task is already holding lock:
  ffffffffa8ad1d00 (fs_reclaim){+.+.}-{0:0},
    at: balance_pgdat+0x995/0x1600

  the existing dependency chain (in reverse order) is:

  -> #1 (fs_reclaim){+.+.}-{0:0}:
         __lock_acquire+0x506/0xbf0
         lock_acquire.part.0+0xc7/0x280
         fs_reclaim_acquire+0xd9/0x130
         __kvmalloc_node_noprof+0xcd/0xb40
         bucket_table_alloc.isra.0+0x5a/0x440
         rhashtable_rehash_alloc+0x4e/0xd0
         rht_deferred_worker+0x14b/0x440
         process_one_work+0x8fd/0x16a0
         worker_thread+0x601/0xff0
         kthread+0x36b/0x470
         ret_from_fork+0x5bf/0x910
         ret_from_fork_asm+0x1a/0x30

  -> #0 (&ht->mutex){+.+.}-{4:4}:
         check_prev_add+0xdb/0xce0
         validate_chain+0x554/0x780
         __lock_acquire+0x506/0xbf0
         lock_acquire.part.0+0xc7/0x280
         __mutex_lock+0x1b2/0x2550
         rhashtable_free_and_destroy+0x36/0x740
         kernfs_put.part.0+0x119/0x570
         evict+0x3b6/0x9c0
         __dentry_kill+0x181/0x540
         shrink_dentry_list+0x135/0x440
         prune_dcache_sb+0xdb/0x150
         super_cache_scan+0x2ff/0x520
         do_shrink_slab+0x35a/0xee0
         shrink_slab_memcg+0x457/0x950
         shrink_slab+0x43b/0x550
         shrink_one+0x31a/0x6f0
         shrink_many+0x31e/0xc80
         shrink_node+0xeb3/0x14a0
         balance_pgdat+0x8ed/0x1600
         kswapd+0x2f3/0x530
         kthread+0x36b/0x470
         ret_from_fork+0x5bf/0x910
         ret_from_fork_asm+0x1a/0x30

   Possible unsafe locking scenario:

         CPU0                    CPU1
         ----                    ----
    lock(fs_reclaim);
                                 lock(&ht->mutex);
                                 lock(fs_reclaim);
    lock(&ht->mutex);

Note that lockdep tracks lock classes, not instances: the two
&ht->mutex sites are on different rhashtable objects (the deferred
worker was triggered by some unrelated rhashtable growth), but because
rhashtable_init() uses a single static lockdep key for all rhashtables,
this is a real class-level cycle. Once reported, lockdep disables
itself for the remainder of the boot, masking any subsequent locking
bugs.

Drop the mutex. After cancel_work_sync() the rehash worker is quiesced
and, per this function's contract, no other concurrent access is
possible; the tables are therefore owned exclusively by this function
and can be walked without any lock held.

Switch the table walks from rht_dereference() (which requires
ht->mutex to be held under CONFIG_PROVE_RCU) to rcu_dereference_raw(),
which has no lockdep annotation. rht_ptr_exclusive() already uses
rcu_dereference_protected(p, 1) and needs no change.

This is the only place in lib/rhashtable.c where &ht->mutex is
acquired from a path reachable under fs_reclaim; the deferred worker
is the only other site and it is the forward edge. Removing the
acquisition here therefore eliminates the class cycle for all three
subsystems that use simple_xattrs, not just the one in the splat
above. No locking-semantics change is introduced for correct users;
incorrect users would already be racing with rehash worker completion
regardless of the mutex.

Synthetic reproduction of the splat within a few-minute window was
unsuccessful across several attempts (tmpfs and kernfs zombies via
cgroupfs with open-fd-through-rmdir, with and without swap, up to
~60k reclaim-path executions of simple_xattr_ht_free() in a single
run), consistent with the rare coincidence-of-edges profile of the
bug: the forward edge is already registered in /proc/lockdep on any
idle system via rht_deferred_worker, but the reverse edge requires
evict() to complete kernfs_put()'s final release inside the fs_reclaim
critical section, which in my attempts was ordered against rather than
interleaved with the worker.

Fixes: b32c4a213698 ("xattr: add rhashtable-based simple_xattr infrastructure")
Signed-off-by: Mikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
lib/rhashtable.c

index 7a67ef5b67b666afdd638394f895d7732fb5e62e..426d4e381f136482b25f12058e09521b1f694277 100644 (file)
@@ -1166,6 +1166,11 @@ static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
  * This function will eventually sleep to wait for an async resize
  * to complete. The caller is responsible that no further write operations
  * occurs in parallel.
+ *
+ * After cancel_work_sync() has returned, the deferred rehash worker is
+ * quiesced and, per the contract above, no other concurrent access to the
+ * rhashtable is possible. The tables are therefore owned exclusively by
+ * this function and can be walked without ht->mutex held.
  */
 void rhashtable_free_and_destroy(struct rhashtable *ht,
                                 void (*free_fn)(void *ptr, void *arg),
@@ -1177,8 +1182,15 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
        irq_work_sync(&ht->run_irq_work);
        cancel_work_sync(&ht->run_work);
 
-       mutex_lock(&ht->mutex);
-       tbl = rht_dereference(ht->tbl, ht);
+       /*
+        * Do NOT take ht->mutex here. The rehash worker establishes
+        * ht->mutex -> fs_reclaim via GFP_KERNEL bucket allocation under
+        * the mutex; callers on the reclaim path (e.g. simple_xattr_ht_free()
+        * from evict() under the dcache shrinker for shmem/kernfs/pidfs
+        * inodes) would otherwise close a circular dependency
+        * fs_reclaim -> ht->mutex.
+        */
+       tbl = rcu_dereference_raw(ht->tbl);
 restart:
        if (free_fn) {
                for (i = 0; i < tbl->size; i++) {
@@ -1187,22 +1199,21 @@ restart:
                        cond_resched();
                        for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)),
                             next = !rht_is_a_nulls(pos) ?
-                                       rht_dereference(pos->next, ht) : NULL;
+                                       rcu_dereference_raw(pos->next) : NULL;
                             !rht_is_a_nulls(pos);
                             pos = next,
                             next = !rht_is_a_nulls(pos) ?
-                                       rht_dereference(pos->next, ht) : NULL)
+                                       rcu_dereference_raw(pos->next) : NULL)
                                rhashtable_free_one(ht, pos, free_fn, arg);
                }
        }
 
-       next_tbl = rht_dereference(tbl->future_tbl, ht);
+       next_tbl = rcu_dereference_raw(tbl->future_tbl);
        bucket_table_free(tbl);
        if (next_tbl) {
                tbl = next_tbl;
                goto restart;
        }
-       mutex_unlock(&ht->mutex);
 }
 EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy);