git.ipfire.org Git - thirdparty/linux.git/commitdiff
blk-cgroup: wait for blkcg cleanup before initializing new disk
author: Ming Lei <ming.lei@redhat.com>
Wed, 11 Mar 2026 03:28:37 +0000 (11:28 +0800)
committer: Jens Axboe <axboe@kernel.dk>
Wed, 11 Mar 2026 14:30:30 +0000 (08:30 -0600)
When a queue is shared across disk rebind (e.g., SCSI unbind/bind), the
previous disk's blkcg state is cleaned up asynchronously via
disk_release() -> blkcg_exit_disk(). If the new disk's blkcg_init_disk()
runs before that cleanup finishes, we may overwrite q->root_blkg while
the old one is still alive, and radix_tree_insert() in blkg_create()
fails with -EEXIST because the old blkg entries still occupy the same
queue id slot in blkcg->blkg_tree. This causes the sd probe to fail
with -ENOMEM.

Fix it by waiting in blkcg_init_disk() for root_blkg to become NULL,
which indicates the previous disk's blkcg cleanup has completed.

Fixes: 1059699f87eb ("block: move blkcg initialization/destroy into disk allocation/release handler")
Cc: Yi Zhang <yi.zhang@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20260311032837.2368714-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
block/blk-cgroup.c

index b70096497d389aa7ecd0d1115b9b30fd83ba8a4c..2d7b18eb729155eec0dec42f0fd9f9958bd0d37e 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/backing-dev.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
+#include <linux/wait_bit.h>
 #include <linux/atomic.h>
 #include <linux/ctype.h>
 #include <linux/resume_user_mode.h>
@@ -611,6 +612,8 @@ restart:
 
        q->root_blkg = NULL;
        spin_unlock_irq(&q->queue_lock);
+
+       wake_up_var(&q->root_blkg);
 }
 
 static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
@@ -1498,6 +1501,18 @@ int blkcg_init_disk(struct gendisk *disk)
        struct blkcg_gq *new_blkg, *blkg;
        bool preloaded;
 
+       /*
+        * If the queue is shared across disk rebind (e.g., SCSI), the
+        * previous disk's blkcg state is cleaned up asynchronously via
+        * disk_release() -> blkcg_exit_disk(). Wait for that cleanup to
+        * finish (indicated by root_blkg becoming NULL) before setting up
+        * new blkcg state. Otherwise, we may overwrite q->root_blkg while
+        * the old one is still alive, and radix_tree_insert() in
+        * blkg_create() will fail with -EEXIST because the old entries
+        * still occupy the same queue id slot in blkcg->blkg_tree.
+        */
+       wait_var_event(&q->root_blkg, !READ_ONCE(q->root_blkg));
+
        new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
        if (!new_blkg)
                return -ENOMEM;