A queue's elevator can be updated either when modifying nr_hw_queues
or through the sysfs scheduler attribute. Currently, elevator switching/
updating is protected using q->sysfs_lock, but this has led to lockdep
splats[1] due to inconsistent lock ordering between q->sysfs_lock and
the freeze-lock in multiple block layer call sites.
As the scope of q->sysfs_lock is not well-defined, its (mis)use has
resulted in numerous lockdep warnings. To address this, introduce a new
q->elevator_lock, dedicated specifically to protecting elevator
switches/updates, and use it in place of q->sysfs_lock at those call
sites.
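To illustrate the intended lock ordering (a minimal sketch of the
pattern used in elv_iosched_store() in this patch; the queue is frozen
before the new lock is taken):

	unsigned int memflags;

	memflags = blk_mq_freeze_queue(q);
	mutex_lock(&q->elevator_lock);
	/* switch or update q->elevator, e.g. via elevator_change() */
	mutex_unlock(&q->elevator_lock);
	blk_mq_unfreeze_queue(q, memflags);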
While at it, make elv_iosched_load_module() a static function, as it is
only called from elv_iosched_store(), and remove the now-redundant
parameters from its signature.
[1] https://lore.kernel.org/all/67637e70.050a0220.3157ee.000c.GAE@google.com/
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Link: https://lore.kernel.org/r/20250304102551.2533767-5-nilay@linux.ibm.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
refcount_set(&q->refs, 1);
mutex_init(&q->debugfs_mutex);
+ mutex_init(&q->elevator_lock);
mutex_init(&q->sysfs_lock);
mutex_init(&q->limits_lock);
mutex_init(&q->rq_qos_mutex);
unsigned long i, j;
/* protect against switching io scheduler */
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->elevator_lock);
for (i = 0; i < set->nr_hw_queues; i++) {
int old_node;
int node = blk_mq_get_hctx_node(set, i);
xa_for_each_start(&q->hctx_table, j, hctx, j)
blk_mq_exit_hctx(q, set, hctx, j);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
/* unregister cpuhp callbacks for exited hctxs */
blk_mq_remove_hw_queues_cpuhp(q);
if (!qe)
return false;
- /* q->elevator needs protection from ->sysfs_lock */
- mutex_lock(&q->sysfs_lock);
+ /* Accessing q->elevator needs protection from ->elevator_lock. */
+ mutex_lock(&q->elevator_lock);
- /* the check has to be done with holding sysfs_lock */
if (!q->elevator) {
kfree(qe);
goto unlock;
list_add(&qe->node, head);
elevator_disable(q);
unlock:
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
return true;
}
list_del(&qe->node);
kfree(qe);
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->elevator_lock);
elevator_switch(q, t);
/* drop the reference acquired in blk_mq_elv_switch_none */
elevator_put(t);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
}
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
* Attributes which are protected with q->sysfs_lock.
*/
&queue_requests_entry.attr,
- &elv_iosched_entry.attr,
#ifdef CONFIG_BLK_WBT
&queue_wb_lat_entry.attr,
#endif
+ /*
+ * Attributes which require some form of locking other than
+ * q->sysfs_lock.
+ */
+ &elv_iosched_entry.attr,
+
/*
* Attributes which don't require locking.
*/
if (ret)
goto out_debugfs_remove;
+ ret = blk_crypto_sysfs_register(disk);
+ if (ret)
+ goto out_unregister_ia_ranges;
+
+ mutex_lock(&q->elevator_lock);
if (q->elevator) {
ret = elv_register_queue(q, false);
- if (ret)
- goto out_unregister_ia_ranges;
+ if (ret) {
+ mutex_unlock(&q->elevator_lock);
+ goto out_crypto_sysfs_unregister;
+ }
}
-
- ret = blk_crypto_sysfs_register(disk);
- if (ret)
- goto out_elv_unregister;
+ mutex_unlock(&q->elevator_lock);
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
wbt_enable_default(disk);
return ret;
-out_elv_unregister:
- elv_unregister_queue(q);
+out_crypto_sysfs_unregister:
+ blk_crypto_sysfs_unregister(disk);
out_unregister_ia_ranges:
disk_unregister_independent_access_ranges(disk);
out_debugfs_remove:
blk_mq_sysfs_unregister(disk);
blk_crypto_sysfs_unregister(disk);
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->elevator_lock);
elv_unregister_queue(q);
+ mutex_unlock(&q->elevator_lock);
+
+ mutex_lock(&q->sysfs_lock);
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
struct elevator_queue *e = q->elevator;
int error;
- lockdep_assert_held(&q->sysfs_lock);
+ lockdep_assert_held(&q->elevator_lock);
error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched");
if (!error) {
{
struct elevator_queue *e = q->elevator;
- lockdep_assert_held(&q->sysfs_lock);
+ lockdep_assert_held(&q->elevator_lock);
if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) {
kobject_uevent(&e->kobj, KOBJ_REMOVE);
unsigned int memflags;
int ret;
- lockdep_assert_held(&q->sysfs_lock);
+ lockdep_assert_held(&q->elevator_lock);
memflags = blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
{
unsigned int memflags;
- lockdep_assert_held(&q->sysfs_lock);
+ lockdep_assert_held(&q->elevator_lock);
memflags = blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
return ret;
}
-void elv_iosched_load_module(struct gendisk *disk, const char *buf,
- size_t count)
+static void elv_iosched_load_module(char *elevator_name)
{
- char elevator_name[ELV_NAME_MAX];
struct elevator_type *found;
- const char *name;
-
- strscpy(elevator_name, buf, sizeof(elevator_name));
- name = strstrip(elevator_name);
spin_lock(&elv_list_lock);
- found = __elevator_find(name);
+ found = __elevator_find(elevator_name);
spin_unlock(&elv_list_lock);
if (!found)
- request_module("%s-iosched", name);
+ request_module("%s-iosched", elevator_name);
}
ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
size_t count)
{
char elevator_name[ELV_NAME_MAX];
+ char *name;
int ret;
unsigned int memflags;
struct request_queue *q = disk->queue;
* queue to ensure that the module file can be read when the request
* queue is the one for the device storing the module file.
*/
- elv_iosched_load_module(disk, buf, count);
strscpy(elevator_name, buf, sizeof(elevator_name));
+ name = strstrip(elevator_name);
+
+ elv_iosched_load_module(name);
- mutex_lock(&q->sysfs_lock);
memflags = blk_mq_freeze_queue(q);
- ret = elevator_change(q, strstrip(elevator_name));
+ mutex_lock(&q->elevator_lock);
+ ret = elevator_change(q, name);
if (!ret)
ret = count;
+ mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue(q, memflags);
- mutex_unlock(&q->sysfs_lock);
return ret;
}
struct elevator_type *cur = NULL, *e;
int len = 0;
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->elevator_lock);
if (!q->elevator) {
len += sprintf(name+len, "[none] ");
} else {
spin_unlock(&elv_list_lock);
len += sprintf(name+len, "\n");
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
return len;
}
* io scheduler sysfs switching
*/
ssize_t elv_iosched_show(struct gendisk *disk, char *page);
-void elv_iosched_load_module(struct gendisk *disk, const char *page,
- size_t count);
ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count);
extern bool elv_bio_merge_ok(struct request *, struct bio *);
if (disk->major == BLOCK_EXT_MAJOR)
blk_free_ext_minor(disk->first_minor);
out_exit_elevator:
- if (disk->queue->elevator)
+ if (disk->queue->elevator) {
+ mutex_lock(&disk->queue->elevator_lock);
elevator_exit(disk->queue);
+ mutex_unlock(&disk->queue->elevator_lock);
+ }
return ret;
}
EXPORT_SYMBOL_GPL(add_disk_fwnode);
blk_mq_quiesce_queue(q);
if (q->elevator) {
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->elevator_lock);
elevator_exit(q);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
}
rq_qos_exit(q);
blk_mq_unquiesce_queue(q);
struct blk_flush_queue *fq;
struct list_head flush_list;
+ /*
+ * Protects against I/O scheduler switching, specifically when
+ * updating q->elevator. To ensure proper locking order during
+ * an elevator update, first freeze the queue, then acquire
+ * ->elevator_lock.
+ */
+ struct mutex elevator_lock;
+
struct mutex sysfs_lock;
struct mutex limits_lock;