--- /dev/null
+From b89f625e28d44552083f43752f62d8621ded0a04 Mon Sep 17 00:00:00 2001
+From: Ming Lei <ming.lei@redhat.com>
+Date: Mon, 23 Sep 2019 23:12:09 +0800
+Subject: block: don't release queue's sysfs lock during switching elevator
+
+From: Ming Lei <ming.lei@redhat.com>
+
+commit b89f625e28d44552083f43752f62d8621ded0a04 upstream.
+
+cecf5d87ff20 ("block: split .sysfs_lock into two locks") starts to
+release & acquire sysfs_lock before registering/un-registering elevator
+queue during switching elevator for avoiding potential deadlock from
+showing & storing 'queue/iosched' attributes and removing elevator's
+kobject.
+
+Turns out there isn't such deadlock because 'q->sysfs_lock' isn't
+required in .show & .store of queue/iosched's attributes, and just
+elevator's sysfs lock is acquired in elv_iosched_store() and
+elv_iosched_show(). So it is safe to hold queue's sysfs lock when
+registering/un-registering elevator queue.
+
+The biggest issue is that commit cecf5d87ff20 assumes that concurrent
+write on 'queue/scheduler' can't happen. However, this assumption isn't
+true, because kernfs_fop_write() only guarantees that concurrent write
+aren't called on the same open file, but the write could be from
+different open on the file. So we can't release & re-acquire queue's
+sysfs lock during switching elevator, otherwise use-after-free on
+elevator could be triggered.
+
+Fixes the issue by not releasing queue's sysfs lock during switching
+elevator.
+
+Fixes: cecf5d87ff20 ("block: split .sysfs_lock into two locks")
+Cc: Christoph Hellwig <hch@infradead.org>
+Cc: Hannes Reinecke <hare@suse.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: Mike Snitzer <snitzer@redhat.com>
+Reviewed-by: Bart Van Assche <bvanassche@acm.org>
+Signed-off-by: Ming Lei <ming.lei@redhat.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+(jwang: adjust ctx for 4.19)
+Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ block/blk-sysfs.c | 3 ++-
+ block/elevator.c | 31 +------------------------------
+ 2 files changed, 3 insertions(+), 31 deletions(-)
+
+--- a/block/blk-sysfs.c
++++ b/block/blk-sysfs.c
+@@ -933,6 +933,7 @@ int blk_register_queue(struct gendisk *d
+ blk_mq_debugfs_register(q);
+ }
+
++ mutex_lock(&q->sysfs_lock);
+ /*
+ * The flag of QUEUE_FLAG_REGISTERED isn't set yet, so elevator
+ * switch won't happen at all.
+@@ -940,6 +941,7 @@ int blk_register_queue(struct gendisk *d
+ if (q->request_fn || (q->mq_ops && q->elevator)) {
+ ret = elv_register_queue(q, false);
+ if (ret) {
++ mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->sysfs_dir_lock);
+ kobject_del(&q->kobj);
+ blk_trace_remove_sysfs(dev);
+@@ -949,7 +951,6 @@ int blk_register_queue(struct gendisk *d
+ has_elevator = true;
+ }
+
+- mutex_lock(&q->sysfs_lock);
+ blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
+ wbt_enable_default(q);
+ blk_throtl_register_queue(q);
+--- a/block/elevator.c
++++ b/block/elevator.c
+@@ -856,11 +856,9 @@ int elv_register_queue(struct request_qu
+ if (uevent)
+ kobject_uevent(&e->kobj, KOBJ_ADD);
+
+- mutex_lock(&q->sysfs_lock);
+ e->registered = 1;
+ if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn)
+ e->type->ops.sq.elevator_registered_fn(q);
+- mutex_unlock(&q->sysfs_lock);
+ }
+ return error;
+ }
+@@ -878,11 +876,9 @@ void elv_unregister_queue(struct request
+ kobject_uevent(&e->kobj, KOBJ_REMOVE);
+ kobject_del(&e->kobj);
+
+- mutex_lock(&q->sysfs_lock);
+ e->registered = 0;
+ /* Re-enable throttling in case elevator disabled it */
+ wbt_enable_default(q);
+- mutex_unlock(&q->sysfs_lock);
+ }
+ }
+
+@@ -953,32 +949,11 @@ int elevator_switch_mq(struct request_qu
+ lockdep_assert_held(&q->sysfs_lock);
+
+ if (q->elevator) {
+- if (q->elevator->registered) {
+- mutex_unlock(&q->sysfs_lock);
+-
+- /*
+- * Concurrent elevator switch can't happen becasue
+- * sysfs write is always exclusively on same file.
+- *
+- * Also the elevator queue won't be freed after
+- * sysfs_lock is released becasue kobject_del() in
+- * blk_unregister_queue() waits for completion of
+- * .store & .show on its attributes.
+- */
++ if (q->elevator->registered)
+ elv_unregister_queue(q);
+
+- mutex_lock(&q->sysfs_lock);
+- }
+ ioc_clear_queue(q);
+ elevator_exit(q, q->elevator);
+-
+- /*
+- * sysfs_lock may be dropped, so re-check if queue is
+- * unregistered. If yes, don't switch to new elevator
+- * any more
+- */
+- if (!blk_queue_registered(q))
+- return 0;
+ }
+
+ ret = blk_mq_init_sched(q, new_e);
+@@ -986,11 +961,7 @@ int elevator_switch_mq(struct request_qu
+ goto out;
+
+ if (new_e) {
+- mutex_unlock(&q->sysfs_lock);
+-
+ ret = elv_register_queue(q, true);
+-
+- mutex_lock(&q->sysfs_lock);
+ if (ret) {
+ elevator_exit(q, q->elevator);
+ goto out;
--- /dev/null
+From cecf5d87ff2035127bb5a9ee054d0023a4a7cad3 Mon Sep 17 00:00:00 2001
+From: Ming Lei <ming.lei@redhat.com>
+Date: Tue, 27 Aug 2019 19:01:48 +0800
+Subject: block: split .sysfs_lock into two locks
+
+From: Ming Lei <ming.lei@redhat.com>
+
+commit cecf5d87ff2035127bb5a9ee054d0023a4a7cad3 upstream.
+
+The kernfs built-in lock of 'kn->count' is held in sysfs .show/.store
+path. Meantime, inside block's .show/.store callback, q->sysfs_lock is
+required.
+
+However, when mq & iosched kobjects are removed via
+blk_mq_unregister_dev() & elv_unregister_queue(), q->sysfs_lock is held
+too. This way causes AB-BA lock because the kernfs built-in lock of
+'kn-count' is required inside kobject_del() too, see the lockdep warning[1].
+
+On the other hand, it isn't necessary to acquire q->sysfs_lock for
+both blk_mq_unregister_dev() & elv_unregister_queue() because
+clearing REGISTERED flag prevents storing to 'queue/scheduler'
+from being happened. Also sysfs write(store) is exclusive, so no
+necessary to hold the lock for elv_unregister_queue() when it is
+called in switching elevator path.
+
+So split .sysfs_lock into two: one is still named as .sysfs_lock for
+covering sync .store, the other one is named as .sysfs_dir_lock
+for covering kobjects and related status change.
+
+sysfs itself can handle the race between add/remove kobjects and
+showing/storing attributes under kobjects. For switching scheduler
+via storing to 'queue/scheduler', we use the queue flag of
+QUEUE_FLAG_REGISTERED with .sysfs_lock for avoiding the race, then
+we can avoid to hold .sysfs_lock during removing/adding kobjects.
+
+[1] lockdep warning
+ ======================================================
+ WARNING: possible circular locking dependency detected
+ 5.3.0-rc3-00044-g73277fc75ea0 #1380 Not tainted
+ ------------------------------------------------------
+ rmmod/777 is trying to acquire lock:
+ 00000000ac50e981 (kn->count#202){++++}, at: kernfs_remove_by_name_ns+0x59/0x72
+
+ but task is already holding lock:
+ 00000000fb16ae21 (&q->sysfs_lock){+.+.}, at: blk_unregister_queue+0x78/0x10b
+
+ which lock already depends on the new lock.
+
+ the existing dependency chain (in reverse order) is:
+
+ -> #1 (&q->sysfs_lock){+.+.}:
+ __lock_acquire+0x95f/0xa2f
+ lock_acquire+0x1b4/0x1e8
+ __mutex_lock+0x14a/0xa9b
+ blk_mq_hw_sysfs_show+0x63/0xb6
+ sysfs_kf_seq_show+0x11f/0x196
+ seq_read+0x2cd/0x5f2
+ vfs_read+0xc7/0x18c
+ ksys_read+0xc4/0x13e
+ do_syscall_64+0xa7/0x295
+ entry_SYSCALL_64_after_hwframe+0x49/0xbe
+
+ -> #0 (kn->count#202){++++}:
+ check_prev_add+0x5d2/0xc45
+ validate_chain+0xed3/0xf94
+ __lock_acquire+0x95f/0xa2f
+ lock_acquire+0x1b4/0x1e8
+ __kernfs_remove+0x237/0x40b
+ kernfs_remove_by_name_ns+0x59/0x72
+ remove_files+0x61/0x96
+ sysfs_remove_group+0x81/0xa4
+ sysfs_remove_groups+0x3b/0x44
+ kobject_del+0x44/0x94
+ blk_mq_unregister_dev+0x83/0xdd
+ blk_unregister_queue+0xa0/0x10b
+ del_gendisk+0x259/0x3fa
+ null_del_dev+0x8b/0x1c3 [null_blk]
+ null_exit+0x5c/0x95 [null_blk]
+ __se_sys_delete_module+0x204/0x337
+ do_syscall_64+0xa7/0x295
+ entry_SYSCALL_64_after_hwframe+0x49/0xbe
+
+ other info that might help us debug this:
+
+ Possible unsafe locking scenario:
+
+ CPU0 CPU1
+ ---- ----
+ lock(&q->sysfs_lock);
+ lock(kn->count#202);
+ lock(&q->sysfs_lock);
+ lock(kn->count#202);
+
+ *** DEADLOCK ***
+
+ 2 locks held by rmmod/777:
+ #0: 00000000e69bd9de (&lock){+.+.}, at: null_exit+0x2e/0x95 [null_blk]
+ #1: 00000000fb16ae21 (&q->sysfs_lock){+.+.}, at: blk_unregister_queue+0x78/0x10b
+
+ stack backtrace:
+ CPU: 0 PID: 777 Comm: rmmod Not tainted 5.3.0-rc3-00044-g73277fc75ea0 #1380
+ Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ?-20180724_192412-buildhw-07.phx4
+ Call Trace:
+ dump_stack+0x9a/0xe6
+ check_noncircular+0x207/0x251
+ ? print_circular_bug+0x32a/0x32a
+ ? find_usage_backwards+0x84/0xb0
+ check_prev_add+0x5d2/0xc45
+ validate_chain+0xed3/0xf94
+ ? check_prev_add+0xc45/0xc45
+ ? mark_lock+0x11b/0x804
+ ? check_usage_forwards+0x1ca/0x1ca
+ __lock_acquire+0x95f/0xa2f
+ lock_acquire+0x1b4/0x1e8
+ ? kernfs_remove_by_name_ns+0x59/0x72
+ __kernfs_remove+0x237/0x40b
+ ? kernfs_remove_by_name_ns+0x59/0x72
+ ? kernfs_next_descendant_post+0x7d/0x7d
+ ? strlen+0x10/0x23
+ ? strcmp+0x22/0x44
+ kernfs_remove_by_name_ns+0x59/0x72
+ remove_files+0x61/0x96
+ sysfs_remove_group+0x81/0xa4
+ sysfs_remove_groups+0x3b/0x44
+ kobject_del+0x44/0x94
+ blk_mq_unregister_dev+0x83/0xdd
+ blk_unregister_queue+0xa0/0x10b
+ del_gendisk+0x259/0x3fa
+ ? disk_events_poll_msecs_store+0x12b/0x12b
+ ? check_flags+0x1ea/0x204
+ ? mark_held_locks+0x1f/0x7a
+ null_del_dev+0x8b/0x1c3 [null_blk]
+ null_exit+0x5c/0x95 [null_blk]
+ __se_sys_delete_module+0x204/0x337
+ ? free_module+0x39f/0x39f
+ ? blkcg_maybe_throttle_current+0x8a/0x718
+ ? rwlock_bug+0x62/0x62
+ ? __blkcg_punt_bio_submit+0xd0/0xd0
+ ? trace_hardirqs_on_thunk+0x1a/0x20
+ ? mark_held_locks+0x1f/0x7a
+ ? do_syscall_64+0x4c/0x295
+ do_syscall_64+0xa7/0x295
+ entry_SYSCALL_64_after_hwframe+0x49/0xbe
+ RIP: 0033:0x7fb696cdbe6b
+ Code: 73 01 c3 48 8b 0d 1d 20 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 008
+ RSP: 002b:00007ffec9588788 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
+ RAX: ffffffffffffffda RBX: 0000559e589137c0 RCX: 00007fb696cdbe6b
+ RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559e58913828
+ RBP: 0000000000000000 R08: 00007ffec9587701 R09: 0000000000000000
+ R10: 00007fb696d4eae0 R11: 0000000000000206 R12: 00007ffec95889b0
+ R13: 00007ffec95896b3 R14: 0000559e58913260 R15: 0000559e589137c0
+
+Cc: Christoph Hellwig <hch@infradead.org>
+Cc: Hannes Reinecke <hare@suse.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: Mike Snitzer <snitzer@redhat.com>
+Reviewed-by: Bart Van Assche <bvanassche@acm.org>
+Signed-off-by: Ming Lei <ming.lei@redhat.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+(jwang:cherry picked from commit cecf5d87ff2035127bb5a9ee054d0023a4a7cad3,
+adjust ctx for 4,19)
+Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ block/blk-core.c | 1
+ block/blk-mq-sysfs.c | 12 ++++-----
+ block/blk-sysfs.c | 44 +++++++++++++++++++++++-------------
+ block/blk.h | 2 -
+ block/elevator.c | 59 ++++++++++++++++++++++++++++++++++++++++---------
+ include/linux/blkdev.h | 1
+ 6 files changed, 86 insertions(+), 33 deletions(-)
+
+--- a/block/blk-core.c
++++ b/block/blk-core.c
+@@ -1059,6 +1059,7 @@ struct request_queue *blk_alloc_queue_no
+ mutex_init(&q->blk_trace_mutex);
+ #endif
+ mutex_init(&q->sysfs_lock);
++ mutex_init(&q->sysfs_dir_lock);
+ spin_lock_init(&q->__queue_lock);
+
+ if (!q->mq_ops)
+--- a/block/blk-mq-sysfs.c
++++ b/block/blk-mq-sysfs.c
+@@ -264,7 +264,7 @@ void blk_mq_unregister_dev(struct device
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
+- lockdep_assert_held(&q->sysfs_lock);
++ lockdep_assert_held(&q->sysfs_dir_lock);
+
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_unregister_hctx(hctx);
+@@ -312,7 +312,7 @@ int __blk_mq_register_dev(struct device
+ int ret, i;
+
+ WARN_ON_ONCE(!q->kobj.parent);
+- lockdep_assert_held(&q->sysfs_lock);
++ lockdep_assert_held(&q->sysfs_dir_lock);
+
+ ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
+ if (ret < 0)
+@@ -358,7 +358,7 @@ void blk_mq_sysfs_unregister(struct requ
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
+- mutex_lock(&q->sysfs_lock);
++ mutex_lock(&q->sysfs_dir_lock);
+ if (!q->mq_sysfs_init_done)
+ goto unlock;
+
+@@ -366,7 +366,7 @@ void blk_mq_sysfs_unregister(struct requ
+ blk_mq_unregister_hctx(hctx);
+
+ unlock:
+- mutex_unlock(&q->sysfs_lock);
++ mutex_unlock(&q->sysfs_dir_lock);
+ }
+
+ int blk_mq_sysfs_register(struct request_queue *q)
+@@ -374,7 +374,7 @@ int blk_mq_sysfs_register(struct request
+ struct blk_mq_hw_ctx *hctx;
+ int i, ret = 0;
+
+- mutex_lock(&q->sysfs_lock);
++ mutex_lock(&q->sysfs_dir_lock);
+ if (!q->mq_sysfs_init_done)
+ goto unlock;
+
+@@ -385,7 +385,7 @@ int blk_mq_sysfs_register(struct request
+ }
+
+ unlock:
+- mutex_unlock(&q->sysfs_lock);
++ mutex_unlock(&q->sysfs_dir_lock);
+
+ return ret;
+ }
+--- a/block/blk-sysfs.c
++++ b/block/blk-sysfs.c
+@@ -892,6 +892,7 @@ int blk_register_queue(struct gendisk *d
+ int ret;
+ struct device *dev = disk_to_dev(disk);
+ struct request_queue *q = disk->queue;
++ bool has_elevator = false;
+
+ if (WARN_ON(!q))
+ return -ENXIO;
+@@ -899,7 +900,6 @@ int blk_register_queue(struct gendisk *d
+ WARN_ONCE(blk_queue_registered(q),
+ "%s is registering an already registered queue\n",
+ kobject_name(&dev->kobj));
+- queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q);
+
+ /*
+ * SCSI probing may synchronously create and destroy a lot of
+@@ -920,8 +920,7 @@ int blk_register_queue(struct gendisk *d
+ if (ret)
+ return ret;
+
+- /* Prevent changes through sysfs until registration is completed. */
+- mutex_lock(&q->sysfs_lock);
++ mutex_lock(&q->sysfs_dir_lock);
+
+ ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
+ if (ret < 0) {
+@@ -934,26 +933,36 @@ int blk_register_queue(struct gendisk *d
+ blk_mq_debugfs_register(q);
+ }
+
+- kobject_uevent(&q->kobj, KOBJ_ADD);
+-
+- wbt_enable_default(q);
+-
+- blk_throtl_register_queue(q);
+-
++ /*
++ * The flag of QUEUE_FLAG_REGISTERED isn't set yet, so elevator
++ * switch won't happen at all.
++ */
+ if (q->request_fn || (q->mq_ops && q->elevator)) {
+- ret = elv_register_queue(q);
++ ret = elv_register_queue(q, false);
+ if (ret) {
+- mutex_unlock(&q->sysfs_lock);
+- kobject_uevent(&q->kobj, KOBJ_REMOVE);
++ mutex_unlock(&q->sysfs_dir_lock);
+ kobject_del(&q->kobj);
+ blk_trace_remove_sysfs(dev);
+ kobject_put(&dev->kobj);
+ return ret;
+ }
++ has_elevator = true;
+ }
++
++ mutex_lock(&q->sysfs_lock);
++ blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
++ wbt_enable_default(q);
++ blk_throtl_register_queue(q);
++
++ /* Now everything is ready and send out KOBJ_ADD uevent */
++ kobject_uevent(&q->kobj, KOBJ_ADD);
++ if (has_elevator)
++ kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
++ mutex_unlock(&q->sysfs_lock);
++
+ ret = 0;
+ unlock:
+- mutex_unlock(&q->sysfs_lock);
++ mutex_unlock(&q->sysfs_dir_lock);
+ return ret;
+ }
+ EXPORT_SYMBOL_GPL(blk_register_queue);
+@@ -968,6 +977,7 @@ EXPORT_SYMBOL_GPL(blk_register_queue);
+ void blk_unregister_queue(struct gendisk *disk)
+ {
+ struct request_queue *q = disk->queue;
++ bool has_elevator;
+
+ if (WARN_ON(!q))
+ return;
+@@ -982,25 +992,27 @@ void blk_unregister_queue(struct gendisk
+ * concurrent elv_iosched_store() calls.
+ */
+ mutex_lock(&q->sysfs_lock);
+-
+ blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
++ has_elevator = !!q->elevator;
++ mutex_unlock(&q->sysfs_lock);
+
++ mutex_lock(&q->sysfs_dir_lock);
+ /*
+ * Remove the sysfs attributes before unregistering the queue data
+ * structures that can be modified through sysfs.
+ */
+ if (q->mq_ops)
+ blk_mq_unregister_dev(disk_to_dev(disk), q);
+- mutex_unlock(&q->sysfs_lock);
+
+ kobject_uevent(&q->kobj, KOBJ_REMOVE);
+ kobject_del(&q->kobj);
+ blk_trace_remove_sysfs(disk_to_dev(disk));
+
+ mutex_lock(&q->sysfs_lock);
+- if (q->request_fn || (q->mq_ops && q->elevator))
++ if (q->request_fn || has_elevator)
+ elv_unregister_queue(q);
+ mutex_unlock(&q->sysfs_lock);
++ mutex_unlock(&q->sysfs_dir_lock);
+
+ kobject_put(&disk_to_dev(disk)->kobj);
+ }
+--- a/block/blk.h
++++ b/block/blk.h
+@@ -244,7 +244,7 @@ int elevator_init_mq(struct request_queu
+ int elevator_switch_mq(struct request_queue *q,
+ struct elevator_type *new_e);
+ void elevator_exit(struct request_queue *, struct elevator_queue *);
+-int elv_register_queue(struct request_queue *q);
++int elv_register_queue(struct request_queue *q, bool uevent);
+ void elv_unregister_queue(struct request_queue *q);
+
+ struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
+--- a/block/elevator.c
++++ b/block/elevator.c
+@@ -833,13 +833,16 @@ static struct kobj_type elv_ktype = {
+ .release = elevator_release,
+ };
+
+-int elv_register_queue(struct request_queue *q)
++/*
++ * elv_register_queue is called from either blk_register_queue or
++ * elevator_switch, elevator switch is prevented from being happen
++ * in the two paths, so it is safe to not hold q->sysfs_lock.
++ */
++int elv_register_queue(struct request_queue *q, bool uevent)
+ {
+ struct elevator_queue *e = q->elevator;
+ int error;
+
+- lockdep_assert_held(&q->sysfs_lock);
+-
+ error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
+ if (!error) {
+ struct elv_fs_entry *attr = e->type->elevator_attrs;
+@@ -850,26 +853,36 @@ int elv_register_queue(struct request_qu
+ attr++;
+ }
+ }
+- kobject_uevent(&e->kobj, KOBJ_ADD);
++ if (uevent)
++ kobject_uevent(&e->kobj, KOBJ_ADD);
++
++ mutex_lock(&q->sysfs_lock);
+ e->registered = 1;
+ if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn)
+ e->type->ops.sq.elevator_registered_fn(q);
++ mutex_unlock(&q->sysfs_lock);
+ }
+ return error;
+ }
+
++/*
++ * elv_unregister_queue is called from either blk_unregister_queue or
++ * elevator_switch, elevator switch is prevented from being happen
++ * in the two paths, so it is safe to not hold q->sysfs_lock.
++ */
+ void elv_unregister_queue(struct request_queue *q)
+ {
+- lockdep_assert_held(&q->sysfs_lock);
+-
+ if (q) {
+ struct elevator_queue *e = q->elevator;
+
+ kobject_uevent(&e->kobj, KOBJ_REMOVE);
+ kobject_del(&e->kobj);
++
++ mutex_lock(&q->sysfs_lock);
+ e->registered = 0;
+ /* Re-enable throttling in case elevator disabled it */
+ wbt_enable_default(q);
++ mutex_unlock(&q->sysfs_lock);
+ }
+ }
+
+@@ -940,10 +953,32 @@ int elevator_switch_mq(struct request_qu
+ lockdep_assert_held(&q->sysfs_lock);
+
+ if (q->elevator) {
+- if (q->elevator->registered)
++ if (q->elevator->registered) {
++ mutex_unlock(&q->sysfs_lock);
++
++ /*
++ * Concurrent elevator switch can't happen becasue
++ * sysfs write is always exclusively on same file.
++ *
++ * Also the elevator queue won't be freed after
++ * sysfs_lock is released becasue kobject_del() in
++ * blk_unregister_queue() waits for completion of
++ * .store & .show on its attributes.
++ */
+ elv_unregister_queue(q);
++
++ mutex_lock(&q->sysfs_lock);
++ }
+ ioc_clear_queue(q);
+ elevator_exit(q, q->elevator);
++
++ /*
++ * sysfs_lock may be dropped, so re-check if queue is
++ * unregistered. If yes, don't switch to new elevator
++ * any more
++ */
++ if (!blk_queue_registered(q))
++ return 0;
+ }
+
+ ret = blk_mq_init_sched(q, new_e);
+@@ -951,7 +986,11 @@ int elevator_switch_mq(struct request_qu
+ goto out;
+
+ if (new_e) {
+- ret = elv_register_queue(q);
++ mutex_unlock(&q->sysfs_lock);
++
++ ret = elv_register_queue(q, true);
++
++ mutex_lock(&q->sysfs_lock);
+ if (ret) {
+ elevator_exit(q, q->elevator);
+ goto out;
+@@ -1047,7 +1086,7 @@ static int elevator_switch(struct reques
+ if (err)
+ goto fail_init;
+
+- err = elv_register_queue(q);
++ err = elv_register_queue(q, true);
+ if (err)
+ goto fail_register;
+
+@@ -1067,7 +1106,7 @@ fail_init:
+ /* switch failed, restore and re-register old elevator */
+ if (old) {
+ q->elevator = old;
+- elv_register_queue(q);
++ elv_register_queue(q, true);
+ blk_queue_bypass_end(q);
+ }
+
+--- a/include/linux/blkdev.h
++++ b/include/linux/blkdev.h
+@@ -637,6 +637,7 @@ struct request_queue {
+ struct delayed_work requeue_work;
+
+ struct mutex sysfs_lock;
++ struct mutex sysfs_dir_lock;
+
+ int bypass_depth;
+ atomic_t mq_freeze_depth;