--- /dev/null
+From e8b8344de3980709080d86c157d24e7de07d70ad Mon Sep 17 00:00:00 2001
+From: Yu Kuai <yukuai3@huawei.com>
+Date: Fri, 29 Nov 2024 17:15:09 +0800
+Subject: block, bfq: fix bfqq uaf in bfq_limit_depth()
+
+From: Yu Kuai <yukuai3@huawei.com>
+
+commit e8b8344de3980709080d86c157d24e7de07d70ad upstream.
+
+Setting a newly allocated bfqq in the bic and removing a freed bfqq from
+the bic are both protected by bfqd->lock; however, bfq_limit_depth()
+dereferences bfqq from the bic without holding the lock, which can lead
+to a UAF if the io_context is shared by multiple tasks.
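+
+A minimal sketch of the race (illustrative pseudo-C only, with the call
+chains reduced to the relevant accesses; see the stacks below for the
+real paths):
+
+	/* task A: detaches bfqq from the bic (merge/split/exit), under bfqd->lock */
+	spin_lock_irq(&bfqd->lock);
+	bic_set_bfqq(bic, new_bfqq, true, act_idx);
+	bfq_put_queue(bfqq);			/* last ref dropped: bfqq is freed */
+	spin_unlock_irq(&bfqd->lock);
+
+	/* task B: bfq_limit_depth(), before this fix, without bfqd->lock */
+	bfqq = bic_to_bfqq(bic, op_is_sync(opf), act_idx);
+	if (bfqq && bfqq_request_over_limit(bfqq, limit))	/* use-after-free */
+		depth = 1;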
+
+For example, testing bfq with io_uring can trigger the following UAF in v6.6:
+
+==================================================================
+BUG: KASAN: slab-use-after-free in bfqq_group+0x15/0x50
+
+Call Trace:
+ <TASK>
+ dump_stack_lvl+0x47/0x80
+ print_address_description.constprop.0+0x66/0x300
+ print_report+0x3e/0x70
+ kasan_report+0xb4/0xf0
+ bfqq_group+0x15/0x50
+ bfqq_request_over_limit+0x130/0x9a0
+ bfq_limit_depth+0x1b5/0x480
+ __blk_mq_alloc_requests+0x2b5/0xa00
+ blk_mq_get_new_requests+0x11d/0x1d0
+ blk_mq_submit_bio+0x286/0xb00
+ submit_bio_noacct_nocheck+0x331/0x400
+ __block_write_full_folio+0x3d0/0x640
+ writepage_cb+0x3b/0xc0
+ write_cache_pages+0x254/0x6c0
+ do_writepages+0x192/0x310
+ filemap_fdatawrite_wbc+0x95/0xc0
+ __filemap_fdatawrite_range+0x99/0xd0
+ filemap_write_and_wait_range.part.0+0x4d/0xa0
+ blkdev_read_iter+0xef/0x1e0
+ io_read+0x1b6/0x8a0
+ io_issue_sqe+0x87/0x300
+ io_wq_submit_work+0xeb/0x390
+ io_worker_handle_work+0x24d/0x550
+ io_wq_worker+0x27f/0x6c0
+ ret_from_fork_asm+0x1b/0x30
+ </TASK>
+
+Allocated by task 808602:
+ kasan_save_stack+0x1e/0x40
+ kasan_set_track+0x21/0x30
+ __kasan_slab_alloc+0x83/0x90
+ kmem_cache_alloc_node+0x1b1/0x6d0
+ bfq_get_queue+0x138/0xfa0
+ bfq_get_bfqq_handle_split+0xe3/0x2c0
+ bfq_init_rq+0x196/0xbb0
+ bfq_insert_request.isra.0+0xb5/0x480
+ bfq_insert_requests+0x156/0x180
+ blk_mq_insert_request+0x15d/0x440
+ blk_mq_submit_bio+0x8a4/0xb00
+ submit_bio_noacct_nocheck+0x331/0x400
+ __blkdev_direct_IO_async+0x2dd/0x330
+ blkdev_write_iter+0x39a/0x450
+ io_write+0x22a/0x840
+ io_issue_sqe+0x87/0x300
+ io_wq_submit_work+0xeb/0x390
+ io_worker_handle_work+0x24d/0x550
+ io_wq_worker+0x27f/0x6c0
+ ret_from_fork+0x2d/0x50
+ ret_from_fork_asm+0x1b/0x30
+
+Freed by task 808589:
+ kasan_save_stack+0x1e/0x40
+ kasan_set_track+0x21/0x30
+ kasan_save_free_info+0x27/0x40
+ __kasan_slab_free+0x126/0x1b0
+ kmem_cache_free+0x10c/0x750
+ bfq_put_queue+0x2dd/0x770
+ __bfq_insert_request.isra.0+0x155/0x7a0
+ bfq_insert_request.isra.0+0x122/0x480
+ bfq_insert_requests+0x156/0x180
+ blk_mq_dispatch_plug_list+0x528/0x7e0
+ blk_mq_flush_plug_list.part.0+0xe5/0x590
+ __blk_flush_plug+0x3b/0x90
+ blk_finish_plug+0x40/0x60
+ do_writepages+0x19d/0x310
+ filemap_fdatawrite_wbc+0x95/0xc0
+ __filemap_fdatawrite_range+0x99/0xd0
+ filemap_write_and_wait_range.part.0+0x4d/0xa0
+ blkdev_read_iter+0xef/0x1e0
+ io_read+0x1b6/0x8a0
+ io_issue_sqe+0x87/0x300
+ io_wq_submit_work+0xeb/0x390
+ io_worker_handle_work+0x24d/0x550
+ io_wq_worker+0x27f/0x6c0
+ ret_from_fork+0x2d/0x50
+ ret_from_fork_asm+0x1b/0x30
+
+Fix the problem by protecting bic_to_bfqq() with bfqd->lock.
+
+CC: Jan Kara <jack@suse.cz>
+Fixes: 76f1df88bbc2 ("bfq: Limit number of requests consumed by each cgroup")
+Signed-off-by: Yu Kuai <yukuai3@huawei.com>
+Link: https://lore.kernel.org/r/20241129091509.2227136-1-yukuai1@huaweicloud.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Hagar Hemdan <hagarhem@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ block/bfq-iosched.c | 37 ++++++++++++++++++++++++-------------
+ 1 file changed, 24 insertions(+), 13 deletions(-)
+
+--- a/block/bfq-iosched.c
++++ b/block/bfq-iosched.c
+@@ -581,23 +581,31 @@ static struct request *bfq_choose_req(st
+ #define BFQ_LIMIT_INLINE_DEPTH 16
+
+ #ifdef CONFIG_BFQ_GROUP_IOSCHED
+-static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
++static bool bfqq_request_over_limit(struct bfq_data *bfqd,
++ struct bfq_io_cq *bic, blk_opf_t opf,
++ unsigned int act_idx, int limit)
+ {
+- struct bfq_data *bfqd = bfqq->bfqd;
+- struct bfq_entity *entity = &bfqq->entity;
+ struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH];
+ struct bfq_entity **entities = inline_entities;
+- int depth, level, alloc_depth = BFQ_LIMIT_INLINE_DEPTH;
+- int class_idx = bfqq->ioprio_class - 1;
++ int alloc_depth = BFQ_LIMIT_INLINE_DEPTH;
+ struct bfq_sched_data *sched_data;
++ struct bfq_entity *entity;
++ struct bfq_queue *bfqq;
+ unsigned long wsum;
+ bool ret = false;
+-
+- if (!entity->on_st_or_in_serv)
+- return false;
++ int depth;
++ int level;
+
+ retry:
+ spin_lock_irq(&bfqd->lock);
++ bfqq = bic_to_bfqq(bic, op_is_sync(opf), act_idx);
++ if (!bfqq)
++ goto out;
++
++ entity = &bfqq->entity;
++ if (!entity->on_st_or_in_serv)
++ goto out;
++
+ /* +1 for bfqq entity, root cgroup not included */
+ depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1;
+ if (depth > alloc_depth) {
+@@ -642,7 +650,7 @@ retry:
+ * class.
+ */
+ wsum = 0;
+- for (i = 0; i <= class_idx; i++) {
++ for (i = 0; i <= bfqq->ioprio_class - 1; i++) {
+ wsum = wsum * IOPRIO_BE_NR +
+ sched_data->service_tree[i].wsum;
+ }
+@@ -665,7 +673,9 @@ out:
+ return ret;
+ }
+ #else
+-static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
++static bool bfqq_request_over_limit(struct bfq_data *bfqd,
++ struct bfq_io_cq *bic, blk_opf_t opf,
++ unsigned int act_idx, int limit)
+ {
+ return false;
+ }
+@@ -703,8 +713,9 @@ static void bfq_limit_depth(blk_opf_t op
+ }
+
+ for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) {
+- struct bfq_queue *bfqq =
+- bic_to_bfqq(bic, op_is_sync(opf), act_idx);
++ /* Fast path to check if bfqq is already allocated. */
++ if (!bic_to_bfqq(bic, op_is_sync(opf), act_idx))
++ continue;
+
+ /*
+ * Does queue (or any parent entity) exceed number of
+@@ -712,7 +723,7 @@ static void bfq_limit_depth(blk_opf_t op
+ * limit depth so that it cannot consume more
+ * available requests and thus starve other entities.
+ */
+- if (bfqq && bfqq_request_over_limit(bfqq, limit)) {
++ if (bfqq_request_over_limit(bfqd, bic, opf, act_idx, limit)) {
+ depth = 1;
+ break;
+ }
--- /dev/null
+From 9778369a2d6c5ed2b81a04164c4aa9da1bdb193d Mon Sep 17 00:00:00 2001
+From: Paolo Valente <paolo.valente@linaro.org>
+Date: Tue, 3 Jan 2023 15:54:56 +0100
+Subject: block, bfq: split sync bfq_queues on a per-actuator basis
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Paolo Valente <paolo.valente@linaro.org>
+
+commit 9778369a2d6c5ed2b81a04164c4aa9da1bdb193d upstream.
+
+Single-LUN multi-actuator SCSI drives, as well as all multi-actuator
+SATA drives appear as a single device to the I/O subsystem [1]. Yet
+they address commands to different actuators internally, as a function
+of Logical Block Addressing (LBAs). A given sector is reachable by
+only one of the actuators. For example, Seagate’s Serial Advanced
+Technology Attachment (SATA) version contains two actuators and maps
+the lower half of the SATA LBA space to the lower actuator and the
+upper half to the upper actuator.
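+
+As a rough illustration of such a mapping (a hypothetical helper, not
+part of this patch, which for now keeps bfq_actuator_index() hardwired
+to return 0), a dual-actuator drive that splits its LBA space evenly
+could be modelled as:
+
+	/* lower half of the LBA space -> actuator 0, upper half -> actuator 1 */
+	static unsigned int lba_to_actuator(sector_t lba, sector_t capacity)
+	{
+		return lba < capacity / 2 ? 0 : 1;
+	}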
+
+Evidently, to fully utilize actuators, no actuator must be left idle
+or underutilized while there is pending I/O for it. The block layer
+must somehow control the load of each actuator individually. This
+commit lays the ground for allowing BFQ to provide such a per-actuator
+control.
+
+BFQ associates an I/O-request sync bfq_queue with each process doing
+synchronous I/O, or with a group of processes, in case of queue
+merging. Then BFQ serves one bfq_queue at a time. While in service, a
+bfq_queue is emptied in request-position order. Yet the same process,
+or group of processes, may generate I/O for different actuators. In
+this case, different streams of I/O (each for a different actuator)
+all get inserted into the same sync bfq_queue. So there is basically
+no individual control on when each stream is served, i.e., on when the
+I/O requests of the stream are picked from the bfq_queue and
+dispatched to the drive.
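+
+This can be seen in the pre-patch data structures (quoted from the
+removed lines in the hunks below): the bic keeps a single queue pointer
+per direction, whatever actuator the I/O targets:
+
+	struct bfq_queue *bfqq[2];	/* in bfq_io_cq: one async + one sync queue */
+
+	return bic->bfqq[is_sync];	/* bic_to_bfqq(): same queue for any actuator */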
+
+This commit enables BFQ to control the service of each actuator
+individually for synchronous I/O, by simply splitting each sync
+bfq_queue into N queues, one for each actuator. In other words, a sync
+bfq_queue is now associated with a pair (process, actuator). As a
+consequence of this split, the per-queue proportional-share policy
+implemented by BFQ will guarantee that the sync I/O generated for each
+actuator, by each process, receives its fair share of service.
+
+This is just a preparatory patch. If the I/O of the same process
+happens to be sent to different queues, then each of these queues may
+undergo queue merging. To handle this event, the bfq_io_cq data
+structure must be properly extended. In addition, stable merging must
+be disabled to avoid loss of control on individual actuators. Finally,
+also async queues must be split. These issues are described in detail
+and addressed in next commits. As for this commit, although multiple
+per-process bfq_queues are provided, the I/O of each process or group
+of processes is still sent to only one queue, regardless of the
+actuator the I/O is for. The forwarding to distinct bfq_queues will be
+enabled after addressing the above issues.
+
+[1] https://www.linaro.org/blog/budget-fair-queueing-bfq-linux-io-scheduler-optimizations-for-multi-actuator-sata-hard-drives/
+
+Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
+Signed-off-by: Gabriele Felici <felicigb@gmail.com>
+Signed-off-by: Carmine Zaccagnino <carmine@carminezacc.com>
+Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
+Link: https://lore.kernel.org/r/20230103145503.71712-2-paolo.valente@linaro.org
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Stable-dep-of: e8b8344de398 ("block, bfq: fix bfqq uaf in bfq_limit_depth()")
+[Hagar: needed contextual fixes]
+Signed-off-by: Hagar Hemdan <hagarhem@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ block/bfq-cgroup.c | 97 ++++++++++++++++---------------
+ block/bfq-iosched.c | 160 ++++++++++++++++++++++++++++++++++------------------
+ block/bfq-iosched.h | 51 +++++++++++++---
+ 3 files changed, 197 insertions(+), 111 deletions(-)
+
+--- a/block/bfq-cgroup.c
++++ b/block/bfq-cgroup.c
+@@ -704,6 +704,46 @@ void bfq_bfqq_move(struct bfq_data *bfqd
+ bfq_put_queue(bfqq);
+ }
+
++static void bfq_sync_bfqq_move(struct bfq_data *bfqd,
++ struct bfq_queue *sync_bfqq,
++ struct bfq_io_cq *bic,
++ struct bfq_group *bfqg,
++ unsigned int act_idx)
++{
++ struct bfq_queue *bfqq;
++
++ if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) {
++ /* We are the only user of this bfqq, just move it */
++ if (sync_bfqq->entity.sched_data != &bfqg->sched_data)
++ bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
++ return;
++ }
++
++ /*
++ * The queue was merged to a different queue. Check
++ * that the merge chain still belongs to the same
++ * cgroup.
++ */
++ for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq)
++ if (bfqq->entity.sched_data != &bfqg->sched_data)
++ break;
++ if (bfqq) {
++ /*
++ * Some queue changed cgroup so the merge is not valid
++ * anymore. We cannot easily just cancel the merge (by
++ * clearing new_bfqq) as there may be other processes
++ * using this queue and holding refs to all queues
++ * below sync_bfqq->new_bfqq. Similarly if the merge
++ * already happened, we need to detach from bfqq now
++ * so that we cannot merge bio to a request from the
++ * old cgroup.
++ */
++ bfq_put_cooperator(sync_bfqq);
++ bfq_release_process_ref(bfqd, sync_bfqq);
++ bic_set_bfqq(bic, NULL, true, act_idx);
++ }
++}
++
+ /**
+ * __bfq_bic_change_cgroup - move @bic to @bfqg.
+ * @bfqd: the queue descriptor.
+@@ -714,60 +754,25 @@ void bfq_bfqq_move(struct bfq_data *bfqd
+ * sure that the reference to cgroup is valid across the call (see
+ * comments in bfq_bic_update_cgroup on this issue)
+ */
+-static void *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
++static void __bfq_bic_change_cgroup(struct bfq_data *bfqd,
+ struct bfq_io_cq *bic,
+ struct bfq_group *bfqg)
+ {
+- struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false);
+- struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true);
+- struct bfq_entity *entity;
+-
+- if (async_bfqq) {
+- entity = &async_bfqq->entity;
++ unsigned int act_idx;
+
+- if (entity->sched_data != &bfqg->sched_data) {
+- bic_set_bfqq(bic, NULL, false);
++ for (act_idx = 0; act_idx < bfqd->num_actuators; act_idx++) {
++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false, act_idx);
++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true, act_idx);
++
++ if (async_bfqq &&
++ async_bfqq->entity.sched_data != &bfqg->sched_data) {
++ bic_set_bfqq(bic, NULL, false, act_idx);
+ bfq_release_process_ref(bfqd, async_bfqq);
+ }
+- }
+
+- if (sync_bfqq) {
+- if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) {
+- /* We are the only user of this bfqq, just move it */
+- if (sync_bfqq->entity.sched_data != &bfqg->sched_data)
+- bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
+- } else {
+- struct bfq_queue *bfqq;
+-
+- /*
+- * The queue was merged to a different queue. Check
+- * that the merge chain still belongs to the same
+- * cgroup.
+- */
+- for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq)
+- if (bfqq->entity.sched_data !=
+- &bfqg->sched_data)
+- break;
+- if (bfqq) {
+- /*
+- * Some queue changed cgroup so the merge is
+- * not valid anymore. We cannot easily just
+- * cancel the merge (by clearing new_bfqq) as
+- * there may be other processes using this
+- * queue and holding refs to all queues below
+- * sync_bfqq->new_bfqq. Similarly if the merge
+- * already happened, we need to detach from
+- * bfqq now so that we cannot merge bio to a
+- * request from the old cgroup.
+- */
+- bfq_put_cooperator(sync_bfqq);
+- bic_set_bfqq(bic, NULL, true);
+- bfq_release_process_ref(bfqd, sync_bfqq);
+- }
+- }
++ if (sync_bfqq)
++ bfq_sync_bfqq_move(bfqd, sync_bfqq, bic, bfqg, act_idx);
+ }
+-
+- return bfqg;
+ }
+
+ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
+--- a/block/bfq-iosched.c
++++ b/block/bfq-iosched.c
+@@ -377,16 +377,23 @@ static const unsigned long bfq_late_stab
+ #define RQ_BIC(rq) ((struct bfq_io_cq *)((rq)->elv.priv[0]))
+ #define RQ_BFQQ(rq) ((rq)->elv.priv[1])
+
+-struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
++struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync,
++ unsigned int actuator_idx)
+ {
+- return bic->bfqq[is_sync];
++ if (is_sync)
++ return bic->bfqq[1][actuator_idx];
++
++ return bic->bfqq[0][actuator_idx];
+ }
+
+ static void bfq_put_stable_ref(struct bfq_queue *bfqq);
+
+-void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync)
++void bic_set_bfqq(struct bfq_io_cq *bic,
++ struct bfq_queue *bfqq,
++ bool is_sync,
++ unsigned int actuator_idx)
+ {
+- struct bfq_queue *old_bfqq = bic->bfqq[is_sync];
++ struct bfq_queue *old_bfqq = bic->bfqq[is_sync][actuator_idx];
+
+ /* Clear bic pointer if bfqq is detached from this bic */
+ if (old_bfqq && old_bfqq->bic == bic)
+@@ -405,7 +412,10 @@ void bic_set_bfqq(struct bfq_io_cq *bic,
+ * we cancel the stable merge if
+ * bic->stable_merge_bfqq == bfqq.
+ */
+- bic->bfqq[is_sync] = bfqq;
++ if (is_sync)
++ bic->bfqq[1][actuator_idx] = bfqq;
++ else
++ bic->bfqq[0][actuator_idx] = bfqq;
+
+ if (bfqq && bic->stable_merge_bfqq == bfqq) {
+ /*
+@@ -680,9 +690,9 @@ static void bfq_limit_depth(blk_opf_t op
+ {
+ struct bfq_data *bfqd = data->q->elevator->elevator_data;
+ struct bfq_io_cq *bic = bfq_bic_lookup(data->q);
+- struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(opf)) : NULL;
+ int depth;
+ unsigned limit = data->q->nr_requests;
++ unsigned int act_idx;
+
+ /* Sync reads have full depth available */
+ if (op_is_sync(opf) && !op_is_write(opf)) {
+@@ -692,14 +702,21 @@ static void bfq_limit_depth(blk_opf_t op
+ limit = (limit * depth) >> bfqd->full_depth_shift;
+ }
+
+- /*
+- * Does queue (or any parent entity) exceed number of requests that
+- * should be available to it? Heavily limit depth so that it cannot
+- * consume more available requests and thus starve other entities.
+- */
+- if (bfqq && bfqq_request_over_limit(bfqq, limit))
+- depth = 1;
++ for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) {
++ struct bfq_queue *bfqq =
++ bic_to_bfqq(bic, op_is_sync(opf), act_idx);
+
++ /*
++ * Does queue (or any parent entity) exceed number of
++ * requests that should be available to it? Heavily
++ * limit depth so that it cannot consume more
++ * available requests and thus starve other entities.
++ */
++ if (bfqq && bfqq_request_over_limit(bfqq, limit)) {
++ depth = 1;
++ break;
++ }
++ }
+ bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
+ __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth);
+ if (depth)
+@@ -1820,6 +1837,18 @@ static bool bfq_bfqq_higher_class_or_wei
+ return bfqq_weight > in_serv_weight;
+ }
+
++/*
++ * Get the index of the actuator that will serve bio.
++ */
++static unsigned int bfq_actuator_index(struct bfq_data *bfqd, struct bio *bio)
++{
++ /*
++ * Multi-actuator support not complete yet, so always return 0
++ * for the moment (to keep incomplete mechanisms off).
++ */
++ return 0;
++}
++
+ static bool bfq_better_to_idle(struct bfq_queue *bfqq);
+
+ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
+@@ -2150,7 +2179,7 @@ static void bfq_check_waker(struct bfq_d
+ * We reset waker detection logic also if too much time has passed
+ * since the first detection. If wakeups are rare, pointless idling
+ * doesn't hurt throughput that much. The condition below makes sure
+- * we do not uselessly idle blocking waker in more than 1/64 cases.
++ * we do not uselessly idle blocking waker in more than 1/64 cases.
+ */
+ if (bfqd->last_completed_rq_bfqq !=
+ bfqq->tentative_waker_bfqq ||
+@@ -2486,7 +2515,8 @@ static bool bfq_bio_merge(struct request
+ */
+ bfq_bic_update_cgroup(bic, bio);
+
+- bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
++ bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf),
++ bfq_actuator_index(bfqd, bio));
+ } else {
+ bfqd->bio_bfqq = NULL;
+ }
+@@ -3188,7 +3218,7 @@ static struct bfq_queue *bfq_merge_bfqqs
+ /*
+ * Merge queues (that is, let bic redirect its requests to new_bfqq)
+ */
+- bic_set_bfqq(bic, new_bfqq, true);
++ bic_set_bfqq(bic, new_bfqq, true, bfqq->actuator_idx);
+ bfq_mark_bfqq_coop(new_bfqq);
+ /*
+ * new_bfqq now belongs to at least two bics (it is a shared queue):
+@@ -4818,11 +4848,8 @@ check_queue:
+ */
+ if (bfq_bfqq_wait_request(bfqq) ||
+ (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
+- struct bfq_queue *async_bfqq =
+- bfqq->bic && bfqq->bic->bfqq[0] &&
+- bfq_bfqq_busy(bfqq->bic->bfqq[0]) &&
+- bfqq->bic->bfqq[0]->next_rq ?
+- bfqq->bic->bfqq[0] : NULL;
++ unsigned int act_idx = bfqq->actuator_idx;
++ struct bfq_queue *async_bfqq = NULL;
+ struct bfq_queue *blocked_bfqq =
+ !hlist_empty(&bfqq->woken_list) ?
+ container_of(bfqq->woken_list.first,
+@@ -4830,6 +4857,10 @@ check_queue:
+ woken_list_node)
+ : NULL;
+
++ if (bfqq->bic && bfqq->bic->bfqq[0][act_idx] &&
++ bfq_bfqq_busy(bfqq->bic->bfqq[0][act_idx]) &&
++ bfqq->bic->bfqq[0][act_idx]->next_rq)
++ async_bfqq = bfqq->bic->bfqq[0][act_idx];
+ /*
+ * The next four mutually-exclusive ifs decide
+ * whether to try injection, and choose the queue to
+@@ -4914,7 +4945,7 @@ check_queue:
+ icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic &&
+ bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <=
+ bfq_bfqq_budget_left(async_bfqq))
+- bfqq = bfqq->bic->bfqq[0];
++ bfqq = bfqq->bic->bfqq[0][act_idx];
+ else if (bfqq->waker_bfqq &&
+ bfq_bfqq_busy(bfqq->waker_bfqq) &&
+ bfqq->waker_bfqq->next_rq &&
+@@ -5375,48 +5406,54 @@ static void bfq_exit_bfqq(struct bfq_dat
+ bfq_release_process_ref(bfqd, bfqq);
+ }
+
+-static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
++static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync,
++ unsigned int actuator_idx)
+ {
+- struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
++ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, actuator_idx);
+ struct bfq_data *bfqd;
+
+ if (bfqq)
+ bfqd = bfqq->bfqd; /* NULL if scheduler already exited */
+
+ if (bfqq && bfqd) {
+- unsigned long flags;
+-
+- spin_lock_irqsave(&bfqd->lock, flags);
+- bic_set_bfqq(bic, NULL, is_sync);
++ bic_set_bfqq(bic, NULL, is_sync, actuator_idx);
+ bfq_exit_bfqq(bfqd, bfqq);
+- spin_unlock_irqrestore(&bfqd->lock, flags);
+ }
+ }
+
+ static void bfq_exit_icq(struct io_cq *icq)
+ {
+ struct bfq_io_cq *bic = icq_to_bic(icq);
++ struct bfq_data *bfqd = bic_to_bfqd(bic);
++ unsigned long flags;
++ unsigned int act_idx;
++ /*
++ * If bfqd and thus bfqd->num_actuators is not available any
++ * longer, then cycle over all possible per-actuator bfqqs in
++ * next loop. We rely on bic being zeroed on creation, and
++ * therefore on its unused per-actuator fields being NULL.
++ */
++ unsigned int num_actuators = BFQ_MAX_ACTUATORS;
+
+- if (bic->stable_merge_bfqq) {
+- struct bfq_data *bfqd = bic->stable_merge_bfqq->bfqd;
++ /*
++ * bfqd is NULL if scheduler already exited, and in that case
++ * this is the last time these queues are accessed.
++ */
++ if (bfqd) {
++ spin_lock_irqsave(&bfqd->lock, flags);
++ num_actuators = bfqd->num_actuators;
++ }
+
+- /*
+- * bfqd is NULL if scheduler already exited, and in
+- * that case this is the last time bfqq is accessed.
+- */
+- if (bfqd) {
+- unsigned long flags;
++ if (bic->stable_merge_bfqq)
++ bfq_put_stable_ref(bic->stable_merge_bfqq);
+
+- spin_lock_irqsave(&bfqd->lock, flags);
+- bfq_put_stable_ref(bic->stable_merge_bfqq);
+- spin_unlock_irqrestore(&bfqd->lock, flags);
+- } else {
+- bfq_put_stable_ref(bic->stable_merge_bfqq);
+- }
++ for (act_idx = 0; act_idx < num_actuators; act_idx++) {
++ bfq_exit_icq_bfqq(bic, true, act_idx);
++ bfq_exit_icq_bfqq(bic, false, act_idx);
+ }
+
+- bfq_exit_icq_bfqq(bic, true);
+- bfq_exit_icq_bfqq(bic, false);
++ if (bfqd)
++ spin_unlock_irqrestore(&bfqd->lock, flags);
+ }
+
+ /*
+@@ -5493,25 +5530,27 @@ static void bfq_check_ioprio_change(stru
+
+ bic->ioprio = ioprio;
+
+- bfqq = bic_to_bfqq(bic, false);
++ bfqq = bic_to_bfqq(bic, false, bfq_actuator_index(bfqd, bio));
+ if (bfqq) {
+ struct bfq_queue *old_bfqq = bfqq;
+
+ bfqq = bfq_get_queue(bfqd, bio, false, bic, true);
+- bic_set_bfqq(bic, bfqq, false);
++ bic_set_bfqq(bic, bfqq, false, bfq_actuator_index(bfqd, bio));
+ bfq_release_process_ref(bfqd, old_bfqq);
+ }
+
+- bfqq = bic_to_bfqq(bic, true);
++ bfqq = bic_to_bfqq(bic, true, bfq_actuator_index(bfqd, bio));
+ if (bfqq)
+ bfq_set_next_ioprio_data(bfqq, bic);
+ }
+
+ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+- struct bfq_io_cq *bic, pid_t pid, int is_sync)
++ struct bfq_io_cq *bic, pid_t pid, int is_sync,
++ unsigned int act_idx)
+ {
+ u64 now_ns = ktime_get_ns();
+
++ bfqq->actuator_idx = act_idx;
+ RB_CLEAR_NODE(&bfqq->entity.rb_node);
+ INIT_LIST_HEAD(&bfqq->fifo);
+ INIT_HLIST_NODE(&bfqq->burst_list_node);
+@@ -5762,7 +5801,7 @@ static struct bfq_queue *bfq_get_queue(s
+
+ if (bfqq) {
+ bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
+- is_sync);
++ is_sync, bfq_actuator_index(bfqd, bio));
+ bfq_init_entity(&bfqq->entity, bfqg);
+ bfq_log_bfqq(bfqd, bfqq, "allocated");
+ } else {
+@@ -6078,7 +6117,8 @@ static bool __bfq_insert_request(struct
+ * then complete the merge and redirect it to
+ * new_bfqq.
+ */
+- if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) {
++ if (bic_to_bfqq(RQ_BIC(rq), true,
++ bfq_actuator_index(bfqd, rq->bio)) == bfqq) {
+ while (bfqq != new_bfqq)
+ bfqq = bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq);
+ }
+@@ -6632,7 +6672,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, st
+ return bfqq;
+ }
+
+- bic_set_bfqq(bic, NULL, true);
++ bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx);
+
+ bfq_put_cooperator(bfqq);
+
+@@ -6646,7 +6686,8 @@ static struct bfq_queue *bfq_get_bfqq_ha
+ bool split, bool is_sync,
+ bool *new_queue)
+ {
+- struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
++ unsigned int act_idx = bfq_actuator_index(bfqd, bio);
++ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, act_idx);
+
+ if (likely(bfqq && bfqq != &bfqd->oom_bfqq))
+ return bfqq;
+@@ -6658,7 +6699,7 @@ static struct bfq_queue *bfq_get_bfqq_ha
+ bfq_put_queue(bfqq);
+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split);
+
+- bic_set_bfqq(bic, bfqq, is_sync);
++ bic_set_bfqq(bic, bfqq, is_sync, act_idx);
+ if (split && is_sync) {
+ if ((bic->was_in_burst_list && bfqd->large_burst) ||
+ bic->saved_in_large_burst)
+@@ -7139,8 +7180,10 @@ static int bfq_init_queue(struct request
+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
+ * Grab a permanent reference to it, so that the normal code flow
+ * will not attempt to free it.
++ * Set zero as actuator index: we will pretend that
++ * all I/O requests are for the same actuator.
+ */
+- bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0, 0);
+ bfqd->oom_bfqq.ref++;
+ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
+ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
+@@ -7159,6 +7202,13 @@ static int bfq_init_queue(struct request
+
+ bfqd->queue = q;
+
++ /*
++ * Multi-actuator support not complete yet, unconditionally
++ * set to only one actuator for the moment (to keep incomplete
++ * mechanisms off).
++ */
++ bfqd->num_actuators = 1;
++
+ INIT_LIST_HEAD(&bfqd->dispatch);
+
+ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
+--- a/block/bfq-iosched.h
++++ b/block/bfq-iosched.h
+@@ -33,6 +33,14 @@
+ */
+ #define BFQ_SOFTRT_WEIGHT_FACTOR 100
+
++/*
++ * Maximum number of actuators supported. This constant is used simply
++ * to define the size of the static array that will contain
++ * per-actuator data. The current value is hopefully a good upper
++ * bound to the possible number of actuators of any actual drive.
++ */
++#define BFQ_MAX_ACTUATORS 8
++
+ struct bfq_entity;
+
+ /**
+@@ -225,12 +233,14 @@ struct bfq_ttime {
+ * struct bfq_queue - leaf schedulable entity.
+ *
+ * A bfq_queue is a leaf request queue; it can be associated with an
+- * io_context or more, if it is async or shared between cooperating
+- * processes. @cgroup holds a reference to the cgroup, to be sure that it
+- * does not disappear while a bfqq still references it (mostly to avoid
+- * races between request issuing and task migration followed by cgroup
+- * destruction).
+- * All the fields are protected by the queue lock of the containing bfqd.
++ * io_context or more, if it is async or shared between cooperating
++ * processes. Besides, it contains I/O requests for only one actuator
++ * (an io_context is associated with a different bfq_queue for each
++ * actuator it generates I/O for). @cgroup holds a reference to the
++ * cgroup, to be sure that it does not disappear while a bfqq still
++ * references it (mostly to avoid races between request issuing and
++ * task migration followed by cgroup destruction). All the fields are
++ * protected by the queue lock of the containing bfqd.
+ */
+ struct bfq_queue {
+ /* reference counter */
+@@ -395,6 +405,9 @@ struct bfq_queue {
+ * the woken queues when this queue exits.
+ */
+ struct hlist_head woken_list;
++
++ /* index of the actuator this queue is associated with */
++ unsigned int actuator_idx;
+ };
+
+ /**
+@@ -403,8 +416,17 @@ struct bfq_queue {
+ struct bfq_io_cq {
+ /* associated io_cq structure */
+ struct io_cq icq; /* must be the first member */
+- /* array of two process queues, the sync and the async */
+- struct bfq_queue *bfqq[2];
++ /*
++ * Matrix of associated process queues: first row for async
++ * queues, second row sync queues. Each row contains one
++ * column for each actuator. An I/O request generated by the
++ * process is inserted into the queue pointed by bfqq[i][j] if
++ * the request is to be served by the j-th actuator of the
++ * drive, where i==0 or i==1, depending on whether the request
++ * is async or sync. So there is a distinct queue for each
++ * actuator.
++ */
++ struct bfq_queue *bfqq[2][BFQ_MAX_ACTUATORS];
+ /* per (request_queue, blkcg) ioprio */
+ int ioprio;
+ #ifdef CONFIG_BFQ_GROUP_IOSCHED
+@@ -768,6 +790,13 @@ struct bfq_data {
+ */
+ unsigned int word_depths[2][2];
+ unsigned int full_depth_shift;
++
++ /*
++ * Number of independent actuators. This is equal to 1 in
++ * case of single-actuator drives.
++ */
++ unsigned int num_actuators;
++
+ };
+
+ enum bfqq_state_flags {
+@@ -964,8 +993,10 @@ struct bfq_group {
+
+ extern const int bfq_timeout;
+
+-struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync);
+-void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
++struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync,
++ unsigned int actuator_idx);
++void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync,
++ unsigned int actuator_idx);
+ struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
+ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
--- /dev/null
+From 318e8c339c9a0891c389298bb328ed0762a9935e Mon Sep 17 00:00:00 2001
+From: Patrick Bellasi <derkling@google.com>
+Date: Wed, 5 Feb 2025 14:04:41 +0000
+Subject: x86/cpu/kvm: SRSO: Fix possible missing IBPB on VM-Exit
+
+From: Patrick Bellasi <derkling@google.com>
+
+commit 318e8c339c9a0891c389298bb328ed0762a9935e upstream.
+
+In [1] the meaning of the synthetic IBPB flags has been redefined for a
+better separation of concerns:
+ - ENTRY_IBPB -- issue IBPB on entry only
+ - IBPB_ON_VMEXIT -- issue IBPB on VM-Exit only
+and the Retbleed mitigations have been updated to match this new
+semantics.
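+
+Illustrative pseudo-C of the two IBPB sites under the new semantics (not
+the actual alternatives-patched assembly; entry_ibpb() is the common
+IBPB helper):
+
+	/* kernel entry */
+	if (cpu_feature_enabled(X86_FEATURE_ENTRY_IBPB))
+		entry_ibpb();
+
+	/* VM-Exit */
+	if (cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT))
+		entry_ibpb();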
+
+Commit [2] was merged shortly before [1], and their interaction was not
+handled properly. This resulted in IBPB not being triggered on VM-Exit
+in all SRSO mitigation configs requesting an IBPB there.
+
+Specifically, an IBPB on VM-Exit is triggered only when
+X86_FEATURE_IBPB_ON_VMEXIT is set. However:
+
+ - X86_FEATURE_IBPB_ON_VMEXIT is not set for "spec_rstack_overflow=ibpb",
+ because before [1] having X86_FEATURE_ENTRY_IBPB was enough. Hence,
+ an IBPB is triggered on entry but the expected IBPB on VM-exit is
+ not.
+
+ - X86_FEATURE_IBPB_ON_VMEXIT is also not set when
+ "spec_rstack_overflow=ibpb-vmexit" is used, if X86_FEATURE_ENTRY_IBPB
+ is already set.
+
+ That's because before [1] this was effectively redundant. Hence, e.g.
+ a "retbleed=ibpb spec_rstack_overflow=bpb-vmexit" config mistakenly
+ reports the machine still vulnerable to SRSO, despite an IBPB being
+ triggered both on entry and VM-Exit, because of the Retbleed selected
+ mitigation config.
+
+ - UNTRAIN_RET_VM still won't actually do anything unless
+ CONFIG_MITIGATION_IBPB_ENTRY is set.
+
+For "spec_rstack_overflow=ibpb", enable IBPB on both entry and VM-Exit
+and clear X86_FEATURE_RSB_VMEXIT which is made superfluous by
+X86_FEATURE_IBPB_ON_VMEXIT. This effectively makes this mitigation
+option similar to the one for 'retbleed=ibpb', so re-order the code
+for the RETBLEED_MITIGATION_IBPB option to be less confusing by doing
+all the feature enabling before disabling the features that are not needed.
+
+For "spec_rstack_overflow=ibpb-vmexit", guard this mitigation setting
+with CONFIG_MITIGATION_IBPB_ENTRY to ensure the UNTRAIN_RET_VM sequence is
+effectively compiled in. Drop instead the CONFIG_MITIGATION_SRSO guard,
+since none of the SRSO compile cruft is required in this configuration.
+Also, check only that the required microcode is present to effectively
+enable the IBPB on VM-Exit.
+
+Finally, update the Kconfig description for CONFIG_MITIGATION_IBPB_ENTRY
+to also list all SRSO config settings enabled by this guard.
+
+Fixes: 864bcaa38ee4 ("x86/cpu/kvm: Provide UNTRAIN_RET_VM") [1]
+Fixes: d893832d0e1e ("x86/srso: Add IBPB on VMEXIT") [2]
+Reported-by: Yosry Ahmed <yosryahmed@google.com>
+Signed-off-by: Patrick Bellasi <derkling@google.com>
+Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
+Cc: stable@kernel.org
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/Kconfig | 3 ++-
+ arch/x86/kernel/cpu/bugs.c | 20 ++++++++++++++------
+ 2 files changed, 16 insertions(+), 7 deletions(-)
+
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -2506,7 +2506,8 @@ config CPU_IBPB_ENTRY
+ depends on CPU_SUP_AMD && X86_64
+ default y
+ help
+- Compile the kernel with support for the retbleed=ibpb mitigation.
++ Compile the kernel with support for the retbleed=ibpb and
++ spec_rstack_overflow={ibpb,ibpb-vmexit} mitigations.
+
+ config CPU_IBRS_ENTRY
+ bool "Enable IBRS on kernel entry"
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -1092,6 +1092,8 @@ do_cmd_auto:
+
+ case RETBLEED_MITIGATION_IBPB:
+ setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
++ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
++ mitigate_smt = true;
+
+ /*
+ * IBPB on entry already obviates the need for
+@@ -1101,8 +1103,6 @@ do_cmd_auto:
+ setup_clear_cpu_cap(X86_FEATURE_UNRET);
+ setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
+
+- mitigate_smt = true;
+-
+ /*
+ * There is no need for RSB filling: entry_ibpb() ensures
+ * all predictions, including the RSB, are invalidated,
+@@ -2607,6 +2607,7 @@ static void __init srso_select_mitigatio
+ if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) {
+ if (has_microcode) {
+ setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
++ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
+ srso_mitigation = SRSO_MITIGATION_IBPB;
+
+ /*
+@@ -2616,6 +2617,13 @@ static void __init srso_select_mitigatio
+ */
+ setup_clear_cpu_cap(X86_FEATURE_UNRET);
+ setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
++
++ /*
++ * There is no need for RSB filling: entry_ibpb() ensures
++ * all predictions, including the RSB, are invalidated,
++ * regardless of IBPB implementation.
++ */
++ setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+ }
+ } else {
+ pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n");
+@@ -2624,8 +2632,8 @@ static void __init srso_select_mitigatio
+ break;
+
+ case SRSO_CMD_IBPB_ON_VMEXIT:
+- if (IS_ENABLED(CONFIG_CPU_SRSO)) {
+- if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) {
++ if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) {
++ if (has_microcode) {
+ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
+ srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
+
+@@ -2637,9 +2645,9 @@ static void __init srso_select_mitigatio
+ setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+ }
+ } else {
+- pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
++ pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n");
+ goto pred_cmd;
+- }
++ }
+ break;
+
+ default: