From: Greg Kroah-Hartman Date: Wed, 22 Dec 2021 12:10:01 +0000 (+0100) Subject: 4.19-stable patches X-Git-Tag: v4.4.297~60 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=bc617c1e047ebd5c2012cc7080ca7a0e916ded38;p=thirdparty%2Fkernel%2Fstable-queue.git 4.19-stable patches added patches: block-bfq-fix-asymmetric-scenarios-detection.patch block-bfq-fix-decrement-of-num_active_groups.patch block-bfq-fix-queue-removal-from-weights-tree.patch block-bfq-fix-use-after-free-in-bfq_bfqq_expire.patch block-bfq-improve-asymmetric-scenarios-detection.patch --- diff --git a/queue-4.19/block-bfq-fix-asymmetric-scenarios-detection.patch b/queue-4.19/block-bfq-fix-asymmetric-scenarios-detection.patch new file mode 100644 index 00000000000..4501480e841 --- /dev/null +++ b/queue-4.19/block-bfq-fix-asymmetric-scenarios-detection.patch @@ -0,0 +1,84 @@ +From foo@baz Wed Dec 22 01:08:34 PM CET 2021 +From: Yu Kuai +Date: Tue, 21 Dec 2021 19:38:46 +0800 +Subject: block, bfq: fix asymmetric scenarios detection +To: , , , +Cc: , , , +Message-ID: <20211221113849.2219126-3-yukuai3@huawei.com> + +From: Federico Motta + +commit 98fa7a3e001b21fb47c08af4304f40a3b0535cbd upstream. + +Since commit 2d29c9f89fcd ("block, bfq: improve asymmetric scenarios +detection"), a scenario is defined asymmetric when one of the +following conditions holds: +- active bfq_queues have different weights +- one or more group of entities (bfq_queue or other groups of entities) + are active +bfq grants fairness and low latency also in such asymmetric scenarios, +by plugging the dispatching of I/O if the bfq_queue in service happens +to be temporarily idle. This plugging may lower throughput, so it is +important to do it only when strictly needed. + +By mistake, in commit '2d29c9f89fcd' ("block, bfq: improve asymmetric +scenarios detection") the num_active_groups counter was firstly +incremented and subsequently decremented at any entity (group or +bfq_queue) weight change. + +This is useless, because only transitions from active to inactive and +vice versa matter for that counter. Unfortunately this is also +incorrect in the following case: the entity at issue is a bfq_queue +and it is under weight raising. In fact in this case there is a +spurious increment of the num_active_groups counter. + +This spurious increment may cause scenarios to be wrongly detected as +asymmetric, thus causing useless plugging and loss of throughput. + +This commit fixes this issue by simply removing the above useless and +wrong increments and decrements. + +Fixes: 2d29c9f89fcd ("block, bfq: improve asymmetric scenarios detection") +Tested-by: Oleksandr Natalenko +Signed-off-by: Federico Motta +Signed-off-by: Paolo Valente +Signed-off-by: Jens Axboe +Signed-off-by: Yu Kuai +Signed-off-by: Greg Kroah-Hartman +--- + block/bfq-wf2q.c | 18 ++++++------------ + 1 file changed, 6 insertions(+), 12 deletions(-) + +--- a/block/bfq-wf2q.c ++++ b/block/bfq-wf2q.c +@@ -792,24 +792,18 @@ __bfq_entity_update_weight_prio(struct b + * queue, remove the entity from its old weight counter (if + * there is a counter associated with the entity). 
+ */ +- if (prev_weight != new_weight) { +- if (bfqq) { +- root = &bfqd->queue_weights_tree; +- __bfq_weights_tree_remove(bfqd, bfqq, root); +- } else +- bfqd->num_active_groups--; ++ if (prev_weight != new_weight && bfqq) { ++ root = &bfqd->queue_weights_tree; ++ __bfq_weights_tree_remove(bfqd, bfqq, root); + } + entity->weight = new_weight; + /* + * Add the entity, if it is not a weight-raised queue, + * to the counter associated with its new weight. + */ +- if (prev_weight != new_weight) { +- if (bfqq && bfqq->wr_coeff == 1) { +- /* If we get here, root has been initialized. */ +- bfq_weights_tree_add(bfqd, bfqq, root); +- } else +- bfqd->num_active_groups++; ++ if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) { ++ /* If we get here, root has been initialized. */ ++ bfq_weights_tree_add(bfqd, bfqq, root); + } + + new_st->wsum += entity->weight; diff --git a/queue-4.19/block-bfq-fix-decrement-of-num_active_groups.patch b/queue-4.19/block-bfq-fix-decrement-of-num_active_groups.patch new file mode 100644 index 00000000000..cfd9d03057d --- /dev/null +++ b/queue-4.19/block-bfq-fix-decrement-of-num_active_groups.patch @@ -0,0 +1,270 @@ +From foo@baz Wed Dec 22 01:08:34 PM CET 2021 +From: Yu Kuai +Date: Tue, 21 Dec 2021 19:38:47 +0800 +Subject: block, bfq: fix decrement of num_active_groups +To: , , , +Cc: , , , +Message-ID: <20211221113849.2219126-4-yukuai3@huawei.com> + +From: Paolo Valente + +commit ba7aeae5539c7a7cccc4cf07a2bc61281a93c50e upstream. + +Since commit '2d29c9f89fcd ("block, bfq: improve asymmetric scenarios +detection")', if there are process groups with I/O requests waiting for +completion, then BFQ tags the scenario as 'asymmetric'. This detection +is needed for preserving service guarantees (for details, see comments +on the computation * of the variable asymmetric_scenario in the +function bfq_better_to_idle). + +Unfortunately, commit '2d29c9f89fcd ("block, bfq: improve asymmetric +scenarios detection")' contains an error exactly in the updating of +the number of groups with I/O requests waiting for completion: if a +group has more than one descendant process, then the above number of +groups, which is renamed from num_active_groups to a more appropriate +num_groups_with_pending_reqs by this commit, may happen to be wrongly +decremented multiple times, namely every time one of the descendant +processes gets all its pending I/O requests completed. + +A correct, complete solution should work as follows. Consider a group +that is inactive, i.e., that has no descendant process with pending +I/O inside BFQ queues. Then suppose that num_groups_with_pending_reqs +is still accounting for this group, because the group still has some +descendant process with some I/O request still in +flight. num_groups_with_pending_reqs should be decremented when the +in-flight request of the last descendant process is finally completed +(assuming that nothing else has changed for the group in the meantime, +in terms of composition of the group and active/inactive state of +child groups and processes). To accomplish this, an additional +pending-request counter must be added to entities, and must be +updated correctly. + +To avoid this additional field and operations, this commit resorts to +the following tradeoff between simplicity and accuracy: for an +inactive group that is still counted in num_groups_with_pending_reqs, +this commit decrements num_groups_with_pending_reqs when the first +descendant process of the group remains with no request waiting for +completion. 
+ +This simplified scheme provides a fix to the unbalanced decrements +introduced by 2d29c9f89fcd. Since this error was also caused by lack +of comments on this non-trivial issue, this commit also adds related +comments. + +Fixes: 2d29c9f89fcd ("block, bfq: improve asymmetric scenarios detection") +Reported-by: Steven Barrett +Tested-by: Steven Barrett +Tested-by: Lucjan Lucjanov +Reviewed-by: Federico Motta +Signed-off-by: Paolo Valente +Signed-off-by: Jens Axboe +Signed-off-by: Yu Kuai +Signed-off-by: Greg Kroah-Hartman +--- + block/bfq-iosched.c | 76 ++++++++++++++++++++++++++++++++++++---------------- + block/bfq-iosched.h | 51 +++++++++++++++++++++++++++++++++- + block/bfq-wf2q.c | 5 ++- + 3 files changed, 107 insertions(+), 25 deletions(-) + +--- a/block/bfq-iosched.c ++++ b/block/bfq-iosched.c +@@ -639,7 +639,7 @@ static bool bfq_varied_queue_weights_or_ + bfqd->queue_weights_tree.rb_node->rb_right) + #ifdef CONFIG_BFQ_GROUP_IOSCHED + ) || +- (bfqd->num_active_groups > 0 ++ (bfqd->num_groups_with_pending_reqs > 0 + #endif + ); + } +@@ -803,7 +803,21 @@ void bfq_weights_tree_remove(struct bfq_ + */ + break; + } +- bfqd->num_active_groups--; ++ ++ /* ++ * The decrement of num_groups_with_pending_reqs is ++ * not performed immediately upon the deactivation of ++ * entity, but it is delayed to when it also happens ++ * that the first leaf descendant bfqq of entity gets ++ * all its pending requests completed. The following ++ * instructions perform this delayed decrement, if ++ * needed. See the comments on ++ * num_groups_with_pending_reqs for details. ++ */ ++ if (entity->in_groups_with_pending_reqs) { ++ entity->in_groups_with_pending_reqs = false; ++ bfqd->num_groups_with_pending_reqs--; ++ } + } + } + +@@ -3544,27 +3558,44 @@ static bool bfq_better_to_idle(struct bf + * fact, if there are active groups, then, for condition (i) + * to become false, it is enough that an active group contains + * more active processes or sub-groups than some other active +- * group. We address this issue with the following bi-modal +- * behavior, implemented in the function ++ * group. More precisely, for condition (i) to hold because of ++ * such a group, it is not even necessary that the group is ++ * (still) active: it is sufficient that, even if the group ++ * has become inactive, some of its descendant processes still ++ * have some request already dispatched but still waiting for ++ * completion. In fact, requests have still to be guaranteed ++ * their share of the throughput even after being ++ * dispatched. In this respect, it is easy to show that, if a ++ * group frequently becomes inactive while still having ++ * in-flight requests, and if, when this happens, the group is ++ * not considered in the calculation of whether the scenario ++ * is asymmetric, then the group may fail to be guaranteed its ++ * fair share of the throughput (basically because idling may ++ * not be performed for the descendant processes of the group, ++ * but it had to be). We address this issue with the ++ * following bi-modal behavior, implemented in the function + * bfq_symmetric_scenario(). + * +- * If there are active groups, then the scenario is tagged as ++ * If there are groups with requests waiting for completion ++ * (as commented above, some of these groups may even be ++ * already inactive), then the scenario is tagged as + * asymmetric, conservatively, without checking any of the + * conditions (i) and (ii). So the device is idled for bfqq. 
+ * This behavior matches also the fact that groups are created +- * exactly if controlling I/O (to preserve bandwidth and +- * latency guarantees) is a primary concern. ++ * exactly if controlling I/O is a primary concern (to ++ * preserve bandwidth and latency guarantees). + * +- * On the opposite end, if there are no active groups, then +- * only condition (i) is actually controlled, i.e., provided +- * that condition (i) holds, idling is not performed, +- * regardless of whether condition (ii) holds. In other words, +- * only if condition (i) does not hold, then idling is +- * allowed, and the device tends to be prevented from queueing +- * many requests, possibly of several processes. Since there +- * are no active groups, then, to control condition (i) it is +- * enough to check whether all active queues have the same +- * weight. ++ * On the opposite end, if there are no groups with requests ++ * waiting for completion, then only condition (i) is actually ++ * controlled, i.e., provided that condition (i) holds, idling ++ * is not performed, regardless of whether condition (ii) ++ * holds. In other words, only if condition (i) does not hold, ++ * then idling is allowed, and the device tends to be ++ * prevented from queueing many requests, possibly of several ++ * processes. Since there are no groups with requests waiting ++ * for completion, then, to control condition (i) it is enough ++ * to check just whether all the queues with requests waiting ++ * for completion also have the same weight. + * + * Not checking condition (ii) evidently exposes bfqq to the + * risk of getting less throughput than its fair share. +@@ -3622,10 +3653,11 @@ static bool bfq_better_to_idle(struct bf + * bfqq is weight-raised is checked explicitly here. More + * precisely, the compound condition below takes into account + * also the fact that, even if bfqq is being weight-raised, +- * the scenario is still symmetric if all active queues happen +- * to be weight-raised. Actually, we should be even more +- * precise here, and differentiate between interactive weight +- * raising and soft real-time weight raising. ++ * the scenario is still symmetric if all queues with requests ++ * waiting for completion happen to be ++ * weight-raised. Actually, we should be even more precise ++ * here, and differentiate between interactive weight raising ++ * and soft real-time weight raising. + * + * As a side note, it is worth considering that the above + * device-idling countermeasures may however fail in the +@@ -5447,7 +5479,7 @@ static int bfq_init_queue(struct request + bfqd->idle_slice_timer.function = bfq_idle_slice_timer; + + bfqd->queue_weights_tree = RB_ROOT; +- bfqd->num_active_groups = 0; ++ bfqd->num_groups_with_pending_reqs = 0; + + INIT_LIST_HEAD(&bfqd->active_list); + INIT_LIST_HEAD(&bfqd->idle_list); +--- a/block/bfq-iosched.h ++++ b/block/bfq-iosched.h +@@ -196,6 +196,9 @@ struct bfq_entity { + + /* flag, set to request a weight, ioprio or ioprio_class change */ + int prio_changed; ++ ++ /* flag, set if the entity is counted in groups_with_pending_reqs */ ++ bool in_groups_with_pending_reqs; + }; + + struct bfq_group; +@@ -448,10 +451,54 @@ struct bfq_data { + * bfq_weights_tree_[add|remove] for further details). + */ + struct rb_root queue_weights_tree; ++ + /* +- * number of groups with requests still waiting for completion ++ * Number of groups with at least one descendant process that ++ * has at least one request waiting for completion. 
Note that ++ * this accounts for also requests already dispatched, but not ++ * yet completed. Therefore this number of groups may differ ++ * (be larger) than the number of active groups, as a group is ++ * considered active only if its corresponding entity has ++ * descendant queues with at least one request queued. This ++ * number is used to decide whether a scenario is symmetric. ++ * For a detailed explanation see comments on the computation ++ * of the variable asymmetric_scenario in the function ++ * bfq_better_to_idle(). ++ * ++ * However, it is hard to compute this number exactly, for ++ * groups with multiple descendant processes. Consider a group ++ * that is inactive, i.e., that has no descendant process with ++ * pending I/O inside BFQ queues. Then suppose that ++ * num_groups_with_pending_reqs is still accounting for this ++ * group, because the group has descendant processes with some ++ * I/O request still in flight. num_groups_with_pending_reqs ++ * should be decremented when the in-flight request of the ++ * last descendant process is finally completed (assuming that ++ * nothing else has changed for the group in the meantime, in ++ * terms of composition of the group and active/inactive state of child ++ * groups and processes). To accomplish this, an additional ++ * pending-request counter must be added to entities, and must ++ * be updated correctly. To avoid this additional field and operations, ++ * we resort to the following tradeoff between simplicity and ++ * accuracy: for an inactive group that is still counted in ++ * num_groups_with_pending_reqs, we decrement ++ * num_groups_with_pending_reqs when the first descendant ++ * process of the group remains with no request waiting for ++ * completion. ++ * ++ * Even this simpler decrement strategy requires a little ++ * carefulness: to avoid multiple decrements, we flag a group, ++ * more precisely an entity representing a group, as still ++ * counted in num_groups_with_pending_reqs when it becomes ++ * inactive. Then, when the first descendant queue of the ++ * entity remains with no request waiting for completion, ++ * num_groups_with_pending_reqs is decremented, and this flag ++ * is reset. After this flag is reset for the entity, ++ * num_groups_with_pending_reqs won't be decremented any ++ * longer in case a new descendant queue of the entity remains ++ * with no request waiting for completion. 
+ */ +- unsigned int num_active_groups; ++ unsigned int num_groups_with_pending_reqs; + + /* + * Number of bfq_queues containing requests (including the +--- a/block/bfq-wf2q.c ++++ b/block/bfq-wf2q.c +@@ -1012,7 +1012,10 @@ static void __bfq_activate_entity(struct + container_of(entity, struct bfq_group, entity); + struct bfq_data *bfqd = bfqg->bfqd; + +- bfqd->num_active_groups++; ++ if (!entity->in_groups_with_pending_reqs) { ++ entity->in_groups_with_pending_reqs = true; ++ bfqd->num_groups_with_pending_reqs++; ++ } + } + #endif + diff --git a/queue-4.19/block-bfq-fix-queue-removal-from-weights-tree.patch b/queue-4.19/block-bfq-fix-queue-removal-from-weights-tree.patch new file mode 100644 index 00000000000..88a213ac238 --- /dev/null +++ b/queue-4.19/block-bfq-fix-queue-removal-from-weights-tree.patch @@ -0,0 +1,118 @@ +From foo@baz Wed Dec 22 01:08:34 PM CET 2021 +From: Yu Kuai +Date: Tue, 21 Dec 2021 19:38:48 +0800 +Subject: block, bfq: fix queue removal from weights tree +To: , , , +Cc: , , , +Message-ID: <20211221113849.2219126-5-yukuai3@huawei.com> + +From: Paolo Valente + +commit 9dee8b3b057e1da26f85f1842f2aaf3bb200fb94 upstream. + +bfq maintains an ordered list, through a red-black tree, of unique +weights of active bfq_queues. This list is used to detect whether there +are active queues with differentiated weights. The weight of a queue is +removed from the list when both the following two conditions become +true: + +(1) the bfq_queue is flagged as inactive +(2) the has no in-flight request any longer; + +Unfortunately, in the rare cases where condition (2) becomes true before +condition (1), the removal fails, because the function to remove the +weight of the queue (bfq_weights_tree_remove) is rightly invoked in the +path that deactivates the bfq_queue, but mistakenly invoked *before* the +function that actually performs the deactivation (bfq_deactivate_bfqq). + +This commits moves the invocation of bfq_weights_tree_remove for +condition (1) to after bfq_deactivate_bfqq. As a consequence of this +move, it is necessary to add a further reference to the queue when the +weight of a queue is added, because the queue might otherwise be freed +before bfq_weights_tree_remove is invoked. This commit adds this +reference and makes all related modifications. + +Signed-off-by: Paolo Valente +Signed-off-by: Jens Axboe +Signed-off-by: Yu Kuai +Signed-off-by: Greg Kroah-Hartman +--- + block/bfq-iosched.c | 17 +++++++++++++---- + block/bfq-wf2q.c | 6 +++--- + 2 files changed, 16 insertions(+), 7 deletions(-) + +--- a/block/bfq-iosched.c ++++ b/block/bfq-iosched.c +@@ -748,6 +748,7 @@ void bfq_weights_tree_add(struct bfq_dat + + inc_counter: + bfqq->weight_counter->num_active++; ++ bfqq->ref++; + } + + /* +@@ -772,6 +773,7 @@ void __bfq_weights_tree_remove(struct bf + + reset_entity_pointer: + bfqq->weight_counter = NULL; ++ bfq_put_queue(bfqq); + } + + /* +@@ -783,9 +785,6 @@ void bfq_weights_tree_remove(struct bfq_ + { + struct bfq_entity *entity = bfqq->entity.parent; + +- __bfq_weights_tree_remove(bfqd, bfqq, +- &bfqd->queue_weights_tree); +- + for_each_entity(entity) { + struct bfq_sched_data *sd = entity->my_sched_data; + +@@ -819,6 +818,15 @@ void bfq_weights_tree_remove(struct bfq_ + bfqd->num_groups_with_pending_reqs--; + } + } ++ ++ /* ++ * Next function is invoked last, because it causes bfqq to be ++ * freed if the following holds: bfqq is not in service and ++ * has no dispatched request. DO NOT use bfqq after the next ++ * function invocation. 
++ */ ++ __bfq_weights_tree_remove(bfqd, bfqq, ++ &bfqd->queue_weights_tree); + } + + /* +@@ -1012,7 +1020,8 @@ bfq_bfqq_resume_state(struct bfq_queue * + + static int bfqq_process_refs(struct bfq_queue *bfqq) + { +- return bfqq->ref - bfqq->allocated - bfqq->entity.on_st; ++ return bfqq->ref - bfqq->allocated - bfqq->entity.on_st - ++ (bfqq->weight_counter != NULL); + } + + /* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */ +--- a/block/bfq-wf2q.c ++++ b/block/bfq-wf2q.c +@@ -1668,15 +1668,15 @@ void bfq_del_bfqq_busy(struct bfq_data * + + bfqd->busy_queues--; + +- if (!bfqq->dispatched) +- bfq_weights_tree_remove(bfqd, bfqq); +- + if (bfqq->wr_coeff > 1) + bfqd->wr_busy_queues--; + + bfqg_stats_update_dequeue(bfqq_group(bfqq)); + + bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); ++ ++ if (!bfqq->dispatched) ++ bfq_weights_tree_remove(bfqd, bfqq); + } + + /* diff --git a/queue-4.19/block-bfq-fix-use-after-free-in-bfq_bfqq_expire.patch b/queue-4.19/block-bfq-fix-use-after-free-in-bfq_bfqq_expire.patch new file mode 100644 index 00000000000..139899e5bb2 --- /dev/null +++ b/queue-4.19/block-bfq-fix-use-after-free-in-bfq_bfqq_expire.patch @@ -0,0 +1,140 @@ +From foo@baz Wed Dec 22 01:08:34 PM CET 2021 +From: Yu Kuai +Date: Tue, 21 Dec 2021 19:38:49 +0800 +Subject: block, bfq: fix use after free in bfq_bfqq_expire +To: , , , +Cc: , , , +Message-ID: <20211221113849.2219126-6-yukuai3@huawei.com> + +From: Paolo Valente + +commit eed47d19d9362bdd958e4ab56af480b9dbf6b2b6 upstream. + +The function bfq_bfqq_expire() invokes the function +__bfq_bfqq_expire(), and the latter may free the in-service bfq-queue. +If this happens, then no other instruction of bfq_bfqq_expire() must +be executed, or a use-after-free will occur. + +Basing on the assumption that __bfq_bfqq_expire() invokes +bfq_put_queue() on the in-service bfq-queue exactly once, the queue is +assumed to be freed if its refcounter is equal to one right before +invoking __bfq_bfqq_expire(). + +But, since commit 9dee8b3b057e ("block, bfq: fix queue removal from +weights tree") this assumption is false. __bfq_bfqq_expire() may also +invoke bfq_weights_tree_remove() and, since commit 9dee8b3b057e +("block, bfq: fix queue removal from weights tree"), also +the latter function may invoke bfq_put_queue(). So __bfq_bfqq_expire() +may invoke bfq_put_queue() twice, and this is the actual case where +the in-service queue may happen to be freed. + +To address this issue, this commit moves the check on the refcounter +of the queue right around the last bfq_put_queue() that may be invoked +on the queue. 
+ +Fixes: 9dee8b3b057e ("block, bfq: fix queue removal from weights tree") +Reported-by: Dmitrii Tcvetkov +Reported-by: Douglas Anderson +Tested-by: Dmitrii Tcvetkov +Tested-by: Douglas Anderson +Signed-off-by: Paolo Valente +Signed-off-by: Jens Axboe +Signed-off-by: Yu Kuai +Signed-off-by: Greg Kroah-Hartman +--- + block/bfq-iosched.c | 15 +++++++-------- + block/bfq-iosched.h | 2 +- + block/bfq-wf2q.c | 17 +++++++++++++++-- + 3 files changed, 23 insertions(+), 11 deletions(-) + +--- a/block/bfq-iosched.c ++++ b/block/bfq-iosched.c +@@ -2816,7 +2816,7 @@ static void bfq_dispatch_remove(struct r + bfq_remove_request(q, rq); + } + +-static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) + { + /* + * If this bfqq is shared between multiple processes, check +@@ -2849,9 +2849,11 @@ static void __bfq_bfqq_expire(struct bfq + /* + * All in-service entities must have been properly deactivated + * or requeued before executing the next function, which +- * resets all in-service entites as no more in service. ++ * resets all in-service entities as no more in service. This ++ * may cause bfqq to be freed. If this happens, the next ++ * function returns true. + */ +- __bfq_bfqd_reset_in_service(bfqd); ++ return __bfq_bfqd_reset_in_service(bfqd); + } + + /** +@@ -3256,7 +3258,6 @@ void bfq_bfqq_expire(struct bfq_data *bf + bool slow; + unsigned long delta = 0; + struct bfq_entity *entity = &bfqq->entity; +- int ref; + + /* + * Check whether the process is slow (see bfq_bfqq_is_slow). +@@ -3325,10 +3326,8 @@ void bfq_bfqq_expire(struct bfq_data *bf + * reason. + */ + __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); +- ref = bfqq->ref; +- __bfq_bfqq_expire(bfqd, bfqq); +- +- if (ref == 1) /* bfqq is gone, no more actions on it */ ++ if (__bfq_bfqq_expire(bfqd, bfqq)) ++ /* bfqq is gone, no more actions on it */ + return; + + bfqq->injected_service = 0; +--- a/block/bfq-iosched.h ++++ b/block/bfq-iosched.h +@@ -993,7 +993,7 @@ bool __bfq_deactivate_entity(struct bfq_ + bool ins_into_idle_tree); + bool next_queue_may_preempt(struct bfq_data *bfqd); + struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd); +-void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd); ++bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd); + void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool ins_into_idle_tree, bool expiration); + void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); +--- a/block/bfq-wf2q.c ++++ b/block/bfq-wf2q.c +@@ -1600,7 +1600,8 @@ struct bfq_queue *bfq_get_next_queue(str + return bfqq; + } + +-void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) ++/* returns true if the in-service queue gets freed */ ++bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) + { + struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; + struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; +@@ -1624,8 +1625,20 @@ void __bfq_bfqd_reset_in_service(struct + * service tree either, then release the service reference to + * the queue it represents (taken with bfq_get_entity). + */ +- if (!in_serv_entity->on_st) ++ if (!in_serv_entity->on_st) { ++ /* ++ * If no process is referencing in_serv_bfqq any ++ * longer, then the service reference may be the only ++ * reference to the queue. If this is the case, then ++ * bfqq gets freed here. 
++ */ ++ int ref = in_serv_bfqq->ref; + bfq_put_queue(in_serv_bfqq); ++ if (ref == 1) ++ return true; ++ } ++ ++ return false; + } + + void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, diff --git a/queue-4.19/block-bfq-improve-asymmetric-scenarios-detection.patch b/queue-4.19/block-bfq-improve-asymmetric-scenarios-detection.patch new file mode 100644 index 00000000000..7e97904be9d --- /dev/null +++ b/queue-4.19/block-bfq-improve-asymmetric-scenarios-detection.patch @@ -0,0 +1,587 @@ +From foo@baz Wed Dec 22 01:08:34 PM CET 2021 +From: Yu Kuai +Date: Tue, 21 Dec 2021 19:38:45 +0800 +Subject: block, bfq: improve asymmetric scenarios detection +To: , , , +Cc: , , , +Message-ID: <20211221113849.2219126-2-yukuai3@huawei.com> + +From: Federico Motta + +commit 2d29c9f89fcd9bf408fcdaaf515c90a169f22ecd upstream. + +bfq defines as asymmetric a scenario where an active entity, say E +(representing either a single bfq_queue or a group of other entities), +has a higher weight than some other entities. If the entity E does sync +I/O in such a scenario, then bfq plugs the dispatch of the I/O of the +other entities in the following situation: E is in service but +temporarily has no pending I/O request. In fact, without this plugging, +all the times that E stops being temporarily idle, it may find the +internal queues of the storage device already filled with an +out-of-control number of extra requests, from other entities. So E may +have to wait for the service of these extra requests, before finally +having its own requests served. This may easily break service +guarantees, with E getting less than its fair share of the device +throughput. Usually, the end result is that E gets the same fraction of +the throughput as the other entities, instead of getting more, according +to its higher weight. + +Yet there are two other more subtle cases where E, even if its weight is +actually equal to or even lower than the weight of any other active +entities, may get less than its fair share of the throughput in case the +above I/O plugging is not performed: +1. other entities issue larger requests than E; +2. other entities contain more active child entities than E (or in + general tend to have more backlog than E). + +In the first case, other entities may get more service than E because +they get larger requests, than those of E, served during the temporary +idle periods of E. In the second case, other entities get more service +because, by having many child entities, they have many requests ready +for dispatching while E is temporarily idle. + +This commit addresses this issue by extending the definition of +asymmetric scenario: a scenario is asymmetric when +- active entities representing bfq_queues have differentiated weights, + as in the original definition +or (inclusive) +- one or more entities representing groups of entities are active. + +This broader definition makes sure that I/O plugging will be performed +in all the above cases, provided that there is at least one active +group. Of course, this definition is very coarse, so it will trigger +I/O plugging also in cases where it is not needed, such as, e.g., +multiple active entities with just one child each, and all with the same +I/O-request size. The reason for this coarse definition is just that a +finer-grained definition would be rather heavy to compute. + +On the opposite end, even this new definition does not trigger I/O +plugging in all cases where there is no active group, and all bfq_queues +have the same weight. 
So, in these cases some unfairness may occur if +there are asymmetries in I/O-request sizes. We made this choice because +I/O plugging may lower throughput, and probably a user that has not +created any group cares more about throughput than about perfect +fairness. At any rate, as for possible applications that may care about +service guarantees, bfq already guarantees a high responsiveness and a +low latency to soft real-time applications automatically. + +Signed-off-by: Federico Motta +Signed-off-by: Paolo Valente +Signed-off-by: Jens Axboe +Signed-off-by: Yu Kuai +Signed-off-by: Greg Kroah-Hartman +--- + block/bfq-iosched.c | 223 ++++++++++++++++++++++++++++------------------------ + block/bfq-iosched.h | 27 ++---- + block/bfq-wf2q.c | 36 ++++---- + 3 files changed, 155 insertions(+), 131 deletions(-) + +--- a/block/bfq-iosched.c ++++ b/block/bfq-iosched.c +@@ -625,12 +625,13 @@ void bfq_pos_tree_add_move(struct bfq_da + } + + /* +- * Tell whether there are active queues or groups with differentiated weights. ++ * Tell whether there are active queues with different weights or ++ * active groups. + */ +-static bool bfq_differentiated_weights(struct bfq_data *bfqd) ++static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd) + { + /* +- * For weights to differ, at least one of the trees must contain ++ * For queue weights to differ, queue_weights_tree must contain + * at least two nodes. + */ + return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && +@@ -638,9 +639,7 @@ static bool bfq_differentiated_weights(s + bfqd->queue_weights_tree.rb_node->rb_right) + #ifdef CONFIG_BFQ_GROUP_IOSCHED + ) || +- (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && +- (bfqd->group_weights_tree.rb_node->rb_left || +- bfqd->group_weights_tree.rb_node->rb_right) ++ (bfqd->num_active_groups > 0 + #endif + ); + } +@@ -658,26 +657,25 @@ static bool bfq_differentiated_weights(s + * 3) all active groups at the same level in the groups tree have the same + * number of children. + * +- * Unfortunately, keeping the necessary state for evaluating exactly the +- * above symmetry conditions would be quite complex and time-consuming. +- * Therefore this function evaluates, instead, the following stronger +- * sub-conditions, for which it is much easier to maintain the needed +- * state: ++ * Unfortunately, keeping the necessary state for evaluating exactly ++ * the last two symmetry sub-conditions above would be quite complex ++ * and time consuming. Therefore this function evaluates, instead, ++ * only the following stronger two sub-conditions, for which it is ++ * much easier to maintain the needed state: + * 1) all active queues have the same weight, +- * 2) all active groups have the same weight, +- * 3) all active groups have at most one active child each. +- * In particular, the last two conditions are always true if hierarchical +- * support and the cgroups interface are not enabled, thus no state needs +- * to be maintained in this case. ++ * 2) there are no active groups. ++ * In particular, the last condition is always true if hierarchical ++ * support or the cgroups interface are not enabled, thus no state ++ * needs to be maintained in this case. 
+ */ + static bool bfq_symmetric_scenario(struct bfq_data *bfqd) + { +- return !bfq_differentiated_weights(bfqd); ++ return !bfq_varied_queue_weights_or_active_groups(bfqd); + } + + /* + * If the weight-counter tree passed as input contains no counter for +- * the weight of the input entity, then add that counter; otherwise just ++ * the weight of the input queue, then add that counter; otherwise just + * increment the existing counter. + * + * Note that weight-counter trees contain few nodes in mostly symmetric +@@ -688,25 +686,25 @@ static bool bfq_symmetric_scenario(struc + * In most scenarios, the rate at which nodes are created/destroyed + * should be low too. + */ +-void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity, ++void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct rb_root *root) + { ++ struct bfq_entity *entity = &bfqq->entity; + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* +- * Do not insert if the entity is already associated with a ++ * Do not insert if the queue is already associated with a + * counter, which happens if: +- * 1) the entity is associated with a queue, +- * 2) a request arrival has caused the queue to become both ++ * 1) a request arrival has caused the queue to become both + * non-weight-raised, and hence change its weight, and + * backlogged; in this respect, each of the two events + * causes an invocation of this function, +- * 3) this is the invocation of this function caused by the ++ * 2) this is the invocation of this function caused by the + * second event. This second invocation is actually useless, + * and we handle this fact by exiting immediately. More + * efficient or clearer solutions might possibly be adopted. + */ +- if (entity->weight_counter) ++ if (bfqq->weight_counter) + return; + + while (*new) { +@@ -716,7 +714,7 @@ void bfq_weights_tree_add(struct bfq_dat + parent = *new; + + if (entity->weight == __counter->weight) { +- entity->weight_counter = __counter; ++ bfqq->weight_counter = __counter; + goto inc_counter; + } + if (entity->weight < __counter->weight) +@@ -725,66 +723,67 @@ void bfq_weights_tree_add(struct bfq_dat + new = &((*new)->rb_right); + } + +- entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), +- GFP_ATOMIC); ++ bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), ++ GFP_ATOMIC); + + /* + * In the unlucky event of an allocation failure, we just +- * exit. This will cause the weight of entity to not be +- * considered in bfq_differentiated_weights, which, in its +- * turn, causes the scenario to be deemed wrongly symmetric in +- * case entity's weight would have been the only weight making +- * the scenario asymmetric. On the bright side, no unbalance +- * will however occur when entity becomes inactive again (the +- * invocation of this function is triggered by an activation +- * of entity). In fact, bfq_weights_tree_remove does nothing +- * if !entity->weight_counter. ++ * exit. This will cause the weight of queue to not be ++ * considered in bfq_varied_queue_weights_or_active_groups, ++ * which, in its turn, causes the scenario to be deemed ++ * wrongly symmetric in case bfqq's weight would have been ++ * the only weight making the scenario asymmetric. On the ++ * bright side, no unbalance will however occur when bfqq ++ * becomes inactive again (the invocation of this function ++ * is triggered by an activation of queue). In fact, ++ * bfq_weights_tree_remove does nothing if ++ * !bfqq->weight_counter. 
+ */ +- if (unlikely(!entity->weight_counter)) ++ if (unlikely(!bfqq->weight_counter)) + return; + +- entity->weight_counter->weight = entity->weight; +- rb_link_node(&entity->weight_counter->weights_node, parent, new); +- rb_insert_color(&entity->weight_counter->weights_node, root); ++ bfqq->weight_counter->weight = entity->weight; ++ rb_link_node(&bfqq->weight_counter->weights_node, parent, new); ++ rb_insert_color(&bfqq->weight_counter->weights_node, root); + + inc_counter: +- entity->weight_counter->num_active++; ++ bfqq->weight_counter->num_active++; + } + + /* +- * Decrement the weight counter associated with the entity, and, if the ++ * Decrement the weight counter associated with the queue, and, if the + * counter reaches 0, remove the counter from the tree. + * See the comments to the function bfq_weights_tree_add() for considerations + * about overhead. + */ + void __bfq_weights_tree_remove(struct bfq_data *bfqd, +- struct bfq_entity *entity, ++ struct bfq_queue *bfqq, + struct rb_root *root) + { +- if (!entity->weight_counter) ++ if (!bfqq->weight_counter) + return; + +- entity->weight_counter->num_active--; +- if (entity->weight_counter->num_active > 0) ++ bfqq->weight_counter->num_active--; ++ if (bfqq->weight_counter->num_active > 0) + goto reset_entity_pointer; + +- rb_erase(&entity->weight_counter->weights_node, root); +- kfree(entity->weight_counter); ++ rb_erase(&bfqq->weight_counter->weights_node, root); ++ kfree(bfqq->weight_counter); + + reset_entity_pointer: +- entity->weight_counter = NULL; ++ bfqq->weight_counter = NULL; + } + + /* +- * Invoke __bfq_weights_tree_remove on bfqq and all its inactive +- * parent entities. ++ * Invoke __bfq_weights_tree_remove on bfqq and decrement the number ++ * of active groups for each queue's inactive parent entity. + */ + void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_queue *bfqq) + { + struct bfq_entity *entity = bfqq->entity.parent; + +- __bfq_weights_tree_remove(bfqd, &bfqq->entity, ++ __bfq_weights_tree_remove(bfqd, bfqq, + &bfqd->queue_weights_tree); + + for_each_entity(entity) { +@@ -798,17 +797,13 @@ void bfq_weights_tree_remove(struct bfq_ + * next_in_service for details on why + * in_service_entity must be checked too). + * +- * As a consequence, the weight of entity is +- * not to be removed. In addition, if entity +- * is active, then its parent entities are +- * active as well, and thus their weights are +- * not to be removed either. In the end, this +- * loop must stop here. ++ * As a consequence, its parent entities are ++ * active as well, and thus this loop must ++ * stop here. + */ + break; + } +- __bfq_weights_tree_remove(bfqd, entity, +- &bfqd->group_weights_tree); ++ bfqd->num_active_groups--; + } + } + +@@ -3521,9 +3516,11 @@ static bool bfq_better_to_idle(struct bf + * symmetric scenario where: + * (i) each of these processes must get the same throughput as + * the others; +- * (ii) all these processes have the same I/O pattern +- (either sequential or random). +- * In fact, in such a scenario, the drive will tend to treat ++ * (ii) the I/O of each process has the same properties, in ++ * terms of locality (sequential or random), direction ++ * (reads or writes), request sizes, greediness ++ * (from I/O-bound to sporadic), and so on. 
++ * In fact, in such a scenario, the drive tends to treat + * the requests of each of these processes in about the same + * way as the requests of the others, and thus to provide + * each of these processes with about the same throughput +@@ -3532,18 +3529,50 @@ static bool bfq_better_to_idle(struct bf + * certainly needed to guarantee that bfqq receives its + * assigned fraction of the device throughput (see [1] for + * details). ++ * The problem is that idling may significantly reduce ++ * throughput with certain combinations of types of I/O and ++ * devices. An important example is sync random I/O, on flash ++ * storage with command queueing. So, unless bfqq falls in the ++ * above cases where idling also boosts throughput, it would ++ * be important to check conditions (i) and (ii) accurately, ++ * so as to avoid idling when not strictly needed for service ++ * guarantees. ++ * ++ * Unfortunately, it is extremely difficult to thoroughly ++ * check condition (ii). And, in case there are active groups, ++ * it becomes very difficult to check condition (i) too. In ++ * fact, if there are active groups, then, for condition (i) ++ * to become false, it is enough that an active group contains ++ * more active processes or sub-groups than some other active ++ * group. We address this issue with the following bi-modal ++ * behavior, implemented in the function ++ * bfq_symmetric_scenario(). + * +- * We address this issue by controlling, actually, only the +- * symmetry sub-condition (i), i.e., provided that +- * sub-condition (i) holds, idling is not performed, +- * regardless of whether sub-condition (ii) holds. In other +- * words, only if sub-condition (i) holds, then idling is ++ * If there are active groups, then the scenario is tagged as ++ * asymmetric, conservatively, without checking any of the ++ * conditions (i) and (ii). So the device is idled for bfqq. ++ * This behavior matches also the fact that groups are created ++ * exactly if controlling I/O (to preserve bandwidth and ++ * latency guarantees) is a primary concern. ++ * ++ * On the opposite end, if there are no active groups, then ++ * only condition (i) is actually controlled, i.e., provided ++ * that condition (i) holds, idling is not performed, ++ * regardless of whether condition (ii) holds. In other words, ++ * only if condition (i) does not hold, then idling is + * allowed, and the device tends to be prevented from queueing +- * many requests, possibly of several processes. The reason +- * for not controlling also sub-condition (ii) is that we +- * exploit preemption to preserve guarantees in case of +- * symmetric scenarios, even if (ii) does not hold, as +- * explained in the next two paragraphs. ++ * many requests, possibly of several processes. Since there ++ * are no active groups, then, to control condition (i) it is ++ * enough to check whether all active queues have the same ++ * weight. ++ * ++ * Not checking condition (ii) evidently exposes bfqq to the ++ * risk of getting less throughput than its fair share. ++ * However, for queues with the same weight, a further ++ * mechanism, preemption, mitigates or even eliminates this ++ * problem. And it does so without consequences on overall ++ * throughput. This mechanism and its benefits are explained ++ * in the next three paragraphs. 
+ * + * Even if a queue, say Q, is expired when it remains idle, Q + * can still preempt the new in-service queue if the next +@@ -3557,11 +3586,7 @@ static bool bfq_better_to_idle(struct bf + * idling allows the internal queues of the device to contain + * many requests, and thus to reorder requests, we can rather + * safely assume that the internal scheduler still preserves a +- * minimum of mid-term fairness. The motivation for using +- * preemption instead of idling is that, by not idling, +- * service guarantees are preserved without minimally +- * sacrificing throughput. In other words, both a high +- * throughput and its desired distribution are obtained. ++ * minimum of mid-term fairness. + * + * More precisely, this preemption-based, idleless approach + * provides fairness in terms of IOPS, and not sectors per +@@ -3580,27 +3605,27 @@ static bool bfq_better_to_idle(struct bf + * 1024/8 times as high as the service received by the other + * queue. + * +- * On the other hand, device idling is performed, and thus +- * pure sector-domain guarantees are provided, for the +- * following queues, which are likely to need stronger +- * throughput guarantees: weight-raised queues, and queues +- * with a higher weight than other queues. When such queues +- * are active, sub-condition (i) is false, which triggers +- * device idling. ++ * The motivation for using preemption instead of idling (for ++ * queues with the same weight) is that, by not idling, ++ * service guarantees are preserved (completely or at least in ++ * part) without minimally sacrificing throughput. And, if ++ * there is no active group, then the primary expectation for ++ * this device is probably a high throughput. + * +- * According to the above considerations, the next variable is +- * true (only) if sub-condition (i) holds. To compute the +- * value of this variable, we not only use the return value of +- * the function bfq_symmetric_scenario(), but also check +- * whether bfqq is being weight-raised, because +- * bfq_symmetric_scenario() does not take into account also +- * weight-raised queues (see comments on +- * bfq_weights_tree_add()). In particular, if bfqq is being +- * weight-raised, it is important to idle only if there are +- * other, non-weight-raised queues that may steal throughput +- * to bfqq. Actually, we should be even more precise, and +- * differentiate between interactive weight raising and +- * soft real-time weight raising. ++ * We are now left only with explaining the additional ++ * compound condition that is checked below for deciding ++ * whether the scenario is asymmetric. To explain this ++ * compound condition, we need to add that the function ++ * bfq_symmetric_scenario checks the weights of only ++ * non-weight-raised queues, for efficiency reasons (see ++ * comments on bfq_weights_tree_add()). Then the fact that ++ * bfqq is weight-raised is checked explicitly here. More ++ * precisely, the compound condition below takes into account ++ * also the fact that, even if bfqq is being weight-raised, ++ * the scenario is still symmetric if all active queues happen ++ * to be weight-raised. Actually, we should be even more ++ * precise here, and differentiate between interactive weight ++ * raising and soft real-time weight raising. 
+ * + * As a side note, it is worth considering that the above + * device-idling countermeasures may however fail in the +@@ -5422,7 +5447,7 @@ static int bfq_init_queue(struct request + bfqd->idle_slice_timer.function = bfq_idle_slice_timer; + + bfqd->queue_weights_tree = RB_ROOT; +- bfqd->group_weights_tree = RB_ROOT; ++ bfqd->num_active_groups = 0; + + INIT_LIST_HEAD(&bfqd->active_list); + INIT_LIST_HEAD(&bfqd->idle_list); +--- a/block/bfq-iosched.h ++++ b/block/bfq-iosched.h +@@ -108,15 +108,14 @@ struct bfq_sched_data { + }; + + /** +- * struct bfq_weight_counter - counter of the number of all active entities ++ * struct bfq_weight_counter - counter of the number of all active queues + * with a given weight. + */ + struct bfq_weight_counter { +- unsigned int weight; /* weight of the entities this counter refers to */ +- unsigned int num_active; /* nr of active entities with this weight */ ++ unsigned int weight; /* weight of the queues this counter refers to */ ++ unsigned int num_active; /* nr of active queues with this weight */ + /* +- * Weights tree member (see bfq_data's @queue_weights_tree and +- * @group_weights_tree) ++ * Weights tree member (see bfq_data's @queue_weights_tree) + */ + struct rb_node weights_node; + }; +@@ -151,8 +150,6 @@ struct bfq_weight_counter { + struct bfq_entity { + /* service_tree member */ + struct rb_node rb_node; +- /* pointer to the weight counter associated with this entity */ +- struct bfq_weight_counter *weight_counter; + + /* + * Flag, true if the entity is on a tree (either the active or +@@ -266,6 +263,9 @@ struct bfq_queue { + /* entity representing this queue in the scheduler */ + struct bfq_entity entity; + ++ /* pointer to the weight counter associated with this entity */ ++ struct bfq_weight_counter *weight_counter; ++ + /* maximum budget allowed from the feedback mechanism */ + int max_budget; + /* budget expiration (in jiffies) */ +@@ -449,14 +449,9 @@ struct bfq_data { + */ + struct rb_root queue_weights_tree; + /* +- * rbtree of non-queue @bfq_entity weight counters, sorted by +- * weight. Used to keep track of whether all @bfq_groups have +- * the same weight. The tree contains one counter for each +- * distinct weight associated to some active @bfq_group (see +- * the comments to the functions bfq_weights_tree_[add|remove] +- * for further details). ++ * number of groups with requests still waiting for completion + */ +- struct rb_root group_weights_tree; ++ unsigned int num_active_groups; + + /* + * Number of bfq_queues containing requests (including the +@@ -854,10 +849,10 @@ struct bfq_queue *bic_to_bfqq(struct bfq + void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync); + struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); + void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); +-void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity, ++void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct rb_root *root); + void __bfq_weights_tree_remove(struct bfq_data *bfqd, +- struct bfq_entity *entity, ++ struct bfq_queue *bfqq, + struct rb_root *root); + void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_queue *bfqq); +--- a/block/bfq-wf2q.c ++++ b/block/bfq-wf2q.c +@@ -788,25 +788,29 @@ __bfq_entity_update_weight_prio(struct b + new_weight = entity->orig_weight * + (bfqq ? 
bfqq->wr_coeff : 1); + /* +- * If the weight of the entity changes, remove the entity +- * from its old weight counter (if there is a counter +- * associated with the entity), and add it to the counter +- * associated with its new weight. ++ * If the weight of the entity changes, and the entity is a ++ * queue, remove the entity from its old weight counter (if ++ * there is a counter associated with the entity). + */ + if (prev_weight != new_weight) { +- root = bfqq ? &bfqd->queue_weights_tree : +- &bfqd->group_weights_tree; +- __bfq_weights_tree_remove(bfqd, entity, root); ++ if (bfqq) { ++ root = &bfqd->queue_weights_tree; ++ __bfq_weights_tree_remove(bfqd, bfqq, root); ++ } else ++ bfqd->num_active_groups--; + } + entity->weight = new_weight; + /* +- * Add the entity to its weights tree only if it is +- * not associated with a weight-raised queue. ++ * Add the entity, if it is not a weight-raised queue, ++ * to the counter associated with its new weight. + */ +- if (prev_weight != new_weight && +- (bfqq ? bfqq->wr_coeff == 1 : 1)) +- /* If we get here, root has been initialized. */ +- bfq_weights_tree_add(bfqd, entity, root); ++ if (prev_weight != new_weight) { ++ if (bfqq && bfqq->wr_coeff == 1) { ++ /* If we get here, root has been initialized. */ ++ bfq_weights_tree_add(bfqd, bfqq, root); ++ } else ++ bfqd->num_active_groups++; ++ } + + new_st->wsum += entity->weight; + +@@ -1012,9 +1016,9 @@ static void __bfq_activate_entity(struct + if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */ + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); ++ struct bfq_data *bfqd = bfqg->bfqd; + +- bfq_weights_tree_add(bfqg->bfqd, entity, +- &bfqd->group_weights_tree); ++ bfqd->num_active_groups++; + } + #endif + +@@ -1692,7 +1696,7 @@ void bfq_add_bfqq_busy(struct bfq_data * + + if (!bfqq->dispatched) + if (bfqq->wr_coeff == 1) +- bfq_weights_tree_add(bfqd, &bfqq->entity, ++ bfq_weights_tree_add(bfqd, bfqq, + &bfqd->queue_weights_tree); + + if (bfqq->wr_coeff > 1) diff --git a/queue-4.19/series b/queue-4.19/series index 46c52fd9363..d60a95a30cd 100644 --- a/queue-4.19/series +++ b/queue-4.19/series @@ -1 +1,6 @@ net-usb-lan78xx-add-allied-telesis-at29m2-af.patch +block-bfq-improve-asymmetric-scenarios-detection.patch +block-bfq-fix-asymmetric-scenarios-detection.patch +block-bfq-fix-decrement-of-num_active_groups.patch +block-bfq-fix-queue-removal-from-weights-tree.patch +block-bfq-fix-use-after-free-in-bfq_bfqq_expire.patch
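
The fix-decrement patch above replaces an exact per-group pending-request count with a simpler flag-based scheme: a group entity is counted in num_groups_with_pending_reqs at most once, and the counter is decremented the first time a descendant queue of the now-inactive group is left with no request waiting for completion. The standalone C sketch below is a userspace model under simplified assumptions, not the kernel code; only the names num_groups_with_pending_reqs and in_groups_with_pending_reqs come from the patch, while the helper names are illustrative.

#include <stdbool.h>
#include <stdio.h>

/* Global counter, mirroring bfqd->num_groups_with_pending_reqs. */
static unsigned int num_groups_with_pending_reqs;

/* Minimal stand-in for a group entity; only the flag matters here. */
struct group_entity {
	bool in_groups_with_pending_reqs;
};

/* A group entity becomes active: count it at most once. */
static void group_activated(struct group_entity *e)
{
	if (!e->in_groups_with_pending_reqs) {
		e->in_groups_with_pending_reqs = true;
		num_groups_with_pending_reqs++;
	}
}

/*
 * A descendant queue of an inactive group remains with no request
 * waiting for completion. Only the first such event decrements the
 * counter; later events find the flag already cleared.
 */
static void descendant_drained(struct group_entity *e)
{
	if (e->in_groups_with_pending_reqs) {
		e->in_groups_with_pending_reqs = false;
		num_groups_with_pending_reqs--;
	}
}

int main(void)
{
	struct group_entity g = { .in_groups_with_pending_reqs = false };

	group_activated(&g);

	/* Two descendant processes complete their last requests in turn. */
	descendant_drained(&g);
	descendant_drained(&g);	/* no spurious second decrement */

	printf("num_groups_with_pending_reqs = %u\n",
	       num_groups_with_pending_reqs);	/* prints 0 */
	return 0;
}

Without the flag, the second call would decrement the unsigned counter below zero and wrap it around, which corresponds to the unbalanced multiple decrements described in the commit message.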