--- /dev/null
+From foo@baz Wed Dec 22 01:08:34 PM CET 2021
+From: Yu Kuai <yukuai3@huawei.com>
+Date: Tue, 21 Dec 2021 19:38:47 +0800
+Subject: block, bfq: fix decrement of num_active_groups
+To: <jack@suse.cz>, <gregkh@linuxfoundation.org>, <paolo.valente@linaro.org>, <axboe@kernel.dk>
+Cc: <linux-block@vger.kernel.org>, <linux-kernel@vger.kernel.org>, <yukuai3@huawei.com>, <yi.zhang@huawei.com>
+Message-ID: <20211221113849.2219126-4-yukuai3@huawei.com>
+
+From: Paolo Valente <paolo.valente@linaro.org>
+
+commit ba7aeae5539c7a7cccc4cf07a2bc61281a93c50e upstream.
+
+Since commit '2d29c9f89fcd ("block, bfq: improve asymmetric scenarios
+detection")', if there are process groups with I/O requests waiting for
+completion, then BFQ tags the scenario as 'asymmetric'. This detection
+is needed for preserving service guarantees (for details, see comments
+on the computation of the variable asymmetric_scenario in the
+function bfq_better_to_idle).
+
+Unfortunately, commit '2d29c9f89fcd ("block, bfq: improve asymmetric
+scenarios detection")' contains an error exactly in the updating of
+the number of groups with I/O requests waiting for completion: if a
+group has more than one descendant process, then the above number of
+groups, which is renamed from num_active_groups to a more appropriate
+num_groups_with_pending_reqs by this commit, may happen to be wrongly
+decremented multiple times, namely every time one of the descendant
+processes gets all its pending I/O requests completed.
+
+A correct, complete solution should work as follows. Consider a group
+that is inactive, i.e., that has no descendant process with pending
+I/O inside BFQ queues. Then suppose that num_groups_with_pending_reqs
+is still accounting for this group, because the group still has some
+descendant process with some I/O request still in
+flight. num_groups_with_pending_reqs should be decremented when the
+in-flight request of the last descendant process is finally completed
+(assuming that nothing else has changed for the group in the meantime,
+in terms of composition of the group and active/inactive state of
+child groups and processes). To accomplish this, an additional
+pending-request counter must be added to entities, and must be
+updated correctly.
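+
+Purely for illustration, the exact accounting described above (and
+deliberately not implemented by this commit) could look like the
+stand-alone C sketch below; all names here are hypothetical and do not
+exist in BFQ, only the idea of a per-entity pending-request counter is
+taken from the paragraph above:
+
+#include <stdbool.h>
+
+/* Hypothetical group entity with exact pending-request accounting. */
+struct group_entity {
+        unsigned int pending_reqs;      /* dispatched, not yet completed */
+        bool counted_in_pending_groups; /* counted in the counter below  */
+};
+
+static unsigned int num_groups_with_pending_reqs;
+
+/* A request of some descendant process of the group is dispatched. */
+static void group_request_dispatched(struct group_entity *ge)
+{
+        if (ge->pending_reqs++ == 0 && !ge->counted_in_pending_groups) {
+                ge->counted_in_pending_groups = true;
+                num_groups_with_pending_reqs++;
+        }
+}
+
+/*
+ * A request of some descendant process completes: the global counter
+ * is decremented only when the last in-flight request of the group is
+ * finally completed.
+ */
+static void group_request_completed(struct group_entity *ge)
+{
+        if (--ge->pending_reqs == 0 && ge->counted_in_pending_groups) {
+                ge->counted_in_pending_groups = false;
+                num_groups_with_pending_reqs--;
+        }
+}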
+
+To avoid this additional field and operations, this commit resorts to
+the following tradeoff between simplicity and accuracy: for an
+inactive group that is still counted in num_groups_with_pending_reqs,
+this commit decrements num_groups_with_pending_reqs when the first
+descendant process of the group remains with no request waiting for
+completion.
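+
+Condensing the two hunks below (in __bfq_activate_entity() and
+bfq_weights_tree_remove()) into one place, the flag-based tradeoff
+amounts to roughly the following sketch; the helper names are
+simplified for illustration and the surrounding BFQ machinery is
+stripped away:
+
+#include <stdbool.h>
+
+struct entity {
+        /* set while the group is counted in num_groups_with_pending_reqs */
+        bool in_groups_with_pending_reqs;
+};
+
+static unsigned int num_groups_with_pending_reqs;
+
+/* The group entity becomes active (a descendant queue has queued I/O). */
+static void group_activated(struct entity *e)
+{
+        if (!e->in_groups_with_pending_reqs) {
+                e->in_groups_with_pending_reqs = true;
+                num_groups_with_pending_reqs++;
+        }
+}
+
+/*
+ * The first descendant queue of an already deactivated group entity is
+ * left with no request waiting for completion: perform the delayed
+ * decrement, at most once per activation, thanks to the flag.
+ */
+static void group_first_queue_emptied(struct entity *e)
+{
+        if (e->in_groups_with_pending_reqs) {
+                e->in_groups_with_pending_reqs = false;
+                num_groups_with_pending_reqs--;
+        }
+}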
+
+This simplified scheme provides a fix to the unbalanced decrements
+introduced by 2d29c9f89fcd. Since this error was also caused by lack
+of comments on this non-trivial issue, this commit also adds related
+comments.
+
+Fixes: 2d29c9f89fcd ("block, bfq: improve asymmetric scenarios detection")
+Reported-by: Steven Barrett <steven@liquorix.net>
+Tested-by: Steven Barrett <steven@liquorix.net>
+Tested-by: Lucjan Lucjanov <lucjan.lucjanov@gmail.com>
+Reviewed-by: Federico Motta <federico@willer.it>
+Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Yu Kuai <yukuai3@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ block/bfq-iosched.c | 76 ++++++++++++++++++++++++++++++++++++----------------
+ block/bfq-iosched.h | 51 +++++++++++++++++++++++++++++++++-
+ block/bfq-wf2q.c | 5 ++-
+ 3 files changed, 107 insertions(+), 25 deletions(-)
+
+--- a/block/bfq-iosched.c
++++ b/block/bfq-iosched.c
+@@ -639,7 +639,7 @@ static bool bfq_varied_queue_weights_or_
+ bfqd->queue_weights_tree.rb_node->rb_right)
+ #ifdef CONFIG_BFQ_GROUP_IOSCHED
+ ) ||
+- (bfqd->num_active_groups > 0
++ (bfqd->num_groups_with_pending_reqs > 0
+ #endif
+ );
+ }
+@@ -803,7 +803,21 @@ void bfq_weights_tree_remove(struct bfq_
+ */
+ break;
+ }
+- bfqd->num_active_groups--;
++
++ /*
++ * The decrement of num_groups_with_pending_reqs is
++ * not performed immediately upon the deactivation of
++ * entity, but it is delayed to when it also happens
++ * that the first leaf descendant bfqq of entity gets
++ * all its pending requests completed. The following
++ * instructions perform this delayed decrement, if
++ * needed. See the comments on
++ * num_groups_with_pending_reqs for details.
++ */
++ if (entity->in_groups_with_pending_reqs) {
++ entity->in_groups_with_pending_reqs = false;
++ bfqd->num_groups_with_pending_reqs--;
++ }
+ }
+ }
+
+@@ -3544,27 +3558,44 @@ static bool bfq_better_to_idle(struct bf
+ * fact, if there are active groups, then, for condition (i)
+ * to become false, it is enough that an active group contains
+ * more active processes or sub-groups than some other active
+- * group. We address this issue with the following bi-modal
+- * behavior, implemented in the function
++ * group. More precisely, for condition (i) to hold because of
++ * such a group, it is not even necessary that the group is
++ * (still) active: it is sufficient that, even if the group
++ * has become inactive, some of its descendant processes still
++ * have some request already dispatched but still waiting for
++ * completion. In fact, requests have still to be guaranteed
++ * their share of the throughput even after being
++ * dispatched. In this respect, it is easy to show that, if a
++ * group frequently becomes inactive while still having
++ * in-flight requests, and if, when this happens, the group is
++ * not considered in the calculation of whether the scenario
++ * is asymmetric, then the group may fail to be guaranteed its
++ * fair share of the throughput (basically because idling may
++ * not be performed for the descendant processes of the group,
++ * but it had to be). We address this issue with the
++ * following bi-modal behavior, implemented in the function
+ * bfq_symmetric_scenario().
+ *
+- * If there are active groups, then the scenario is tagged as
++ * If there are groups with requests waiting for completion
++ * (as commented above, some of these groups may even be
++ * already inactive), then the scenario is tagged as
+ * asymmetric, conservatively, without checking any of the
+ * conditions (i) and (ii). So the device is idled for bfqq.
+ * This behavior matches also the fact that groups are created
+- * exactly if controlling I/O (to preserve bandwidth and
+- * latency guarantees) is a primary concern.
++ * exactly if controlling I/O is a primary concern (to
++ * preserve bandwidth and latency guarantees).
+ *
+- * On the opposite end, if there are no active groups, then
+- * only condition (i) is actually controlled, i.e., provided
+- * that condition (i) holds, idling is not performed,
+- * regardless of whether condition (ii) holds. In other words,
+- * only if condition (i) does not hold, then idling is
+- * allowed, and the device tends to be prevented from queueing
+- * many requests, possibly of several processes. Since there
+- * are no active groups, then, to control condition (i) it is
+- * enough to check whether all active queues have the same
+- * weight.
++ * On the opposite end, if there are no groups with requests
++ * waiting for completion, then only condition (i) is actually
++ * controlled, i.e., provided that condition (i) holds, idling
++ * is not performed, regardless of whether condition (ii)
++ * holds. In other words, only if condition (i) does not hold,
++ * then idling is allowed, and the device tends to be
++ * prevented from queueing many requests, possibly of several
++ * processes. Since there are no groups with requests waiting
++ * for completion, then, to control condition (i) it is enough
++ * to check just whether all the queues with requests waiting
++ * for completion also have the same weight.
+ *
+ * Not checking condition (ii) evidently exposes bfqq to the
+ * risk of getting less throughput than its fair share.
+@@ -3622,10 +3653,11 @@ static bool bfq_better_to_idle(struct bf
+ * bfqq is weight-raised is checked explicitly here. More
+ * precisely, the compound condition below takes into account
+ * also the fact that, even if bfqq is being weight-raised,
+- * the scenario is still symmetric if all active queues happen
+- * to be weight-raised. Actually, we should be even more
+- * precise here, and differentiate between interactive weight
+- * raising and soft real-time weight raising.
++ * the scenario is still symmetric if all queues with requests
++ * waiting for completion happen to be
++ * weight-raised. Actually, we should be even more precise
++ * here, and differentiate between interactive weight raising
++ * and soft real-time weight raising.
+ *
+ * As a side note, it is worth considering that the above
+ * device-idling countermeasures may however fail in the
+@@ -5447,7 +5479,7 @@ static int bfq_init_queue(struct request
+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
+
+ bfqd->queue_weights_tree = RB_ROOT;
+- bfqd->num_active_groups = 0;
++ bfqd->num_groups_with_pending_reqs = 0;
+
+ INIT_LIST_HEAD(&bfqd->active_list);
+ INIT_LIST_HEAD(&bfqd->idle_list);
+--- a/block/bfq-iosched.h
++++ b/block/bfq-iosched.h
+@@ -196,6 +196,9 @@ struct bfq_entity {
+
+ /* flag, set to request a weight, ioprio or ioprio_class change */
+ int prio_changed;
++
++ /* flag, set if the entity is counted in groups_with_pending_reqs */
++ bool in_groups_with_pending_reqs;
+ };
+
+ struct bfq_group;
+@@ -448,10 +451,54 @@ struct bfq_data {
+ * bfq_weights_tree_[add|remove] for further details).
+ */
+ struct rb_root queue_weights_tree;
++
+ /*
+- * number of groups with requests still waiting for completion
++ * Number of groups with at least one descendant process that
++ * has at least one request waiting for completion. Note that
++ * this accounts for also requests already dispatched, but not
++ * yet completed. Therefore this number of groups may differ
++ * (be larger) than the number of active groups, as a group is
++ * considered active only if its corresponding entity has
++ * descendant queues with at least one request queued. This
++ * number is used to decide whether a scenario is symmetric.
++ * For a detailed explanation see comments on the computation
++ * of the variable asymmetric_scenario in the function
++ * bfq_better_to_idle().
++ *
++ * However, it is hard to compute this number exactly, for
++ * groups with multiple descendant processes. Consider a group
++ * that is inactive, i.e., that has no descendant process with
++ * pending I/O inside BFQ queues. Then suppose that
++ * num_groups_with_pending_reqs is still accounting for this
++ * group, because the group has descendant processes with some
++ * I/O request still in flight. num_groups_with_pending_reqs
++ * should be decremented when the in-flight request of the
++ * last descendant process is finally completed (assuming that
++ * nothing else has changed for the group in the meantime, in
++ * terms of composition of the group and active/inactive state of child
++ * groups and processes). To accomplish this, an additional
++ * pending-request counter must be added to entities, and must
++ * be updated correctly. To avoid this additional field and operations,
++ * we resort to the following tradeoff between simplicity and
++ * accuracy: for an inactive group that is still counted in
++ * num_groups_with_pending_reqs, we decrement
++ * num_groups_with_pending_reqs when the first descendant
++ * process of the group remains with no request waiting for
++ * completion.
++ *
++ * Even this simpler decrement strategy requires a little
++ * carefulness: to avoid multiple decrements, we flag a group,
++ * more precisely an entity representing a group, as still
++ * counted in num_groups_with_pending_reqs when it becomes
++ * inactive. Then, when the first descendant queue of the
++ * entity remains with no request waiting for completion,
++ * num_groups_with_pending_reqs is decremented, and this flag
++ * is reset. After this flag is reset for the entity,
++ * num_groups_with_pending_reqs won't be decremented any
++ * longer in case a new descendant queue of the entity remains
++ * with no request waiting for completion.
+ */
+- unsigned int num_active_groups;
++ unsigned int num_groups_with_pending_reqs;
+
+ /*
+ * Number of bfq_queues containing requests (including the
+--- a/block/bfq-wf2q.c
++++ b/block/bfq-wf2q.c
+@@ -1012,7 +1012,10 @@ static void __bfq_activate_entity(struct
+ container_of(entity, struct bfq_group, entity);
+ struct bfq_data *bfqd = bfqg->bfqd;
+
+- bfqd->num_active_groups++;
++ if (!entity->in_groups_with_pending_reqs) {
++ entity->in_groups_with_pending_reqs = true;
++ bfqd->num_groups_with_pending_reqs++;
++ }
+ }
+ #endif
+
--- /dev/null
+From foo@baz Wed Dec 22 01:08:34 PM CET 2021
+From: Yu Kuai <yukuai3@huawei.com>
+Date: Tue, 21 Dec 2021 19:38:49 +0800
+Subject: block, bfq: fix use after free in bfq_bfqq_expire
+To: <jack@suse.cz>, <gregkh@linuxfoundation.org>, <paolo.valente@linaro.org>, <axboe@kernel.dk>
+Cc: <linux-block@vger.kernel.org>, <linux-kernel@vger.kernel.org>, <yukuai3@huawei.com>, <yi.zhang@huawei.com>
+Message-ID: <20211221113849.2219126-6-yukuai3@huawei.com>
+
+From: Paolo Valente <paolo.valente@linaro.org>
+
+commit eed47d19d9362bdd958e4ab56af480b9dbf6b2b6 upstream.
+
+The function bfq_bfqq_expire() invokes the function
+__bfq_bfqq_expire(), and the latter may free the in-service bfq-queue.
+If this happens, then no other instruction of bfq_bfqq_expire() must
+be executed, or a use-after-free will occur.
+
+Based on the assumption that __bfq_bfqq_expire() invokes
+bfq_put_queue() on the in-service bfq-queue exactly once, the queue is
+assumed to be freed if its refcounter is equal to one right before
+invoking __bfq_bfqq_expire().
+
+But, since commit 9dee8b3b057e ("block, bfq: fix queue removal from
+weights tree"), this assumption is false: __bfq_bfqq_expire() may also
+invoke bfq_weights_tree_remove() and, since that same commit, the
+latter function may invoke bfq_put_queue() too. So __bfq_bfqq_expire()
+may invoke bfq_put_queue() twice, and this is the actual case where
+the in-service queue may happen to be freed.
+
+To address this issue, this commit moves the check on the refcounter
+of the queue right around the last bfq_put_queue() that may be invoked
+on the queue.
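+
+Stripped of the BFQ specifics, the pattern implemented by the hunks
+below is roughly the following compilable sketch (simplified,
+illustrative types; in the real code the sampling and the last put
+happen in __bfq_bfqd_reset_in_service(), and the result is propagated
+up through __bfq_bfqq_expire()):
+
+#include <stdbool.h>
+#include <stdlib.h>
+
+struct queue {
+        int ref; /* reference counter; the queue is freed when it drops to 0 */
+};
+
+static void put_queue(struct queue *q)
+{
+        if (--q->ref == 0)
+                free(q); /* last reference dropped: the queue is gone */
+}
+
+/*
+ * Sample the refcount right before the put that may be the last one,
+ * and report whether the queue has just been freed, so that the caller
+ * executes no further instruction on it.
+ */
+static bool reset_in_service(struct queue *q)
+{
+        int ref = q->ref;
+
+        put_queue(q);
+        return ref == 1; /* true: q has been freed, do not touch it */
+}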
+
+Fixes: 9dee8b3b057e ("block, bfq: fix queue removal from weights tree")
+Reported-by: Dmitrii Tcvetkov <demfloro@demfloro.ru>
+Reported-by: Douglas Anderson <dianders@chromium.org>
+Tested-by: Dmitrii Tcvetkov <demfloro@demfloro.ru>
+Tested-by: Douglas Anderson <dianders@chromium.org>
+Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Yu Kuai <yukuai3@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ block/bfq-iosched.c | 15 +++++++--------
+ block/bfq-iosched.h | 2 +-
+ block/bfq-wf2q.c | 17 +++++++++++++++--
+ 3 files changed, 23 insertions(+), 11 deletions(-)
+
+--- a/block/bfq-iosched.c
++++ b/block/bfq-iosched.c
+@@ -2816,7 +2816,7 @@ static void bfq_dispatch_remove(struct r
+ bfq_remove_request(q, rq);
+ }
+
+-static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
++static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+ {
+ /*
+ * If this bfqq is shared between multiple processes, check
+@@ -2849,9 +2849,11 @@ static void __bfq_bfqq_expire(struct bfq
+ /*
+ * All in-service entities must have been properly deactivated
+ * or requeued before executing the next function, which
+- * resets all in-service entites as no more in service.
++ * resets all in-service entities as no more in service. This
++ * may cause bfqq to be freed. If this happens, the next
++ * function returns true.
+ */
+- __bfq_bfqd_reset_in_service(bfqd);
++ return __bfq_bfqd_reset_in_service(bfqd);
+ }
+
+ /**
+@@ -3256,7 +3258,6 @@ void bfq_bfqq_expire(struct bfq_data *bf
+ bool slow;
+ unsigned long delta = 0;
+ struct bfq_entity *entity = &bfqq->entity;
+- int ref;
+
+ /*
+ * Check whether the process is slow (see bfq_bfqq_is_slow).
+@@ -3325,10 +3326,8 @@ void bfq_bfqq_expire(struct bfq_data *bf
+ * reason.
+ */
+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
+- ref = bfqq->ref;
+- __bfq_bfqq_expire(bfqd, bfqq);
+-
+- if (ref == 1) /* bfqq is gone, no more actions on it */
++ if (__bfq_bfqq_expire(bfqd, bfqq))
++ /* bfqq is gone, no more actions on it */
+ return;
+
+ bfqq->injected_service = 0;
+--- a/block/bfq-iosched.h
++++ b/block/bfq-iosched.h
+@@ -993,7 +993,7 @@ bool __bfq_deactivate_entity(struct bfq_
+ bool ins_into_idle_tree);
+ bool next_queue_may_preempt(struct bfq_data *bfqd);
+ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd);
+-void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd);
++bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd);
+ void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ bool ins_into_idle_tree, bool expiration);
+ void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+--- a/block/bfq-wf2q.c
++++ b/block/bfq-wf2q.c
+@@ -1600,7 +1600,8 @@ struct bfq_queue *bfq_get_next_queue(str
+ return bfqq;
+ }
+
+-void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
++/* returns true if the in-service queue gets freed */
++bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
+ {
+ struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
+ struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
+@@ -1624,8 +1625,20 @@ void __bfq_bfqd_reset_in_service(struct
+ * service tree either, then release the service reference to
+ * the queue it represents (taken with bfq_get_entity).
+ */
+- if (!in_serv_entity->on_st)
++ if (!in_serv_entity->on_st) {
++ /*
++ * If no process is referencing in_serv_bfqq any
++ * longer, then the service reference may be the only
++ * reference to the queue. If this is the case, then
++ * bfqq gets freed here.
++ */
++ int ref = in_serv_bfqq->ref;
+ bfq_put_queue(in_serv_bfqq);
++ if (ref == 1)
++ return true;
++ }
++
++ return false;
+ }
+
+ void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
--- /dev/null
+From foo@baz Wed Dec 22 01:08:34 PM CET 2021
+From: Yu Kuai <yukuai3@huawei.com>
+Date: Tue, 21 Dec 2021 19:38:45 +0800
+Subject: block, bfq: improve asymmetric scenarios detection
+To: <jack@suse.cz>, <gregkh@linuxfoundation.org>, <paolo.valente@linaro.org>, <axboe@kernel.dk>
+Cc: <linux-block@vger.kernel.org>, <linux-kernel@vger.kernel.org>, <yukuai3@huawei.com>, <yi.zhang@huawei.com>
+Message-ID: <20211221113849.2219126-2-yukuai3@huawei.com>
+
+From: Federico Motta <federico@willer.it>
+
+commit 2d29c9f89fcd9bf408fcdaaf515c90a169f22ecd upstream.
+
+bfq defines as asymmetric a scenario where an active entity, say E
+(representing either a single bfq_queue or a group of other entities),
+has a higher weight than some other entities. If the entity E does sync
+I/O in such a scenario, then bfq plugs the dispatch of the I/O of the
+other entities in the following situation: E is in service but
+temporarily has no pending I/O request. In fact, without this plugging,
+every time E stops being temporarily idle, it may find the
+internal queues of the storage device already filled with an
+out-of-control number of extra requests, from other entities. So E may
+have to wait for the service of these extra requests, before finally
+having its own requests served. This may easily break service
+guarantees, with E getting less than its fair share of the device
+throughput. Usually, the end result is that E gets the same fraction of
+the throughput as the other entities, instead of getting more, according
+to its higher weight.
+
+Yet there are two other more subtle cases where E, even if its weight is
+actually equal to or even lower than the weight of any other active
+entities, may get less than its fair share of the throughput in case the
+above I/O plugging is not performed:
+1. other entities issue larger requests than E;
+2. other entities contain more active child entities than E (or in
+ general tend to have more backlog than E).
+
+In the first case, other entities may get more service than E because
+they get larger requests than those of E served during the temporary
+idle periods of E. In the second case, other entities get more service
+because, by having many child entities, they have many requests ready
+for dispatching while E is temporarily idle.
+
+This commit addresses this issue by extending the definition of
+asymmetric scenario: a scenario is asymmetric when
+- active entities representing bfq_queues have differentiated weights,
+ as in the original definition
+or (inclusive)
+- one or more entities representing groups of entities are active.
+
+This broader definition makes sure that I/O plugging will be performed
+in all the above cases, provided that there is at least one active
+group. Of course, this definition is very coarse, so it will trigger
+I/O plugging also in cases where it is not needed, such as, e.g.,
+multiple active entities with just one child each, and all with the same
+I/O-request size. The reason for this coarse definition is just that a
+finer-grained definition would be rather heavy to compute.
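+
+In code terms, and leaving the rb-tree bookkeeping aside, the broader
+detection reduces to the check sketched below (a simplified,
+stand-alone rendition of the bfq_varied_queue_weights_or_active_groups()
+logic added by the hunks that follow; distinct_queue_weights stands in
+for the node count of queue_weights_tree):
+
+#include <stdbool.h>
+
+struct sched_data {
+        unsigned int distinct_queue_weights; /* nodes in queue_weights_tree */
+        unsigned int num_active_groups;      /* active group entities       */
+};
+
+/*
+ * The scenario is tagged as asymmetric if active queues do not all
+ * have the same weight, or if at least one group entity is active.
+ */
+static bool varied_queue_weights_or_active_groups(const struct sched_data *d)
+{
+        return d->distinct_queue_weights > 1 || d->num_active_groups > 0;
+}
+
+static bool symmetric_scenario(const struct sched_data *d)
+{
+        return !varied_queue_weights_or_active_groups(d);
+}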
+
+On the opposite end, even this new definition does not trigger I/O
+plugging in all cases where there is no active group, and all bfq_queues
+have the same weight. So, in these cases some unfairness may occur if
+there are asymmetries in I/O-request sizes. We made this choice because
+I/O plugging may lower throughput, and probably a user that has not
+created any group cares more about throughput than about perfect
+fairness. At any rate, as for possible applications that may care about
+service guarantees, bfq already guarantees a high responsiveness and a
+low latency to soft real-time applications automatically.
+
+Signed-off-by: Federico Motta <federico@willer.it>
+Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Yu Kuai <yukuai3@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ block/bfq-iosched.c | 223 ++++++++++++++++++++++++++++------------------------
+ block/bfq-iosched.h | 27 ++----
+ block/bfq-wf2q.c | 36 ++++----
+ 3 files changed, 155 insertions(+), 131 deletions(-)
+
+--- a/block/bfq-iosched.c
++++ b/block/bfq-iosched.c
+@@ -625,12 +625,13 @@ void bfq_pos_tree_add_move(struct bfq_da
+ }
+
+ /*
+- * Tell whether there are active queues or groups with differentiated weights.
++ * Tell whether there are active queues with different weights or
++ * active groups.
+ */
+-static bool bfq_differentiated_weights(struct bfq_data *bfqd)
++static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
+ {
+ /*
+- * For weights to differ, at least one of the trees must contain
++ * For queue weights to differ, queue_weights_tree must contain
+ * at least two nodes.
+ */
+ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
+@@ -638,9 +639,7 @@ static bool bfq_differentiated_weights(s
+ bfqd->queue_weights_tree.rb_node->rb_right)
+ #ifdef CONFIG_BFQ_GROUP_IOSCHED
+ ) ||
+- (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
+- (bfqd->group_weights_tree.rb_node->rb_left ||
+- bfqd->group_weights_tree.rb_node->rb_right)
++ (bfqd->num_active_groups > 0
+ #endif
+ );
+ }
+@@ -658,26 +657,25 @@ static bool bfq_differentiated_weights(s
+ * 3) all active groups at the same level in the groups tree have the same
+ * number of children.
+ *
+- * Unfortunately, keeping the necessary state for evaluating exactly the
+- * above symmetry conditions would be quite complex and time-consuming.
+- * Therefore this function evaluates, instead, the following stronger
+- * sub-conditions, for which it is much easier to maintain the needed
+- * state:
++ * Unfortunately, keeping the necessary state for evaluating exactly
++ * the last two symmetry sub-conditions above would be quite complex
++ * and time consuming. Therefore this function evaluates, instead,
++ * only the following stronger two sub-conditions, for which it is
++ * much easier to maintain the needed state:
+ * 1) all active queues have the same weight,
+- * 2) all active groups have the same weight,
+- * 3) all active groups have at most one active child each.
+- * In particular, the last two conditions are always true if hierarchical
+- * support and the cgroups interface are not enabled, thus no state needs
+- * to be maintained in this case.
++ * 2) there are no active groups.
++ * In particular, the last condition is always true if hierarchical
++ * support or the cgroups interface are not enabled, thus no state
++ * needs to be maintained in this case.
+ */
+ static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
+ {
+- return !bfq_differentiated_weights(bfqd);
++ return !bfq_varied_queue_weights_or_active_groups(bfqd);
+ }
+
+ /*
+ * If the weight-counter tree passed as input contains no counter for
+- * the weight of the input entity, then add that counter; otherwise just
++ * the weight of the input queue, then add that counter; otherwise just
+ * increment the existing counter.
+ *
+ * Note that weight-counter trees contain few nodes in mostly symmetric
+@@ -688,25 +686,25 @@ static bool bfq_symmetric_scenario(struc
+ * In most scenarios, the rate at which nodes are created/destroyed
+ * should be low too.
+ */
+-void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
++void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct rb_root *root)
+ {
++ struct bfq_entity *entity = &bfqq->entity;
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+ /*
+- * Do not insert if the entity is already associated with a
++ * Do not insert if the queue is already associated with a
+ * counter, which happens if:
+- * 1) the entity is associated with a queue,
+- * 2) a request arrival has caused the queue to become both
++ * 1) a request arrival has caused the queue to become both
+ * non-weight-raised, and hence change its weight, and
+ * backlogged; in this respect, each of the two events
+ * causes an invocation of this function,
+- * 3) this is the invocation of this function caused by the
++ * 2) this is the invocation of this function caused by the
+ * second event. This second invocation is actually useless,
+ * and we handle this fact by exiting immediately. More
+ * efficient or clearer solutions might possibly be adopted.
+ */
+- if (entity->weight_counter)
++ if (bfqq->weight_counter)
+ return;
+
+ while (*new) {
+@@ -716,7 +714,7 @@ void bfq_weights_tree_add(struct bfq_dat
+ parent = *new;
+
+ if (entity->weight == __counter->weight) {
+- entity->weight_counter = __counter;
++ bfqq->weight_counter = __counter;
+ goto inc_counter;
+ }
+ if (entity->weight < __counter->weight)
+@@ -725,66 +723,67 @@ void bfq_weights_tree_add(struct bfq_dat
+ new = &((*new)->rb_right);
+ }
+
+- entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
+- GFP_ATOMIC);
++ bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
++ GFP_ATOMIC);
+
+ /*
+ * In the unlucky event of an allocation failure, we just
+- * exit. This will cause the weight of entity to not be
+- * considered in bfq_differentiated_weights, which, in its
+- * turn, causes the scenario to be deemed wrongly symmetric in
+- * case entity's weight would have been the only weight making
+- * the scenario asymmetric. On the bright side, no unbalance
+- * will however occur when entity becomes inactive again (the
+- * invocation of this function is triggered by an activation
+- * of entity). In fact, bfq_weights_tree_remove does nothing
+- * if !entity->weight_counter.
++ * exit. This will cause the weight of queue to not be
++ * considered in bfq_varied_queue_weights_or_active_groups,
++ * which, in its turn, causes the scenario to be deemed
++ * wrongly symmetric in case bfqq's weight would have been
++ * the only weight making the scenario asymmetric. On the
++ * bright side, no unbalance will however occur when bfqq
++ * becomes inactive again (the invocation of this function
++ * is triggered by an activation of queue). In fact,
++ * bfq_weights_tree_remove does nothing if
++ * !bfqq->weight_counter.
+ */
+- if (unlikely(!entity->weight_counter))
++ if (unlikely(!bfqq->weight_counter))
+ return;
+
+- entity->weight_counter->weight = entity->weight;
+- rb_link_node(&entity->weight_counter->weights_node, parent, new);
+- rb_insert_color(&entity->weight_counter->weights_node, root);
++ bfqq->weight_counter->weight = entity->weight;
++ rb_link_node(&bfqq->weight_counter->weights_node, parent, new);
++ rb_insert_color(&bfqq->weight_counter->weights_node, root);
+
+ inc_counter:
+- entity->weight_counter->num_active++;
++ bfqq->weight_counter->num_active++;
+ }
+
+ /*
+- * Decrement the weight counter associated with the entity, and, if the
++ * Decrement the weight counter associated with the queue, and, if the
+ * counter reaches 0, remove the counter from the tree.
+ * See the comments to the function bfq_weights_tree_add() for considerations
+ * about overhead.
+ */
+ void __bfq_weights_tree_remove(struct bfq_data *bfqd,
+- struct bfq_entity *entity,
++ struct bfq_queue *bfqq,
+ struct rb_root *root)
+ {
+- if (!entity->weight_counter)
++ if (!bfqq->weight_counter)
+ return;
+
+- entity->weight_counter->num_active--;
+- if (entity->weight_counter->num_active > 0)
++ bfqq->weight_counter->num_active--;
++ if (bfqq->weight_counter->num_active > 0)
+ goto reset_entity_pointer;
+
+- rb_erase(&entity->weight_counter->weights_node, root);
+- kfree(entity->weight_counter);
++ rb_erase(&bfqq->weight_counter->weights_node, root);
++ kfree(bfqq->weight_counter);
+
+ reset_entity_pointer:
+- entity->weight_counter = NULL;
++ bfqq->weight_counter = NULL;
+ }
+
+ /*
+- * Invoke __bfq_weights_tree_remove on bfqq and all its inactive
+- * parent entities.
++ * Invoke __bfq_weights_tree_remove on bfqq and decrement the number
++ * of active groups for each queue's inactive parent entity.
+ */
+ void bfq_weights_tree_remove(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+ {
+ struct bfq_entity *entity = bfqq->entity.parent;
+
+- __bfq_weights_tree_remove(bfqd, &bfqq->entity,
++ __bfq_weights_tree_remove(bfqd, bfqq,
+ &bfqd->queue_weights_tree);
+
+ for_each_entity(entity) {
+@@ -798,17 +797,13 @@ void bfq_weights_tree_remove(struct bfq_
+ * next_in_service for details on why
+ * in_service_entity must be checked too).
+ *
+- * As a consequence, the weight of entity is
+- * not to be removed. In addition, if entity
+- * is active, then its parent entities are
+- * active as well, and thus their weights are
+- * not to be removed either. In the end, this
+- * loop must stop here.
++ * As a consequence, its parent entities are
++ * active as well, and thus this loop must
++ * stop here.
+ */
+ break;
+ }
+- __bfq_weights_tree_remove(bfqd, entity,
+- &bfqd->group_weights_tree);
++ bfqd->num_active_groups--;
+ }
+ }
+
+@@ -3521,9 +3516,11 @@ static bool bfq_better_to_idle(struct bf
+ * symmetric scenario where:
+ * (i) each of these processes must get the same throughput as
+ * the others;
+- * (ii) all these processes have the same I/O pattern
+- (either sequential or random).
+- * In fact, in such a scenario, the drive will tend to treat
++ * (ii) the I/O of each process has the same properties, in
++ * terms of locality (sequential or random), direction
++ * (reads or writes), request sizes, greediness
++ * (from I/O-bound to sporadic), and so on.
++ * In fact, in such a scenario, the drive tends to treat
+ * the requests of each of these processes in about the same
+ * way as the requests of the others, and thus to provide
+ * each of these processes with about the same throughput
+@@ -3532,18 +3529,50 @@ static bool bfq_better_to_idle(struct bf
+ * certainly needed to guarantee that bfqq receives its
+ * assigned fraction of the device throughput (see [1] for
+ * details).
++ * The problem is that idling may significantly reduce
++ * throughput with certain combinations of types of I/O and
++ * devices. An important example is sync random I/O, on flash
++ * storage with command queueing. So, unless bfqq falls in the
++ * above cases where idling also boosts throughput, it would
++ * be important to check conditions (i) and (ii) accurately,
++ * so as to avoid idling when not strictly needed for service
++ * guarantees.
++ *
++ * Unfortunately, it is extremely difficult to thoroughly
++ * check condition (ii). And, in case there are active groups,
++ * it becomes very difficult to check condition (i) too. In
++ * fact, if there are active groups, then, for condition (i)
++ * to become false, it is enough that an active group contains
++ * more active processes or sub-groups than some other active
++ * group. We address this issue with the following bi-modal
++ * behavior, implemented in the function
++ * bfq_symmetric_scenario().
+ *
+- * We address this issue by controlling, actually, only the
+- * symmetry sub-condition (i), i.e., provided that
+- * sub-condition (i) holds, idling is not performed,
+- * regardless of whether sub-condition (ii) holds. In other
+- * words, only if sub-condition (i) holds, then idling is
++ * If there are active groups, then the scenario is tagged as
++ * asymmetric, conservatively, without checking any of the
++ * conditions (i) and (ii). So the device is idled for bfqq.
++ * This behavior matches also the fact that groups are created
++ * exactly if controlling I/O (to preserve bandwidth and
++ * latency guarantees) is a primary concern.
++ *
++ * On the opposite end, if there are no active groups, then
++ * only condition (i) is actually controlled, i.e., provided
++ * that condition (i) holds, idling is not performed,
++ * regardless of whether condition (ii) holds. In other words,
++ * only if condition (i) does not hold, then idling is
+ * allowed, and the device tends to be prevented from queueing
+- * many requests, possibly of several processes. The reason
+- * for not controlling also sub-condition (ii) is that we
+- * exploit preemption to preserve guarantees in case of
+- * symmetric scenarios, even if (ii) does not hold, as
+- * explained in the next two paragraphs.
++ * many requests, possibly of several processes. Since there
++ * are no active groups, then, to control condition (i) it is
++ * enough to check whether all active queues have the same
++ * weight.
++ *
++ * Not checking condition (ii) evidently exposes bfqq to the
++ * risk of getting less throughput than its fair share.
++ * However, for queues with the same weight, a further
++ * mechanism, preemption, mitigates or even eliminates this
++ * problem. And it does so without consequences on overall
++ * throughput. This mechanism and its benefits are explained
++ * in the next three paragraphs.
+ *
+ * Even if a queue, say Q, is expired when it remains idle, Q
+ * can still preempt the new in-service queue if the next
+@@ -3557,11 +3586,7 @@ static bool bfq_better_to_idle(struct bf
+ * idling allows the internal queues of the device to contain
+ * many requests, and thus to reorder requests, we can rather
+ * safely assume that the internal scheduler still preserves a
+- * minimum of mid-term fairness. The motivation for using
+- * preemption instead of idling is that, by not idling,
+- * service guarantees are preserved without minimally
+- * sacrificing throughput. In other words, both a high
+- * throughput and its desired distribution are obtained.
++ * minimum of mid-term fairness.
+ *
+ * More precisely, this preemption-based, idleless approach
+ * provides fairness in terms of IOPS, and not sectors per
+@@ -3580,27 +3605,27 @@ static bool bfq_better_to_idle(struct bf
+ * 1024/8 times as high as the service received by the other
+ * queue.
+ *
+- * On the other hand, device idling is performed, and thus
+- * pure sector-domain guarantees are provided, for the
+- * following queues, which are likely to need stronger
+- * throughput guarantees: weight-raised queues, and queues
+- * with a higher weight than other queues. When such queues
+- * are active, sub-condition (i) is false, which triggers
+- * device idling.
++ * The motivation for using preemption instead of idling (for
++ * queues with the same weight) is that, by not idling,
++ * service guarantees are preserved (completely or at least in
++ * part) without minimally sacrificing throughput. And, if
++ * there is no active group, then the primary expectation for
++ * this device is probably a high throughput.
+ *
+- * According to the above considerations, the next variable is
+- * true (only) if sub-condition (i) holds. To compute the
+- * value of this variable, we not only use the return value of
+- * the function bfq_symmetric_scenario(), but also check
+- * whether bfqq is being weight-raised, because
+- * bfq_symmetric_scenario() does not take into account also
+- * weight-raised queues (see comments on
+- * bfq_weights_tree_add()). In particular, if bfqq is being
+- * weight-raised, it is important to idle only if there are
+- * other, non-weight-raised queues that may steal throughput
+- * to bfqq. Actually, we should be even more precise, and
+- * differentiate between interactive weight raising and
+- * soft real-time weight raising.
++ * We are now left only with explaining the additional
++ * compound condition that is checked below for deciding
++ * whether the scenario is asymmetric. To explain this
++ * compound condition, we need to add that the function
++ * bfq_symmetric_scenario checks the weights of only
++ * non-weight-raised queues, for efficiency reasons (see
++ * comments on bfq_weights_tree_add()). Then the fact that
++ * bfqq is weight-raised is checked explicitly here. More
++ * precisely, the compound condition below takes into account
++ * also the fact that, even if bfqq is being weight-raised,
++ * the scenario is still symmetric if all active queues happen
++ * to be weight-raised. Actually, we should be even more
++ * precise here, and differentiate between interactive weight
++ * raising and soft real-time weight raising.
+ *
+ * As a side note, it is worth considering that the above
+ * device-idling countermeasures may however fail in the
+@@ -5422,7 +5447,7 @@ static int bfq_init_queue(struct request
+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
+
+ bfqd->queue_weights_tree = RB_ROOT;
+- bfqd->group_weights_tree = RB_ROOT;
++ bfqd->num_active_groups = 0;
+
+ INIT_LIST_HEAD(&bfqd->active_list);
+ INIT_LIST_HEAD(&bfqd->idle_list);
+--- a/block/bfq-iosched.h
++++ b/block/bfq-iosched.h
+@@ -108,15 +108,14 @@ struct bfq_sched_data {
+ };
+
+ /**
+- * struct bfq_weight_counter - counter of the number of all active entities
++ * struct bfq_weight_counter - counter of the number of all active queues
+ * with a given weight.
+ */
+ struct bfq_weight_counter {
+- unsigned int weight; /* weight of the entities this counter refers to */
+- unsigned int num_active; /* nr of active entities with this weight */
++ unsigned int weight; /* weight of the queues this counter refers to */
++ unsigned int num_active; /* nr of active queues with this weight */
+ /*
+- * Weights tree member (see bfq_data's @queue_weights_tree and
+- * @group_weights_tree)
++ * Weights tree member (see bfq_data's @queue_weights_tree)
+ */
+ struct rb_node weights_node;
+ };
+@@ -151,8 +150,6 @@ struct bfq_weight_counter {
+ struct bfq_entity {
+ /* service_tree member */
+ struct rb_node rb_node;
+- /* pointer to the weight counter associated with this entity */
+- struct bfq_weight_counter *weight_counter;
+
+ /*
+ * Flag, true if the entity is on a tree (either the active or
+@@ -266,6 +263,9 @@ struct bfq_queue {
+ /* entity representing this queue in the scheduler */
+ struct bfq_entity entity;
+
++ /* pointer to the weight counter associated with this entity */
++ struct bfq_weight_counter *weight_counter;
++
+ /* maximum budget allowed from the feedback mechanism */
+ int max_budget;
+ /* budget expiration (in jiffies) */
+@@ -449,14 +449,9 @@ struct bfq_data {
+ */
+ struct rb_root queue_weights_tree;
+ /*
+- * rbtree of non-queue @bfq_entity weight counters, sorted by
+- * weight. Used to keep track of whether all @bfq_groups have
+- * the same weight. The tree contains one counter for each
+- * distinct weight associated to some active @bfq_group (see
+- * the comments to the functions bfq_weights_tree_[add|remove]
+- * for further details).
++ * number of groups with requests still waiting for completion
+ */
+- struct rb_root group_weights_tree;
++ unsigned int num_active_groups;
+
+ /*
+ * Number of bfq_queues containing requests (including the
+@@ -854,10 +849,10 @@ struct bfq_queue *bic_to_bfqq(struct bfq
+ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
+ struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
+ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+-void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
++void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct rb_root *root);
+ void __bfq_weights_tree_remove(struct bfq_data *bfqd,
+- struct bfq_entity *entity,
++ struct bfq_queue *bfqq,
+ struct rb_root *root);
+ void bfq_weights_tree_remove(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq);
+--- a/block/bfq-wf2q.c
++++ b/block/bfq-wf2q.c
+@@ -788,25 +788,29 @@ __bfq_entity_update_weight_prio(struct b
+ new_weight = entity->orig_weight *
+ (bfqq ? bfqq->wr_coeff : 1);
+ /*
+- * If the weight of the entity changes, remove the entity
+- * from its old weight counter (if there is a counter
+- * associated with the entity), and add it to the counter
+- * associated with its new weight.
++ * If the weight of the entity changes, and the entity is a
++ * queue, remove the entity from its old weight counter (if
++ * there is a counter associated with the entity).
+ */
+ if (prev_weight != new_weight) {
+- root = bfqq ? &bfqd->queue_weights_tree :
+- &bfqd->group_weights_tree;
+- __bfq_weights_tree_remove(bfqd, entity, root);
++ if (bfqq) {
++ root = &bfqd->queue_weights_tree;
++ __bfq_weights_tree_remove(bfqd, bfqq, root);
++ } else
++ bfqd->num_active_groups--;
+ }
+ entity->weight = new_weight;
+ /*
+- * Add the entity to its weights tree only if it is
+- * not associated with a weight-raised queue.
++ * Add the entity, if it is not a weight-raised queue,
++ * to the counter associated with its new weight.
+ */
+- if (prev_weight != new_weight &&
+- (bfqq ? bfqq->wr_coeff == 1 : 1))
+- /* If we get here, root has been initialized. */
+- bfq_weights_tree_add(bfqd, entity, root);
++ if (prev_weight != new_weight) {
++ if (bfqq && bfqq->wr_coeff == 1) {
++ /* If we get here, root has been initialized. */
++ bfq_weights_tree_add(bfqd, bfqq, root);
++ } else
++ bfqd->num_active_groups++;
++ }
+
+ new_st->wsum += entity->weight;
+
+@@ -1012,9 +1016,9 @@ static void __bfq_activate_entity(struct
+ if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */
+ struct bfq_group *bfqg =
+ container_of(entity, struct bfq_group, entity);
++ struct bfq_data *bfqd = bfqg->bfqd;
+
+- bfq_weights_tree_add(bfqg->bfqd, entity,
+- &bfqd->group_weights_tree);
++ bfqd->num_active_groups++;
+ }
+ #endif
+
+@@ -1692,7 +1696,7 @@ void bfq_add_bfqq_busy(struct bfq_data *
+
+ if (!bfqq->dispatched)
+ if (bfqq->wr_coeff == 1)
+- bfq_weights_tree_add(bfqd, &bfqq->entity,
++ bfq_weights_tree_add(bfqd, bfqq,
+ &bfqd->queue_weights_tree);
+
+ if (bfqq->wr_coeff > 1)