Merge tag 'for-6.9/block-20240310' of git://git.kernel.dk/linux

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9c475a89e3afec4e134c80b4694c761940ff7d8d..555ada922cf06021124eb3170983fc308e8d2a38 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -21,7 +21,6 @@
 #include <linux/llist.h>
 #include <linux/cpu.h>
 #include <linux/cache.h>
-#include <linux/sched/sysctl.h>
 #include <linux/sched/topology.h>
 #include <linux/sched/signal.h>
 #include <linux/delay.h>
@@ -322,7 +321,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
        RB_CLEAR_NODE(&rq->rb_node);
        rq->tag = BLK_MQ_NO_TAG;
        rq->internal_tag = BLK_MQ_NO_TAG;
-       rq->start_time_ns = ktime_get_ns();
+       rq->start_time_ns = blk_time_get_ns();
        rq->part = NULL;
        blk_crypto_rq_set_defaults(rq);
 }
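
Every ktime_get_ns() call site in the hot paths below switches to blk_time_get_ns(). The idea in this series is to cache the clock read in the task's blk_plug so a whole batch of plugged requests pays for a single ktime_get_ns() call. A minimal sketch of such a helper, assuming the plug-level cur_ktime field and the PF_BLOCK_TS task flag introduced alongside it (the real helper lives in block/blk.h):

static inline u64 blk_time_get_ns(void)
{
	struct blk_plug *plug = current->plug;

	/* No plug, or not in task context: fall back to a raw clock read. */
	if (!plug || !in_task())
		return ktime_get_ns();

	/*
	 * 0 doubles as "not sampled yet"; if the clock genuinely reads 0 we
	 * merely pay for one extra ktime_get_ns() call.
	 */
	if (!plug->cur_ktime) {
		plug->cur_ktime = ktime_get_ns();
		current->flags |= PF_BLOCK_TS;
	}
	return plug->cur_ktime;
}
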
@@ -332,7 +331,7 @@ EXPORT_SYMBOL(blk_rq_init);
 static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
 {
        if (blk_mq_need_time_stamp(rq))
-               rq->start_time_ns = ktime_get_ns();
+               rq->start_time_ns = blk_time_get_ns();
        else
                rq->start_time_ns = 0;
 
@@ -443,7 +442,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 
        /* alloc_time includes depth and tag waits */
        if (blk_queue_rq_alloc_time(q))
-               alloc_time_ns = ktime_get_ns();
+               alloc_time_ns = blk_time_get_ns();
 
        if (data->cmd_flags & REQ_NOWAIT)
                data->flags |= BLK_MQ_REQ_NOWAIT;
@@ -628,7 +627,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 
        /* alloc_time includes depth and tag waits */
        if (blk_queue_rq_alloc_time(q))
-               alloc_time_ns = ktime_get_ns();
+               alloc_time_ns = blk_time_get_ns();
 
        /*
         * If the tag allocator sleeps we could get an allocation for a
@@ -1041,7 +1040,7 @@ static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 {
        if (blk_mq_need_time_stamp(rq))
-               __blk_mq_end_request_acct(rq, ktime_get_ns());
+               __blk_mq_end_request_acct(rq, blk_time_get_ns());
 
        blk_mq_finish_request(rq);
 
@@ -1084,7 +1083,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
        u64 now = 0;
 
        if (iob->need_ts)
-               now = ktime_get_ns();
+               now = blk_time_get_ns();
 
        while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
                prefetch(rq->bio);
@@ -1167,10 +1166,11 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq)
        if (force_irqthreads())
                return false;
 
-       /* same CPU or cache domain?  Complete locally */
+       /* same CPU or cache domain and capacity?  Complete locally */
        if (cpu == rq->mq_ctx->cpu ||
            (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
-            cpus_share_cache(cpu, rq->mq_ctx->cpu)))
+            cpus_share_cache(cpu, rq->mq_ctx->cpu) &&
+            cpus_equal_capacity(cpu, rq->mq_ctx->cpu)))
                return false;
 
        /* don't try to IPI to an offline CPU */
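
Beyond sharing cache, the local-completion fast path now also requires the submitting and completing CPUs to have equal compute capacity, so that I/O issued from a big core is not quietly completed on a slower little core (or vice versa) on asymmetric systems. A sketch of what the scheduler-side helper can look like, assuming arch_scale_cpu_capacity() as the capacity source:

bool cpus_equal_capacity(int this_cpu, int that_cpu)
{
	/* Symmetric systems never differ in capacity. */
	if (!sched_asym_cpucap_active())
		return true;

	if (this_cpu == that_cpu)
		return true;

	return arch_scale_cpu_capacity(this_cpu) ==
	       arch_scale_cpu_capacity(that_cpu);
}
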
@@ -1254,7 +1254,7 @@ void blk_mq_start_request(struct request *rq)
 
        if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) &&
            !blk_rq_is_passthrough(rq)) {
-               rq->io_start_time_ns = ktime_get_ns();
+               rq->io_start_time_ns = blk_time_get_ns();
                rq->stats_sectors = blk_rq_sectors(rq);
                rq->rq_flags |= RQF_STATS;
                rq_qos_issue(q, rq);
@@ -1409,22 +1409,10 @@ blk_status_t blk_execute_rq(struct request *rq, bool at_head)
        blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
        blk_mq_run_hw_queue(hctx, false);
 
-       if (blk_rq_is_poll(rq)) {
+       if (blk_rq_is_poll(rq))
                blk_rq_poll_completion(rq, &wait.done);
-       } else {
-               /*
-                * Prevent hang_check timer from firing at us during very long
-                * I/O
-                */
-               unsigned long hang_check = sysctl_hung_task_timeout_secs;
-
-               if (hang_check)
-                       while (!wait_for_completion_io_timeout(&wait.done,
-                                       hang_check * (HZ/2)))
-                               ;
-               else
-                       wait_for_completion_io(&wait.done);
-       }
+       else
+               blk_wait_io(&wait.done);
 
        return wait.ret;
 }
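
The open-coded hung-task dance is folded into a blk_wait_io() helper shared with other block-layer waiters, which is also why the <linux/sched/sysctl.h> include could be dropped at the top of the file. A sketch that keeps the old semantics, waking at half the hung-task timeout so the hang-check timer never sees an uninterrupted sleep:

static inline void blk_wait_io(struct completion *done)
{
	/* Prevent the hang_check timer from firing during very long I/O. */
	unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;

	if (timeout)
		while (!wait_for_completion_io_timeout(done, timeout))
			;
	else
		wait_for_completion_io(done);
}
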
@@ -2892,9 +2880,6 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
        };
        struct request *rq;
 
-       if (blk_mq_attempt_bio_merge(q, bio, nsegs))
-               return NULL;
-
        rq_qos_throttle(q, bio);
 
        if (plug) {
@@ -2913,22 +2898,31 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
 }
 
 /*
- * Check if we can use the passed on request for submitting the passed in bio,
- * and remove it from the request list if it can be used.
+ * Check if there is a suitable cached request and return it.
  */
-static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
-               struct bio *bio)
+static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
+               struct request_queue *q, blk_opf_t opf)
 {
-       enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf);
-       enum hctx_type hctx_type = rq->mq_hctx->type;
+       enum hctx_type type = blk_mq_get_hctx_type(opf);
+       struct request *rq;
 
-       WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq);
+       if (!plug)
+               return NULL;
+       rq = rq_list_peek(&plug->cached_rq);
+       if (!rq || rq->q != q)
+               return NULL;
+       if (type != rq->mq_hctx->type &&
+           (type != HCTX_TYPE_READ || rq->mq_hctx->type != HCTX_TYPE_DEFAULT))
+               return NULL;
+       if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
+               return NULL;
+       return rq;
+}
 
-       if (type != hctx_type &&
-           !(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT))
-               return false;
-       if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf))
-               return false;
+static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
+               struct bio *bio)
+{
+       WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq);
 
        /*
         * If any qos ->throttle() end up blocking, we will have flushed the
@@ -2941,7 +2935,6 @@ static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
        blk_mq_rq_time_init(rq, 0);
        rq->cmd_flags = bio->bi_opf;
        INIT_LIST_HEAD(&rq->queuelist);
-       return true;
 }
 
 /**
@@ -2963,50 +2956,43 @@ void blk_mq_submit_bio(struct bio *bio)
        struct blk_plug *plug = blk_mq_plug(bio);
        const int is_sync = op_is_sync(bio->bi_opf);
        struct blk_mq_hw_ctx *hctx;
-       struct request *rq = NULL;
        unsigned int nr_segs = 1;
+       struct request *rq;
        blk_status_t ret;
 
        bio = blk_queue_bounce(bio, q);
 
-       if (plug) {
-               rq = rq_list_peek(&plug->cached_rq);
-               if (rq && rq->q != q)
-                       rq = NULL;
-       }
-       if (rq) {
-               if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
-                       bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
-                       if (!bio)
-                               return;
-               }
-               if (!bio_integrity_prep(bio))
-                       return;
-               if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
-                       return;
-               if (blk_mq_use_cached_rq(rq, plug, bio))
-                       goto done;
-               percpu_ref_get(&q->q_usage_counter);
-       } else {
+       /*
+        * If the plug has a cached request for this queue, try to use it.
+        *
+        * The cached request already holds a q_usage_counter reference and we
+        * don't have to acquire a new one if we use it.
+        */
+       rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
+       if (!rq) {
                if (unlikely(bio_queue_enter(bio)))
                        return;
-               if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
-                       bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
-                       if (!bio)
-                               goto fail;
-               }
-               if (!bio_integrity_prep(bio))
-                       goto fail;
        }
 
-       rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
-       if (unlikely(!rq)) {
-fail:
-               blk_queue_exit(q);
-               return;
+       if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
+               bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
+               if (!bio)
+                       goto queue_exit;
+       }
+       if (!bio_integrity_prep(bio))
+               goto queue_exit;
+
+       if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
+               goto queue_exit;
+
+       if (!rq) {
+               rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
+               if (unlikely(!rq))
+                       goto queue_exit;
+       } else {
+               blk_mq_use_cached_rq(rq, plug, bio);
        }
 
-done:
        trace_block_getrq(bio);
 
        rq_qos_track(q, rq, bio);
@@ -3037,6 +3023,15 @@ done:
        } else {
                blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq));
        }
+       return;
+
+queue_exit:
+       /*
+        * Don't drop the queue reference if we were trying to use a cached
+        * request and thus didn't acquire one.
+        */
+       if (!rq)
+               blk_queue_exit(q);
 }
 
 #ifdef CONFIG_BLK_MQ_STACKING
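
The submit path is restructured around a peek/consume split: blk_mq_peek_cached_request() only checks that the plugged request is suitable (same queue, compatible hctx type, matching flush-ness) and leaves it on plug->cached_rq, while blk_mq_use_cached_rq() consumes it only after splitting, integrity preparation, and the merge attempt have all succeeded. A merged or failed bio therefore never strips a request from the cache, and the single queue_exit label can tell the two reference-counting cases apart: a cached request already holds a q_usage_counter reference, so only the bio_queue_enter() path must drop one on error.
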
@@ -3098,7 +3093,7 @@ blk_status_t blk_insert_cloned_request(struct request *rq)
        blk_mq_run_dispatch_ops(q,
                        ret = blk_mq_request_issue_directly(rq, true));
        if (ret)
-               blk_account_io_done(rq, ktime_get_ns());
+               blk_account_io_done(rq, blk_time_get_ns());
        return ret;
 }
 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
@@ -4078,15 +4073,16 @@ void blk_mq_release(struct request_queue *q)
        blk_mq_sysfs_deinit(q);
 }
 
-static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
-               void *queuedata)
+struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
+               struct queue_limits *lim, void *queuedata)
 {
+       struct queue_limits default_lim = { };
        struct request_queue *q;
        int ret;
 
-       q = blk_alloc_queue(set->numa_node);
-       if (!q)
-               return ERR_PTR(-ENOMEM);
+       q = blk_alloc_queue(lim ? lim : &default_lim, set->numa_node);
+       if (IS_ERR(q))
+               return q;
        q->queuedata = queuedata;
        ret = blk_mq_init_allocated_queue(set, q);
        if (ret) {
@@ -4095,20 +4091,15 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
        }
        return q;
 }
-
-struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
-{
-       return blk_mq_init_queue_data(set, NULL);
-}
-EXPORT_SYMBOL(blk_mq_init_queue);
+EXPORT_SYMBOL(blk_mq_alloc_queue);
 
 /**
  * blk_mq_destroy_queue - shutdown a request queue
  * @q: request queue to shutdown
  *
- * This shuts down a request queue allocated by blk_mq_init_queue(). All future
+ * This shuts down a request queue allocated by blk_mq_alloc_queue(). All future
  * requests will be failed with -ENODEV. The caller is responsible for dropping
- * the reference from blk_mq_init_queue() by calling blk_put_queue().
+ * the reference from blk_mq_alloc_queue() by calling blk_put_queue().
  *
  * Context: can sleep
  */
@@ -4129,13 +4120,14 @@ void blk_mq_destroy_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_mq_destroy_queue);
 
-struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
+struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
+               struct queue_limits *lim, void *queuedata,
                struct lock_class_key *lkclass)
 {
        struct request_queue *q;
        struct gendisk *disk;
 
-       q = blk_mq_init_queue_data(set, queuedata);
+       q = blk_mq_alloc_queue(set, lim, queuedata);
        if (IS_ERR(q))
                return ERR_CAST(q);
 
@@ -4389,7 +4381,7 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
        if (set->nr_maps == 1)
                set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
 
-       if (set->ops->map_queues && !is_kdump_kernel()) {
+       if (set->ops->map_queues) {
                int i;
 
                /*
@@ -4488,14 +4480,12 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 
        /*
         * If a crashdump is active, then we are potentially in a very
-        * memory constrained environment. Limit us to 1 queue and
-        * 64 tags to prevent using too much memory.
+        * memory constrained environment. Limit us to 64 tags to prevent
+        * using too much memory.
         */
-       if (is_kdump_kernel()) {
-               set->nr_hw_queues = 1;
-               set->nr_maps = 1;
+       if (is_kdump_kernel())
                set->queue_depth = min(64U, set->queue_depth);
-       }
+
        /*
         * There is no use for more h/w queues than cpus if we just have
         * a single map
@@ -4525,7 +4515,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
                                                  GFP_KERNEL, set->numa_node);
                if (!set->map[i].mq_map)
                        goto out_free_mq_map;
-               set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
+               set->map[i].nr_queues = set->nr_hw_queues;
        }
 
        blk_mq_update_queue_map(set);
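
The kdump special case shrinks accordingly: since a crashdump kernel normally boots with a single usable CPU, the "no more hardware queues than CPUs" clamp that follows the capped queue depth already bounds nr_hw_queues, so forcing nr_hw_queues = 1 and nr_maps = 1 (and the matching is_kdump_kernel() checks in the queue-map paths) becomes redundant; only the 64-tag depth cap is still needed to bound memory use.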