Merge tag 'for-6.10/block-20240511' of git://git.kernel.dk/linux
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 32afb87efbd0ef6a4814d05aa79020a07cbfc19f..8e01e4b32e100f45a5346d22fc652736a33cd0cd 100644
@@ -28,6 +28,7 @@
 #include <linux/prefetch.h>
 #include <linux/blk-crypto.h>
 #include <linux/part_stat.h>
+#include <linux/sched/isolation.h>
 
 #include <trace/events/block.h>
 
@@ -690,6 +691,8 @@ static void blk_mq_finish_request(struct request *rq)
 {
        struct request_queue *q = rq->q;
 
+       blk_zone_finish_request(rq);
+
        if (rq->rq_flags & RQF_USE_SCHED) {
                q->elevator->type->ops.finish_request(rq);
                /*
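
Note: blk_zone_finish_request() is the completion-side hook of the zone
write plugging machinery merged here. A minimal sketch of the helper,
assuming the block/blk.h inline and the RQF_ZONE_WRITE_PLUGGING request
flag introduced by the same series:

    static inline void blk_zone_finish_request(struct request *rq)
    {
            /* Release the zone write plug of a plugged zone write. */
            if (rq->rq_flags & RQF_ZONE_WRITE_PLUGGING)
                    blk_zone_write_plug_finish_request(rq);
    }
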
@@ -761,31 +764,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
 }
 EXPORT_SYMBOL(blk_dump_rq_flags);
 
-static void req_bio_endio(struct request *rq, struct bio *bio,
-                         unsigned int nbytes, blk_status_t error)
-{
-       if (unlikely(error)) {
-               bio->bi_status = error;
-       } else if (req_op(rq) == REQ_OP_ZONE_APPEND) {
-               /*
-                * Partial zone append completions cannot be supported as the
-                * BIO fragments may end up not being written sequentially.
-                */
-               if (bio->bi_iter.bi_size != nbytes)
-                       bio->bi_status = BLK_STS_IOERR;
-               else
-                       bio->bi_iter.bi_sector = rq->__sector;
-       }
-
-       bio_advance(bio, nbytes);
-
-       if (unlikely(rq->rq_flags & RQF_QUIET))
-               bio_set_flag(bio, BIO_QUIET);
-       /* don't actually finish bio if it's part of flush sequence */
-       if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
-               bio_endio(bio);
-}
-
 static void blk_account_io_completion(struct request *req, unsigned int bytes)
 {
        if (req->part && blk_do_io_stat(req)) {
@@ -845,8 +823,7 @@ static void blk_complete_request(struct request *req)
                /* Completion has already been traced */
                bio_clear_flag(bio, BIO_TRACE_COMPLETION);
 
-               if (req_op(req) == REQ_OP_ZONE_APPEND)
-                       bio->bi_iter.bi_sector = req->__sector;
+               blk_zone_update_request_bio(req, bio);
 
                if (!is_flush)
                        bio_endio(bio);
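
Note: blk_zone_update_request_bio() generalizes the open-coded zone append
sector update removed here so that it also covers plugged zone writes.
Roughly, per the block/blk.h helper added by this series (sketch, not
verbatim):

    static inline void blk_zone_update_request_bio(struct request *rq,
                                                   struct bio *bio)
    {
            /*
             * For zone append, report the sector actually written back to
             * the BIO issuer; for plugged zone writes, restore the original
             * BIO sector so the zone write plug can be found at bio_endio().
             */
            if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio))
                    bio->bi_iter.bi_sector = rq->__sector;
    }
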
@@ -889,6 +866,8 @@ static void blk_complete_request(struct request *req)
 bool blk_update_request(struct request *req, blk_status_t error,
                unsigned int nr_bytes)
 {
+       bool is_flush = req->rq_flags & RQF_FLUSH_SEQ;
+       bool quiet = req->rq_flags & RQF_QUIET;
        int total_bytes;
 
        trace_block_rq_complete(req, error, nr_bytes);
@@ -909,9 +888,8 @@ bool blk_update_request(struct request *req, blk_status_t error,
        if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
                __blk_crypto_rq_put_keyslot(req);
 
-       if (unlikely(error && !blk_rq_is_passthrough(req) &&
-                    !(req->rq_flags & RQF_QUIET)) &&
-                    !test_bit(GD_DEAD, &req->q->disk->state)) {
+       if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) &&
+           !test_bit(GD_DEAD, &req->q->disk->state)) {
                blk_print_req_error(req, error);
                trace_block_rq_error(req, error, nr_bytes);
        }
@@ -923,12 +901,33 @@ bool blk_update_request(struct request *req, blk_status_t error,
                struct bio *bio = req->bio;
                unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
 
-               if (bio_bytes == bio->bi_iter.bi_size)
+               if (unlikely(error))
+                       bio->bi_status = error;
+
+               if (bio_bytes == bio->bi_iter.bi_size) {
                        req->bio = bio->bi_next;
+               } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) {
+                       /*
+                        * Partial zone append completions cannot be supported
+                        * as the BIO fragments may end up not being written
+                        * sequentially.
+                        */
+                       bio->bi_status = BLK_STS_IOERR;
+               }
 
                /* Completion has already been traced */
                bio_clear_flag(bio, BIO_TRACE_COMPLETION);
-               req_bio_endio(req, bio, bio_bytes, error);
+               if (unlikely(quiet))
+                       bio_set_flag(bio, BIO_QUIET);
+
+               bio_advance(bio, bio_bytes);
+
+               /* Don't actually finish bio if it's part of flush sequence */
+               if (!bio->bi_iter.bi_size) {
+                       blk_zone_update_request_bio(req, bio);
+                       if (!is_flush)
+                               bio_endio(bio);
+               }
 
                total_bytes += bio_bytes;
                nr_bytes -= bio_bytes;
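
Note: the inlined logic preserves the zone append contract: a zone append
BIO either completes in full, with bi_iter.bi_sector rewritten to the
sector the data actually landed on, or fails with BLK_STS_IOERR. A
hypothetical issuer-side completion callback illustrating that contract
(zone_append_end_io and its pr_debug message are illustration only, not
part of this patch):

    static void zone_append_end_io(struct bio *bio)
    {
            if (!bio->bi_status) {
                    /* bi_sector now holds the start of the appended data */
                    pr_debug("zone append completed at sector %llu\n",
                             (unsigned long long)bio->bi_iter.bi_sector);
            }
            bio_put(bio);
    }
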
@@ -997,6 +996,8 @@ static inline void blk_account_io_done(struct request *req, u64 now)
                update_io_ticks(req->part, jiffies, true);
                part_stat_inc(req->part, ios[sgrp]);
                part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+               part_stat_local_dec(req->part,
+                                   in_flight[op_is_write(req_op(req))]);
                part_stat_unlock();
        }
 }
@@ -1019,6 +1020,8 @@ static inline void blk_account_io_start(struct request *req)
 
                part_stat_lock();
                update_io_ticks(req->part, jiffies, false);
+               part_stat_local_inc(req->part,
+                                   in_flight[op_is_write(req_op(req))]);
                part_stat_unlock();
        }
 }
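
Note: these two hunks move the in-flight request count into per-CPU
part_stat counters, indexed by the request direction (read/write). The
reader side sums the per-CPU values and clamps transient negative sums; a
sketch along the lines of part_in_flight() in block/genhd.c:

    static unsigned int part_in_flight(struct block_device *part)
    {
            unsigned int inflight = 0;
            int cpu;

            for_each_possible_cpu(cpu) {
                    inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
                                part_stat_local_read_cpu(part, in_flight[1], cpu);
            }
            /* Increments and decrements may be seen out of order; clamp. */
            if ((int)inflight < 0)
                    inflight = 0;

            return inflight;
    }
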
@@ -1330,11 +1333,6 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head)
 
        blk_account_io_start(rq);
 
-       /*
-        * As plugging can be enabled for passthrough requests on a zoned
-        * device, directly accessing the plug instead of using blk_mq_plug()
-        * should not have any consequences.
-        */
        if (current->plug && !at_head) {
                blk_add_rq_to_plug(current->plug, rq);
                return;
@@ -1921,19 +1919,6 @@ static void blk_mq_handle_dev_resource(struct request *rq,
        __blk_mq_requeue_request(rq);
 }
 
-static void blk_mq_handle_zone_resource(struct request *rq,
-                                       struct list_head *zone_list)
-{
-       /*
-        * If we end up here it is because we cannot dispatch a request to a
-        * specific zone due to LLD level zone-write locking or other zone
-        * related resource not being available. In this case, set the request
-        * aside in zone_list for retrying it later.
-        */
-       list_add(&rq->queuelist, zone_list);
-       __blk_mq_requeue_request(rq);
-}
-
 enum prep_dispatch {
        PREP_DISPATCH_OK,
        PREP_DISPATCH_NO_TAG,
@@ -2019,7 +2004,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
        struct request *rq;
        int queued;
        blk_status_t ret = BLK_STS_OK;
-       LIST_HEAD(zone_list);
        bool needs_resource = false;
 
        if (list_empty(list))
@@ -2061,23 +2045,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
                case BLK_STS_DEV_RESOURCE:
                        blk_mq_handle_dev_resource(rq, list);
                        goto out;
-               case BLK_STS_ZONE_RESOURCE:
-                       /*
-                        * Move the request to zone_list and keep going through
-                        * the dispatch list to find more requests the drive can
-                        * accept.
-                        */
-                       blk_mq_handle_zone_resource(rq, &zone_list);
-                       needs_resource = true;
-                       break;
                default:
                        blk_mq_end_request(rq, ret);
                }
        } while (!list_empty(list));
 out:
-       if (!list_empty(&zone_list))
-               list_splice_tail_init(&zone_list, list);
-
        /* If we didn't flush the entire list, we could have told the driver
         * there was more coming, but that turned out to be a lie.
         */
@@ -2163,6 +2135,15 @@ static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
        return cpu;
 }
 
+/*
+ * ->next_cpu is always calculated from hctx->cpumask, so simply use
+ * it to speed up the check.
+ */
+static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx)
+{
+       return hctx->next_cpu >= nr_cpu_ids;
+}
+
 /*
  * It'd be great if the workqueue API had a way to pass
  * in a mask and had some smarts for more clever placement.
@@ -2174,7 +2155,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
        bool tried = false;
        int next_cpu = hctx->next_cpu;
 
-       if (hctx->queue->nr_hw_queues == 1)
+       /* Switch to unbound if no allowable CPUs in this hctx */
+       if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx))
                return WORK_CPU_UNBOUND;
 
        if (--hctx->next_cpu_batch <= 0) {
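
Note: blk_mq_hctx_next_cpu() decides where kblockd runs the hctx work, so
returning WORK_CPU_UNBOUND lets the workqueue pick any housekeeping CPU
when CPU isolation has emptied the hctx cpumask (see the
blk_mq_map_swqueue() hunk below). The dispatch pattern used by
blk_mq_delay_run_hw_queue() in this file:

    kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
                                msecs_to_jiffies(msecs));
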
@@ -2948,22 +2930,37 @@ static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
 void blk_mq_submit_bio(struct bio *bio)
 {
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-       struct blk_plug *plug = blk_mq_plug(bio);
+       struct blk_plug *plug = current->plug;
        const int is_sync = op_is_sync(bio->bi_opf);
        struct blk_mq_hw_ctx *hctx;
        unsigned int nr_segs = 1;
        struct request *rq;
        blk_status_t ret;
 
+       /*
+        * If the plug has a cached request for this queue, try to use it.
+        */
+       rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
+
+       /*
+        * A BIO that was released from a zone write plug has already been
+        * through the preparation in this function, already holds a reference
+        * on the queue usage counter, and is the only write BIO in-flight for
+        * the target zone. Go straight to preparing a request for it.
+        */
+       if (bio_zone_write_plugging(bio)) {
+               nr_segs = bio->__bi_nr_segments;
+               if (rq)
+                       blk_queue_exit(q);
+               goto new_request;
+       }
+
        bio = blk_queue_bounce(bio, q);
 
        /*
-        * If the plug has a cached request for this queue, try use it.
-        *
         * The cached request already holds a q_usage_counter reference and we
         * don't have to acquire a new one if we use it.
         */
-       rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
        if (!rq) {
                if (unlikely(bio_queue_enter(bio)))
                        return;
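
Note: bio_zone_write_plugging() tests a per-BIO flag that is set while a
write BIO is tracked by a zone write plug; roughly, per the blkdev.h
helper added by this series:

    static inline bool bio_zone_write_plugging(struct bio *bio)
    {
            return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
    }
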
@@ -2980,6 +2977,10 @@ void blk_mq_submit_bio(struct bio *bio)
        if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
                goto queue_exit;
 
+       if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
+               goto queue_exit;
+
+new_request:
        if (!rq) {
                rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
                if (unlikely(!rq))
@@ -3002,6 +3003,9 @@ void blk_mq_submit_bio(struct bio *bio)
                return;
        }
 
+       if (bio_zone_write_plugging(bio))
+               blk_zone_write_plug_init_request(rq);
+
        if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
                return;
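
Note: blk_zone_plug_bio() returning true means the zone write plug took
ownership of the BIO, either failing it or queueing it behind the zone's
in-flight write, so submission must stop at queue_exit. A plugged BIO is
later resubmitted through blk_mq_submit_bio(), recognized via
bio_zone_write_plugging() at the top of the function, and
blk_zone_write_plug_init_request() then ties the new request to its zone
write plug (behavior as described by the zone write plugging series).
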
 
@@ -3483,14 +3487,30 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
        return data.has_rq;
 }
 
-static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
-               struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx,
+               unsigned int this_cpu)
 {
-       if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu)
-               return false;
-       if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
-               return false;
-       return true;
+       enum hctx_type type = hctx->type;
+       int cpu;
+
+       /*
+        * hctx->cpumask has to rule out isolated CPUs, but userspace may
+        * still submit I/Os from them, so use the queue map to check
+        * whether all CPUs mapped to this hctx are offline.
+        */
+       for_each_online_cpu(cpu) {
+               struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue,
+                               type, cpu);
+
+               if (h != hctx)
+                       continue;
+
+               /* this hctx has at least one online CPU */
+               if (this_cpu != cpu)
+                       return true;
+       }
+
+       return false;
 }
 
 static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
@@ -3498,8 +3518,7 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
        struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
                        struct blk_mq_hw_ctx, cpuhp_online);
 
-       if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
-           !blk_mq_last_cpu_in_hctx(cpu, hctx))
+       if (blk_mq_hctx_has_online_cpu(hctx, cpu))
                return 0;
 
        /*
@@ -3907,6 +3926,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
        }
 
        queue_for_each_hw_ctx(q, hctx, i) {
+               int cpu;
+
                /*
                 * If no software queues are mapped to this hardware queue,
                 * disable it and free the request entries.
@@ -3933,6 +3954,15 @@ static void blk_mq_map_swqueue(struct request_queue *q)
                 */
                sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
 
+               /*
+                * Rule out isolated CPUs from hctx->cpumask to avoid
+                * running the block kworker on isolated CPUs.
+                */
+               for_each_cpu(cpu, hctx->cpumask) {
+                       if (cpu_is_isolated(cpu))
+                               cpumask_clear_cpu(cpu, hctx->cpumask);
+               }
+
                /*
                 * Initialize batch roundrobin counts
                 */
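
Note: cpu_is_isolated() comes from the <linux/sched/isolation.h> include
added at the top of this diff. As of this kernel it covers the boot-time
isolcpus/nohz_full housekeeping masks as well as cpuset isolated
partitions; roughly:

    static inline bool cpu_is_isolated(int cpu)
    {
            return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
                   !housekeeping_test_cpu(cpu, HK_TYPE_MANAGED_IRQ) ||
                   cpuset_cpu_is_isolated(cpu);
    }

For example, booting with isolcpus=managed_irq,domain,2-5 clears CPUs 2-5
from every hctx cpumask, so block kworkers never run there, while I/O
submitted from those CPUs is still served through the queue map.
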