block: accumulate memory segment gaps per bio
author     Keith Busch <kbusch@kernel.org>
           Tue, 14 Oct 2025 15:04:55 +0000 (08:04 -0700)
committer  Jens Axboe <axboe@kernel.dk>
           Fri, 7 Nov 2025 01:11:58 +0000 (18:11 -0700)
The blk-mq dma iterator has an optimization for requests that align to
the device's iommu merge boundary. This boundary may be larger than the
device's virtual boundary, but the code had been depending on the
virtual boundary queue limit to know ahead of time whether a request is
guaranteed to align for that optimization.

Rather than rely on that queue limit, which many devices may not report,
save the lowest set bit of any boundary gap between adjacent segments
while walking the bio's segments. The request stores the value so it can
be carried through merges and checked cheaply per I/O to decide whether
the request can use iova optimizations.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
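
To illustrate the idea outside the patch (a minimal userspace sketch; the helper
names and values below are made up, not kernel APIs): OR together, for every pair
of adjacent segments, the next segment's start offset and the previous segment's
end offset, keep only ffs() of the accumulated value, and compare that lowest gap
bit against the device's merge boundary mask.

    #include <stdio.h>
    #include <strings.h>    /* ffs() */

    struct seg { unsigned int start, len; };    /* illustrative segment */

    /* Accumulate the OR of all inter-segment gap offsets; return ffs() of it. */
    static int seg_gap_bit(const struct seg *segs, int nr)
    {
        unsigned int gaps = 0;

        for (int i = 1; i < nr; i++)
            gaps |= segs[i].start | (segs[i - 1].start + segs[i - 1].len);
        return ffs(gaps);   /* 1-based index of the lowest set bit, 0 if none */
    }

    /* Can every junction be coalesced within a granule mask (e.g. 0xfff for 4K)? */
    static int can_coalesce(int gap_bit, unsigned long boundary_mask)
    {
        unsigned long gap_mask = gap_bit ? ~((1UL << (gap_bit - 1)) - 1) : 0;

        return !(gap_mask & boundary_mask);
    }

    int main(void)
    {
        struct seg aligned[] = { { 0x10000, 0x1000 }, { 0x13000, 0x2000 } };
        struct seg ragged[]  = { { 0x10000, 0x1000 }, { 0x11200, 0x0e00 } };

        printf("%d\n", can_coalesce(seg_gap_bit(aligned, 2), 0xfff)); /* 1 */
        printf("%d\n", can_coalesce(seg_gap_bit(ragged, 2), 0xfff));  /* 0 */
        return 0;
    }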
block/bio.c
block/blk-map.c
block/blk-merge.c
block/blk-mq-dma.c
block/blk-mq.c
include/linux/bio.h
include/linux/blk-mq.h
include/linux/blk_types.h

index b3a79285c278d50720669678a673e9921ca85a1f..7b13bdf72de09a625c4862d98ce79468af82d4f9 100644 (file)
--- a/block/bio.c
+++ b/block/bio.c
@@ -253,6 +253,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
        bio->bi_write_hint = 0;
        bio->bi_write_stream = 0;
        bio->bi_status = 0;
+       bio->bi_bvec_gap_bit = 0;
        bio->bi_iter.bi_sector = 0;
        bio->bi_iter.bi_size = 0;
        bio->bi_iter.bi_idx = 0;
index 60faf036fb6e4ad121817c320270d95226aed54f..17a1dc2886786070e285e5bb68a7e00e852b8ecf 100644 (file)
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -459,6 +459,8 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio)
        if (rq->bio) {
                if (!ll_back_merge_fn(rq, bio, nr_segs))
                        return -EINVAL;
+               rq->phys_gap_bit = bio_seg_gap(rq->q, rq->biotail, bio,
+                                              rq->phys_gap_bit);
                rq->biotail->bi_next = bio;
                rq->biotail = bio;
                rq->__data_len += bio->bi_iter.bi_size;
@@ -469,6 +471,7 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio)
        rq->nr_phys_segments = nr_segs;
        rq->bio = rq->biotail = bio;
        rq->__data_len = bio->bi_iter.bi_size;
+       rq->phys_gap_bit = bio->bi_bvec_gap_bit;
        return 0;
 }
 EXPORT_SYMBOL(blk_rq_append_bio);
index c47d18587a0b6eddaab098a8cca30077670eb3a3..3ca6fbf8b7870ff96f1b00a352280e00e37a0f71 100644 (file)
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -302,6 +302,12 @@ static unsigned int bio_split_alignment(struct bio *bio,
        return lim->logical_block_size;
 }
 
+static inline unsigned int bvec_seg_gap(struct bio_vec *bvprv,
+                                       struct bio_vec *bv)
+{
+       return bv->bv_offset | (bvprv->bv_offset + bvprv->bv_len);
+}
+
 /**
  * bio_split_io_at - check if and where to split a bio
  * @bio:  [in] bio to be split
@@ -319,8 +325,8 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
                unsigned *segs, unsigned max_bytes, unsigned len_align_mask)
 {
        struct bio_vec bv, bvprv, *bvprvp = NULL;
+       unsigned nsegs = 0, bytes = 0, gaps = 0;
        struct bvec_iter iter;
-       unsigned nsegs = 0, bytes = 0;
 
        bio_for_each_bvec(bv, bio, iter) {
                if (bv.bv_offset & lim->dma_alignment ||
@@ -331,8 +337,11 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
                 * If the queue doesn't support SG gaps and adding this
                 * offset would create a gap, disallow it.
                 */
-               if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
-                       goto split;
+               if (bvprvp) {
+                       if (bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
+                               goto split;
+                       gaps |= bvec_seg_gap(bvprvp, &bv);
+               }
 
                if (nsegs < lim->max_segments &&
                    bytes + bv.bv_len <= max_bytes &&
@@ -350,6 +359,7 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
        }
 
        *segs = nsegs;
+       bio->bi_bvec_gap_bit = ffs(gaps);
        return 0;
 split:
        if (bio->bi_opf & REQ_ATOMIC)
@@ -385,6 +395,7 @@ split:
          * big IO can be trivial, disable iopoll when split needed.
         */
        bio_clear_polled(bio);
+       bio->bi_bvec_gap_bit = ffs(gaps);
        return bytes >> SECTOR_SHIFT;
 }
 EXPORT_SYMBOL_GPL(bio_split_io_at);
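
bvec_seg_gap() above works because only the low-order bits of a junction matter:
OR-ing the next segment's start offset with the previous segment's end offset
preserves the lowest set bit across the junction, and that bit alone decides
whether the junction fits inside a larger merge boundary. A worked example with
made-up bvec values:

    /*
     * prev bvec: bv_offset = 0x0,  bv_len = 0x1000  -> ends at 0x1000 (4K aligned)
     * next bvec: bv_offset = 0x200                  -> starts 0x200 into its page
     *
     *   bvec_seg_gap() = 0x200 | (0x0 + 0x1000) = 0x1200
     *   gaps |= 0x1200;  ffs(gaps) = 10   (bit 9, the 0x200 misalignment)
     *
     * Had the next bvec started at bv_offset 0, the contribution would be 0x1000
     * and ffs(gaps) would be 13, i.e. every junction is at least 4K aligned.
     */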
@@ -721,6 +732,21 @@ static bool blk_atomic_write_mergeable_rqs(struct request *rq,
        return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC);
 }
 
+u8 bio_seg_gap(struct request_queue *q, struct bio *prev, struct bio *next,
+              u8 gaps_bit)
+{
+       struct bio_vec pb, nb;
+
+       gaps_bit = min_not_zero(gaps_bit, prev->bi_bvec_gap_bit);
+       gaps_bit = min_not_zero(gaps_bit, next->bi_bvec_gap_bit);
+
+       bio_get_last_bvec(prev, &pb);
+       bio_get_first_bvec(next, &nb);
+       if (!biovec_phys_mergeable(q, &pb, &nb))
+               gaps_bit = min_not_zero(gaps_bit, ffs(bvec_seg_gap(&pb, &nb)));
+       return gaps_bit;
+}
+
 /*
  * For non-mq, this has to be called with the request spinlock acquired.
  * For mq with scheduling, the appropriate queue wide lock should be held.
@@ -785,6 +811,9 @@ static struct request *attempt_merge(struct request_queue *q,
        if (next->start_time_ns < req->start_time_ns)
                req->start_time_ns = next->start_time_ns;
 
+       req->phys_gap_bit = bio_seg_gap(req->q, req->biotail, next->bio,
+                                       min_not_zero(next->phys_gap_bit,
+                                                    req->phys_gap_bit));
        req->biotail->bi_next = next->bio;
        req->biotail = next->biotail;
 
@@ -908,6 +937,8 @@ enum bio_merge_status bio_attempt_back_merge(struct request *req,
        if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
                blk_zone_write_plug_bio_merged(bio);
 
+       req->phys_gap_bit = bio_seg_gap(req->q, req->biotail, bio,
+                                       req->phys_gap_bit);
        req->biotail->bi_next = bio;
        req->biotail = bio;
        req->__data_len += bio->bi_iter.bi_size;
@@ -942,6 +973,8 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req,
 
        blk_update_mixed_merge(req, bio, true);
 
+       req->phys_gap_bit = bio_seg_gap(req->q, bio, req->bio,
+                                       req->phys_gap_bit);
        bio->bi_next = req->bio;
        req->bio = bio;
 
index 449950029872a0c583accfb4a62fe26c5cda9e29..94d3461b5bc8ea35ecb55191e4852f430b295057 100644 (file)
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -79,8 +79,7 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter,
 static inline bool blk_can_dma_map_iova(struct request *req,
                struct device *dma_dev)
 {
-       return !((queue_virt_boundary(req->q) + 1) &
-               dma_get_merge_boundary(dma_dev));
+       return !(req_phys_gap_mask(req) & dma_get_merge_boundary(dma_dev));
 }
 
 static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
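
With the per-request gap bit in place, the IOVA fast-path check above reduces to
a single AND. dma_get_merge_boundary() reports the device's DMA merge boundary as
a mask (for example 0xfff for a 4K IOMMU granule, or 0 when merging is not
supported). Worked numbers, illustrative only:

    /*
     * boundary = dma_get_merge_boundary(dma_dev) = 0xfff  (4K granule)
     *
     *   phys_gap_bit = 13 (all gaps 4K aligned):
     *     req_phys_gap_mask() = ~0xfff,  ~0xfff & 0xfff = 0     -> IOVA mapping OK
     *   phys_gap_bit = 10 (a gap only 0x200 aligned):
     *     req_phys_gap_mask() = ~0x1ff,  ~0x1ff & 0xfff = 0xe00 -> fall back
     *   phys_gap_bit = 0  (no gaps recorded):
     *     req_phys_gap_mask() = 0,       0 & 0xfff = 0          -> IOVA mapping OK
     */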
index d626d32f6e576f95bc68495c467a9d9c7b73a581..b2fdeaac0efb5beaeaa84ec12f976a4c608b8479 100644 (file)
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -376,6 +376,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
        INIT_LIST_HEAD(&rq->queuelist);
        rq->q = q;
        rq->__sector = (sector_t) -1;
+       rq->phys_gap_bit = 0;
        INIT_HLIST_NODE(&rq->hash);
        RB_CLEAR_NODE(&rq->rb_node);
        rq->tag = BLK_MQ_NO_TAG;
@@ -668,6 +669,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
                        goto out_queue_exit;
        }
        rq->__data_len = 0;
+       rq->phys_gap_bit = 0;
        rq->__sector = (sector_t) -1;
        rq->bio = rq->biotail = NULL;
        return rq;
@@ -748,6 +750,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
        rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
        blk_mq_rq_time_init(rq, alloc_time_ns);
        rq->__data_len = 0;
+       rq->phys_gap_bit = 0;
        rq->__sector = (sector_t) -1;
        rq->bio = rq->biotail = NULL;
        return rq;
@@ -2674,6 +2677,8 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
        rq->bio = rq->biotail = bio;
        rq->__sector = bio->bi_iter.bi_sector;
        rq->__data_len = bio->bi_iter.bi_size;
+       rq->phys_gap_bit = bio->bi_bvec_gap_bit;
+
        rq->nr_phys_segments = nr_segs;
        if (bio_integrity(bio))
                rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q,
@@ -3380,6 +3385,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
        }
        rq->nr_phys_segments = rq_src->nr_phys_segments;
        rq->nr_integrity_segments = rq_src->nr_integrity_segments;
+       rq->phys_gap_bit = rq_src->phys_gap_bit;
 
        if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
                goto free_and_out;
index 16c1c85613b7673323a706a879b55f4e20d71398..ad2d57908c1c0e2b7c533080075519bf513bb7aa 100644 (file)
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -324,6 +324,8 @@ extern struct bio *bio_split(struct bio *bio, int sectors,
                             gfp_t gfp, struct bio_set *bs);
 int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
                unsigned *segs, unsigned max_bytes, unsigned len_align);
+u8 bio_seg_gap(struct request_queue *q, struct bio *prev, struct bio *next,
+               u8 gaps_bit);
 
 /**
  * bio_next_split - get next @sectors from a bio, splitting if necessary
index b25d12545f46da3b962de59639349c3eebf7b5cc..b54506b3b76d9076b2a3fe7e8320dfe521902f66 100644 (file)
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -152,6 +152,14 @@ struct request {
        unsigned short nr_phys_segments;
        unsigned short nr_integrity_segments;
 
+       /*
+        * The lowest set bit of any address gap between physical segments.
+        * This provides information for DMA optimization opportunities, such
+        * as testing whether the segments can be coalesced against the
+        * device's iommu granule.
+        */
+       unsigned char phys_gap_bit;
+
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
        struct bio_crypt_ctx *crypt_ctx;
        struct blk_crypto_keyslot *crypt_keyslot;
@@ -208,6 +216,14 @@ struct request {
        void *end_io_data;
 };
 
+/*
+ * Returns a mask with all bits at and above the lowest recorded segment gap
+ * bit set to 1 (phys_gap_bit is a 1-based ffs() value), or 0 when no gap was
+ * recorded.
+ */
+static inline unsigned long req_phys_gap_mask(const struct request *req)
+{
+       return ~(((1 << req->phys_gap_bit) >> 1) - 1);
+}
+
 static inline enum req_op req_op(const struct request *req)
 {
        return req->cmd_flags & REQ_OP_MASK;
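
Because phys_gap_bit holds a 1-based ffs() value, with 0 meaning no gap was
recorded, the mask construction shifts right once before subtracting. A
standalone sketch of the same arithmetic (on a plain value instead of a request):

    #include <assert.h>

    /* Mirrors req_phys_gap_mask()'s expression for illustration. */
    static unsigned long gap_mask(unsigned char gap_bit)
    {
        return ~(((1UL << gap_bit) >> 1) - 1);
    }

    int main(void)
    {
        assert(gap_mask(0)  == 0);        /* no gaps: never collides         */
        assert(gap_mask(1)  == ~0UL);     /* gap at bit 0: always collides   */
        assert(gap_mask(13) == ~0xfffUL); /* lowest gap bit 12: 4K aligned   */
        return 0;
    }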
index 8e8d1cc8b06c4c09c82ec3bff19090250ea773a6..53501ebb0623ede47e191cbd0e8358b91a2880f8 100644 (file)
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -218,6 +218,18 @@ struct bio {
        enum rw_hint            bi_write_hint;
        u8                      bi_write_stream;
        blk_status_t            bi_status;
+
+       /*
+        * The bvec gap bit indicates the lowest set bit in any address offset
+        * The bvec gap bit records the lowest set bit in any address offset
+        * between the bio's bi_io_vecs. The field is initialized only after
+        * the bio has been split to the hardware limits (see
+        * bio_split_io_at()), and may be consulted for DMA optimizations when
+        * the request is mapped. The value is compared against a power-of-two
+        * mask, where the result depends on any bit set within the mask, so
+        * saving only the lowest bit is sufficient to know whether any segment
+        * gap collides with the mask.
+       u8                      bi_bvec_gap_bit;
+
        atomic_t                __bi_remaining;
 
        struct bvec_iter        bi_iter;