From 853e892076ba5666c81afbc86552e008280f9768 Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@kernel.org>
Date: Tue, 27 Jan 2026 19:53:57 -0500
Subject: [PATCH] RDMA/core: use IOVA-based DMA mapping for bvec RDMA
 operations

The bvec RDMA API maps each bvec individually via dma_map_phys(),
requiring an IOTLB sync for each mapping. For large I/O operations
with many bvecs, this overhead becomes significant.

The two-step IOVA API (dma_iova_try_alloc / dma_iova_link /
dma_iova_sync) allocates a contiguous IOVA range upfront, links all
physical pages without IOTLB syncs, then performs a single sync at
the end. This reduces IOTLB flushes from O(n) to O(1). It also needs
only a single output dma_addr_t, rather than the per-element DMA
address and length storage of a struct scatterlist.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chuck Lever <cel@kernel.org>
Link: https://patch.msgid.link/20260128005400.25147-3-cel@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/core/rw.c | 106 +++++++++++++++++++++++++++++++++++
 include/rdma/rw.h            |   8 +++
 2 files changed, 114 insertions(+)

diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 39ca21d18d7b1..c2fc8cba972ef 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -14,6 +14,7 @@ enum {
 	RDMA_RW_MULTI_WR,
 	RDMA_RW_MR,
 	RDMA_RW_SIG_MR,
+	RDMA_RW_IOVA,
 };
 
 static bool rdma_rw_force_mr;
@@ -383,6 +384,87 @@ out_unmap:
 	return -ENOMEM;
 }
 
+/*
+ * Try to use the two-step IOVA API to map bvecs into a contiguous DMA range.
+ * This reduces IOTLB sync overhead by doing one sync at the end instead of
+ * one per bvec, and produces a contiguous DMA address range that can be
+ * described by a single SGE.
+ *
+ * Returns the number of WQEs (always 1) on success, -EOPNOTSUPP if IOVA
+ * mapping is not available, or another negative error code on failure.
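+ *
+ * On failure, any partially-linked range has already been unlinked and
+ * the IOVA allocation freed, so the caller has nothing to unwind.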
+ */
+static int rdma_rw_init_iova_wrs_bvec(struct rdma_rw_ctx *ctx,
+		struct ib_qp *qp, const struct bio_vec *bvec,
+		struct bvec_iter *iter, u64 remote_addr, u32 rkey,
+		enum dma_data_direction dir)
+{
+	struct ib_device *dev = qp->pd->device;
+	struct device *dma_dev = dev->dma_device;
+	size_t total_len = iter->bi_size;
+	struct bio_vec first_bv;
+	size_t mapped_len = 0;
+	int ret;
+
+	/* Virtual DMA devices cannot support IOVA allocators */
+	if (ib_uses_virt_dma(dev))
+		return -EOPNOTSUPP;
+
+	/* Try to allocate contiguous IOVA space */
+	first_bv = mp_bvec_iter_bvec(bvec, *iter);
+	if (!dma_iova_try_alloc(dma_dev, &ctx->iova.state,
+				bvec_phys(&first_bv), total_len))
+		return -EOPNOTSUPP;
+
+	/* Link all bvecs into the IOVA space */
+	while (iter->bi_size) {
+		struct bio_vec bv = mp_bvec_iter_bvec(bvec, *iter);
+
+		ret = dma_iova_link(dma_dev, &ctx->iova.state, bvec_phys(&bv),
+				    mapped_len, bv.bv_len, dir, 0);
+		if (ret)
+			goto out_destroy;
+
+		mapped_len += bv.bv_len;
+		bvec_iter_advance(bvec, iter, bv.bv_len);
+	}
+
+	/* Sync the IOTLB once for all linked pages */
+	ret = dma_iova_sync(dma_dev, &ctx->iova.state, 0, mapped_len);
+	if (ret)
+		goto out_destroy;
+
+	ctx->iova.mapped_len = mapped_len;
+
+	/* Single SGE covers the entire contiguous IOVA range */
+	ctx->iova.sge.addr = ctx->iova.state.addr;
+	ctx->iova.sge.length = mapped_len;
+	ctx->iova.sge.lkey = qp->pd->local_dma_lkey;
+
+	/* Single WR for the whole transfer */
+	memset(&ctx->iova.wr, 0, sizeof(ctx->iova.wr));
+	if (dir == DMA_TO_DEVICE)
+		ctx->iova.wr.wr.opcode = IB_WR_RDMA_WRITE;
+	else
+		ctx->iova.wr.wr.opcode = IB_WR_RDMA_READ;
+	ctx->iova.wr.wr.num_sge = 1;
+	ctx->iova.wr.wr.sg_list = &ctx->iova.sge;
+	ctx->iova.wr.remote_addr = remote_addr;
+	ctx->iova.wr.rkey = rkey;
+
+	ctx->type = RDMA_RW_IOVA;
+	ctx->nr_ops = 1;
+	return 1;
+
+out_destroy:
+	/*
+	 * dma_iova_destroy() expects the actual mapped length, not the
+	 * total allocation size. It unlinks only the successfully linked
+	 * range and frees the entire IOVA allocation.
+	 */
+	dma_iova_destroy(dma_dev, &ctx->iova.state, mapped_len, dir, 0);
+	return ret;
+}
+
 /**
  * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
  * @ctx:	context to initialize
@@ -485,6 +567,8 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 		struct bvec_iter iter, u64 remote_addr, u32 rkey,
 		enum dma_data_direction dir)
 {
+	int ret;
+
 	if (nr_bvec == 0 || iter.bi_size == 0)
 		return -EINVAL;
 
@@ -495,6 +579,16 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 	if (nr_bvec == 1)
 		return rdma_rw_init_single_wr_bvec(ctx, qp, bvecs, &iter,
 						   remote_addr, rkey, dir);
+
+	/*
+	 * Try IOVA-based mapping first for multi-bvec transfers.
+	 * This reduces IOTLB sync overhead by batching all mappings.
+	 */
+	ret = rdma_rw_init_iova_wrs_bvec(ctx, qp, bvecs, &iter, remote_addr,
+					 rkey, dir);
+	if (ret != -EOPNOTSUPP)
+		return ret;
+
 	return rdma_rw_init_map_wrs_bvec(ctx, qp, bvecs, nr_bvec, &iter,
 					 remote_addr, rkey, dir);
 }
@@ -671,6 +765,10 @@ struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 		first_wr = &ctx->reg[0].reg_wr.wr;
 		last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr;
 		break;
+	case RDMA_RW_IOVA:
+		first_wr = &ctx->iova.wr.wr;
+		last_wr = &ctx->iova.wr.wr;
+		break;
 	case RDMA_RW_MULTI_WR:
 		first_wr = &ctx->map.wrs[0].wr;
 		last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr;
@@ -745,6 +843,10 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 		break;
 	case RDMA_RW_SINGLE_WR:
 		break;
+	case RDMA_RW_IOVA:
+		/* IOVA contexts must use rdma_rw_ctx_destroy_bvec() */
+		WARN_ON_ONCE(1);
+		return;
 	default:
 		BUG();
 		break;
@@ -778,6 +880,10 @@ void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 	u32 i;
 
 	switch (ctx->type) {
+	case RDMA_RW_IOVA:
+		dma_iova_destroy(dev->dma_device, &ctx->iova.state,
+				 ctx->iova.mapped_len, dir, 0);
+		break;
 	case RDMA_RW_MULTI_WR:
 		for (i = 0; i < nr_bvec; i++)
 			ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr,
diff --git a/include/rdma/rw.h b/include/rdma/rw.h
index b2fc3e2373d70..205e16ed6cd84 100644
--- a/include/rdma/rw.h
+++ b/include/rdma/rw.h
@@ -32,6 +32,14 @@ struct rdma_rw_ctx {
 			struct ib_rdma_wr *wrs;
 		} map;
 
+		/* for IOVA-based mapping of bvecs into contiguous DMA range: */
+		struct {
+			struct dma_iova_state state;
+			struct ib_sge sge;
+			struct ib_rdma_wr wr;
+			size_t mapped_len;
+		} iova;
+
 		/* for registering multiple WRs: */
 		struct rdma_rw_reg_ctx {
 			struct ib_sge sge;
-- 
2.47.3
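
For reviewers, a rough sketch of the caller side may help. This is
illustrative only: ulp_rdma_read_bvec() is a made-up ULP function, and
the parameter lists of rdma_rw_ctx_init_bvec() and
rdma_rw_ctx_destroy_bvec() (the port_num/bvecs ordering in particular)
are assumptions inferred from the hunk context above; only
rdma_rw_ctx_wrs() and ib_post_send() follow their existing in-tree
signatures.

/* Hypothetical ULP caller; DMA_FROM_DEVICE == RDMA READ into bvecs. */
static int ulp_rdma_read_bvec(struct ib_qp *qp, u32 port_num,
			      struct bio_vec *bvecs, u32 nr_bvec,
			      struct bvec_iter iter, u64 remote_addr,
			      u32 rkey, struct ib_cqe *cqe)
{
	struct rdma_rw_ctx ctx;
	struct ib_send_wr *first_wr;
	int ret;

	/*
	 * Map the bvecs. With this patch, a multi-bvec transfer first
	 * tries the IOVA path (one WR, one SGE, one IOTLB sync) and
	 * falls back to per-bvec mapping on -EOPNOTSUPP.
	 */
	ret = rdma_rw_ctx_init_bvec(&ctx, qp, port_num, bvecs, nr_bvec,
				    iter, remote_addr, rkey,
				    DMA_FROM_DEVICE);
	if (ret < 0)
		return ret;

	/* Chain up the WRs the context prepared, then post them. */
	first_wr = rdma_rw_ctx_wrs(&ctx, qp, port_num, cqe, NULL);
	ret = ib_post_send(qp, first_wr, NULL);
	if (ret) {
		/*
		 * Must be the _bvec destructor: plain
		 * rdma_rw_ctx_destroy() WARNs on RDMA_RW_IOVA contexts.
		 */
		rdma_rw_ctx_destroy_bvec(&ctx, qp, port_num, nr_bvec,
					 DMA_FROM_DEVICE);
		return ret;
	}

	/* The completion handler later tears the context down the same way. */
	return 0;
}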