]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
xprtrdma: Replace rpcrdma_mr_seg with xdr_buf cursor
authorChuck Lever <chuck.lever@oracle.com>
Fri, 6 Mar 2026 21:56:26 +0000 (16:56 -0500)
committerTrond Myklebust <trond.myklebust@hammerspace.com>
Mon, 13 Apr 2026 19:04:58 +0000 (12:04 -0700)
The FRWR registration path converts data through three
representations: xdr_buf -> rpcrdma_mr_seg[] -> scatterlist[]
-> ib_map_mr_sg(). The rpcrdma_mr_seg intermediate is a relic
of when multiple registration strategies existed (FMR, physical,
FRWR). Only FRWR remains, so this indirection and the 6240-byte
rl_segments[260] array embedded in each rpcrdma_req serve no
purpose.

Introduce struct rpcrdma_xdr_cursor to track position within
an xdr_buf during iterative MR registration. Rewrite frwr_map to
populate scatterlist entries directly from the xdr_buf regions
(head kvec, page list, tail kvec). The boundary logic for
non-SG_GAPS devices is simpler because the xdr_buf structure
guarantees that page-region entries after the first start at
offset 0, and that head/tail kvecs are separate regions that
naturally break at MR boundaries.

Fix a pre-existing bug in rpcrdma_encode_write_list where the
write-pad statistics accumulator added mr->mr_length from the last
data MR rather than the write-pad MR. The refactored code uses
ep->re_write_pad_mr->mr_length.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
include/trace/events/rpcrdma.h
net/sunrpc/xprtrdma/frwr_ops.c
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/xprt_rdma.h

index e6a72646c50798c6ed6eaf7165e85bfc556b88ab..b79913048e1a0f622e5e30f72e2d1effc0b6e111 100644 (file)
@@ -392,10 +392,10 @@ DECLARE_EVENT_CLASS(xprtrdma_rdch_event,
                const struct rpc_task *task,
                unsigned int pos,
                struct rpcrdma_mr *mr,
-               int nsegs
+               bool is_last
        ),
 
-       TP_ARGS(task, pos, mr, nsegs),
+       TP_ARGS(task, pos, mr, is_last),
 
        TP_STRUCT__entry(
                __field(unsigned int, task_id)
@@ -405,7 +405,7 @@ DECLARE_EVENT_CLASS(xprtrdma_rdch_event,
                __field(u32, handle)
                __field(u32, length)
                __field(u64, offset)
-               __field(int, nsegs)
+               __field(bool, is_last)
        ),
 
        TP_fast_assign(
@@ -416,7 +416,7 @@ DECLARE_EVENT_CLASS(xprtrdma_rdch_event,
                __entry->handle = mr->mr_handle;
                __entry->length = mr->mr_length;
                __entry->offset = mr->mr_offset;
-               __entry->nsegs = nsegs;
+               __entry->is_last = is_last;
        ),
 
        TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
@@ -424,7 +424,7 @@ DECLARE_EVENT_CLASS(xprtrdma_rdch_event,
                __entry->task_id, __entry->client_id,
                __entry->pos, __entry->length,
                (unsigned long long)__entry->offset, __entry->handle,
-               __entry->nents < __entry->nsegs ? "more" : "last"
+               __entry->is_last ? "last" : "more"
        )
 );
 
@@ -434,18 +434,18 @@ DECLARE_EVENT_CLASS(xprtrdma_rdch_event,
                                        const struct rpc_task *task,    \
                                        unsigned int pos,               \
                                        struct rpcrdma_mr *mr,          \
-                                       int nsegs                       \
+                                       bool is_last                    \
                                ),                                      \
-                               TP_ARGS(task, pos, mr, nsegs))
+                               TP_ARGS(task, pos, mr, is_last))
 
 DECLARE_EVENT_CLASS(xprtrdma_wrch_event,
        TP_PROTO(
                const struct rpc_task *task,
                struct rpcrdma_mr *mr,
-               int nsegs
+               bool is_last
        ),
 
-       TP_ARGS(task, mr, nsegs),
+       TP_ARGS(task, mr, is_last),
 
        TP_STRUCT__entry(
                __field(unsigned int, task_id)
@@ -454,7 +454,7 @@ DECLARE_EVENT_CLASS(xprtrdma_wrch_event,
                __field(u32, handle)
                __field(u32, length)
                __field(u64, offset)
-               __field(int, nsegs)
+               __field(bool, is_last)
        ),
 
        TP_fast_assign(
@@ -464,7 +464,7 @@ DECLARE_EVENT_CLASS(xprtrdma_wrch_event,
                __entry->handle = mr->mr_handle;
                __entry->length = mr->mr_length;
                __entry->offset = mr->mr_offset;
-               __entry->nsegs = nsegs;
+               __entry->is_last = is_last;
        ),
 
        TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
@@ -472,7 +472,7 @@ DECLARE_EVENT_CLASS(xprtrdma_wrch_event,
                __entry->task_id, __entry->client_id,
                __entry->length, (unsigned long long)__entry->offset,
                __entry->handle,
-               __entry->nents < __entry->nsegs ? "more" : "last"
+               __entry->is_last ? "last" : "more"
        )
 );
 
@@ -481,9 +481,9 @@ DECLARE_EVENT_CLASS(xprtrdma_wrch_event,
                                TP_PROTO(                               \
                                        const struct rpc_task *task,    \
                                        struct rpcrdma_mr *mr,          \
-                                       int nsegs                       \
+                                       bool is_last                    \
                                ),                                      \
-                               TP_ARGS(task, mr, nsegs))
+                               TP_ARGS(task, mr, is_last))
 
 TRACE_DEFINE_ENUM(DMA_BIDIRECTIONAL);
 TRACE_DEFINE_ENUM(DMA_TO_DEVICE);
index 4331b0b65f4c539cb6fa3f2a0257b74148f7818c..229057d35fb8063ee49c473db863195e09535427 100644 (file)
@@ -268,10 +268,9 @@ int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
 }
 
 /**
- * frwr_map - Register a memory region
+ * frwr_map - Register a memory region from an xdr_buf cursor
  * @r_xprt: controlling transport
- * @seg: memory region co-ordinates
- * @nsegs: number of segments remaining
+ * @cur: cursor tracking position within the xdr_buf
  * @writing: true when RDMA Write will be used
  * @xid: XID of RPC using the registered memory
  * @mr: MR to fill in
@@ -279,34 +278,104 @@ int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
  * Prepare a REG_MR Work Request to register a memory region
  * for remote access via RDMA READ or RDMA WRITE.
  *
- * Returns the next segment or a negative errno pointer.
- * On success, @mr is filled in.
+ * Returns 0 on success (cursor advanced past consumed data,
+ * @mr populated) or a negative errno on failure.
  */
-struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
-                               struct rpcrdma_mr_seg *seg,
-                               int nsegs, bool writing, __be32 xid,
-                               struct rpcrdma_mr *mr)
+int frwr_map(struct rpcrdma_xprt *r_xprt,
+            struct rpcrdma_xdr_cursor *cur,
+            bool writing, __be32 xid,
+            struct rpcrdma_mr *mr)
 {
        struct rpcrdma_ep *ep = r_xprt->rx_ep;
+       const struct xdr_buf *xdrbuf = cur->xc_buf;
+       bool sg_gaps = ep->re_mrtype == IB_MR_TYPE_SG_GAPS;
+       unsigned int max_depth = ep->re_max_fr_depth;
        struct ib_reg_wr *reg_wr;
        int i, n, dma_nents;
        struct ib_mr *ibmr;
        u8 key;
 
-       if (nsegs > ep->re_max_fr_depth)
-               nsegs = ep->re_max_fr_depth;
-       for (i = 0; i < nsegs;) {
-               sg_set_page(&mr->mr_sg[i], seg->mr_page,
-                           seg->mr_len, seg->mr_offset);
-
-               ++seg;
-               ++i;
-               if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS)
-                       continue;
-               if ((i < nsegs && seg->mr_offset) ||
-                   offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
-                       break;
+       i = 0;
+
+       /* Head kvec */
+       if (!(cur->xc_flags & XC_HEAD_DONE)) {
+               const struct kvec *head = &xdrbuf->head[0];
+
+               sg_set_page(&mr->mr_sg[i],
+                           virt_to_page(head->iov_base),
+                           head->iov_len,
+                           offset_in_page(head->iov_base));
+               cur->xc_flags |= XC_HEAD_DONE;
+               i++;
+               /* Without sg-gap support, each non-contiguous region
+                * must be registered as a separate MR.  Returning
+                * here after the head kvec causes the caller to
+                * invoke frwr_map() again for the page list and
+                * tail.
+                */
+               if (!sg_gaps)
+                       goto finish;
        }
+
+       /* Page list */
+       if (!(cur->xc_flags & XC_PAGES_DONE) && xdrbuf->page_len) {
+               unsigned int page_base, remaining;
+               struct page **ppages;
+
+               remaining = xdrbuf->page_len - cur->xc_page_offset;
+               page_base = offset_in_page(xdrbuf->page_base +
+                                          cur->xc_page_offset);
+               ppages = xdrbuf->pages +
+                        ((xdrbuf->page_base + cur->xc_page_offset)
+                         >> PAGE_SHIFT);
+
+               while (remaining > 0 && i < max_depth) {
+                       unsigned int len;
+
+                       len = min_t(unsigned int,
+                                   PAGE_SIZE - page_base, remaining);
+                       sg_set_page(&mr->mr_sg[i], *ppages,
+                                   len, page_base);
+                       cur->xc_page_offset += len;
+                       i++;
+                       ppages++;
+                       remaining -= len;
+
+                       if (!sg_gaps && remaining > 0 &&
+                           offset_in_page(page_base + len))
+                               goto finish;
+                       page_base = 0;
+               }
+               if (remaining == 0)
+                       cur->xc_flags |= XC_PAGES_DONE;
+       } else if (!(cur->xc_flags & XC_PAGES_DONE)) {
+               cur->xc_flags |= XC_PAGES_DONE;
+       }
+
+       /* Tail kvec */
+       if (!(cur->xc_flags & XC_TAIL_DONE) && xdrbuf->tail[0].iov_len &&
+           i < max_depth) {
+               const struct kvec *tail = &xdrbuf->tail[0];
+
+               if (!sg_gaps && i > 0) {
+                       struct scatterlist *prev = &mr->mr_sg[i - 1];
+
+                       if (offset_in_page(prev->offset + prev->length) ||
+                           offset_in_page(tail->iov_base))
+                               goto finish;
+               }
+               sg_set_page(&mr->mr_sg[i],
+                           virt_to_page(tail->iov_base),
+                           tail->iov_len,
+                           offset_in_page(tail->iov_base));
+               cur->xc_flags |= XC_TAIL_DONE;
+               i++;
+       } else if (!(cur->xc_flags & XC_TAIL_DONE) &&
+                  !xdrbuf->tail[0].iov_len) {
+               cur->xc_flags |= XC_TAIL_DONE;
+       }
+
+finish:
        mr->mr_dir = rpcrdma_data_dir(writing);
        mr->mr_nents = i;
 
@@ -338,15 +407,15 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
        mr->mr_offset = ibmr->iova;
        trace_xprtrdma_mr_map(mr);
 
-       return seg;
+       return 0;
 
 out_dmamap_err:
        trace_xprtrdma_frwr_sgerr(mr, i);
-       return ERR_PTR(-EIO);
+       return -EIO;
 
 out_mapmr_err:
        trace_xprtrdma_frwr_maperr(mr, n);
-       return ERR_PTR(-EIO);
+       return -EIO;
 }
 
 /**
index 3aac1456e23e5389fc720c116f33127053ffbc87..a77e7e48aab2eb5d9dcd4640a8aab91f78a90241 100644 (file)
@@ -200,67 +200,30 @@ rpcrdma_alloc_sparse_pages(struct xdr_buf *buf)
        return 0;
 }
 
-/* Convert @vec to a single SGL element.
- *
- * Returns pointer to next available SGE, and bumps the total number
- * of SGEs consumed.
- */
-static struct rpcrdma_mr_seg *
-rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
-                    unsigned int *n)
+static void
+rpcrdma_xdr_cursor_init(struct rpcrdma_xdr_cursor *cur,
+                       const struct xdr_buf *xdrbuf,
+                       unsigned int pos, enum rpcrdma_chunktype type)
 {
-       seg->mr_page = virt_to_page(vec->iov_base);
-       seg->mr_offset = offset_in_page(vec->iov_base);
-       seg->mr_len = vec->iov_len;
-       ++seg;
-       ++(*n);
-       return seg;
+       cur->xc_buf = xdrbuf;
+       cur->xc_page_offset = 0;
+       cur->xc_flags = 0;
+
+       if (pos != 0)
+               cur->xc_flags |= XC_HEAD_DONE;
+       if (!xdrbuf->page_len)
+               cur->xc_flags |= XC_PAGES_DONE;
+       if (type == rpcrdma_readch || type == rpcrdma_writech ||
+           !xdrbuf->tail[0].iov_len)
+               cur->xc_flags |= XC_TAIL_DONE;
 }
 
-/* Convert @xdrbuf into SGEs no larger than a page each. As they
- * are registered, these SGEs are then coalesced into RDMA segments
- * when the selected memreg mode supports it.
- *
- * Returns positive number of SGEs consumed, or a negative errno.
- */
-
-static int
-rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
-                    unsigned int pos, enum rpcrdma_chunktype type,
-                    struct rpcrdma_mr_seg *seg)
+static bool
+rpcrdma_xdr_cursor_done(const struct rpcrdma_xdr_cursor *cur)
 {
-       unsigned long page_base;
-       unsigned int len, n;
-       struct page **ppages;
-
-       n = 0;
-       if (pos == 0)
-               seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n);
-
-       len = xdrbuf->page_len;
-       ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
-       page_base = offset_in_page(xdrbuf->page_base);
-       while (len) {
-               seg->mr_page = *ppages;
-               seg->mr_offset = page_base;
-               seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
-               len -= seg->mr_len;
-               ++ppages;
-               ++seg;
-               ++n;
-               page_base = 0;
-       }
-
-       if (type == rpcrdma_readch || type == rpcrdma_writech)
-               goto out;
-
-       if (xdrbuf->tail[0].iov_len)
-               rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n);
-
-out:
-       if (unlikely(n > RPCRDMA_MAX_SEGS))
-               return -EIO;
-       return n;
+       return (cur->xc_flags & (XC_HEAD_DONE | XC_PAGES_DONE |
+                                XC_TAIL_DONE)) ==
+              (XC_HEAD_DONE | XC_PAGES_DONE | XC_TAIL_DONE);
 }
 
 static int
@@ -292,11 +255,10 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
        return 0;
 }
 
-static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
-                                                struct rpcrdma_req *req,
-                                                struct rpcrdma_mr_seg *seg,
-                                                int nsegs, bool writing,
-                                                struct rpcrdma_mr **mr)
+static int rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
+                             struct rpcrdma_req *req,
+                             struct rpcrdma_xdr_cursor *cur,
+                             bool writing, struct rpcrdma_mr **mr)
 {
        *mr = rpcrdma_mr_pop(&req->rl_free_mrs);
        if (!*mr) {
@@ -307,13 +269,13 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
        }
 
        rpcrdma_mr_push(*mr, &req->rl_registered);
-       return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr);
+       return frwr_map(r_xprt, cur, writing, req->rl_slot.rq_xid, *mr);
 
 out_getmr_err:
        trace_xprtrdma_nomrs_err(r_xprt, req);
        xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
        rpcrdma_mrs_refresh(r_xprt);
-       return ERR_PTR(-EAGAIN);
+       return -EAGAIN;
 }
 
 /* Register and XDR encode the Read list. Supports encoding a list of read
@@ -336,10 +298,10 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
                                    enum rpcrdma_chunktype rtype)
 {
        struct xdr_stream *xdr = &req->rl_stream;
-       struct rpcrdma_mr_seg *seg;
+       struct rpcrdma_xdr_cursor cur;
        struct rpcrdma_mr *mr;
        unsigned int pos;
-       int nsegs;
+       int ret;
 
        if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped)
                goto done;
@@ -347,24 +309,20 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
        pos = rqst->rq_snd_buf.head[0].iov_len;
        if (rtype == rpcrdma_areadch)
                pos = 0;
-       seg = req->rl_segments;
-       nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos,
-                                    rtype, seg);
-       if (nsegs < 0)
-               return nsegs;
+       rpcrdma_xdr_cursor_init(&cur, &rqst->rq_snd_buf, pos, rtype);
 
        do {
-               seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr);
-               if (IS_ERR(seg))
-                       return PTR_ERR(seg);
+               ret = rpcrdma_mr_prepare(r_xprt, req, &cur, false, &mr);
+               if (ret)
+                       return ret;
 
                if (encode_read_segment(xdr, mr, pos) < 0)
                        return -EMSGSIZE;
 
-               trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, nsegs);
+               trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr,
+                                         rpcrdma_xdr_cursor_done(&cur));
                r_xprt->rx_stats.read_chunk_count++;
-               nsegs -= mr->mr_nents;
-       } while (nsegs);
+       } while (!rpcrdma_xdr_cursor_done(&cur));
 
 done:
        if (xdr_stream_encode_item_absent(xdr) < 0)
@@ -394,20 +352,16 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
 {
        struct xdr_stream *xdr = &req->rl_stream;
        struct rpcrdma_ep *ep = r_xprt->rx_ep;
-       struct rpcrdma_mr_seg *seg;
+       struct rpcrdma_xdr_cursor cur;
        struct rpcrdma_mr *mr;
-       int nsegs, nchunks;
+       int nchunks, ret;
        __be32 *segcount;
 
        if (wtype != rpcrdma_writech)
                goto done;
 
-       seg = req->rl_segments;
-       nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf,
-                                    rqst->rq_rcv_buf.head[0].iov_len,
-                                    wtype, seg);
-       if (nsegs < 0)
-               return nsegs;
+       rpcrdma_xdr_cursor_init(&cur, &rqst->rq_rcv_buf,
+                               rqst->rq_rcv_buf.head[0].iov_len, wtype);
 
        if (xdr_stream_encode_item_present(xdr) < 0)
                return -EMSGSIZE;
@@ -418,30 +372,30 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
 
        nchunks = 0;
        do {
-               seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
-               if (IS_ERR(seg))
-                       return PTR_ERR(seg);
+               ret = rpcrdma_mr_prepare(r_xprt, req, &cur, true, &mr);
+               if (ret)
+                       return ret;
 
                if (encode_rdma_segment(xdr, mr) < 0)
                        return -EMSGSIZE;
 
-               trace_xprtrdma_chunk_write(rqst->rq_task, mr, nsegs);
+               trace_xprtrdma_chunk_write(rqst->rq_task, mr,
+                                          rpcrdma_xdr_cursor_done(&cur));
                r_xprt->rx_stats.write_chunk_count++;
                r_xprt->rx_stats.total_rdma_request += mr->mr_length;
                nchunks++;
-               nsegs -= mr->mr_nents;
-       } while (nsegs);
+       } while (!rpcrdma_xdr_cursor_done(&cur));
 
        if (xdr_pad_size(rqst->rq_rcv_buf.page_len)) {
                if (encode_rdma_segment(xdr, ep->re_write_pad_mr) < 0)
                        return -EMSGSIZE;
 
                trace_xprtrdma_chunk_wp(rqst->rq_task, ep->re_write_pad_mr,
-                                       nsegs);
+                                       true);
                r_xprt->rx_stats.write_chunk_count++;
-               r_xprt->rx_stats.total_rdma_request += mr->mr_length;
+               r_xprt->rx_stats.total_rdma_request +=
+                       ep->re_write_pad_mr->mr_length;
                nchunks++;
-               nsegs -= mr->mr_nents;
        }
 
        /* Update count of segments in this Write chunk */
@@ -471,9 +425,9 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
                                      enum rpcrdma_chunktype wtype)
 {
        struct xdr_stream *xdr = &req->rl_stream;
-       struct rpcrdma_mr_seg *seg;
+       struct rpcrdma_xdr_cursor cur;
        struct rpcrdma_mr *mr;
-       int nsegs, nchunks;
+       int nchunks, ret;
        __be32 *segcount;
 
        if (wtype != rpcrdma_replych) {
@@ -482,10 +436,7 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
                return 0;
        }
 
-       seg = req->rl_segments;
-       nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
-       if (nsegs < 0)
-               return nsegs;
+       rpcrdma_xdr_cursor_init(&cur, &rqst->rq_rcv_buf, 0, wtype);
 
        if (xdr_stream_encode_item_present(xdr) < 0)
                return -EMSGSIZE;
@@ -496,19 +447,19 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
 
        nchunks = 0;
        do {
-               seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
-               if (IS_ERR(seg))
-                       return PTR_ERR(seg);
+               ret = rpcrdma_mr_prepare(r_xprt, req, &cur, true, &mr);
+               if (ret)
+                       return ret;
 
                if (encode_rdma_segment(xdr, mr) < 0)
                        return -EMSGSIZE;
 
-               trace_xprtrdma_chunk_reply(rqst->rq_task, mr, nsegs);
+               trace_xprtrdma_chunk_reply(rqst->rq_task, mr,
+                                          rpcrdma_xdr_cursor_done(&cur));
                r_xprt->rx_stats.reply_chunk_count++;
                r_xprt->rx_stats.total_rdma_request += mr->mr_length;
                nchunks++;
-               nsegs -= mr->mr_nents;
-       } while (nsegs);
+       } while (!rpcrdma_xdr_cursor_done(&cur));
 
        /* Update count of segments in the Reply chunk */
        *segcount = cpu_to_be32(nchunks);
index 8147d2b414940e55b66cf40b36c7fe258b31a013..37bba72065e8fa39fb24534acb1ff505f88fc94f 100644 (file)
@@ -283,19 +283,36 @@ struct rpcrdma_mr {
  * registered or invalidated. Must handle a Reply chunk:
  */
 enum {
-       RPCRDMA_MAX_IOV_SEGS    = 3,
+       RPCRDMA_MAX_IOV_SEGS    = 3,    /* head, page-boundary, tail */
        RPCRDMA_MAX_DATA_SEGS   = ((1 * 1024 * 1024) / PAGE_SIZE) + 1,
        RPCRDMA_MAX_SEGS        = RPCRDMA_MAX_DATA_SEGS +
                                  RPCRDMA_MAX_IOV_SEGS,
 };
 
-/* Arguments for DMA mapping and registration */
-struct rpcrdma_mr_seg {
-       u32             mr_len;         /* length of segment */
-       struct page     *mr_page;       /* underlying struct page */
-       u64             mr_offset;      /* IN: page offset, OUT: iova */
+/**
+ * struct rpcrdma_xdr_cursor - tracks position within an xdr_buf
+ *     for iterative MR registration
+ * @xc_buf: the xdr_buf being iterated
+ * @xc_page_offset: byte offset into the page region consumed so far
+ * @xc_flags: combination of XC_* bits
+ *
+ * Each XC_*_DONE flag indicates that this region has no
+ * remaining MR registration work.  That condition holds both when the region
+ * has already been registered by a prior frwr_map() call and
+ * when the region is excluded from this chunk type (pre-set
+ * at init time by rpcrdma_xdr_cursor_init()).  frwr_map()
+ * treats the two cases identically: skip the region.
+ */
+struct rpcrdma_xdr_cursor {
+       const struct xdr_buf            *xc_buf;
+       unsigned int                    xc_page_offset;
+       unsigned int                    xc_flags;
 };
 
+#define XC_HEAD_DONE   BIT(0)
+#define XC_PAGES_DONE  BIT(1)
+#define XC_TAIL_DONE   BIT(2)
+
 /* The Send SGE array is provisioned to send a maximum size
  * inline request:
  * - RPC-over-RDMA header
@@ -330,7 +347,6 @@ struct rpcrdma_req {
 
        struct list_head        rl_free_mrs;
        struct list_head        rl_registered;
-       struct rpcrdma_mr_seg   rl_segments[RPCRDMA_MAX_SEGS];
 };
 
 static inline struct rpcrdma_req *
@@ -450,8 +466,8 @@ rpcrdma_portstr(const struct rpcrdma_xprt *r_xprt)
 }
 
 /* Setting this to 0 ensures interoperability with early servers.
- * Setting this to 1 enhances certain unaligned read/write performance.
- * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
+ * Setting this to 1 enhances unaligned read/write performance.
+ * Default is 0, see sysctl entry and rpc_rdma.c */
 extern int xprt_rdma_pad_optimize;
 
 /* This setting controls the hunt for a supported memory
@@ -535,10 +551,10 @@ void frwr_reset(struct rpcrdma_req *req);
 int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device);
 int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr);
 void frwr_mr_release(struct rpcrdma_mr *mr);
-struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
-                               struct rpcrdma_mr_seg *seg,
-                               int nsegs, bool writing, __be32 xid,
-                               struct rpcrdma_mr *mr);
+int frwr_map(struct rpcrdma_xprt *r_xprt,
+            struct rpcrdma_xdr_cursor *cur,
+            bool writing, __be32 xid,
+            struct rpcrdma_mr *mr);
 int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
 void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
 void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);