git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
xprtrdma: Pull up sometimes
authorChuck Lever <chuck.lever@oracle.com>
Thu, 17 Oct 2019 18:31:53 +0000 (14:31 -0400)
committerAnna Schumaker <Anna.Schumaker@Netapp.com>
Thu, 24 Oct 2019 14:30:40 +0000 (10:30 -0400)
On some platforms, DMA mapping part of a page is more costly than
copying bytes. Restore the pull-up code and use that when we
think it's going to be faster. The heuristic for now is to pull up
when the size of the RPC message body fits in the buffer underlying
the head iovec.

Indeed, not involving the I/O MMU can help the RPC/RDMA transport
scale better for tiny I/Os across more RDMA devices. This is because
interaction with the I/O MMU is eliminated, as is handling a Send
completion, for each of these small I/Os. Without the explicit
unmapping, the NIC no longer needs to do a costly internal TLB shoot
down for buffers that are just a handful of bytes.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
include/trace/events/rpcrdma.h
net/sunrpc/xprtrdma/backchannel.c
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h

index f8edab91e09c2eeb24c730e26c984ecc06e1470b..213c72585a5fcb84ddc788382681696090f7c571 100644 (file)
@@ -532,6 +532,8 @@ DEFINE_WRCH_EVENT(write);
 DEFINE_WRCH_EVENT(reply);
 
 TRACE_DEFINE_ENUM(rpcrdma_noch);
+TRACE_DEFINE_ENUM(rpcrdma_noch_pullup);
+TRACE_DEFINE_ENUM(rpcrdma_noch_mapped);
 TRACE_DEFINE_ENUM(rpcrdma_readch);
 TRACE_DEFINE_ENUM(rpcrdma_areadch);
 TRACE_DEFINE_ENUM(rpcrdma_writech);
@@ -540,6 +542,8 @@ TRACE_DEFINE_ENUM(rpcrdma_replych);
 #define xprtrdma_show_chunktype(x)                                     \
                __print_symbolic(x,                                     \
                                { rpcrdma_noch, "inline" },             \
+                               { rpcrdma_noch_pullup, "pullup" },      \
+                               { rpcrdma_noch_mapped, "mapped" },      \
                                { rpcrdma_readch, "read list" },        \
                                { rpcrdma_areadch, "*read list" },      \
                                { rpcrdma_writech, "write list" },      \
index 50e075fcdd8feb13bc9747c613373e9b2570c073..11685245546a3f8318b50a6395dce8c3a4f38587 100644 (file)
@@ -79,7 +79,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
        *p = xdr_zero;
 
        if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN,
-                                     &rqst->rq_snd_buf, rpcrdma_noch))
+                                     &rqst->rq_snd_buf, rpcrdma_noch_pullup))
                return -EIO;
 
        trace_xprtrdma_cb_reply(rqst);
index a441dbf9f1986737cc1b7274d088a8a2de2ff003..4ad88893e9641a5fed986add6aaa35716eeb5194 100644 (file)
@@ -392,7 +392,7 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
        unsigned int pos;
        int nsegs;
 
-       if (rtype == rpcrdma_noch)
+       if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped)
                goto done;
 
        pos = rqst->rq_snd_buf.head[0].iov_len;
@@ -691,6 +691,72 @@ out_mapping_err:
        return false;
 }
 
+/* Copy the tail to the end of the head buffer.
+ */
+static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt,
+                                   struct rpcrdma_req *req,
+                                   struct xdr_buf *xdr)
+{
+       unsigned char *dst;
+
+       dst = (unsigned char *)xdr->head[0].iov_base;
+       dst += xdr->head[0].iov_len + xdr->page_len;
+       memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len);
+       r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len;
+}
+
+/* Copy pagelist content into the head buffer.
+ */
+static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt,
+                                   struct rpcrdma_req *req,
+                                   struct xdr_buf *xdr)
+{
+       unsigned int len, page_base, remaining;
+       struct page **ppages;
+       unsigned char *src, *dst;
+
+       dst = (unsigned char *)xdr->head[0].iov_base;
+       dst += xdr->head[0].iov_len;
+       ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+       page_base = offset_in_page(xdr->page_base);
+       remaining = xdr->page_len;
+       while (remaining) {
+               src = page_address(*ppages);
+               src += page_base;
+               len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
+               memcpy(dst, src, len);
+               r_xprt->rx_stats.pullup_copy_count += len;
+
+               ppages++;
+               dst += len;
+               remaining -= len;
+               page_base = 0;
+       }
+}
+
+/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it.
+ * When the head, pagelist, and tail are small, a pull-up copy
+ * is considerably less costly than DMA mapping the components
+ * of @xdr.
+ *
+ * Assumptions:
+ *  - the caller has already verified that the total length
+ *    of the RPC Call body will fit into @rl_sendbuf.
+ */
+static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt,
+                                       struct rpcrdma_req *req,
+                                       struct xdr_buf *xdr)
+{
+       if (unlikely(xdr->tail[0].iov_len))
+               rpcrdma_pullup_tail_iov(r_xprt, req, xdr);
+
+       if (unlikely(xdr->page_len))
+               rpcrdma_pullup_pagelist(r_xprt, req, xdr);
+
+       /* The whole RPC message resides in the head iovec now */
+       return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len);
+}
+
 static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt,
                                        struct rpcrdma_req *req,
                                        struct xdr_buf *xdr)
@@ -779,7 +845,11 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
                goto out_unmap;
 
        switch (rtype) {
-       case rpcrdma_noch:
+       case rpcrdma_noch_pullup:
+               if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr))
+                       goto out_unmap;
+               break;
+       case rpcrdma_noch_mapped:
                if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr))
                        goto out_unmap;
                break;
@@ -827,6 +897,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
        struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
        struct xdr_stream *xdr = &req->rl_stream;
        enum rpcrdma_chunktype rtype, wtype;
+       struct xdr_buf *buf = &rqst->rq_snd_buf;
        bool ddp_allowed;
        __be32 *p;
        int ret;
@@ -884,8 +955,9 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
         */
        if (rpcrdma_args_inline(r_xprt, rqst)) {
                *p++ = rdma_msg;
-               rtype = rpcrdma_noch;
-       } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
+               rtype = buf->len < rdmab_length(req->rl_sendbuf) ?
+                       rpcrdma_noch_pullup : rpcrdma_noch_mapped;
+       } else if (ddp_allowed && buf->flags & XDRBUF_WRITE) {
                *p++ = rdma_msg;
                rtype = rpcrdma_readch;
        } else {
@@ -927,7 +999,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
                goto out_err;
 
        ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len,
-                                       &rqst->rq_snd_buf, rtype);
+                                       buf, rtype);
        if (ret)
                goto out_err;
 
index 2f465825534361afb8ebab0035fabd7b0268becd..a514e2c89ac342053fae2181a4481bdade0132d6 100644 (file)
@@ -1165,7 +1165,7 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
        for (i = 0; i < buf->rb_max_requests; i++) {
                struct rpcrdma_req *req;
 
-               req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE,
+               req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2,
                                         GFP_KERNEL);
                if (!req)
                        goto out;
index cdd6a3d43e0f2df23d30be6d12ee7c3495a80fdf..5d15140a026601819f851fa5f3b98682e7898167 100644 (file)
@@ -554,6 +554,8 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
 
 enum rpcrdma_chunktype {
        rpcrdma_noch = 0,
+       rpcrdma_noch_pullup,
+       rpcrdma_noch_mapped,
        rpcrdma_readch,
        rpcrdma_areadch,
        rpcrdma_writech,