git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
Revert "svcrdma: Use contiguous pages for RDMA Read sink buffers"
author Chuck Lever <chuck.lever@oracle.com>
Sun, 7 Jun 2026 02:24:56 +0000 (22:24 -0400)
committer Chuck Lever <cel@kernel.org>
Tue, 9 Jun 2026 20:32:59 +0000 (16:32 -0400)
Jonathan Flynn reports that commit 18755b8c2f24 ("svcrdma: Use
contiguous pages for RDMA Read sink buffers") regresses NFS/RDMA
WRITE throughput from 73.9 GiB/s to 30.3 GiB/s on a 128-core
single-NUMA-node server driving dual 400Gb/s links with 640 nfsd
threads. Server CPU utilization rises from 8.5% to 76%, with
roughly three quarters of all cycles spent spinning on zone->lock.

The sink buffers are allocated as high-order page blocks, split
into single pages so each sub-page carries an independent refcount,
and later released one page at a time through folio batches. The
per-CPU page caches cannot satisfy an allocation stream whose alloc
order differs from its free order, so every sink buffer page makes
a round trip through the buddy allocator's free lists, serialized
on the zone lock of the single NUMA node. The rq_pages entries that
the split pages displace, bulk-allocated moments earlier by
svc_alloc_arg(), are freed without ever being used, doubling the
allocator traffic.

The regression cannot be addressed trivially. Revert the commit
now; a reworked approach can return in an upcoming merge window.

Reported-by: Jonathan Flynn <jonathan.flynn@hammerspace.com>
Reported-by: Mike Snitzer <snitzer@kernel.org>
Closes: https://lore.kernel.org/linux-nfs/aiHlPmeZq3WgMwoJ@kernel.org/
Closes: https://lore.kernel.org/linux-nfs/3cb119b4b2a8aada30c0c60286778a54@mail.gmail.com/
Fixes: 18755b8c2f24 ("svcrdma: Use contiguous pages for RDMA Read sink buffers")
Cc: stable@vger.kernel.org
Tested-by: Jonathan Flynn <jonathan.flynn@hammerspace.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
net/sunrpc/xprtrdma/svc_rdma_rw.c

index cca8ec973de4b98598585cf900837b29d19c14f3..13554793b039d364bd13197339264047b4eb52d2 100644 (file)
@@ -745,216 +745,6 @@ int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma,
        return xdr->len;
 }
 
-/*
- * Cap contiguous RDMA Read sink allocations at order-4.
- * Higher orders risk allocation failure under
- * __GFP_NORETRY, which would negate the benefit of the
- * contiguous fast path.
- */
-#define SVC_RDMA_CONTIG_MAX_ORDER      4
-
-/**
- * svc_rdma_alloc_read_pages - Allocate physically contiguous pages
- * @nr_pages: number of pages needed
- * @order: on success, set to the allocation order
- *
- * Attempts a higher-order allocation, falling back to smaller orders.
- * The returned pages are split immediately so each sub-page has its
- * own refcount and can be freed independently.
- *
- * Returns a pointer to the first page on success, or NULL if even
- * order-1 allocation fails.
- */
-static struct page *
-svc_rdma_alloc_read_pages(unsigned int nr_pages, unsigned int *order)
-{
-       unsigned int o;
-       struct page *page;
-
-       o = min(get_order(nr_pages << PAGE_SHIFT),
-               SVC_RDMA_CONTIG_MAX_ORDER);
-
-       while (o >= 1) {
-               page = alloc_pages(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN,
-                                  o);
-               if (page) {
-                       split_page(page, o);
-                       *order = o;
-                       return page;
-               }
-               o--;
-       }
-       return NULL;
-}
-
-/*
- * svc_rdma_fill_contig_bvec - Replace rq_pages with a contiguous allocation
- * @rqstp: RPC transaction context
- * @head: context for ongoing I/O
- * @bv: bvec entry to fill
- * @pages_left: number of data pages remaining in the segment
- * @len_left: bytes remaining in the segment
- *
- * On success, fills @bv with a bvec spanning the contiguous range and
- * advances rc_curpage/rc_page_count. Returns the byte length covered,
- * or zero if the allocation failed or would overrun rq_maxpages.
- */
-static unsigned int
-svc_rdma_fill_contig_bvec(struct svc_rqst *rqstp,
-                         struct svc_rdma_recv_ctxt *head,
-                         struct bio_vec *bv, unsigned int pages_left,
-                         unsigned int len_left)
-{
-       unsigned int order, npages, chunk_pages, chunk_len, i;
-       struct page *page;
-
-       page = svc_rdma_alloc_read_pages(pages_left, &order);
-       if (!page)
-               return 0;
-       npages = 1 << order;
-
-       if (head->rc_curpage + npages > rqstp->rq_maxpages) {
-               for (i = 0; i < npages; i++)
-                       __free_page(page + i);
-               return 0;
-       }
-
-       /*
-        * Replace rq_pages[] entries with pages from the contiguous
-        * allocation. If npages exceeds chunk_pages, the extra pages
-        * stay in rq_pages[] for later reuse or normal rqst teardown.
-        */
-       for (i = 0; i < npages; i++) {
-               svc_rqst_page_release(rqstp,
-                                     rqstp->rq_pages[head->rc_curpage + i]);
-               rqstp->rq_pages[head->rc_curpage + i] = page + i;
-       }
-
-       chunk_pages = min(npages, pages_left);
-       chunk_len = min_t(unsigned int, chunk_pages << PAGE_SHIFT, len_left);
-       bvec_set_page(bv, page, chunk_len, 0);
-       head->rc_page_count += chunk_pages;
-       head->rc_curpage += chunk_pages;
-       return chunk_len;
-}
-
-/*
- * svc_rdma_fill_page_bvec - Add a single rq_page to the bvec array
- * @head: context for ongoing I/O
- * @ctxt: R/W context whose bvec array is being filled
- * @cur: page to add
- * @bvec_idx: pointer to current bvec index, not advanced on merge
- * @len_left: bytes remaining in the segment
- *
- * If @cur is physically contiguous with the preceding bvec, it is
- * merged by extending that bvec's length. Otherwise a new bvec
- * entry is created. Returns the byte length covered.
- */
-static unsigned int
-svc_rdma_fill_page_bvec(struct svc_rdma_recv_ctxt *head,
-                       struct svc_rdma_rw_ctxt *ctxt, struct page *cur,
-                       unsigned int *bvec_idx, unsigned int len_left)
-{
-       unsigned int chunk_len = min_t(unsigned int, PAGE_SIZE, len_left);
-
-       head->rc_page_count++;
-       head->rc_curpage++;
-
-       if (*bvec_idx > 0) {
-               struct bio_vec *prev = &ctxt->rw_bvec[*bvec_idx - 1];
-
-               if (page_to_phys(prev->bv_page) + prev->bv_offset +
-                   prev->bv_len == page_to_phys(cur)) {
-                       prev->bv_len += chunk_len;
-                       return chunk_len;
-               }
-       }
-
-       bvec_set_page(&ctxt->rw_bvec[*bvec_idx], cur, chunk_len, 0);
-       (*bvec_idx)++;
-       return chunk_len;
-}
-
-/**
- * svc_rdma_build_read_segment_contig - Build RDMA Read WR with contiguous pages
- * @rqstp: RPC transaction context
- * @head: context for ongoing I/O
- * @segment: co-ordinates of remote memory to be read
- *
- * Greedily allocates higher-order pages to cover the segment,
- * building one bvec per contiguous chunk. Each allocation is
- * split so sub-pages have independent refcounts. When a
- * higher-order allocation fails, remaining pages are covered
- * individually, merging adjacent pages into the preceding bvec
- * when they are physically contiguous. The split sub-pages
- * replace entries in rq_pages[] so downstream cleanup is
- * unchanged.
- *
- * Returns:
- *   %0: the Read WR was constructed successfully
- *   %-ENOMEM: allocation failed
- *   %-EIO: a DMA mapping error occurred
- */
-static int svc_rdma_build_read_segment_contig(struct svc_rqst *rqstp,
-                                             struct svc_rdma_recv_ctxt *head,
-                                             const struct svc_rdma_segment *segment)
-{
-       struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp);
-       struct svc_rdma_chunk_ctxt *cc = &head->rc_cc;
-       unsigned int nr_data_pages, bvec_idx;
-       struct svc_rdma_rw_ctxt *ctxt;
-       unsigned int len_left;
-       int ret;
-
-       nr_data_pages = PAGE_ALIGN(segment->rs_length) >> PAGE_SHIFT;
-       if (head->rc_curpage + nr_data_pages > rqstp->rq_maxpages)
-               return -ENOMEM;
-
-       ctxt = svc_rdma_get_rw_ctxt(rdma, nr_data_pages);
-       if (!ctxt)
-               return -ENOMEM;
-
-       bvec_idx = 0;
-       len_left = segment->rs_length;
-       while (len_left) {
-               unsigned int pages_left = PAGE_ALIGN(len_left) >> PAGE_SHIFT;
-               unsigned int chunk_len = 0;
-
-               if (pages_left >= 2)
-                       chunk_len = svc_rdma_fill_contig_bvec(rqstp, head,
-                                                             &ctxt->rw_bvec[bvec_idx],
-                                                             pages_left, len_left);
-               if (chunk_len) {
-                       bvec_idx++;
-               } else {
-                       struct page *cur =
-                               rqstp->rq_pages[head->rc_curpage];
-                       chunk_len = svc_rdma_fill_page_bvec(head, ctxt, cur,
-                                                           &bvec_idx,
-                                                           len_left);
-               }
-
-               len_left -= chunk_len;
-       }
-
-       ctxt->rw_nents = bvec_idx;
-
-       head->rc_pageoff = offset_in_page(segment->rs_length);
-       if (head->rc_pageoff)
-               head->rc_curpage--;
-
-       ret = svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset,
-                                  segment->rs_handle, segment->rs_length,
-                                  DMA_FROM_DEVICE);
-       if (ret < 0)
-               return -EIO;
-       percpu_counter_inc(&svcrdma_stat_read);
-
-       list_add(&ctxt->rw_list, &cc->cc_rwctxts);
-       cc->cc_sqecount += ret;
-       return 0;
-}
-
 /**
  * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment
  * @rqstp: RPC transaction context
@@ -981,14 +771,6 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp,
        if (check_add_overflow(head->rc_pageoff, len, &total))
                return -EINVAL;
        nr_bvec = PAGE_ALIGN(total) >> PAGE_SHIFT;
-
-       if (head->rc_pageoff == 0 && nr_bvec >= 2) {
-               ret = svc_rdma_build_read_segment_contig(rqstp, head,
-                                                        segment);
-               if (ret != -ENOMEM)
-                       return ret;
-       }
-
        ctxt = svc_rdma_get_rw_ctxt(rdma, nr_bvec);
        if (!ctxt)
                return -ENOMEM;
@@ -1334,11 +1116,6 @@ static void svc_rdma_clear_rqst_pages(struct svc_rqst *rqstp,
 {
        unsigned int i;
 
-       /*
-        * Move only pages containing RPC data into rc_pages[]. Pages
-        * from a contiguous allocation that were not used for the
-        * payload remain in rq_pages[] for subsequent reuse.
-        */
        for (i = 0; i < head->rc_page_count; i++) {
                head->rc_pages[i] = rqstp->rq_pages[i];
                rqstp->rq_pages[i] = NULL;