xfs: use vmalloc instead of vm_map_area for buffer backing memory
Author:     Christoph Hellwig <hch@lst.de>
AuthorDate: Mon, 10 Mar 2025 13:19:13 +0000 (14:19 +0100)
Commit:     Carlos Maiolino <cem@kernel.org>
CommitDate: Mon, 10 Mar 2025 13:29:44 +0000 (14:29 +0100)
The fallback buffer allocation path currently open codes a suboptimal
version of vmalloc to allocate pages that are then mapped into
vmalloc space.  Switch to using vmalloc instead, which uses all the
optimizations in the common vmalloc code, and removes the need to
track the backing pages in the xfs_buf structure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
fs/xfs/xfs_buf.c
fs/xfs/xfs_buf.h
fs/xfs/xfs_buf_mem.c

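For orientation before the diff: the old fallback path built an array of single pages with alloc_pages_bulk() and then mapped them with vm_map_ram(); the new fallback simply asks __vmalloc() for the whole region. The sketch below condenses that new retry loop from the xfs_buf_alloc_backing_mem() hunk in this patch — the wrapper name and argument list are illustrative only, and the xb_page_retries statistics bump is omitted.

/*
 * Condensed sketch of the new fallback in xfs_buf_alloc_backing_mem()
 * (see the fs/xfs/xfs_buf.c hunk below).  The wrapper name and the
 * stripped-down arguments are illustrative; stats accounting is left out.
 */
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>

static void *buf_fallback_alloc(size_t size, gfp_t gfp_mask, bool readahead)
{
	void	*addr;

	for (;;) {
		/* vmalloc builds the page array and the virtual mapping for us. */
		addr = __vmalloc(size, gfp_mask);
		if (addr)
			return addr;
		/* Readahead is best effort: fail instead of retrying. */
		if (readahead)
			return NULL;
		/* Back off briefly before retrying, as the patch does. */
		memalloc_retry_wait(gfp_mask);
	}
}

Freeing is the mirror image in xfs_buf_free(): vmalloc-backed buffers go through vfree(), kmalloc-backed ones through kfree(), and single-folio buffers drop their reference with folio_put(virt_to_folio(bp->b_addr)).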
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b5ec7d83210f63407791fcfee0113f9f2e07677a..4aaa588330e4acb1f0f44dc7e37e77fccafb8f54 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -55,13 +55,6 @@ static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
        return bp->b_rhash_key == XFS_BUF_DADDR_NULL;
 }
 
-static inline int
-xfs_buf_vmap_len(
-       struct xfs_buf  *bp)
-{
-       return (bp->b_page_count * PAGE_SIZE);
-}
-
 /*
  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
  * b_lru_ref count so that the buffer is freed immediately when the buffer
@@ -190,29 +183,6 @@ _xfs_buf_alloc(
        return 0;
 }
 
-static void
-xfs_buf_free_pages(
-       struct xfs_buf  *bp)
-{
-       uint            i;
-
-       ASSERT(bp->b_flags & _XBF_PAGES);
-
-       if (is_vmalloc_addr(bp->b_addr))
-               vm_unmap_ram(bp->b_addr, bp->b_page_count);
-
-       for (i = 0; i < bp->b_page_count; i++) {
-               if (bp->b_pages[i])
-                       folio_put(page_folio(bp->b_pages[i]));
-       }
-       mm_account_reclaimed_pages(howmany(BBTOB(bp->b_length), PAGE_SIZE));
-
-       if (bp->b_pages != bp->b_page_array)
-               kfree(bp->b_pages);
-       bp->b_pages = NULL;
-       bp->b_flags &= ~_XBF_PAGES;
-}
-
 static void
 xfs_buf_free_callback(
        struct callback_head    *cb)
@@ -227,16 +197,23 @@ static void
 xfs_buf_free(
        struct xfs_buf          *bp)
 {
+       unsigned int            size = BBTOB(bp->b_length);
+
        trace_xfs_buf_free(bp, _RET_IP_);
 
        ASSERT(list_empty(&bp->b_lru));
 
+       if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE)
+               mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT));
+
        if (xfs_buftarg_is_mem(bp->b_target))
                xmbuf_unmap_page(bp);
-       else if (bp->b_flags & _XBF_PAGES)
-               xfs_buf_free_pages(bp);
+       else if (is_vmalloc_addr(bp->b_addr))
+               vfree(bp->b_addr);
        else if (bp->b_flags & _XBF_KMEM)
                kfree(bp->b_addr);
+       else
+               folio_put(virt_to_folio(bp->b_addr));
 
        call_rcu(&bp->b_rcu, xfs_buf_free_callback);
 }
@@ -264,9 +241,6 @@ xfs_buf_alloc_kmem(
                bp->b_addr = NULL;
                return -ENOMEM;
        }
-       bp->b_pages = bp->b_page_array;
-       bp->b_pages[0] = kmem_to_page(bp->b_addr);
-       bp->b_page_count = 1;
        bp->b_flags |= _XBF_KMEM;
        return 0;
 }
@@ -287,9 +261,9 @@ xfs_buf_alloc_kmem(
  * by the rest of the code - the buffer memory spans a single contiguous memory
  * region that we don't have to map and unmap to access the data directly.
  *
- * The third type of buffer is the multi-page buffer. These are always made
- * up of single pages so that they can be fed to vmap_ram() to return a
- * contiguous memory region we can access the data through.
+ * The third type of buffer is the vmalloc()d buffer. This provides the buffer
+ * with the required contiguous memory region but backed by discontiguous
+ * physical pages.
  */
 static int
 xfs_buf_alloc_backing_mem(
@@ -299,7 +273,6 @@ xfs_buf_alloc_backing_mem(
        size_t          size = BBTOB(bp->b_length);
        gfp_t           gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
        struct folio    *folio;
-       long            filled = 0;
 
        if (xfs_buftarg_is_mem(bp->b_target))
                return xmbuf_map_page(bp);
@@ -351,98 +324,18 @@ xfs_buf_alloc_backing_mem(
                goto fallback;
        }
        bp->b_addr = folio_address(folio);
-       bp->b_page_array[0] = &folio->page;
-       bp->b_pages = bp->b_page_array;
-       bp->b_page_count = 1;
-       bp->b_flags |= _XBF_PAGES;
        return 0;
 
 fallback:
-       /* Fall back to allocating an array of single page folios. */
-       bp->b_page_count = DIV_ROUND_UP(size, PAGE_SIZE);
-       if (bp->b_page_count <= XB_PAGES) {
-               bp->b_pages = bp->b_page_array;
-       } else {
-               bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
-                                       gfp_mask);
-               if (!bp->b_pages)
-                       return -ENOMEM;
-       }
-       bp->b_flags |= _XBF_PAGES;
-
-       /*
-        * Bulk filling of pages can take multiple calls. Not filling the entire
-        * array is not an allocation failure, so don't back off if we get at
-        * least one extra page.
-        */
        for (;;) {
-               long    last = filled;
-
-               filled = alloc_pages_bulk(gfp_mask, bp->b_page_count,
-                                         bp->b_pages);
-               if (filled == bp->b_page_count) {
-                       XFS_STATS_INC(bp->b_mount, xb_page_found);
+               bp->b_addr = __vmalloc(size, gfp_mask);
+               if (bp->b_addr)
                        break;
-               }
-
-               if (filled != last)
-                       continue;
-
-               if (flags & XBF_READ_AHEAD) {
-                       xfs_buf_free_pages(bp);
+               if (flags & XBF_READ_AHEAD)
                        return -ENOMEM;
-               }
-
                XFS_STATS_INC(bp->b_mount, xb_page_retries);
                memalloc_retry_wait(gfp_mask);
        }
-       return 0;
-}
-
-/*
- *     Map buffer into kernel address-space if necessary.
- */
-STATIC int
-_xfs_buf_map_pages(
-       struct xfs_buf          *bp,
-       xfs_buf_flags_t         flags)
-{
-       ASSERT(bp->b_flags & _XBF_PAGES);
-       if (bp->b_page_count == 1) {
-               /* A single page buffer is always mappable */
-               bp->b_addr = page_address(bp->b_pages[0]);
-       } else {
-               int retried = 0;
-               unsigned nofs_flag;
-
-               /*
-                * vm_map_ram() will allocate auxiliary structures (e.g.
-                * pagetables) with GFP_KERNEL, yet we often under a scoped nofs
-                * context here. Mixing GFP_KERNEL with GFP_NOFS allocations
-                * from the same call site that can be run from both above and
-                * below memory reclaim causes lockdep false positives. Hence we
-                * always need to force this allocation to nofs context because
-                * we can't pass __GFP_NOLOCKDEP down to auxillary structures to
-                * prevent false positive lockdep reports.
-                *
-                * XXX(dgc): I think dquot reclaim is the only place we can get
-                * to this function from memory reclaim context now. If we fix
-                * that like we've fixed inode reclaim to avoid writeback from
-                * reclaim, this nofs wrapping can go away.
-                */
-               nofs_flag = memalloc_nofs_save();
-               do {
-                       bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-                                               -1);
-                       if (bp->b_addr)
-                               break;
-                       vm_unmap_aliases();
-               } while (retried++ <= 1);
-               memalloc_nofs_restore(nofs_flag);
-
-               if (!bp->b_addr)
-                       return -ENOMEM;
-       }
 
        return 0;
 }
@@ -562,7 +455,7 @@ xfs_buf_find_lock(
                        return -ENOENT;
                }
                ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
-               bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
+               bp->b_flags &= _XBF_KMEM;
                bp->b_ops = NULL;
        }
        return 0;
@@ -748,18 +641,6 @@ xfs_buf_get_map(
                        xfs_perag_put(pag);
        }
 
-       /* We do not hold a perag reference anymore. */
-       if (!bp->b_addr) {
-               error = _xfs_buf_map_pages(bp, flags);
-               if (unlikely(error)) {
-                       xfs_warn_ratelimited(btp->bt_mount,
-                               "%s: failed to map %u pages", __func__,
-                               bp->b_page_count);
-                       xfs_buf_relse(bp);
-                       return error;
-               }
-       }
-
        /*
         * Clear b_error if this is a lookup from a caller that doesn't expect
         * valid data to be found in the buffer.
@@ -1002,14 +883,6 @@ xfs_buf_get_uncached(
        if (error)
                goto fail_free_buf;
 
-       if (!bp->b_addr)
-               error = _xfs_buf_map_pages(bp, 0);
-       if (unlikely(error)) {
-               xfs_warn(target->bt_mount,
-                       "%s: failed to map pages", __func__);
-               goto fail_free_buf;
-       }
-
        trace_xfs_buf_get_uncached(bp, _RET_IP_);
        *bpp = bp;
        return 0;
@@ -1343,7 +1216,7 @@ __xfs_buf_ioend(
        if (bp->b_flags & XBF_READ) {
                if (!bp->b_error && is_vmalloc_addr(bp->b_addr))
                        invalidate_kernel_vmap_range(bp->b_addr,
-                                       xfs_buf_vmap_len(bp));
+                               roundup(BBTOB(bp->b_length), PAGE_SIZE));
                if (!bp->b_error && bp->b_ops)
                        bp->b_ops->verify_read(bp);
                if (!bp->b_error)
@@ -1504,29 +1377,48 @@ static void
 xfs_buf_submit_bio(
        struct xfs_buf          *bp)
 {
-       unsigned int            size = BBTOB(bp->b_length);
-       unsigned int            map = 0, p;
+       unsigned int            map = 0;
        struct blk_plug         plug;
        struct bio              *bio;
 
-       bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
-                       xfs_buf_bio_op(bp), GFP_NOIO);
-       bio->bi_private = bp;
-       bio->bi_end_io = xfs_buf_bio_end_io;
+       if (is_vmalloc_addr(bp->b_addr)) {
+               unsigned int    size = BBTOB(bp->b_length);
+               unsigned int    alloc_size = roundup(size, PAGE_SIZE);
+               void            *data = bp->b_addr;
 
-       if (bp->b_page_count == 1) {
-               __bio_add_page(bio, virt_to_page(bp->b_addr), size,
-                               offset_in_page(bp->b_addr));
-       } else {
-               for (p = 0; p < bp->b_page_count; p++)
-                       __bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
-               bio->bi_iter.bi_size = size; /* limit to the actual size used */
+               bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT,
+                               xfs_buf_bio_op(bp), GFP_NOIO);
+
+               do {
+                       unsigned int    len = min(size, PAGE_SIZE);
 
-               if (is_vmalloc_addr(bp->b_addr))
-                       flush_kernel_vmap_range(bp->b_addr,
-                                       xfs_buf_vmap_len(bp));
+                       ASSERT(offset_in_page(data) == 0);
+                       __bio_add_page(bio, vmalloc_to_page(data), len, 0);
+                       data += len;
+                       size -= len;
+               } while (size);
+
+               flush_kernel_vmap_range(bp->b_addr, alloc_size);
+       } else {
+               /*
+                * Single folio or slab allocation.  Must be contiguous and thus
+                * only a single bvec is needed.
+                *
+                * This uses the page based bio add helper for now as that is
+                * the lowest common denominator between folios and slab
+                * allocations.  To be replaced with a better block layer
+                * helper soon (hopefully).
+                */
+               bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
+                               GFP_NOIO);
+               __bio_add_page(bio, virt_to_page(bp->b_addr),
+                               BBTOB(bp->b_length),
+                               offset_in_page(bp->b_addr));
        }
 
+       bio->bi_private = bp;
+       bio->bi_end_io = xfs_buf_bio_end_io;
+
        /*
         * If there is more than one map segment, split out a new bio for each
         * map except of the last one.  The last map is handled by the
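One note on the I/O side of the patch above before the header changes below: a vmalloc'd buffer is virtually contiguous but physically scattered, so xfs_buf_submit_bio() now walks it page by page with vmalloc_to_page(), while contiguous folio or slab buffers still need only a single bvec. A condensed sketch of the vmalloc branch follows; the helper name and parameters are illustrative and error handling is omitted.

/*
 * Condensed sketch of the vmalloc branch of xfs_buf_submit_bio() above.
 * vmalloc memory is only virtually contiguous, so each page is looked up
 * with vmalloc_to_page() and added to the bio individually.
 */
#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>

static struct bio *submit_vmalloc_buffer(struct block_device *bdev,
		void *addr, unsigned int size, blk_opf_t opf)
{
	unsigned int	alloc_size = roundup(size, PAGE_SIZE);
	void		*data = addr;
	struct bio	*bio;

	bio = bio_alloc(bdev, alloc_size >> PAGE_SHIFT, opf, GFP_NOIO);
	do {
		unsigned int	len = min(size, PAGE_SIZE);

		/* vmalloc allocations are page aligned, so no page offset. */
		__bio_add_page(bio, vmalloc_to_page(data), len, 0);
		data += len;
		size -= len;
	} while (size);

	/* Keep the CPU's view coherent with the vmap alias before I/O. */
	flush_kernel_vmap_range(addr, alloc_size);
	return bio;
}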
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 8db522f19b0c0fe29041f634479a1bec9bef073a..db43bdc17f55bb50fbab291330ebd8692c6f3e26 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -36,7 +36,6 @@ struct xfs_buf;
 #define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
 
 /* flags used only internally */
-#define _XBF_PAGES      (1u << 20)/* backed by refcounted pages */
 #define _XBF_KMEM       (1u << 21)/* backed by heap memory */
 #define _XBF_DELWRI_Q   (1u << 22)/* buffer on a delwri queue */
 
@@ -61,7 +60,6 @@ typedef unsigned int xfs_buf_flags_t;
        { XBF_STALE,            "STALE" }, \
        { XBF_WRITE_FAIL,       "WRITE_FAIL" }, \
        { _XBF_LOGRECOVERY,     "LOG_RECOVERY" }, \
-       { _XBF_PAGES,           "PAGES" }, \
        { _XBF_KMEM,            "KMEM" }, \
        { _XBF_DELWRI_Q,        "DELWRI_Q" }, \
        /* The following interface flags should never be set */ \
@@ -122,8 +120,6 @@ struct xfs_buftarg {
        struct xfs_buf_cache    bt_cache[];
 };
 
-#define XB_PAGES       2
-
 struct xfs_buf_map {
        xfs_daddr_t             bm_bn;  /* block number for I/O */
        int                     bm_len; /* size of I/O */
@@ -185,13 +181,10 @@ struct xfs_buf {
        struct xfs_buf_log_item *b_log_item;
        struct list_head        b_li_list;      /* Log items list head */
        struct xfs_trans        *b_transp;
-       struct page             **b_pages;      /* array of page pointers */
-       struct page             *b_page_array[XB_PAGES]; /* inline pages */
        struct xfs_buf_map      *b_maps;        /* compound buffer map */
        struct xfs_buf_map      __b_map;        /* inline compound buffer map */
        int                     b_map_count;
        atomic_t                b_pin_count;    /* pin count */
-       unsigned int            b_page_count;   /* size of page array */
        int                     b_error;        /* error code on I/O */
        void                    (*b_iodone)(struct xfs_buf *bp);
 
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 5b64a2b3b113f96429658b596175142e2e8e51ad..b207754d2ee029965cabddf9eaf3f8d8db7cf56e 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -169,9 +169,6 @@ xmbuf_map_page(
        unlock_page(page);
 
        bp->b_addr = page_address(page);
-       bp->b_pages = bp->b_page_array;
-       bp->b_pages[0] = page;
-       bp->b_page_count = 1;
        return 0;
 }
 
@@ -180,16 +177,10 @@ void
 xmbuf_unmap_page(
        struct xfs_buf          *bp)
 {
-       struct page             *page = bp->b_pages[0];
-
        ASSERT(xfs_buftarg_is_mem(bp->b_target));
 
-       put_page(page);
-
+       put_page(virt_to_page(bp->b_addr));
        bp->b_addr = NULL;
-       bp->b_pages[0] = NULL;
-       bp->b_pages = NULL;
-       bp->b_page_count = 0;
 }
 
 /* Is this a valid daddr within the buftarg? */