xfs: use vmalloc instead of vm_map_area for buffer backing memory
Author:     Christoph Hellwig <hch@lst.de>
AuthorDate: Mon, 10 Mar 2025 13:19:13 +0000 (14:19 +0100)
Commit:     Carlos Maiolino <cem@kernel.org>
CommitDate: Mon, 10 Mar 2025 13:29:44 +0000 (14:29 +0100)
The fallback buffer allocation path currently open codes a suboptimal
version of vmalloc to allocate pages that are then mapped into
vmalloc space.  Switch to using vmalloc instead, which uses all the
optimizations in the common vmalloc code, and removes the need to
track the backing pages in the xfs_buf structure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
fs/xfs/xfs_buf.c
fs/xfs/xfs_buf.h
fs/xfs/xfs_buf_mem.c

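For orientation before the diff: the old fallback path built an array of single pages with alloc_pages_bulk() and then mapped them with vm_map_ram(); the new fallback simply asks __vmalloc() for the whole region. The sketch below condenses that new retry loop from the xfs_buf_alloc_backing_mem() hunk in this patch — the wrapper name and argument list are illustrative only, and the xb_page_retries statistics bump is omitted.

/*
 * Condensed sketch of the new fallback in xfs_buf_alloc_backing_mem()
 * (see the fs/xfs/xfs_buf.c hunk below).  The wrapper name and the
 * stripped-down arguments are illustrative; stats accounting is left out.
 */
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>

static void *buf_fallback_alloc(size_t size, gfp_t gfp_mask, bool readahead)
{
	void	*addr;

	for (;;) {
		/* vmalloc builds the page array and the virtual mapping for us. */
		addr = __vmalloc(size, gfp_mask);
		if (addr)
			return addr;
		/* Readahead is best effort: fail instead of retrying. */
		if (readahead)
			return NULL;
		/* Back off briefly before retrying, as the patch does. */
		memalloc_retry_wait(gfp_mask);
	}
}

Freeing is the mirror image in xfs_buf_free(): vmalloc-backed buffers go through vfree(), kmalloc-backed ones through kfree(), and single-folio buffers drop their reference with folio_put(virt_to_folio(bp->b_addr)).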
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b5ec7d83210f63407791fcfee0113f9f2e07677a..4aaa588330e4acb1f0f44dc7e37e77fccafb8f54 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -55,13 +55,6 @@ static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
        return bp->b_rhash_key == XFS_BUF_DADDR_NULL;
 }
 
-static inline int
-xfs_buf_vmap_len(
-       struct xfs_buf  *bp)
-{
-       return (bp->b_page_count * PAGE_SIZE);
-}
-
 /*
  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
  * b_lru_ref count so that the buffer is freed immediately when the buffer
@@ -190,29 +183,6 @@ _xfs_buf_alloc(
        return 0;
 }
 
-static void
-xfs_buf_free_pages(
-       struct xfs_buf  *bp)
-{
-       uint            i;
-
-       ASSERT(bp->b_flags & _XBF_PAGES);
-
-       if (is_vmalloc_addr(bp->b_addr))
-               vm_unmap_ram(bp->b_addr, bp->b_page_count);
-
-       for (i = 0; i < bp->b_page_count; i++) {
-               if (bp->b_pages[i])
-                       folio_put(page_folio(bp->b_pages[i]));
-       }
-       mm_account_reclaimed_pages(howmany(BBTOB(bp->b_length), PAGE_SIZE));
-
-       if (bp->b_pages != bp->b_page_array)
-               kfree(bp->b_pages);
-       bp->b_pages = NULL;
-       bp->b_flags &= ~_XBF_PAGES;
-}
-
 static void
 xfs_buf_free_callback(
        struct callback_head    *cb)
@@ -227,16 +197,23 @@ static void
 xfs_buf_free(
        struct xfs_buf          *bp)
 {
+       unsigned int            size = BBTOB(bp->b_length);
+
        trace_xfs_buf_free(bp, _RET_IP_);
 
        ASSERT(list_empty(&bp->b_lru));
 
+       if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE)
+               mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT));
+
        if (xfs_buftarg_is_mem(bp->b_target))
                xmbuf_unmap_page(bp);
-       else if (bp->b_flags & _XBF_PAGES)
-               xfs_buf_free_pages(bp);
+       else if (is_vmalloc_addr(bp->b_addr))
+               vfree(bp->b_addr);
        else if (bp->b_flags & _XBF_KMEM)
                kfree(bp->b_addr);
+       else
+               folio_put(virt_to_folio(bp->b_addr));
 
        call_rcu(&bp->b_rcu, xfs_buf_free_callback);
 }
@@ -264,9 +241,6 @@ xfs_buf_alloc_kmem(
                bp->b_addr = NULL;
                return -ENOMEM;
        }
-       bp->b_pages = bp->b_page_array;
-       bp->b_pages[0] = kmem_to_page(bp->b_addr);
-       bp->b_page_count = 1;
        bp->b_flags |= _XBF_KMEM;
        return 0;
 }
@@ -287,9 +261,9 @@ xfs_buf_alloc_kmem(
  * by the rest of the code - the buffer memory spans a single contiguous memory
  * region that we don't have to map and unmap to access the data directly.
  *
- * The third type of buffer is the multi-page buffer. These are always made
- * up of single pages so that they can be fed to vmap_ram() to return a
- * contiguous memory region we can access the data through.
+ * The third type of buffer is the vmalloc()d buffer. This provides the buffer
+ * with the required contiguous memory region but backed by discontiguous
+ * physical pages.
  */
 static int
 xfs_buf_alloc_backing_mem(
@@ -299,7 +273,6 @@ xfs_buf_alloc_backing_mem(
        size_t          size = BBTOB(bp->b_length);
        gfp_t           gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
        struct folio    *folio;
-       long            filled = 0;
 
        if (xfs_buftarg_is_mem(bp->b_target))
                return xmbuf_map_page(bp);
@@ -351,98 +324,18 @@ xfs_buf_alloc_backing_mem(
                goto fallback;
        }
        bp->b_addr = folio_address(folio);
-       bp->b_page_array[0] = &folio->page;
-       bp->b_pages = bp->b_page_array;
-       bp->b_page_count = 1;
-       bp->b_flags |= _XBF_PAGES;
        return 0;
 
 fallback:
-       /* Fall back to allocating an array of single page folios. */
-       bp->b_page_count = DIV_ROUND_UP(size, PAGE_SIZE);
-       if (bp->b_page_count <= XB_PAGES) {
-               bp->b_pages = bp->b_page_array;
-       } else {
-               bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
-                                       gfp_mask);
-               if (!bp->b_pages)
-                       return -ENOMEM;
-       }
-       bp->b_flags |= _XBF_PAGES;
-
-       /*
-        * Bulk filling of pages can take multiple calls. Not filling the entire
-        * array is not an allocation failure, so don't back off if we get at
-        * least one extra page.
-        */
        for (;;) {
-               long    last = filled;
-
-               filled = alloc_pages_bulk(gfp_mask, bp->b_page_count,
-                                         bp->b_pages);
-               if (filled == bp->b_page_count) {
-                       XFS_STATS_INC(bp->b_mount, xb_page_found);
+               bp->b_addr = __vmalloc(size, gfp_mask);
+               if (bp->b_addr)
                        break;
-               }
-
-               if (filled != last)
-                       continue;
-
-               if (flags & XBF_READ_AHEAD) {
-                       xfs_buf_free_pages(bp);
+               if (flags & XBF_READ_AHEAD)
                        return -ENOMEM;
-               }
-
                XFS_STATS_INC(bp->b_mount, xb_page_retries);
                memalloc_retry_wait(gfp_mask);
        }
-       return 0;
-}
-
-/*
- *     Map buffer into kernel address-space if necessary.
- */
-STATIC int
-_xfs_buf_map_pages(
-       struct xfs_buf          *bp,
-       xfs_buf_flags_t         flags)
-{
-       ASSERT(bp->b_flags & _XBF_PAGES);
-       if (bp->b_page_count == 1) {
-               /* A single page buffer is always mappable */
-               bp->b_addr = page_address(bp->b_pages[0]);
-       } else {
-               int retried = 0;
-               unsigned nofs_flag;
-
-               /*
-                * vm_map_ram() will allocate auxiliary structures (e.g.
-                * pagetables) with GFP_KERNEL, yet we often under a scoped nofs
-                * context here. Mixing GFP_KERNEL with GFP_NOFS allocations
-                * from the same call site that can be run from both above and
-                * below memory reclaim causes lockdep false positives. Hence we
-                * always need to force this allocation to nofs context because
-                * we can't pass __GFP_NOLOCKDEP down to auxillary structures to
-                * prevent false positive lockdep reports.
-                *
-                * XXX(dgc): I think dquot reclaim is the only place we can get
-                * to this function from memory reclaim context now. If we fix
-                * that like we've fixed inode reclaim to avoid writeback from
-                * reclaim, this nofs wrapping can go away.
-                */
-               nofs_flag = memalloc_nofs_save();
-               do {
-                       bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-                                               -1);
-                       if (bp->b_addr)
-                               break;
-                       vm_unmap_aliases();
-               } while (retried++ <= 1);
-               memalloc_nofs_restore(nofs_flag);
-
-               if (!bp->b_addr)
-                       return -ENOMEM;
-       }
 
        return 0;
 }
@@ -562,7 +455,7 @@ xfs_buf_find_lock(
                        return -ENOENT;
                }
                ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
-               bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
+               bp->b_flags &= _XBF_KMEM;
                bp->b_ops = NULL;
        }
        return 0;
@@ -748,18 +641,6 @@ xfs_buf_get_map(
                        xfs_perag_put(pag);
        }
 
-       /* We do not hold a perag reference anymore. */
-       if (!bp->b_addr) {
-               error = _xfs_buf_map_pages(bp, flags);
-               if (unlikely(error)) {
-                       xfs_warn_ratelimited(btp->bt_mount,
-                               "%s: failed to map %u pages", __func__,
-                               bp->b_page_count);
-                       xfs_buf_relse(bp);
-                       return error;
-               }
-       }
-
        /*
         * Clear b_error if this is a lookup from a caller that doesn't expect
         * valid data to be found in the buffer.
@@ -1002,14 +883,6 @@ xfs_buf_get_uncached(
        if (error)
                goto fail_free_buf;
 
-       if (!bp->b_addr)
-               error = _xfs_buf_map_pages(bp, 0);
-       if (unlikely(error)) {
-               xfs_warn(target->bt_mount,
-                       "%s: failed to map pages", __func__);
-               goto fail_free_buf;
-       }
-
        trace_xfs_buf_get_uncached(bp, _RET_IP_);
        *bpp = bp;
        return 0;
@@ -1343,7 +1216,7 @@ __xfs_buf_ioend(
        if (bp->b_flags & XBF_READ) {
                if (!bp->b_error && is_vmalloc_addr(bp->b_addr))
                        invalidate_kernel_vmap_range(bp->b_addr,
-                                       xfs_buf_vmap_len(bp));
+                               roundup(BBTOB(bp->b_length), PAGE_SIZE));
                if (!bp->b_error && bp->b_ops)
                        bp->b_ops->verify_read(bp);
                if (!bp->b_error)
@@ -1504,29 +1377,48 @@ static void
 xfs_buf_submit_bio(
        struct xfs_buf          *bp)
 {
-       unsigned int            size = BBTOB(bp->b_length);
-       unsigned int            map = 0, p;
+       unsigned int            map = 0;
        struct blk_plug         plug;
        struct bio              *bio;
 
-       bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
-                       xfs_buf_bio_op(bp), GFP_NOIO);
-       bio->bi_private = bp;
-       bio->bi_end_io = xfs_buf_bio_end_io;
+       if (is_vmalloc_addr(bp->b_addr)) {
+               unsigned int    size = BBTOB(bp->b_length);
+               unsigned int    alloc_size = roundup(size, PAGE_SIZE);
+               void            *data = bp->b_addr;
 
-       if (bp->b_page_count == 1) {
-               __bio_add_page(bio, virt_to_page(bp->b_addr), size,
-                               offset_in_page(bp->b_addr));
-       } else {
-               for (p = 0; p < bp->b_page_count; p++)
-                       __bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
-               bio->bi_iter.bi_size = size; /* limit to the actual size used */
+               bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT,
+                               xfs_buf_bio_op(bp), GFP_NOIO);
+
+               do {
+                       unsigned int    len = min(size, PAGE_SIZE);
 
-               if (is_vmalloc_addr(bp->b_addr))
-                       flush_kernel_vmap_range(bp->b_addr,
-                                       xfs_buf_vmap_len(bp));
+                       ASSERT(offset_in_page(data) == 0);
+                       __bio_add_page(bio, vmalloc_to_page(data), len, 0);
+                       data += len;
+                       size -= len;
+               } while (size);
+
+               flush_kernel_vmap_range(bp->b_addr, alloc_size);
+       } else {
+               /*
+                * Single folio or slab allocation.  Must be contiguous and thus
+                * only a single bvec is needed.
+                *
+                * This uses the page based bio add helper for now as that is
+                * the lowest common denominator between folios and slab
+                * allocations.  To be replaced with a better block layer
+                * helper soon (hopefully).
+                */
+               bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
+                               GFP_NOIO);
+               __bio_add_page(bio, virt_to_page(bp->b_addr),
+                               BBTOB(bp->b_length),
+                               offset_in_page(bp->b_addr));
        }
 
+       bio->bi_private = bp;
+       bio->bi_end_io = xfs_buf_bio_end_io;
+
        /*
         * If there is more than one map segment, split out a new bio for each
         * map except of the last one.  The last map is handled by the
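One note on the I/O side of the patch above before the header changes below: a vmalloc'd buffer is virtually contiguous but physically scattered, so xfs_buf_submit_bio() now walks it page by page with vmalloc_to_page(), while contiguous folio or slab buffers still need only a single bvec. A condensed sketch of the vmalloc branch follows; the helper name and parameters are illustrative and error handling is omitted.

/*
 * Condensed sketch of the vmalloc branch of xfs_buf_submit_bio() above.
 * vmalloc memory is only virtually contiguous, so each page is looked up
 * with vmalloc_to_page() and added to the bio individually.
 */
#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>

static struct bio *submit_vmalloc_buffer(struct block_device *bdev,
		void *addr, unsigned int size, blk_opf_t opf)
{
	unsigned int	alloc_size = roundup(size, PAGE_SIZE);
	void		*data = addr;
	struct bio	*bio;

	bio = bio_alloc(bdev, alloc_size >> PAGE_SHIFT, opf, GFP_NOIO);
	do {
		unsigned int	len = min(size, PAGE_SIZE);

		/* vmalloc allocations are page aligned, so no page offset. */
		__bio_add_page(bio, vmalloc_to_page(data), len, 0);
		data += len;
		size -= len;
	} while (size);

	/* Keep the CPU's view coherent with the vmap alias before I/O. */
	flush_kernel_vmap_range(addr, alloc_size);
	return bio;
}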
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 8db522f19b0c0fe29041f634479a1bec9bef073a..db43bdc17f55bb50fbab291330ebd8692c6f3e26 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -36,7 +36,6 @@ struct xfs_buf;
 #define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
 
 /* flags used only internally */
-#define _XBF_PAGES      (1u << 20)/* backed by refcounted pages */
 #define _XBF_KMEM       (1u << 21)/* backed by heap memory */
 #define _XBF_DELWRI_Q   (1u << 22)/* buffer on a delwri queue */
 
@@ -61,7 +60,6 @@ typedef unsigned int xfs_buf_flags_t;
        { XBF_STALE,            "STALE" }, \
        { XBF_WRITE_FAIL,       "WRITE_FAIL" }, \
        { _XBF_LOGRECOVERY,     "LOG_RECOVERY" }, \
-       { _XBF_PAGES,           "PAGES" }, \
        { _XBF_KMEM,            "KMEM" }, \
        { _XBF_DELWRI_Q,        "DELWRI_Q" }, \
        /* The following interface flags should never be set */ \
@@ -122,8 +120,6 @@ struct xfs_buftarg {
        struct xfs_buf_cache    bt_cache[];
 };
 
-#define XB_PAGES       2
-
 struct xfs_buf_map {
        xfs_daddr_t             bm_bn;  /* block number for I/O */
        int                     bm_len; /* size of I/O */
@@ -185,13 +181,10 @@ struct xfs_buf {
        struct xfs_buf_log_item *b_log_item;
        struct list_head        b_li_list;      /* Log items list head */
        struct xfs_trans        *b_transp;
-       struct page             **b_pages;      /* array of page pointers */
-       struct page             *b_page_array[XB_PAGES]; /* inline pages */
        struct xfs_buf_map      *b_maps;        /* compound buffer map */
        struct xfs_buf_map      __b_map;        /* inline compound buffer map */
        int                     b_map_count;
        atomic_t                b_pin_count;    /* pin count */
-       unsigned int            b_page_count;   /* size of page array */
        int                     b_error;        /* error code on I/O */
        void                    (*b_iodone)(struct xfs_buf *bp);
 
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 5b64a2b3b113f96429658b596175142e2e8e51ad..b207754d2ee029965cabddf9eaf3f8d8db7cf56e 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -169,9 +169,6 @@ xmbuf_map_page(
        unlock_page(page);
 
        bp->b_addr = page_address(page);
-       bp->b_pages = bp->b_page_array;
-       bp->b_pages[0] = page;
-       bp->b_page_count = 1;
        return 0;
 }
 
@@ -180,16 +177,10 @@ void
 xmbuf_unmap_page(
        struct xfs_buf          *bp)
 {
-       struct page             *page = bp->b_pages[0];
-
        ASSERT(xfs_buftarg_is_mem(bp->b_target));
 
-       put_page(page);
-
+       put_page(virt_to_page(bp->b_addr));
        bp->b_addr = NULL;
-       bp->b_pages[0] = NULL;
-       bp->b_pages = NULL;
-       bp->b_page_count = 0;
 }
 
 /* Is this a valid daddr within the buftarg? */