xas_unlock_irqrestore(&xas, flags);
}
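+/*
+ * Tag all currently dirty extent buffers in the range [@start, @end] with
+ * PAGECACHE_TAG_TOWRITE, so that a WB_SYNC_ALL writeback only touches the
+ * buffers that were dirty when it started.  This mirrors
+ * tag_pages_for_writeback(), but works on the buffer_tree xarray, whose
+ * indices are in units of fs_info->sectorsize.
+ */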
+static void buffer_tree_tag_for_writeback(struct btrfs_fs_info *fs_info,
+ unsigned long start, unsigned long end)
+{
+ XA_STATE(xas, &fs_info->buffer_tree, start);
+ unsigned int tagged = 0;
+ void *eb;
+
+ xas_lock_irq(&xas);
+ xas_for_each_marked(&xas, eb, end, PAGECACHE_TAG_DIRTY) {
+ xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
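+		/*
+		 * Periodically pause the walk, drop the lock and reschedule so
+		 * that tagging a large range does not hog the CPU with irqs
+		 * disabled.
+		 */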
+ if (++tagged % XA_CHECK_SCHED)
+ continue;
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ cond_resched();
+ xas_lock_irq(&xas);
+ }
+ xas_unlock_irq(&xas);
+}
+
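+/*
+ * A batch of extent buffers collected from the buffer_tree, the metadata
+ * counterpart of struct folio_batch.
+ */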
+struct eb_batch {
+ unsigned int nr;
+ unsigned int cur;
+ struct extent_buffer *ebs[PAGEVEC_SIZE];
+};
+
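+/* Add @eb to @batch.  Returns true while there is room for more entries. */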
+static inline bool eb_batch_add(struct eb_batch *batch, struct extent_buffer *eb)
+{
+ batch->ebs[batch->nr++] = eb;
+ return (batch->nr < PAGEVEC_SIZE);
+}
+
+static inline void eb_batch_init(struct eb_batch *batch)
+{
+ batch->nr = 0;
+ batch->cur = 0;
+}
+
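+/* Return the next extent buffer in @batch, or NULL once it is exhausted. */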
+static inline struct extent_buffer *eb_batch_next(struct eb_batch *batch)
+{
+ if (batch->cur >= batch->nr)
+ return NULL;
+ return batch->ebs[batch->cur++];
+}
+
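+/* Drop the references held on the batched extent buffers and reset @batch. */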
+static inline void eb_batch_release(struct eb_batch *batch)
+{
+ for (unsigned int i = 0; i < batch->nr; i++)
+ free_extent_buffer(batch->ebs[i]);
+ eb_batch_init(batch);
+}
+
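+/*
+ * Find the next extent buffer marked with @mark at or after the current
+ * xa_state index, up to @max, and take a reference on it.  Entries that are
+ * being freed or that changed under us are skipped by retrying the lookup,
+ * following the same pattern as the page cache lookup helpers.
+ */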
+static inline struct extent_buffer *find_get_eb(struct xa_state *xas, unsigned long max,
+ xa_mark_t mark)
+{
+ struct extent_buffer *eb;
+
+retry:
+ eb = xas_find_marked(xas, max, mark);
+
+ if (xas_retry(xas, eb))
+ goto retry;
+
+ if (!eb)
+ return NULL;
+
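+	/* The eb is being freed (refs hit zero), skip it and look again. */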
+ if (!atomic_inc_not_zero(&eb->refs)) {
+ xas_reset(xas);
+ goto retry;
+ }
+
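+	/* The slot changed under us, drop the reference and retry the lookup. */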
+ if (unlikely(eb != xas_reload(xas))) {
+ free_extent_buffer(eb);
+ xas_reset(xas);
+ goto retry;
+ }
+
+ return eb;
+}
+
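+/*
+ * Fill @batch with referenced extent buffers tagged with @tag in the range
+ * [*@start, @end] (indices in fs_info->sectorsize units), advancing *@start so
+ * that the next call continues where this one left off.  Returns the number of
+ * buffers collected; the caller must drop the references via
+ * eb_batch_release().
+ */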
+static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info,
+ unsigned long *start,
+ unsigned long end, xa_mark_t tag,
+ struct eb_batch *batch)
+{
+ XA_STATE(xas, &fs_info->buffer_tree, *start);
+ struct extent_buffer *eb;
+
+ rcu_read_lock();
+ while ((eb = find_get_eb(&xas, end, tag)) != NULL) {
+ if (!eb_batch_add(batch, eb)) {
+ *start = ((eb->start + eb->len) >> fs_info->sectorsize_bits);
+ goto out;
+ }
+ }
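+	/*
+	 * The whole range was scanned, move *start past @end.  Be careful not
+	 * to overflow when @end is ULONG_MAX.
+	 */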
+ if (end == ULONG_MAX)
+ *start = ULONG_MAX;
+ else
+ *start = end + 1;
+out:
+ rcu_read_unlock();
+
+ return batch->nr;
+}
+
/*
* The endio specific version which won't touch any unsafe spinlock in endio
* context.
}
/*
- * Submit one subpage btree page.
+ * Wait for all eb writeback in the given range to finish.
*
- * The main difference to submit_eb_page() is:
- * - Page locking
- * For subpage, we don't rely on page locking at all.
- *
- * - Flush write bio
- * We only flush bio if we may be unable to fit current extent buffers into
- * current bio.
- *
- * Return >=0 for the number of submitted extent buffers.
- * Return <0 for fatal error.
+ * @fs_info: The fs_info for this file system.
+ * @start: The offset of the range to start waiting on writeback.
+ * @end: The end of the range, inclusive. This is meant to be used in
+ * conjunction with wait_marked_extents, so this will usually be
+ * the next eb->start - 1.
*/
-static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc)
+void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
+ u64 end)
{
- struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
- int submitted = 0;
- u64 folio_start = folio_pos(folio);
- int bit_start = 0;
- int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
- const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
+ struct eb_batch batch;
+ unsigned long start_index = (start >> fs_info->sectorsize_bits);
+ unsigned long end_index = (end >> fs_info->sectorsize_bits);
- /* Lock and write each dirty extent buffers in the range */
- while (bit_start < blocks_per_folio) {
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ eb_batch_init(&batch);
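+	/* Collect WRITEBACK tagged ebs in batches and wait on each one. */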
+ while (start_index <= end_index) {
struct extent_buffer *eb;
- unsigned long flags;
- u64 start;
+ unsigned int nr_ebs;
- /*
- * Take private lock to ensure the subpage won't be detached
- * in the meantime.
- */
- spin_lock(&folio->mapping->i_private_lock);
- if (!folio_test_private(folio)) {
- spin_unlock(&folio->mapping->i_private_lock);
+ nr_ebs = buffer_tree_get_ebs_tag(fs_info, &start_index, end_index,
+ PAGECACHE_TAG_WRITEBACK, &batch);
+ if (!nr_ebs)
break;
- }
- spin_lock_irqsave(&subpage->lock, flags);
- if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * blocks_per_folio,
- subpage->bitmaps)) {
- spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&folio->mapping->i_private_lock);
- bit_start += sectors_per_node;
- continue;
- }
-
- start = folio_start + bit_start * fs_info->sectorsize;
- bit_start += sectors_per_node;
-
- /*
- * Here we just want to grab the eb without touching extra
- * spin locks, so call find_extent_buffer_nolock().
- */
- eb = find_extent_buffer_nolock(fs_info, start);
- spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&folio->mapping->i_private_lock);
-
- /*
- * The eb has already reached 0 refs thus find_extent_buffer()
- * doesn't return it. We don't need to write back such eb
- * anyway.
- */
- if (!eb)
- continue;
-
- if (lock_extent_buffer_for_io(eb, wbc)) {
- write_one_eb(eb, wbc);
- submitted++;
- }
- free_extent_buffer(eb);
- }
- return submitted;
-}
-
-/*
- * Submit all page(s) of one extent buffer.
- *
- * @page: the page of one extent buffer
- * @eb_context: to determine if we need to submit this page, if current page
- * belongs to this eb, we don't need to submit
- *
- * The caller should pass each page in their bytenr order, and here we use
- * @eb_context to determine if we have submitted pages of one extent buffer.
- *
- * If we have, we just skip until we hit a new page that doesn't belong to
- * current @eb_context.
- *
- * If not, we submit all the page(s) of the extent buffer.
- *
- * Return >0 if we have submitted the extent buffer successfully.
- * Return 0 if we don't need to submit the page, as it's already submitted by
- * previous call.
- * Return <0 for fatal error.
- */
-static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ctx)
-{
- struct writeback_control *wbc = ctx->wbc;
- struct address_space *mapping = folio->mapping;
- struct extent_buffer *eb;
- int ret;
-
- if (!folio_test_private(folio))
- return 0;
-
- if (btrfs_meta_is_subpage(folio_to_fs_info(folio)))
- return submit_eb_subpage(folio, wbc);
-
- spin_lock(&mapping->i_private_lock);
- if (!folio_test_private(folio)) {
- spin_unlock(&mapping->i_private_lock);
- return 0;
- }
-
- eb = folio_get_private(folio);
-
- /*
- * Shouldn't happen and normally this would be a BUG_ON but no point
- * crashing the machine for something we can survive anyway.
- */
- if (WARN_ON(!eb)) {
- spin_unlock(&mapping->i_private_lock);
- return 0;
- }
-
- if (eb == ctx->eb) {
- spin_unlock(&mapping->i_private_lock);
- return 0;
- }
- ret = atomic_inc_not_zero(&eb->refs);
- spin_unlock(&mapping->i_private_lock);
- if (!ret)
- return 0;
- ctx->eb = eb;
-
- ret = btrfs_check_meta_write_pointer(eb->fs_info, ctx);
- if (ret) {
- if (ret == -EBUSY)
- ret = 0;
- free_extent_buffer(eb);
- return ret;
- }
-
- if (!lock_extent_buffer_for_io(eb, wbc)) {
- free_extent_buffer(eb);
- return 0;
- }
- /* Implies write in zoned mode. */
- if (ctx->zoned_bg) {
- /* Mark the last eb in the block group. */
- btrfs_schedule_zone_finish_bg(ctx->zoned_bg, eb);
- ctx->zoned_bg->meta_write_pointer += eb->len;
+ while ((eb = eb_batch_next(&batch)) != NULL)
+ wait_on_extent_buffer_writeback(eb);
+ eb_batch_release(&batch);
+ cond_resched();
}
- write_one_eb(eb, wbc);
- free_extent_buffer(eb);
- return 1;
}
int btree_write_cache_pages(struct address_space *mapping,
int ret = 0;
int done = 0;
int nr_to_write_done = 0;
- struct folio_batch fbatch;
- unsigned int nr_folios;
- pgoff_t index;
- pgoff_t end; /* Inclusive */
+ struct eb_batch batch;
+ unsigned int nr_ebs;
+ unsigned long index;
+ unsigned long end;
int scanned = 0;
xa_mark_t tag;
- folio_batch_init(&fbatch);
+ eb_batch_init(&batch);
if (wbc->range_cyclic) {
- index = mapping->writeback_index; /* Start from prev offset */
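+		/*
+		 * writeback_index is stored in page units, convert it to the
+		 * sectorsize based index used by the buffer_tree.
+		 */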
+ index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->sectorsize_bits);
end = -1;
+
/*
* Start from the beginning does not need to cycle over the
* range, mark it as scanned.
*/
scanned = (index == 0);
} else {
- index = wbc->range_start >> PAGE_SHIFT;
- end = wbc->range_end >> PAGE_SHIFT;
+ index = (wbc->range_start >> fs_info->sectorsize_bits);
+ end = (wbc->range_end >> fs_info->sectorsize_bits);
+
scanned = 1;
}
if (wbc->sync_mode == WB_SYNC_ALL)
btrfs_zoned_meta_io_lock(fs_info);
retry:
if (wbc->sync_mode == WB_SYNC_ALL)
- tag_pages_for_writeback(mapping, index, end);
+ buffer_tree_tag_for_writeback(fs_info, index, end);
while (!done && !nr_to_write_done && (index <= end) &&
- (nr_folios = filemap_get_folios_tag(mapping, &index, end,
- tag, &fbatch))) {
- unsigned i;
+ (nr_ebs = buffer_tree_get_ebs_tag(fs_info, &index, end, tag, &batch))) {
+ struct extent_buffer *eb;
- for (i = 0; i < nr_folios; i++) {
- struct folio *folio = fbatch.folios[i];
+ while ((eb = eb_batch_next(&batch)) != NULL) {
+ ctx.eb = eb;
+
+ ret = btrfs_check_meta_write_pointer(eb->fs_info, &ctx);
+ if (ret) {
+ if (ret == -EBUSY)
+ ret = 0;
- ret = submit_eb_page(folio, &ctx);
- if (ret == 0)
+ if (ret) {
+ done = 1;
+ break;
+ }
+				/* Our ref is dropped by eb_batch_release(). */
continue;
- if (ret < 0) {
- done = 1;
- break;
}
- /*
- * the filesystem may choose to bump up nr_to_write.
- * We have to make sure to honor the new nr_to_write
- * at any time
- */
- nr_to_write_done = wbc->nr_to_write <= 0;
+ if (!lock_extent_buffer_for_io(eb, wbc))
+ continue;
+
+ /* Implies write in zoned mode. */
+ if (ctx.zoned_bg) {
+ /* Mark the last eb in the block group. */
+ btrfs_schedule_zone_finish_bg(ctx.zoned_bg, eb);
+ ctx.zoned_bg->meta_write_pointer += eb->len;
+ }
+ write_one_eb(eb, wbc);
}
- folio_batch_release(&fbatch);
+ nr_to_write_done = (wbc->nr_to_write <= 0);
+ eb_batch_release(&batch);
cond_resched();
}
if (!scanned && !done) {