btrfs: remove the COW fixup mechanism

author Qu Wenruo <wqu@suse.com>

Tue, 14 Apr 2026 03:35:26 +0000 (13:05 +0930)

committer David Sterba <dsterba@suse.com>

Mon, 8 Jun 2026 13:53:27 +0000 (15:53 +0200)
author Qu Wenruo <wqu@suse.com>
Tue, 14 Apr 2026 03:35:26 +0000 (13:05 +0930)
committer David Sterba <dsterba@suse.com>
Mon, 8 Jun 2026 13:53:27 +0000 (15:53 +0200)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig

index 5e75438e0b7382ad7b5d31341679d33e03cbd44b..5d785d01097189b4a73fe0ddd572a6c1ce835cf5 100644 (file)
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -93,10 +93,6 @@ config BTRFS_EXPERIMENTAL
  
           Current list:
  
-         - COW fixup worker warning - last warning before removing the
-                                      functionality catching out-of-band page
-                                      dirtying, not necessary since 5.8
-
           - RAID mirror read policy - additional read policies for balancing
                                       reading from redundant block group
                                       profiles (currently: pid, round-robin,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h

index 55c272fe5d92a06130d0243c0d6da58a8ec7036b..6e696b350dc59318dbf7d7ba449eaddf31eb2ca6 100644 (file)
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -630,7 +630,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
                                     loff_t actual_len, u64 *alloc_hint);
  int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio,
                              u64 start, u64 end, struct writeback_control *wbc);
-int btrfs_writepage_cow_fixup(struct folio *folio);
  int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
                                              int compress_type);
  int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c

index c0a30bb213d7a09524068a8938592eda1791b802..9d0b80600e9c380893bff74e83583269fda2184b 100644 (file)
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1736,7 +1736,6 @@ static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
  /* helper to cleanup workers */
  static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
  {
-       btrfs_destroy_workqueue(fs_info->fixup_workers);
         btrfs_destroy_workqueue(fs_info->delalloc_workers);
         btrfs_destroy_workqueue(fs_info->workers);
         if (fs_info->endio_workers)
@@ -1944,9 +1943,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
         fs_info->caching_workers =
                 btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
  
-       fs_info->fixup_workers =
-               btrfs_alloc_ordered_workqueue(fs_info, "fixup", ordered_flags);
-
         fs_info->endio_workers =
                 alloc_workqueue("btrfs-endio", flags, max_active);
         fs_info->endio_meta_workers =
@@ -1972,7 +1968,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
               fs_info->endio_workers && fs_info->endio_meta_workers &&
               fs_info->endio_write_workers &&
               fs_info->endio_freespace_worker && fs_info->rmw_workers &&
-             fs_info->caching_workers && fs_info->fixup_workers &&
+             fs_info->caching_workers &&
               fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
               fs_info->discard_ctl.discard_workers)) {
                 return -ENOMEM;
@@ -4279,16 +4275,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
         if (unlikely(BTRFS_FS_ERROR(fs_info)))
                 btrfs_error_commit_super(fs_info);
  
-       /*
-        * Wait for any fixup workers to complete.
-        * If we don't wait for them here and they are still running by the time
-        * we call kthread_stop() against the cleaner kthread further below, we
-        * get an use-after-free on the cleaner because the fixup worker adds an
-        * inode to the list of delayed iputs and then attempts to wakeup the
-        * cleaner kthread, which was already stopped and destroyed. We parked
-        * already the cleaner, but below we run all pending delayed iputs.
-        */
-       btrfs_flush_workqueue(fs_info->fixup_workers);
         /*
          * Similar case here, we have to wait for delalloc workers before we
          * proceed below and stop the cleaner kthread, otherwise we trigger a
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c

index 8aa9e1a88155011f3f865e38463968aa28a1525d..970097b47f14ef4c97edc0eb74d13d952499c9ea 100644 (file)
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1738,18 +1738,17 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
         ASSERT(end <= folio_end, "start=%llu len=%u folio_start=%llu folio_size=%zu",
                start, len, folio_start, folio_size(folio));
  
-       ret = btrfs_writepage_cow_fixup(folio);
-       if (ret == -EAGAIN) {
-               /* Fixup worker will requeue */
-               folio_redirty_for_writepage(bio_ctrl->wbc, folio);
-               folio_unlock(folio);
-               return 1;
-       }
-       if (ret < 0) {
+       if (unlikely(!folio_test_ordered(folio))) {
+               DEBUG_WARN();
+               btrfs_err_rl(fs_info,
+       "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
+                            btrfs_root_id(inode->root),
+                            btrfs_ino(inode),
+                            folio_pos(folio));
                 btrfs_folio_clear_dirty(fs_info, folio, start, len);
                 btrfs_folio_set_writeback(fs_info, folio, start, len);
                 btrfs_folio_clear_writeback(fs_info, folio, start, len);
-               return ret;
+               return -EUCLEAN;
         }
  
         bitmap_set(&range_bitmap, (start - folio_pos(folio)) >> fs_info->sectorsize_bits,
@@ -1867,12 +1866,8 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
          *
          * So here we check if the page has private set to rule out such
          * case.
-        * But we also have a long history of relying on the COW fixup,
-        * so here we only enable this check for experimental builds until
-        * we're sure it's safe.
          */
-       if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) &&
-           unlikely(!folio_test_private(folio))) {
+       if (unlikely(!folio_test_private(folio))) {
                 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
                 btrfs_err_rl(fs_info,
         "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h

index a8aa086a4df865818e1adfeff7cff604681ec3b1..8fead5e8d2d01ea35a7bda16b5a39f4dcd93753c 100644 (file)
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -698,13 +698,6 @@ struct btrfs_fs_info {
         struct btrfs_workqueue *endio_write_workers;
         struct btrfs_workqueue *endio_freespace_worker;
         struct btrfs_workqueue *caching_workers;
-
-       /*
-        * Fixup workers take dirty pages that didn't properly go through the
-        * cow mechanism and make them safe to write.  It happens for the
-        * sys_munmap function call path.
-        */
-       struct btrfs_workqueue *fixup_workers;
         struct btrfs_workqueue *delayed_workers;
  
         struct task_struct *transaction_kthread;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index 1ca1cbdf25bcd38e76a0a659016083bf4d4d6f77..f7054450200056d7abc34410ef5b696a72945b58 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2833,208 +2833,6 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
                                     EXTENT_DELALLOC | extra_bits, cached_state);
  }
  
-/* see btrfs_writepage_start_hook for details on why this is required */
-struct btrfs_writepage_fixup {
-       struct folio *folio;
-       struct btrfs_inode *inode;
-       struct btrfs_work work;
-};
-
-static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
-{
-       struct btrfs_writepage_fixup *fixup =
-               container_of(work, struct btrfs_writepage_fixup, work);
-       struct btrfs_ordered_extent *ordered;
-       struct extent_state *cached_state = NULL;
-       struct extent_changeset *data_reserved = NULL;
-       struct folio *folio = fixup->folio;
-       struct btrfs_inode *inode = fixup->inode;
-       struct btrfs_fs_info *fs_info = inode->root->fs_info;
-       u64 page_start = folio_pos(folio);
-       u64 page_end = folio_next_pos(folio) - 1;
-       int ret = 0;
-       bool free_delalloc_space = true;
-
-       /*
-        * This is similar to page_mkwrite, we need to reserve the space before
-        * we take the folio lock.
-        */
-       ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
-                                          folio_size(folio));
-again:
-       folio_lock(folio);
-
-       /*
-        * Before we queued this fixup, we took a reference on the folio.
-        * folio->mapping may go NULL, but it shouldn't be moved to a different
-        * address space.
-        */
-       if (!folio->mapping || !folio_test_dirty(folio) ||
-           !folio_test_checked(folio)) {
-               /*
-                * Unfortunately this is a little tricky, either
-                *
-                * 1) We got here and our folio had already been dealt with and
-                *    we reserved our space, thus ret == 0, so we need to just
-                *    drop our space reservation and bail.  This can happen the
-                *    first time we come into the fixup worker, or could happen
-                *    while waiting for the ordered extent.
-                * 2) Our folio was already dealt with, but we happened to get an
-                *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
-                *    this case we obviously don't have anything to release, but
-                *    because the folio was already dealt with we don't want to
-                *    mark the folio with an error, so make sure we're resetting
-                *    ret to 0.  This is why we have this check _before_ the ret
-                *    check, because we do not want to have a surprise ENOSPC
-                *    when the folio was already properly dealt with.
-                */
-               if (!ret) {
-                       btrfs_delalloc_release_extents(inode, folio_size(folio));
-                       btrfs_delalloc_release_space(inode, data_reserved,
-                                                    page_start, folio_size(folio),
-                                                    true);
-               }
-               ret = 0;
-               goto out_page;
-       }
-
-       /*
-        * We can't mess with the folio state unless it is locked, so now that
-        * it is locked bail if we failed to make our space reservation.
-        */
-       if (ret)
-               goto out_page;
-
-       btrfs_lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
-
-       /* already ordered? We're done */
-       if (folio_test_ordered(folio))
-               goto out_reserved;
-
-       ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
-       if (ordered) {
-               btrfs_unlock_extent(&inode->io_tree, page_start, page_end,
-                                   &cached_state);
-               folio_unlock(folio);
-               btrfs_start_ordered_extent(ordered);
-               btrfs_put_ordered_extent(ordered);
-               goto again;
-       }
-
-       ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
-                                       &cached_state);
-       if (ret)
-               goto out_reserved;
-
-       /*
-        * Everything went as planned, we're now the owner of a dirty page with
-        * delayed allocation bits set and space reserved for our COW
-        * destination.
-        *
-        * The page was dirty when we started, nothing should have cleaned it.
-        */
-       BUG_ON(!folio_test_dirty(folio));
-       free_delalloc_space = false;
-out_reserved:
-       btrfs_delalloc_release_extents(inode, PAGE_SIZE);
-       if (free_delalloc_space)
-               btrfs_delalloc_release_space(inode, data_reserved, page_start,
-                                            PAGE_SIZE, true);
-       btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
-out_page:
-       if (ret) {
-               /*
-                * We hit ENOSPC or other errors.  Update the mapping and page
-                * to reflect the errors and clean the page.
-                */
-               mapping_set_error(folio->mapping, ret);
-               btrfs_folio_clear_ordered(fs_info, folio, page_start,
-                                         folio_size(folio));
-               btrfs_mark_ordered_io_finished(inode, page_start,
-                                              folio_size(folio), !ret);
-               folio_clear_dirty_for_io(folio);
-       }
-       btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
-       folio_unlock(folio);
-       folio_put(folio);
-       kfree(fixup);
-       extent_changeset_free(data_reserved);
-       /*
-        * As a precaution, do a delayed iput in case it would be the last iput
-        * that could need flushing space. Recursing back to fixup worker would
-        * deadlock.
-        */
-       btrfs_add_delayed_iput(inode);
-}
-
-/*
- * There are a few paths in the higher layers of the kernel that directly
- * set the folio dirty bit without asking the filesystem if it is a
- * good idea.  This causes problems because we want to make sure COW
- * properly happens and the data=ordered rules are followed.
- *
- * In our case any range that doesn't have the ORDERED bit set
- * hasn't been properly setup for IO.  We kick off an async process
- * to fix it up.  The async helper will wait for ordered extents, set
- * the delalloc bit and make it safe to write the folio.
- */
-int btrfs_writepage_cow_fixup(struct folio *folio)
-{
-       struct inode *inode = folio->mapping->host;
-       struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
-       struct btrfs_writepage_fixup *fixup;
-
-       /* This folio has ordered extent covering it already */
-       if (folio_test_ordered(folio))
-               return 0;
-
-       /*
-        * For experimental build, we error out instead of EAGAIN.
-        *
-        * We should not hit such out-of-band dirty folios anymore.
-        */
-       if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
-               DEBUG_WARN();
-               btrfs_err_rl(fs_info,
-       "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
-                            btrfs_root_id(BTRFS_I(inode)->root),
-                            btrfs_ino(BTRFS_I(inode)),
-                            folio_pos(folio));
-               return -EUCLEAN;
-       }
-
-       /*
-        * folio_checked is set below when we create a fixup worker for this
-        * folio, don't try to create another one if we're already
-        * folio_test_checked.
-        *
-        * The extent_io writepage code will redirty the foio if we send back
-        * EAGAIN.
-        */
-       if (folio_test_checked(folio))
-               return -EAGAIN;
-
-       fixup = kzalloc_obj(*fixup, GFP_NOFS);
-       if (!fixup)
-               return -EAGAIN;
-
-       /*
-        * We are already holding a reference to this inode from
-        * write_cache_pages.  We need to hold it because the space reservation
-        * takes place outside of the folio lock, and we can't trust
-        * folio->mapping outside of the folio lock.
-        */
-       ihold(inode);
-       btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
-       folio_get(folio);
-       btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
-       fixup->folio = folio;
-       fixup->inode = BTRFS_I(inode);
-       btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
-
-       return -EAGAIN;
-}
-
  static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
                                        struct btrfs_inode *inode, u64 file_pos,
                                        struct btrfs_file_extent_item *stack_fi,
author	Qu Wenruo <wqu@suse.com>
	Tue, 14 Apr 2026 03:35:26 +0000 (13:05 +0930)
committer	David Sterba <dsterba@suse.com>
	Mon, 8 Jun 2026 13:53:27 +0000 (15:53 +0200)
fs/btrfs/Kconfig		patch \| blob \| blame \| history
fs/btrfs/btrfs_inode.h		patch \| blob \| blame \| history
fs/btrfs/disk-io.c		patch \| blob \| blame \| history
fs/btrfs/extent_io.c		patch \| blob \| blame \| history
fs/btrfs/fs.h		patch \| blob \| blame \| history
fs/btrfs/inode.c		patch \| blob \| blame \| history