btrfs: fix deadlock cloning inline extent when using flushoncommit

author Filipe Manana <fdmanana@suse.com>

Tue, 26 May 2026 13:44:30 +0000 (14:44 +0100)

committer Johannes Thumshirn <johannes.thumshirn@wdc.com>

Tue, 9 Jun 2026 16:22:45 +0000 (18:22 +0200)
author Filipe Manana <fdmanana@suse.com>
Tue, 26 May 2026 13:44:30 +0000 (14:44 +0100)
committer Johannes Thumshirn <johannes.thumshirn@wdc.com>
Tue, 9 Jun 2026 16:22:45 +0000 (18:22 +0200)
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c

index 0a4628b3007df0d4c82a78cd0c13ba3e8f9a5af0..9a49d2ecb9494bbf270a1502565531bdecdbd0dc 100644 (file)
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -179,10 +179,12 @@ static int clone_copy_inline_extent(struct btrfs_inode *inode,
         struct btrfs_drop_extents_args drop_args = { 0 };
         int ret;
         struct btrfs_key key;
+       bool copied_inline_to_page = false;
  
         if (new_key->offset > 0) {
                 ret = copy_inline_to_page(inode, new_key->offset,
                                           inline_data, size, datal, comp_type);
+               copied_inline_to_page = (ret == 0);
                 goto out;
         }
  
@@ -288,6 +290,60 @@ copy_inline_extent:
                 btrfs_abort_transaction(trans, ret);
  out:
         if (!ret && !trans) {
+               if (copied_inline_to_page &&
+                   new_key->offset + datal > i_size_read(&inode->vfs_inode)) {
+                       /*
+                        * If we copied the inline extent data to a page/folio
+                        * beyond the i_size of the destination inode, then we
+                        * need to increase the i_size before we start a
+                        * transaction to update the inode item. This is to
+                        * prevent a deadlock when the flushoncommit mount
+                        * option is used, which happens like this:
+                        *
+                        * 1) Task A clones an inline extent from inode X to an
+                        *    offset of inode Y that is beyond Y's current
+                        *    i_size. This means we copied the inline extent's
+                        *    data to a folio of inode Y that is beyond its EOF,
+                        *    via one of the copy_inline_to_page() calls above;
+                        *
+                        * 2) Task B starts a transaction commit and calls
+                        *    btrfs_start_delalloc_flush() to flush delalloc;
+                        *
+                        * 3) The delalloc flushing sees the new dirty folio of
+                        *    inode Y and when it attempts to flush it, it ends
+                        *    up at extent_writepage() and sees that the offset
+                        *    of the folio is beyond the i_size of inode Y, so
+                        *    it attempts to invalidate the folio by calling
+                        *    folio_invalidate(), which ends up at btrfs' folio
+                        *    invalidate callback - btrfs_invalidate_folio().
+                        *    There it tries to lock the folio's range in inode
+                        *    Y's extent io tree, but it blocks since it's
+                        *    currently locked by task A - during reflink we
+                        *    lock the inodes and the source and destination
+                        *    ranges after flushing all delalloc and waiting for
+                        *    ordered extent completion - after that we don't
+                        *    expect to have dirty folios in the ranges, the
+                        *    exception is if we have to copy an inline extent's
+                        *    data (because the destination offset is not zero);
+                        *
+                        * 4) Task A then reaches this 'out' label and attempts
+                        *    to start a transaction to update the inode item,
+                        *    and then it's blocked since the current
+                        *    transaction is in the TRANS_STATE_COMMIT_START
+                        *    state. Therefore task A has to wait for the
+                        *    current transaction to become unblocked (its
+                        *    state >= TRANS_STATE_UNBLOCKED).
+                        *
+                        * This leads to a deadlock - the task committing the
+                        * transaction waiting for the delalloc flushing which
+                        * is blocked during folio invalidation on the inode's
+                        * extent lock and the reflink task waiting for the
+                        * current transaction to be unblocked so that it can
+                        * start a new one to update the inode item (while
+                        * holding the extent lock).
+                        */
+                       i_size_write(&inode->vfs_inode, new_key->offset + datal);
+               }
                 /*
                  * No transaction here means we copied the inline extent into a
                  * page of the destination inode.
@@ -320,50 +376,7 @@ copy_to_page:
  
         ret = copy_inline_to_page(inode, new_key->offset,
                                   inline_data, size, datal, comp_type);
-
-       /*
-        * If we copied the inline extent data to a page/folio beyond the i_size
-        * of the destination inode, then we need to increase the i_size before
-        * we start a transaction to update the inode item. This is to prevent a
-        * deadlock when the flushoncommit mount option is used, which happens
-        * like this:
-        *
-        * 1) Task A clones an inline extent from inode X to an offset of inode
-        *    Y that is beyond Y's current i_size. This means we copied the
-        *    inline extent's data to a folio of inode Y that is beyond its EOF,
-        *    using the call above to copy_inline_to_page();
-        *
-        * 2) Task B starts a transaction commit and calls
-        *    btrfs_start_delalloc_flush() to flush delalloc;
-        *
-        * 3) The delalloc flushing sees the new dirty folio of inode Y and when
-        *    it attempts to flush it, it ends up at extent_writepage() and sees
-        *    that the offset of the folio is beyond the i_size of inode Y, so
-        *    it attempts to invalidate the folio by calling folio_invalidate(),
-        *    which ends up at btrfs' folio invalidate callback -
-        *    btrfs_invalidate_folio(). There it tries to lock the folio's range
-        *    in inode Y's extent io tree, but it blocks since it's currently
-        *    locked by task A - during reflink we lock the inodes and the
-        *    source and destination ranges after flushing all delalloc and
-        *    waiting for ordered extent completion - after that we don't expect
-        *    to have dirty folios in the ranges, the exception is if we have to
-        *    copy an inline extent's data (because the destination offset is
-        *    not zero);
-        *
-        * 4) Task A then does the 'goto out' below and attempts to start a
-        *    transaction to update the inode item, and then it's blocked since
-        *    the current transaction is in the TRANS_STATE_COMMIT_START state.
-        *    Therefore task A has to wait for the current transaction to become
-        *    unblocked (its state >= TRANS_STATE_UNBLOCKED).
-        *
-        * This leads to a deadlock - the task committing the transaction
-        * waiting for the delalloc flushing which is blocked during folio
-        * invalidation on the inode's extent lock and the reflink task waiting
-        * for the current transaction to be unblocked so that it can start a
-        * a new one to update the inode item (while holding the extent lock).
-        */
-       if (ret == 0 && new_key->offset + datal > i_size_read(&inode->vfs_inode))
-               i_size_write(&inode->vfs_inode, new_key->offset + datal);
+       copied_inline_to_page = (ret == 0);
  
         goto out;
  }
author	Filipe Manana <fdmanana@suse.com>
	Tue, 26 May 2026 13:44:30 +0000 (14:44 +0100)
committer	Johannes Thumshirn <johannes.thumshirn@wdc.com>
	Tue, 9 Jun 2026 16:22:45 +0000 (18:22 +0200)