struct btrfs_drop_extents_args drop_args = { 0 };
int ret;
struct btrfs_key key;
+ bool copied_inline_to_page = false;
if (new_key->offset > 0) {
ret = copy_inline_to_page(inode, new_key->offset,
inline_data, size, datal, comp_type);
+ copied_inline_to_page = (ret == 0);
goto out;
}
btrfs_abort_transaction(trans, ret);
out:
if (!ret && !trans) {
+ if (copied_inline_to_page &&
+ new_key->offset + datal > i_size_read(&inode->vfs_inode)) {
+ /*
+ * If we copied the inline extent data to a page/folio
+ * beyond the i_size of the destination inode, then we
+ * need to increase the i_size before we start a
+ * transaction to update the inode item. This is to
+ * prevent a deadlock when the flushoncommit mount
+ * option is used, which happens like this:
+ *
+ * 1) Task A clones an inline extent from inode X to an
+ * offset of inode Y that is beyond Y's current
+ * i_size. This means we copied the inline extent's
+ * data to a folio of inode Y that is beyond its EOF,
+ * using the call above to copy_inline_to_page();
+ *
+ * 2) Task B starts a transaction commit and calls
+ * btrfs_start_delalloc_flush() to flush delalloc;
+ *
+ * 3) The delalloc flushing sees the new dirty folio of
+ * inode Y and when it attempts to flush it, it ends
+ * up at extent_writepage() and sees that the offset
+ * of the folio is beyond the i_size of inode Y, so
+ * it attempts to invalidate the folio by calling
+ * folio_invalidate(), which ends up at btrfs' folio
+ * invalidate callback - btrfs_invalidate_folio().
+ * There it tries to lock the folio's range in inode
+ * Y's extent io tree, but it blocks since it's
+ * currently locked by task A - during reflink we
+ * lock the inodes and the source and destination
+ * ranges after flushing all delalloc and waiting for
+ * ordered extent completion - after that we don't
+ * expect to have dirty folios in the ranges, the
+ * exception is if we have to copy an inline extent's
+ * data (because the destination offset is not zero);
+ *
+ * 4) Task A then does the 'goto out' above and attempts
+ * to start a transaction to update the inode item,
+ * and then it's blocked since the current
+ * transaction is in the TRANS_STATE_COMMIT_START
+ * state. Therefore task A has to wait for the
+ * current transaction to become unblocked (its
+ * state >= TRANS_STATE_UNBLOCKED).
+ *
+ * This leads to a deadlock - the task committing the
+ * transaction waiting for the delalloc flushing which
+ * is blocked during folio invalidation on the inode's
+ * extent lock and the reflink task waiting for the
+ * current transaction to be unblocked so that it can
+ * start a new one to update the inode item (while
+ * holding the extent lock).
+ */
+ i_size_write(&inode->vfs_inode, new_key->offset + datal);
+ }
/*
* No transaction here means we copied the inline extent into a
* page of the destination inode.
ret = copy_inline_to_page(inode, new_key->offset,
inline_data, size, datal, comp_type);
-
- /*
- * If we copied the inline extent data to a page/folio beyond the i_size
- * of the destination inode, then we need to increase the i_size before
- * we start a transaction to update the inode item. This is to prevent a
- * deadlock when the flushoncommit mount option is used, which happens
- * like this:
- *
- * 1) Task A clones an inline extent from inode X to an offset of inode
- * Y that is beyond Y's current i_size. This means we copied the
- * inline extent's data to a folio of inode Y that is beyond its EOF,
- * using the call above to copy_inline_to_page();
- *
- * 2) Task B starts a transaction commit and calls
- * btrfs_start_delalloc_flush() to flush delalloc;
- *
- * 3) The delalloc flushing sees the new dirty folio of inode Y and when
- * it attempts to flush it, it ends up at extent_writepage() and sees
- * that the offset of the folio is beyond the i_size of inode Y, so
- * it attempts to invalidate the folio by calling folio_invalidate(),
- * which ends up at btrfs' folio invalidate callback -
- * btrfs_invalidate_folio(). There it tries to lock the folio's range
- * in inode Y's extent io tree, but it blocks since it's currently
- * locked by task A - during reflink we lock the inodes and the
- * source and destination ranges after flushing all delalloc and
- * waiting for ordered extent completion - after that we don't expect
- * to have dirty folios in the ranges, the exception is if we have to
- * copy an inline extent's data (because the destination offset is
- * not zero);
- *
- * 4) Task A then does the 'goto out' below and attempts to start a
- * transaction to update the inode item, and then it's blocked since
- * the current transaction is in the TRANS_STATE_COMMIT_START state.
- * Therefore task A has to wait for the current transaction to become
- * unblocked (its state >= TRANS_STATE_UNBLOCKED).
- *
- * This leads to a deadlock - the task committing the transaction
- * waiting for the delalloc flushing which is blocked during folio
- * invalidation on the inode's extent lock and the reflink task waiting
- * for the current transaction to be unblocked so that it can start a
- * a new one to update the inode item (while holding the extent lock).
- */
- if (ret == 0 && new_key->offset + datal > i_size_read(&inode->vfs_inode))
- i_size_write(&inode->vfs_inode, new_key->offset + datal);
+ copied_inline_to_page = (ret == 0);
goto out;
}