]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
btrfs: fix false IO failure after falling back to buffered write
authorQu Wenruo <wqu@suse.com>
Thu, 4 Jun 2026 00:29:46 +0000 (09:59 +0930)
committerJohannes Thumshirn <johannes.thumshirn@wdc.com>
Tue, 9 Jun 2026 16:22:46 +0000 (18:22 +0200)
[BUG]
The test case generic/362 will fail with "nodatasum" mount option (*):

 MOUNT_OPTIONS -- -o nodatasum /dev/mapper/test-scratch1 /mnt/scratch

 generic/362  0s ... - output mismatch (see /home/adam/xfstests/results//generic/362.out.bad)
    --- tests/generic/362.out 2024-08-24 15:31:37.200000000 +0930
    +++ /home/adam/xfstests/results//generic/362.out.bad 2026-05-27 10:21:17.574771567 +0930
    @@ -1,2 +1,3 @@
     QA output created by 362
    +First write failed: Input/output error
     Silence is golden
    ...

*: If the test case has been executed before with default data checksum,
the failure will not reproduce. Need the following fix to make it
reliably reproducible:
https://lore.kernel.org/linux-btrfs/20260528111659.87113-1-wqu@suse.com/

[CAUSE]
Inside __iomap_dio_rw(), the -EFAULT/-ENOTBLK error is not directly returned.
Thus we never got an error pointer from __iomap_dio_rw().

The call chain looks like this:

 btrfs_direct_write()
 |- btrfs_dio_write()
 |-  __iomap_dio_rw()
 |  |- iomap_iter()
 |  |  |- btrfs_dio_iomap_begin()
 |  |     Now an ordered extent is allocated for the 4K write.
 |  |
 |  |- iomi.status = iomap_dio_iter()
 |  |  Where iomap_dio_iter() returned -EFAULT.
 |  |
 |  |- ret = iomap_iter()
 |  |  |- btrfs_dio_iomap_end()
 |  |  |  |- btrfs_finish_ordered_extent(uptodate = false)
 |  |  |  |  |- can_finish_ordered_extent()
 |  |  |  |     |- btrfs_mark_ordered_extent_error()
 |  |  |  |        |- mapping_set_error()
 |  |  |  |           Now the address space is marked error.
 |  |  |  | return -ENOTBLK
 |  |  |- return -ENOTBLK
 |  |- if (ret == -ENOTBLK) { ret = 0; }
 |     Now the return value is reset to 0.
 |     Thus no error pointer will be returned.
 |
 |- ret = iomap_dio_complete()
 |  Since no byte is submitted, @ret is 0.
 |
 |- Fallback to buffered IO
 |  And the buffered write finished without error
 |
 |- filemap_fdatawait_range()
    |- filemap_check_errors()
       The previous error is recorded, thus an error is returned

However the buffered write is properly submitted and finished, the error
is from the btrfs_finish_ordered_extent() call with @uptodate = false.

[FIX]
When a short dio write happened, any range that is submitted will have
btrfs_extract_ordered_extent() to be called, thus the submitted range
will always have an OE just covering the submitted range.

The remaining OE range is never submitted, thus they should be treated
as truncated, not an error. So that we can properly reclaim and not
insert an unnecessary file extent item, without marking the mapping as
error.

Extract a helper, btrfs_mark_ordered_extent_truncated(), and utilize
that helper to mark the direct IO ordered extent as truncated, so it
won't cause failure for the later buffered fallback.

[REASON FOR NO FIXES TAG]
The bug itself is pretty old, at commit f85781fb505e ("btrfs: switch to
iomap for direct IO") we're already passing @uptodate=false finishing
the OE.
But at that time OE with IOERR won't call mapping_set_error(), so it's
not exposed.
Later commit d61bec08b904 ("btrfs: mark ordered extent and inode with
error if we fail to finish") finally exposed the bug, but that commit
is doing a correct job, not the root cause.

Anyway the bug is very old, dating back to 5.1x days, thus only CC to
stable.

CC: stable@vger.kernel.org # 5.15+
Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/direct-io.c
fs/btrfs/inode.c
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h

index 57167d56dc7277e0bc4c36b6b0d2aede03edd7be..88cb2e82a50768221ae4399c83925e90280df06c 100644 (file)
@@ -624,12 +624,23 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
        if (submitted < length) {
                pos += submitted;
                length -= submitted;
-               if (write)
+               if (write) {
+                       /*
+                        * We have a short write, if there is any range
+                        * that is submitted properly, that part will have
+                        * its own OE split from the original one.
+                        *
+                        * So for the OE at dio_data->ordered, it's the part
+                        * that is not submitted, and should be marked
+                        * as fully truncated.
+                        */
+                       btrfs_mark_ordered_extent_truncated(dio_data->ordered, 0);
                        btrfs_finish_ordered_extent(dio_data->ordered,
-                                                   pos, length, false);
-               else
+                                                   pos, length, true);
+               } else {
                        btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
                                                pos + length - 1, NULL);
+               }
                ret = -ENOTBLK;
        }
        if (write) {
index 3c66853276b7e72049b7a7e9cf2ddadc21bc9873..0133b688ce04d971f748f0b07955fff4235467ff 100644 (file)
@@ -7590,11 +7590,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
                                               EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
                                               EXTENT_DEFRAG, &cached_state);
 
-               spin_lock(&inode->ordered_tree_lock);
-               set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
-               ordered->truncated_len = min(ordered->truncated_len,
-                                            cur - ordered->file_offset);
-               spin_unlock(&inode->ordered_tree_lock);
+               btrfs_mark_ordered_extent_truncated(ordered, cur - ordered->file_offset);
 
                /*
                 * If the ordered extent has finished, we're safe to delete all
index f5f77c33cf592dde40883735ae528369d7a3a4dc..b32d4eabe0abe448a9012421e5dee0d2fa9c1007 100644 (file)
@@ -358,6 +358,18 @@ void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered)
                mapping_set_error(ordered->inode->vfs_inode.i_mapping, -EIO);
 }
 
+void btrfs_mark_ordered_extent_truncated(struct btrfs_ordered_extent *ordered,
+                                        u64 truncate_len)
+{
+       struct btrfs_inode *inode = ordered->inode;
+
+       ASSERT(truncate_len <= ordered->num_bytes);
+       spin_lock(&inode->ordered_tree_lock);
+       set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
+       ordered->truncated_len = min(ordered->truncated_len, truncate_len);
+       spin_unlock(&inode->ordered_tree_lock);
+}
+
 static void finish_ordered_fn(struct btrfs_work *work)
 {
        struct btrfs_ordered_extent *ordered_extent;
index 03e12380a2fda949a76ed6a8ee208469c9b8178c..8d5d5ba1e02f08ed6fe858fc940d3693094fe6f0 100644 (file)
@@ -226,6 +226,8 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
 struct btrfs_ordered_extent *btrfs_split_ordered_extent(
                        struct btrfs_ordered_extent *ordered, u64 len);
 void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered);
+void btrfs_mark_ordered_extent_truncated(struct btrfs_ordered_extent *ordered,
+                                        u64 truncate_len);
 int __init ordered_data_init(void);
 void __cold ordered_data_exit(void);