6.16-stable patches

author Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Sat, 13 Sep 2025 14:26:21 +0000 (16:26 +0200)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Sat, 13 Sep 2025 14:26:21 +0000 (16:26 +0200)
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 13 Sep 2025 14:26:21 +0000 (16:26 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 13 Sep 2025 14:26:21 +0000 (16:26 +0200)
diff --git a/queue-6.16/btrfs-fix-corruption-reading-compressed-range-when-block-size-is-smaller-than-page-size.patch b/queue-6.16/btrfs-fix-corruption-reading-compressed-range-when-block-size-is-smaller-than-page-size.patch

new file mode 100644 (file)

index 0000000..3a684ad
--- /dev/null
+++ b/queue-6.16/btrfs-fix-corruption-reading-compressed-range-when-block-size-is-smaller-than-page-size.patch
@@ -0,0 +1,231 @@
+From stable+bounces-179491-greg=kroah.com@vger.kernel.org Sat Sep 13 16:13:05 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 13 Sep 2025 10:12:33 -0400
+Subject: btrfs: fix corruption reading compressed range when block size is smaller than page size
+To: stable@vger.kernel.org
+Cc: Qu Wenruo <wqu@suse.com>, Filipe Manana <fdmanana@suse.com>, David Sterba <dsterba@suse.com>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20250913141233.1363589-2-sashal@kernel.org>
+
+From: Qu Wenruo <wqu@suse.com>
+
+[ Upstream commit 9786531399a679fc2f4630d2c0a186205282ab2f ]
+
+[BUG]
+With 64K page size (aarch64 with 64K page size config) and 4K btrfs
+block size, the following workload can easily lead to a corrupted read:
+
+        mkfs.btrfs -f -s 4k $dev > /dev/null
+        mount -o compress $dev $mnt
+        xfs_io -f -c "pwrite -S 0xff 0 64k" $mnt/base > /dev/null
+       echo "correct result:"
+        od -Ad -t x1 $mnt/base
+        xfs_io -f -c "reflink $mnt/base 32k 0 32k" \
+                 -c "reflink $mnt/base 0 32k 32k" \
+                 -c "pwrite -S 0xff 60k 4k" $mnt/new > /dev/null
+       echo "incorrect result:"
+        od -Ad -t x1 $mnt/new
+        umount $mnt
+
+This shows the following result:
+
+correct result:
+0000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
+*
+0065536
+incorrect result:
+0000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
+*
+0032768 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+*
+0061440 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
+*
+0065536
+
+Notice the zero in the range [32K, 60K), which is incorrect.
+
+[CAUSE]
+With extra trace printk, it shows the following events during od:
+(some unrelated info removed like CPU and context)
+
+ od-3457   btrfs_do_readpage: enter r/i=5/258 folio=0(65536) prev_em_start=0000000000000000
+
+The "r/i" indicates the root and inode numbers. In our case the file
+"new" is using ino 258 from fs tree (root 5).
+
+Here notice the @prev_em_start pointer is NULL. This means the
+btrfs_do_readpage() is called from btrfs_read_folio(), not from
+btrfs_readahead().
+
+ od-3457   btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=0 got em start=0 len=32768
+ od-3457   btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=4096 got em start=0 len=32768
+ od-3457   btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=8192 got em start=0 len=32768
+ od-3457   btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=12288 got em start=0 len=32768
+ od-3457   btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=16384 got em start=0 len=32768
+ od-3457   btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=20480 got em start=0 len=32768
+ od-3457   btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=24576 got em start=0 len=32768
+ od-3457   btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=28672 got em start=0 len=32768
+
+These above 32K blocks will be read from the first half of the
+compressed data extent.
+
+ od-3457   btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=32768 got em start=32768 len=32768
+
+Note here there is no btrfs_submit_compressed_read() call, which is
+incorrect.
+Although both extent maps at 0 and 32K are pointing to the same compressed
+data, their offsets are different thus can not be merged into the same
+read.
+
+So this means the compressed data read merge check is doing something
+wrong.
+
+ od-3457   btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=36864 got em start=32768 len=32768
+ od-3457   btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=40960 got em start=32768 len=32768
+ od-3457   btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=45056 got em start=32768 len=32768
+ od-3457   btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=49152 got em start=32768 len=32768
+ od-3457   btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=53248 got em start=32768 len=32768
+ od-3457   btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=57344 got em start=32768 len=32768
+ od-3457   btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=61440 skip uptodate
+ od-3457   btrfs_submit_compressed_read: cb orig_bio: file off=0 len=61440
+
+The function btrfs_submit_compressed_read() is only called at the end of
+folio read. The compressed bio will only have an extent map of range [0,
+32K), but the original bio passed in is for the whole 64K folio.
+
+This will cause the decompression part to only fill the first 32K,
+leaving the rest untouched (aka, filled with zero).
+
+This incorrect compressed read merge leads to the above data corruption.
+
+There were similar problems that happened in the past, commit 808f80b46790
+("Btrfs: update fix for read corruption of compressed and shared
+extents") is doing pretty much the same fix for readahead.
+
+But that was back in 2015, when btrfs still only supported the bs (block
+size) == ps (page size) case.
+This means btrfs_do_readpage() only needs to handle a folio which
+contains exactly one block.
+
+Only btrfs_readahead() can lead to a read covering multiple blocks.
+Thus only btrfs_readahead() passes a non-NULL @prev_em_start pointer.
+
+With v5.15 kernel btrfs introduced bs < ps support. This breaks the above
+assumption that a folio can only contain one block.
+
+Now btrfs_read_folio() can also read multiple blocks in one go.
+But btrfs_read_folio() doesn't pass a @prev_em_start pointer, thus the
+existing bio force submission check will never be triggered.
+
+In theory, this can also happen for btrfs with large folios, but since
+large folio support is still experimental, we don't need to bother with
+it, thus only bs < ps support is affected for now.
+
+[FIX]
+Instead of passing @prev_em_start to do the proper compressed extent
+check, introduce one new member, btrfs_bio_ctrl::last_em_start, so that
+the existing bio force submission logic will always be triggered.
+
+CC: stable@vger.kernel.org # 5.15+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent_io.c |   40 ++++++++++++++++++++++++++++++----------
+ 1 file changed, 30 insertions(+), 10 deletions(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -111,6 +111,24 @@ struct btrfs_bio_ctrl {
+        */
+       unsigned long submit_bitmap;
+       struct readahead_control *ractl;
++
++      /*
++       * The start offset of the last used extent map by a read operation.
++       *
++       * This is for proper compressed read merge.
++       * U64_MAX means we are starting the read and have made no progress yet.
++       *
++       * The current btrfs_bio_is_contig() only uses disk_bytenr as
++       * the condition to check if the read can be merged with previous
++       * bio, which is not correct. E.g. two file extents pointing to the
++       * same extent but with different offset.
++       *
++       * So here we need to do extra checks to only merge reads that are
++       * covered by the same extent map.
++       * Just extent_map::start will be enough, as they are unique
++       * inside the same inode.
++       */
++      u64 last_em_start;
+ };
+ 
+ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
+@@ -910,7 +928,7 @@ static void btrfs_readahead_expand(struc
+  * return 0 on success, otherwise return error
+  */
+ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
+-                    struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
++                           struct btrfs_bio_ctrl *bio_ctrl)
+ {
+       struct inode *inode = folio->mapping->host;
+       struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+@@ -1020,12 +1038,11 @@ static int btrfs_do_readpage(struct foli
+                * non-optimal behavior (submitting 2 bios for the same extent).
+                */
+               if (compress_type != BTRFS_COMPRESS_NONE &&
+-                  prev_em_start && *prev_em_start != (u64)-1 &&
+-                  *prev_em_start != em->start)
++                  bio_ctrl->last_em_start != U64_MAX &&
++                  bio_ctrl->last_em_start != em->start)
+                       force_bio_submit = true;
+ 
+-              if (prev_em_start)
+-                      *prev_em_start = em->start;
++              bio_ctrl->last_em_start = em->start;
+ 
+               btrfs_free_extent_map(em);
+               em = NULL;
+@@ -1239,12 +1256,15 @@ int btrfs_read_folio(struct file *file,
+       const u64 start = folio_pos(folio);
+       const u64 end = start + folio_size(folio) - 1;
+       struct extent_state *cached_state = NULL;
+-      struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
++      struct btrfs_bio_ctrl bio_ctrl = {
++              .opf = REQ_OP_READ,
++              .last_em_start = U64_MAX,
++      };
+       struct extent_map *em_cached = NULL;
+       int ret;
+ 
+       lock_extents_for_read(inode, start, end, &cached_state);
+-      ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
++      ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
+       btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
+ 
+       btrfs_free_extent_map(em_cached);
+@@ -2582,7 +2602,8 @@ void btrfs_readahead(struct readahead_co
+ {
+       struct btrfs_bio_ctrl bio_ctrl = {
+               .opf = REQ_OP_READ | REQ_RAHEAD,
+-              .ractl = rac
++              .ractl = rac,
++              .last_em_start = U64_MAX,
+       };
+       struct folio *folio;
+       struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
+@@ -2590,12 +2611,11 @@ void btrfs_readahead(struct readahead_co
+       const u64 end = start + readahead_length(rac) - 1;
+       struct extent_state *cached_state = NULL;
+       struct extent_map *em_cached = NULL;
+-      u64 prev_em_start = (u64)-1;
+ 
+       lock_extents_for_read(inode, start, end, &cached_state);
+ 
+       while ((folio = readahead_folio(rac)) != NULL)
+-              btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
++              btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
+ 
+       btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
+ 
diff --git a/queue-6.16/btrfs-use-readahead_expand-on-compressed-extents.patch b/queue-6.16/btrfs-use-readahead_expand-on-compressed-extents.patch

new file mode 100644 (file)

index 0000000..96cef92
--- /dev/null
+++ b/queue-6.16/btrfs-use-readahead_expand-on-compressed-extents.patch
@@ -0,0 +1,205 @@
+From stable+bounces-179490-greg=kroah.com@vger.kernel.org Sat Sep 13 16:13:00 2025
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 13 Sep 2025 10:12:32 -0400
+Subject: btrfs: use readahead_expand() on compressed extents
+To: stable@vger.kernel.org
+Cc: Boris Burkov <boris@bur.io>, Dimitrios Apostolou <jimis@gmx.net>, Filipe Manana <fdmanana@suse.com>, David Sterba <dsterba@suse.com>, Sasha Levin <sashal@kernel.org>
+Message-ID: <20250913141233.1363589-1-sashal@kernel.org>
+
+From: Boris Burkov <boris@bur.io>
+
+[ Upstream commit 9e9ff875e4174be939371667d2cc81244e31232f ]
+
+We recently received a report of poor performance doing sequential
+buffered reads of a file with compressed extents. With bs=128k, a naive
+sequential dd ran as fast on a compressed file as on an uncompressed
+(1.2GB/s on my reproducing system) while with bs<32k, this performance
+tanked down to ~300MB/s.
+
+i.e., slow:
+
+  dd if=some-compressed-file of=/dev/null bs=4k count=X
+
+vs fast:
+
+  dd if=some-compressed-file of=/dev/null bs=128k count=Y
+
+The cause of this slowness is overhead to do with looking up extent_maps
+to enable readahead pre-caching on compressed extents
+(add_ra_bio_pages()), as well as some overhead in the generic VFS
+readahead code we hit more in the slow case. Notably, the main
+difference between the two read sizes is that in the large sized request
+case, we call btrfs_readahead() relatively rarely while in the smaller
+request we call it for every compressed extent. So the fast case stays
+in the btrfs readahead loop:
+
+    while ((folio = readahead_folio(rac)) != NULL)
+           btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
+
+where the slower one breaks out of that loop every time. This results in
+calling add_ra_bio_pages a lot, doing lots of extent_map lookups,
+extent_map locking, etc.
+
+This happens because although add_ra_bio_pages() does add the
+appropriate un-compressed file pages to the cache, it does not
+communicate back to the ractl in any way. To solve this, we should be
+using readahead_expand() to signal to readahead to expand the readahead
+window.
+
+This change passes the readahead_control into the btrfs_bio_ctrl and in
+the case of compressed reads sets the expansion to the size of the
+extent_map we already looked up anyway. It skips the subpage case as
+that one already doesn't do add_ra_bio_pages().
+
+With this change, whether we use bs=4k or bs=128k, btrfs expands the
+readahead window up to the largest compressed extent we have seen so far
+(in the trivial example: 128k) and the call stacks of the two modes look
+identical. Notably, we barely call add_ra_bio_pages at all. And the
+performance becomes identical as well. So this change certainly "fixes"
+this performance problem.
+
+Of course, it does seem to beg a few questions:
+
+1. Will this waste too much page cache with a too large ra window?
+2. Will this somehow cause bugs prevented by the more thoughtful
+   checking in add_ra_bio_pages?
+3. Should we delete add_ra_bio_pages?
+
+My stabs at some answers:
+
+1. Hard to say. See attempts at generic performance testing below. Is
+   there a "readahead_shrink" we should be using? Should we expand more
+   slowly, by half the remaining em size each time?
+2. I don't think so. Since the new behavior is indistinguishable from
+   reading the file with a larger read size passed in, I don't see why
+   one would be safe but not the other.
+3. Probably! I tested that and it was fine in fstests, and it seems like
+   the pages would get re-used just as well in the readahead case.
+   However, it is possible some reads that use page cache but not
+   btrfs_readahead() could suffer. I will investigate this further as a
+   follow up.
+
+I tested the performance implications of this change in 3 ways (using
+compress-force=zstd:3 for compression):
+
+Directly test the affected workload of small sequential reads on a
+compressed file (improved from ~250MB/s to ~1.2GB/s)
+
+==========for-next==========
+  dd /mnt/lol/non-cmpr 4k
+  1048576+0 records in
+  1048576+0 records out
+  4294967296 bytes (4.3 GB, 4.0 GiB) copied, 6.02983 s, 712 MB/s
+  dd /mnt/lol/non-cmpr 128k
+  32768+0 records in
+  32768+0 records out
+  4294967296 bytes (4.3 GB, 4.0 GiB) copied, 5.92403 s, 725 MB/s
+  dd /mnt/lol/cmpr 4k
+  1048576+0 records in
+  1048576+0 records out
+  4294967296 bytes (4.3 GB, 4.0 GiB) copied, 17.8832 s, 240 MB/s
+  dd /mnt/lol/cmpr 128k
+  32768+0 records in
+  32768+0 records out
+  4294967296 bytes (4.3 GB, 4.0 GiB) copied, 3.71001 s, 1.2 GB/s
+
+==========ra-expand==========
+  dd /mnt/lol/non-cmpr 4k
+  1048576+0 records in
+  1048576+0 records out
+  4294967296 bytes (4.3 GB, 4.0 GiB) copied, 6.09001 s, 705 MB/s
+  dd /mnt/lol/non-cmpr 128k
+  32768+0 records in
+  32768+0 records out
+  4294967296 bytes (4.3 GB, 4.0 GiB) copied, 6.07664 s, 707 MB/s
+  dd /mnt/lol/cmpr 4k
+  1048576+0 records in
+  1048576+0 records out
+  4294967296 bytes (4.3 GB, 4.0 GiB) copied, 3.79531 s, 1.1 GB/s
+  dd /mnt/lol/cmpr 128k
+  32768+0 records in
+  32768+0 records out
+  4294967296 bytes (4.3 GB, 4.0 GiB) copied, 3.69533 s, 1.2 GB/s
+
+Built the linux kernel from clean (no change)
+
+Ran fsperf. Mostly neutral results with some improvements and
+regressions here and there.
+
+Reported-by: Dimitrios Apostolou <jimis@gmx.net>
+Link: https://lore.kernel.org/linux-btrfs/34601559-6c16-6ccc-1793-20a97ca0dbba@gmx.net/
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Boris Burkov <boris@bur.io>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Stable-dep-of: 9786531399a6 ("btrfs: fix corruption reading compressed range when block size is smaller than page size")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent_io.c |   35 ++++++++++++++++++++++++++++++++++-
+ 1 file changed, 34 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -110,6 +110,7 @@ struct btrfs_bio_ctrl {
+        * This is to avoid touching ranges covered by compression/inline.
+        */
+       unsigned long submit_bitmap;
++      struct readahead_control *ractl;
+ };
+ 
+ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
+@@ -882,6 +883,25 @@ static struct extent_map *get_extent_map
+ 
+       return em;
+ }
++
++static void btrfs_readahead_expand(struct readahead_control *ractl,
++                                 const struct extent_map *em)
++{
++      const u64 ra_pos = readahead_pos(ractl);
++      const u64 ra_end = ra_pos + readahead_length(ractl);
++      const u64 em_end = em->start + em->ram_bytes;
++
++      /* No expansion for holes and inline extents. */
++      if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE)
++              return;
++
++      ASSERT(em_end >= ra_pos,
++             "extent_map %llu %llu ends before current readahead position %llu",
++             em->start, em->len, ra_pos);
++      if (em_end > ra_end)
++              readahead_expand(ractl, ra_pos, em_end - ra_pos);
++}
++
+ /*
+  * basic readpage implementation.  Locked extent state structs are inserted
+  * into the tree that are removed when the IO is done (by the end_io
+@@ -945,6 +965,16 @@ static int btrfs_do_readpage(struct foli
+ 
+               compress_type = btrfs_extent_map_compression(em);
+ 
++              /*
++               * Only expand readahead for extents which are already creating
++               * the pages anyway in add_ra_bio_pages, which is compressed
++               * extents in the non subpage case.
++               */
++              if (bio_ctrl->ractl &&
++                  !btrfs_is_subpage(fs_info, folio) &&
++                  compress_type != BTRFS_COMPRESS_NONE)
++                      btrfs_readahead_expand(bio_ctrl->ractl, em);
++
+               if (compress_type != BTRFS_COMPRESS_NONE)
+                       disk_bytenr = em->disk_bytenr;
+               else
+@@ -2550,7 +2580,10 @@ int btrfs_writepages(struct address_spac
+ 
+ void btrfs_readahead(struct readahead_control *rac)
+ {
+-      struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
++      struct btrfs_bio_ctrl bio_ctrl = {
++              .opf = REQ_OP_READ | REQ_RAHEAD,
++              .ractl = rac
++      };
+       struct folio *folio;
+       struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
+       const u64 start = readahead_pos(rac);
diff --git a/queue-6.16/series b/queue-6.16/series

index 5eb1edf09debd52ee6815edfa1b4c985ea6ba7d4..2c838093a9235c1cf3560a92d2d60b5a9f336166 100644 (file)
--- a/queue-6.16/series
+++ b/queue-6.16/series
@@ -94,3 +94,5 @@ drm-edid-define-the-quirks-in-an-enum-list.patch
  drm-edid-add-support-for-quirks-visible-to-drm-core-and-drivers.patch
  drm-dp-add-an-edid-quirk-for-the-dpcd-register-access-probe.patch
  drm-amd-display-disable-dpcd-probe-quirk.patch
+btrfs-use-readahead_expand-on-compressed-extents.patch
+btrfs-fix-corruption-reading-compressed-range-when-block-size-is-smaller-than-page-size.patch
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Sat, 13 Sep 2025 14:26:21 +0000 (16:26 +0200)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Sat, 13 Sep 2025 14:26:21 +0000 (16:26 +0200)
queue-6.16/btrfs-fix-corruption-reading-compressed-range-when-block-size-is-smaller-than-page-size.patch	[new file with mode: 0644]	patch \| blob
queue-6.16/btrfs-use-readahead_expand-on-compressed-extents.patch	[new file with mode: 0644]	patch \| blob
queue-6.16/series		patch \| blob \| blame \| history