From: Greg Kroah-Hartman
Date: Tue, 23 Aug 2022 07:25:47 +0000 (+0200)
Subject: 4.14-stable patches
X-Git-Tag: v4.9.326~14
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=b253dc397da1d6c9a69298193c6113ac07c82e66;p=thirdparty%2Fkernel%2Fstable-queue.git

4.14-stable patches

added patches:
      btrfs-only-write-the-sectors-in-the-vertical-stripe-which-has-data-stripes.patch
      btrfs-raid56-don-t-trust-any-cached-sector-in-__raid56_parity_recover.patch
---

diff --git a/queue-4.14/btrfs-only-write-the-sectors-in-the-vertical-stripe-which-has-data-stripes.patch b/queue-4.14/btrfs-only-write-the-sectors-in-the-vertical-stripe-which-has-data-stripes.patch
new file mode 100644
index 00000000000..402988767b7
--- /dev/null
+++ b/queue-4.14/btrfs-only-write-the-sectors-in-the-vertical-stripe-which-has-data-stripes.patch
@@ -0,0 +1,166 @@
+From foo@baz Tue Aug 23 09:24:14 AM CEST 2022
+From: Qu Wenruo
+Date: Sat, 20 Aug 2022 16:49:22 +0800
+Subject: btrfs: only write the sectors in the vertical stripe which has data stripes
+To: linux-btrfs@vger.kernel.org, stable@vger.kernel.org
+Cc: David Sterba
+Message-ID: <1b8e626fe339c091f1357f5ac741ed13876b4c88.1660985049.git.wqu@suse.com>
+
+From: Qu Wenruo
+
+commit bd8f7e627703ca5707833d623efcd43f104c7b3f upstream.
+
+If we have only an 8K partial write at the beginning of a full RAID56
+stripe, we will write the following contents:
+
+                    0  8K           32K             64K
+Disk 1  (data):     |XX|            |               |
+Disk 2  (data):     |               |               |
+Disk 3  (parity):   |XXXXXXXXXXXXXXX|XXXXXXXXXXXXXXX|
+
+|X| means the sector will be written back to disk.
+
+Note that, although we won't write any sectors from disk 2, we will
+still write the full 64KiB of parity to disk.
+
+This behavior is fine for now, but not for the future (especially for
+RAID56J, as we waste quite some space to journal the unused parity
+stripes).
+
+So here we will also utilize btrfs_raid_bio::dbitmap: any time we
+queue a higher-level bio into an rbio, we will update rbio::dbitmap to
+indicate which vertical stripes we need to write back.
+
+And at finish_rmw(), we also check dbitmap to see if we need to write
+any sector in the vertical stripe.
+
+So after the patch, the above example will only lead to the following
+writeback pattern:
+
+                    0  8K           32K             64K
+Disk 1  (data):     |XX|            |               |
+Disk 2  (data):     |               |               |
+Disk 3  (parity):   |XX|            |               |
+
+Acked-by: David Sterba
+Signed-off-by: Qu Wenruo
+Signed-off-by: David Sterba
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/btrfs/raid56.c |   55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++----
+ 1 file changed, 51 insertions(+), 4 deletions(-)
+
+--- a/fs/btrfs/raid56.c
++++ b/fs/btrfs/raid56.c
+@@ -334,6 +334,9 @@ static void merge_rbio(struct btrfs_raid
+ {
+ 	bio_list_merge(&dest->bio_list, &victim->bio_list);
+ 	dest->bio_list_bytes += victim->bio_list_bytes;
++	/* Also inherit the bitmaps from @victim. */
++	bitmap_or(dest->dbitmap, victim->dbitmap, dest->dbitmap,
++		  dest->stripe_npages);
+ 	dest->generic_bio_cnt += victim->generic_bio_cnt;
+ 	bio_list_init(&victim->bio_list);
+ }
+@@ -882,6 +885,12 @@ static void rbio_orig_end_io(struct btrf
+ 
+ 	if (rbio->generic_bio_cnt)
+ 		btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
++	/*
++	 * Clear the data bitmap, as the rbio may be cached for later usage.
++	 * do this before unlock_stripe() so there will be no new bio
++	 * for this bio.
++	 */
++	bitmap_clear(rbio->dbitmap, 0, rbio->stripe_npages);
+ 
+ 	/*
+ 	 * At this moment, rbio->bio_list is empty, however since rbio does not
+@@ -1204,6 +1213,9 @@ static noinline void finish_rmw(struct b
+ 	else
+ 		BUG();
+ 
++	/* We should have at least one data sector. */
++	ASSERT(bitmap_weight(rbio->dbitmap, rbio->stripe_npages));
++
+ 	/* at this point we either have a full stripe,
+ 	 * or we've read the full stripe from the drive.
+ 	 * recalculate the parity and write the new results.
+@@ -1277,6 +1289,11 @@ static noinline void finish_rmw(struct b
+ 	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+ 		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
+ 			struct page *page;
++
++			/* This vertical stripe has no data, skip it. */
++			if (!test_bit(pagenr, rbio->dbitmap))
++				continue;
++
+ 			if (stripe < rbio->nr_data) {
+ 				page = page_in_rbio(rbio, stripe, pagenr, 1);
+ 				if (!page)
+@@ -1301,6 +1318,11 @@ static noinline void finish_rmw(struct b
+ 
+ 		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
+ 			struct page *page;
++
++			/* This vertical stripe has no data, skip it. */
++			if (!test_bit(pagenr, rbio->dbitmap))
++				continue;
++
+ 			if (stripe < rbio->nr_data) {
+ 				page = page_in_rbio(rbio, stripe, pagenr, 1);
+ 				if (!page)
+@@ -1745,6 +1767,33 @@ static void btrfs_raid_unplug(struct blk
+ 	run_plug(plug);
+ }
+ 
++/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
++static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
++{
++	const struct btrfs_fs_info *fs_info = rbio->fs_info;
++	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
++	const u64 full_stripe_start = rbio->bbio->raid_map[0];
++	const u32 orig_len = orig_bio->bi_iter.bi_size;
++	const u32 sectorsize = fs_info->sectorsize;
++	u64 cur_logical;
++
++	ASSERT(orig_logical >= full_stripe_start &&
++	       orig_logical + orig_len <= full_stripe_start +
++	       rbio->nr_data * rbio->stripe_len);
++
++	bio_list_add(&rbio->bio_list, orig_bio);
++	rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
++
++	/* Update the dbitmap. */
++	for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
++	     cur_logical += sectorsize) {
++		int bit = ((u32)(cur_logical - full_stripe_start) >>
++			   PAGE_SHIFT) % rbio->stripe_npages;
++
++		set_bit(bit, rbio->dbitmap);
++	}
++}
++
+ /*
+  * our main entry point for writes from the rest of the FS.
+  */
+@@ -1761,9 +1810,8 @@ int raid56_parity_write(struct btrfs_fs_
+ 		btrfs_put_bbio(bbio);
+ 		return PTR_ERR(rbio);
+ 	}
+-	bio_list_add(&rbio->bio_list, bio);
+-	rbio->bio_list_bytes = bio->bi_iter.bi_size;
+ 	rbio->operation = BTRFS_RBIO_WRITE;
++	rbio_add_bio(rbio, bio);
+ 
+ 	btrfs_bio_counter_inc_noblocked(fs_info);
+ 	rbio->generic_bio_cnt = 1;
+@@ -2146,8 +2194,7 @@ int raid56_parity_recover(struct btrfs_f
+ 	}
+ 
+ 	rbio->operation = BTRFS_RBIO_READ_REBUILD;
+-	bio_list_add(&rbio->bio_list, bio);
+-	rbio->bio_list_bytes = bio->bi_iter.bi_size;
++	rbio_add_bio(rbio, bio);
+ 
+ 	rbio->faila = find_logical_bio_stripe(rbio, bio);
+ 	if (rbio->faila == -1) {
diff --git a/queue-4.14/btrfs-raid56-don-t-trust-any-cached-sector-in-__raid56_parity_recover.patch b/queue-4.14/btrfs-raid56-don-t-trust-any-cached-sector-in-__raid56_parity_recover.patch
new file mode 100644
index 00000000000..f233f1fe36e
--- /dev/null
+++ b/queue-4.14/btrfs-raid56-don-t-trust-any-cached-sector-in-__raid56_parity_recover.patch
@@ -0,0 +1,207 @@
+From foo@baz Tue Aug 23 09:24:14 AM CEST 2022
+From: Qu Wenruo
+Date: Sat, 20 Aug 2022 16:49:23 +0800
+Subject: btrfs: raid56: don't trust any cached sector in __raid56_parity_recover()
+To: linux-btrfs@vger.kernel.org, stable@vger.kernel.org
+Cc: David Sterba
+Message-ID: <949b700622933e9a2c5fe999017c14f5597ce6a6.1660985049.git.wqu@suse.com>
+
+From: Qu Wenruo
+
+commit f6065f8edeb25f4a9dfe0b446030ad995a84a088 upstream.
+
+[BUG]
+There is a small workload which will always fail with a recent kernel
+(a simplified version of the btrfs/125 test case):
+
+  mkfs.btrfs -f -m raid5 -d raid5 -b 1G $dev1 $dev2 $dev3
+  mount $dev1 $mnt
+  xfs_io -f -c "pwrite -S 0xee 0 1M" $mnt/file1
+  sync
+  umount $mnt
+  btrfs dev scan -u $dev3
+  mount -o degraded $dev1 $mnt
+  xfs_io -f -c "pwrite -S 0xff 0 128M" $mnt/file2
+  umount $mnt
+  btrfs dev scan
+  mount $dev1 $mnt
+  btrfs balance start --full-balance $mnt
+  umount $mnt
+
+It always fails to read some tree blocks:
+
+  BTRFS info (device dm-4): relocating block group 217710592 flags data|raid5
+  BTRFS error (device dm-4): parent transid verify failed on 38993920 wanted 9 found 7
+  BTRFS error (device dm-4): parent transid verify failed on 38993920 wanted 9 found 7
+  ...
+
+[CAUSE]
+With the recently added debug output, we can see all RAID56 operations
+related to full stripe 38928384:
+
+  56.1183: raid56_read_partial: full_stripe=38928384 devid=2 type=DATA1 offset=0 opf=0x0 physical=9502720 len=65536
+  56.1185: raid56_read_partial: full_stripe=38928384 devid=3 type=DATA2 offset=16384 opf=0x0 physical=9519104 len=16384
+  56.1185: raid56_read_partial: full_stripe=38928384 devid=3 type=DATA2 offset=49152 opf=0x0 physical=9551872 len=16384
+  56.1187: raid56_write_stripe: full_stripe=38928384 devid=3 type=DATA2 offset=0 opf=0x1 physical=9502720 len=16384
+  56.1188: raid56_write_stripe: full_stripe=38928384 devid=3 type=DATA2 offset=32768 opf=0x1 physical=9535488 len=16384
+  56.1188: raid56_write_stripe: full_stripe=38928384 devid=1 type=PQ1 offset=0 opf=0x1 physical=30474240 len=16384
+  56.1189: raid56_write_stripe: full_stripe=38928384 devid=1 type=PQ1 offset=32768 opf=0x1 physical=30507008 len=16384
+  56.1218: raid56_write_stripe: full_stripe=38928384 devid=3 type=DATA2 offset=49152 opf=0x1 physical=9551872 len=16384
+  56.1219: raid56_write_stripe: full_stripe=38928384 devid=1 type=PQ1 offset=49152 opf=0x1 physical=30523392 len=16384
+  56.2721: raid56_parity_recover: full stripe=38928384 eb=39010304 mirror=2
+  56.2723: raid56_parity_recover: full stripe=38928384 eb=39010304 mirror=2
+  56.2724: raid56_parity_recover: full stripe=38928384 eb=39010304 mirror=2
+
+Before we enter raid56_parity_recover(), we have triggered some metadata
+writes for the full stripe 38928384, which leads us to read all the
+sectors from disk.
+
+Furthermore, btrfs raid56 write will cache its calculated P/Q sectors to
+avoid unnecessary reads.
+
+This means, for that full stripe, after any partial write, we will have
+stale data, along with P/Q calculated using that stale data.
+
+Thankfully, due to the patch "btrfs: only write the sectors in the vertical
+stripe which has data stripes", we haven't submitted all the corrupted P/Q
+to disk.
+
+When we really need to recover a certain range, i.e. in
+raid56_parity_recover(), we will use the cached rbio, along with its
+cached sectors (the full stripe is all cached).
+
+This explains why no raid56_scrub_read_recover() event is triggered.
+
+Since the cached P/Q was calculated using the stale data, the recovered
+data will just be stale as well.
+
+In our particular test case, it will always return the same incorrect
+metadata, thus causing the same error message "parent transid verify
+failed on 39010304 wanted 9 found 7" again and again.
+
+[BTRFS DESTRUCTIVE RMW PROBLEM]
+
+Test case btrfs/125 (and the above workload) always has trouble with
+the destructive read-modify-write (RMW) cycle:
+
+        0       32K     64K
+Data1:  | Good  | Good  |
+Data2:  | Bad   | Bad   |
+Parity: | Good  | Good  |
+
+In the above case, if we trigger any write into Data1, we will use the
+bad data in Data2 to re-generate parity, killing the only chance to
+recover Data2, and thus Data2 is lost forever.
+
+This destructive RMW cycle is not specific to btrfs RAID56, but there
+are some btrfs-specific behaviors making the case even worse:
+
+- Btrfs will cache sectors for unrelated vertical stripes.
+
+  In the above example, if we're only writing into the 0~32K range, btrfs
+  will still read the (32K ~ 64K) range of Data1 and the (64K ~ 128K)
+  range of Data2.  This behavior is meant to cache sectors for later
+  updates.
+
+  Incidentally, commit d4e28d9b5f04 ("btrfs: raid56: make steal_rbio()
+  subpage compatible") has a bug which makes RAID56 never trust the
+  cached sectors, thus slightly improving the situation for recovery.
+
+  Unfortunately, the follow-up fix "btrfs: update stripe_sectors::uptodate
+  in steal_rbio" reverts the behavior back to the old one.
+
+- Btrfs raid56 partial write will update all P/Q sectors and cache them.
+
+  This means that even if the data at (64K ~ 96K) of Data2 is free space
+  and only (96K ~ 128K) of Data2 is really stale data, a write into that
+  (96K ~ 128K) range will update all the parity sectors for the full
+  stripe.
+
+  This unnecessary behavior will completely kill the chance of recovery.
+
+  Thankfully, an unrelated optimization, "btrfs: only write the sectors
+  in the vertical stripe which has data stripes", will prevent
+  submitting the write bio for untouched vertical sectors.
+
+  That optimization will keep the on-disk P/Q untouched, leaving a chance
+  for later recovery.
+
+[FIX]
+Although we have no good way to completely fix the destructive RMW
+(unless we go full scrub for each partial write), we can still limit the
+damage.
+
+With the patch "btrfs: only write the sectors in the vertical stripe
+which has data stripes" we won't really submit the P/Q of unrelated
+vertical stripes, so the on-disk P/Q should still be fine.
+
+Now what we really need to do is just drop all the cached sectors when
+doing recovery.
+
+By this, we have a chance to read the original P/Q from disk, and have a
+chance to recover the stale data, while still keeping the cache to speed
+up the regular write path.
+
+In fact, just dropping all the cache for the recovery path is good enough
+to allow test case btrfs/125, along with the small script above, to pass
+reliably.
+
+The lack of metadata writes after the degraded mount, together with
+forced metadata COW, is saving us this time.
+
+So this patch will fix the behavior by not trusting any cache in
+__raid56_parity_recover(), solving the problem while still keeping the
+cache useful.
+
+But please note that this test passing DOES NOT mean we have solved the
+destructive RMW problem; we just do damage control a little better.
+
+Related patches:
+
+- btrfs: only write the sectors in the vertical stripe
+- d4e28d9b5f04 ("btrfs: raid56: make steal_rbio() subpage compatible")
+- btrfs: update stripe_sectors::uptodate in steal_rbio
+
+Acked-by: David Sterba
+Signed-off-by: Qu Wenruo
+Signed-off-by: David Sterba
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/btrfs/raid56.c |   19 ++++++-------------
+ 1 file changed, 6 insertions(+), 13 deletions(-)
+
+--- a/fs/btrfs/raid56.c
++++ b/fs/btrfs/raid56.c
+@@ -2096,9 +2096,12 @@ static int __raid56_parity_recover(struc
+ 	atomic_set(&rbio->error, 0);
+ 
+ 	/*
+-	 * read everything that hasn't failed. Thanks to the
+-	 * stripe cache, it is possible that some or all of these
+-	 * pages are going to be uptodate.
++	 * Read everything that hasn't failed. However this time we will
++	 * not trust any cached sector.
++	 * As we may read out some stale data but higher layer is not reading
++	 * that stale part.
++	 *
++	 * So here we always re-read everything in recovery path.
+ 	 */
+ 	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+ 		if (rbio->faila == stripe || rbio->failb == stripe) {
+@@ -2107,16 +2110,6 @@ static int __raid56_parity_recover(struc
+ 		}
+ 
+ 		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
+-			struct page *p;
+-
+-			/*
+-			 * the rmw code may have already read this
+-			 * page in
+-			 */
+-			p = rbio_stripe_page(rbio, stripe, pagenr);
+-			if (PageUptodate(p))
+-				continue;
+-
+ 			ret = rbio_add_io_page(rbio, &bio_list,
+ 				       rbio_stripe_page(rbio, stripe, pagenr),
+ 				       stripe, pagenr, rbio->stripe_len);
diff --git a/queue-4.14/series b/queue-4.14/series
index 3b48305973b..57dc856a1bb 100644
--- a/queue-4.14/series
+++ b/queue-4.14/series
@@ -206,6 +206,8 @@ i40e-fix-to-stop-tx_timeout-recovery-if-globr-fails.patch
 fec-fix-timer-capture-timing-in-fec_ptp_enable_pps.patch
 igb-add-lock-to-avoid-data-race.patch
 kbuild-clear-ldflags-in-the-top-makefile.patch
+btrfs-only-write-the-sectors-in-the-vertical-stripe-which-has-data-stripes.patch
+btrfs-raid56-don-t-trust-any-cached-sector-in-__raid56_parity_recover.patch
 drm-meson-fix-refcount-bugs-in-meson_vpu_has_availab.patch
 pci-add-acs-quirk-for-broadcom-bcm5750x-nics.patch
 irqchip-tegra-fix-overflow-implicit-truncation-warni.patch
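
For readers who want to poke at the arithmetic outside the kernel, here is a
minimal user-space sketch of the dbitmap mapping that rbio_add_bio() performs
in the first patch above. It is not part of either patch; the 4K sector size,
4K page size and 64K per-device stripe length are assumed values used only for
illustration.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12			/* assumed 4K pages */
#define SECTORSIZE	4096			/* assumed 4K sectors */
#define STRIPE_LEN	(64 * 1024)		/* assumed 64K per-device stripe */
#define STRIPE_NPAGES	(STRIPE_LEN >> PAGE_SHIFT)

/*
 * Set one bit per PAGE_SIZE-sized vertical stripe touched by the logical
 * range [logical, logical + len), mirroring the loop in rbio_add_bio().
 */
static void mark_dbitmap(uint64_t full_stripe_start, uint64_t logical,
			 uint32_t len, unsigned long *dbitmap)
{
	uint64_t cur;

	for (cur = logical; cur < logical + len; cur += SECTORSIZE) {
		int bit = (int)(((uint32_t)(cur - full_stripe_start) >>
				 PAGE_SHIFT) % STRIPE_NPAGES);

		*dbitmap |= 1UL << bit;
	}
}

int main(void)
{
	unsigned long dbitmap = 0;

	/* The 8K partial write at the start of the full stripe, as in the example. */
	mark_dbitmap(0, 0, 8 * 1024, &dbitmap);

	/* Prints 0x3: only vertical stripes 0 and 1 need to be written back. */
	printf("dbitmap = 0x%lx\n", dbitmap);
	return 0;
}

With such a bitmap, finish_rmw() can skip every vertical stripe whose bit is
clear, which is what turns the full-parity writeback in the first diagram of
patch 1 into the 8K parity writeback in the second one.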
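In the same spirit, a small sketch of the destructive RMW problem described in
the second commit message; the byte values are made up, and real RAID56
operates on whole sectors rather than single bytes. Once parity is regenerated
from a stale cached copy of Data2, the on-disk parity can no longer reconstruct
the real Data2, which is why __raid56_parity_recover() now re-reads everything
from disk instead of trusting cached sectors.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t d1 = 0xee, d2 = 0x5a;		/* what is really on disk */
	uint8_t parity = d1 ^ d2;		/* good on-disk parity */
	uint8_t d2_cached_stale = 0x00;		/* stale copy cached in the rbio */
	uint8_t d1_new = 0xff;			/* partial write into Data1 */

	/* Recovery from the good on-disk parity still works: prints 0x5a. */
	printf("d2 via on-disk parity:    0x%02x\n", (unsigned)(parity ^ d1));

	/*
	 * Destructive RMW: recompute parity from the stale cached Data2 and
	 * pretend it was written back over the good parity on disk.
	 */
	uint8_t parity_bad = d1_new ^ d2_cached_stale;

	/* The real 0x5a is gone for good: prints 0x00. */
	printf("d2 via recomputed parity: 0x%02x\n", (unsigned)(parity_bad ^ d1_new));
	return 0;
}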