From f3b4ae70f25d279543e4585c53aa81e1c44d5bfc Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 19 Aug 2022 13:27:18 +0200 Subject: [PATCH] 5.18-stable patches added patches: btrfs-only-write-the-sectors-in-the-vertical-stripe-which-has-data-stripes.patch btrfs-raid56-don-t-trust-any-cached-sector-in-__raid56_parity_recover.patch net_sched-cls_route-disallow-handle-of-0.patch tee-add-overflow-check-in-register_shm_helper.patch --- ...rtical-stripe-which-has-data-stripes.patch | 166 ++++++++++++++ ...ed-sector-in-__raid56_parity_recover.patch | 207 ++++++++++++++++++ ...sched-cls_route-disallow-handle-of-0.patch | 87 ++++++++ queue-5.18/series | 4 + ...verflow-check-in-register_shm_helper.patch | 59 +++++ 5 files changed, 523 insertions(+) create mode 100644 queue-5.18/btrfs-only-write-the-sectors-in-the-vertical-stripe-which-has-data-stripes.patch create mode 100644 queue-5.18/btrfs-raid56-don-t-trust-any-cached-sector-in-__raid56_parity_recover.patch create mode 100644 queue-5.18/net_sched-cls_route-disallow-handle-of-0.patch create mode 100644 queue-5.18/tee-add-overflow-check-in-register_shm_helper.patch diff --git a/queue-5.18/btrfs-only-write-the-sectors-in-the-vertical-stripe-which-has-data-stripes.patch b/queue-5.18/btrfs-only-write-the-sectors-in-the-vertical-stripe-which-has-data-stripes.patch new file mode 100644 index 00000000000..fcaaa31166e --- /dev/null +++ b/queue-5.18/btrfs-only-write-the-sectors-in-the-vertical-stripe-which-has-data-stripes.patch @@ -0,0 +1,166 @@ +From foo@baz Fri Aug 19 01:20:32 PM CEST 2022 +From: Qu Wenruo +Date: Fri, 19 Aug 2022 15:40:58 +0800 +Subject: btrfs: only write the sectors in the vertical stripe which has data stripes +To: linux-btrfs@vger.kernel.org, stable@vger.kernel.org +Cc: David Sterba +Message-ID: <26e484e1ea108b418cb8c91fa4aa394acd1239bd.1660894086.git.wqu@suse.com> + +From: Qu Wenruo + +commit bd8f7e627703ca5707833d623efcd43f104c7b3f upstream. + +If we have only 8K partial write at the beginning of a full RAID56 +stripe, we will write the following contents: + + 0 8K 32K 64K +Disk 1 (data): |XX| | | +Disk 2 (data): | | | +Disk 3 (parity): |XXXXXXXXXXXXXXX|XXXXXXXXXXXXXXX| + +|X| means the sector will be written back to disk. + +Note that, although we won't write any sectors from disk 2, but we will +write the full 64KiB of parity to disk. + +This behavior is fine for now, but not for the future (especially for +RAID56J, as we waste quite some space to journal the unused parity +stripes). + +So here we will also utilize the btrfs_raid_bio::dbitmap, anytime we +queue a higher level bio into an rbio, we will update rbio::dbitmap to +indicate which vertical stripes we need to writeback. + +And at finish_rmw(), we also check dbitmap to see if we need to write +any sector in the vertical stripe. + +So after the patch, above example will only lead to the following +writeback pattern: + + 0 8K 32K 64K +Disk 1 (data): |XX| | | +Disk 2 (data): | | | +Disk 3 (parity): |XX| | | + +Acked-by: David Sterba +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/raid56.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++---- + 1 file changed, 51 insertions(+), 4 deletions(-) + +--- a/fs/btrfs/raid56.c ++++ b/fs/btrfs/raid56.c +@@ -323,6 +323,9 @@ static void merge_rbio(struct btrfs_raid + { + bio_list_merge(&dest->bio_list, &victim->bio_list); + dest->bio_list_bytes += victim->bio_list_bytes; ++ /* Also inherit the bitmaps from @victim. 
*/ ++ bitmap_or(dest->dbitmap, victim->dbitmap, dest->dbitmap, ++ dest->stripe_npages); + dest->generic_bio_cnt += victim->generic_bio_cnt; + bio_list_init(&victim->bio_list); + } +@@ -864,6 +867,12 @@ static void rbio_orig_end_io(struct btrf + + if (rbio->generic_bio_cnt) + btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt); ++ /* ++ * Clear the data bitmap, as the rbio may be cached for later usage. ++ * do this before before unlock_stripe() so there will be no new bio ++ * for this bio. ++ */ ++ bitmap_clear(rbio->dbitmap, 0, rbio->stripe_npages); + + /* + * At this moment, rbio->bio_list is empty, however since rbio does not +@@ -1195,6 +1204,9 @@ static noinline void finish_rmw(struct b + else + BUG(); + ++ /* We should have at least one data sector. */ ++ ASSERT(bitmap_weight(rbio->dbitmap, rbio->stripe_npages)); ++ + /* at this point we either have a full stripe, + * or we've read the full stripe from the drive. + * recalculate the parity and write the new results. +@@ -1266,6 +1278,11 @@ static noinline void finish_rmw(struct b + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { + struct page *page; ++ ++ /* This vertical stripe has no data, skip it. */ ++ if (!test_bit(pagenr, rbio->dbitmap)) ++ continue; ++ + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) +@@ -1290,6 +1307,11 @@ static noinline void finish_rmw(struct b + + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { + struct page *page; ++ ++ /* This vertical stripe has no data, skip it. */ ++ if (!test_bit(pagenr, rbio->dbitmap)) ++ continue; ++ + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) +@@ -1713,6 +1735,33 @@ static void btrfs_raid_unplug(struct blk + run_plug(plug); + } + ++/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */ ++static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) ++{ ++ const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; ++ const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; ++ const u64 full_stripe_start = rbio->bioc->raid_map[0]; ++ const u32 orig_len = orig_bio->bi_iter.bi_size; ++ const u32 sectorsize = fs_info->sectorsize; ++ u64 cur_logical; ++ ++ ASSERT(orig_logical >= full_stripe_start && ++ orig_logical + orig_len <= full_stripe_start + ++ rbio->nr_data * rbio->stripe_len); ++ ++ bio_list_add(&rbio->bio_list, orig_bio); ++ rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; ++ ++ /* Update the dbitmap. */ ++ for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; ++ cur_logical += sectorsize) { ++ int bit = ((u32)(cur_logical - full_stripe_start) >> ++ fs_info->sectorsize_bits) % rbio->stripe_npages; ++ ++ set_bit(bit, rbio->dbitmap); ++ } ++} ++ + /* + * our main entry point for writes from the rest of the FS. 
+ */ +@@ -1730,9 +1779,8 @@ int raid56_parity_write(struct bio *bio, + btrfs_put_bioc(bioc); + return PTR_ERR(rbio); + } +- bio_list_add(&rbio->bio_list, bio); +- rbio->bio_list_bytes = bio->bi_iter.bi_size; + rbio->operation = BTRFS_RBIO_WRITE; ++ rbio_add_bio(rbio, bio); + + btrfs_bio_counter_inc_noblocked(fs_info); + rbio->generic_bio_cnt = 1; +@@ -2134,8 +2182,7 @@ int raid56_parity_recover(struct bio *bi + } + + rbio->operation = BTRFS_RBIO_READ_REBUILD; +- bio_list_add(&rbio->bio_list, bio); +- rbio->bio_list_bytes = bio->bi_iter.bi_size; ++ rbio_add_bio(rbio, bio); + + rbio->faila = find_logical_bio_stripe(rbio, bio); + if (rbio->faila == -1) { diff --git a/queue-5.18/btrfs-raid56-don-t-trust-any-cached-sector-in-__raid56_parity_recover.patch b/queue-5.18/btrfs-raid56-don-t-trust-any-cached-sector-in-__raid56_parity_recover.patch new file mode 100644 index 00000000000..d0caf0d9184 --- /dev/null +++ b/queue-5.18/btrfs-raid56-don-t-trust-any-cached-sector-in-__raid56_parity_recover.patch @@ -0,0 +1,207 @@ +From foo@baz Fri Aug 19 01:20:32 PM CEST 2022 +From: Qu Wenruo +Date: Fri, 19 Aug 2022 15:40:59 +0800 +Subject: btrfs: raid56: don't trust any cached sector in __raid56_parity_recover() +To: linux-btrfs@vger.kernel.org, stable@vger.kernel.org +Cc: David Sterba +Message-ID: <28474da52cc523be597a3d56fed329af1e639934.1660894086.git.wqu@suse.com> + +From: Qu Wenruo + +commit f6065f8edeb25f4a9dfe0b446030ad995a84a088 upstream. + +[BUG] +There is a small workload which will always fail with recent kernel: +(A simplified version from btrfs/125 test case) + + mkfs.btrfs -f -m raid5 -d raid5 -b 1G $dev1 $dev2 $dev3 + mount $dev1 $mnt + xfs_io -f -c "pwrite -S 0xee 0 1M" $mnt/file1 + sync + umount $mnt + btrfs dev scan -u $dev3 + mount -o degraded $dev1 $mnt + xfs_io -f -c "pwrite -S 0xff 0 128M" $mnt/file2 + umount $mnt + btrfs dev scan + mount $dev1 $mnt + btrfs balance start --full-balance $mnt + umount $mnt + +The failure is always failed to read some tree blocks: + + BTRFS info (device dm-4): relocating block group 217710592 flags data|raid5 + BTRFS error (device dm-4): parent transid verify failed on 38993920 wanted 9 found 7 + BTRFS error (device dm-4): parent transid verify failed on 38993920 wanted 9 found 7 + ... 
+ +[CAUSE] +With the recently added debug output, we can see all RAID56 operations +related to full stripe 38928384: + + 56.1183: raid56_read_partial: full_stripe=38928384 devid=2 type=DATA1 offset=0 opf=0x0 physical=9502720 len=65536 + 56.1185: raid56_read_partial: full_stripe=38928384 devid=3 type=DATA2 offset=16384 opf=0x0 physical=9519104 len=16384 + 56.1185: raid56_read_partial: full_stripe=38928384 devid=3 type=DATA2 offset=49152 opf=0x0 physical=9551872 len=16384 + 56.1187: raid56_write_stripe: full_stripe=38928384 devid=3 type=DATA2 offset=0 opf=0x1 physical=9502720 len=16384 + 56.1188: raid56_write_stripe: full_stripe=38928384 devid=3 type=DATA2 offset=32768 opf=0x1 physical=9535488 len=16384 + 56.1188: raid56_write_stripe: full_stripe=38928384 devid=1 type=PQ1 offset=0 opf=0x1 physical=30474240 len=16384 + 56.1189: raid56_write_stripe: full_stripe=38928384 devid=1 type=PQ1 offset=32768 opf=0x1 physical=30507008 len=16384 + 56.1218: raid56_write_stripe: full_stripe=38928384 devid=3 type=DATA2 offset=49152 opf=0x1 physical=9551872 len=16384 + 56.1219: raid56_write_stripe: full_stripe=38928384 devid=1 type=PQ1 offset=49152 opf=0x1 physical=30523392 len=16384 + 56.2721: raid56_parity_recover: full stripe=38928384 eb=39010304 mirror=2 + 56.2723: raid56_parity_recover: full stripe=38928384 eb=39010304 mirror=2 + 56.2724: raid56_parity_recover: full stripe=38928384 eb=39010304 mirror=2 + +Before we enter raid56_parity_recover(), we have triggered some metadata +write for the full stripe 38928384, this leads to us to read all the +sectors from disk. + +Furthermore, btrfs raid56 write will cache its calculated P/Q sectors to +avoid unnecessary read. + +This means, for that full stripe, after any partial write, we will have +stale data, along with P/Q calculated using that stale data. + +Thankfully due to patch "btrfs: only write the sectors in the vertical stripe +which has data stripes" we haven't submitted all the corrupted P/Q to disk. + +When we really need to recover certain range, aka in +raid56_parity_recover(), we will use the cached rbio, along with its +cached sectors (the full stripe is all cached). + +This explains why we have no event raid56_scrub_read_recover() +triggered. + +Since we have the cached P/Q which is calculated using the stale data, +the recovered one will just be stale. + +In our particular test case, it will always return the same incorrect +metadata, thus causing the same error message "parent transid verify +failed on 39010304 wanted 9 found 7" again and again. + +[BTRFS DESTRUCTIVE RMW PROBLEM] + +Test case btrfs/125 (and above workload) always has its trouble with +the destructive read-modify-write (RMW) cycle: + + 0 32K 64K +Data1: | Good | Good | +Data2: | Bad | Bad | +Parity: | Good | Good | + +In above case, if we trigger any write into Data1, we will use the bad +data in Data2 to re-generate parity, killing the only chance to recovery +Data2, thus Data2 is lost forever. + +This destructive RMW cycle is not specific to btrfs RAID56, but there +are some btrfs specific behaviors making the case even worse: + +- Btrfs will cache sectors for unrelated vertical stripes. + + In above example, if we're only writing into 0~32K range, btrfs will + still read data range (32K ~ 64K) of Data1, and (64K~128K) of Data2. + This behavior is to cache sectors for later update. 
+ + Incidentally commit d4e28d9b5f04 ("btrfs: raid56: make steal_rbio() + subpage compatible") has a bug which makes RAID56 to never trust the + cached sectors, thus slightly improve the situation for recovery. + + Unfortunately, follow up fix "btrfs: update stripe_sectors::uptodate in + steal_rbio" will revert the behavior back to the old one. + +- Btrfs raid56 partial write will update all P/Q sectors and cache them + + This means, even if data at (64K ~ 96K) of Data2 is free space, and + only (96K ~ 128K) of Data2 is really stale data. + And we write into that (96K ~ 128K), we will update all the parity + sectors for the full stripe. + + This unnecessary behavior will completely kill the chance of recovery. + + Thankfully, an unrelated optimization "btrfs: only write the sectors + in the vertical stripe which has data stripes" will prevent + submitting the write bio for untouched vertical sectors. + + That optimization will keep the on-disk P/Q untouched for a chance for + later recovery. + +[FIX] +Although we have no good way to completely fix the destructive RMW +(unless we go full scrub for each partial write), we can still limit the +damage. + +With patch "btrfs: only write the sectors in the vertical stripe which +has data stripes" now we won't really submit the P/Q of unrelated +vertical stripes, so the on-disk P/Q should still be fine. + +Now we really need to do is just drop all the cached sectors when doing +recovery. + +By this, we have a chance to read the original P/Q from disk, and have a +chance to recover the stale data, while still keep the cache to speed up +regular write path. + +In fact, just dropping all the cache for recovery path is good enough to +allow the test case btrfs/125 along with the small script to pass +reliably. + +The lack of metadata write after the degraded mount, and forced metadata +COW is saving us this time. + +So this patch will fix the behavior by not trust any cache in +__raid56_parity_recover(), to solve the problem while still keep the +cache useful. + +But please note that this test pass DOES NOT mean we have solved the +destructive RMW problem, we just do better damage control a little +better. + +Related patches: + +- btrfs: only write the sectors in the vertical stripe +- d4e28d9b5f04 ("btrfs: raid56: make steal_rbio() subpage compatible") +- btrfs: update stripe_sectors::uptodate in steal_rbio + +Acked-by: David Sterba +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/raid56.c | 19 ++++++------------- + 1 file changed, 6 insertions(+), 13 deletions(-) + +--- a/fs/btrfs/raid56.c ++++ b/fs/btrfs/raid56.c +@@ -2084,9 +2084,12 @@ static int __raid56_parity_recover(struc + atomic_set(&rbio->error, 0); + + /* +- * read everything that hasn't failed. Thanks to the +- * stripe cache, it is possible that some or all of these +- * pages are going to be uptodate. ++ * Read everything that hasn't failed. However this time we will ++ * not trust any cached sector. ++ * As we may read out some stale data but higher layer is not reading ++ * that stale part. ++ * ++ * So here we always re-read everything in recovery path. 
+ */ + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + if (rbio->faila == stripe || rbio->failb == stripe) { +@@ -2095,16 +2098,6 @@ static int __raid56_parity_recover(struc + } + + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { +- struct page *p; +- +- /* +- * the rmw code may have already read this +- * page in +- */ +- p = rbio_stripe_page(rbio, stripe, pagenr); +- if (PageUptodate(p)) +- continue; +- + ret = rbio_add_io_page(rbio, &bio_list, + rbio_stripe_page(rbio, stripe, pagenr), + stripe, pagenr, rbio->stripe_len); diff --git a/queue-5.18/net_sched-cls_route-disallow-handle-of-0.patch b/queue-5.18/net_sched-cls_route-disallow-handle-of-0.patch new file mode 100644 index 00000000000..5ea429da45c --- /dev/null +++ b/queue-5.18/net_sched-cls_route-disallow-handle-of-0.patch @@ -0,0 +1,87 @@ +From 02799571714dc5dd6948824b9d080b44a295f695 Mon Sep 17 00:00:00 2001 +From: Jamal Hadi Salim +Date: Sun, 14 Aug 2022 11:27:58 +0000 +Subject: net_sched: cls_route: disallow handle of 0 + +From: Jamal Hadi Salim + +commit 02799571714dc5dd6948824b9d080b44a295f695 upstream. + +Follows up on: +https://lore.kernel.org/all/20220809170518.164662-1-cascardo@canonical.com/ + +handle of 0 implies from/to of universe realm which is not very +sensible. + +Lets see what this patch will do: +$sudo tc qdisc add dev $DEV root handle 1:0 prio + +//lets manufacture a way to insert handle of 0 +$sudo tc filter add dev $DEV parent 1:0 protocol ip prio 100 \ +route to 0 from 0 classid 1:10 action ok + +//gets rejected... +Error: handle of 0 is not valid. +We have an error talking to the kernel, -1 + +//lets create a legit entry.. +sudo tc filter add dev $DEV parent 1:0 protocol ip prio 100 route from 10 \ +classid 1:10 action ok + +//what did the kernel insert? +$sudo tc filter ls dev $DEV parent 1:0 +filter protocol ip pref 100 route chain 0 +filter protocol ip pref 100 route chain 0 fh 0x000a8000 flowid 1:10 from 10 + action order 1: gact action pass + random type none pass val 0 + index 1 ref 1 bind 1 + +//Lets try to replace that legit entry with a handle of 0 +$ sudo tc filter replace dev $DEV parent 1:0 protocol ip prio 100 \ +handle 0x000a8000 route to 0 from 0 classid 1:10 action drop + +Error: Replacing with handle of 0 is invalid. +We have an error talking to the kernel, -1 + +And last, lets run Cascardo's POC: +$ ./poc +0 +0 +-22 +-22 +-22 + +Signed-off-by: Jamal Hadi Salim +Acked-by: Stephen Hemminger +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/cls_route.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/net/sched/cls_route.c ++++ b/net/sched/cls_route.c +@@ -424,6 +424,11 @@ static int route4_set_parms(struct net * + return -EINVAL; + } + ++ if (!nhandle) { ++ NL_SET_ERR_MSG(extack, "Replacing with handle of 0 is invalid"); ++ return -EINVAL; ++ } ++ + h1 = to_hash(nhandle); + b = rtnl_dereference(head->table[h1]); + if (!b) { +@@ -477,6 +482,11 @@ static int route4_change(struct net *net + int err; + bool new = true; + ++ if (!handle) { ++ NL_SET_ERR_MSG(extack, "Creating with handle of 0 is invalid"); ++ return -EINVAL; ++ } ++ + if (opt == NULL) + return handle ? 
-EINVAL : 0; + diff --git a/queue-5.18/series b/queue-5.18/series index e69de29bb2d..395024afe15 100644 --- a/queue-5.18/series +++ b/queue-5.18/series @@ -0,0 +1,4 @@ +tee-add-overflow-check-in-register_shm_helper.patch +net_sched-cls_route-disallow-handle-of-0.patch +btrfs-only-write-the-sectors-in-the-vertical-stripe-which-has-data-stripes.patch +btrfs-raid56-don-t-trust-any-cached-sector-in-__raid56_parity_recover.patch diff --git a/queue-5.18/tee-add-overflow-check-in-register_shm_helper.patch b/queue-5.18/tee-add-overflow-check-in-register_shm_helper.patch new file mode 100644 index 00000000000..fce362774ff --- /dev/null +++ b/queue-5.18/tee-add-overflow-check-in-register_shm_helper.patch @@ -0,0 +1,59 @@ +From 573ae4f13f630d6660008f1974c0a8a29c30e18a Mon Sep 17 00:00:00 2001 +From: Jens Wiklander +Date: Thu, 18 Aug 2022 13:08:59 +0200 +Subject: tee: add overflow check in register_shm_helper() + +From: Jens Wiklander + +commit 573ae4f13f630d6660008f1974c0a8a29c30e18a upstream. + +With special lengths supplied by user space, register_shm_helper() has +an integer overflow when calculating the number of pages covered by a +supplied user space memory region. + +This causes internal_get_user_pages_fast() a helper function of +pin_user_pages_fast() to do a NULL pointer dereference: + + Unable to handle kernel NULL pointer dereference at virtual address 0000000000000010 + Modules linked in: + CPU: 1 PID: 173 Comm: optee_example_a Not tainted 5.19.0 #11 + Hardware name: QEMU QEMU Virtual Machine, BIOS 0.0.0 02/06/2015 + pc : internal_get_user_pages_fast+0x474/0xa80 + Call trace: + internal_get_user_pages_fast+0x474/0xa80 + pin_user_pages_fast+0x24/0x4c + register_shm_helper+0x194/0x330 + tee_shm_register_user_buf+0x78/0x120 + tee_ioctl+0xd0/0x11a0 + __arm64_sys_ioctl+0xa8/0xec + invoke_syscall+0x48/0x114 + +Fix this by adding an an explicit call to access_ok() in +tee_shm_register_user_buf() to catch an invalid user space address +early. + +Fixes: 033ddf12bcf5 ("tee: add register user memory") +Cc: stable@vger.kernel.org +Reported-by: Nimish Mishra +Reported-by: Anirban Chakraborty +Reported-by: Debdeep Mukhopadhyay +Suggested-by: Jerome Forissier +Signed-off-by: Jens Wiklander +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + drivers/tee/tee_shm.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/drivers/tee/tee_shm.c ++++ b/drivers/tee/tee_shm.c +@@ -311,6 +311,9 @@ struct tee_shm *tee_shm_register_user_bu + void *ret; + int id; + ++ if (!access_ok((void __user *)addr, length)) ++ return ERR_PTR(-EFAULT); ++ + mutex_lock(&teedev->mutex); + id = idr_alloc(&teedev->idr, NULL, 1, 0, GFP_KERNEL); + mutex_unlock(&teedev->mutex); -- 2.47.3
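
A stand-alone illustration of the tee overflow fixed above: when user space
passes a length close to ULONG_MAX to TEE_IOC_SHM_REGISTER, the page-count
arithmetic in register_shm_helper() wraps around, and the bogus count then
leads to the NULL pointer dereference inside pin_user_pages_fast() shown in
the commit message.  The access_ok() check added by the patch rejects such a
range before any of that math runs.  The sketch below reproduces only the
arithmetic in plain user-space C; roundup(), TASK_SIZE_GUESS and the constants
are illustrative stand-ins, not the actual tee_shm internals.

  /*
   * Minimal user-space sketch of the register_shm_helper() wraparound.
   * Everything here is an approximation of the kernel logic, kept only
   * close enough to show why the pre-fix page count goes wrong.
   */
  #include <stdio.h>

  #define PAGE_SIZE 4096UL
  #define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))

  int main(void)
  {
          /* Values a malicious TEE_IOC_SHM_REGISTER caller could supply. */
          unsigned long addr = 0x1000;
          unsigned long length = ~0UL - 0x800;

          /*
           * Pre-fix arithmetic: addr + length wraps around, so the rounded
           * end address lands just past the start and the page count is
           * tiny instead of the ~2^52 pages the length really covers.
           */
          unsigned long start = addr & ~(PAGE_SIZE - 1);
          unsigned long end = roundup(addr + length, PAGE_SIZE);
          unsigned long nr_pages = (end - start) / PAGE_SIZE;

          printf("end=0x%lx nr_pages=%lu\n", end, nr_pages);

          /*
           * Post-fix behaviour: an access_ok()-style range check rejects
           * the request up front.  TASK_SIZE_GUESS stands in for the real
           * architecture-specific user address space limit.
           */
          const unsigned long TASK_SIZE_GUESS = 1UL << 47;
          if (addr + length < addr || addr + length > TASK_SIZE_GUESS)
                  printf("rejected: -EFAULT\n");

          return 0;
  }

Running the sketch prints a wrapped end address and a page count of 0, which
matches the failure mode described above, and then shows the request being
refused once the range check is applied first.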