--- /dev/null
+From foo@baz Fri Aug 19 01:19:24 PM CEST 2022
+From: Qu Wenruo <wqu@suse.com>
+Date: Fri, 19 Aug 2022 15:01:38 +0800
+Subject: btrfs: only write the sectors in the vertical stripe which has data stripes
+To: linux-btrfs@vger.kernel.org, stable@vger.kernel.org
+Cc: David Sterba <dsterba@suse.com>
+Message-ID: <21fd4ed644aede2078f639c00291f3247bd03fcf.1660891713.git.wqu@suse.com>
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit bd8f7e627703ca5707833d623efcd43f104c7b3f upstream.
+
+If we have only 8K partial write at the beginning of a full RAID56
+stripe, we will write the following contents:
+
+ 0 8K 32K 64K
+Disk 1 (data): |XX| | |
+Disk 2 (data): | | |
+Disk 3 (parity): |XXXXXXXXXXXXXXX|XXXXXXXXXXXXXXX|
+
+|X| means the sector will be written back to disk.
+
+Note that, although we won't write any sectors from disk 2, but we will
+write the full 64KiB of parity to disk.
+
+This behavior is fine for now, but not for the future (especially for
+RAID56J, as we waste quite some space to journal the unused parity
+stripes).
+
+So here we will also utilize the btrfs_raid_bio::dbitmap, anytime we
+queue a higher level bio into an rbio, we will update rbio::dbitmap to
+indicate which vertical stripes we need to writeback.
+
+And at finish_rmw(), we also check dbitmap to see if we need to write
+any sector in the vertical stripe.
+
+So after the patch, above example will only lead to the following
+writeback pattern:
+
+ 0 8K 32K 64K
+Disk 1 (data): |XX| | |
+Disk 2 (data): | | |
+Disk 3 (parity): |XX| | |
+
+Acked-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/raid56.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++----
+ 1 file changed, 49 insertions(+), 4 deletions(-)
+
+--- a/fs/btrfs/raid56.c
++++ b/fs/btrfs/raid56.c
+@@ -403,6 +403,9 @@ static void merge_rbio(struct btrfs_raid
+ {
+ bio_list_merge(&dest->bio_list, &victim->bio_list);
+ dest->bio_list_bytes += victim->bio_list_bytes;
++ /* Also inherit the bitmaps from @victim. */
++ bitmap_or(dest->dbitmap, victim->dbitmap, dest->dbitmap,
++ dest->stripe_nsectors);
+ dest->generic_bio_cnt += victim->generic_bio_cnt;
+ bio_list_init(&victim->bio_list);
+ }
+@@ -944,6 +947,12 @@ static void rbio_orig_end_io(struct btrf
+
+ if (rbio->generic_bio_cnt)
+ btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
++ /*
++ * Clear the data bitmap, as the rbio may be cached for later usage.
++ * do this before before unlock_stripe() so there will be no new bio
++ * for this bio.
++ */
++ bitmap_clear(rbio->dbitmap, 0, rbio->stripe_nsectors);
+
+ /*
+ * At this moment, rbio->bio_list is empty, however since rbio does not
+@@ -1294,6 +1303,9 @@ static noinline void finish_rmw(struct b
+ else
+ BUG();
+
++ /* We should have at least one data sector. */
++ ASSERT(bitmap_weight(rbio->dbitmap, rbio->stripe_nsectors));
++
+ /* at this point we either have a full stripe,
+ * or we've read the full stripe from the drive.
+ * recalculate the parity and write the new results.
+@@ -1368,6 +1380,10 @@ static noinline void finish_rmw(struct b
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
+
++ /* This vertical stripe has no data, skip it. */
++ if (!test_bit(sectornr, rbio->dbitmap))
++ continue;
++
+ if (stripe < rbio->nr_data) {
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (!sector)
+@@ -1394,6 +1410,10 @@ static noinline void finish_rmw(struct b
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
+
++ /* This vertical stripe has no data, skip it. */
++ if (!test_bit(sectornr, rbio->dbitmap))
++ continue;
++
+ if (stripe < rbio->nr_data) {
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (!sector)
+@@ -1845,6 +1865,33 @@ static void btrfs_raid_unplug(struct blk
+ run_plug(plug);
+ }
+
++/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
++static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
++{
++ const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
++ const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
++ const u64 full_stripe_start = rbio->bioc->raid_map[0];
++ const u32 orig_len = orig_bio->bi_iter.bi_size;
++ const u32 sectorsize = fs_info->sectorsize;
++ u64 cur_logical;
++
++ ASSERT(orig_logical >= full_stripe_start &&
++ orig_logical + orig_len <= full_stripe_start +
++ rbio->nr_data * rbio->stripe_len);
++
++ bio_list_add(&rbio->bio_list, orig_bio);
++ rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
++
++ /* Update the dbitmap. */
++ for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
++ cur_logical += sectorsize) {
++ int bit = ((u32)(cur_logical - full_stripe_start) >>
++ fs_info->sectorsize_bits) % rbio->stripe_nsectors;
++
++ set_bit(bit, rbio->dbitmap);
++ }
++}
++
+ /*
+ * our main entry point for writes from the rest of the FS.
+ */
+@@ -1861,9 +1908,8 @@ int raid56_parity_write(struct bio *bio,
+ btrfs_put_bioc(bioc);
+ return PTR_ERR(rbio);
+ }
+- bio_list_add(&rbio->bio_list, bio);
+- rbio->bio_list_bytes = bio->bi_iter.bi_size;
+ rbio->operation = BTRFS_RBIO_WRITE;
++ rbio_add_bio(rbio, bio);
+
+ btrfs_bio_counter_inc_noblocked(fs_info);
+ rbio->generic_bio_cnt = 1;
+@@ -2268,8 +2314,7 @@ int raid56_parity_recover(struct bio *bi
+ }
+
+ rbio->operation = BTRFS_RBIO_READ_REBUILD;
+- bio_list_add(&rbio->bio_list, bio);
+- rbio->bio_list_bytes = bio->bi_iter.bi_size;
++ rbio_add_bio(rbio, bio);
+
+ rbio->faila = find_logical_bio_stripe(rbio, bio);
+ if (rbio->faila == -1) {
--- /dev/null
+From foo@baz Fri Aug 19 01:19:24 PM CEST 2022
+From: Qu Wenruo <wqu@suse.com>
+Date: Fri, 19 Aug 2022 15:01:39 +0800
+Subject: btrfs: raid56: don't trust any cached sector in __raid56_parity_recover()
+To: linux-btrfs@vger.kernel.org, stable@vger.kernel.org
+Cc: David Sterba <dsterba@suse.com>
+Message-ID: <395e757174642361ce1db65d221a915ce6868e40.1660891713.git.wqu@suse.com>
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit f6065f8edeb25f4a9dfe0b446030ad995a84a088 upstream.
+
+[BUG]
+There is a small workload which will always fail with recent kernel:
+(A simplified version from btrfs/125 test case)
+
+ mkfs.btrfs -f -m raid5 -d raid5 -b 1G $dev1 $dev2 $dev3
+ mount $dev1 $mnt
+ xfs_io -f -c "pwrite -S 0xee 0 1M" $mnt/file1
+ sync
+ umount $mnt
+ btrfs dev scan -u $dev3
+ mount -o degraded $dev1 $mnt
+ xfs_io -f -c "pwrite -S 0xff 0 128M" $mnt/file2
+ umount $mnt
+ btrfs dev scan
+ mount $dev1 $mnt
+ btrfs balance start --full-balance $mnt
+ umount $mnt
+
+The failure is always failed to read some tree blocks:
+
+ BTRFS info (device dm-4): relocating block group 217710592 flags data|raid5
+ BTRFS error (device dm-4): parent transid verify failed on 38993920 wanted 9 found 7
+ BTRFS error (device dm-4): parent transid verify failed on 38993920 wanted 9 found 7
+ ...
+
+[CAUSE]
+With the recently added debug output, we can see all RAID56 operations
+related to full stripe 38928384:
+
+ 56.1183: raid56_read_partial: full_stripe=38928384 devid=2 type=DATA1 offset=0 opf=0x0 physical=9502720 len=65536
+ 56.1185: raid56_read_partial: full_stripe=38928384 devid=3 type=DATA2 offset=16384 opf=0x0 physical=9519104 len=16384
+ 56.1185: raid56_read_partial: full_stripe=38928384 devid=3 type=DATA2 offset=49152 opf=0x0 physical=9551872 len=16384
+ 56.1187: raid56_write_stripe: full_stripe=38928384 devid=3 type=DATA2 offset=0 opf=0x1 physical=9502720 len=16384
+ 56.1188: raid56_write_stripe: full_stripe=38928384 devid=3 type=DATA2 offset=32768 opf=0x1 physical=9535488 len=16384
+ 56.1188: raid56_write_stripe: full_stripe=38928384 devid=1 type=PQ1 offset=0 opf=0x1 physical=30474240 len=16384
+ 56.1189: raid56_write_stripe: full_stripe=38928384 devid=1 type=PQ1 offset=32768 opf=0x1 physical=30507008 len=16384
+ 56.1218: raid56_write_stripe: full_stripe=38928384 devid=3 type=DATA2 offset=49152 opf=0x1 physical=9551872 len=16384
+ 56.1219: raid56_write_stripe: full_stripe=38928384 devid=1 type=PQ1 offset=49152 opf=0x1 physical=30523392 len=16384
+ 56.2721: raid56_parity_recover: full stripe=38928384 eb=39010304 mirror=2
+ 56.2723: raid56_parity_recover: full stripe=38928384 eb=39010304 mirror=2
+ 56.2724: raid56_parity_recover: full stripe=38928384 eb=39010304 mirror=2
+
+Before we enter raid56_parity_recover(), we have triggered some metadata
+write for the full stripe 38928384, this leads to us to read all the
+sectors from disk.
+
+Furthermore, btrfs raid56 write will cache its calculated P/Q sectors to
+avoid unnecessary read.
+
+This means, for that full stripe, after any partial write, we will have
+stale data, along with P/Q calculated using that stale data.
+
+Thankfully due to patch "btrfs: only write the sectors in the vertical stripe
+which has data stripes" we haven't submitted all the corrupted P/Q to disk.
+
+When we really need to recover certain range, aka in
+raid56_parity_recover(), we will use the cached rbio, along with its
+cached sectors (the full stripe is all cached).
+
+This explains why we have no event raid56_scrub_read_recover()
+triggered.
+
+Since we have the cached P/Q which is calculated using the stale data,
+the recovered one will just be stale.
+
+In our particular test case, it will always return the same incorrect
+metadata, thus causing the same error message "parent transid verify
+failed on 39010304 wanted 9 found 7" again and again.
+
+[BTRFS DESTRUCTIVE RMW PROBLEM]
+
+Test case btrfs/125 (and above workload) always has its trouble with
+the destructive read-modify-write (RMW) cycle:
+
+ 0 32K 64K
+Data1: | Good | Good |
+Data2: | Bad | Bad |
+Parity: | Good | Good |
+
+In above case, if we trigger any write into Data1, we will use the bad
+data in Data2 to re-generate parity, killing the only chance to recovery
+Data2, thus Data2 is lost forever.
+
+This destructive RMW cycle is not specific to btrfs RAID56, but there
+are some btrfs specific behaviors making the case even worse:
+
+- Btrfs will cache sectors for unrelated vertical stripes.
+
+ In above example, if we're only writing into 0~32K range, btrfs will
+ still read data range (32K ~ 64K) of Data1, and (64K~128K) of Data2.
+ This behavior is to cache sectors for later update.
+
+ Incidentally commit d4e28d9b5f04 ("btrfs: raid56: make steal_rbio()
+ subpage compatible") has a bug which makes RAID56 to never trust the
+ cached sectors, thus slightly improve the situation for recovery.
+
+ Unfortunately, follow up fix "btrfs: update stripe_sectors::uptodate in
+ steal_rbio" will revert the behavior back to the old one.
+
+- Btrfs raid56 partial write will update all P/Q sectors and cache them
+
+ This means, even if data at (64K ~ 96K) of Data2 is free space, and
+ only (96K ~ 128K) of Data2 is really stale data.
+ And we write into that (96K ~ 128K), we will update all the parity
+ sectors for the full stripe.
+
+ This unnecessary behavior will completely kill the chance of recovery.
+
+ Thankfully, an unrelated optimization "btrfs: only write the sectors
+ in the vertical stripe which has data stripes" will prevent
+ submitting the write bio for untouched vertical sectors.
+
+ That optimization will keep the on-disk P/Q untouched for a chance for
+ later recovery.
+
+[FIX]
+Although we have no good way to completely fix the destructive RMW
+(unless we go full scrub for each partial write), we can still limit the
+damage.
+
+With patch "btrfs: only write the sectors in the vertical stripe which
+has data stripes" now we won't really submit the P/Q of unrelated
+vertical stripes, so the on-disk P/Q should still be fine.
+
+Now we really need to do is just drop all the cached sectors when doing
+recovery.
+
+By this, we have a chance to read the original P/Q from disk, and have a
+chance to recover the stale data, while still keep the cache to speed up
+regular write path.
+
+In fact, just dropping all the cache for recovery path is good enough to
+allow the test case btrfs/125 along with the small script to pass
+reliably.
+
+The lack of metadata write after the degraded mount, and forced metadata
+COW is saving us this time.
+
+So this patch will fix the behavior by not trust any cache in
+__raid56_parity_recover(), to solve the problem while still keep the
+cache useful.
+
+But please note that this test pass DOES NOT mean we have solved the
+destructive RMW problem, we just do better damage control a little
+better.
+
+Related patches:
+
+- btrfs: only write the sectors in the vertical stripe
+- d4e28d9b5f04 ("btrfs: raid56: make steal_rbio() subpage compatible")
+- btrfs: update stripe_sectors::uptodate in steal_rbio
+
+Acked-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/raid56.c | 15 ++++++---------
+ 1 file changed, 6 insertions(+), 9 deletions(-)
+
+--- a/fs/btrfs/raid56.c
++++ b/fs/btrfs/raid56.c
+@@ -2218,9 +2218,12 @@ static int __raid56_parity_recover(struc
+ atomic_set(&rbio->error, 0);
+
+ /*
+- * read everything that hasn't failed. Thanks to the
+- * stripe cache, it is possible that some or all of these
+- * pages are going to be uptodate.
++ * Read everything that hasn't failed. However this time we will
++ * not trust any cached sector.
++ * As we may read out some stale data but higher layer is not reading
++ * that stale part.
++ *
++ * So here we always re-read everything in recovery path.
+ */
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+ if (rbio->faila == stripe || rbio->failb == stripe) {
+@@ -2231,13 +2234,7 @@ static int __raid56_parity_recover(struc
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
+
+- /*
+- * the rmw code may have already read this
+- * page in
+- */
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+- if (sector->uptodate)
+- continue;
+
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ stripe, sectornr, rbio->stripe_len,
--- /dev/null
+From 02799571714dc5dd6948824b9d080b44a295f695 Mon Sep 17 00:00:00 2001
+From: Jamal Hadi Salim <jhs@mojatatu.com>
+Date: Sun, 14 Aug 2022 11:27:58 +0000
+Subject: net_sched: cls_route: disallow handle of 0
+
+From: Jamal Hadi Salim <jhs@mojatatu.com>
+
+commit 02799571714dc5dd6948824b9d080b44a295f695 upstream.
+
+Follows up on:
+https://lore.kernel.org/all/20220809170518.164662-1-cascardo@canonical.com/
+
+handle of 0 implies from/to of universe realm which is not very
+sensible.
+
+Lets see what this patch will do:
+$sudo tc qdisc add dev $DEV root handle 1:0 prio
+
+//lets manufacture a way to insert handle of 0
+$sudo tc filter add dev $DEV parent 1:0 protocol ip prio 100 \
+route to 0 from 0 classid 1:10 action ok
+
+//gets rejected...
+Error: handle of 0 is not valid.
+We have an error talking to the kernel, -1
+
+//lets create a legit entry..
+sudo tc filter add dev $DEV parent 1:0 protocol ip prio 100 route from 10 \
+classid 1:10 action ok
+
+//what did the kernel insert?
+$sudo tc filter ls dev $DEV parent 1:0
+filter protocol ip pref 100 route chain 0
+filter protocol ip pref 100 route chain 0 fh 0x000a8000 flowid 1:10 from 10
+ action order 1: gact action pass
+ random type none pass val 0
+ index 1 ref 1 bind 1
+
+//Lets try to replace that legit entry with a handle of 0
+$ sudo tc filter replace dev $DEV parent 1:0 protocol ip prio 100 \
+handle 0x000a8000 route to 0 from 0 classid 1:10 action drop
+
+Error: Replacing with handle of 0 is invalid.
+We have an error talking to the kernel, -1
+
+And last, lets run Cascardo's POC:
+$ ./poc
+0
+0
+-22
+-22
+-22
+
+Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
+Acked-by: Stephen Hemminger <stephen@networkplumber.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sched/cls_route.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/net/sched/cls_route.c
++++ b/net/sched/cls_route.c
+@@ -424,6 +424,11 @@ static int route4_set_parms(struct net *
+ return -EINVAL;
+ }
+
++ if (!nhandle) {
++ NL_SET_ERR_MSG(extack, "Replacing with handle of 0 is invalid");
++ return -EINVAL;
++ }
++
+ h1 = to_hash(nhandle);
+ b = rtnl_dereference(head->table[h1]);
+ if (!b) {
+@@ -477,6 +482,11 @@ static int route4_change(struct net *net
+ int err;
+ bool new = true;
+
++ if (!handle) {
++ NL_SET_ERR_MSG(extack, "Creating with handle of 0 is invalid");
++ return -EINVAL;
++ }
++
+ if (opt == NULL)
+ return handle ? -EINVAL : 0;
+
revert-mm-kfence-apply-kmemleak_ignore_phys-on-early-allocated-pool.patch
+tee-add-overflow-check-in-register_shm_helper.patch
+net_sched-cls_route-disallow-handle-of-0.patch
+btrfs-only-write-the-sectors-in-the-vertical-stripe-which-has-data-stripes.patch
+btrfs-raid56-don-t-trust-any-cached-sector-in-__raid56_parity_recover.patch
--- /dev/null
+From 573ae4f13f630d6660008f1974c0a8a29c30e18a Mon Sep 17 00:00:00 2001
+From: Jens Wiklander <jens.wiklander@linaro.org>
+Date: Thu, 18 Aug 2022 13:08:59 +0200
+Subject: tee: add overflow check in register_shm_helper()
+
+From: Jens Wiklander <jens.wiklander@linaro.org>
+
+commit 573ae4f13f630d6660008f1974c0a8a29c30e18a upstream.
+
+With special lengths supplied by user space, register_shm_helper() has
+an integer overflow when calculating the number of pages covered by a
+supplied user space memory region.
+
+This causes internal_get_user_pages_fast() a helper function of
+pin_user_pages_fast() to do a NULL pointer dereference:
+
+ Unable to handle kernel NULL pointer dereference at virtual address 0000000000000010
+ Modules linked in:
+ CPU: 1 PID: 173 Comm: optee_example_a Not tainted 5.19.0 #11
+ Hardware name: QEMU QEMU Virtual Machine, BIOS 0.0.0 02/06/2015
+ pc : internal_get_user_pages_fast+0x474/0xa80
+ Call trace:
+ internal_get_user_pages_fast+0x474/0xa80
+ pin_user_pages_fast+0x24/0x4c
+ register_shm_helper+0x194/0x330
+ tee_shm_register_user_buf+0x78/0x120
+ tee_ioctl+0xd0/0x11a0
+ __arm64_sys_ioctl+0xa8/0xec
+ invoke_syscall+0x48/0x114
+
+Fix this by adding an an explicit call to access_ok() in
+tee_shm_register_user_buf() to catch an invalid user space address
+early.
+
+Fixes: 033ddf12bcf5 ("tee: add register user memory")
+Cc: stable@vger.kernel.org
+Reported-by: Nimish Mishra <neelam.nimish@gmail.com>
+Reported-by: Anirban Chakraborty <ch.anirban00727@gmail.com>
+Reported-by: Debdeep Mukhopadhyay <debdeep.mukhopadhyay@gmail.com>
+Suggested-by: Jerome Forissier <jerome.forissier@linaro.org>
+Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/tee/tee_shm.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/drivers/tee/tee_shm.c
++++ b/drivers/tee/tee_shm.c
+@@ -326,6 +326,9 @@ struct tee_shm *tee_shm_register_user_bu
+ void *ret;
+ int id;
+
++ if (!access_ok((void __user *)addr, length))
++ return ERR_PTR(-EFAULT);
++
+ mutex_lock(&teedev->mutex);
+ id = idr_alloc(&teedev->idr, NULL, 1, 0, GFP_KERNEL);
+ mutex_unlock(&teedev->mutex);