]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.19-stable patches
author: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 7 Nov 2020 15:26:56 +0000 (16:26 +0100)
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 7 Nov 2020 15:26:56 +0000 (16:26 +0100)
added patches:
blktrace-fix-debugfs-use-after-free.patch
btrfs-don-t-submit-any-btree-write-bio-if-the-fs-has-errors.patch
btrfs-extent_io-add-proper-error-handling-to-lock_extent_buffer_for_io.patch
btrfs-extent_io-handle-errors-better-in-btree_write_cache_pages.patch
btrfs-extent_io-handle-errors-better-in-extent_write_full_page.patch
btrfs-extent_io-kill-the-forward-declaration-of-flush_write_bio.patch
btrfs-extent_io-move-the-bug_on-in-flush_write_bio-one-level-up.patch
btrfs-fix-unwritten-extent-buffers-and-hangs-on-future-writeback-attempts.patch
btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch
btrfs-move-btrfs_check_chunk_valid-to-tree-check.-and-export-it.patch
btrfs-tree-checker-check-chunk-item-at-tree-block-read-time.patch
btrfs-tree-checker-enhance-chunk-checker-to-validate-chunk-profile.patch
btrfs-tree-checker-fix-the-error-message-for-transid-error.patch
btrfs-tree-checker-fix-wrong-check-on-max-devid.patch
btrfs-tree-checker-make-btrfs_check_chunk_valid-return-euclean-instead-of-eio.patch
btrfs-tree-checker-make-chunk-item-checker-messages-more-readable.patch
btrfs-tree-checker-verify-dev-item.patch
btrfs-tree-checker-verify-inode-item.patch
revert-btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch

20 files changed:
queue-4.19/blktrace-fix-debugfs-use-after-free.patch [new file with mode: 0644]
queue-4.19/btrfs-don-t-submit-any-btree-write-bio-if-the-fs-has-errors.patch [new file with mode: 0644]
queue-4.19/btrfs-extent_io-add-proper-error-handling-to-lock_extent_buffer_for_io.patch [new file with mode: 0644]
queue-4.19/btrfs-extent_io-handle-errors-better-in-btree_write_cache_pages.patch [new file with mode: 0644]
queue-4.19/btrfs-extent_io-handle-errors-better-in-extent_write_full_page.patch [new file with mode: 0644]
queue-4.19/btrfs-extent_io-kill-the-forward-declaration-of-flush_write_bio.patch [new file with mode: 0644]
queue-4.19/btrfs-extent_io-move-the-bug_on-in-flush_write_bio-one-level-up.patch [new file with mode: 0644]
queue-4.19/btrfs-fix-unwritten-extent-buffers-and-hangs-on-future-writeback-attempts.patch [new file with mode: 0644]
queue-4.19/btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch [new file with mode: 0644]
queue-4.19/btrfs-move-btrfs_check_chunk_valid-to-tree-check.-and-export-it.patch [new file with mode: 0644]
queue-4.19/btrfs-tree-checker-check-chunk-item-at-tree-block-read-time.patch [new file with mode: 0644]
queue-4.19/btrfs-tree-checker-enhance-chunk-checker-to-validate-chunk-profile.patch [new file with mode: 0644]
queue-4.19/btrfs-tree-checker-fix-the-error-message-for-transid-error.patch [new file with mode: 0644]
queue-4.19/btrfs-tree-checker-fix-wrong-check-on-max-devid.patch [new file with mode: 0644]
queue-4.19/btrfs-tree-checker-make-btrfs_check_chunk_valid-return-euclean-instead-of-eio.patch [new file with mode: 0644]
queue-4.19/btrfs-tree-checker-make-chunk-item-checker-messages-more-readable.patch [new file with mode: 0644]
queue-4.19/btrfs-tree-checker-verify-dev-item.patch [new file with mode: 0644]
queue-4.19/btrfs-tree-checker-verify-inode-item.patch [new file with mode: 0644]
queue-4.19/revert-btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch [new file with mode: 0644]
queue-4.19/series

diff --git a/queue-4.19/blktrace-fix-debugfs-use-after-free.patch b/queue-4.19/blktrace-fix-debugfs-use-after-free.patch
new file mode 100644 (file)
index 0000000..2d0ae2b
--- /dev/null
@@ -0,0 +1,215 @@
+From foo@baz Sat Nov  7 04:26:01 PM CET 2020
+From: Luis Chamberlain <mcgrof@kernel.org>
+Date: Fri, 19 Jun 2020 20:47:28 +0000
+Subject: blktrace: fix debugfs use after free
+
+From: Luis Chamberlain <mcgrof@kernel.org>
+
+commit bad8e64fb19d3a0de5e564d9a7271c31bd684369 upstream.
+
+On commit 6ac93117ab00 ("blktrace: use existing disk debugfs directory")
+merged on v4.12 Omar fixed the original blktrace code for request-based
+drivers (multiqueue). This however left in place a possible crash, if you
+happen to abuse blktrace while racing to remove / add a device.
+
+We used to use asynchronous removal of the request_queue, and with that
+the issue was easier to reproduce. Now that we have reverted to
+synchronous removal of the request_queue, the issue is still possible to
+reproduce; it is, however, just a bit more difficult.
+
+We essentially run two instances of break-blktrace which add/remove
+a loop device, and setup a blktrace and just never tear the blktrace
+down. We do this twice in parallel. This is easily reproduced with the
+script run_0004.sh from break-blktrace [0].
+
+We can end up with two types of panics each reflecting where we
+race, one a failed blktrace setup:
+
+[  252.426751] debugfs: Directory 'loop0' with parent 'block' already present!
+[  252.432265] BUG: kernel NULL pointer dereference, address: 00000000000000a0
+[  252.436592] #PF: supervisor write access in kernel mode
+[  252.439822] #PF: error_code(0x0002) - not-present page
+[  252.442967] PGD 0 P4D 0
+[  252.444656] Oops: 0002 [#1] SMP NOPTI
+[  252.446972] CPU: 10 PID: 1153 Comm: break-blktrace Tainted: G            E     5.7.0-rc2-next-20200420+ #164
+[  252.452673] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014
+[  252.456343] RIP: 0010:down_write+0x15/0x40
+[  252.458146] Code: eb ca e8 ae 22 8d ff cc cc cc cc cc cc cc cc cc cc cc cc
+               cc cc 0f 1f 44 00 00 55 48 89 fd e8 52 db ff ff 31 c0 ba 01 00
+               00 00 <f0> 48 0f b1 55 00 75 0f 48 8b 04 25 c0 8b 01 00 48 89
+               45 08 5d
+[  252.463638] RSP: 0018:ffffa626415abcc8 EFLAGS: 00010246
+[  252.464950] RAX: 0000000000000000 RBX: ffff958c25f0f5c0 RCX: ffffff8100000000
+[  252.466727] RDX: 0000000000000001 RSI: ffffff8100000000 RDI: 00000000000000a0
+[  252.468482] RBP: 00000000000000a0 R08: 0000000000000000 R09: 0000000000000001
+[  252.470014] R10: 0000000000000000 R11: ffff958d1f9227ff R12: 0000000000000000
+[  252.471473] R13: ffff958c25ea5380 R14: ffffffff8cce15f1 R15: 00000000000000a0
+[  252.473346] FS:  00007f2e69dee540(0000) GS:ffff958c2fc80000(0000) knlGS:0000000000000000
+[  252.475225] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[  252.476267] CR2: 00000000000000a0 CR3: 0000000427d10004 CR4: 0000000000360ee0
+[  252.477526] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+[  252.478776] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+[  252.479866] Call Trace:
+[  252.480322]  simple_recursive_removal+0x4e/0x2e0
+[  252.481078]  ? debugfs_remove+0x60/0x60
+[  252.481725]  ? relay_destroy_buf+0x77/0xb0
+[  252.482662]  debugfs_remove+0x40/0x60
+[  252.483518]  blk_remove_buf_file_callback+0x5/0x10
+[  252.484328]  relay_close_buf+0x2e/0x60
+[  252.484930]  relay_open+0x1ce/0x2c0
+[  252.485520]  do_blk_trace_setup+0x14f/0x2b0
+[  252.486187]  __blk_trace_setup+0x54/0xb0
+[  252.486803]  blk_trace_ioctl+0x90/0x140
+[  252.487423]  ? do_sys_openat2+0x1ab/0x2d0
+[  252.488053]  blkdev_ioctl+0x4d/0x260
+[  252.488636]  block_ioctl+0x39/0x40
+[  252.489139]  ksys_ioctl+0x87/0xc0
+[  252.489675]  __x64_sys_ioctl+0x16/0x20
+[  252.490380]  do_syscall_64+0x52/0x180
+[  252.491032]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+And the other on the device removal:
+
+[  128.528940] debugfs: Directory 'loop0' with parent 'block' already present!
+[  128.615325] BUG: kernel NULL pointer dereference, address: 00000000000000a0
+[  128.619537] #PF: supervisor write access in kernel mode
+[  128.622700] #PF: error_code(0x0002) - not-present page
+[  128.625842] PGD 0 P4D 0
+[  128.627585] Oops: 0002 [#1] SMP NOPTI
+[  128.629871] CPU: 12 PID: 544 Comm: break-blktrace Tainted: G            E     5.7.0-rc2-next-20200420+ #164
+[  128.635595] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014
+[  128.640471] RIP: 0010:down_write+0x15/0x40
+[  128.643041] Code: eb ca e8 ae 22 8d ff cc cc cc cc cc cc cc cc cc cc cc cc
+               cc cc 0f 1f 44 00 00 55 48 89 fd e8 52 db ff ff 31 c0 ba 01 00
+               00 00 <f0> 48 0f b1 55 00 75 0f 65 48 8b 04 25 c0 8b 01 00 48 89
+               45 08 5d
+[  128.650180] RSP: 0018:ffffa9c3c05ebd78 EFLAGS: 00010246
+[  128.651820] RAX: 0000000000000000 RBX: ffff8ae9a6370240 RCX: ffffff8100000000
+[  128.653942] RDX: 0000000000000001 RSI: ffffff8100000000 RDI: 00000000000000a0
+[  128.655720] RBP: 00000000000000a0 R08: 0000000000000002 R09: ffff8ae9afd2d3d0
+[  128.657400] R10: 0000000000000056 R11: 0000000000000000 R12: 0000000000000000
+[  128.659099] R13: 0000000000000000 R14: 0000000000000003 R15: 00000000000000a0
+[  128.660500] FS:  00007febfd995540(0000) GS:ffff8ae9afd00000(0000) knlGS:0000000000000000
+[  128.662204] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[  128.663426] CR2: 00000000000000a0 CR3: 0000000420042003 CR4: 0000000000360ee0
+[  128.664776] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+[  128.666022] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+[  128.667282] Call Trace:
+[  128.667801]  simple_recursive_removal+0x4e/0x2e0
+[  128.668663]  ? debugfs_remove+0x60/0x60
+[  128.669368]  debugfs_remove+0x40/0x60
+[  128.669985]  blk_trace_free+0xd/0x50
+[  128.670593]  __blk_trace_remove+0x27/0x40
+[  128.671274]  blk_trace_shutdown+0x30/0x40
+[  128.671935]  blk_release_queue+0x95/0xf0
+[  128.672589]  kobject_put+0xa5/0x1b0
+[  128.673188]  disk_release+0xa2/0xc0
+[  128.673786]  device_release+0x28/0x80
+[  128.674376]  kobject_put+0xa5/0x1b0
+[  128.674915]  loop_remove+0x39/0x50 [loop]
+[  128.675511]  loop_control_ioctl+0x113/0x130 [loop]
+[  128.676199]  ksys_ioctl+0x87/0xc0
+[  128.676708]  __x64_sys_ioctl+0x16/0x20
+[  128.677274]  do_syscall_64+0x52/0x180
+[  128.677823]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+The common theme here is:
+
+debugfs: Directory 'loop0' with parent 'block' already present
+
+This crash happens because of how blktrace uses the debugfs directory
+where it places its files. Upon init we always create the same directory
+which would be needed by blktrace but we only do this for make_request
+drivers (multiqueue) block drivers. When you race a removal of these
+devices with a blktrace setup you end up in a situation where the
+make_request recursive debugfs removal will sweep away the blktrace
+files and then later blktrace will also try to remove individual
+dentries which are already NULL. The inverse is also possible and hence
+the two types of use after frees.
+
+We don't create the block debugfs directory on init for these types of
+block devices:
+
+  * request-based block driver block devices
+  * every possible partition
+  * scsi-generic
+
+And so, this race should in theory only be possible with make_request
+drivers.
+
+We can fix the UAF by simply re-using the debugfs directory for
+make_request drivers (multiqueue) and only creating the ephemeral
+directory for the other type of block devices. The new clarifications
+on relying on the q->blk_trace_mutex *and* also checking for q->blk_trace
+*prior* to processing a blktrace ensures the debugfs directories are
+only created if no possible directory name clashes are possible.
+
+This goes tested with:
+
+  o nvme partitions
+  o ISCSI with tgt, and blktracing against scsi-generic with:
+    o block
+    o tape
+    o cdrom
+    o media changer
+  o blktests
+
+This patch is part of the work which disputes the severity of
+CVE-2019-19770 which shows this issue is not a core debugfs issue, but
+a misuse of debugfs within blktrace.
+
+Fixes: 6ac93117ab00 ("blktrace: use existing disk debugfs directory")
+Reported-by: syzbot+603294af2d01acfdd6da@syzkaller.appspotmail.com
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Cc: Bart Van Assche <bvanassche@acm.org>
+Cc: Omar Sandoval <osandov@fb.com>
+Cc: Hannes Reinecke <hare@suse.com>
+Cc: Nicolai Stange <nstange@suse.de>
+Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
+Cc: "James E.J. Bottomley" <jejb@linux.ibm.com>
+Cc: yu kuai <yukuai3@huawei.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+[bwh: Backported to 4.19: open-code queue_is_mq()]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/blktrace.c |   18 ++++++++++++------
+ 1 file changed, 12 insertions(+), 6 deletions(-)
+
+--- a/kernel/trace/blktrace.c
++++ b/kernel/trace/blktrace.c
+@@ -521,10 +521,18 @@ static int do_blk_trace_setup(struct req
+       if (!bt->msg_data)
+               goto err;
+-      ret = -ENOENT;
+-
+-      dir = debugfs_lookup(buts->name, blk_debugfs_root);
+-      if (!dir)
++#ifdef CONFIG_BLK_DEBUG_FS
++      /*
++       * When tracing whole make_request drivers (multiqueue) block devices,
++       * reuse the existing debugfs directory created by the block layer on
++       * init. For request-based block devices, all partitions block devices,
++       * and scsi-generic block devices we create a temporary new debugfs
++       * directory that will be removed once the trace ends.
++       */
++      if (q->mq_ops && bdev && bdev == bdev->bd_contains)
++              dir = q->debugfs_dir;
++      else
++#endif
+               bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
+       if (!dir)
+               goto err;
+@@ -583,8 +591,6 @@ static int do_blk_trace_setup(struct req
+       ret = 0;
+ err:
+-      if (dir && !bt->dir)
+-              dput(dir);
+       if (ret)
+               blk_trace_free(bt);
+       return ret;
diff --git a/queue-4.19/btrfs-don-t-submit-any-btree-write-bio-if-the-fs-has-errors.patch b/queue-4.19/btrfs-don-t-submit-any-btree-write-bio-if-the-fs-has-errors.patch
new file mode 100644 (file)
index 0000000..f87d181
--- /dev/null
@@ -0,0 +1,221 @@
+From foo@baz Sat Nov  7 04:26:01 PM CET 2020
+From: Qu Wenruo <wqu@suse.com>
+Date: Wed, 12 Feb 2020 14:12:44 +0800
+Subject: btrfs: Don't submit any btree write bio if the fs has errors
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit b3ff8f1d380e65dddd772542aa9bff6c86bf715a upstream.
+
+[BUG]
+There is a fuzzed image which could cause KASAN report at unmount time.
+
+  BUG: KASAN: use-after-free in btrfs_queue_work+0x2c1/0x390
+  Read of size 8 at addr ffff888067cf6848 by task umount/1922
+
+  CPU: 0 PID: 1922 Comm: umount Tainted: G        W         5.0.21 #1
+  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
+  Call Trace:
+   dump_stack+0x5b/0x8b
+   print_address_description+0x70/0x280
+   kasan_report+0x13a/0x19b
+   btrfs_queue_work+0x2c1/0x390
+   btrfs_wq_submit_bio+0x1cd/0x240
+   btree_submit_bio_hook+0x18c/0x2a0
+   submit_one_bio+0x1be/0x320
+   flush_write_bio.isra.41+0x2c/0x70
+   btree_write_cache_pages+0x3bb/0x7f0
+   do_writepages+0x5c/0x130
+   __writeback_single_inode+0xa3/0x9a0
+   writeback_single_inode+0x23d/0x390
+   write_inode_now+0x1b5/0x280
+   iput+0x2ef/0x600
+   close_ctree+0x341/0x750
+   generic_shutdown_super+0x126/0x370
+   kill_anon_super+0x31/0x50
+   btrfs_kill_super+0x36/0x2b0
+   deactivate_locked_super+0x80/0xc0
+   deactivate_super+0x13c/0x150
+   cleanup_mnt+0x9a/0x130
+   task_work_run+0x11a/0x1b0
+   exit_to_usermode_loop+0x107/0x130
+   do_syscall_64+0x1e5/0x280
+   entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+[CAUSE]
+The fuzzed image has a completely screwed up extent tree:
+
+  leaf 29421568 gen 8 total ptrs 6 free space 3587 owner EXTENT_TREE
+  refs 2 lock (w:0 r:0 bw:0 br:0 sw:0 sr:0) lock_owner 0 current 5938
+          item 0 key (12587008 168 4096) itemoff 3942 itemsize 53
+                  extent refs 1 gen 9 flags 1
+                  ref#0: extent data backref root 5 objectid 259 offset 0 count 1
+          item 1 key (12591104 168 8192) itemoff 3889 itemsize 53
+                  extent refs 1 gen 9 flags 1
+                  ref#0: extent data backref root 5 objectid 271 offset 0 count 1
+          item 2 key (12599296 168 4096) itemoff 3836 itemsize 53
+                  extent refs 1 gen 9 flags 1
+                  ref#0: extent data backref root 5 objectid 259 offset 4096 count 1
+          item 3 key (29360128 169 0) itemoff 3803 itemsize 33
+                  extent refs 1 gen 9 flags 2
+                  ref#0: tree block backref root 5
+          item 4 key (29368320 169 1) itemoff 3770 itemsize 33
+                  extent refs 1 gen 9 flags 2
+                  ref#0: tree block backref root 5
+          item 5 key (29372416 169 0) itemoff 3737 itemsize 33
+                  extent refs 1 gen 9 flags 2
+                  ref#0: tree block backref root 5
+
+Note that leaf 29421568 doesn't have its backref in the extent tree.
+Thus extent allocator can re-allocate leaf 29421568 for other trees.
+
+In short, the bug is caused by:
+
+- Existing tree block gets allocated to log tree
+  This got its generation bumped.
+
+- Log tree balance cleaned dirty bit of offending tree block
+  It will not be written back to disk, thus no WRITTEN flag.
+
+- Original owner of the tree block gets COWed
+  Since the tree block has higher transid, no WRITTEN flag, it's reused,
+  and not traced by transaction::dirty_pages.
+
+- Transaction aborted
+  Tree blocks get cleaned according to transaction::dirty_pages. But the
+  offending tree block is not recorded at all.
+
+- Filesystem unmount
+  All pages are assumed to be clean, destroying all workqueues, then
+  call iput(btree_inode).
+  But offending tree block is still dirty, which triggers writeback, and
+  causes use-after-free bug.
+
+The detailed sequence looks like this:
+
+- Initial status
+  eb: 29421568, header=WRITTEN bflags_dirty=0, page_dirty=0, gen=8,
+      not traced by any dirty extent_io_tree.
+
+- New tree block is allocated
+  Since there is no backref for 29421568, it's re-allocated as new tree
+  block.
+  Keep in mind that tree block 29421568 is still referred by extent
+  tree.
+
+- Tree block 29421568 is filled for log tree
+  eb: 29421568, header=0 bflags_dirty=1, page_dirty=1, gen=9 << (gen bumped)
+      traced by btrfs_root::dirty_log_pages
+
+- Some log tree operations
+  Since the fs is using node size 4096, the log tree can easily go a
+  level higher.
+
+- Log tree needs balance
+  Tree block 29421568 gets all its content pushed to right, thus now
+  it is empty, and we don't need it.
+  btrfs_clean_tree_block() from __push_leaf_right() get called.
+
+  eb: 29421568, header=0 bflags_dirty=0, page_dirty=0, gen=9
+      traced by btrfs_root::dirty_log_pages
+
+- Log tree write back
+  btree_write_cache_pages() goes through dirty pages ranges, but since
+  page of tree block 29421568 gets cleaned already, it's not written
+  back to disk. Thus it doesn't have WRITTEN bit set.
+  But ranges in dirty_log_pages are cleared.
+
+  eb: 29421568, header=0 bflags_dirty=0, page_dirty=0, gen=9
+      not traced by any dirty extent_io_tree.
+
+- Extent tree update when committing transaction
+  Since tree block 29421568 has transid equal to running trans, and has
+  no WRITTEN bit, should_cow_block() will use it directly without adding
+  it to btrfs_transaction::dirty_pages.
+
+  eb: 29421568, header=0 bflags_dirty=1, page_dirty=1, gen=9
+      not traced by any dirty extent_io_tree.
+
+  At this stage, we're doomed. We have a dirty eb not tracked by any
+  extent io tree.
+
+- Transaction gets aborted due to corrupted extent tree
+  Btrfs cleans up dirty pages according to transaction::dirty_pages and
+  btrfs_root::dirty_log_pages.
+  But since tree block 29421568 is tracked by neither of them, it's
+  still dirty.
+
+  eb: 29421568, header=0 bflags_dirty=1, page_dirty=1, gen=9
+      not traced by any dirty extent_io_tree.
+
+- Filesystem unmount
+  Since all cleanup is assumed to be done, all workqueues are destroyed.
+  Then iput(btree_inode) is called, expecting no dirty pages.
+  But tree 29421568 is still dirty, thus triggering writeback.
+  Since all workqueues are already freed, we cause use-after-free.
+
+This shows us that, log tree blocks + bad extent tree can cause wild
+dirty pages.
+
+[FIX]
+To fix the problem, don't submit any btree write bio if the filesystem
+has any error.  This is the last safety net, just in case other cleanup
+hasn't caught it.
+
+Link: https://github.com/bobfuzzer/CVE/tree/master/CVE-2019-19377
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+[bwh: Backported to 4.19: fs_info variable already exists in
+ btree_write_cache_pages()]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent_io.c |   34 +++++++++++++++++++++++++++++++++-
+ 1 file changed, 33 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -3947,7 +3947,39 @@ retry:
+               end_write_bio(&epd, ret);
+               return ret;
+       }
+-      ret = flush_write_bio(&epd);
++      /*
++       * If something went wrong, don't allow any metadata write bio to be
++       * submitted.
++       *
++       * This would prevent use-after-free if we had dirty pages not
++       * cleaned up, which can still happen by fuzzed images.
++       *
++       * - Bad extent tree
++       *   Allowing existing tree block to be allocated for other trees.
++       *
++       * - Log tree operations
++       *   Existing tree blocks get allocated to log tree, get their
++       *   generation bumped, then get cleaned in tree re-balance.
++       *   Such tree block will not be written back, since it's clean,
++       *   thus no WRITTEN flag set.
++       *   And after log writes back, this tree block is not traced by
++       *   any dirty extent_io_tree.
++       *
++       * - Offending tree block gets re-dirtied from its original owner
++       *   Since it has bumped generation, no WRITTEN flag, it can be
++       *   reused without COWing. This tree block will not be traced
++       *   by btrfs_transaction::dirty_pages.
++       *
++       *   Now such dirty tree block will not be cleaned by any dirty
++       *   extent io tree. Thus we don't want to submit such wild eb
++       *   if the fs already has error.
++       */
++      if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
++              ret = flush_write_bio(&epd);
++      } else {
++              ret = -EUCLEAN;
++              end_write_bio(&epd, ret);
++      }
+       return ret;
+ }
diff --git a/queue-4.19/btrfs-extent_io-add-proper-error-handling-to-lock_extent_buffer_for_io.patch b/queue-4.19/btrfs-extent_io-add-proper-error-handling-to-lock_extent_buffer_for_io.patch
new file mode 100644 (file)
index 0000000..39c24a4
--- /dev/null
@@ -0,0 +1,95 @@
+From foo@baz Sat Nov  7 04:26:01 PM CET 2020
+From: Qu Wenruo <wqu@suse.com>
+Date: Wed, 20 Mar 2019 14:27:46 +0800
+Subject: btrfs: extent_io: add proper error handling to lock_extent_buffer_for_io()
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 2e3c25136adfb293d517e17f761d3b8a43a8fc22 upstream.
+
+This function needs some extra checks on locked pages and eb.  For error
+handling we need to unlock locked pages and the eb.
+
+There is a rare >0 return value branch, where all pages get locked
+while write bio is not flushed.
+
+Thankfully it's handled by the only caller, btree_write_cache_pages(),
+as later write_one_eb() call will trigger submit_one_bio().  So there
+shouldn't be any problem.
+
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent_io.c |   27 ++++++++++++++++++++++-----
+ 1 file changed, 22 insertions(+), 5 deletions(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -3554,19 +3554,27 @@ void wait_on_extent_buffer_writeback(str
+                      TASK_UNINTERRUPTIBLE);
+ }
++/*
++ * Lock eb pages and flush the bio if we can't get the locks
++ *
++ * Return  0 if nothing went wrong
++ * Return >0 is same as 0, except bio is not submitted
++ * Return <0 if something went wrong, no page is locked
++ */
+ static noinline_for_stack int
+ lock_extent_buffer_for_io(struct extent_buffer *eb,
+                         struct btrfs_fs_info *fs_info,
+                         struct extent_page_data *epd)
+ {
+-      int i, num_pages;
++      int i, num_pages, failed_page_nr;
+       int flush = 0;
+       int ret = 0;
+       if (!btrfs_try_tree_write_lock(eb)) {
+-              flush = 1;
+               ret = flush_write_bio(epd);
+-              BUG_ON(ret < 0);
++              if (ret < 0)
++                      return ret;
++              flush = 1;
+               btrfs_tree_lock(eb);
+       }
+@@ -3576,7 +3584,8 @@ lock_extent_buffer_for_io(struct extent_
+                       return 0;
+               if (!flush) {
+                       ret = flush_write_bio(epd);
+-                      BUG_ON(ret < 0);
++                      if (ret < 0)
++                              return ret;
+                       flush = 1;
+               }
+               while (1) {
+@@ -3618,7 +3627,10 @@ lock_extent_buffer_for_io(struct extent_
+               if (!trylock_page(p)) {
+                       if (!flush) {
+                               ret = flush_write_bio(epd);
+-                              BUG_ON(ret < 0);
++                              if (ret < 0) {
++                                      failed_page_nr = i;
++                                      goto err_unlock;
++                              }
+                               flush = 1;
+                       }
+                       lock_page(p);
+@@ -3626,6 +3638,11 @@ lock_extent_buffer_for_io(struct extent_
+       }
+       return ret;
++err_unlock:
++      /* Unlock already locked pages */
++      for (i = 0; i < failed_page_nr; i++)
++              unlock_page(eb->pages[i]);
++      return ret;
+ }
+ static void end_extent_buffer_writeback(struct extent_buffer *eb)
diff --git a/queue-4.19/btrfs-extent_io-handle-errors-better-in-btree_write_cache_pages.patch b/queue-4.19/btrfs-extent_io-handle-errors-better-in-btree_write_cache_pages.patch
new file mode 100644 (file)
index 0000000..23f69ff
--- /dev/null
@@ -0,0 +1,54 @@
+From foo@baz Sat Nov  7 04:26:01 PM CET 2020
+From: Qu Wenruo <wqu@suse.com>
+Date: Wed, 20 Mar 2019 14:27:43 +0800
+Subject: btrfs: extent_io: Handle errors better in btree_write_cache_pages()
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 2b952eea813b1f7e7d4b9782271acd91625b9bb9 upstream.
+
+In btree_write_cache_pages(), we can only get @ret <= 0.
+Add an ASSERT() for it just in case.
+
+Then instead of submitting the write bio even we got some error, check
+the return value first.
+If we have already hit some error, just clean up the corrupted or
+half-baked bio, and return error.
+
+If there is no error so far, then call flush_write_bio() and return the
+result.
+
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent_io.c |    9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -3809,7 +3809,6 @@ int btree_write_cache_pages(struct addre
+               .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+       };
+       int ret = 0;
+-      int flush_ret;
+       int done = 0;
+       int nr_to_write_done = 0;
+       struct pagevec pvec;
+@@ -3909,8 +3908,12 @@ retry:
+               index = 0;
+               goto retry;
+       }
+-      flush_ret = flush_write_bio(&epd);
+-      BUG_ON(flush_ret < 0);
++      ASSERT(ret <= 0);
++      if (ret < 0) {
++              end_write_bio(&epd, ret);
++              return ret;
++      }
++      ret = flush_write_bio(&epd);
+       return ret;
+ }
diff --git a/queue-4.19/btrfs-extent_io-handle-errors-better-in-extent_write_full_page.patch b/queue-4.19/btrfs-extent_io-handle-errors-better-in-extent_write_full_page.patch
new file mode 100644 (file)
index 0000000..272ef9e
--- /dev/null
@@ -0,0 +1,90 @@
+From foo@baz Sat Nov  7 04:26:01 PM CET 2020
+From: Qu Wenruo <wqu@suse.com>
+Date: Wed, 20 Mar 2019 14:27:42 +0800
+Subject: btrfs: extent_io: Handle errors better in extent_write_full_page()
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 3065976b045f77a910809fa7699f99a1e7c0dbbb upstream.
+
+Since now flush_write_bio() could return error, kill the BUG_ON() first.
+Then don't call flush_write_bio() unconditionally, instead we check the
+return value from __extent_writepage() first.
+
+If __extent_writepage() fails, we do cleanup, and return error without
+submitting the possible corrupted or half-baked bio.
+
+If __extent_writepage() successes, then we call flush_write_bio() and
+return the result.
+
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent_io.c |   24 +++++++++++++++++++++---
+ 1 file changed, 21 insertions(+), 3 deletions(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -160,6 +160,16 @@ static int __must_check submit_one_bio(s
+       return blk_status_to_errno(ret);
+ }
++/* Cleanup unsubmitted bios */
++static void end_write_bio(struct extent_page_data *epd, int ret)
++{
++      if (epd->bio) {
++              epd->bio->bi_status = errno_to_blk_status(ret);
++              bio_endio(epd->bio);
++              epd->bio = NULL;
++      }
++}
++
+ /*
+  * Submit bio from extent page data via submit_one_bio
+  *
+@@ -3461,6 +3471,9 @@ done:
+  * records are inserted to lock ranges in the tree, and as dirty areas
+  * are found, they are marked writeback.  Then the lock bits are removed
+  * and the end_io handler clears the writeback ranges
++ *
++ * Return 0 if everything goes well.
++ * Return <0 for error.
+  */
+ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+                             struct extent_page_data *epd)
+@@ -3528,6 +3541,7 @@ done:
+               end_extent_writepage(page, ret, start, page_end);
+       }
+       unlock_page(page);
++      ASSERT(ret <= 0);
+       return ret;
+ done_unlocked:
+@@ -4067,7 +4081,6 @@ retry:
+ int extent_write_full_page(struct page *page, struct writeback_control *wbc)
+ {
+       int ret;
+-      int flush_ret;
+       struct extent_page_data epd = {
+               .bio = NULL,
+               .tree = &BTRFS_I(page->mapping->host)->io_tree,
+@@ -4076,9 +4089,14 @@ int extent_write_full_page(struct page *
+       };
+       ret = __extent_writepage(page, wbc, &epd);
++      ASSERT(ret <= 0);
++      if (ret < 0) {
++              end_write_bio(&epd, ret);
++              return ret;
++      }
+-      flush_ret = flush_write_bio(&epd);
+-      BUG_ON(flush_ret < 0);
++      ret = flush_write_bio(&epd);
++      ASSERT(ret <= 0);
+       return ret;
+ }
diff --git a/queue-4.19/btrfs-extent_io-kill-the-forward-declaration-of-flush_write_bio.patch b/queue-4.19/btrfs-extent_io-kill-the-forward-declaration-of-flush_write_bio.patch
new file mode 100644 (file)
index 0000000..1e79d99
--- /dev/null
@@ -0,0 +1,113 @@
+From foo@baz Sat Nov  7 04:26:01 PM CET 2020
+From: Qu Wenruo <wqu@suse.com>
+Date: Fri, 25 Jan 2019 13:09:15 +0800
+Subject: btrfs: extent_io: Kill the forward declaration of flush_write_bio
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit bb58eb9e167d087cc518f7a71c3c00f1671958da upstream.
+
+There is no need to forward declare flush_write_bio(), as it only
+depends on submit_one_bio().  Both of them are pretty small, just move
+them to kill the forward declaration.
+
+Reviewed-by: Nikolay Borisov <nborisov@suse.com>
+Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+[bwh: Cherry-picked for 4.19 to ease backporting later fixes]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent_io.c |   66 ++++++++++++++++++++++++---------------------------
+ 1 file changed, 32 insertions(+), 34 deletions(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -138,7 +138,38 @@ static int add_extent_changeset(struct e
+       return ret;
+ }
+-static void flush_write_bio(struct extent_page_data *epd);
++static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
++                                     unsigned long bio_flags)
++{
++      blk_status_t ret = 0;
++      struct bio_vec *bvec = bio_last_bvec_all(bio);
++      struct page *page = bvec->bv_page;
++      struct extent_io_tree *tree = bio->bi_private;
++      u64 start;
++
++      start = page_offset(page) + bvec->bv_offset;
++
++      bio->bi_private = NULL;
++
++      if (tree->ops)
++              ret = tree->ops->submit_bio_hook(tree->private_data, bio,
++                                         mirror_num, bio_flags, start);
++      else
++              btrfsic_submit_bio(bio);
++
++      return blk_status_to_errno(ret);
++}
++
++static void flush_write_bio(struct extent_page_data *epd)
++{
++      if (epd->bio) {
++              int ret;
++
++              ret = submit_one_bio(epd->bio, 0, 0);
++              BUG_ON(ret < 0); /* -ENOMEM */
++              epd->bio = NULL;
++      }
++}
+ int __init extent_io_init(void)
+ {
+@@ -2710,28 +2741,6 @@ struct bio *btrfs_bio_clone_partial(stru
+       return bio;
+ }
+-static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+-                                     unsigned long bio_flags)
+-{
+-      blk_status_t ret = 0;
+-      struct bio_vec *bvec = bio_last_bvec_all(bio);
+-      struct page *page = bvec->bv_page;
+-      struct extent_io_tree *tree = bio->bi_private;
+-      u64 start;
+-
+-      start = page_offset(page) + bvec->bv_offset;
+-
+-      bio->bi_private = NULL;
+-
+-      if (tree->ops)
+-              ret = tree->ops->submit_bio_hook(tree->private_data, bio,
+-                                         mirror_num, bio_flags, start);
+-      else
+-              btrfsic_submit_bio(bio);
+-
+-      return blk_status_to_errno(ret);
+-}
+-
+ /*
+  * @opf:      bio REQ_OP_* and REQ_* flags as one value
+  * @tree:     tree so we can call our merge_bio hook
+@@ -4033,17 +4042,6 @@ retry:
+       return ret;
+ }
+-static void flush_write_bio(struct extent_page_data *epd)
+-{
+-      if (epd->bio) {
+-              int ret;
+-
+-              ret = submit_one_bio(epd->bio, 0, 0);
+-              BUG_ON(ret < 0); /* -ENOMEM */
+-              epd->bio = NULL;
+-      }
+-}
+-
+ int extent_write_full_page(struct page *page, struct writeback_control *wbc)
+ {
+       int ret;
diff --git a/queue-4.19/btrfs-extent_io-move-the-bug_on-in-flush_write_bio-one-level-up.patch b/queue-4.19/btrfs-extent_io-move-the-bug_on-in-flush_write_bio-one-level-up.patch
new file mode 100644 (file)
index 0000000..d01e590
--- /dev/null
@@ -0,0 +1,189 @@
+From foo@baz Sat Nov  7 04:26:01 PM CET 2020
+From: Qu Wenruo <wqu@suse.com>
+Date: Wed, 20 Mar 2019 14:27:41 +0800
+Subject: btrfs: extent_io: Move the BUG_ON() in flush_write_bio() one level up
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit f4340622e02261fae599e3da936ff4808b418173 upstream.
+
+We have a BUG_ON() in flush_write_bio() to handle the return value of
+submit_one_bio().
+
+Move the BUG_ON() one level up to all its callers.
+
+This patch introduces a temporary variable, @flush_ret, to keep the code
+change minimal. That variable will be cleaned up when the error handling
+is enhanced later.
+
+Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+[bwh: Cherry-picked for 4.19 to ease backporting later fixes]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent_io.c |   55 ++++++++++++++++++++++++++++++++++++++-------------
+ 1 file changed, 41 insertions(+), 14 deletions(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -160,15 +160,28 @@ static int __must_check submit_one_bio(s
+       return blk_status_to_errno(ret);
+ }
+-static void flush_write_bio(struct extent_page_data *epd)
++/*
++ * Submit bio from extent page data via submit_one_bio
++ *
++ * Return 0 if everything is OK.
++ * Return <0 for error.
++ */
++static int __must_check flush_write_bio(struct extent_page_data *epd)
+ {
+-      if (epd->bio) {
+-              int ret;
++      int ret = 0;
++      if (epd->bio) {
+               ret = submit_one_bio(epd->bio, 0, 0);
+-              BUG_ON(ret < 0); /* -ENOMEM */
++              /*
++               * Clean up of epd->bio is handled by its endio function.
++               * And endio is either triggered by successful bio execution
++               * or the error handler of submit bio hook.
++               * So at this point, no matter what happened, we don't need
++               * to clean up epd->bio.
++               */
+               epd->bio = NULL;
+       }
++      return ret;
+ }
+ int __init extent_io_init(void)
+@@ -3538,7 +3551,8 @@ lock_extent_buffer_for_io(struct extent_
+       if (!btrfs_try_tree_write_lock(eb)) {
+               flush = 1;
+-              flush_write_bio(epd);
++              ret = flush_write_bio(epd);
++              BUG_ON(ret < 0);
+               btrfs_tree_lock(eb);
+       }
+@@ -3547,7 +3561,8 @@ lock_extent_buffer_for_io(struct extent_
+               if (!epd->sync_io)
+                       return 0;
+               if (!flush) {
+-                      flush_write_bio(epd);
++                      ret = flush_write_bio(epd);
++                      BUG_ON(ret < 0);
+                       flush = 1;
+               }
+               while (1) {
+@@ -3588,7 +3603,8 @@ lock_extent_buffer_for_io(struct extent_
+               if (!trylock_page(p)) {
+                       if (!flush) {
+-                              flush_write_bio(epd);
++                              ret = flush_write_bio(epd);
++                              BUG_ON(ret < 0);
+                               flush = 1;
+                       }
+                       lock_page(p);
+@@ -3779,6 +3795,7 @@ int btree_write_cache_pages(struct addre
+               .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+       };
+       int ret = 0;
++      int flush_ret;
+       int done = 0;
+       int nr_to_write_done = 0;
+       struct pagevec pvec;
+@@ -3878,7 +3895,8 @@ retry:
+               index = 0;
+               goto retry;
+       }
+-      flush_write_bio(&epd);
++      flush_ret = flush_write_bio(&epd);
++      BUG_ON(flush_ret < 0);
+       return ret;
+ }
+@@ -3975,7 +3993,8 @@ retry:
+                        * tmpfs file mapping
+                        */
+                       if (!trylock_page(page)) {
+-                              flush_write_bio(epd);
++                              ret = flush_write_bio(epd);
++                              BUG_ON(ret < 0);
+                               lock_page(page);
+                       }
+@@ -3985,8 +4004,10 @@ retry:
+                       }
+                       if (wbc->sync_mode != WB_SYNC_NONE) {
+-                              if (PageWriteback(page))
+-                                      flush_write_bio(epd);
++                              if (PageWriteback(page)) {
++                                      ret = flush_write_bio(epd);
++                                      BUG_ON(ret < 0);
++                              }
+                               wait_on_page_writeback(page);
+                       }
+@@ -4045,6 +4066,7 @@ retry:
+ int extent_write_full_page(struct page *page, struct writeback_control *wbc)
+ {
+       int ret;
++      int flush_ret;
+       struct extent_page_data epd = {
+               .bio = NULL,
+               .tree = &BTRFS_I(page->mapping->host)->io_tree,
+@@ -4054,7 +4076,8 @@ int extent_write_full_page(struct page *
+       ret = __extent_writepage(page, wbc, &epd);
+-      flush_write_bio(&epd);
++      flush_ret = flush_write_bio(&epd);
++      BUG_ON(flush_ret < 0);
+       return ret;
+ }
+@@ -4062,6 +4085,7 @@ int extent_write_locked_range(struct ino
+                             int mode)
+ {
+       int ret = 0;
++      int flush_ret;
+       struct address_space *mapping = inode->i_mapping;
+       struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+       struct page *page;
+@@ -4096,7 +4120,8 @@ int extent_write_locked_range(struct ino
+               start += PAGE_SIZE;
+       }
+-      flush_write_bio(&epd);
++      flush_ret = flush_write_bio(&epd);
++      BUG_ON(flush_ret < 0);
+       return ret;
+ }
+@@ -4104,6 +4129,7 @@ int extent_writepages(struct address_spa
+                     struct writeback_control *wbc)
+ {
+       int ret = 0;
++      int flush_ret;
+       struct extent_page_data epd = {
+               .bio = NULL,
+               .tree = &BTRFS_I(mapping->host)->io_tree,
+@@ -4112,7 +4138,8 @@ int extent_writepages(struct address_spa
+       };
+       ret = extent_write_cache_pages(mapping, wbc, &epd);
+-      flush_write_bio(&epd);
++      flush_ret = flush_write_bio(&epd);
++      BUG_ON(flush_ret < 0);
+       return ret;
+ }
diff --git a/queue-4.19/btrfs-fix-unwritten-extent-buffers-and-hangs-on-future-writeback-attempts.patch b/queue-4.19/btrfs-fix-unwritten-extent-buffers-and-hangs-on-future-writeback-attempts.patch
new file mode 100644 (file)
index 0000000..6b2624e
--- /dev/null
@@ -0,0 +1,146 @@
+From foo@baz Sat Nov  7 04:26:01 PM CET 2020
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 11 Sep 2019 17:42:00 +0100
+Subject: Btrfs: fix unwritten extent buffers and hangs on future writeback attempts
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 18dfa7117a3f379862dcd3f67cadd678013bb9dd upstream.
+
+lock_extent_buffer_for_io() returns 1 to the caller to tell it everything
+went fine and the caller needs to start writeback for the extent buffer
+(submit a bio, etc), 0 to tell the caller everything went fine but it does
+not need to start writeback for the extent buffer, and a negative value if
+some error happened.
+
+When it's about to return 1 it tries to lock all pages, and if a try lock
+on a page fails, and we didn't flush any existing bio in our "epd", it
+calls flush_write_bio(epd) and overwrites the return value of 1 to 0 or
+an error. The page might have been locked elsewhere, not with the goal
+of starting writeback of the extent buffer, and even by some code other
+than btrfs, like page migration for example, so it does not mean the
+writeback of the extent buffer was already started by some other task,
+so returning a 0 tells the caller (btree_write_cache_pages()) to not
+start writeback for the extent buffer. Note that epd might currently have
+either no bio, so flush_write_bio() returns 0 (success) or it might have
+a bio for another extent buffer with a lower index (logical address).
+
+Since we return 0 with the EXTENT_BUFFER_WRITEBACK bit set on the
+extent buffer and writeback is never started for the extent buffer,
+future attempts to writeback the extent buffer will hang forever waiting
+on that bit to be cleared, since it can only be cleared after writeback
+completes. Such hang is reported with a trace like the following:
+
+  [49887.347053] INFO: task btrfs-transacti:1752 blocked for more than 122 seconds.
+  [49887.347059]       Not tainted 5.2.13-gentoo #2
+  [49887.347060] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+  [49887.347062] btrfs-transacti D    0  1752      2 0x80004000
+  [49887.347064] Call Trace:
+  [49887.347069]  ? __schedule+0x265/0x830
+  [49887.347071]  ? bit_wait+0x50/0x50
+  [49887.347072]  ? bit_wait+0x50/0x50
+  [49887.347074]  schedule+0x24/0x90
+  [49887.347075]  io_schedule+0x3c/0x60
+  [49887.347077]  bit_wait_io+0x8/0x50
+  [49887.347079]  __wait_on_bit+0x6c/0x80
+  [49887.347081]  ? __lock_release.isra.29+0x155/0x2d0
+  [49887.347083]  out_of_line_wait_on_bit+0x7b/0x80
+  [49887.347084]  ? var_wake_function+0x20/0x20
+  [49887.347087]  lock_extent_buffer_for_io+0x28c/0x390
+  [49887.347089]  btree_write_cache_pages+0x18e/0x340
+  [49887.347091]  do_writepages+0x29/0xb0
+  [49887.347093]  ? kmem_cache_free+0x132/0x160
+  [49887.347095]  ? convert_extent_bit+0x544/0x680
+  [49887.347097]  filemap_fdatawrite_range+0x70/0x90
+  [49887.347099]  btrfs_write_marked_extents+0x53/0x120
+  [49887.347100]  btrfs_write_and_wait_transaction.isra.4+0x38/0xa0
+  [49887.347102]  btrfs_commit_transaction+0x6bb/0x990
+  [49887.347103]  ? start_transaction+0x33e/0x500
+  [49887.347105]  transaction_kthread+0x139/0x15c
+
+So fix this by not overwriting the return value (ret) with the result
+from flush_write_bio(). We also need to clear the EXTENT_BUFFER_WRITEBACK
+bit in case flush_write_bio() returns an error, otherwise it will hang
+any future attempts to writeback the extent buffer, and undo all work
+done before (set back EXTENT_BUFFER_DIRTY, etc).
+
+This is a regression introduced in the 5.2 kernel.
+
+Fixes: 2e3c25136adfb ("btrfs: extent_io: add proper error handling to lock_extent_buffer_for_io()")
+Fixes: f4340622e0226 ("btrfs: extent_io: Move the BUG_ON() in flush_write_bio() one level up")
+Reported-by: Zdenek Sojka <zsojka@seznam.cz>
+Link: https://lore.kernel.org/linux-btrfs/GpO.2yos.3WGDOLpx6t%7D.1TUDYM@seznam.cz/T/#u
+Reported-by: Stefan Priebe - Profihost AG <s.priebe@profihost.ag>
+Link: https://lore.kernel.org/linux-btrfs/5c4688ac-10a7-fb07-70e8-c5d31a3fbb38@profihost.ag/T/#t
+Reported-by: Drazen Kacar <drazen.kacar@oradian.com>
+Link: https://lore.kernel.org/linux-btrfs/DB8PR03MB562876ECE2319B3E579590F799C80@DB8PR03MB5628.eurprd03.prod.outlook.com/
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=204377
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent_io.c |   35 ++++++++++++++++++++++++++---------
+ 1 file changed, 26 insertions(+), 9 deletions(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -3554,6 +3554,13 @@ void wait_on_extent_buffer_writeback(str
+                      TASK_UNINTERRUPTIBLE);
+ }
++static void end_extent_buffer_writeback(struct extent_buffer *eb)
++{
++      clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
++      smp_mb__after_atomic();
++      wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
++}
++
+ /*
+  * Lock eb pages and flush the bio if we can't the locks
+  *
+@@ -3626,8 +3633,11 @@ lock_extent_buffer_for_io(struct extent_
+               if (!trylock_page(p)) {
+                       if (!flush) {
+-                              ret = flush_write_bio(epd);
+-                              if (ret < 0) {
++                              int err;
++
++                              err = flush_write_bio(epd);
++                              if (err < 0) {
++                                      ret = err;
+                                       failed_page_nr = i;
+                                       goto err_unlock;
+                               }
+@@ -3642,16 +3652,23 @@ err_unlock:
+       /* Unlock already locked pages */
+       for (i = 0; i < failed_page_nr; i++)
+               unlock_page(eb->pages[i]);
++      /*
++       * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
++       * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can
++       * be made and undo everything done before.
++       */
++      btrfs_tree_lock(eb);
++      spin_lock(&eb->refs_lock);
++      set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
++      end_extent_buffer_writeback(eb);
++      spin_unlock(&eb->refs_lock);
++      percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
++                               fs_info->dirty_metadata_batch);
++      btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
++      btrfs_tree_unlock(eb);
+       return ret;
+ }
+-static void end_extent_buffer_writeback(struct extent_buffer *eb)
+-{
+-      clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+-      smp_mb__after_atomic();
+-      wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+-}
+-
+ static void set_btree_ioerr(struct page *page)
+ {
+       struct extent_buffer *eb = (struct extent_buffer *)page->private;
diff --git a/queue-4.19/btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch b/queue-4.19/btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch
new file mode 100644 (file)
index 0000000..60e78a2
--- /dev/null
@@ -0,0 +1,105 @@
+From foo@baz Sat Nov  7 04:26:01 PM CET 2020
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Thu, 23 Jan 2020 15:33:02 -0500
+Subject: btrfs: flush write bio if we loop in extent_write_cache_pages
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit 42ffb0bf584ae5b6b38f72259af1e0ee417ac77f upstream.
+
+There exists a deadlock with range_cyclic that has existed forever.  If
+we loop around with a bio already built we could deadlock with a writer
+who has the page locked that we're attempting to write but is waiting on
+a page in our bio to be written out.  The task traces are as follows
+
+  PID: 1329874  TASK: ffff889ebcdf3800  CPU: 33  COMMAND: "kworker/u113:5"
+   #0 [ffffc900297bb658] __schedule at ffffffff81a4c33f
+   #1 [ffffc900297bb6e0] schedule at ffffffff81a4c6e3
+   #2 [ffffc900297bb6f8] io_schedule at ffffffff81a4ca42
+   #3 [ffffc900297bb708] __lock_page at ffffffff811f145b
+   #4 [ffffc900297bb798] __process_pages_contig at ffffffff814bc502
+   #5 [ffffc900297bb8c8] lock_delalloc_pages at ffffffff814bc684
+   #6 [ffffc900297bb900] find_lock_delalloc_range at ffffffff814be9ff
+   #7 [ffffc900297bb9a0] writepage_delalloc at ffffffff814bebd0
+   #8 [ffffc900297bba18] __extent_writepage at ffffffff814bfbf2
+   #9 [ffffc900297bba98] extent_write_cache_pages at ffffffff814bffbd
+
+  PID: 2167901  TASK: ffff889dc6a59c00  CPU: 14  COMMAND:
+  "aio-dio-invalid"
+   #0 [ffffc9003b50bb18] __schedule at ffffffff81a4c33f
+   #1 [ffffc9003b50bba0] schedule at ffffffff81a4c6e3
+   #2 [ffffc9003b50bbb8] io_schedule at ffffffff81a4ca42
+   #3 [ffffc9003b50bbc8] wait_on_page_bit at ffffffff811f24d6
+   #4 [ffffc9003b50bc60] prepare_pages at ffffffff814b05a7
+   #5 [ffffc9003b50bcd8] btrfs_buffered_write at ffffffff814b1359
+   #6 [ffffc9003b50bdb0] btrfs_file_write_iter at ffffffff814b5933
+   #7 [ffffc9003b50be38] new_sync_write at ffffffff8128f6a8
+   #8 [ffffc9003b50bec8] vfs_write at ffffffff81292b9d
+   #9 [ffffc9003b50bf00] ksys_pwrite64 at ffffffff81293032
+
+I used drgn to find the respective pages we were stuck on
+
+page_entry.page 0xffffea00fbfc7500 index 8148 bit 15 pid 2167901
+page_entry.page 0xffffea00f9bb7400 index 7680 bit 0 pid 1329874
+
+As you can see the kworker is waiting for bit 0 (PG_locked) on index
+7680, and aio-dio-invalid is waiting for bit 15 (PG_writeback) on index
+8148.  aio-dio-invalid has 7680, and the kworker epd looks like the
+following
+
+  crash> struct extent_page_data ffffc900297bbbb0
+  struct extent_page_data {
+    bio = 0xffff889f747ed830,
+    tree = 0xffff889eed6ba448,
+    extent_locked = 0,
+    sync_io = 0
+  }
+
+Probably worth mentioning as well that it waits for writeback of the
+page to complete while holding a lock on it (at prepare_pages()).
+
+Using drgn I walked the bio pages looking for page
+0xffffea00fbfc7500 which is the one we're waiting for writeback on
+
+  bio = Object(prog, 'struct bio', address=0xffff889f747ed830)
+  for i in range(0, bio.bi_vcnt.value_()):
+      bv = bio.bi_io_vec[i]
+      if bv.bv_page.value_() == 0xffffea00fbfc7500:
+         print("FOUND IT")
+
+which validated what I suspected.
+
+The fix for this is simple, flush the epd before we loop back around to
+the beginning of the file during writeout.
+
+Fixes: b293f02e1423 ("Btrfs: Add writepages support")
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent_io.c |   11 ++++++++++-
+ 1 file changed, 10 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -4045,7 +4045,16 @@ retry:
+                */
+               scanned = 1;
+               index = 0;
+-              goto retry;
++
++              /*
++               * If we're looping we could run into a page that is locked by a
++               * writer and that writer could be waiting on writeback for a
++               * page in our current bio, and thus deadlock, so flush the
++               * write bio here.
++               */
++              ret = flush_write_bio(epd);
++              if (!ret)
++                      goto retry;
+       }
+       if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
diff --git a/queue-4.19/btrfs-move-btrfs_check_chunk_valid-to-tree-check.-and-export-it.patch b/queue-4.19/btrfs-move-btrfs_check_chunk_valid-to-tree-check.-and-export-it.patch
new file mode 100644 (file)
index 0000000..b71abe3
--- /dev/null
@@ -0,0 +1,257 @@
+From foo@baz Sat Nov  7 04:26:01 PM CET 2020
+From: Qu Wenruo <wqu@suse.com>
+Date: Wed, 20 Mar 2019 13:16:42 +0800
+Subject: btrfs: Move btrfs_check_chunk_valid() to tree-check.[ch] and export it
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 82fc28fbedbb59642f05215db3b0ef4eb91aa31d upstream.
+
+By function, chunk item verification is more suitable to be done inside
+tree-checker.
+
+So move btrfs_check_chunk_valid() to tree-checker.c and export it.
+
+And since it's now moved to tree-checker, also add a better comment for
+what this function is doing.
+
+Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+[bwh: Cherry-picked for 4.19 to ease backporting later fixes]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-checker.c |   97 ++++++++++++++++++++++++++++++++++++++++++++++++
+ fs/btrfs/tree-checker.h |    4 +
+ fs/btrfs/volumes.c      |   94 ----------------------------------------------
+ 3 files changed, 102 insertions(+), 93 deletions(-)
+
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -449,6 +449,103 @@ static int check_block_group_item(struct
+ }
+ /*
++ * The common chunk check which could also work on super block sys chunk array.
++ *
++ * Return -EIO if anything is corrupted.
++ * Return 0 if everything is OK.
++ */
++int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
++                          struct extent_buffer *leaf,
++                          struct btrfs_chunk *chunk, u64 logical)
++{
++      u64 length;
++      u64 stripe_len;
++      u16 num_stripes;
++      u16 sub_stripes;
++      u64 type;
++      u64 features;
++      bool mixed = false;
++
++      length = btrfs_chunk_length(leaf, chunk);
++      stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
++      num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
++      sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
++      type = btrfs_chunk_type(leaf, chunk);
++
++      if (!num_stripes) {
++              btrfs_err(fs_info, "invalid chunk num_stripes: %u",
++                        num_stripes);
++              return -EIO;
++      }
++      if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
++              btrfs_err(fs_info, "invalid chunk logical %llu", logical);
++              return -EIO;
++      }
++      if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
++              btrfs_err(fs_info, "invalid chunk sectorsize %u",
++                        btrfs_chunk_sector_size(leaf, chunk));
++              return -EIO;
++      }
++      if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
++              btrfs_err(fs_info, "invalid chunk length %llu", length);
++              return -EIO;
++      }
++      if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
++              btrfs_err(fs_info, "invalid chunk stripe length: %llu",
++                        stripe_len);
++              return -EIO;
++      }
++      if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
++          type) {
++              btrfs_err(fs_info, "unrecognized chunk type: %llu",
++                        ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
++                          BTRFS_BLOCK_GROUP_PROFILE_MASK) &
++                        btrfs_chunk_type(leaf, chunk));
++              return -EIO;
++      }
++
++      if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) {
++              btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type);
++              return -EIO;
++      }
++
++      if ((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
++          (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) {
++              btrfs_err(fs_info,
++                      "system chunk with data or metadata type: 0x%llx", type);
++              return -EIO;
++      }
++
++      features = btrfs_super_incompat_flags(fs_info->super_copy);
++      if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
++              mixed = true;
++
++      if (!mixed) {
++              if ((type & BTRFS_BLOCK_GROUP_METADATA) &&
++                  (type & BTRFS_BLOCK_GROUP_DATA)) {
++                      btrfs_err(fs_info,
++                      "mixed chunk type in non-mixed mode: 0x%llx", type);
++                      return -EIO;
++              }
++      }
++
++      if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
++          (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
++          (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
++          (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
++          (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
++          ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) {
++              btrfs_err(fs_info,
++                      "invalid num_stripes:sub_stripes %u:%u for profile %llu",
++                      num_stripes, sub_stripes,
++                      type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
++              return -EIO;
++      }
++
++      return 0;
++}
++
++/*
+  * Common point to switch the item-specific validation.
+  */
+ static int check_leaf_item(struct btrfs_fs_info *fs_info,
+--- a/fs/btrfs/tree-checker.h
++++ b/fs/btrfs/tree-checker.h
+@@ -25,4 +25,8 @@ int btrfs_check_leaf_relaxed(struct btrf
+                            struct extent_buffer *leaf);
+ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node);
++int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
++                          struct extent_buffer *leaf,
++                          struct btrfs_chunk *chunk, u64 logical);
++
+ #endif
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -28,6 +28,7 @@
+ #include "math.h"
+ #include "dev-replace.h"
+ #include "sysfs.h"
++#include "tree-checker.h"
+ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+       [BTRFS_RAID_RAID10] = {
+@@ -6370,99 +6371,6 @@ struct btrfs_device *btrfs_alloc_device(
+       return dev;
+ }
+-/* Return -EIO if any error, otherwise return 0. */
+-static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
+-                                 struct extent_buffer *leaf,
+-                                 struct btrfs_chunk *chunk, u64 logical)
+-{
+-      u64 length;
+-      u64 stripe_len;
+-      u16 num_stripes;
+-      u16 sub_stripes;
+-      u64 type;
+-      u64 features;
+-      bool mixed = false;
+-
+-      length = btrfs_chunk_length(leaf, chunk);
+-      stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+-      num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+-      sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+-      type = btrfs_chunk_type(leaf, chunk);
+-
+-      if (!num_stripes) {
+-              btrfs_err(fs_info, "invalid chunk num_stripes: %u",
+-                        num_stripes);
+-              return -EIO;
+-      }
+-      if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
+-              btrfs_err(fs_info, "invalid chunk logical %llu", logical);
+-              return -EIO;
+-      }
+-      if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
+-              btrfs_err(fs_info, "invalid chunk sectorsize %u",
+-                        btrfs_chunk_sector_size(leaf, chunk));
+-              return -EIO;
+-      }
+-      if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
+-              btrfs_err(fs_info, "invalid chunk length %llu", length);
+-              return -EIO;
+-      }
+-      if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
+-              btrfs_err(fs_info, "invalid chunk stripe length: %llu",
+-                        stripe_len);
+-              return -EIO;
+-      }
+-      if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+-          type) {
+-              btrfs_err(fs_info, "unrecognized chunk type: %llu",
+-                        ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
+-                          BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+-                        btrfs_chunk_type(leaf, chunk));
+-              return -EIO;
+-      }
+-
+-      if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) {
+-              btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type);
+-              return -EIO;
+-      }
+-
+-      if ((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
+-          (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) {
+-              btrfs_err(fs_info,
+-                      "system chunk with data or metadata type: 0x%llx", type);
+-              return -EIO;
+-      }
+-
+-      features = btrfs_super_incompat_flags(fs_info->super_copy);
+-      if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+-              mixed = true;
+-
+-      if (!mixed) {
+-              if ((type & BTRFS_BLOCK_GROUP_METADATA) &&
+-                  (type & BTRFS_BLOCK_GROUP_DATA)) {
+-                      btrfs_err(fs_info,
+-                      "mixed chunk type in non-mixed mode: 0x%llx", type);
+-                      return -EIO;
+-              }
+-      }
+-
+-      if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
+-          (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
+-          (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
+-          (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
+-          (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
+-          ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
+-           num_stripes != 1)) {
+-              btrfs_err(fs_info,
+-                      "invalid num_stripes:sub_stripes %u:%u for profile %llu",
+-                      num_stripes, sub_stripes,
+-                      type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+-              return -EIO;
+-      }
+-
+-      return 0;
+-}
+-
+ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
+                                       u64 devid, u8 *uuid, bool error)
+ {
diff --git a/queue-4.19/btrfs-tree-checker-check-chunk-item-at-tree-block-read-time.patch b/queue-4.19/btrfs-tree-checker-check-chunk-item-at-tree-block-read-time.patch
new file mode 100644 (file)
index 0000000..300897f
--- /dev/null
@@ -0,0 +1,74 @@
+From foo@baz Sat Nov  7 04:26:01 PM CET 2020
+From: Qu Wenruo <wqu@suse.com>
+Date: Wed, 20 Mar 2019 13:42:33 +0800
+Subject: btrfs: tree-checker: Check chunk item at tree block read time
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 075cb3c78fe7976c9f29ca1fa23f9728634ecefc upstream.
+
+Since we have btrfs_check_chunk_valid() in tree-checker, let's do
+chunk item verification in tree-checker too.
+
+Since the tree-checker is run at endio time, if one chunk leaf fails
+chunk verification, we can still retry the other copy, making btrfs more
+robust against fuzzed images, as we may still get a good chunk item.
+
+Also since we have done chunk verification in tree block read time, skip
+the btrfs_check_chunk_valid() call in read_one_chunk() if we're reading
+chunk items from leaf.
+
+Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-checker.c |    6 ++++++
+ fs/btrfs/volumes.c      |   12 +++++++++---
+ 2 files changed, 15 insertions(+), 3 deletions(-)
+
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -608,6 +608,7 @@ static int check_leaf_item(struct btrfs_
+                          struct btrfs_key *key, int slot)
+ {
+       int ret = 0;
++      struct btrfs_chunk *chunk;
+       switch (key->type) {
+       case BTRFS_EXTENT_DATA_KEY:
+@@ -624,6 +625,11 @@ static int check_leaf_item(struct btrfs_
+       case BTRFS_BLOCK_GROUP_ITEM_KEY:
+               ret = check_block_group_item(fs_info, leaf, key, slot);
+               break;
++      case BTRFS_CHUNK_ITEM_KEY:
++              chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
++              ret = btrfs_check_chunk_valid(fs_info, leaf, chunk,
++                                            key->offset);
++              break;
+       }
+       return ret;
+ }
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -6401,9 +6401,15 @@ static int read_one_chunk(struct btrfs_f
+       length = btrfs_chunk_length(leaf, chunk);
+       num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+-      ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
+-      if (ret)
+-              return ret;
++      /*
++       * Only need to verify chunk item if we're reading from sys chunk array,
++       * as chunk item in tree block is already verified by tree-checker.
++       */
++      if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
++              ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
++              if (ret)
++                      return ret;
++      }
+       read_lock(&map_tree->map_tree.lock);
+       em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
diff --git a/queue-4.19/btrfs-tree-checker-enhance-chunk-checker-to-validate-chunk-profile.patch b/queue-4.19/btrfs-tree-checker-enhance-chunk-checker-to-validate-chunk-profile.patch
new file mode 100644 (file)
index 0000000..44dfd91
--- /dev/null
@@ -0,0 +1,44 @@
+From foo@baz Sat Nov  7 04:26:01 PM CET 2020
+From: Qu Wenruo <wqu@suse.com>
+Date: Wed, 13 Mar 2019 12:17:50 +0800
+Subject: btrfs: tree-checker: Enhance chunk checker to validate chunk profile
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 80e46cf22ba0bcb57b39c7c3b52961ab3a0fd5f2 upstream.
+
+Btrfs-progs already have a comprehensive type checker, to ensure there
+is only 0 (SINGLE profile) or 1 (DUP/RAID0/1/5/6/10) bit set for chunk
+profile bits.
+
+Do the same work for kernel.
+
+Reported-by: Yoon Jungyeon <jungyeon@gatech.edu>
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=202765
+Reviewed-by: Nikolay Borisov <nborisov@suse.com>
+Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-checker.c |    7 +++++++
+ 1 file changed, 7 insertions(+)
+
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -556,6 +556,13 @@ int btrfs_check_chunk_valid(struct btrfs
+               return -EUCLEAN;
+       }
++      if (!is_power_of_2(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
++          (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) {
++              chunk_err(fs_info, leaf, chunk, logical,
++              "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set",
++                        type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
++              return -EUCLEAN;
++      }
+       if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) {
+               chunk_err(fs_info, leaf, chunk, logical,
+       "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx",
diff --git a/queue-4.19/btrfs-tree-checker-fix-the-error-message-for-transid-error.patch b/queue-4.19/btrfs-tree-checker-fix-the-error-message-for-transid-error.patch
new file mode 100644 (file)
index 0000000..ca366bc
--- /dev/null
@@ -0,0 +1,36 @@
+From foo@baz Sat Nov  7 04:26:01 PM CET 2020
+From: Qu Wenruo <wqu@suse.com>
+Date: Tue, 25 Aug 2020 21:42:51 +0800
+Subject: btrfs: tree-checker: fix the error message for transid error
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit f96d6960abbc52e26ad124e69e6815283d3e1674 upstream.
+
+The error message for inode transid is the same as for inode generation,
+which makes us unable to detect the real problem.
+
+Reported-by: Tyler Richmond <t.d.richmond@gmail.com>
+Fixes: 496245cac57e ("btrfs: tree-checker: Verify inode item")
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-by: Marcos Paulo de Souza <mpdesouza@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+[bwh: Backported to 4.19: adjust context]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-checker.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -715,7 +715,7 @@ static int check_inode_item(struct btrfs
+       /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */
+       if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) {
+               inode_item_err(fs_info, leaf, slot,
+-                      "invalid inode generation: has %llu expect [0, %llu]",
++                      "invalid inode transid: has %llu expect [0, %llu]",
+                              btrfs_inode_transid(leaf, iitem), super_gen + 1);
+               return -EUCLEAN;
+       }
diff --git a/queue-4.19/btrfs-tree-checker-fix-wrong-check-on-max-devid.patch b/queue-4.19/btrfs-tree-checker-fix-wrong-check-on-max-devid.patch
new file mode 100644 (file)
index 0000000..b17b036
--- /dev/null
@@ -0,0 +1,92 @@
+From foo@baz Sat Nov  7 04:26:01 PM CET 2020
+From: Qu Wenruo <wqu@suse.com>
+Date: Wed, 28 Aug 2019 10:33:13 +0800
+Subject: btrfs: tree-checker: Fix wrong check on max devid
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 8bb177d18f114358a57d8ae7e206861b48b8b4de upstream.
+
+[BUG]
+The following script will cause false alert on devid check.
+  #!/bin/bash
+
+  dev1=/dev/test/test
+  dev2=/dev/test/scratch1
+  mnt=/mnt/btrfs
+
+  umount $dev1 &> /dev/null
+  umount $dev2 &> /dev/null
+  umount $mnt &> /dev/null
+
+  mkfs.btrfs -f $dev1
+
+  mount $dev1 $mnt
+
+  _fail()
+  {
+          echo "!!! FAILED !!!"
+          exit 1
+  }
+
+  for ((i = 0; i < 4096; i++)); do
+          btrfs dev add -f $dev2 $mnt || _fail
+          btrfs dev del $dev1 $mnt || _fail
+          dev_tmp=$dev1
+          dev1=$dev2
+          dev2=$dev_tmp
+  done
+
+[CAUSE]
+Tree-checker uses BTRFS_MAX_DEVS() and BTRFS_MAX_DEVS_SYS_CHUNK() as
+the upper limit for devid.  But we can have devid holes, just like in the
+script above.
+
+So the check for devid is incorrect and could cause false alert.
+
+[FIX]
+Just remove the whole devid check.  We don't have any hard requirement
+for devid assignment.
+
+Furthermore, even if devid gets corrupted by a bitflip, we still have
+dev extents verification at mount time, so corrupted data won't sneak
+in.
+
+This fixes fstests btrfs/194.
+
+Reported-by: Anand Jain <anand.jain@oracle.com>
+Fixes: ab4ba2e13346 ("btrfs: tree-checker: Verify dev item")
+CC: stable@vger.kernel.org # 5.2+
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+[bwh: Backported to 4.19: adjust context]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-checker.c |    7 -------
+ 1 file changed, 7 deletions(-)
+
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -629,7 +629,6 @@ static int check_dev_item(struct btrfs_f
+                         struct btrfs_key *key, int slot)
+ {
+       struct btrfs_dev_item *ditem;
+-      u64 max_devid = max(BTRFS_MAX_DEVS(fs_info), BTRFS_MAX_DEVS_SYS_CHUNK);
+       if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) {
+               dev_item_err(fs_info, leaf, slot,
+@@ -637,12 +636,6 @@ static int check_dev_item(struct btrfs_f
+                            key->objectid, BTRFS_DEV_ITEMS_OBJECTID);
+               return -EUCLEAN;
+       }
+-      if (key->offset > max_devid) {
+-              dev_item_err(fs_info, leaf, slot,
+-                           "invalid devid: has=%llu expect=[0, %llu]",
+-                           key->offset, max_devid);
+-              return -EUCLEAN;
+-      }
+       ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item);
+       if (btrfs_device_id(leaf, ditem) != key->offset) {
+               dev_item_err(fs_info, leaf, slot,
diff --git a/queue-4.19/btrfs-tree-checker-make-btrfs_check_chunk_valid-return-euclean-instead-of-eio.patch b/queue-4.19/btrfs-tree-checker-make-btrfs_check_chunk_valid-return-euclean-instead-of-eio.patch
new file mode 100644 (file)
index 0000000..969792e
--- /dev/null
@@ -0,0 +1,114 @@
+From foo@baz Sat Nov  7 04:26:01 PM CET 2020
+From: Qu Wenruo <wqu@suse.com>
+Date: Wed, 20 Mar 2019 13:39:14 +0800
+Subject: btrfs: tree-checker: Make btrfs_check_chunk_valid() return EUCLEAN instead of EIO
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit bf871c3b43b1dcc3f2a076ff39a8f1ce7959d958 upstream.
+
+To follow the standard behavior of tree-checker.
+
+Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+[bwh: Cherry-picked for 4.19 to ease backporting later fixes]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-checker.c |   22 +++++++++++-----------
+ 1 file changed, 11 insertions(+), 11 deletions(-)
+
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -496,7 +496,7 @@ static void chunk_err(const struct btrfs
+ /*
+  * The common chunk check which could also work on super block sys chunk array.
+  *
+- * Return -EIO if anything is corrupted.
++ * Return -EUCLEAN if anything is corrupted.
+  * Return 0 if everything is OK.
+  */
+ int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
+@@ -520,31 +520,31 @@ int btrfs_check_chunk_valid(struct btrfs
+       if (!num_stripes) {
+               chunk_err(fs_info, leaf, chunk, logical,
+                         "invalid chunk num_stripes, have %u", num_stripes);
+-              return -EIO;
++              return -EUCLEAN;
+       }
+       if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
+               chunk_err(fs_info, leaf, chunk, logical,
+               "invalid chunk logical, have %llu should aligned to %u",
+                         logical, fs_info->sectorsize);
+-              return -EIO;
++              return -EUCLEAN;
+       }
+       if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
+               chunk_err(fs_info, leaf, chunk, logical,
+                         "invalid chunk sectorsize, have %u expect %u",
+                         btrfs_chunk_sector_size(leaf, chunk),
+                         fs_info->sectorsize);
+-              return -EIO;
++              return -EUCLEAN;
+       }
+       if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
+               chunk_err(fs_info, leaf, chunk, logical,
+                         "invalid chunk length, have %llu", length);
+-              return -EIO;
++              return -EUCLEAN;
+       }
+       if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
+               chunk_err(fs_info, leaf, chunk, logical,
+                         "invalid chunk stripe length: %llu",
+                         stripe_len);
+-              return -EIO;
++              return -EUCLEAN;
+       }
+       if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+           type) {
+@@ -553,14 +553,14 @@ int btrfs_check_chunk_valid(struct btrfs
+                         ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
+                           BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+                         btrfs_chunk_type(leaf, chunk));
+-              return -EIO;
++              return -EUCLEAN;
+       }
+       if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) {
+               chunk_err(fs_info, leaf, chunk, logical,
+       "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx",
+                         type, BTRFS_BLOCK_GROUP_TYPE_MASK);
+-              return -EIO;
++              return -EUCLEAN;
+       }
+       if ((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
+@@ -568,7 +568,7 @@ int btrfs_check_chunk_valid(struct btrfs
+               chunk_err(fs_info, leaf, chunk, logical,
+                         "system chunk with data or metadata type: 0x%llx",
+                         type);
+-              return -EIO;
++              return -EUCLEAN;
+       }
+       features = btrfs_super_incompat_flags(fs_info->super_copy);
+@@ -580,7 +580,7 @@ int btrfs_check_chunk_valid(struct btrfs
+                   (type & BTRFS_BLOCK_GROUP_DATA)) {
+                       chunk_err(fs_info, leaf, chunk, logical,
+                       "mixed chunk type in non-mixed mode: 0x%llx", type);
+-                      return -EIO;
++                      return -EUCLEAN;
+               }
+       }
+@@ -594,7 +594,7 @@ int btrfs_check_chunk_valid(struct btrfs
+                       "invalid num_stripes:sub_stripes %u:%u for profile %llu",
+                       num_stripes, sub_stripes,
+                       type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+-              return -EIO;
++              return -EUCLEAN;
+       }
+       return 0;
diff --git a/queue-4.19/btrfs-tree-checker-make-chunk-item-checker-messages-more-readable.patch b/queue-4.19/btrfs-tree-checker-make-chunk-item-checker-messages-more-readable.patch
new file mode 100644 (file)
index 0000000..60c7cf3
--- /dev/null
@@ -0,0 +1,172 @@
+From foo@baz Sat Nov  7 04:26:01 PM CET 2020
+From: Qu Wenruo <wqu@suse.com>
+Date: Wed, 20 Mar 2019 13:36:06 +0800
+Subject: btrfs: tree-checker: Make chunk item checker messages more readable
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit f114024376bceb1c0f61a7bad4a72a0f978767af upstream.
+
+Old error message would be something like:
+  BTRFS error (device dm-3): invalid chunk num_stripes: 0
+
+New error message would be:
+  Btrfs critical (device dm-3): corrupt superblock syschunk array: chunk_start=2097152, invalid chunk num_stripes: 0
+Or
+  Btrfs critical (device dm-3): corrupt leaf: root=3 block=8388608 slot=3 chunk_start=2097152, invalid chunk num_stripes: 0
+
+And for certain error message, also output expected value.
+
+The error message levels are changed from error to critical.
+
+Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+[bwh: Cherry-picked for 4.19 to ease backporting later fixes]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-checker.c |   81 ++++++++++++++++++++++++++++++++++++++++--------
+ 1 file changed, 68 insertions(+), 13 deletions(-)
+
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -448,6 +448,51 @@ static int check_block_group_item(struct
+       return 0;
+ }
++__printf(5, 6)
++__cold
++static void chunk_err(const struct btrfs_fs_info *fs_info,
++                    const struct extent_buffer *leaf,
++                    const struct btrfs_chunk *chunk, u64 logical,
++                    const char *fmt, ...)
++{
++      bool is_sb;
++      struct va_format vaf;
++      va_list args;
++      int i;
++      int slot = -1;
++
++      /* Only superblock eb is able to have such small offset */
++      is_sb = (leaf->start == BTRFS_SUPER_INFO_OFFSET);
++
++      if (!is_sb) {
++              /*
++               * Get the slot number by iterating through all slots, this
++               * would provide better readability.
++               */
++              for (i = 0; i < btrfs_header_nritems(leaf); i++) {
++                      if (btrfs_item_ptr_offset(leaf, i) ==
++                                      (unsigned long)chunk) {
++                              slot = i;
++                              break;
++                      }
++              }
++      }
++      va_start(args, fmt);
++      vaf.fmt = fmt;
++      vaf.va = &args;
++
++      if (is_sb)
++              btrfs_crit(fs_info,
++              "corrupt superblock syschunk array: chunk_start=%llu, %pV",
++                         logical, &vaf);
++      else
++              btrfs_crit(fs_info,
++      "corrupt leaf: root=%llu block=%llu slot=%d chunk_start=%llu, %pV",
++                         BTRFS_CHUNK_TREE_OBJECTID, leaf->start, slot,
++                         logical, &vaf);
++      va_end(args);
++}
++
+ /*
+  * The common chunk check which could also work on super block sys chunk array.
+  *
+@@ -473,31 +518,38 @@ int btrfs_check_chunk_valid(struct btrfs
+       type = btrfs_chunk_type(leaf, chunk);
+       if (!num_stripes) {
+-              btrfs_err(fs_info, "invalid chunk num_stripes: %u",
+-                        num_stripes);
++              chunk_err(fs_info, leaf, chunk, logical,
++                        "invalid chunk num_stripes, have %u", num_stripes);
+               return -EIO;
+       }
+       if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
+-              btrfs_err(fs_info, "invalid chunk logical %llu", logical);
++              chunk_err(fs_info, leaf, chunk, logical,
++              "invalid chunk logical, have %llu should aligned to %u",
++                        logical, fs_info->sectorsize);
+               return -EIO;
+       }
+       if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
+-              btrfs_err(fs_info, "invalid chunk sectorsize %u",
+-                        btrfs_chunk_sector_size(leaf, chunk));
++              chunk_err(fs_info, leaf, chunk, logical,
++                        "invalid chunk sectorsize, have %u expect %u",
++                        btrfs_chunk_sector_size(leaf, chunk),
++                        fs_info->sectorsize);
+               return -EIO;
+       }
+       if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
+-              btrfs_err(fs_info, "invalid chunk length %llu", length);
++              chunk_err(fs_info, leaf, chunk, logical,
++                        "invalid chunk length, have %llu", length);
+               return -EIO;
+       }
+       if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
+-              btrfs_err(fs_info, "invalid chunk stripe length: %llu",
++              chunk_err(fs_info, leaf, chunk, logical,
++                        "invalid chunk stripe length: %llu",
+                         stripe_len);
+               return -EIO;
+       }
+       if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+           type) {
+-              btrfs_err(fs_info, "unrecognized chunk type: %llu",
++              chunk_err(fs_info, leaf, chunk, logical,
++                        "unrecognized chunk type: 0x%llx",
+                         ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
+                           BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+                         btrfs_chunk_type(leaf, chunk));
+@@ -505,14 +557,17 @@ int btrfs_check_chunk_valid(struct btrfs
+       }
+       if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) {
+-              btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type);
++              chunk_err(fs_info, leaf, chunk, logical,
++      "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx",
++                        type, BTRFS_BLOCK_GROUP_TYPE_MASK);
+               return -EIO;
+       }
+       if ((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
+           (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) {
+-              btrfs_err(fs_info,
+-                      "system chunk with data or metadata type: 0x%llx", type);
++              chunk_err(fs_info, leaf, chunk, logical,
++                        "system chunk with data or metadata type: 0x%llx",
++                        type);
+               return -EIO;
+       }
+@@ -523,7 +578,7 @@ int btrfs_check_chunk_valid(struct btrfs
+       if (!mixed) {
+               if ((type & BTRFS_BLOCK_GROUP_METADATA) &&
+                   (type & BTRFS_BLOCK_GROUP_DATA)) {
+-                      btrfs_err(fs_info,
++                      chunk_err(fs_info, leaf, chunk, logical,
+                       "mixed chunk type in non-mixed mode: 0x%llx", type);
+                       return -EIO;
+               }
+@@ -535,7 +590,7 @@ int btrfs_check_chunk_valid(struct btrfs
+           (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
+           (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
+           ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) {
+-              btrfs_err(fs_info,
++              chunk_err(fs_info, leaf, chunk, logical,
+                       "invalid num_stripes:sub_stripes %u:%u for profile %llu",
+                       num_stripes, sub_stripes,
+                       type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
diff --git a/queue-4.19/btrfs-tree-checker-verify-dev-item.patch b/queue-4.19/btrfs-tree-checker-verify-dev-item.patch
new file mode 100644 (file)
index 0000000..7ceb4bd
--- /dev/null
@@ -0,0 +1,172 @@
+From foo@baz Sat Nov  7 04:26:01 PM CET 2020
+From: Qu Wenruo <wqu@suse.com>
+Date: Fri, 8 Mar 2019 14:20:03 +0800
+Subject: btrfs: tree-checker: Verify dev item
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit ab4ba2e133463c702b37242560d7fabedd2dc750 upstream.
+
+[BUG]
+For a fuzzed image whose DEV_ITEM has an invalid total_bytes of 0, the
+kernel will just panic:
+  BUG: unable to handle kernel NULL pointer dereference at 0000000000000098
+  #PF error: [normal kernel read fault]
+  PGD 800000022b2bd067 P4D 800000022b2bd067 PUD 22b2bc067 PMD 0
+  Oops: 0000 [#1] SMP PTI
+  CPU: 0 PID: 1106 Comm: mount Not tainted 5.0.0-rc8+ #9
+  RIP: 0010:btrfs_verify_dev_extents+0x2a5/0x5a0
+  Call Trace:
+   open_ctree+0x160d/0x2149
+   btrfs_mount_root+0x5b2/0x680
+
+[CAUSE]
+If device extent verification finds a device with 0 total_bytes, then it
+assumes it's a seed dummy and then searches for seed devices.
+
+But in this case, there is no seed device at all, causing NULL pointer.
+
+[FIX]
+Since this is caused by fuzzed image, let's go the tree-check way, just
+add a new verification for device item.
+
+Reported-by: Yoon Jungyeon <jungyeon@gatech.edu>
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=202691
+Reviewed-by: Nikolay Borisov <nborisov@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-checker.c |   74 ++++++++++++++++++++++++++++++++++++++++++++++++
+ fs/btrfs/volumes.c      |    9 -----
+ fs/btrfs/volumes.h      |    9 +++++
+ 3 files changed, 83 insertions(+), 9 deletions(-)
+
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -600,6 +600,77 @@ int btrfs_check_chunk_valid(struct btrfs
+       return 0;
+ }
++__printf(4, 5)
++__cold
++static void dev_item_err(const struct btrfs_fs_info *fs_info,
++                       const struct extent_buffer *eb, int slot,
++                       const char *fmt, ...)
++{
++      struct btrfs_key key;
++      struct va_format vaf;
++      va_list args;
++
++      btrfs_item_key_to_cpu(eb, &key, slot);
++      va_start(args, fmt);
++
++      vaf.fmt = fmt;
++      vaf.va = &args;
++
++      btrfs_crit(fs_info,
++      "corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV",
++              btrfs_header_level(eb) == 0 ? "leaf" : "node",
++              btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
++              key.objectid, &vaf);
++      va_end(args);
++}
++
++static int check_dev_item(struct btrfs_fs_info *fs_info,
++                        struct extent_buffer *leaf,
++                        struct btrfs_key *key, int slot)
++{
++      struct btrfs_dev_item *ditem;
++      u64 max_devid = max(BTRFS_MAX_DEVS(fs_info), BTRFS_MAX_DEVS_SYS_CHUNK);
++
++      if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) {
++              dev_item_err(fs_info, leaf, slot,
++                           "invalid objectid: has=%llu expect=%llu",
++                           key->objectid, BTRFS_DEV_ITEMS_OBJECTID);
++              return -EUCLEAN;
++      }
++      if (key->offset > max_devid) {
++              dev_item_err(fs_info, leaf, slot,
++                           "invalid devid: has=%llu expect=[0, %llu]",
++                           key->offset, max_devid);
++              return -EUCLEAN;
++      }
++      ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item);
++      if (btrfs_device_id(leaf, ditem) != key->offset) {
++              dev_item_err(fs_info, leaf, slot,
++                           "devid mismatch: key has=%llu item has=%llu",
++                           key->offset, btrfs_device_id(leaf, ditem));
++              return -EUCLEAN;
++      }
++
++      /*
++       * For device total_bytes, we don't have reliable way to check it, as
++       * it can be 0 for device removal. Device size check can only be done
++       * by dev extents check.
++       */
++      if (btrfs_device_bytes_used(leaf, ditem) >
++          btrfs_device_total_bytes(leaf, ditem)) {
++              dev_item_err(fs_info, leaf, slot,
++                           "invalid bytes used: have %llu expect [0, %llu]",
++                           btrfs_device_bytes_used(leaf, ditem),
++                           btrfs_device_total_bytes(leaf, ditem));
++              return -EUCLEAN;
++      }
++      /*
++       * Remaining members like io_align/type/gen/dev_group aren't really
++       * utilized.  Skip them to make later usage of them easier.
++       */
++      return 0;
++}
++
+ /*
+  * Common point to switch the item-specific validation.
+  */
+@@ -630,6 +701,9 @@ static int check_leaf_item(struct btrfs_
+               ret = btrfs_check_chunk_valid(fs_info, leaf, chunk,
+                                             key->offset);
+               break;
++      case BTRFS_DEV_ITEM_KEY:
++              ret = check_dev_item(fs_info, leaf, key, slot);
++              break;
+       }
+       return ret;
+ }
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -4606,15 +4606,6 @@ static void check_raid56_incompat_flag(s
+       btrfs_set_fs_incompat(info, RAID56);
+ }
+-#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info)      \
+-                      - sizeof(struct btrfs_chunk))           \
+-                      / sizeof(struct btrfs_stripe) + 1)
+-
+-#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE      \
+-                              - 2 * sizeof(struct btrfs_disk_key)     \
+-                              - 2 * sizeof(struct btrfs_chunk))       \
+-                              / sizeof(struct btrfs_stripe) + 1)
+-
+ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+                              u64 start, u64 type)
+ {
+--- a/fs/btrfs/volumes.h
++++ b/fs/btrfs/volumes.h
+@@ -257,6 +257,15 @@ struct btrfs_fs_devices {
+ #define BTRFS_BIO_INLINE_CSUM_SIZE    64
++#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info)      \
++                      - sizeof(struct btrfs_chunk))           \
++                      / sizeof(struct btrfs_stripe) + 1)
++
++#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE      \
++                              - 2 * sizeof(struct btrfs_disk_key)     \
++                              - 2 * sizeof(struct btrfs_chunk))       \
++                              / sizeof(struct btrfs_stripe) + 1)
++
+ /*
+  * we need the mirror number and stripe index to be passed around
+  * the call chain while we are processing end_io (especially errors).
diff --git a/queue-4.19/btrfs-tree-checker-verify-inode-item.patch b/queue-4.19/btrfs-tree-checker-verify-inode-item.patch
new file mode 100644 (file)
index 0000000..6b2e658
--- /dev/null
@@ -0,0 +1,183 @@
+From foo@baz Sat Nov  7 04:26:01 PM CET 2020
+From: Qu Wenruo <wqu@suse.com>
+Date: Wed, 13 Mar 2019 14:31:35 +0800
+Subject: btrfs: tree-checker: Verify inode item
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 496245cac57e26d8b738d85c7a29cf9a47610f3f upstream.
+
+There is a report in the kernel bugzilla about a file type mismatch
+between a dir item and its inode item.
+
+This inspires us to check inode mode in inode item.
+
+This patch will check the following members:
+
+- inode key objectid
+  Should be ROOT_TREE_DIR or [256, (u64)-256] or FREE_INO.
+
+- inode key offset
+  Should be 0
+
+- inode item generation
+- inode item transid
+  Must be no newer than the superblock generation + 1.
+  The +1 accounts for the log tree.
+
+- inode item mode
+  No unknown bits.
+  No invalid S_IF* bit.
+  NOTE: An S_IFMT check is not enough; every known type must be checked.
+
+- inode item nlink
+  A dir should have no more than 1 link.
+
+- inode item flags (no bits outside BTRFS_INODE_FLAG_MASK)
+
+Reviewed-by: Nikolay Borisov <nborisov@suse.com>
+Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/ctree.h        |   15 +++++++
+ fs/btrfs/tree-checker.c |   94 ++++++++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 109 insertions(+)
+
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -1459,6 +1459,21 @@ do {
+ #define BTRFS_INODE_ROOT_ITEM_INIT    (1 << 31)
++#define BTRFS_INODE_FLAG_MASK                                         \
++      (BTRFS_INODE_NODATASUM |                                        \
++       BTRFS_INODE_NODATACOW |                                        \
++       BTRFS_INODE_READONLY |                                         \
++       BTRFS_INODE_NOCOMPRESS |                                       \
++       BTRFS_INODE_PREALLOC |                                         \
++       BTRFS_INODE_SYNC |                                             \
++       BTRFS_INODE_IMMUTABLE |                                        \
++       BTRFS_INODE_APPEND |                                           \
++       BTRFS_INODE_NODUMP |                                           \
++       BTRFS_INODE_NOATIME |                                          \
++       BTRFS_INODE_DIRSYNC |                                          \
++       BTRFS_INODE_COMPRESS |                                         \
++       BTRFS_INODE_ROOT_ITEM_INIT)
++
+ struct btrfs_map_token {
+       const struct extent_buffer *eb;
+       char *kaddr;
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -671,6 +671,97 @@ static int check_dev_item(struct btrfs_f
+       return 0;
+ }
++/* Inode item error output has the same format as dir_item_err() */
++#define inode_item_err(fs_info, eb, slot, fmt, ...)                   \
++      dir_item_err(fs_info, eb, slot, fmt, __VA_ARGS__)
++
++static int check_inode_item(struct btrfs_fs_info *fs_info,
++                          struct extent_buffer *leaf,
++                          struct btrfs_key *key, int slot)
++{
++      struct btrfs_inode_item *iitem;
++      u64 super_gen = btrfs_super_generation(fs_info->super_copy);
++      u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777);
++      u32 mode;
++
++      if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID ||
++           key->objectid > BTRFS_LAST_FREE_OBJECTID) &&
++          key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID &&
++          key->objectid != BTRFS_FREE_INO_OBJECTID) {
++              generic_err(fs_info, leaf, slot,
++      "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu",
++                          key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID,
++                          BTRFS_FIRST_FREE_OBJECTID,
++                          BTRFS_LAST_FREE_OBJECTID,
++                          BTRFS_FREE_INO_OBJECTID);
++              return -EUCLEAN;
++      }
++      if (key->offset != 0) {
++              inode_item_err(fs_info, leaf, slot,
++                      "invalid key offset: has %llu expect 0",
++                      key->offset);
++              return -EUCLEAN;
++      }
++      iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);
++
++      /* Here we use super block generation + 1 to handle log tree */
++      if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) {
++              inode_item_err(fs_info, leaf, slot,
++                      "invalid inode generation: has %llu expect (0, %llu]",
++                             btrfs_inode_generation(leaf, iitem),
++                             super_gen + 1);
++              return -EUCLEAN;
++      }
++      /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */
++      if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) {
++              inode_item_err(fs_info, leaf, slot,
++                      "invalid inode generation: has %llu expect [0, %llu]",
++                             btrfs_inode_transid(leaf, iitem), super_gen + 1);
++              return -EUCLEAN;
++      }
++
++      /*
++       * For size and nbytes it's better not to be too strict, as for dir
++       * item its size/nbytes can easily get wrong, but doesn't affect
++       * anything in the fs. So here we skip the check.
++       */
++      mode = btrfs_inode_mode(leaf, iitem);
++      if (mode & ~valid_mask) {
++              inode_item_err(fs_info, leaf, slot,
++                             "unknown mode bit detected: 0x%x",
++                             mode & ~valid_mask);
++              return -EUCLEAN;
++      }
++
++      /*
++       * S_IFMT is not bit mapped so we can't completely rely on is_power_of_2,
++       * but is_power_of_2() can save us from checking FIFO/CHR/DIR/REG.
++       * Only needs to check BLK, LNK and SOCKS
++       */
++      if (!is_power_of_2(mode & S_IFMT)) {
++              if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) {
++                      inode_item_err(fs_info, leaf, slot,
++                      "invalid mode: has 0%o expect valid S_IF* bit(s)",
++                                     mode & S_IFMT);
++                      return -EUCLEAN;
++              }
++      }
++      if (S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1) {
++              inode_item_err(fs_info, leaf, slot,
++                     "invalid nlink: has %u expect no more than 1 for dir",
++                      btrfs_inode_nlink(leaf, iitem));
++              return -EUCLEAN;
++      }
++      if (btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) {
++              inode_item_err(fs_info, leaf, slot,
++                             "unknown flags detected: 0x%llx",
++                             btrfs_inode_flags(leaf, iitem) &
++                             ~BTRFS_INODE_FLAG_MASK);
++              return -EUCLEAN;
++      }
++      return 0;
++}
++
+ /*
+  * Common point to switch the item-specific validation.
+  */
+@@ -704,6 +795,9 @@ static int check_leaf_item(struct btrfs_
+       case BTRFS_DEV_ITEM_KEY:
+               ret = check_dev_item(fs_info, leaf, key, slot);
+               break;
++      case BTRFS_INODE_ITEM_KEY:
++              ret = check_inode_item(fs_info, leaf, key, slot);
++              break;
+       }
+       return ret;
+ }
diff --git a/queue-4.19/revert-btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch b/queue-4.19/revert-btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch
new file mode 100644 (file)
index 0000000..96202a2
--- /dev/null
@@ -0,0 +1,35 @@
+From foo@baz Sat Nov  7 04:26:01 PM CET 2020
+From: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Date: Mon, 12 Oct 2020 23:18:11 +0100
+Subject: Revert "btrfs: flush write bio if we loop in extent_write_cache_pages"
+
+From: Ben Hutchings <ben.hutchings@codethink.co.uk>
+
+This reverts commit 860473714cbe7fbedcf92bfe3eb6d69fae8c74ff.  That
+has an incorrect upstream commit reference, and was modified in a way
+that conflicts with some older fixes.  We can cleanly cherry-pick the
+upstream commit *after* those fixes.
+
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent_io.c |    8 --------
+ 1 file changed, 8 deletions(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -4045,14 +4045,6 @@ retry:
+                */
+               scanned = 1;
+               index = 0;
+-
+-              /*
+-               * If we're looping we could run into a page that is locked by a
+-               * writer and that writer could be waiting on writeback for a
+-               * page in our current bio, and thus deadlock, so flush the
+-               * write bio here.
+-               */
+-              flush_write_bio(epd);
+               goto retry;
+       }
index be61efd9a6a019c48d25a1a85ca71909bbe178dd..7f6d2d70a5e1daf828ae7ad5daf6a125758b0eb1 100644 (file)
@@ -9,3 +9,22 @@ gianfar-account-for-tx-ptp-timestamp-in-the-skb-headroom.patch
 net-usb-qmi_wwan-add-telit-le910cx-0x1230-composition.patch
 sctp-fix-comm_lost-cant_str_assoc-err-reporting-on-big-endian-platforms.patch
 sfp-fix-error-handing-in-sfp_probe.patch
+blktrace-fix-debugfs-use-after-free.patch
+btrfs-extent_io-kill-the-forward-declaration-of-flush_write_bio.patch
+btrfs-extent_io-move-the-bug_on-in-flush_write_bio-one-level-up.patch
+revert-btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch
+btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch
+btrfs-extent_io-handle-errors-better-in-extent_write_full_page.patch
+btrfs-extent_io-handle-errors-better-in-btree_write_cache_pages.patch
+btrfs-extent_io-add-proper-error-handling-to-lock_extent_buffer_for_io.patch
+btrfs-fix-unwritten-extent-buffers-and-hangs-on-future-writeback-attempts.patch
+btrfs-don-t-submit-any-btree-write-bio-if-the-fs-has-errors.patch
+btrfs-move-btrfs_check_chunk_valid-to-tree-check.-and-export-it.patch
+btrfs-tree-checker-make-chunk-item-checker-messages-more-readable.patch
+btrfs-tree-checker-make-btrfs_check_chunk_valid-return-euclean-instead-of-eio.patch
+btrfs-tree-checker-check-chunk-item-at-tree-block-read-time.patch
+btrfs-tree-checker-verify-dev-item.patch
+btrfs-tree-checker-fix-wrong-check-on-max-devid.patch
+btrfs-tree-checker-enhance-chunk-checker-to-validate-chunk-profile.patch
+btrfs-tree-checker-verify-inode-item.patch
+btrfs-tree-checker-fix-the-error-message-for-transid-error.patch