From d38a5f16d282e147974eedec0b1f44cbdab9609b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 7 Nov 2020 16:26:56 +0100 Subject: [PATCH] 4.19-stable patches added patches: blktrace-fix-debugfs-use-after-free.patch btrfs-don-t-submit-any-btree-write-bio-if-the-fs-has-errors.patch btrfs-extent_io-add-proper-error-handling-to-lock_extent_buffer_for_io.patch btrfs-extent_io-handle-errors-better-in-btree_write_cache_pages.patch btrfs-extent_io-handle-errors-better-in-extent_write_full_page.patch btrfs-extent_io-kill-the-forward-declaration-of-flush_write_bio.patch btrfs-extent_io-move-the-bug_on-in-flush_write_bio-one-level-up.patch btrfs-fix-unwritten-extent-buffers-and-hangs-on-future-writeback-attempts.patch btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch btrfs-move-btrfs_check_chunk_valid-to-tree-check.-and-export-it.patch btrfs-tree-checker-check-chunk-item-at-tree-block-read-time.patch btrfs-tree-checker-enhance-chunk-checker-to-validate-chunk-profile.patch btrfs-tree-checker-fix-the-error-message-for-transid-error.patch btrfs-tree-checker-fix-wrong-check-on-max-devid.patch btrfs-tree-checker-make-btrfs_check_chunk_valid-return-euclean-instead-of-eio.patch btrfs-tree-checker-make-chunk-item-checker-messages-more-readable.patch btrfs-tree-checker-verify-dev-item.patch btrfs-tree-checker-verify-inode-item.patch revert-btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch --- .../blktrace-fix-debugfs-use-after-free.patch | 215 +++++++++++++++ ...btree-write-bio-if-the-fs-has-errors.patch | 221 +++++++++++++++ ...andling-to-lock_extent_buffer_for_io.patch | 95 +++++++ ...rs-better-in-btree_write_cache_pages.patch | 54 ++++ ...ors-better-in-extent_write_full_page.patch | 90 ++++++ ...rward-declaration-of-flush_write_bio.patch | 113 ++++++++ ...g_on-in-flush_write_bio-one-level-up.patch | 189 +++++++++++++ ...d-hangs-on-future-writeback-attempts.patch | 146 ++++++++++ ...-we-loop-in-extent_write_cache_pages.patch | 105 
+++++++ ...k_valid-to-tree-check.-and-export-it.patch | 257 ++++++++++++++++++ ...k-chunk-item-at-tree-block-read-time.patch | 74 +++++ ...nk-checker-to-validate-chunk-profile.patch | 44 +++ ...-the-error-message-for-transid-error.patch | 36 +++ ...checker-fix-wrong-check-on-max-devid.patch | 92 +++++++ ..._valid-return-euclean-instead-of-eio.patch | 114 ++++++++ ...-item-checker-messages-more-readable.patch | 172 ++++++++++++ .../btrfs-tree-checker-verify-dev-item.patch | 172 ++++++++++++ ...btrfs-tree-checker-verify-inode-item.patch | 183 +++++++++++++ ...-we-loop-in-extent_write_cache_pages.patch | 35 +++ queue-4.19/series | 19 ++ 20 files changed, 2426 insertions(+) create mode 100644 queue-4.19/blktrace-fix-debugfs-use-after-free.patch create mode 100644 queue-4.19/btrfs-don-t-submit-any-btree-write-bio-if-the-fs-has-errors.patch create mode 100644 queue-4.19/btrfs-extent_io-add-proper-error-handling-to-lock_extent_buffer_for_io.patch create mode 100644 queue-4.19/btrfs-extent_io-handle-errors-better-in-btree_write_cache_pages.patch create mode 100644 queue-4.19/btrfs-extent_io-handle-errors-better-in-extent_write_full_page.patch create mode 100644 queue-4.19/btrfs-extent_io-kill-the-forward-declaration-of-flush_write_bio.patch create mode 100644 queue-4.19/btrfs-extent_io-move-the-bug_on-in-flush_write_bio-one-level-up.patch create mode 100644 queue-4.19/btrfs-fix-unwritten-extent-buffers-and-hangs-on-future-writeback-attempts.patch create mode 100644 queue-4.19/btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch create mode 100644 queue-4.19/btrfs-move-btrfs_check_chunk_valid-to-tree-check.-and-export-it.patch create mode 100644 queue-4.19/btrfs-tree-checker-check-chunk-item-at-tree-block-read-time.patch create mode 100644 queue-4.19/btrfs-tree-checker-enhance-chunk-checker-to-validate-chunk-profile.patch create mode 100644 queue-4.19/btrfs-tree-checker-fix-the-error-message-for-transid-error.patch create mode 100644 
queue-4.19/btrfs-tree-checker-fix-wrong-check-on-max-devid.patch create mode 100644 queue-4.19/btrfs-tree-checker-make-btrfs_check_chunk_valid-return-euclean-instead-of-eio.patch create mode 100644 queue-4.19/btrfs-tree-checker-make-chunk-item-checker-messages-more-readable.patch create mode 100644 queue-4.19/btrfs-tree-checker-verify-dev-item.patch create mode 100644 queue-4.19/btrfs-tree-checker-verify-inode-item.patch create mode 100644 queue-4.19/revert-btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch diff --git a/queue-4.19/blktrace-fix-debugfs-use-after-free.patch b/queue-4.19/blktrace-fix-debugfs-use-after-free.patch new file mode 100644 index 00000000000..2d0ae2b3b80 --- /dev/null +++ b/queue-4.19/blktrace-fix-debugfs-use-after-free.patch @@ -0,0 +1,215 @@ +From foo@baz Sat Nov 7 04:26:01 PM CET 2020 +From: Luis Chamberlain +Date: Fri, 19 Jun 2020 20:47:28 +0000 +Subject: blktrace: fix debugfs use after free + +From: Luis Chamberlain + +commit bad8e64fb19d3a0de5e564d9a7271c31bd684369 upstream. + +On commit 6ac93117ab00 ("blktrace: use existing disk debugfs directory") +merged on v4.12 Omar fixed the original blktrace code for request-based +drivers (multiqueue). This however left in place a possible crash, if you +happen to abuse blktrace while racing to remove / add a device. + +We used to use asynchronous removal of the request_queue, and with that +the issue was easier to reproduce. Now that we have reverted to +synchronous removal of the request_queue, the issue is still possible to +reproduce, its however just a bit more difficult. + +We essentially run two instances of break-blktrace which add/remove +a loop device, and setup a blktrace and just never tear the blktrace +down. We do this twice in parallel. This is easily reproduced with the +script run_0004.sh from break-blktrace [0]. 
+ +We can end up with two types of panics each reflecting where we +race, one a failed blktrace setup: + +[ 252.426751] debugfs: Directory 'loop0' with parent 'block' already present! +[ 252.432265] BUG: kernel NULL pointer dereference, address: 00000000000000a0 +[ 252.436592] #PF: supervisor write access in kernel mode +[ 252.439822] #PF: error_code(0x0002) - not-present page +[ 252.442967] PGD 0 P4D 0 +[ 252.444656] Oops: 0002 [#1] SMP NOPTI +[ 252.446972] CPU: 10 PID: 1153 Comm: break-blktrace Tainted: G E 5.7.0-rc2-next-20200420+ #164 +[ 252.452673] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 +[ 252.456343] RIP: 0010:down_write+0x15/0x40 +[ 252.458146] Code: eb ca e8 ae 22 8d ff cc cc cc cc cc cc cc cc cc cc cc cc + cc cc 0f 1f 44 00 00 55 48 89 fd e8 52 db ff ff 31 c0 ba 01 00 + 00 00 48 0f b1 55 00 75 0f 48 8b 04 25 c0 8b 01 00 48 89 + 45 08 5d +[ 252.463638] RSP: 0018:ffffa626415abcc8 EFLAGS: 00010246 +[ 252.464950] RAX: 0000000000000000 RBX: ffff958c25f0f5c0 RCX: ffffff8100000000 +[ 252.466727] RDX: 0000000000000001 RSI: ffffff8100000000 RDI: 00000000000000a0 +[ 252.468482] RBP: 00000000000000a0 R08: 0000000000000000 R09: 0000000000000001 +[ 252.470014] R10: 0000000000000000 R11: ffff958d1f9227ff R12: 0000000000000000 +[ 252.471473] R13: ffff958c25ea5380 R14: ffffffff8cce15f1 R15: 00000000000000a0 +[ 252.473346] FS: 00007f2e69dee540(0000) GS:ffff958c2fc80000(0000) knlGS:0000000000000000 +[ 252.475225] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 252.476267] CR2: 00000000000000a0 CR3: 0000000427d10004 CR4: 0000000000360ee0 +[ 252.477526] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +[ 252.478776] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +[ 252.479866] Call Trace: +[ 252.480322] simple_recursive_removal+0x4e/0x2e0 +[ 252.481078] ? debugfs_remove+0x60/0x60 +[ 252.481725] ? 
relay_destroy_buf+0x77/0xb0 +[ 252.482662] debugfs_remove+0x40/0x60 +[ 252.483518] blk_remove_buf_file_callback+0x5/0x10 +[ 252.484328] relay_close_buf+0x2e/0x60 +[ 252.484930] relay_open+0x1ce/0x2c0 +[ 252.485520] do_blk_trace_setup+0x14f/0x2b0 +[ 252.486187] __blk_trace_setup+0x54/0xb0 +[ 252.486803] blk_trace_ioctl+0x90/0x140 +[ 252.487423] ? do_sys_openat2+0x1ab/0x2d0 +[ 252.488053] blkdev_ioctl+0x4d/0x260 +[ 252.488636] block_ioctl+0x39/0x40 +[ 252.489139] ksys_ioctl+0x87/0xc0 +[ 252.489675] __x64_sys_ioctl+0x16/0x20 +[ 252.490380] do_syscall_64+0x52/0x180 +[ 252.491032] entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +And the other on the device removal: + +[ 128.528940] debugfs: Directory 'loop0' with parent 'block' already present! +[ 128.615325] BUG: kernel NULL pointer dereference, address: 00000000000000a0 +[ 128.619537] #PF: supervisor write access in kernel mode +[ 128.622700] #PF: error_code(0x0002) - not-present page +[ 128.625842] PGD 0 P4D 0 +[ 128.627585] Oops: 0002 [#1] SMP NOPTI +[ 128.629871] CPU: 12 PID: 544 Comm: break-blktrace Tainted: G E 5.7.0-rc2-next-20200420+ #164 +[ 128.635595] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 +[ 128.640471] RIP: 0010:down_write+0x15/0x40 +[ 128.643041] Code: eb ca e8 ae 22 8d ff cc cc cc cc cc cc cc cc cc cc cc cc + cc cc 0f 1f 44 00 00 55 48 89 fd e8 52 db ff ff 31 c0 ba 01 00 + 00 00 48 0f b1 55 00 75 0f 65 48 8b 04 25 c0 8b 01 00 48 89 + 45 08 5d +[ 128.650180] RSP: 0018:ffffa9c3c05ebd78 EFLAGS: 00010246 +[ 128.651820] RAX: 0000000000000000 RBX: ffff8ae9a6370240 RCX: ffffff8100000000 +[ 128.653942] RDX: 0000000000000001 RSI: ffffff8100000000 RDI: 00000000000000a0 +[ 128.655720] RBP: 00000000000000a0 R08: 0000000000000002 R09: ffff8ae9afd2d3d0 +[ 128.657400] R10: 0000000000000056 R11: 0000000000000000 R12: 0000000000000000 +[ 128.659099] R13: 0000000000000000 R14: 0000000000000003 R15: 00000000000000a0 +[ 128.660500] FS: 00007febfd995540(0000) GS:ffff8ae9afd00000(0000) 
knlGS:0000000000000000 +[ 128.662204] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 128.663426] CR2: 00000000000000a0 CR3: 0000000420042003 CR4: 0000000000360ee0 +[ 128.664776] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +[ 128.666022] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +[ 128.667282] Call Trace: +[ 128.667801] simple_recursive_removal+0x4e/0x2e0 +[ 128.668663] ? debugfs_remove+0x60/0x60 +[ 128.669368] debugfs_remove+0x40/0x60 +[ 128.669985] blk_trace_free+0xd/0x50 +[ 128.670593] __blk_trace_remove+0x27/0x40 +[ 128.671274] blk_trace_shutdown+0x30/0x40 +[ 128.671935] blk_release_queue+0x95/0xf0 +[ 128.672589] kobject_put+0xa5/0x1b0 +[ 128.673188] disk_release+0xa2/0xc0 +[ 128.673786] device_release+0x28/0x80 +[ 128.674376] kobject_put+0xa5/0x1b0 +[ 128.674915] loop_remove+0x39/0x50 [loop] +[ 128.675511] loop_control_ioctl+0x113/0x130 [loop] +[ 128.676199] ksys_ioctl+0x87/0xc0 +[ 128.676708] __x64_sys_ioctl+0x16/0x20 +[ 128.677274] do_syscall_64+0x52/0x180 +[ 128.677823] entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +The common theme here is: + +debugfs: Directory 'loop0' with parent 'block' already present + +This crash happens because of how blktrace uses the debugfs directory +where it places its files. Upon init we always create the same directory +which would be needed by blktrace but we only do this for make_request +drivers (multiqueue) block drivers. When you race a removal of these +devices with a blktrace setup you end up in a situation where the +make_request recursive debugfs removal will sweep away the blktrace +files and then later blktrace will also try to remove individual +dentries which are already NULL. The inverse is also possible and hence +the two types of use after frees. 
+ +We don't create the block debugfs directory on init for these types of +block devices: + + * request-based block driver block devices + * every possible partition + * scsi-generic + +And so, this race should in theory only be possible with make_request +drivers. + +We can fix the UAF by simply re-using the debugfs directory for +make_request drivers (multiqueue) and only creating the ephemeral +directory for the other type of block devices. The new clarifications +on relying on the q->blk_trace_mutex *and* also checking for q->blk_trace +*prior* to processing a blktrace ensure the debugfs directories are +only created if no directory name clashes are possible. + +This was tested with: + + o nvme partitions + o ISCSI with tgt, and blktracing against scsi-generic with: + o block + o tape + o cdrom + o media changer + o blktests + +This patch is part of the work which disputes the severity of +CVE-2019-19770 which shows this issue is not a core debugfs issue, but +a misuse of debugfs within blktrace. + +Fixes: 6ac93117ab00 ("blktrace: use existing disk debugfs directory") +Reported-by: syzbot+603294af2d01acfdd6da@syzkaller.appspotmail.com +Signed-off-by: Luis Chamberlain +Reviewed-by: Christoph Hellwig +Cc: Bart Van Assche +Cc: Omar Sandoval +Cc: Hannes Reinecke +Cc: Nicolai Stange +Cc: Greg Kroah-Hartman +Cc: Michal Hocko +Cc: "Martin K. Petersen" +Cc: "James E.J. 
Bottomley" +Cc: yu kuai +Signed-off-by: Jens Axboe +[bwh: Backported to 4.19: open-code queue_is_mq()] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/blktrace.c | 18 ++++++++++++------ + 1 file changed, 12 insertions(+), 6 deletions(-) + +--- a/kernel/trace/blktrace.c ++++ b/kernel/trace/blktrace.c +@@ -521,10 +521,18 @@ static int do_blk_trace_setup(struct req + if (!bt->msg_data) + goto err; + +- ret = -ENOENT; +- +- dir = debugfs_lookup(buts->name, blk_debugfs_root); +- if (!dir) ++#ifdef CONFIG_BLK_DEBUG_FS ++ /* ++ * When tracing whole make_request drivers (multiqueue) block devices, ++ * reuse the existing debugfs directory created by the block layer on ++ * init. For request-based block devices, all partitions block devices, ++ * and scsi-generic block devices we create a temporary new debugfs ++ * directory that will be removed once the trace ends. ++ */ ++ if (q->mq_ops && bdev && bdev == bdev->bd_contains) ++ dir = q->debugfs_dir; ++ else ++#endif + bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); + if (!dir) + goto err; +@@ -583,8 +591,6 @@ static int do_blk_trace_setup(struct req + + ret = 0; + err: +- if (dir && !bt->dir) +- dput(dir); + if (ret) + blk_trace_free(bt); + return ret; diff --git a/queue-4.19/btrfs-don-t-submit-any-btree-write-bio-if-the-fs-has-errors.patch b/queue-4.19/btrfs-don-t-submit-any-btree-write-bio-if-the-fs-has-errors.patch new file mode 100644 index 00000000000..f87d1815cde --- /dev/null +++ b/queue-4.19/btrfs-don-t-submit-any-btree-write-bio-if-the-fs-has-errors.patch @@ -0,0 +1,221 @@ +From foo@baz Sat Nov 7 04:26:01 PM CET 2020 +From: Qu Wenruo +Date: Wed, 12 Feb 2020 14:12:44 +0800 +Subject: btrfs: Don't submit any btree write bio if the fs has errors + +From: Qu Wenruo + +commit b3ff8f1d380e65dddd772542aa9bff6c86bf715a upstream. + +[BUG] +There is a fuzzed image which could cause KASAN report at unmount time. 
+ + BUG: KASAN: use-after-free in btrfs_queue_work+0x2c1/0x390 + Read of size 8 at addr ffff888067cf6848 by task umount/1922 + + CPU: 0 PID: 1922 Comm: umount Tainted: G W 5.0.21 #1 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 + Call Trace: + dump_stack+0x5b/0x8b + print_address_description+0x70/0x280 + kasan_report+0x13a/0x19b + btrfs_queue_work+0x2c1/0x390 + btrfs_wq_submit_bio+0x1cd/0x240 + btree_submit_bio_hook+0x18c/0x2a0 + submit_one_bio+0x1be/0x320 + flush_write_bio.isra.41+0x2c/0x70 + btree_write_cache_pages+0x3bb/0x7f0 + do_writepages+0x5c/0x130 + __writeback_single_inode+0xa3/0x9a0 + writeback_single_inode+0x23d/0x390 + write_inode_now+0x1b5/0x280 + iput+0x2ef/0x600 + close_ctree+0x341/0x750 + generic_shutdown_super+0x126/0x370 + kill_anon_super+0x31/0x50 + btrfs_kill_super+0x36/0x2b0 + deactivate_locked_super+0x80/0xc0 + deactivate_super+0x13c/0x150 + cleanup_mnt+0x9a/0x130 + task_work_run+0x11a/0x1b0 + exit_to_usermode_loop+0x107/0x130 + do_syscall_64+0x1e5/0x280 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +[CAUSE] +The fuzzed image has a completely screwd up extent tree: + + leaf 29421568 gen 8 total ptrs 6 free space 3587 owner EXTENT_TREE + refs 2 lock (w:0 r:0 bw:0 br:0 sw:0 sr:0) lock_owner 0 current 5938 + item 0 key (12587008 168 4096) itemoff 3942 itemsize 53 + extent refs 1 gen 9 flags 1 + ref#0: extent data backref root 5 objectid 259 offset 0 count 1 + item 1 key (12591104 168 8192) itemoff 3889 itemsize 53 + extent refs 1 gen 9 flags 1 + ref#0: extent data backref root 5 objectid 271 offset 0 count 1 + item 2 key (12599296 168 4096) itemoff 3836 itemsize 53 + extent refs 1 gen 9 flags 1 + ref#0: extent data backref root 5 objectid 259 offset 4096 count 1 + item 3 key (29360128 169 0) itemoff 3803 itemsize 33 + extent refs 1 gen 9 flags 2 + ref#0: tree block backref root 5 + item 4 key (29368320 169 1) itemoff 3770 itemsize 33 + extent refs 1 gen 9 flags 2 + ref#0: tree block backref root 5 + item 
5 key (29372416 169 0) itemoff 3737 itemsize 33 + extent refs 1 gen 9 flags 2 + ref#0: tree block backref root 5 + +Note that leaf 29421568 doesn't have its backref in the extent tree. +Thus extent allocator can re-allocate leaf 29421568 for other trees. + +In short, the bug is caused by: + +- Existing tree block gets allocated to log tree + This got its generation bumped. + +- Log tree balance cleaned dirty bit of offending tree block + It will not be written back to disk, thus no WRITTEN flag. + +- Original owner of the tree block gets COWed + Since the tree block has higher transid, no WRITTEN flag, it's reused, + and not traced by transaction::dirty_pages. + +- Transaction aborted + Tree blocks get cleaned according to transaction::dirty_pages. But the + offending tree block is not recorded at all. + +- Filesystem unmount + All pages are assumed to be clean, so all workqueues are destroyed, then + iput(btree_inode) is called. + But offending tree block is still dirty, which triggers writeback, and + causes use-after-free bug. + +The detailed sequence looks like this: + +- Initial status + eb: 29421568, header=WRITTEN bflags_dirty=0, page_dirty=0, gen=8, + not traced by any dirty extent_io_tree. + +- New tree block is allocated + Since there is no backref for 29421568, it's re-allocated as new tree + block. + Keep in mind that tree block 29421568 is still referred to by extent + tree. + +- Tree block 29421568 is filled for log tree + eb: 29421568, header=0 bflags_dirty=1, page_dirty=1, gen=9 << (gen bumped) + traced by btrfs_root::dirty_log_pages + +- Some log tree operations + Since the fs is using node size 4096, the log tree can easily go a + level higher. + +- Log tree needs balance + Tree block 29421568 gets all its content pushed to right, thus now + it is empty, and we don't need it. + btrfs_clean_tree_block() from __push_leaf_right() gets called. 
+ + eb: 29421568, header=0 bflags_dirty=0, page_dirty=0, gen=9 + traced by btrfs_root::dirty_log_pages + +- Log tree write back + btree_write_cache_pages() goes through dirty pages ranges, but since + page of tree block 29421568 gets cleaned already, it's not written + back to disk. Thus it doesn't have WRITTEN bit set. + But ranges in dirty_log_pages are cleared. + + eb: 29421568, header=0 bflags_dirty=0, page_dirty=0, gen=9 + not traced by any dirty extent_io_tree. + +- Extent tree update when committing transaction + Since tree block 29421568 has transid equal to running trans, and has + no WRITTEN bit, should_cow_block() will use it directly without adding + it to btrfs_transaction::dirty_pages. + + eb: 29421568, header=0 bflags_dirty=1, page_dirty=1, gen=9 + not traced by any dirty extent_io_tree. + + At this stage, we're doomed. We have a dirty eb not tracked by any + extent io tree. + +- Transaction gets aborted due to corrupted extent tree + Btrfs cleans up dirty pages according to transaction::dirty_pages and + btrfs_root::dirty_log_pages. + But since tree block 29421568 is not tracked by either of them, it's + still dirty. + + eb: 29421568, header=0 bflags_dirty=1, page_dirty=1, gen=9 + not traced by any dirty extent_io_tree. + +- Filesystem unmount + Since all cleanup is assumed to be done, all workqueues are destroyed. + Then iput(btree_inode) is called, expecting no dirty pages. + But tree 29421568 is still dirty, thus triggering writeback. + Since all workqueues are already freed, we cause use-after-free. + +This shows us that, log tree blocks + bad extent tree can cause wild +dirty pages. + +[FIX] +To fix the problem, don't submit any btree write bio if the filesystem +has any error. This is the last safety net, just in case other cleanup +hasn't caught it. 
+ +Link: https://github.com/bobfuzzer/CVE/tree/master/CVE-2019-19377 +CC: stable@vger.kernel.org # 5.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +[bwh: Backported to 4.19: fs_info variable already exists in + btree_write_cache_pages()] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent_io.c | 34 +++++++++++++++++++++++++++++++++- + 1 file changed, 33 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -3947,7 +3947,39 @@ retry: + end_write_bio(&epd, ret); + return ret; + } +- ret = flush_write_bio(&epd); ++ /* ++ * If something went wrong, don't allow any metadata write bio to be ++ * submitted. ++ * ++ * This would prevent use-after-free if we had dirty pages not ++ * cleaned up, which can still happen by fuzzed images. ++ * ++ * - Bad extent tree ++ * Allowing existing tree block to be allocated for other trees. ++ * ++ * - Log tree operations ++ * Exiting tree blocks get allocated to log tree, bumps its ++ * generation, then get cleaned in tree re-balance. ++ * Such tree block will not be written back, since it's clean, ++ * thus no WRITTEN flag set. ++ * And after log writes back, this tree block is not traced by ++ * any dirty extent_io_tree. ++ * ++ * - Offending tree block gets re-dirtied from its original owner ++ * Since it has bumped generation, no WRITTEN flag, it can be ++ * reused without COWing. This tree block will not be traced ++ * by btrfs_transaction::dirty_pages. ++ * ++ * Now such dirty tree block will not be cleaned by any dirty ++ * extent io tree. Thus we don't want to submit such wild eb ++ * if the fs already has error. 
++ */ ++ if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { ++ ret = flush_write_bio(&epd); ++ } else { ++ ret = -EUCLEAN; ++ end_write_bio(&epd, ret); ++ } + return ret; + } + diff --git a/queue-4.19/btrfs-extent_io-add-proper-error-handling-to-lock_extent_buffer_for_io.patch b/queue-4.19/btrfs-extent_io-add-proper-error-handling-to-lock_extent_buffer_for_io.patch new file mode 100644 index 00000000000..39c24a43290 --- /dev/null +++ b/queue-4.19/btrfs-extent_io-add-proper-error-handling-to-lock_extent_buffer_for_io.patch @@ -0,0 +1,95 @@ +From foo@baz Sat Nov 7 04:26:01 PM CET 2020 +From: Qu Wenruo +Date: Wed, 20 Mar 2019 14:27:46 +0800 +Subject: btrfs: extent_io: add proper error handling to lock_extent_buffer_for_io() + +From: Qu Wenruo + +commit 2e3c25136adfb293d517e17f761d3b8a43a8fc22 upstream. + +This function needs some extra checks on locked pages and eb. For error +handling we need to unlock locked pages and the eb. + +There is a rare >0 return value branch, where all pages get locked +while write bio is not flushed. + +Thankfully it's handled by the only caller, btree_write_cache_pages(), +as later write_one_eb() call will trigger submit_one_bio(). So there +shouldn't be any problem. 
+ +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent_io.c | 27 ++++++++++++++++++++++----- + 1 file changed, 22 insertions(+), 5 deletions(-) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -3554,19 +3554,27 @@ void wait_on_extent_buffer_writeback(str + TASK_UNINTERRUPTIBLE); + } + ++/* ++ * Lock eb pages and flush the bio if we can't the locks ++ * ++ * Return 0 if nothing went wrong ++ * Return >0 is same as 0, except bio is not submitted ++ * Return <0 if something went wrong, no page is locked ++ */ + static noinline_for_stack int + lock_extent_buffer_for_io(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct extent_page_data *epd) + { +- int i, num_pages; ++ int i, num_pages, failed_page_nr; + int flush = 0; + int ret = 0; + + if (!btrfs_try_tree_write_lock(eb)) { +- flush = 1; + ret = flush_write_bio(epd); +- BUG_ON(ret < 0); ++ if (ret < 0) ++ return ret; ++ flush = 1; + btrfs_tree_lock(eb); + } + +@@ -3576,7 +3584,8 @@ lock_extent_buffer_for_io(struct extent_ + return 0; + if (!flush) { + ret = flush_write_bio(epd); +- BUG_ON(ret < 0); ++ if (ret < 0) ++ return ret; + flush = 1; + } + while (1) { +@@ -3618,7 +3627,10 @@ lock_extent_buffer_for_io(struct extent_ + if (!trylock_page(p)) { + if (!flush) { + ret = flush_write_bio(epd); +- BUG_ON(ret < 0); ++ if (ret < 0) { ++ failed_page_nr = i; ++ goto err_unlock; ++ } + flush = 1; + } + lock_page(p); +@@ -3626,6 +3638,11 @@ lock_extent_buffer_for_io(struct extent_ + } + + return ret; ++err_unlock: ++ /* Unlock already locked pages */ ++ for (i = 0; i < failed_page_nr; i++) ++ unlock_page(eb->pages[i]); ++ return ret; + } + + static void end_extent_buffer_writeback(struct extent_buffer *eb) diff --git a/queue-4.19/btrfs-extent_io-handle-errors-better-in-btree_write_cache_pages.patch b/queue-4.19/btrfs-extent_io-handle-errors-better-in-btree_write_cache_pages.patch 
new file mode 100644 index 00000000000..23f69ff6922 --- /dev/null +++ b/queue-4.19/btrfs-extent_io-handle-errors-better-in-btree_write_cache_pages.patch @@ -0,0 +1,54 @@ +From foo@baz Sat Nov 7 04:26:01 PM CET 2020 +From: Qu Wenruo +Date: Wed, 20 Mar 2019 14:27:43 +0800 +Subject: btrfs: extent_io: Handle errors better in btree_write_cache_pages() + +From: Qu Wenruo + +commit 2b952eea813b1f7e7d4b9782271acd91625b9bb9 upstream. + +In btree_write_cache_pages(), we can only get @ret <= 0. +Add an ASSERT() for it just in case. + +Then instead of submitting the write bio even we got some error, check +the return value first. +If we have already hit some error, just clean up the corrupted or +half-baked bio, and return error. + +If there is no error so far, then call flush_write_bio() and return the +result. + +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent_io.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -3809,7 +3809,6 @@ int btree_write_cache_pages(struct addre + .sync_io = wbc->sync_mode == WB_SYNC_ALL, + }; + int ret = 0; +- int flush_ret; + int done = 0; + int nr_to_write_done = 0; + struct pagevec pvec; +@@ -3909,8 +3908,12 @@ retry: + index = 0; + goto retry; + } +- flush_ret = flush_write_bio(&epd); +- BUG_ON(flush_ret < 0); ++ ASSERT(ret <= 0); ++ if (ret < 0) { ++ end_write_bio(&epd, ret); ++ return ret; ++ } ++ ret = flush_write_bio(&epd); + return ret; + } + diff --git a/queue-4.19/btrfs-extent_io-handle-errors-better-in-extent_write_full_page.patch b/queue-4.19/btrfs-extent_io-handle-errors-better-in-extent_write_full_page.patch new file mode 100644 index 00000000000..272ef9e9e83 --- /dev/null +++ b/queue-4.19/btrfs-extent_io-handle-errors-better-in-extent_write_full_page.patch @@ -0,0 +1,90 @@ +From foo@baz Sat Nov 7 04:26:01 PM CET 2020 +From: Qu Wenruo 
+Date: Wed, 20 Mar 2019 14:27:42 +0800 +Subject: btrfs: extent_io: Handle errors better in extent_write_full_page() + +From: Qu Wenruo + +commit 3065976b045f77a910809fa7699f99a1e7c0dbbb upstream. + +Since now flush_write_bio() could return error, kill the BUG_ON() first. +Then don't call flush_write_bio() unconditionally, instead we check the +return value from __extent_writepage() first. + +If __extent_writepage() fails, we do cleanup, and return error without +submitting the possible corrupted or half-baked bio. + +If __extent_writepage() successes, then we call flush_write_bio() and +return the result. + +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent_io.c | 24 +++++++++++++++++++++--- + 1 file changed, 21 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -160,6 +160,16 @@ static int __must_check submit_one_bio(s + return blk_status_to_errno(ret); + } + ++/* Cleanup unsubmitted bios */ ++static void end_write_bio(struct extent_page_data *epd, int ret) ++{ ++ if (epd->bio) { ++ epd->bio->bi_status = errno_to_blk_status(ret); ++ bio_endio(epd->bio); ++ epd->bio = NULL; ++ } ++} ++ + /* + * Submit bio from extent page data via submit_one_bio + * +@@ -3461,6 +3471,9 @@ done: + * records are inserted to lock ranges in the tree, and as dirty areas + * are found, they are marked writeback. Then the lock bits are removed + * and the end_io handler clears the writeback ranges ++ * ++ * Return 0 if everything goes well. ++ * Return <0 for error. 
+ */ + static int __extent_writepage(struct page *page, struct writeback_control *wbc, + struct extent_page_data *epd) +@@ -3528,6 +3541,7 @@ done: + end_extent_writepage(page, ret, start, page_end); + } + unlock_page(page); ++ ASSERT(ret <= 0); + return ret; + + done_unlocked: +@@ -4067,7 +4081,6 @@ retry: + int extent_write_full_page(struct page *page, struct writeback_control *wbc) + { + int ret; +- int flush_ret; + struct extent_page_data epd = { + .bio = NULL, + .tree = &BTRFS_I(page->mapping->host)->io_tree, +@@ -4076,9 +4089,14 @@ int extent_write_full_page(struct page * + }; + + ret = __extent_writepage(page, wbc, &epd); ++ ASSERT(ret <= 0); ++ if (ret < 0) { ++ end_write_bio(&epd, ret); ++ return ret; ++ } + +- flush_ret = flush_write_bio(&epd); +- BUG_ON(flush_ret < 0); ++ ret = flush_write_bio(&epd); ++ ASSERT(ret <= 0); + return ret; + } + diff --git a/queue-4.19/btrfs-extent_io-kill-the-forward-declaration-of-flush_write_bio.patch b/queue-4.19/btrfs-extent_io-kill-the-forward-declaration-of-flush_write_bio.patch new file mode 100644 index 00000000000..1e79d99957b --- /dev/null +++ b/queue-4.19/btrfs-extent_io-kill-the-forward-declaration-of-flush_write_bio.patch @@ -0,0 +1,113 @@ +From foo@baz Sat Nov 7 04:26:01 PM CET 2020 +From: Qu Wenruo +Date: Fri, 25 Jan 2019 13:09:15 +0800 +Subject: btrfs: extent_io: Kill the forward declaration of flush_write_bio + +From: Qu Wenruo + +commit bb58eb9e167d087cc518f7a71c3c00f1671958da upstream. + +There is no need to forward declare flush_write_bio(), as it only +depends on submit_one_bio(). Both of them are pretty small, just move +them to kill the forward declaration. 
+ +Reviewed-by: Nikolay Borisov +Reviewed-by: Johannes Thumshirn +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +[bwh: Cherry-picked for 4.19 to ease backporting later fixes] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent_io.c | 66 ++++++++++++++++++++++++--------------------------- + 1 file changed, 32 insertions(+), 34 deletions(-) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -138,7 +138,38 @@ static int add_extent_changeset(struct e + return ret; + } + +-static void flush_write_bio(struct extent_page_data *epd); ++static int __must_check submit_one_bio(struct bio *bio, int mirror_num, ++ unsigned long bio_flags) ++{ ++ blk_status_t ret = 0; ++ struct bio_vec *bvec = bio_last_bvec_all(bio); ++ struct page *page = bvec->bv_page; ++ struct extent_io_tree *tree = bio->bi_private; ++ u64 start; ++ ++ start = page_offset(page) + bvec->bv_offset; ++ ++ bio->bi_private = NULL; ++ ++ if (tree->ops) ++ ret = tree->ops->submit_bio_hook(tree->private_data, bio, ++ mirror_num, bio_flags, start); ++ else ++ btrfsic_submit_bio(bio); ++ ++ return blk_status_to_errno(ret); ++} ++ ++static void flush_write_bio(struct extent_page_data *epd) ++{ ++ if (epd->bio) { ++ int ret; ++ ++ ret = submit_one_bio(epd->bio, 0, 0); ++ BUG_ON(ret < 0); /* -ENOMEM */ ++ epd->bio = NULL; ++ } ++} + + int __init extent_io_init(void) + { +@@ -2710,28 +2741,6 @@ struct bio *btrfs_bio_clone_partial(stru + return bio; + } + +-static int __must_check submit_one_bio(struct bio *bio, int mirror_num, +- unsigned long bio_flags) +-{ +- blk_status_t ret = 0; +- struct bio_vec *bvec = bio_last_bvec_all(bio); +- struct page *page = bvec->bv_page; +- struct extent_io_tree *tree = bio->bi_private; +- u64 start; +- +- start = page_offset(page) + bvec->bv_offset; +- +- bio->bi_private = NULL; +- +- if (tree->ops) +- ret = tree->ops->submit_bio_hook(tree->private_data, bio, +- mirror_num, bio_flags, start); +- else +- btrfsic_submit_bio(bio); +- +- 
return blk_status_to_errno(ret); +-} +- + /* + * @opf: bio REQ_OP_* and REQ_* flags as one value + * @tree: tree so we can call our merge_bio hook +@@ -4033,17 +4042,6 @@ retry: + return ret; + } + +-static void flush_write_bio(struct extent_page_data *epd) +-{ +- if (epd->bio) { +- int ret; +- +- ret = submit_one_bio(epd->bio, 0, 0); +- BUG_ON(ret < 0); /* -ENOMEM */ +- epd->bio = NULL; +- } +-} +- + int extent_write_full_page(struct page *page, struct writeback_control *wbc) + { + int ret; diff --git a/queue-4.19/btrfs-extent_io-move-the-bug_on-in-flush_write_bio-one-level-up.patch b/queue-4.19/btrfs-extent_io-move-the-bug_on-in-flush_write_bio-one-level-up.patch new file mode 100644 index 00000000000..d01e5905f7c --- /dev/null +++ b/queue-4.19/btrfs-extent_io-move-the-bug_on-in-flush_write_bio-one-level-up.patch @@ -0,0 +1,189 @@ +From foo@baz Sat Nov 7 04:26:01 PM CET 2020 +From: Qu Wenruo +Date: Wed, 20 Mar 2019 14:27:41 +0800 +Subject: btrfs: extent_io: Move the BUG_ON() in flush_write_bio() one level up + +From: Qu Wenruo + +commit f4340622e02261fae599e3da936ff4808b418173 upstream. + +We have a BUG_ON() in flush_write_bio() to handle the return value of +submit_one_bio(). + +Move the BUG_ON() one level up to all its callers. + +This patch will introduce temporary variable, @flush_ret to keep code +change minimal in this patch. That variable will be cleaned up when +enhancing the error handling later. 
+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +[bwh: Cherry-picked for 4.19 to ease backporting later fixes] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent_io.c | 55 ++++++++++++++++++++++++++++++++++++++------------- + 1 file changed, 41 insertions(+), 14 deletions(-) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -160,15 +160,28 @@ static int __must_check submit_one_bio(s + return blk_status_to_errno(ret); + } + +-static void flush_write_bio(struct extent_page_data *epd) ++/* ++ * Submit bio from extent page data via submit_one_bio ++ * ++ * Return 0 if everything is OK. ++ * Return <0 for error. ++ */ ++static int __must_check flush_write_bio(struct extent_page_data *epd) + { +- if (epd->bio) { +- int ret; ++ int ret = 0; + ++ if (epd->bio) { + ret = submit_one_bio(epd->bio, 0, 0); +- BUG_ON(ret < 0); /* -ENOMEM */ ++ /* ++ * Clean up of epd->bio is handled by its endio function. ++ * And endio is either triggered by successful bio execution ++ * or the error handler of submit bio hook. ++ * So at this point, no matter what happened, we don't need ++ * to clean up epd->bio. 
++ */ + epd->bio = NULL; + } ++ return ret; + } + + int __init extent_io_init(void) +@@ -3538,7 +3551,8 @@ lock_extent_buffer_for_io(struct extent_ + + if (!btrfs_try_tree_write_lock(eb)) { + flush = 1; +- flush_write_bio(epd); ++ ret = flush_write_bio(epd); ++ BUG_ON(ret < 0); + btrfs_tree_lock(eb); + } + +@@ -3547,7 +3561,8 @@ lock_extent_buffer_for_io(struct extent_ + if (!epd->sync_io) + return 0; + if (!flush) { +- flush_write_bio(epd); ++ ret = flush_write_bio(epd); ++ BUG_ON(ret < 0); + flush = 1; + } + while (1) { +@@ -3588,7 +3603,8 @@ lock_extent_buffer_for_io(struct extent_ + + if (!trylock_page(p)) { + if (!flush) { +- flush_write_bio(epd); ++ ret = flush_write_bio(epd); ++ BUG_ON(ret < 0); + flush = 1; + } + lock_page(p); +@@ -3779,6 +3795,7 @@ int btree_write_cache_pages(struct addre + .sync_io = wbc->sync_mode == WB_SYNC_ALL, + }; + int ret = 0; ++ int flush_ret; + int done = 0; + int nr_to_write_done = 0; + struct pagevec pvec; +@@ -3878,7 +3895,8 @@ retry: + index = 0; + goto retry; + } +- flush_write_bio(&epd); ++ flush_ret = flush_write_bio(&epd); ++ BUG_ON(flush_ret < 0); + return ret; + } + +@@ -3975,7 +3993,8 @@ retry: + * tmpfs file mapping + */ + if (!trylock_page(page)) { +- flush_write_bio(epd); ++ ret = flush_write_bio(epd); ++ BUG_ON(ret < 0); + lock_page(page); + } + +@@ -3985,8 +4004,10 @@ retry: + } + + if (wbc->sync_mode != WB_SYNC_NONE) { +- if (PageWriteback(page)) +- flush_write_bio(epd); ++ if (PageWriteback(page)) { ++ ret = flush_write_bio(epd); ++ BUG_ON(ret < 0); ++ } + wait_on_page_writeback(page); + } + +@@ -4045,6 +4066,7 @@ retry: + int extent_write_full_page(struct page *page, struct writeback_control *wbc) + { + int ret; ++ int flush_ret; + struct extent_page_data epd = { + .bio = NULL, + .tree = &BTRFS_I(page->mapping->host)->io_tree, +@@ -4054,7 +4076,8 @@ int extent_write_full_page(struct page * + + ret = __extent_writepage(page, wbc, &epd); + +- flush_write_bio(&epd); ++ flush_ret = flush_write_bio(&epd); ++ 
BUG_ON(flush_ret < 0); + return ret; + } + +@@ -4062,6 +4085,7 @@ int extent_write_locked_range(struct ino + int mode) + { + int ret = 0; ++ int flush_ret; + struct address_space *mapping = inode->i_mapping; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct page *page; +@@ -4096,7 +4120,8 @@ int extent_write_locked_range(struct ino + start += PAGE_SIZE; + } + +- flush_write_bio(&epd); ++ flush_ret = flush_write_bio(&epd); ++ BUG_ON(flush_ret < 0); + return ret; + } + +@@ -4104,6 +4129,7 @@ int extent_writepages(struct address_spa + struct writeback_control *wbc) + { + int ret = 0; ++ int flush_ret; + struct extent_page_data epd = { + .bio = NULL, + .tree = &BTRFS_I(mapping->host)->io_tree, +@@ -4112,7 +4138,8 @@ int extent_writepages(struct address_spa + }; + + ret = extent_write_cache_pages(mapping, wbc, &epd); +- flush_write_bio(&epd); ++ flush_ret = flush_write_bio(&epd); ++ BUG_ON(flush_ret < 0); + return ret; + } + diff --git a/queue-4.19/btrfs-fix-unwritten-extent-buffers-and-hangs-on-future-writeback-attempts.patch b/queue-4.19/btrfs-fix-unwritten-extent-buffers-and-hangs-on-future-writeback-attempts.patch new file mode 100644 index 00000000000..6b2624e2af5 --- /dev/null +++ b/queue-4.19/btrfs-fix-unwritten-extent-buffers-and-hangs-on-future-writeback-attempts.patch @@ -0,0 +1,146 @@ +From foo@baz Sat Nov 7 04:26:01 PM CET 2020 +From: Filipe Manana +Date: Wed, 11 Sep 2019 17:42:00 +0100 +Subject: Btrfs: fix unwritten extent buffers and hangs on future writeback attempts + +From: Filipe Manana + +commit 18dfa7117a3f379862dcd3f67cadd678013bb9dd upstream. + +The lock_extent_buffer_io() returns 1 to the caller to tell it everything +went fine and the callers needs to start writeback for the extent buffer +(submit a bio, etc), 0 to tell the caller everything went fine but it does +not need to start writeback for the extent buffer, and a negative value if +some error happened. 
+ +When it's about to return 1 it tries to lock all pages, and if a try lock +on a page fails, and we didn't flush any existing bio in our "epd", it +calls flush_write_bio(epd) and overwrites the return value of 1 to 0 or +an error. The page might have been locked elsewhere, not with the goal +of starting writeback of the extent buffer, and even by some code other +than btrfs, like page migration for example, so it does not mean the +writeback of the extent buffer was already started by some other task, +so returning a 0 tells the caller (btree_write_cache_pages()) to not +start writeback for the extent buffer. Note that epd might currently have +either no bio, so flush_write_bio() returns 0 (success) or it might have +a bio for another extent buffer with a lower index (logical address). + +Since we return 0 with the EXTENT_BUFFER_WRITEBACK bit set on the +extent buffer and writeback is never started for the extent buffer, +future attempts to writeback the extent buffer will hang forever waiting +on that bit to be cleared, since it can only be cleared after writeback +completes. Such hang is reported with a trace like the following: + + [49887.347053] INFO: task btrfs-transacti:1752 blocked for more than 122 seconds. + [49887.347059] Not tainted 5.2.13-gentoo #2 + [49887.347060] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. + [49887.347062] btrfs-transacti D 0 1752 2 0x80004000 + [49887.347064] Call Trace: + [49887.347069] ? __schedule+0x265/0x830 + [49887.347071] ? bit_wait+0x50/0x50 + [49887.347072] ? bit_wait+0x50/0x50 + [49887.347074] schedule+0x24/0x90 + [49887.347075] io_schedule+0x3c/0x60 + [49887.347077] bit_wait_io+0x8/0x50 + [49887.347079] __wait_on_bit+0x6c/0x80 + [49887.347081] ? __lock_release.isra.29+0x155/0x2d0 + [49887.347083] out_of_line_wait_on_bit+0x7b/0x80 + [49887.347084] ? 
var_wake_function+0x20/0x20 + [49887.347087] lock_extent_buffer_for_io+0x28c/0x390 + [49887.347089] btree_write_cache_pages+0x18e/0x340 + [49887.347091] do_writepages+0x29/0xb0 + [49887.347093] ? kmem_cache_free+0x132/0x160 + [49887.347095] ? convert_extent_bit+0x544/0x680 + [49887.347097] filemap_fdatawrite_range+0x70/0x90 + [49887.347099] btrfs_write_marked_extents+0x53/0x120 + [49887.347100] btrfs_write_and_wait_transaction.isra.4+0x38/0xa0 + [49887.347102] btrfs_commit_transaction+0x6bb/0x990 + [49887.347103] ? start_transaction+0x33e/0x500 + [49887.347105] transaction_kthread+0x139/0x15c + +So fix this by not overwriting the return value (ret) with the result +from flush_write_bio(). We also need to clear the EXTENT_BUFFER_WRITEBACK +bit in case flush_write_bio() returns an error, otherwise it will hang +any future attempts to writeback the extent buffer, and undo all work +done before (set back EXTENT_BUFFER_DIRTY, etc). + +This is a regression introduced in the 5.2 kernel. + +Fixes: 2e3c25136adfb ("btrfs: extent_io: add proper error handling to lock_extent_buffer_for_io()") +Fixes: f4340622e0226 ("btrfs: extent_io: Move the BUG_ON() in flush_write_bio() one level up") +Reported-by: Zdenek Sojka +Link: https://lore.kernel.org/linux-btrfs/GpO.2yos.3WGDOLpx6t%7D.1TUDYM@seznam.cz/T/#u +Reported-by: Stefan Priebe - Profihost AG +Link: https://lore.kernel.org/linux-btrfs/5c4688ac-10a7-fb07-70e8-c5d31a3fbb38@profihost.ag/T/#t +Reported-by: Drazen Kacar +Link: https://lore.kernel.org/linux-btrfs/DB8PR03MB562876ECE2319B3E579590F799C80@DB8PR03MB5628.eurprd03.prod.outlook.com/ +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=204377 +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent_io.c | 35 ++++++++++++++++++++++++++--------- + 1 file changed, 26 insertions(+), 9 deletions(-) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -3554,6 +3554,13 @@ void 
wait_on_extent_buffer_writeback(str + TASK_UNINTERRUPTIBLE); + } + ++static void end_extent_buffer_writeback(struct extent_buffer *eb) ++{ ++ clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); ++ smp_mb__after_atomic(); ++ wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); ++} ++ + /* + * Lock eb pages and flush the bio if we can't the locks + * +@@ -3626,8 +3633,11 @@ lock_extent_buffer_for_io(struct extent_ + + if (!trylock_page(p)) { + if (!flush) { +- ret = flush_write_bio(epd); +- if (ret < 0) { ++ int err; ++ ++ err = flush_write_bio(epd); ++ if (err < 0) { ++ ret = err; + failed_page_nr = i; + goto err_unlock; + } +@@ -3642,16 +3652,23 @@ err_unlock: + /* Unlock already locked pages */ + for (i = 0; i < failed_page_nr; i++) + unlock_page(eb->pages[i]); ++ /* ++ * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it. ++ * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can ++ * be made and undo everything done before. ++ */ ++ btrfs_tree_lock(eb); ++ spin_lock(&eb->refs_lock); ++ set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); ++ end_extent_buffer_writeback(eb); ++ spin_unlock(&eb->refs_lock); ++ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len, ++ fs_info->dirty_metadata_batch); ++ btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); ++ btrfs_tree_unlock(eb); + return ret; + } + +-static void end_extent_buffer_writeback(struct extent_buffer *eb) +-{ +- clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); +- smp_mb__after_atomic(); +- wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); +-} +- + static void set_btree_ioerr(struct page *page) + { + struct extent_buffer *eb = (struct extent_buffer *)page->private; diff --git a/queue-4.19/btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch b/queue-4.19/btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch new file mode 100644 index 00000000000..60e78a21c99 --- /dev/null +++ b/queue-4.19/btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch 
@@ -0,0 +1,105 @@ +From foo@baz Sat Nov 7 04:26:01 PM CET 2020 +From: Josef Bacik +Date: Thu, 23 Jan 2020 15:33:02 -0500 +Subject: btrfs: flush write bio if we loop in extent_write_cache_pages + +From: Josef Bacik + +commit 42ffb0bf584ae5b6b38f72259af1e0ee417ac77f upstream. + +There exists a deadlock with range_cyclic that has existed forever. If +we loop around with a bio already built we could deadlock with a writer +who has the page locked that we're attempting to write but is waiting on +a page in our bio to be written out. The task traces are as follows + + PID: 1329874 TASK: ffff889ebcdf3800 CPU: 33 COMMAND: "kworker/u113:5" + #0 [ffffc900297bb658] __schedule at ffffffff81a4c33f + #1 [ffffc900297bb6e0] schedule at ffffffff81a4c6e3 + #2 [ffffc900297bb6f8] io_schedule at ffffffff81a4ca42 + #3 [ffffc900297bb708] __lock_page at ffffffff811f145b + #4 [ffffc900297bb798] __process_pages_contig at ffffffff814bc502 + #5 [ffffc900297bb8c8] lock_delalloc_pages at ffffffff814bc684 + #6 [ffffc900297bb900] find_lock_delalloc_range at ffffffff814be9ff + #7 [ffffc900297bb9a0] writepage_delalloc at ffffffff814bebd0 + #8 [ffffc900297bba18] __extent_writepage at ffffffff814bfbf2 + #9 [ffffc900297bba98] extent_write_cache_pages at ffffffff814bffbd + + PID: 2167901 TASK: ffff889dc6a59c00 CPU: 14 COMMAND: + "aio-dio-invalid" + #0 [ffffc9003b50bb18] __schedule at ffffffff81a4c33f + #1 [ffffc9003b50bba0] schedule at ffffffff81a4c6e3 + #2 [ffffc9003b50bbb8] io_schedule at ffffffff81a4ca42 + #3 [ffffc9003b50bbc8] wait_on_page_bit at ffffffff811f24d6 + #4 [ffffc9003b50bc60] prepare_pages at ffffffff814b05a7 + #5 [ffffc9003b50bcd8] btrfs_buffered_write at ffffffff814b1359 + #6 [ffffc9003b50bdb0] btrfs_file_write_iter at ffffffff814b5933 + #7 [ffffc9003b50be38] new_sync_write at ffffffff8128f6a8 + #8 [ffffc9003b50bec8] vfs_write at ffffffff81292b9d + #9 [ffffc9003b50bf00] ksys_pwrite64 at ffffffff81293032 + +I used drgn to find the respective pages we were stuck on + +page_entry.page 
0xffffea00fbfc7500 index 8148 bit 15 pid 2167901 +page_entry.page 0xffffea00f9bb7400 index 7680 bit 0 pid 1329874 + +As you can see the kworker is waiting for bit 0 (PG_locked) on index +7680, and aio-dio-invalid is waiting for bit 15 (PG_writeback) on index +8148. aio-dio-invalid has 7680, and the kworker epd looks like the +following + + crash> struct extent_page_data ffffc900297bbbb0 + struct extent_page_data { + bio = 0xffff889f747ed830, + tree = 0xffff889eed6ba448, + extent_locked = 0, + sync_io = 0 + } + +Probably worth mentioning as well that it waits for writeback of the +page to complete while holding a lock on it (at prepare_pages()). + +Using drgn I walked the bio pages looking for page +0xffffea00fbfc7500 which is the one we're waiting for writeback on + + bio = Object(prog, 'struct bio', address=0xffff889f747ed830) + for i in range(0, bio.bi_vcnt.value_()): + bv = bio.bi_io_vec[i] + if bv.bv_page.value_() == 0xffffea00fbfc7500: + print("FOUND IT") + +which validated what I suspected. + +The fix for this is simple, flush the epd before we loop back around to +the beginning of the file during writeout. + +Fixes: b293f02e1423 ("Btrfs: Add writepages support") +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Filipe Manana +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent_io.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -4045,7 +4045,16 @@ retry: + */ + scanned = 1; + index = 0; +- goto retry; ++ ++ /* ++ * If we're looping we could run into a page that is locked by a ++ * writer and that writer could be waiting on writeback for a ++ * page in our current bio, and thus deadlock, so flush the ++ * write bio here. 
++ */ ++ ret = flush_write_bio(epd); ++ if (!ret) ++ goto retry; + } + + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) diff --git a/queue-4.19/btrfs-move-btrfs_check_chunk_valid-to-tree-check.-and-export-it.patch b/queue-4.19/btrfs-move-btrfs_check_chunk_valid-to-tree-check.-and-export-it.patch new file mode 100644 index 00000000000..b71abe335af --- /dev/null +++ b/queue-4.19/btrfs-move-btrfs_check_chunk_valid-to-tree-check.-and-export-it.patch @@ -0,0 +1,257 @@ +From foo@baz Sat Nov 7 04:26:01 PM CET 2020 +From: Qu Wenruo +Date: Wed, 20 Mar 2019 13:16:42 +0800 +Subject: btrfs: Move btrfs_check_chunk_valid() to tree-check.[ch] and export it + +From: Qu Wenruo + +commit 82fc28fbedbb59642f05215db3b0ef4eb91aa31d upstream. + +By function, chunk item verification is more suitable to be done inside +tree-checker. + +So move btrfs_check_chunk_valid() to tree-checker.c and export it. + +And since it's now moved to tree-checker, also add a better comment for +what this function is doing. + +Reviewed-by: Johannes Thumshirn +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +[bwh: Cherry-picked for 4.19 to ease backporting later fixes] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-checker.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++ + fs/btrfs/tree-checker.h | 4 + + fs/btrfs/volumes.c | 94 ---------------------------------------------- + 3 files changed, 102 insertions(+), 93 deletions(-) + +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -449,6 +449,103 @@ static int check_block_group_item(struct + } + + /* ++ * The common chunk check which could also work on super block sys chunk array. ++ * ++ * Return -EIO if anything is corrupted. ++ * Return 0 if everything is OK. 
++ */ ++int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, ++ struct extent_buffer *leaf, ++ struct btrfs_chunk *chunk, u64 logical) ++{ ++ u64 length; ++ u64 stripe_len; ++ u16 num_stripes; ++ u16 sub_stripes; ++ u64 type; ++ u64 features; ++ bool mixed = false; ++ ++ length = btrfs_chunk_length(leaf, chunk); ++ stripe_len = btrfs_chunk_stripe_len(leaf, chunk); ++ num_stripes = btrfs_chunk_num_stripes(leaf, chunk); ++ sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); ++ type = btrfs_chunk_type(leaf, chunk); ++ ++ if (!num_stripes) { ++ btrfs_err(fs_info, "invalid chunk num_stripes: %u", ++ num_stripes); ++ return -EIO; ++ } ++ if (!IS_ALIGNED(logical, fs_info->sectorsize)) { ++ btrfs_err(fs_info, "invalid chunk logical %llu", logical); ++ return -EIO; ++ } ++ if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { ++ btrfs_err(fs_info, "invalid chunk sectorsize %u", ++ btrfs_chunk_sector_size(leaf, chunk)); ++ return -EIO; ++ } ++ if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { ++ btrfs_err(fs_info, "invalid chunk length %llu", length); ++ return -EIO; ++ } ++ if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { ++ btrfs_err(fs_info, "invalid chunk stripe length: %llu", ++ stripe_len); ++ return -EIO; ++ } ++ if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & ++ type) { ++ btrfs_err(fs_info, "unrecognized chunk type: %llu", ++ ~(BTRFS_BLOCK_GROUP_TYPE_MASK | ++ BTRFS_BLOCK_GROUP_PROFILE_MASK) & ++ btrfs_chunk_type(leaf, chunk)); ++ return -EIO; ++ } ++ ++ if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { ++ btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type); ++ return -EIO; ++ } ++ ++ if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && ++ (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { ++ btrfs_err(fs_info, ++ "system chunk with data or metadata type: 0x%llx", type); ++ return -EIO; ++ } ++ ++ features = btrfs_super_incompat_flags(fs_info->super_copy); ++ if (features & 
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) ++ mixed = true; ++ ++ if (!mixed) { ++ if ((type & BTRFS_BLOCK_GROUP_METADATA) && ++ (type & BTRFS_BLOCK_GROUP_DATA)) { ++ btrfs_err(fs_info, ++ "mixed chunk type in non-mixed mode: 0x%llx", type); ++ return -EIO; ++ } ++ } ++ ++ if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || ++ (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) || ++ (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || ++ (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || ++ (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || ++ ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) { ++ btrfs_err(fs_info, ++ "invalid num_stripes:sub_stripes %u:%u for profile %llu", ++ num_stripes, sub_stripes, ++ type & BTRFS_BLOCK_GROUP_PROFILE_MASK); ++ return -EIO; ++ } ++ ++ return 0; ++} ++ ++/* + * Common point to switch the item-specific validation. + */ + static int check_leaf_item(struct btrfs_fs_info *fs_info, +--- a/fs/btrfs/tree-checker.h ++++ b/fs/btrfs/tree-checker.h +@@ -25,4 +25,8 @@ int btrfs_check_leaf_relaxed(struct btrf + struct extent_buffer *leaf); + int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node); + ++int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, ++ struct extent_buffer *leaf, ++ struct btrfs_chunk *chunk, u64 logical); ++ + #endif +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -28,6 +28,7 @@ + #include "math.h" + #include "dev-replace.h" + #include "sysfs.h" ++#include "tree-checker.h" + + const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = { +@@ -6370,99 +6371,6 @@ struct btrfs_device *btrfs_alloc_device( + return dev; + } + +-/* Return -EIO if any error, otherwise return 0. 
*/ +-static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, +- struct extent_buffer *leaf, +- struct btrfs_chunk *chunk, u64 logical) +-{ +- u64 length; +- u64 stripe_len; +- u16 num_stripes; +- u16 sub_stripes; +- u64 type; +- u64 features; +- bool mixed = false; +- +- length = btrfs_chunk_length(leaf, chunk); +- stripe_len = btrfs_chunk_stripe_len(leaf, chunk); +- num_stripes = btrfs_chunk_num_stripes(leaf, chunk); +- sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); +- type = btrfs_chunk_type(leaf, chunk); +- +- if (!num_stripes) { +- btrfs_err(fs_info, "invalid chunk num_stripes: %u", +- num_stripes); +- return -EIO; +- } +- if (!IS_ALIGNED(logical, fs_info->sectorsize)) { +- btrfs_err(fs_info, "invalid chunk logical %llu", logical); +- return -EIO; +- } +- if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { +- btrfs_err(fs_info, "invalid chunk sectorsize %u", +- btrfs_chunk_sector_size(leaf, chunk)); +- return -EIO; +- } +- if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { +- btrfs_err(fs_info, "invalid chunk length %llu", length); +- return -EIO; +- } +- if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { +- btrfs_err(fs_info, "invalid chunk stripe length: %llu", +- stripe_len); +- return -EIO; +- } +- if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & +- type) { +- btrfs_err(fs_info, "unrecognized chunk type: %llu", +- ~(BTRFS_BLOCK_GROUP_TYPE_MASK | +- BTRFS_BLOCK_GROUP_PROFILE_MASK) & +- btrfs_chunk_type(leaf, chunk)); +- return -EIO; +- } +- +- if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { +- btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type); +- return -EIO; +- } +- +- if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && +- (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { +- btrfs_err(fs_info, +- "system chunk with data or metadata type: 0x%llx", type); +- return -EIO; +- } +- +- features = btrfs_super_incompat_flags(fs_info->super_copy); +- if (features & 
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) +- mixed = true; +- +- if (!mixed) { +- if ((type & BTRFS_BLOCK_GROUP_METADATA) && +- (type & BTRFS_BLOCK_GROUP_DATA)) { +- btrfs_err(fs_info, +- "mixed chunk type in non-mixed mode: 0x%llx", type); +- return -EIO; +- } +- } +- +- if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || +- (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) || +- (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || +- (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || +- (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || +- ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && +- num_stripes != 1)) { +- btrfs_err(fs_info, +- "invalid num_stripes:sub_stripes %u:%u for profile %llu", +- num_stripes, sub_stripes, +- type & BTRFS_BLOCK_GROUP_PROFILE_MASK); +- return -EIO; +- } +- +- return 0; +-} +- + static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, + u64 devid, u8 *uuid, bool error) + { diff --git a/queue-4.19/btrfs-tree-checker-check-chunk-item-at-tree-block-read-time.patch b/queue-4.19/btrfs-tree-checker-check-chunk-item-at-tree-block-read-time.patch new file mode 100644 index 00000000000..300897f957c --- /dev/null +++ b/queue-4.19/btrfs-tree-checker-check-chunk-item-at-tree-block-read-time.patch @@ -0,0 +1,74 @@ +From foo@baz Sat Nov 7 04:26:01 PM CET 2020 +From: Qu Wenruo +Date: Wed, 20 Mar 2019 13:42:33 +0800 +Subject: btrfs: tree-checker: Check chunk item at tree block read time + +From: Qu Wenruo + +commit 075cb3c78fe7976c9f29ca1fa23f9728634ecefc upstream. + +Since we have btrfs_check_chunk_valid() in tree-checker, let's do +chunk item verification in tree-checker too. + +Since the tree-checker is run at endio time, if one chunk leaf fails +chunk verification, we can still retry the other copy, making btrfs more +robust to fuzzed image as we may still get a good chunk item. 
+ +Also since we have done chunk verification in tree block read time, skip +the btrfs_check_chunk_valid() call in read_one_chunk() if we're reading +chunk items from leaf. + +Reviewed-by: Johannes Thumshirn +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-checker.c | 6 ++++++ + fs/btrfs/volumes.c | 12 +++++++++--- + 2 files changed, 15 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -608,6 +608,7 @@ static int check_leaf_item(struct btrfs_ + struct btrfs_key *key, int slot) + { + int ret = 0; ++ struct btrfs_chunk *chunk; + + switch (key->type) { + case BTRFS_EXTENT_DATA_KEY: +@@ -624,6 +625,11 @@ static int check_leaf_item(struct btrfs_ + case BTRFS_BLOCK_GROUP_ITEM_KEY: + ret = check_block_group_item(fs_info, leaf, key, slot); + break; ++ case BTRFS_CHUNK_ITEM_KEY: ++ chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); ++ ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, ++ key->offset); ++ break; + } + return ret; + } +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -6401,9 +6401,15 @@ static int read_one_chunk(struct btrfs_f + length = btrfs_chunk_length(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + +- ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical); +- if (ret) +- return ret; ++ /* ++ * Only need to verify chunk item if we're reading from sys chunk array, ++ * as chunk item in tree block is already verified by tree-checker. 
++ */ ++ if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { ++ ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical); ++ if (ret) ++ return ret; ++ } + + read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); diff --git a/queue-4.19/btrfs-tree-checker-enhance-chunk-checker-to-validate-chunk-profile.patch b/queue-4.19/btrfs-tree-checker-enhance-chunk-checker-to-validate-chunk-profile.patch new file mode 100644 index 00000000000..44dfd91a1c2 --- /dev/null +++ b/queue-4.19/btrfs-tree-checker-enhance-chunk-checker-to-validate-chunk-profile.patch @@ -0,0 +1,44 @@ +From foo@baz Sat Nov 7 04:26:01 PM CET 2020 +From: Qu Wenruo +Date: Wed, 13 Mar 2019 12:17:50 +0800 +Subject: btrfs: tree-checker: Enhance chunk checker to validate chunk profile + +From: Qu Wenruo + +commit 80e46cf22ba0bcb57b39c7c3b52961ab3a0fd5f2 upstream. + +Btrfs-progs already have a comprehensive type checker, to ensure there +is only 0 (SINGLE profile) or 1 (DUP/RAID0/1/5/6/10) bit set for chunk +profile bits. + +Do the same work for kernel. 
+ +Reported-by: Yoon Jungyeon +Link: https://bugzilla.kernel.org/show_bug.cgi?id=202765 +Reviewed-by: Nikolay Borisov +Reviewed-by: Johannes Thumshirn +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-checker.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -556,6 +556,13 @@ int btrfs_check_chunk_valid(struct btrfs + return -EUCLEAN; + } + ++ if (!is_power_of_2(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && ++ (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) { ++ chunk_err(fs_info, leaf, chunk, logical, ++ "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set", ++ type & BTRFS_BLOCK_GROUP_PROFILE_MASK); ++ return -EUCLEAN; ++ } + if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { + chunk_err(fs_info, leaf, chunk, logical, + "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx", diff --git a/queue-4.19/btrfs-tree-checker-fix-the-error-message-for-transid-error.patch b/queue-4.19/btrfs-tree-checker-fix-the-error-message-for-transid-error.patch new file mode 100644 index 00000000000..ca366bc1575 --- /dev/null +++ b/queue-4.19/btrfs-tree-checker-fix-the-error-message-for-transid-error.patch @@ -0,0 +1,36 @@ +From foo@baz Sat Nov 7 04:26:01 PM CET 2020 +From: Qu Wenruo +Date: Tue, 25 Aug 2020 21:42:51 +0800 +Subject: btrfs: tree-checker: fix the error message for transid error + +From: Qu Wenruo + +commit f96d6960abbc52e26ad124e69e6815283d3e1674 upstream. + +The error message for inode transid is the same as for inode generation, +which makes us unable to detect the real problem. 
+ +Reported-by: Tyler Richmond +Fixes: 496245cac57e ("btrfs: tree-checker: Verify inode item") +CC: stable@vger.kernel.org # 5.4+ +Reviewed-by: Marcos Paulo de Souza +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +[bwh: Backported to 4.19: adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-checker.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -715,7 +715,7 @@ static int check_inode_item(struct btrfs + /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */ + if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) { + inode_item_err(fs_info, leaf, slot, +- "invalid inode generation: has %llu expect [0, %llu]", ++ "invalid inode transid: has %llu expect [0, %llu]", + btrfs_inode_transid(leaf, iitem), super_gen + 1); + return -EUCLEAN; + } diff --git a/queue-4.19/btrfs-tree-checker-fix-wrong-check-on-max-devid.patch b/queue-4.19/btrfs-tree-checker-fix-wrong-check-on-max-devid.patch new file mode 100644 index 00000000000..b17b036c33a --- /dev/null +++ b/queue-4.19/btrfs-tree-checker-fix-wrong-check-on-max-devid.patch @@ -0,0 +1,92 @@ +From foo@baz Sat Nov 7 04:26:01 PM CET 2020 +From: Qu Wenruo +Date: Wed, 28 Aug 2019 10:33:13 +0800 +Subject: btrfs: tree-checker: Fix wrong check on max devid + +From: Qu Wenruo + +commit 8bb177d18f114358a57d8ae7e206861b48b8b4de upstream. + +[BUG] +The following script will cause false alert on devid check. + #!/bin/bash + + dev1=/dev/test/test + dev2=/dev/test/scratch1 + mnt=/mnt/btrfs + + umount $dev1 &> /dev/null + umount $dev2 &> /dev/null + umount $mnt &> /dev/null + + mkfs.btrfs -f $dev1 + + mount $dev1 $mnt + + _fail() + { + echo "!!! FAILED !!!" 
+ exit 1 + } + + for ((i = 0; i < 4096; i++)); do + btrfs dev add -f $dev2 $mnt || _fail + btrfs dev del $dev1 $mnt || _fail + dev_tmp=$dev1 + dev1=$dev2 + dev2=$dev_tmp + done + +[CAUSE] +Tree-checker uses BTRFS_MAX_DEVS() and BTRFS_MAX_DEVS_SYS_CHUNK() as +upper limit for devid. But we can have devid holes just like above +script. + +So the check for devid is incorrect and could cause false alert. + +[FIX] +Just remove the whole devid check. We don't have any hard requirement +for devid assignment. + +Furthermore, even devid could get corrupted by a bitflip, we still have +dev extents verification at mount time, so corrupted data won't sneak +in. + +This fixes fstests btrfs/194. + +Reported-by: Anand Jain +Fixes: ab4ba2e13346 ("btrfs: tree-checker: Verify dev item") +CC: stable@vger.kernel.org # 5.2+ +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +[bwh: Backported to 4.19: adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-checker.c | 7 ------- + 1 file changed, 7 deletions(-) + +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -629,7 +629,6 @@ static int check_dev_item(struct btrfs_f + struct btrfs_key *key, int slot) + { + struct btrfs_dev_item *ditem; +- u64 max_devid = max(BTRFS_MAX_DEVS(fs_info), BTRFS_MAX_DEVS_SYS_CHUNK); + + if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) { + dev_item_err(fs_info, leaf, slot, +@@ -637,12 +636,6 @@ static int check_dev_item(struct btrfs_f + key->objectid, BTRFS_DEV_ITEMS_OBJECTID); + return -EUCLEAN; + } +- if (key->offset > max_devid) { +- dev_item_err(fs_info, leaf, slot, +- "invalid devid: has=%llu expect=[0, %llu]", +- key->offset, max_devid); +- return -EUCLEAN; +- } + ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); + if (btrfs_device_id(leaf, ditem) != key->offset) { + dev_item_err(fs_info, leaf, slot, diff --git a/queue-4.19/btrfs-tree-checker-make-btrfs_check_chunk_valid-return-euclean-instead-of-eio.patch 
b/queue-4.19/btrfs-tree-checker-make-btrfs_check_chunk_valid-return-euclean-instead-of-eio.patch new file mode 100644 index 00000000000..969792e8e2e --- /dev/null +++ b/queue-4.19/btrfs-tree-checker-make-btrfs_check_chunk_valid-return-euclean-instead-of-eio.patch @@ -0,0 +1,114 @@ +From foo@baz Sat Nov 7 04:26:01 PM CET 2020 +From: Qu Wenruo +Date: Wed, 20 Mar 2019 13:39:14 +0800 +Subject: btrfs: tree-checker: Make btrfs_check_chunk_valid() return EUCLEAN instead of EIO + +From: Qu Wenruo + +commit bf871c3b43b1dcc3f2a076ff39a8f1ce7959d958 upstream. + +To follow the standard behavior of tree-checker. + +Reviewed-by: Johannes Thumshirn +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +[bwh: Cherry-picked for 4.19 to ease backporting later fixes] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-checker.c | 22 +++++++++++----------- + 1 file changed, 11 insertions(+), 11 deletions(-) + +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -496,7 +496,7 @@ static void chunk_err(const struct btrfs + /* + * The common chunk check which could also work on super block sys chunk array. + * +- * Return -EIO if anything is corrupted. ++ * Return -EUCLEAN if anything is corrupted. + * Return 0 if everything is OK. 
+ */ + int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, +@@ -520,31 +520,31 @@ int btrfs_check_chunk_valid(struct btrfs + if (!num_stripes) { + chunk_err(fs_info, leaf, chunk, logical, + "invalid chunk num_stripes, have %u", num_stripes); +- return -EIO; ++ return -EUCLEAN; + } + if (!IS_ALIGNED(logical, fs_info->sectorsize)) { + chunk_err(fs_info, leaf, chunk, logical, + "invalid chunk logical, have %llu should aligned to %u", + logical, fs_info->sectorsize); +- return -EIO; ++ return -EUCLEAN; + } + if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { + chunk_err(fs_info, leaf, chunk, logical, + "invalid chunk sectorsize, have %u expect %u", + btrfs_chunk_sector_size(leaf, chunk), + fs_info->sectorsize); +- return -EIO; ++ return -EUCLEAN; + } + if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { + chunk_err(fs_info, leaf, chunk, logical, + "invalid chunk length, have %llu", length); +- return -EIO; ++ return -EUCLEAN; + } + if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { + chunk_err(fs_info, leaf, chunk, logical, + "invalid chunk stripe length: %llu", + stripe_len); +- return -EIO; ++ return -EUCLEAN; + } + if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & + type) { +@@ -553,14 +553,14 @@ int btrfs_check_chunk_valid(struct btrfs + ~(BTRFS_BLOCK_GROUP_TYPE_MASK | + BTRFS_BLOCK_GROUP_PROFILE_MASK) & + btrfs_chunk_type(leaf, chunk)); +- return -EIO; ++ return -EUCLEAN; + } + + if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { + chunk_err(fs_info, leaf, chunk, logical, + "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx", + type, BTRFS_BLOCK_GROUP_TYPE_MASK); +- return -EIO; ++ return -EUCLEAN; + } + + if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && +@@ -568,7 +568,7 @@ int btrfs_check_chunk_valid(struct btrfs + chunk_err(fs_info, leaf, chunk, logical, + "system chunk with data or metadata type: 0x%llx", + type); +- return -EIO; ++ return -EUCLEAN; + } + + features = 
btrfs_super_incompat_flags(fs_info->super_copy); +@@ -580,7 +580,7 @@ int btrfs_check_chunk_valid(struct btrfs + (type & BTRFS_BLOCK_GROUP_DATA)) { + chunk_err(fs_info, leaf, chunk, logical, + "mixed chunk type in non-mixed mode: 0x%llx", type); +- return -EIO; ++ return -EUCLEAN; + } + } + +@@ -594,7 +594,7 @@ int btrfs_check_chunk_valid(struct btrfs + "invalid num_stripes:sub_stripes %u:%u for profile %llu", + num_stripes, sub_stripes, + type & BTRFS_BLOCK_GROUP_PROFILE_MASK); +- return -EIO; ++ return -EUCLEAN; + } + + return 0; diff --git a/queue-4.19/btrfs-tree-checker-make-chunk-item-checker-messages-more-readable.patch b/queue-4.19/btrfs-tree-checker-make-chunk-item-checker-messages-more-readable.patch new file mode 100644 index 00000000000..60c7cf3aecf --- /dev/null +++ b/queue-4.19/btrfs-tree-checker-make-chunk-item-checker-messages-more-readable.patch @@ -0,0 +1,172 @@ +From foo@baz Sat Nov 7 04:26:01 PM CET 2020 +From: Qu Wenruo +Date: Wed, 20 Mar 2019 13:36:06 +0800 +Subject: btrfs: tree-checker: Make chunk item checker messages more readable + +From: Qu Wenruo + +commit f114024376bceb1c0f61a7bad4a72a0f978767af upstream. + +Old error message would be something like: + BTRFS error (device dm-3): invalid chunk num_stripes: 0 + +New error message would be: + Btrfs critical (device dm-3): corrupt superblock syschunk array: chunk_start=2097152, invalid chunk num_stripes: 0 +Or + Btrfs critical (device dm-3): corrupt leaf: root=3 block=8388608 slot=3 chunk_start=2097152, invalid chunk num_stripes: 0 + +And for certain error messages, also output the expected value. + +The error message levels are changed from error to critical.
+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +[bwh: Cherry-picked for 4.19 to ease backporting later fixes] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-checker.c | 81 ++++++++++++++++++++++++++++++++++++++++-------- + 1 file changed, 68 insertions(+), 13 deletions(-) + +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -448,6 +448,51 @@ static int check_block_group_item(struct + return 0; + } + ++__printf(5, 6) ++__cold ++static void chunk_err(const struct btrfs_fs_info *fs_info, ++ const struct extent_buffer *leaf, ++ const struct btrfs_chunk *chunk, u64 logical, ++ const char *fmt, ...) ++{ ++ bool is_sb; ++ struct va_format vaf; ++ va_list args; ++ int i; ++ int slot = -1; ++ ++ /* Only superblock eb is able to have such small offset */ ++ is_sb = (leaf->start == BTRFS_SUPER_INFO_OFFSET); ++ ++ if (!is_sb) { ++ /* ++ * Get the slot number by iterating through all slots, this ++ * would provide better readability. ++ */ ++ for (i = 0; i < btrfs_header_nritems(leaf); i++) { ++ if (btrfs_item_ptr_offset(leaf, i) == ++ (unsigned long)chunk) { ++ slot = i; ++ break; ++ } ++ } ++ } ++ va_start(args, fmt); ++ vaf.fmt = fmt; ++ vaf.va = &args; ++ ++ if (is_sb) ++ btrfs_crit(fs_info, ++ "corrupt superblock syschunk array: chunk_start=%llu, %pV", ++ logical, &vaf); ++ else ++ btrfs_crit(fs_info, ++ "corrupt leaf: root=%llu block=%llu slot=%d chunk_start=%llu, %pV", ++ BTRFS_CHUNK_TREE_OBJECTID, leaf->start, slot, ++ logical, &vaf); ++ va_end(args); ++} ++ + /* + * The common chunk check which could also work on super block sys chunk array. 
+ * +@@ -473,31 +518,38 @@ int btrfs_check_chunk_valid(struct btrfs + type = btrfs_chunk_type(leaf, chunk); + + if (!num_stripes) { +- btrfs_err(fs_info, "invalid chunk num_stripes: %u", +- num_stripes); ++ chunk_err(fs_info, leaf, chunk, logical, ++ "invalid chunk num_stripes, have %u", num_stripes); + return -EIO; + } + if (!IS_ALIGNED(logical, fs_info->sectorsize)) { +- btrfs_err(fs_info, "invalid chunk logical %llu", logical); ++ chunk_err(fs_info, leaf, chunk, logical, ++ "invalid chunk logical, have %llu should aligned to %u", ++ logical, fs_info->sectorsize); + return -EIO; + } + if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { +- btrfs_err(fs_info, "invalid chunk sectorsize %u", +- btrfs_chunk_sector_size(leaf, chunk)); ++ chunk_err(fs_info, leaf, chunk, logical, ++ "invalid chunk sectorsize, have %u expect %u", ++ btrfs_chunk_sector_size(leaf, chunk), ++ fs_info->sectorsize); + return -EIO; + } + if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { +- btrfs_err(fs_info, "invalid chunk length %llu", length); ++ chunk_err(fs_info, leaf, chunk, logical, ++ "invalid chunk length, have %llu", length); + return -EIO; + } + if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { +- btrfs_err(fs_info, "invalid chunk stripe length: %llu", ++ chunk_err(fs_info, leaf, chunk, logical, ++ "invalid chunk stripe length: %llu", + stripe_len); + return -EIO; + } + if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & + type) { +- btrfs_err(fs_info, "unrecognized chunk type: %llu", ++ chunk_err(fs_info, leaf, chunk, logical, ++ "unrecognized chunk type: 0x%llx", + ~(BTRFS_BLOCK_GROUP_TYPE_MASK | + BTRFS_BLOCK_GROUP_PROFILE_MASK) & + btrfs_chunk_type(leaf, chunk)); +@@ -505,14 +557,17 @@ int btrfs_check_chunk_valid(struct btrfs + } + + if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { +- btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type); ++ chunk_err(fs_info, leaf, chunk, logical, ++ "missing chunk type flag, have 
0x%llx one bit must be set in 0x%llx", ++ type, BTRFS_BLOCK_GROUP_TYPE_MASK); + return -EIO; + } + + if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && + (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { +- btrfs_err(fs_info, +- "system chunk with data or metadata type: 0x%llx", type); ++ chunk_err(fs_info, leaf, chunk, logical, ++ "system chunk with data or metadata type: 0x%llx", ++ type); + return -EIO; + } + +@@ -523,7 +578,7 @@ int btrfs_check_chunk_valid(struct btrfs + if (!mixed) { + if ((type & BTRFS_BLOCK_GROUP_METADATA) && + (type & BTRFS_BLOCK_GROUP_DATA)) { +- btrfs_err(fs_info, ++ chunk_err(fs_info, leaf, chunk, logical, + "mixed chunk type in non-mixed mode: 0x%llx", type); + return -EIO; + } +@@ -535,7 +590,7 @@ int btrfs_check_chunk_valid(struct btrfs + (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || + (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || + ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) { +- btrfs_err(fs_info, ++ chunk_err(fs_info, leaf, chunk, logical, + "invalid num_stripes:sub_stripes %u:%u for profile %llu", + num_stripes, sub_stripes, + type & BTRFS_BLOCK_GROUP_PROFILE_MASK); diff --git a/queue-4.19/btrfs-tree-checker-verify-dev-item.patch b/queue-4.19/btrfs-tree-checker-verify-dev-item.patch new file mode 100644 index 00000000000..7ceb4bd06d2 --- /dev/null +++ b/queue-4.19/btrfs-tree-checker-verify-dev-item.patch @@ -0,0 +1,172 @@ +From foo@baz Sat Nov 7 04:26:01 PM CET 2020 +From: Qu Wenruo +Date: Fri, 8 Mar 2019 14:20:03 +0800 +Subject: btrfs: tree-checker: Verify dev item + +From: Qu Wenruo + +commit ab4ba2e133463c702b37242560d7fabedd2dc750 upstream. 
+ +[BUG] +For fuzzed image whose DEV_ITEM has invalid total_bytes as 0, then +kernel will just panic: + BUG: unable to handle kernel NULL pointer dereference at 0000000000000098 + #PF error: [normal kernel read fault] + PGD 800000022b2bd067 P4D 800000022b2bd067 PUD 22b2bc067 PMD 0 + Oops: 0000 [#1] SMP PTI + CPU: 0 PID: 1106 Comm: mount Not tainted 5.0.0-rc8+ #9 + RIP: 0010:btrfs_verify_dev_extents+0x2a5/0x5a0 + Call Trace: + open_ctree+0x160d/0x2149 + btrfs_mount_root+0x5b2/0x680 + +[CAUSE] +If device extent verification finds a device with 0 total_bytes, then it +assumes it's a seed dummy, then searches for seed devices. + +But in this case, there is no seed device at all, causing a NULL pointer +dereference. + +[FIX] +Since this is caused by fuzzed image, let's go the tree-check way, just +add a new verification for device item. + +Reported-by: Yoon Jungyeon +Link: https://bugzilla.kernel.org/show_bug.cgi?id=202691 +Reviewed-by: Nikolay Borisov +Signed-off-by: Qu Wenruo +Reviewed-by: Johannes Thumshirn +Signed-off-by: David Sterba +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-checker.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++ + fs/btrfs/volumes.c | 9 ----- + fs/btrfs/volumes.h | 9 +++++ + 3 files changed, 83 insertions(+), 9 deletions(-) + +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -600,6 +600,77 @@ int btrfs_check_chunk_valid(struct btrfs + return 0; + } + ++__printf(4, 5) ++__cold ++static void dev_item_err(const struct btrfs_fs_info *fs_info, ++ const struct extent_buffer *eb, int slot, ++ const char *fmt, ...) ++{ ++ struct btrfs_key key; ++ struct va_format vaf; ++ va_list args; ++ ++ btrfs_item_key_to_cpu(eb, &key, slot); ++ va_start(args, fmt); ++ ++ vaf.fmt = fmt; ++ vaf.va = &args; ++ ++ btrfs_crit(fs_info, ++ "corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV", ++ btrfs_header_level(eb) == 0 ?
"leaf" : "node", ++ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, ++ key.objectid, &vaf); ++ va_end(args); ++} ++ ++static int check_dev_item(struct btrfs_fs_info *fs_info, ++ struct extent_buffer *leaf, ++ struct btrfs_key *key, int slot) ++{ ++ struct btrfs_dev_item *ditem; ++ u64 max_devid = max(BTRFS_MAX_DEVS(fs_info), BTRFS_MAX_DEVS_SYS_CHUNK); ++ ++ if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) { ++ dev_item_err(fs_info, leaf, slot, ++ "invalid objectid: has=%llu expect=%llu", ++ key->objectid, BTRFS_DEV_ITEMS_OBJECTID); ++ return -EUCLEAN; ++ } ++ if (key->offset > max_devid) { ++ dev_item_err(fs_info, leaf, slot, ++ "invalid devid: has=%llu expect=[0, %llu]", ++ key->offset, max_devid); ++ return -EUCLEAN; ++ } ++ ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); ++ if (btrfs_device_id(leaf, ditem) != key->offset) { ++ dev_item_err(fs_info, leaf, slot, ++ "devid mismatch: key has=%llu item has=%llu", ++ key->offset, btrfs_device_id(leaf, ditem)); ++ return -EUCLEAN; ++ } ++ ++ /* ++ * For device total_bytes, we don't have reliable way to check it, as ++ * it can be 0 for device removal. Device size check can only be done ++ * by dev extents check. ++ */ ++ if (btrfs_device_bytes_used(leaf, ditem) > ++ btrfs_device_total_bytes(leaf, ditem)) { ++ dev_item_err(fs_info, leaf, slot, ++ "invalid bytes used: have %llu expect [0, %llu]", ++ btrfs_device_bytes_used(leaf, ditem), ++ btrfs_device_total_bytes(leaf, ditem)); ++ return -EUCLEAN; ++ } ++ /* ++ * Remaining members like io_align/type/gen/dev_group aren't really ++ * utilized. Skip them to make later usage of them easier. ++ */ ++ return 0; ++} ++ + /* + * Common point to switch the item-specific validation. 
+ */ +@@ -630,6 +701,9 @@ static int check_leaf_item(struct btrfs_ + ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, + key->offset); + break; ++ case BTRFS_DEV_ITEM_KEY: ++ ret = check_dev_item(fs_info, leaf, key, slot); ++ break; + } + return ret; + } +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -4606,15 +4606,6 @@ static void check_raid56_incompat_flag(s + btrfs_set_fs_incompat(info, RAID56); + } + +-#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ +- - sizeof(struct btrfs_chunk)) \ +- / sizeof(struct btrfs_stripe) + 1) +- +-#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ +- - 2 * sizeof(struct btrfs_disk_key) \ +- - 2 * sizeof(struct btrfs_chunk)) \ +- / sizeof(struct btrfs_stripe) + 1) +- + static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + u64 start, u64 type) + { +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -257,6 +257,15 @@ struct btrfs_fs_devices { + + #define BTRFS_BIO_INLINE_CSUM_SIZE 64 + ++#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ ++ - sizeof(struct btrfs_chunk)) \ ++ / sizeof(struct btrfs_stripe) + 1) ++ ++#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ ++ - 2 * sizeof(struct btrfs_disk_key) \ ++ - 2 * sizeof(struct btrfs_chunk)) \ ++ / sizeof(struct btrfs_stripe) + 1) ++ + /* + * we need the mirror number and stripe index to be passed around + * the call chain while we are processing end_io (especially errors). diff --git a/queue-4.19/btrfs-tree-checker-verify-inode-item.patch b/queue-4.19/btrfs-tree-checker-verify-inode-item.patch new file mode 100644 index 00000000000..6b2e6586ea4 --- /dev/null +++ b/queue-4.19/btrfs-tree-checker-verify-inode-item.patch @@ -0,0 +1,183 @@ +From foo@baz Sat Nov 7 04:26:01 PM CET 2020 +From: Qu Wenruo +Date: Wed, 13 Mar 2019 14:31:35 +0800 +Subject: btrfs: tree-checker: Verify inode item + +From: Qu Wenruo + +commit 496245cac57e26d8b738d85c7a29cf9a47610f3f upstream. 
+ +There is a report in kernel bugzilla about mismatched file type in dir +item and inode item. + +This inspires us to check inode mode in inode item. + +This patch will check the following members: + +- inode key objectid + Should be ROOT_DIR_DIR or [256, (u64)-256] or FREE_INO. + +- inode key offset + Should be 0 + +- inode item generation +- inode item transid + No newer than sb generation + 1. + The +1 is for log tree. + +- inode item mode + No unknown bits. + No invalid S_IF* bit. + NOTE: S_IFMT check is not enough, need to check every known type. + +- inode item nlink + Dir should have no more than 1 link. + +- inode item flags + +Reviewed-by: Nikolay Borisov +Reviewed-by: Johannes Thumshirn +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/ctree.h | 15 +++++++ + fs/btrfs/tree-checker.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 109 insertions(+) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -1459,6 +1459,21 @@ do { + + #define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) + ++#define BTRFS_INODE_FLAG_MASK \ ++ (BTRFS_INODE_NODATASUM | \ ++ BTRFS_INODE_NODATACOW | \ ++ BTRFS_INODE_READONLY | \ ++ BTRFS_INODE_NOCOMPRESS | \ ++ BTRFS_INODE_PREALLOC | \ ++ BTRFS_INODE_SYNC | \ ++ BTRFS_INODE_IMMUTABLE | \ ++ BTRFS_INODE_APPEND | \ ++ BTRFS_INODE_NODUMP | \ ++ BTRFS_INODE_NOATIME | \ ++ BTRFS_INODE_DIRSYNC | \ ++ BTRFS_INODE_COMPRESS | \ ++ BTRFS_INODE_ROOT_ITEM_INIT) ++ + struct btrfs_map_token { + const struct extent_buffer *eb; + char *kaddr; +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -671,6 +671,97 @@ static int check_dev_item(struct btrfs_f + return 0; + } + ++/* Inode item error output has the same format as dir_item_err() */ ++#define inode_item_err(fs_info, eb, slot, fmt, ...)
\ ++ dir_item_err(fs_info, eb, slot, fmt, __VA_ARGS__) ++ ++static int check_inode_item(struct btrfs_fs_info *fs_info, ++ struct extent_buffer *leaf, ++ struct btrfs_key *key, int slot) ++{ ++ struct btrfs_inode_item *iitem; ++ u64 super_gen = btrfs_super_generation(fs_info->super_copy); ++ u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777); ++ u32 mode; ++ ++ if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID || ++ key->objectid > BTRFS_LAST_FREE_OBJECTID) && ++ key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID && ++ key->objectid != BTRFS_FREE_INO_OBJECTID) { ++ generic_err(fs_info, leaf, slot, ++ "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu", ++ key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID, ++ BTRFS_FIRST_FREE_OBJECTID, ++ BTRFS_LAST_FREE_OBJECTID, ++ BTRFS_FREE_INO_OBJECTID); ++ return -EUCLEAN; ++ } ++ if (key->offset != 0) { ++ inode_item_err(fs_info, leaf, slot, ++ "invalid key offset: has %llu expect 0", ++ key->offset); ++ return -EUCLEAN; ++ } ++ iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item); ++ ++ /* Here we use super block generation + 1 to handle log tree */ ++ if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) { ++ inode_item_err(fs_info, leaf, slot, ++ "invalid inode generation: has %llu expect (0, %llu]", ++ btrfs_inode_generation(leaf, iitem), ++ super_gen + 1); ++ return -EUCLEAN; ++ } ++ /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */ ++ if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) { ++ inode_item_err(fs_info, leaf, slot, ++ "invalid inode generation: has %llu expect [0, %llu]", ++ btrfs_inode_transid(leaf, iitem), super_gen + 1); ++ return -EUCLEAN; ++ } ++ ++ /* ++ * For size and nbytes it's better not to be too strict, as for dir ++ * item its size/nbytes can easily get wrong, but doesn't affect ++ * anything in the fs. So here we skip the check. 
++ */ ++ mode = btrfs_inode_mode(leaf, iitem); ++ if (mode & ~valid_mask) { ++ inode_item_err(fs_info, leaf, slot, ++ "unknown mode bit detected: 0x%x", ++ mode & ~valid_mask); ++ return -EUCLEAN; ++ } ++ ++ /* ++ * S_IFMT is not bit mapped so we can't completely rely on is_power_of_2, ++ * but is_power_of_2() can save us from checking FIFO/CHR/DIR/REG. ++ * Only needs to check BLK, LNK and SOCKS ++ */ ++ if (!is_power_of_2(mode & S_IFMT)) { ++ if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) { ++ inode_item_err(fs_info, leaf, slot, ++ "invalid mode: has 0%o expect valid S_IF* bit(s)", ++ mode & S_IFMT); ++ return -EUCLEAN; ++ } ++ } ++ if (S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1) { ++ inode_item_err(fs_info, leaf, slot, ++ "invalid nlink: has %u expect no more than 1 for dir", ++ btrfs_inode_nlink(leaf, iitem)); ++ return -EUCLEAN; ++ } ++ if (btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) { ++ inode_item_err(fs_info, leaf, slot, ++ "unknown flags detected: 0x%llx", ++ btrfs_inode_flags(leaf, iitem) & ++ ~BTRFS_INODE_FLAG_MASK); ++ return -EUCLEAN; ++ } ++ return 0; ++} ++ + /* + * Common point to switch the item-specific validation. 
+ */ +@@ -704,6 +795,9 @@ static int check_leaf_item(struct btrfs_ + case BTRFS_DEV_ITEM_KEY: + ret = check_dev_item(fs_info, leaf, key, slot); + break; ++ case BTRFS_INODE_ITEM_KEY: ++ ret = check_inode_item(fs_info, leaf, key, slot); ++ break; + } + return ret; + } diff --git a/queue-4.19/revert-btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch b/queue-4.19/revert-btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch new file mode 100644 index 00000000000..96202a2026d --- /dev/null +++ b/queue-4.19/revert-btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch @@ -0,0 +1,35 @@ +From foo@baz Sat Nov 7 04:26:01 PM CET 2020 +From: Ben Hutchings +Date: Mon, 12 Oct 2020 23:18:11 +0100 +Subject: Revert "btrfs: flush write bio if we loop in extent_write_cache_pages" + +From: Ben Hutchings + +This reverts commit 860473714cbe7fbedcf92bfe3eb6d69fae8c74ff. That +has an incorrect upstream commit reference, and was modified in a way +that conflicts with some older fixes. We can cleanly cherry-pick the +upstream commit *after* those fixes. + +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent_io.c | 8 -------- + 1 file changed, 8 deletions(-) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -4045,14 +4045,6 @@ retry: + */ + scanned = 1; + index = 0; +- +- /* +- * If we're looping we could run into a page that is locked by a +- * writer and that writer could be waiting on writeback for a +- * page in our current bio, and thus deadlock, so flush the +- * write bio here. 
+- */ +- flush_write_bio(epd); + goto retry; + } + diff --git a/queue-4.19/series b/queue-4.19/series index be61efd9a6a..7f6d2d70a5e 100644 --- a/queue-4.19/series +++ b/queue-4.19/series @@ -9,3 +9,22 @@ gianfar-account-for-tx-ptp-timestamp-in-the-skb-headroom.patch net-usb-qmi_wwan-add-telit-le910cx-0x1230-composition.patch sctp-fix-comm_lost-cant_str_assoc-err-reporting-on-big-endian-platforms.patch sfp-fix-error-handing-in-sfp_probe.patch +blktrace-fix-debugfs-use-after-free.patch +btrfs-extent_io-kill-the-forward-declaration-of-flush_write_bio.patch +btrfs-extent_io-move-the-bug_on-in-flush_write_bio-one-level-up.patch +revert-btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch +btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch +btrfs-extent_io-handle-errors-better-in-extent_write_full_page.patch +btrfs-extent_io-handle-errors-better-in-btree_write_cache_pages.patch +btrfs-extent_io-add-proper-error-handling-to-lock_extent_buffer_for_io.patch +btrfs-fix-unwritten-extent-buffers-and-hangs-on-future-writeback-attempts.patch +btrfs-don-t-submit-any-btree-write-bio-if-the-fs-has-errors.patch +btrfs-move-btrfs_check_chunk_valid-to-tree-check.-and-export-it.patch +btrfs-tree-checker-make-chunk-item-checker-messages-more-readable.patch +btrfs-tree-checker-make-btrfs_check_chunk_valid-return-euclean-instead-of-eio.patch +btrfs-tree-checker-check-chunk-item-at-tree-block-read-time.patch +btrfs-tree-checker-verify-dev-item.patch +btrfs-tree-checker-fix-wrong-check-on-max-devid.patch +btrfs-tree-checker-enhance-chunk-checker-to-validate-chunk-profile.patch +btrfs-tree-checker-verify-inode-item.patch +btrfs-tree-checker-fix-the-error-message-for-transid-error.patch -- 2.47.3