From 3014efe7f4d624a8c0dff874d2fee17b9d1b42ce Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 6 Jun 2022 12:28:27 +0200 Subject: [PATCH] 5.18-stable patches added patches: btrfs-add-0x-prefix-for-unsupported-optional-features.patch btrfs-fix-deadlock-between-concurrent-dio-writes-when-low-on-free-data-space.patch btrfs-fix-the-error-handling-for-submit_extent_page-for-btrfs_do_readpage.patch btrfs-repair-super-block-num_devices-automatically.patch btrfs-return-correct-error-number-for-__extent_writepage_io.patch btrfs-zoned-finish-block-group-when-there-are-no-more-allocatable-bytes-left.patch btrfs-zoned-fix-comparison-of-alloc_offset-vs-meta_write_pointer.patch btrfs-zoned-properly-finish-block-group-on-metadata-write.patch btrfs-zoned-zone-finish-unused-block-group.patch --- ...ix-for-unsupported-optional-features.patch | 47 +++ ...o-writes-when-low-on-free-data-space.patch | 348 ++++++++++++++++++ ...it_extent_page-for-btrfs_do_readpage.patch | 71 ++++ ...uper-block-num_devices-automatically.patch | 94 +++++ ...ror-number-for-__extent_writepage_io.patch | 88 +++++ ...e-are-no-more-allocatable-bytes-left.patch | 51 +++ ...f-alloc_offset-vs-meta_write_pointer.patch | 34 ++ ...finish-block-group-on-metadata-write.patch | 144 ++++++++ ...zoned-zone-finish-unused-block-group.patch | 49 +++ queue-5.18/series | 9 + 10 files changed, 935 insertions(+) create mode 100644 queue-5.18/btrfs-add-0x-prefix-for-unsupported-optional-features.patch create mode 100644 queue-5.18/btrfs-fix-deadlock-between-concurrent-dio-writes-when-low-on-free-data-space.patch create mode 100644 queue-5.18/btrfs-fix-the-error-handling-for-submit_extent_page-for-btrfs_do_readpage.patch create mode 100644 queue-5.18/btrfs-repair-super-block-num_devices-automatically.patch create mode 100644 queue-5.18/btrfs-return-correct-error-number-for-__extent_writepage_io.patch create mode 100644 queue-5.18/btrfs-zoned-finish-block-group-when-there-are-no-more-allocatable-bytes-left.patch create 
mode 100644 queue-5.18/btrfs-zoned-fix-comparison-of-alloc_offset-vs-meta_write_pointer.patch create mode 100644 queue-5.18/btrfs-zoned-properly-finish-block-group-on-metadata-write.patch create mode 100644 queue-5.18/btrfs-zoned-zone-finish-unused-block-group.patch diff --git a/queue-5.18/btrfs-add-0x-prefix-for-unsupported-optional-features.patch b/queue-5.18/btrfs-add-0x-prefix-for-unsupported-optional-features.patch new file mode 100644 index 00000000000..15b72ce3117 --- /dev/null +++ b/queue-5.18/btrfs-add-0x-prefix-for-unsupported-optional-features.patch @@ -0,0 +1,47 @@ +From d5321a0fa8bc49f11bea0b470800962c17d92d8f Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Tue, 10 May 2022 15:10:18 +0800 +Subject: btrfs: add "0x" prefix for unsupported optional features + +From: Qu Wenruo + +commit d5321a0fa8bc49f11bea0b470800962c17d92d8f upstream. + +The following error message obviously lacks the "0x" prefix: + + cannot mount because of unsupported optional features (4000) + +Add the prefix to make it less confusing. This can happen on older +kernels that try to mount a filesystem with newer features so it makes +sense to backport to older trees. 
+ +CC: stable@vger.kernel.org # 4.14+ +Reviewed-by: Nikolay Borisov +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/disk-io.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -3611,7 +3611,7 @@ int __cold open_ctree(struct super_block + ~BTRFS_FEATURE_INCOMPAT_SUPP; + if (features) { + btrfs_err(fs_info, +- "cannot mount because of unsupported optional features (%llx)", ++ "cannot mount because of unsupported optional features (0x%llx)", + features); + err = -EINVAL; + goto fail_alloc; +@@ -3649,7 +3649,7 @@ int __cold open_ctree(struct super_block + ~BTRFS_FEATURE_COMPAT_RO_SUPP; + if (!sb_rdonly(sb) && features) { + btrfs_err(fs_info, +- "cannot mount read-write because of unsupported optional features (%llx)", ++ "cannot mount read-write because of unsupported optional features (0x%llx)", + features); + err = -EINVAL; + goto fail_alloc; diff --git a/queue-5.18/btrfs-fix-deadlock-between-concurrent-dio-writes-when-low-on-free-data-space.patch b/queue-5.18/btrfs-fix-deadlock-between-concurrent-dio-writes-when-low-on-free-data-space.patch new file mode 100644 index 00000000000..f7514eb577f --- /dev/null +++ b/queue-5.18/btrfs-fix-deadlock-between-concurrent-dio-writes-when-low-on-free-data-space.patch @@ -0,0 +1,348 @@ +From f5585f4f0ef5b17026bbd60fbff6fcc91b99d5bf Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Thu, 28 Apr 2022 14:59:46 +0100 +Subject: btrfs: fix deadlock between concurrent dio writes when low on free data space + +From: Filipe Manana + +commit f5585f4f0ef5b17026bbd60fbff6fcc91b99d5bf upstream. + +When reserving data space for a direct IO write we can end up deadlocking +if we have multiple tasks attempting a write to the same file range, there +are multiple extents covered by that file range, we are low on available +space for data and the writes don't expand the inode's i_size. 
+ +The deadlock can happen like this: + +1) We have a file with an i_size of 1M, at offset 0 it has an extent with + a size of 128K and at offset 128K it has another extent also with a + size of 128K; + +2) Task A does a direct IO write against file range [0, 256K), and because + the write is within the i_size boundary, it takes the inode's lock (VFS + level) in shared mode; + +3) Task A locks the file range [0, 256K) at btrfs_dio_iomap_begin(), and + then gets the extent map for the extent covering the range [0, 128K). + At btrfs_get_blocks_direct_write(), it creates an ordered extent for + that file range ([0, 128K)); + +4) Before returning from btrfs_dio_iomap_begin(), it unlocks the file + range [0, 256K); + +5) Task A executes btrfs_dio_iomap_begin() again, this time for the file + range [128K, 256K), and locks the file range [128K, 256K); + +6) Task B starts a direct IO write against file range [0, 256K) as well. + It also locks the inode in shared mode, as it's within the i_size limit, + and then tries to lock file range [0, 256K). It is able to lock the + subrange [0, 128K) but then blocks waiting for the range [128K, 256K), + as it is currently locked by task A; + +7) Task A enters btrfs_get_blocks_direct_write() and tries to reserve data + space. Because we are low on available free space, it triggers the + async data reclaim task, and waits for it to reserve data space; + +8) The async reclaim task decides to wait for all existing ordered extents + to complete (through btrfs_wait_ordered_roots()). + It finds the ordered extent previously created by task A for the file + range [0, 128K) and waits for it to complete; + +9) The ordered extent for the file range [0, 128K) can not complete + because it blocks at btrfs_finish_ordered_io() when trying to lock the + file range [0, 128K). 
+ + This results in a deadlock, because: + + - task B is holding the file range [0, 128K) locked, waiting for the + range [128K, 256K) to be unlocked by task A; + + - task A is holding the file range [128K, 256K) locked and it's waiting + for the async data reclaim task to satisfy its space reservation + request; + + - the async data reclaim task is waiting for ordered extent [0, 128K) + to complete, but the ordered extent can not complete because the + file range [0, 128K) is currently locked by task B, which is waiting + on task A to unlock file range [128K, 256K) and task A waiting + on the async data reclaim task. + + This results in a deadlock between 4 task: task A, task B, the async + data reclaim task and the task doing ordered extent completion (a work + queue task). + +This type of deadlock can sporadically be triggered by the test case +generic/300 from fstests, and results in a stack trace like the following: + +[12084.033689] INFO: task kworker/u16:7:123749 blocked for more than 241 seconds. +[12084.034877] Not tainted 5.18.0-rc2-btrfs-next-115 #1 +[12084.035562] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. +[12084.036548] task:kworker/u16:7 state:D stack: 0 pid:123749 ppid: 2 flags:0x00004000 +[12084.036554] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs] +[12084.036599] Call Trace: +[12084.036601] +[12084.036606] __schedule+0x3cb/0xed0 +[12084.036616] schedule+0x4e/0xb0 +[12084.036620] btrfs_start_ordered_extent+0x109/0x1c0 [btrfs] +[12084.036651] ? prepare_to_wait_exclusive+0xc0/0xc0 +[12084.036659] btrfs_run_ordered_extent_work+0x1a/0x30 [btrfs] +[12084.036688] btrfs_work_helper+0xf8/0x400 [btrfs] +[12084.036719] ? lock_is_held_type+0xe8/0x140 +[12084.036727] process_one_work+0x252/0x5a0 +[12084.036736] ? process_one_work+0x5a0/0x5a0 +[12084.036738] worker_thread+0x52/0x3b0 +[12084.036743] ? process_one_work+0x5a0/0x5a0 +[12084.036745] kthread+0xf2/0x120 +[12084.036747] ? 
kthread_complete_and_exit+0x20/0x20 +[12084.036751] ret_from_fork+0x22/0x30 +[12084.036765] +[12084.036769] INFO: task kworker/u16:11:153787 blocked for more than 241 seconds. +[12084.037702] Not tainted 5.18.0-rc2-btrfs-next-115 #1 +[12084.038540] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. +[12084.039506] task:kworker/u16:11 state:D stack: 0 pid:153787 ppid: 2 flags:0x00004000 +[12084.039511] Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs] +[12084.039551] Call Trace: +[12084.039553] +[12084.039557] __schedule+0x3cb/0xed0 +[12084.039566] schedule+0x4e/0xb0 +[12084.039569] schedule_timeout+0xed/0x130 +[12084.039573] ? mark_held_locks+0x50/0x80 +[12084.039578] ? _raw_spin_unlock_irq+0x24/0x50 +[12084.039580] ? lockdep_hardirqs_on+0x7d/0x100 +[12084.039585] __wait_for_common+0xaf/0x1f0 +[12084.039587] ? usleep_range_state+0xb0/0xb0 +[12084.039596] btrfs_wait_ordered_extents+0x3d6/0x470 [btrfs] +[12084.039636] btrfs_wait_ordered_roots+0x175/0x240 [btrfs] +[12084.039670] flush_space+0x25b/0x630 [btrfs] +[12084.039712] btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs] +[12084.039747] process_one_work+0x252/0x5a0 +[12084.039756] ? process_one_work+0x5a0/0x5a0 +[12084.039758] worker_thread+0x52/0x3b0 +[12084.039762] ? process_one_work+0x5a0/0x5a0 +[12084.039765] kthread+0xf2/0x120 +[12084.039766] ? kthread_complete_and_exit+0x20/0x20 +[12084.039770] ret_from_fork+0x22/0x30 +[12084.039783] +[12084.039800] INFO: task kworker/u16:17:217907 blocked for more than 241 seconds. +[12084.040709] Not tainted 5.18.0-rc2-btrfs-next-115 #1 +[12084.041398] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
+[12084.042404] task:kworker/u16:17 state:D stack: 0 pid:217907 ppid: 2 flags:0x00004000 +[12084.042411] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs] +[12084.042461] Call Trace: +[12084.042463] +[12084.042471] __schedule+0x3cb/0xed0 +[12084.042485] schedule+0x4e/0xb0 +[12084.042490] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs] +[12084.042539] ? prepare_to_wait_exclusive+0xc0/0xc0 +[12084.042551] lock_extent_bits+0x37/0x90 [btrfs] +[12084.042601] btrfs_finish_ordered_io.isra.0+0x3fd/0x960 [btrfs] +[12084.042656] ? lock_is_held_type+0xe8/0x140 +[12084.042667] btrfs_work_helper+0xf8/0x400 [btrfs] +[12084.042716] ? lock_is_held_type+0xe8/0x140 +[12084.042727] process_one_work+0x252/0x5a0 +[12084.042742] worker_thread+0x52/0x3b0 +[12084.042750] ? process_one_work+0x5a0/0x5a0 +[12084.042754] kthread+0xf2/0x120 +[12084.042757] ? kthread_complete_and_exit+0x20/0x20 +[12084.042763] ret_from_fork+0x22/0x30 +[12084.042783] +[12084.042798] INFO: task fio:234517 blocked for more than 241 seconds. +[12084.043598] Not tainted 5.18.0-rc2-btrfs-next-115 #1 +[12084.044282] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. +[12084.045244] task:fio state:D stack: 0 pid:234517 ppid:234515 flags:0x00004000 +[12084.045248] Call Trace: +[12084.045250] +[12084.045254] __schedule+0x3cb/0xed0 +[12084.045263] schedule+0x4e/0xb0 +[12084.045266] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs] +[12084.045298] ? prepare_to_wait_exclusive+0xc0/0xc0 +[12084.045306] lock_extent_bits+0x37/0x90 [btrfs] +[12084.045336] btrfs_dio_iomap_begin+0x336/0xc60 [btrfs] +[12084.045370] ? lock_is_held_type+0xe8/0x140 +[12084.045378] iomap_iter+0x184/0x4c0 +[12084.045383] __iomap_dio_rw+0x2c6/0x8a0 +[12084.045406] iomap_dio_rw+0xa/0x30 +[12084.045408] btrfs_do_write_iter+0x370/0x5e0 [btrfs] +[12084.045440] aio_write+0xfa/0x2c0 +[12084.045448] ? __might_fault+0x2a/0x70 +[12084.045451] ? kvm_sched_clock_read+0x14/0x40 +[12084.045455] ? 
lock_release+0x153/0x4a0 +[12084.045463] io_submit_one+0x615/0x9f0 +[12084.045467] ? __might_fault+0x2a/0x70 +[12084.045469] ? kvm_sched_clock_read+0x14/0x40 +[12084.045478] __x64_sys_io_submit+0x83/0x160 +[12084.045483] ? syscall_enter_from_user_mode+0x1d/0x50 +[12084.045489] do_syscall_64+0x3b/0x90 +[12084.045517] entry_SYSCALL_64_after_hwframe+0x44/0xae +[12084.045521] RIP: 0033:0x7fa76511af79 +[12084.045525] RSP: 002b:00007ffd6d6b9058 EFLAGS: 00000246 ORIG_RAX: 00000000000000d1 +[12084.045530] RAX: ffffffffffffffda RBX: 00007fa75ba6e760 RCX: 00007fa76511af79 +[12084.045532] RDX: 0000557b304ff3f0 RSI: 0000000000000001 RDI: 00007fa75ba4c000 +[12084.045535] RBP: 00007fa75ba4c000 R08: 00007fa751b76000 R09: 0000000000000330 +[12084.045537] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001 +[12084.045540] R13: 0000000000000000 R14: 0000557b304ff3f0 R15: 0000557b30521eb0 +[12084.045561] + +Fix this issue by always reserving data space before locking a file range +at btrfs_dio_iomap_begin(). If we can't reserve the space, then we don't +error out immediately - instead after locking the file range, check if we +can do a NOCOW write, and if we can we don't error out since we don't need +to allocate a data extent, however if we can't NOCOW then error out with +-ENOSPC. This also implies that we may end up reserving space when it's +not needed because the write will end up being done in NOCOW mode - in that +case we just release the space after we noticed we did a NOCOW write - this +is the same type of logic that is done in the path for buffered IO writes. 
+ +Fixes: f0bfa76a11e93d ("btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range") +CC: stable@vger.kernel.org # 5.17+ +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/inode.c | 81 +++++++++++++++++++++++++++++++++++++++++++------------ + 1 file changed, 64 insertions(+), 17 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -64,6 +64,8 @@ struct btrfs_iget_args { + struct btrfs_dio_data { + ssize_t submitted; + struct extent_changeset *data_reserved; ++ bool data_space_reserved; ++ bool nocow_done; + }; + + struct btrfs_rename_ctx { +@@ -7489,15 +7491,25 @@ static int btrfs_get_blocks_direct_write + ret = PTR_ERR(em2); + goto out; + } ++ ++ dio_data->nocow_done = true; + } else { + /* Our caller expects us to free the input extent map. */ + free_extent_map(em); + *map = NULL; + +- /* We have to COW, so need to reserve metadata and data space. */ +- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), +- &dio_data->data_reserved, +- start, len); ++ /* ++ * If we could not allocate data space before locking the file ++ * range and we can't do a NOCOW write, then we have to fail. ++ */ ++ if (!dio_data->data_space_reserved) ++ return -ENOSPC; ++ ++ /* ++ * We have to COW and we have already reserved data space before, ++ * so now we reserve only metadata. 
++ */ ++ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len); + if (ret < 0) + goto out; + space_reserved = true; +@@ -7510,10 +7522,8 @@ static int btrfs_get_blocks_direct_write + *map = em; + len = min(len, em->len - (start - em->start)); + if (len < prev_len) +- btrfs_delalloc_release_space(BTRFS_I(inode), +- dio_data->data_reserved, +- start + len, prev_len - len, +- true); ++ btrfs_delalloc_release_metadata(BTRFS_I(inode), ++ prev_len - len, true); + } + + /* +@@ -7531,15 +7541,7 @@ static int btrfs_get_blocks_direct_write + out: + if (ret && space_reserved) { + btrfs_delalloc_release_extents(BTRFS_I(inode), len); +- if (can_nocow) { +- btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); +- } else { +- btrfs_delalloc_release_space(BTRFS_I(inode), +- dio_data->data_reserved, +- start, len, true); +- extent_changeset_free(dio_data->data_reserved); +- dio_data->data_reserved = NULL; +- } ++ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); + } + return ret; + } +@@ -7556,6 +7558,7 @@ static int btrfs_dio_iomap_begin(struct + const bool write = !!(flags & IOMAP_WRITE); + int ret = 0; + u64 len = length; ++ const u64 data_alloc_len = length; + bool unlock_extents = false; + + if (!write) +@@ -7584,6 +7587,25 @@ static int btrfs_dio_iomap_begin(struct + + iomap->private = dio_data; + ++ /* ++ * We always try to allocate data space and must do it before locking ++ * the file range, to avoid deadlocks with concurrent writes to the same ++ * range if the range has several extents and the writes don't expand the ++ * current i_size (the inode lock is taken in shared mode). If we fail to ++ * allocate data space here we continue and later, after locking the ++ * file range, we fail with ENOSPC only if we figure out we can not do a ++ * NOCOW write. 
++ */ ++ if (write && !(flags & IOMAP_NOWAIT)) { ++ ret = btrfs_check_data_free_space(BTRFS_I(inode), ++ &dio_data->data_reserved, ++ start, data_alloc_len); ++ if (!ret) ++ dio_data->data_space_reserved = true; ++ else if (ret && !(BTRFS_I(inode)->flags & ++ (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) ++ goto err; ++ } + + /* + * If this errors out it's because we couldn't invalidate pagecache for +@@ -7658,6 +7680,24 @@ static int btrfs_dio_iomap_begin(struct + unlock_extents = true; + /* Recalc len in case the new em is smaller than requested */ + len = min(len, em->len - (start - em->start)); ++ if (dio_data->data_space_reserved) { ++ u64 release_offset; ++ u64 release_len = 0; ++ ++ if (dio_data->nocow_done) { ++ release_offset = start; ++ release_len = data_alloc_len; ++ } else if (len < data_alloc_len) { ++ release_offset = start + len; ++ release_len = data_alloc_len - len; ++ } ++ ++ if (release_len > 0) ++ btrfs_free_reserved_data_space(BTRFS_I(inode), ++ dio_data->data_reserved, ++ release_offset, ++ release_len); ++ } + } else { + /* + * We need to unlock only the end area that we aren't using. 
+@@ -7702,6 +7742,13 @@ unlock_err: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); + err: ++ if (dio_data->data_space_reserved) { ++ btrfs_free_reserved_data_space(BTRFS_I(inode), ++ dio_data->data_reserved, ++ start, data_alloc_len); ++ extent_changeset_free(dio_data->data_reserved); ++ } ++ + kfree(dio_data); + + return ret; diff --git a/queue-5.18/btrfs-fix-the-error-handling-for-submit_extent_page-for-btrfs_do_readpage.patch b/queue-5.18/btrfs-fix-the-error-handling-for-submit_extent_page-for-btrfs_do_readpage.patch new file mode 100644 index 00000000000..4b3da441384 --- /dev/null +++ b/queue-5.18/btrfs-fix-the-error-handling-for-submit_extent_page-for-btrfs_do_readpage.patch @@ -0,0 +1,71 @@ +From 10f7f6f879c28f8368d6516ab1ccf3517a1f5d3d Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Tue, 12 Apr 2022 20:30:14 +0800 +Subject: btrfs: fix the error handling for submit_extent_page() for btrfs_do_readpage() + +From: Qu Wenruo + +commit 10f7f6f879c28f8368d6516ab1ccf3517a1f5d3d upstream. + +[BUG] +Test case generic/475 have a very high chance (almost 100%) to hit a fs +hang, where a data page will never be unlocked and hang all later +operations. + +[CAUSE] +In btrfs_do_readpage(), if we hit an error from submit_extent_page() we +will try to do the cleanup for our current io range, and exit. + +This works fine for PAGE_SIZE == sectorsize cases, but not for subpage. + +For subpage btrfs_do_readpage() will lock the full page first, which can +contain several different sectors and extents: + + btrfs_do_readpage() + |- begin_page_read() + | |- btrfs_subpage_start_reader(); + | Now the page will have PAGE_SIZE / sectorsize reader pending, + | and the page is locked. + | + |- end_page_read() for different branches + | This function will reduce subpage readers, and when readers + | reach 0, it will unlock the page. 
+ +But when submit_extent_page() failed, we only cleanup the current +io range, while the remaining io range will never be cleaned up, and the +page remains locked forever. + +[FIX] +Update the error handling of submit_extent_page() to cleanup all the +remaining subpage range before exiting the loop. + +Please note that, now submit_extent_page() can only fail due to +sanity check in alloc_new_bio(). + +Thus regular IO errors are impossible to trigger the error path. + +CC: stable@vger.kernel.org # 5.15+ +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent_io.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -3743,8 +3743,12 @@ int btrfs_do_readpage(struct page *page, + this_bio_flag, + force_bio_submit); + if (ret) { +- unlock_extent(tree, cur, cur + iosize - 1); +- end_page_read(page, false, cur, iosize); ++ /* ++ * We have to unlock the remaining range, or the page ++ * will never be unlocked. ++ */ ++ unlock_extent(tree, cur, end); ++ end_page_read(page, false, cur, end + 1 - cur); + goto out; + } + cur = cur + iosize; diff --git a/queue-5.18/btrfs-repair-super-block-num_devices-automatically.patch b/queue-5.18/btrfs-repair-super-block-num_devices-automatically.patch new file mode 100644 index 00000000000..d14d81b07fc --- /dev/null +++ b/queue-5.18/btrfs-repair-super-block-num_devices-automatically.patch @@ -0,0 +1,94 @@ +From d201238ccd2f30b9bfcfadaeae0972e3a486a176 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Mon, 28 Feb 2022 15:05:53 +0800 +Subject: btrfs: repair super block num_devices automatically +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Qu Wenruo + +commit d201238ccd2f30b9bfcfadaeae0972e3a486a176 upstream. + +[BUG] +There is a report that a btrfs has a bad super block num devices. + +This makes btrfs to reject the fs completely. 
+ + BTRFS error (device sdd3): super_num_devices 3 mismatch with num_devices 2 found here + BTRFS error (device sdd3): failed to read chunk tree: -22 + BTRFS error (device sdd3): open_ctree failed + +[CAUSE] +During btrfs device removal, chunk tree and super block num devs are +updated in two different transactions: + + btrfs_rm_device() + |- btrfs_rm_dev_item(device) + | |- trans = btrfs_start_transaction() + | | Now we got transaction X + | | + | |- btrfs_del_item() + | | Now device item is removed from chunk tree + | | + | |- btrfs_commit_transaction() + | Transaction X got committed, super num devs untouched, + | but device item removed from chunk tree. + | (AKA, super num devs is already incorrect) + | + |- cur_devices->num_devices--; + |- cur_devices->total_devices--; + |- btrfs_set_super_num_devices() + All those operations are not in transaction X, thus it will + only be written back to disk in next transaction. + +So if, after transaction X in btrfs_rm_dev_item() is committed but before +transaction X+1 (which can be minutes away), a power loss happens, then +we get the super num mismatch. + +This has been fixed by commit bbac58698a55 ("btrfs: remove device item +and update super block in the same transaction"). + +[FIX] +Make the super_num_devices check less strict, converting it from a hard +error to a warning, and reset the value to a correct one for the current +or next transaction commit. + +As the number of device items is the critical information where the +super block num_devices is only a cached value (and also useful for +cross checking), it's safe to automatically update it. Other device +related problems like missing device are handled after that and may +require other means to resolve, like degraded mount. With this fix, +potentially affected filesystems won't fail mount and require the manual +repair by btrfs check. 
+ +Reported-by: Luca Béla Palkovics +Link: https://lore.kernel.org/linux-btrfs/CA+8xDSpvdm_U0QLBAnrH=zqDq_cWCOH5TiV46CKmp3igr44okQ@mail.gmail.com/ +CC: stable@vger.kernel.org # 4.14+ +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/volumes.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -7671,12 +7671,12 @@ int btrfs_read_chunk_tree(struct btrfs_f + * do another round of validation checks. + */ + if (total_dev != fs_info->fs_devices->total_devices) { +- btrfs_err(fs_info, +- "super_num_devices %llu mismatch with num_devices %llu found here", ++ btrfs_warn(fs_info, ++"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit", + btrfs_super_num_devices(fs_info->super_copy), + total_dev); +- ret = -EINVAL; +- goto error; ++ fs_info->fs_devices->total_devices = total_dev; ++ btrfs_set_super_num_devices(fs_info->super_copy, total_dev); + } + if (btrfs_super_total_bytes(fs_info->super_copy) < + fs_info->fs_devices->total_rw_bytes) { diff --git a/queue-5.18/btrfs-return-correct-error-number-for-__extent_writepage_io.patch b/queue-5.18/btrfs-return-correct-error-number-for-__extent_writepage_io.patch new file mode 100644 index 00000000000..8b75e6dddbb --- /dev/null +++ b/queue-5.18/btrfs-return-correct-error-number-for-__extent_writepage_io.patch @@ -0,0 +1,88 @@ +From 44e5801fada6925d2bba1987c7b59cbcc9d0d592 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Tue, 12 Apr 2022 20:30:15 +0800 +Subject: btrfs: return correct error number for __extent_writepage_io() + +From: Qu Wenruo + +commit 44e5801fada6925d2bba1987c7b59cbcc9d0d592 upstream. + +[BUG] +If we hit an error from submit_extent_page() inside +__extent_writepage_io(), we could still return 0 to the caller, and +even trigger the warning in btrfs_page_assert_not_dirty(). 
+ +[CAUSE] +In __extent_writepage_io(), if we hit an error from +submit_extent_page(), we will just clean up the range and continue. + +This is completely fine for regular PAGE_SIZE == sectorsize, as we can +only hit one sector in one page, thus after the error we're ensured to +exit and @ret will be saved. + +But for subpage case, we may have other dirty subpage range in the page, +and in the next loop, we may succeeded submitting the next range. + +In that case, @ret will be overwritten, and we return 0 to the caller, +while we have hit some error. + +[FIX] +Introduce @has_error and @saved_ret to record the first error we hit, so +we will never forget what error we hit. + +CC: stable@vger.kernel.org # 5.15+ +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent_io.c | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -3920,10 +3920,12 @@ static noinline_for_stack int __extent_w + u64 extent_offset; + u64 block_start; + struct extent_map *em; ++ int saved_ret = 0; + int ret = 0; + int nr = 0; + u32 opf = REQ_OP_WRITE; + const unsigned int write_flags = wbc_to_write_flags(wbc); ++ bool has_error = false; + bool compressed; + + ret = btrfs_writepage_cow_fixup(page); +@@ -3973,6 +3975,9 @@ static noinline_for_stack int __extent_w + if (IS_ERR(em)) { + btrfs_page_set_error(fs_info, page, cur, end - cur + 1); + ret = PTR_ERR_OR_ZERO(em); ++ has_error = true; ++ if (!saved_ret) ++ saved_ret = ret; + break; + } + +@@ -4036,6 +4041,10 @@ static noinline_for_stack int __extent_w + end_bio_extent_writepage, + 0, 0, false); + if (ret) { ++ has_error = true; ++ if (!saved_ret) ++ saved_ret = ret; ++ + btrfs_page_set_error(fs_info, page, cur, iosize); + if (PageWriteback(page)) + btrfs_page_clear_writeback(fs_info, page, cur, +@@ -4049,8 +4058,10 @@ static noinline_for_stack int __extent_w + * If we finish without problem, we should not only 
clear page dirty, + * but also empty subpage dirty bits + */ +- if (!ret) ++ if (!has_error) + btrfs_page_assert_not_dirty(fs_info, page); ++ else ++ ret = saved_ret; + *nr_ret = nr; + return ret; + } diff --git a/queue-5.18/btrfs-zoned-finish-block-group-when-there-are-no-more-allocatable-bytes-left.patch b/queue-5.18/btrfs-zoned-finish-block-group-when-there-are-no-more-allocatable-bytes-left.patch new file mode 100644 index 00000000000..d5f6a74b227 --- /dev/null +++ b/queue-5.18/btrfs-zoned-finish-block-group-when-there-are-no-more-allocatable-bytes-left.patch @@ -0,0 +1,51 @@ +From 8b8a53998caefebfe5c8da7a74c2b601caf5dd48 Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Tue, 3 May 2022 17:48:52 -0700 +Subject: btrfs: zoned: finish block group when there are no more allocatable bytes left + +From: Naohiro Aota + +commit 8b8a53998caefebfe5c8da7a74c2b601caf5dd48 upstream. + +Currently, btrfs_zone_finish_endio() finishes a block group only when the +written region reaches the end of the block group. We can also finish the +block group when no more allocation is possible. + +Fixes: be1a1d7a5d24 ("btrfs: zoned: finish fully written block group") +CC: stable@vger.kernel.org # 5.16+ +Reviewed-by: Pankaj Raghav +Reviewed-by: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/zoned.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -2000,6 +2000,7 @@ void btrfs_zone_finish_endio(struct btrf + struct btrfs_block_group *block_group; + struct map_lookup *map; + struct btrfs_device *device; ++ u64 min_alloc_bytes; + u64 physical; + + if (!btrfs_is_zoned(fs_info)) +@@ -2008,7 +2009,15 @@ void btrfs_zone_finish_endio(struct btrf + block_group = btrfs_lookup_block_group(fs_info, logical); + ASSERT(block_group); + +- if (logical + length < block_group->start + block_group->zone_capacity) ++ /* No MIXED_BG on zoned btrfs. 
*/ ++ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) ++ min_alloc_bytes = fs_info->sectorsize; ++ else ++ min_alloc_bytes = fs_info->nodesize; ++ ++ /* Bail out if we can allocate more data from this block group. */ ++ if (logical + length + min_alloc_bytes <= ++ block_group->start + block_group->zone_capacity) + goto out; + + spin_lock(&block_group->lock); diff --git a/queue-5.18/btrfs-zoned-fix-comparison-of-alloc_offset-vs-meta_write_pointer.patch b/queue-5.18/btrfs-zoned-fix-comparison-of-alloc_offset-vs-meta_write_pointer.patch new file mode 100644 index 00000000000..e73bf66cded --- /dev/null +++ b/queue-5.18/btrfs-zoned-fix-comparison-of-alloc_offset-vs-meta_write_pointer.patch @@ -0,0 +1,34 @@ +From aa9ffadfcae33e611d8c2d476bcc2aa0d273b587 Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Wed, 4 May 2022 16:12:48 -0700 +Subject: btrfs: zoned: fix comparison of alloc_offset vs meta_write_pointer + +From: Naohiro Aota + +commit aa9ffadfcae33e611d8c2d476bcc2aa0d273b587 upstream. + +The block_group->alloc_offset is an offset from the start of the block +group. On the other hand, the ->meta_write_pointer is an address in the logical +space. So, we should compare the alloc_offset shifted by +block_group->start. 
+ +Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking") +CC: stable@vger.kernel.org # 5.16+ +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/zoned.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -1896,7 +1896,7 @@ int btrfs_zone_finish(struct btrfs_block + /* Check if we have unwritten allocated space */ + if ((block_group->flags & + (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) && +- block_group->alloc_offset > block_group->meta_write_pointer) { ++ block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) { + spin_unlock(&block_group->lock); + return -EAGAIN; + } diff --git a/queue-5.18/btrfs-zoned-properly-finish-block-group-on-metadata-write.patch b/queue-5.18/btrfs-zoned-properly-finish-block-group-on-metadata-write.patch new file mode 100644 index 00000000000..c33fcba10e4 --- /dev/null +++ b/queue-5.18/btrfs-zoned-properly-finish-block-group-on-metadata-write.patch @@ -0,0 +1,144 @@ +From 56fbb0a4e8b3e929e41cc846e6ef89eb01152201 Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Tue, 3 May 2022 17:48:53 -0700 +Subject: btrfs: zoned: properly finish block group on metadata write + +From: Naohiro Aota + +commit 56fbb0a4e8b3e929e41cc846e6ef89eb01152201 upstream. + +Commit be1a1d7a5d24 ("btrfs: zoned: finish fully written block group") +introduced zone finishing code both for data and metadata end_io paths. +However, the metadata side is not working as it should. First, it +compares logical address (eb->start + eb->len) with offset within a +block group (cache->zone_capacity) in submit_eb_page(). That essentially +disabled zone finishing on metadata end_io path. + +Furthermore, fixing the issue above revealed we cannot call +btrfs_zone_finish_endio() in end_extent_buffer_writeback(). We cannot +call btrfs_lookup_block_group() which requires a spin lock inside end_io +context. 
+ +Introduce btrfs_schedule_zone_finish_bg() to wait for the extent buffer +writeback and do the zone finish IO in a workqueue. + +Also, drop EXTENT_BUFFER_ZONE_FINISH as it is no longer used. + +Fixes: be1a1d7a5d24 ("btrfs: zoned: finish fully written block group") +CC: stable@vger.kernel.org # 5.16+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/block-group.h | 2 ++ + fs/btrfs/extent_io.c | 6 +----- + fs/btrfs/extent_io.h | 1 - + fs/btrfs/zoned.c | 31 +++++++++++++++++++++++++++++++ + fs/btrfs/zoned.h | 5 +++++ + 5 files changed, 39 insertions(+), 6 deletions(-) + +--- a/fs/btrfs/block-group.h ++++ b/fs/btrfs/block-group.h +@@ -212,6 +212,8 @@ struct btrfs_block_group { + u64 meta_write_pointer; + struct map_lookup *physical_map; + struct list_head active_bg_list; ++ struct work_struct zone_finish_work; ++ struct extent_buffer *last_eb; + }; + + static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -4196,9 +4196,6 @@ void wait_on_extent_buffer_writeback(str + + static void end_extent_buffer_writeback(struct extent_buffer *eb) + { +- if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags)) +- btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len); +- + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); +@@ -4818,8 +4815,7 @@ static int submit_eb_page(struct page *p + /* + * Implies write in zoned mode. Mark the last eb in a block group. 
+ */ +- if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity) +- set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags); ++ btrfs_schedule_zone_finish_bg(cache, eb); + btrfs_put_block_group(cache); + } + ret = write_one_eb(eb, wbc, epd); +--- a/fs/btrfs/extent_io.h ++++ b/fs/btrfs/extent_io.h +@@ -32,7 +32,6 @@ enum { + /* write IO error */ + EXTENT_BUFFER_WRITE_ERR, + EXTENT_BUFFER_NO_CHECK, +- EXTENT_BUFFER_ZONE_FINISH, + }; + + /* these are flags for __process_pages_contig */ +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -2046,6 +2046,37 @@ out: + btrfs_put_block_group(block_group); + } + ++static void btrfs_zone_finish_endio_workfn(struct work_struct *work) ++{ ++ struct btrfs_block_group *bg = ++ container_of(work, struct btrfs_block_group, zone_finish_work); ++ ++ wait_on_extent_buffer_writeback(bg->last_eb); ++ free_extent_buffer(bg->last_eb); ++ btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length); ++ btrfs_put_block_group(bg); ++} ++ ++void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, ++ struct extent_buffer *eb) ++{ ++ if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity) ++ return; ++ ++ if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) { ++ btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing", ++ bg->start); ++ return; ++ } ++ ++ /* For the work */ ++ btrfs_get_block_group(bg); ++ atomic_inc(&eb->refs); ++ bg->last_eb = eb; ++ INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn); ++ queue_work(system_unbound_wq, &bg->zone_finish_work); ++} ++ + void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) + { + struct btrfs_fs_info *fs_info = bg->fs_info; +--- a/fs/btrfs/zoned.h ++++ b/fs/btrfs/zoned.h +@@ -76,6 +76,8 @@ int btrfs_zone_finish(struct btrfs_block + bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); + void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); ++void 
btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, ++ struct extent_buffer *eb); + void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); + void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); + #else /* CONFIG_BLK_DEV_ZONED */ +@@ -233,6 +235,9 @@ static inline bool btrfs_can_activate_zo + static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { } + ++static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, ++ struct extent_buffer *eb) { } ++ + static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } + + static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } diff --git a/queue-5.18/btrfs-zoned-zone-finish-unused-block-group.patch b/queue-5.18/btrfs-zoned-zone-finish-unused-block-group.patch new file mode 100644 index 00000000000..6ea6be08ebd --- /dev/null +++ b/queue-5.18/btrfs-zoned-zone-finish-unused-block-group.patch @@ -0,0 +1,49 @@ +From 74e91b12b11560f01d120751d99d91d54b265d3d Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Tue, 3 May 2022 17:48:54 -0700 +Subject: btrfs: zoned: zone finish unused block group + +From: Naohiro Aota + +commit 74e91b12b11560f01d120751d99d91d54b265d3d upstream. + +While the active zones within an active block group are reset, and their +active resource is released, the block group itself is kept in the active +block group list and marked as active. As a result, the list will contain +more than max_active_zones block groups. That itself is not fatal for the +device as the zones are properly reset. + +However, that inflated list is, of course, strange. Also, a to-appear +patch series, which deactivates an active block group on demand, gets +confused with the wrong list. + +So, fix the issue by finishing the unused block group once it gets +read-only, so that we can release the active resource in an early stage. 
+ +Fixes: be1a1d7a5d24 ("btrfs: zoned: finish fully written block group") +CC: stable@vger.kernel.org # 5.16+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/block-group.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -1367,6 +1367,14 @@ void btrfs_delete_unused_bgs(struct btrf + goto next; + } + ++ ret = btrfs_zone_finish(block_group); ++ if (ret < 0) { ++ btrfs_dec_block_group_ro(block_group); ++ if (ret == -EAGAIN) ++ ret = 0; ++ goto next; ++ } ++ + /* + * Want to do this before we do anything else so we can recover + * properly if we fail to join the transaction. diff --git a/queue-5.18/series b/queue-5.18/series index 9ce884e699c..7f1ab4032c7 100644 --- a/queue-5.18/series +++ b/queue-5.18/series @@ -42,3 +42,12 @@ kthread-don-t-allocate-kthread_struct-for-init-and-umh.patch ptrace-um-replace-pt_dtrace-with-tif_singlestep.patch ptrace-xtensa-replace-pt_singlestep-with-tif_singlestep.patch ptrace-reimplement-ptrace_kill-by-always-sending-sigkill.patch +btrfs-add-0x-prefix-for-unsupported-optional-features.patch +btrfs-return-correct-error-number-for-__extent_writepage_io.patch +btrfs-repair-super-block-num_devices-automatically.patch +btrfs-fix-the-error-handling-for-submit_extent_page-for-btrfs_do_readpage.patch +btrfs-fix-deadlock-between-concurrent-dio-writes-when-low-on-free-data-space.patch +btrfs-zoned-properly-finish-block-group-on-metadata-write.patch +btrfs-zoned-zone-finish-unused-block-group.patch +btrfs-zoned-finish-block-group-when-there-are-no-more-allocatable-bytes-left.patch +btrfs-zoned-fix-comparison-of-alloc_offset-vs-meta_write_pointer.patch -- 2.47.3