From 3014efe7f4d624a8c0dff874d2fee17b9d1b42ce Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 6 Jun 2022 12:28:27 +0200
Subject: [PATCH] 5.18-stable patches

added patches:
	btrfs-add-0x-prefix-for-unsupported-optional-features.patch
	btrfs-fix-deadlock-between-concurrent-dio-writes-when-low-on-free-data-space.patch
	btrfs-fix-the-error-handling-for-submit_extent_page-for-btrfs_do_readpage.patch
	btrfs-repair-super-block-num_devices-automatically.patch
	btrfs-return-correct-error-number-for-__extent_writepage_io.patch
	btrfs-zoned-finish-block-group-when-there-are-no-more-allocatable-bytes-left.patch
	btrfs-zoned-fix-comparison-of-alloc_offset-vs-meta_write_pointer.patch
	btrfs-zoned-properly-finish-block-group-on-metadata-write.patch
	btrfs-zoned-zone-finish-unused-block-group.patch
---
 ...ix-for-unsupported-optional-features.patch |  47 +++
 ...o-writes-when-low-on-free-data-space.patch | 348 ++++++++++++++++++
 ...it_extent_page-for-btrfs_do_readpage.patch |  71 ++++
 ...uper-block-num_devices-automatically.patch |  94 +++++
 ...ror-number-for-__extent_writepage_io.patch |  88 +++++
 ...e-are-no-more-allocatable-bytes-left.patch |  51 +++
 ...f-alloc_offset-vs-meta_write_pointer.patch |  34 ++
 ...finish-block-group-on-metadata-write.patch | 144 ++++++++
 ...zoned-zone-finish-unused-block-group.patch |  49 +++
 queue-5.18/series                             |   9 +
 10 files changed, 935 insertions(+)
 create mode 100644 queue-5.18/btrfs-add-0x-prefix-for-unsupported-optional-features.patch
 create mode 100644 queue-5.18/btrfs-fix-deadlock-between-concurrent-dio-writes-when-low-on-free-data-space.patch
 create mode 100644 queue-5.18/btrfs-fix-the-error-handling-for-submit_extent_page-for-btrfs_do_readpage.patch
 create mode 100644 queue-5.18/btrfs-repair-super-block-num_devices-automatically.patch
 create mode 100644 queue-5.18/btrfs-return-correct-error-number-for-__extent_writepage_io.patch
 create mode 100644 queue-5.18/btrfs-zoned-finish-block-group-when-there-are-no-more-allocatable-bytes-left.patch
 create mode 100644 queue-5.18/btrfs-zoned-fix-comparison-of-alloc_offset-vs-meta_write_pointer.patch
 create mode 100644 queue-5.18/btrfs-zoned-properly-finish-block-group-on-metadata-write.patch
 create mode 100644 queue-5.18/btrfs-zoned-zone-finish-unused-block-group.patch

diff --git a/queue-5.18/btrfs-add-0x-prefix-for-unsupported-optional-features.patch b/queue-5.18/btrfs-add-0x-prefix-for-unsupported-optional-features.patch
new file mode 100644
index 00000000000..15b72ce3117
--- /dev/null
+++ b/queue-5.18/btrfs-add-0x-prefix-for-unsupported-optional-features.patch
@@ -0,0 +1,47 @@
+From d5321a0fa8bc49f11bea0b470800962c17d92d8f Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Tue, 10 May 2022 15:10:18 +0800
+Subject: btrfs: add "0x" prefix for unsupported optional features
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit d5321a0fa8bc49f11bea0b470800962c17d92d8f upstream.
+
+The following error message obviously lacks the "0x" prefix:
+
+  cannot mount because of unsupported optional features (4000)
+
+Add the prefix to make it less confusing. This can happen on older
+kernels that try to mount a filesystem with newer features so it makes
+sense to backport to older trees.
+
+CC: stable@vger.kernel.org # 4.14+
+Reviewed-by: Nikolay Borisov <nborisov@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/disk-io.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -3611,7 +3611,7 @@ int __cold open_ctree(struct super_block
+ 		~BTRFS_FEATURE_INCOMPAT_SUPP;
+ 	if (features) {
+ 		btrfs_err(fs_info,
+-		    "cannot mount because of unsupported optional features (%llx)",
++		    "cannot mount because of unsupported optional features (0x%llx)",
+ 		    features);
+ 		err = -EINVAL;
+ 		goto fail_alloc;
+@@ -3649,7 +3649,7 @@ int __cold open_ctree(struct super_block
+ 		~BTRFS_FEATURE_COMPAT_RO_SUPP;
+ 	if (!sb_rdonly(sb) && features) {
+ 		btrfs_err(fs_info,
+-	"cannot mount read-write because of unsupported optional features (%llx)",
++	"cannot mount read-write because of unsupported optional features (0x%llx)",
+ 		       features);
+ 		err = -EINVAL;
+ 		goto fail_alloc;
diff --git a/queue-5.18/btrfs-fix-deadlock-between-concurrent-dio-writes-when-low-on-free-data-space.patch b/queue-5.18/btrfs-fix-deadlock-between-concurrent-dio-writes-when-low-on-free-data-space.patch
new file mode 100644
index 00000000000..f7514eb577f
--- /dev/null
+++ b/queue-5.18/btrfs-fix-deadlock-between-concurrent-dio-writes-when-low-on-free-data-space.patch
@@ -0,0 +1,348 @@
+From f5585f4f0ef5b17026bbd60fbff6fcc91b99d5bf Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 28 Apr 2022 14:59:46 +0100
+Subject: btrfs: fix deadlock between concurrent dio writes when low on free data space
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit f5585f4f0ef5b17026bbd60fbff6fcc91b99d5bf upstream.
+
+When reserving data space for a direct IO write we can end up deadlocking
+if we have multiple tasks attempting a write to the same file range, there
+are multiple extents covered by that file range, we are low on available
+space for data and the writes don't expand the inode's i_size.
+
+The deadlock can happen like this:
+
+1) We have a file with an i_size of 1M, at offset 0 it has an extent with
+   a size of 128K and at offset 128K it has another extent also with a
+   size of 128K;
+
+2) Task A does a direct IO write against file range [0, 256K), and because
+   the write is within the i_size boundary, it takes the inode's lock (VFS
+   level) in shared mode;
+
+3) Task A locks the file range [0, 256K) at btrfs_dio_iomap_begin(), and
+   then gets the extent map for the extent covering the range [0, 128K).
+   At btrfs_get_blocks_direct_write(), it creates an ordered extent for
+   that file range ([0, 128K));
+
+4) Before returning from btrfs_dio_iomap_begin(), it unlocks the file
+   range [0, 256K);
+
+5) Task A executes btrfs_dio_iomap_begin() again, this time for the file
+   range [128K, 256K), and locks the file range [128K, 256K);
+
+6) Task B starts a direct IO write against file range [0, 256K) as well.
+   It also locks the inode in shared mode, as it's within the i_size limit,
+   and then tries to lock file range [0, 256K). It is able to lock the
+   subrange [0, 128K) but then blocks waiting for the range [128K, 256K),
+   as it is currently locked by task A;
+
+7) Task A enters btrfs_get_blocks_direct_write() and tries to reserve data
+   space. Because we are low on available free space, it triggers the
+   async data reclaim task, and waits for it to reserve data space;
+
+8) The async reclaim task decides to wait for all existing ordered extents
+   to complete (through btrfs_wait_ordered_roots()).
+   It finds the ordered extent previously created by task A for the file
+   range [0, 128K) and waits for it to complete;
+
+9) The ordered extent for the file range [0, 128K) can not complete
+   because it blocks at btrfs_finish_ordered_io() when trying to lock the
+   file range [0, 128K).
+
+   This results in a deadlock, because:
+
+   - task B is holding the file range [0, 128K) locked, waiting for the
+     range [128K, 256K) to be unlocked by task A;
+
+   - task A is holding the file range [128K, 256K) locked and it's waiting
+     for the async data reclaim task to satisfy its space reservation
+     request;
+
+   - the async data reclaim task is waiting for ordered extent [0, 128K)
+     to complete, but the ordered extent can not complete because the
+     file range [0, 128K) is currently locked by task B, which is waiting
+     on task A to unlock file range [128K, 256K) and task A waiting
+     on the async data reclaim task.
+
+   This results in a deadlock between 4 tasks: task A, task B, the async
+   data reclaim task and the task doing ordered extent completion (a work
+   queue task).
+
+This type of deadlock can sporadically be triggered by the test case
+generic/300 from fstests, and results in a stack trace like the following:
+
+[12084.033689] INFO: task kworker/u16:7:123749 blocked for more than 241 seconds.
+[12084.034877]       Not tainted 5.18.0-rc2-btrfs-next-115 #1
+[12084.035562] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+[12084.036548] task:kworker/u16:7   state:D stack:    0 pid:123749 ppid:     2 flags:0x00004000
+[12084.036554] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs]
+[12084.036599] Call Trace:
+[12084.036601]  <TASK>
+[12084.036606]  __schedule+0x3cb/0xed0
+[12084.036616]  schedule+0x4e/0xb0
+[12084.036620]  btrfs_start_ordered_extent+0x109/0x1c0 [btrfs]
+[12084.036651]  ? prepare_to_wait_exclusive+0xc0/0xc0
+[12084.036659]  btrfs_run_ordered_extent_work+0x1a/0x30 [btrfs]
+[12084.036688]  btrfs_work_helper+0xf8/0x400 [btrfs]
+[12084.036719]  ? lock_is_held_type+0xe8/0x140
+[12084.036727]  process_one_work+0x252/0x5a0
+[12084.036736]  ? process_one_work+0x5a0/0x5a0
+[12084.036738]  worker_thread+0x52/0x3b0
+[12084.036743]  ? process_one_work+0x5a0/0x5a0
+[12084.036745]  kthread+0xf2/0x120
+[12084.036747]  ? kthread_complete_and_exit+0x20/0x20
+[12084.036751]  ret_from_fork+0x22/0x30
+[12084.036765]  </TASK>
+[12084.036769] INFO: task kworker/u16:11:153787 blocked for more than 241 seconds.
+[12084.037702]       Not tainted 5.18.0-rc2-btrfs-next-115 #1
+[12084.038540] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+[12084.039506] task:kworker/u16:11  state:D stack:    0 pid:153787 ppid:     2 flags:0x00004000
+[12084.039511] Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs]
+[12084.039551] Call Trace:
+[12084.039553]  <TASK>
+[12084.039557]  __schedule+0x3cb/0xed0
+[12084.039566]  schedule+0x4e/0xb0
+[12084.039569]  schedule_timeout+0xed/0x130
+[12084.039573]  ? mark_held_locks+0x50/0x80
+[12084.039578]  ? _raw_spin_unlock_irq+0x24/0x50
+[12084.039580]  ? lockdep_hardirqs_on+0x7d/0x100
+[12084.039585]  __wait_for_common+0xaf/0x1f0
+[12084.039587]  ? usleep_range_state+0xb0/0xb0
+[12084.039596]  btrfs_wait_ordered_extents+0x3d6/0x470 [btrfs]
+[12084.039636]  btrfs_wait_ordered_roots+0x175/0x240 [btrfs]
+[12084.039670]  flush_space+0x25b/0x630 [btrfs]
+[12084.039712]  btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs]
+[12084.039747]  process_one_work+0x252/0x5a0
+[12084.039756]  ? process_one_work+0x5a0/0x5a0
+[12084.039758]  worker_thread+0x52/0x3b0
+[12084.039762]  ? process_one_work+0x5a0/0x5a0
+[12084.039765]  kthread+0xf2/0x120
+[12084.039766]  ? kthread_complete_and_exit+0x20/0x20
+[12084.039770]  ret_from_fork+0x22/0x30
+[12084.039783]  </TASK>
+[12084.039800] INFO: task kworker/u16:17:217907 blocked for more than 241 seconds.
+[12084.040709]       Not tainted 5.18.0-rc2-btrfs-next-115 #1
+[12084.041398] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+[12084.042404] task:kworker/u16:17  state:D stack:    0 pid:217907 ppid:     2 flags:0x00004000
+[12084.042411] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs]
+[12084.042461] Call Trace:
+[12084.042463]  <TASK>
+[12084.042471]  __schedule+0x3cb/0xed0
+[12084.042485]  schedule+0x4e/0xb0
+[12084.042490]  wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
+[12084.042539]  ? prepare_to_wait_exclusive+0xc0/0xc0
+[12084.042551]  lock_extent_bits+0x37/0x90 [btrfs]
+[12084.042601]  btrfs_finish_ordered_io.isra.0+0x3fd/0x960 [btrfs]
+[12084.042656]  ? lock_is_held_type+0xe8/0x140
+[12084.042667]  btrfs_work_helper+0xf8/0x400 [btrfs]
+[12084.042716]  ? lock_is_held_type+0xe8/0x140
+[12084.042727]  process_one_work+0x252/0x5a0
+[12084.042742]  worker_thread+0x52/0x3b0
+[12084.042750]  ? process_one_work+0x5a0/0x5a0
+[12084.042754]  kthread+0xf2/0x120
+[12084.042757]  ? kthread_complete_and_exit+0x20/0x20
+[12084.042763]  ret_from_fork+0x22/0x30
+[12084.042783]  </TASK>
+[12084.042798] INFO: task fio:234517 blocked for more than 241 seconds.
+[12084.043598]       Not tainted 5.18.0-rc2-btrfs-next-115 #1
+[12084.044282] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+[12084.045244] task:fio             state:D stack:    0 pid:234517 ppid:234515 flags:0x00004000
+[12084.045248] Call Trace:
+[12084.045250]  <TASK>
+[12084.045254]  __schedule+0x3cb/0xed0
+[12084.045263]  schedule+0x4e/0xb0
+[12084.045266]  wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
+[12084.045298]  ? prepare_to_wait_exclusive+0xc0/0xc0
+[12084.045306]  lock_extent_bits+0x37/0x90 [btrfs]
+[12084.045336]  btrfs_dio_iomap_begin+0x336/0xc60 [btrfs]
+[12084.045370]  ? lock_is_held_type+0xe8/0x140
+[12084.045378]  iomap_iter+0x184/0x4c0
+[12084.045383]  __iomap_dio_rw+0x2c6/0x8a0
+[12084.045406]  iomap_dio_rw+0xa/0x30
+[12084.045408]  btrfs_do_write_iter+0x370/0x5e0 [btrfs]
+[12084.045440]  aio_write+0xfa/0x2c0
+[12084.045448]  ? __might_fault+0x2a/0x70
+[12084.045451]  ? kvm_sched_clock_read+0x14/0x40
+[12084.045455]  ? lock_release+0x153/0x4a0
+[12084.045463]  io_submit_one+0x615/0x9f0
+[12084.045467]  ? __might_fault+0x2a/0x70
+[12084.045469]  ? kvm_sched_clock_read+0x14/0x40
+[12084.045478]  __x64_sys_io_submit+0x83/0x160
+[12084.045483]  ? syscall_enter_from_user_mode+0x1d/0x50
+[12084.045489]  do_syscall_64+0x3b/0x90
+[12084.045517]  entry_SYSCALL_64_after_hwframe+0x44/0xae
+[12084.045521] RIP: 0033:0x7fa76511af79
+[12084.045525] RSP: 002b:00007ffd6d6b9058 EFLAGS: 00000246 ORIG_RAX: 00000000000000d1
+[12084.045530] RAX: ffffffffffffffda RBX: 00007fa75ba6e760 RCX: 00007fa76511af79
+[12084.045532] RDX: 0000557b304ff3f0 RSI: 0000000000000001 RDI: 00007fa75ba4c000
+[12084.045535] RBP: 00007fa75ba4c000 R08: 00007fa751b76000 R09: 0000000000000330
+[12084.045537] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001
+[12084.045540] R13: 0000000000000000 R14: 0000557b304ff3f0 R15: 0000557b30521eb0
+[12084.045561]  </TASK>
+
+Fix this issue by always reserving data space before locking a file range
+at btrfs_dio_iomap_begin(). If we can't reserve the space, then we don't
+error out immediately - instead after locking the file range, check if we
+can do a NOCOW write, and if we can we don't error out since we don't need
+to allocate a data extent, however if we can't NOCOW then error out with
+-ENOSPC. This also implies that we may end up reserving space when it's
+not needed because the write will end up being done in NOCOW mode - in that
+case we just release the space after we noticed we did a NOCOW write - this
+is the same type of logic that is done in the path for buffered IO writes.
+
+Fixes: f0bfa76a11e93d ("btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range")
+CC: stable@vger.kernel.org # 5.17+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/inode.c |   81 +++++++++++++++++++++++++++++++++++++++++++------------
+ 1 file changed, 64 insertions(+), 17 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -64,6 +64,8 @@ struct btrfs_iget_args {
+ struct btrfs_dio_data {
+ 	ssize_t submitted;
+ 	struct extent_changeset *data_reserved;
++	bool data_space_reserved;
++	bool nocow_done;
+ };
+ 
+ struct btrfs_rename_ctx {
+@@ -7489,15 +7491,25 @@ static int btrfs_get_blocks_direct_write
+ 			ret = PTR_ERR(em2);
+ 			goto out;
+ 		}
++
++		dio_data->nocow_done = true;
+ 	} else {
+ 		/* Our caller expects us to free the input extent map. */
+ 		free_extent_map(em);
+ 		*map = NULL;
+ 
+-		/* We have to COW, so need to reserve metadata and data space. */
+-		ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
+-						   &dio_data->data_reserved,
+-						   start, len);
++		/*
++		 * If we could not allocate data space before locking the file
++		 * range and we can't do a NOCOW write, then we have to fail.
++		 */
++		if (!dio_data->data_space_reserved)
++			return -ENOSPC;
++
++		/*
++		 * We have to COW and we have already reserved data space before,
++		 * so now we reserve only metadata.
++		 */
++		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len);
+ 		if (ret < 0)
+ 			goto out;
+ 		space_reserved = true;
+@@ -7510,10 +7522,8 @@ static int btrfs_get_blocks_direct_write
+ 		*map = em;
+ 		len = min(len, em->len - (start - em->start));
+ 		if (len < prev_len)
+-			btrfs_delalloc_release_space(BTRFS_I(inode),
+-						     dio_data->data_reserved,
+-						     start + len, prev_len - len,
+-						     true);
++			btrfs_delalloc_release_metadata(BTRFS_I(inode),
++							prev_len - len, true);
+ 	}
+ 
+ 	/*
+@@ -7531,15 +7541,7 @@ static int btrfs_get_blocks_direct_write
+ out:
+ 	if (ret && space_reserved) {
+ 		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
+-		if (can_nocow) {
+-			btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
+-		} else {
+-			btrfs_delalloc_release_space(BTRFS_I(inode),
+-						     dio_data->data_reserved,
+-						     start, len, true);
+-			extent_changeset_free(dio_data->data_reserved);
+-			dio_data->data_reserved = NULL;
+-		}
++		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
+ 	}
+ 	return ret;
+ }
+@@ -7556,6 +7558,7 @@ static int btrfs_dio_iomap_begin(struct
+ 	const bool write = !!(flags & IOMAP_WRITE);
+ 	int ret = 0;
+ 	u64 len = length;
++	const u64 data_alloc_len = length;
+ 	bool unlock_extents = false;
+ 
+ 	if (!write)
+@@ -7584,6 +7587,25 @@ static int btrfs_dio_iomap_begin(struct
+ 
+ 	iomap->private = dio_data;
+ 
++	/*
++	 * We always try to allocate data space and must do it before locking
++	 * the file range, to avoid deadlocks with concurrent writes to the same
++	 * range if the range has several extents and the writes don't expand the
++	 * current i_size (the inode lock is taken in shared mode). If we fail to
++	 * allocate data space here we continue and later, after locking the
++	 * file range, we fail with ENOSPC only if we figure out we can not do a
++	 * NOCOW write.
++	 */
++	if (write && !(flags & IOMAP_NOWAIT)) {
++		ret = btrfs_check_data_free_space(BTRFS_I(inode),
++						  &dio_data->data_reserved,
++						  start, data_alloc_len);
++		if (!ret)
++			dio_data->data_space_reserved = true;
++		else if (ret && !(BTRFS_I(inode)->flags &
++				  (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
++			goto err;
++	}
+ 
+ 	/*
+ 	 * If this errors out it's because we couldn't invalidate pagecache for
+@@ -7658,6 +7680,24 @@ static int btrfs_dio_iomap_begin(struct
+ 		unlock_extents = true;
+ 		/* Recalc len in case the new em is smaller than requested */
+ 		len = min(len, em->len - (start - em->start));
++		if (dio_data->data_space_reserved) {
++			u64 release_offset;
++			u64 release_len = 0;
++
++			if (dio_data->nocow_done) {
++				release_offset = start;
++				release_len = data_alloc_len;
++			} else if (len < data_alloc_len) {
++				release_offset = start + len;
++				release_len = data_alloc_len - len;
++			}
++
++			if (release_len > 0)
++				btrfs_free_reserved_data_space(BTRFS_I(inode),
++							       dio_data->data_reserved,
++							       release_offset,
++							       release_len);
++		}
+ 	} else {
+ 		/*
+ 		 * We need to unlock only the end area that we aren't using.
+@@ -7702,6 +7742,13 @@ unlock_err:
+ 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ 			     &cached_state);
+ err:
++	if (dio_data->data_space_reserved) {
++		btrfs_free_reserved_data_space(BTRFS_I(inode),
++					       dio_data->data_reserved,
++					       start, data_alloc_len);
++		extent_changeset_free(dio_data->data_reserved);
++	}
++
+ 	kfree(dio_data);
+ 
+ 	return ret;
diff --git a/queue-5.18/btrfs-fix-the-error-handling-for-submit_extent_page-for-btrfs_do_readpage.patch b/queue-5.18/btrfs-fix-the-error-handling-for-submit_extent_page-for-btrfs_do_readpage.patch
new file mode 100644
index 00000000000..4b3da441384
--- /dev/null
+++ b/queue-5.18/btrfs-fix-the-error-handling-for-submit_extent_page-for-btrfs_do_readpage.patch
@@ -0,0 +1,71 @@
+From 10f7f6f879c28f8368d6516ab1ccf3517a1f5d3d Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Tue, 12 Apr 2022 20:30:14 +0800
+Subject: btrfs: fix the error handling for submit_extent_page() for btrfs_do_readpage()
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 10f7f6f879c28f8368d6516ab1ccf3517a1f5d3d upstream.
+
+[BUG]
+Test case generic/475 has a very high chance (almost 100%) of hitting a fs
+hang, where a data page will never be unlocked and hang all later
+operations.
+
+[CAUSE]
+In btrfs_do_readpage(), if we hit an error from submit_extent_page() we
+will try to do the cleanup for our current io range, and exit.
+
+This works fine for PAGE_SIZE == sectorsize cases, but not for subpage.
+
+For subpage btrfs_do_readpage() will lock the full page first, which can
+contain several different sectors and extents:
+
+ btrfs_do_readpage()
+ |- begin_page_read()
+ |  |- btrfs_subpage_start_reader();
+ |     Now the page will have PAGE_SIZE / sectorsize reader pending,
+ |     and the page is locked.
+ |
+ |- end_page_read() for different branches
+ |  This function will reduce subpage readers, and when readers
+ |  reach 0, it will unlock the page.
+
+But when submit_extent_page() failed, we only cleanup the current
+io range, while the remaining io range will never be cleaned up, and the
+page remains locked forever.
+
+[FIX]
+Update the error handling of submit_extent_page() to cleanup all the
+remaining subpage range before exiting the loop.
+
+Please note that, now submit_extent_page() can only fail due to
+sanity check in alloc_new_bio().
+
+Thus regular IO errors cannot trigger the error path.
+
+CC: stable@vger.kernel.org # 5.15+
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent_io.c |    8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -3743,8 +3743,12 @@ int btrfs_do_readpage(struct page *page,
+ 					 this_bio_flag,
+ 					 force_bio_submit);
+ 		if (ret) {
+-			unlock_extent(tree, cur, cur + iosize - 1);
+-			end_page_read(page, false, cur, iosize);
++			/*
++			 * We have to unlock the remaining range, or the page
++			 * will never be unlocked.
++			 */
++			unlock_extent(tree, cur, end);
++			end_page_read(page, false, cur, end + 1 - cur);
+ 			goto out;
+ 		}
+ 		cur = cur + iosize;
diff --git a/queue-5.18/btrfs-repair-super-block-num_devices-automatically.patch b/queue-5.18/btrfs-repair-super-block-num_devices-automatically.patch
new file mode 100644
index 00000000000..d14d81b07fc
--- /dev/null
+++ b/queue-5.18/btrfs-repair-super-block-num_devices-automatically.patch
@@ -0,0 +1,94 @@
+From d201238ccd2f30b9bfcfadaeae0972e3a486a176 Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Mon, 28 Feb 2022 15:05:53 +0800
+Subject: btrfs: repair super block num_devices automatically
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit d201238ccd2f30b9bfcfadaeae0972e3a486a176 upstream.
+
+[BUG]
+There is a report that a btrfs has a bad super block num devices.
+
+This makes btrfs reject the fs completely.
+
+  BTRFS error (device sdd3): super_num_devices 3 mismatch with num_devices 2 found here
+  BTRFS error (device sdd3): failed to read chunk tree: -22
+  BTRFS error (device sdd3): open_ctree failed
+
+[CAUSE]
+During btrfs device removal, chunk tree and super block num devs are
+updated in two different transactions:
+
+  btrfs_rm_device()
+  |- btrfs_rm_dev_item(device)
+  |  |- trans = btrfs_start_transaction()
+  |  |  Now we got transaction X
+  |  |
+  |  |- btrfs_del_item()
+  |  |  Now device item is removed from chunk tree
+  |  |
+  |  |- btrfs_commit_transaction()
+  |     Transaction X got committed, super num devs untouched,
+  |     but device item removed from chunk tree.
+  |     (AKA, super num devs is already incorrect)
+  |
+  |- cur_devices->num_devices--;
+  |- cur_devices->total_devices--;
+  |- btrfs_set_super_num_devices()
+     All those operations are not in transaction X, thus it will
+     only be written back to disk in next transaction.
+
+So if a power loss happens after transaction X in btrfs_rm_dev_item() is
+committed, but before transaction X+1 (which can be minutes away), then
+we get the super num mismatch.
+
+This has been fixed by commit bbac58698a55 ("btrfs: remove device item
+and update super block in the same transaction").
+
+[FIX]
+Make the super_num_devices check less strict, converting it from a hard
+error to a warning, and reset the value to a correct one for the current
+or next transaction commit.
+
+As the number of device items is the critical information where the
+super block num_devices is only a cached value (and also useful for
+cross checking), it's safe to automatically update it. Other device
+related problems like missing device are handled after that and may
+require other means to resolve, like degraded mount. With this fix,
+potentially affected filesystems won't fail mount and require the manual
+repair by btrfs check.
+
+Reported-by: Luca Béla Palkovics <luca.bela.palkovics@gmail.com>
+Link: https://lore.kernel.org/linux-btrfs/CA+8xDSpvdm_U0QLBAnrH=zqDq_cWCOH5TiV46CKmp3igr44okQ@mail.gmail.com/
+CC: stable@vger.kernel.org # 4.14+
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/volumes.c |    8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -7671,12 +7671,12 @@ int btrfs_read_chunk_tree(struct btrfs_f
+ 	 * do another round of validation checks.
+ 	 */
+ 	if (total_dev != fs_info->fs_devices->total_devices) {
+-		btrfs_err(fs_info,
+-	   "super_num_devices %llu mismatch with num_devices %llu found here",
++		btrfs_warn(fs_info,
++"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
+ 			  btrfs_super_num_devices(fs_info->super_copy),
+ 			  total_dev);
+-		ret = -EINVAL;
+-		goto error;
++		fs_info->fs_devices->total_devices = total_dev;
++		btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
+ 	}
+ 	if (btrfs_super_total_bytes(fs_info->super_copy) <
+ 	    fs_info->fs_devices->total_rw_bytes) {
diff --git a/queue-5.18/btrfs-return-correct-error-number-for-__extent_writepage_io.patch b/queue-5.18/btrfs-return-correct-error-number-for-__extent_writepage_io.patch
new file mode 100644
index 00000000000..8b75e6dddbb
--- /dev/null
+++ b/queue-5.18/btrfs-return-correct-error-number-for-__extent_writepage_io.patch
@@ -0,0 +1,88 @@
+From 44e5801fada6925d2bba1987c7b59cbcc9d0d592 Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Tue, 12 Apr 2022 20:30:15 +0800
+Subject: btrfs: return correct error number for __extent_writepage_io()
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 44e5801fada6925d2bba1987c7b59cbcc9d0d592 upstream.
+
+[BUG]
+If we hit an error from submit_extent_page() inside
+__extent_writepage_io(), we could still return 0 to the caller, and
+even trigger the warning in btrfs_page_assert_not_dirty().
+
+[CAUSE]
+In __extent_writepage_io(), if we hit an error from
+submit_extent_page(), we will just clean up the range and continue.
+
+This is completely fine for regular PAGE_SIZE == sectorsize, as we can
+only hit one sector in one page, thus after the error we're ensured to
+exit and @ret will be saved.
+
+But for the subpage case, we may have other dirty subpage ranges in the
+page, and in the next loop, we may succeed in submitting the next range.
+
+In that case, @ret will be overwritten, and we return 0 to the caller,
+while we have hit some error.
+
+[FIX]
+Introduce @has_error and @saved_ret to record the first error we hit, so
+we will never forget what error we hit.
+
+CC: stable@vger.kernel.org # 5.15+
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent_io.c |   13 ++++++++++++-
+ 1 file changed, 12 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -3920,10 +3920,12 @@ static noinline_for_stack int __extent_w
+ 	u64 extent_offset;
+ 	u64 block_start;
+ 	struct extent_map *em;
++	int saved_ret = 0;
+ 	int ret = 0;
+ 	int nr = 0;
+ 	u32 opf = REQ_OP_WRITE;
+ 	const unsigned int write_flags = wbc_to_write_flags(wbc);
++	bool has_error = false;
+ 	bool compressed;
+ 
+ 	ret = btrfs_writepage_cow_fixup(page);
+@@ -3973,6 +3975,9 @@ static noinline_for_stack int __extent_w
+ 		if (IS_ERR(em)) {
+ 			btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
+ 			ret = PTR_ERR_OR_ZERO(em);
++			has_error = true;
++			if (!saved_ret)
++				saved_ret = ret;
+ 			break;
+ 		}
+ 
+@@ -4036,6 +4041,10 @@ static noinline_for_stack int __extent_w
+ 					 end_bio_extent_writepage,
+ 					 0, 0, false);
+ 		if (ret) {
++			has_error = true;
++			if (!saved_ret)
++				saved_ret = ret;
++
+ 			btrfs_page_set_error(fs_info, page, cur, iosize);
+ 			if (PageWriteback(page))
+ 				btrfs_page_clear_writeback(fs_info, page, cur,
+@@ -4049,8 +4058,10 @@ static noinline_for_stack int __extent_w
+ 	 * If we finish without problem, we should not only clear page dirty,
+ 	 * but also empty subpage dirty bits
+ 	 */
+-	if (!ret)
++	if (!has_error)
+ 		btrfs_page_assert_not_dirty(fs_info, page);
++	else
++		ret = saved_ret;
+ 	*nr_ret = nr;
+ 	return ret;
+ }
diff --git a/queue-5.18/btrfs-zoned-finish-block-group-when-there-are-no-more-allocatable-bytes-left.patch b/queue-5.18/btrfs-zoned-finish-block-group-when-there-are-no-more-allocatable-bytes-left.patch
new file mode 100644
index 00000000000..d5f6a74b227
--- /dev/null
+++ b/queue-5.18/btrfs-zoned-finish-block-group-when-there-are-no-more-allocatable-bytes-left.patch
@@ -0,0 +1,51 @@
+From 8b8a53998caefebfe5c8da7a74c2b601caf5dd48 Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Tue, 3 May 2022 17:48:52 -0700
+Subject: btrfs: zoned: finish block group when there are no more allocatable bytes left
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 8b8a53998caefebfe5c8da7a74c2b601caf5dd48 upstream.
+
+Currently, btrfs_zone_finish_endio() finishes a block group only when the
+written region reaches the end of the block group. We can also finish the
+block group when no more allocation is possible.
+
+Fixes: be1a1d7a5d24 ("btrfs: zoned: finish fully written block group")
+CC: stable@vger.kernel.org # 5.16+
+Reviewed-by: Pankaj Raghav <p.raghav@samsung.com>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/zoned.c |   11 ++++++++++-
+ 1 file changed, 10 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -2000,6 +2000,7 @@ void btrfs_zone_finish_endio(struct btrf
+ 	struct btrfs_block_group *block_group;
+ 	struct map_lookup *map;
+ 	struct btrfs_device *device;
++	u64 min_alloc_bytes;
+ 	u64 physical;
+ 
+ 	if (!btrfs_is_zoned(fs_info))
+@@ -2008,7 +2009,15 @@ void btrfs_zone_finish_endio(struct btrf
+ 	block_group = btrfs_lookup_block_group(fs_info, logical);
+ 	ASSERT(block_group);
+ 
+-	if (logical + length < block_group->start + block_group->zone_capacity)
++	/* No MIXED_BG on zoned btrfs. */
++	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
++		min_alloc_bytes = fs_info->sectorsize;
++	else
++		min_alloc_bytes = fs_info->nodesize;
++
++	/* Bail out if we can allocate more data from this block group. */
++	if (logical + length + min_alloc_bytes <=
++	    block_group->start + block_group->zone_capacity)
+ 		goto out;
+ 
+ 	spin_lock(&block_group->lock);
diff --git a/queue-5.18/btrfs-zoned-fix-comparison-of-alloc_offset-vs-meta_write_pointer.patch b/queue-5.18/btrfs-zoned-fix-comparison-of-alloc_offset-vs-meta_write_pointer.patch
new file mode 100644
index 00000000000..e73bf66cded
--- /dev/null
+++ b/queue-5.18/btrfs-zoned-fix-comparison-of-alloc_offset-vs-meta_write_pointer.patch
@@ -0,0 +1,34 @@
+From aa9ffadfcae33e611d8c2d476bcc2aa0d273b587 Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Wed, 4 May 2022 16:12:48 -0700
+Subject: btrfs: zoned: fix comparison of alloc_offset vs meta_write_pointer
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit aa9ffadfcae33e611d8c2d476bcc2aa0d273b587 upstream.
+
+The block_group->alloc_offset is an offset from the start of the block
+group. OTOH, the ->meta_write_pointer is an address in the logical
+space. So, we should compare the alloc_offset shifted with the
+block_group->start.
+
+Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking")
+CC: stable@vger.kernel.org # 5.16+
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/zoned.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -1896,7 +1896,7 @@ int btrfs_zone_finish(struct btrfs_block
+ 	/* Check if we have unwritten allocated space */
+ 	if ((block_group->flags &
+ 	     (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
+-	    block_group->alloc_offset > block_group->meta_write_pointer) {
++	    block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
+ 		spin_unlock(&block_group->lock);
+ 		return -EAGAIN;
+ 	}
diff --git a/queue-5.18/btrfs-zoned-properly-finish-block-group-on-metadata-write.patch b/queue-5.18/btrfs-zoned-properly-finish-block-group-on-metadata-write.patch
new file mode 100644
index 00000000000..c33fcba10e4
--- /dev/null
+++ b/queue-5.18/btrfs-zoned-properly-finish-block-group-on-metadata-write.patch
@@ -0,0 +1,144 @@
+From 56fbb0a4e8b3e929e41cc846e6ef89eb01152201 Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Tue, 3 May 2022 17:48:53 -0700
+Subject: btrfs: zoned: properly finish block group on metadata write
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 56fbb0a4e8b3e929e41cc846e6ef89eb01152201 upstream.
+
+Commit be1a1d7a5d24 ("btrfs: zoned: finish fully written block group")
+introduced zone finishing code for both the data and metadata end_io paths.
+However, the metadata side is not working as it should. First, it
+compares a logical address (eb->start + eb->len) with an offset within a
+block group (cache->zone_capacity) in submit_eb_page(). That essentially
+disabled zone finishing on the metadata end_io path.
+
+Furthermore, fixing the issue above revealed we cannot call
+btrfs_zone_finish_endio() in end_extent_buffer_writeback(). We cannot
+call btrfs_lookup_block_group(), which requires a spin lock, inside end_io
+context.
+
+Introduce btrfs_schedule_zone_finish_bg() to wait for the extent buffer
+writeback and do the zone finish IO in a workqueue.
+
+Also, drop EXTENT_BUFFER_ZONE_FINISH as it is no longer used.
+
+Fixes: be1a1d7a5d24 ("btrfs: zoned: finish fully written block group")
+CC: stable@vger.kernel.org # 5.16+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/block-group.h |    2 ++
+ fs/btrfs/extent_io.c   |    6 +-----
+ fs/btrfs/extent_io.h   |    1 -
+ fs/btrfs/zoned.c       |   31 +++++++++++++++++++++++++++++++
+ fs/btrfs/zoned.h       |    5 +++++
+ 5 files changed, 39 insertions(+), 6 deletions(-)
+
+--- a/fs/btrfs/block-group.h
++++ b/fs/btrfs/block-group.h
+@@ -212,6 +212,8 @@ struct btrfs_block_group {
+ 	u64 meta_write_pointer;
+ 	struct map_lookup *physical_map;
+ 	struct list_head active_bg_list;
++	struct work_struct zone_finish_work;
++	struct extent_buffer *last_eb;
+ };
+ 
+ static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -4196,9 +4196,6 @@ void wait_on_extent_buffer_writeback(str
+ 
+ static void end_extent_buffer_writeback(struct extent_buffer *eb)
+ {
+-	if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags))
+-		btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len);
+-
+ 	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+ 	smp_mb__after_atomic();
+ 	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+@@ -4818,8 +4815,7 @@ static int submit_eb_page(struct page *p
+ 		/*
+ 		 * Implies write in zoned mode. Mark the last eb in a block group.
+ 		 */
+-		if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity)
+-			set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags);
++		btrfs_schedule_zone_finish_bg(cache, eb);
+ 		btrfs_put_block_group(cache);
+ 	}
+ 	ret = write_one_eb(eb, wbc, epd);
+--- a/fs/btrfs/extent_io.h
++++ b/fs/btrfs/extent_io.h
+@@ -32,7 +32,6 @@ enum {
+ 	/* write IO error */
+ 	EXTENT_BUFFER_WRITE_ERR,
+ 	EXTENT_BUFFER_NO_CHECK,
+-	EXTENT_BUFFER_ZONE_FINISH,
+ };
+ 
+ /* these are flags for __process_pages_contig */
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -2046,6 +2046,37 @@ out:
+ 	btrfs_put_block_group(block_group);
+ }
+ 
++static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
++{
++	struct btrfs_block_group *bg =
++		container_of(work, struct btrfs_block_group, zone_finish_work);
++
++	wait_on_extent_buffer_writeback(bg->last_eb);
++	free_extent_buffer(bg->last_eb);
++	btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
++	btrfs_put_block_group(bg);
++}
++
++void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
++				   struct extent_buffer *eb)
++{
++	if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
++		return;
++
++	if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
++		btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
++			  bg->start);
++		return;
++	}
++
++	/* For the work */
++	btrfs_get_block_group(bg);
++	atomic_inc(&eb->refs);
++	bg->last_eb = eb;
++	INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
++	queue_work(system_unbound_wq, &bg->zone_finish_work);
++}
++
+ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
+ {
+ 	struct btrfs_fs_info *fs_info = bg->fs_info;
+--- a/fs/btrfs/zoned.h
++++ b/fs/btrfs/zoned.h
+@@ -76,6 +76,8 @@ int btrfs_zone_finish(struct btrfs_block
+ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
+ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+ 			     u64 length);
++void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
++				   struct extent_buffer *eb);
+ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
+ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
+ #else /* CONFIG_BLK_DEV_ZONED */
+@@ -233,6 +235,9 @@ static inline bool btrfs_can_activate_zo
+ static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
+ 					   u64 logical, u64 length) { }
+ 
++static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
++						 struct extent_buffer *eb) { }
++
+ static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
+ 
+ static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
diff --git a/queue-5.18/btrfs-zoned-zone-finish-unused-block-group.patch b/queue-5.18/btrfs-zoned-zone-finish-unused-block-group.patch
new file mode 100644
index 00000000000..6ea6be08ebd
--- /dev/null
+++ b/queue-5.18/btrfs-zoned-zone-finish-unused-block-group.patch
@@ -0,0 +1,49 @@
+From 74e91b12b11560f01d120751d99d91d54b265d3d Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Tue, 3 May 2022 17:48:54 -0700
+Subject: btrfs: zoned: zone finish unused block group
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 74e91b12b11560f01d120751d99d91d54b265d3d upstream.
+
+While the active zones within an active block group are reset, and their
+active resource is released, the block group itself is kept in the active
+block group list and marked as active. As a result, the list will contain
+more than max_active_zones block groups. That itself is not fatal for the
+device as the zones are properly reset.
+
+However, that inflated list is, of course, strange. Also, a to-appear
+patch series, which deactivates an active block group on demand, gets
+confused with the wrong list.
+
+So, fix the issue by finishing the unused block group once it gets
+read-only, so that we can release the active resource in an early stage.
+
+Fixes: be1a1d7a5d24 ("btrfs: zoned: finish fully written block group")
+CC: stable@vger.kernel.org # 5.16+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/block-group.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -1367,6 +1367,14 @@ void btrfs_delete_unused_bgs(struct btrf
+ 			goto next;
+ 		}
+ 
++		ret = btrfs_zone_finish(block_group);
++		if (ret < 0) {
++			btrfs_dec_block_group_ro(block_group);
++			if (ret == -EAGAIN)
++				ret = 0;
++			goto next;
++		}
++
+ 		/*
+ 		 * Want to do this before we do anything else so we can recover
+ 		 * properly if we fail to join the transaction.
diff --git a/queue-5.18/series b/queue-5.18/series
index 9ce884e699c..7f1ab4032c7 100644
--- a/queue-5.18/series
+++ b/queue-5.18/series
@@ -42,3 +42,12 @@ kthread-don-t-allocate-kthread_struct-for-init-and-umh.patch
 ptrace-um-replace-pt_dtrace-with-tif_singlestep.patch
 ptrace-xtensa-replace-pt_singlestep-with-tif_singlestep.patch
 ptrace-reimplement-ptrace_kill-by-always-sending-sigkill.patch
+btrfs-add-0x-prefix-for-unsupported-optional-features.patch
+btrfs-return-correct-error-number-for-__extent_writepage_io.patch
+btrfs-repair-super-block-num_devices-automatically.patch
+btrfs-fix-the-error-handling-for-submit_extent_page-for-btrfs_do_readpage.patch
+btrfs-fix-deadlock-between-concurrent-dio-writes-when-low-on-free-data-space.patch
+btrfs-zoned-properly-finish-block-group-on-metadata-write.patch
+btrfs-zoned-zone-finish-unused-block-group.patch
+btrfs-zoned-finish-block-group-when-there-are-no-more-allocatable-bytes-left.patch
+btrfs-zoned-fix-comparison-of-alloc_offset-vs-meta_write_pointer.patch
-- 
2.47.3