From: Greg Kroah-Hartman Date: Wed, 15 Sep 2021 11:37:27 +0000 (+0200) Subject: 5.14-stable patches X-Git-Tag: v5.14.5~51 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=b283682235d150a1a9a59a1ff48de9b5a7a99b26;p=thirdparty%2Fkernel%2Fstable-queue.git 5.14-stable patches added patches: 9p-xen-fix-end-of-loop-tests-for-list_for_each_entry.patch blk-zoned-allow-blkreportzone-without-cap_sys_admin.patch blk-zoned-allow-zone-management-send-operations-without-cap_sys_admin.patch btrfs-do-not-do-preemptive-flushing-if-the-majority-is-global-rsv.patch btrfs-fix-upper-limit-for-max_inline-for-page-size-64k.patch btrfs-reduce-the-preemptive-flushing-threshold-to-90.patch btrfs-reset-replace-target-device-to-allocation-state-on-close.patch btrfs-use-delalloc_bytes-to-determine-flush-amount-for-shrink_delalloc.patch btrfs-wait-on-async-extents-when-flushing-delalloc.patch btrfs-wake-up-async_delalloc_pages-waiters-after-submit.patch btrfs-zoned-fix-block-group-alloc_offset-calculation.patch btrfs-zoned-fix-double-counting-of-split-ordered-extent.patch btrfs-zoned-suppress-reclaim-error-message-on-eagain.patch ceph-fix-dereference-of-null-pointer-cf.patch input-elan_i2c-reduce-the-resume-time-for-controller-in-whitebox.patch powerpc-perf-hv-gpci-fix-counter-value-parsing.patch xen-fix-setting-of-max_pfn-in-shared_info.patch --- diff --git a/queue-5.14/9p-xen-fix-end-of-loop-tests-for-list_for_each_entry.patch b/queue-5.14/9p-xen-fix-end-of-loop-tests-for-list_for_each_entry.patch new file mode 100644 index 00000000000..bd280e9137f --- /dev/null +++ b/queue-5.14/9p-xen-fix-end-of-loop-tests-for-list_for_each_entry.patch @@ -0,0 +1,46 @@ +From 732b33d0dbf17e9483f0b50385bf606f724f50a2 Mon Sep 17 00:00:00 2001 +From: Harshvardhan Jha +Date: Tue, 27 Jul 2021 05:37:10 +0530 +Subject: 9p/xen: Fix end of loop tests for list_for_each_entry + +From: Harshvardhan Jha + +commit 732b33d0dbf17e9483f0b50385bf606f724f50a2 upstream. 
+ +This patch addresses the following problems: + - priv can never be NULL, so this part of the check is useless + - if the loop ran through the whole list, priv->client is invalid and +it is more appropriate and sufficient to check for the end of +list_for_each_entry loop condition. + +Link: http://lkml.kernel.org/r/20210727000709.225032-1-harshvardhan.jha@oracle.com +Signed-off-by: Harshvardhan Jha +Reviewed-by: Stefano Stabellini +Tested-by: Stefano Stabellini +Cc: +Signed-off-by: Dominique Martinet +Signed-off-by: Greg Kroah-Hartman +--- + net/9p/trans_xen.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/net/9p/trans_xen.c ++++ b/net/9p/trans_xen.c +@@ -138,7 +138,7 @@ static bool p9_xen_write_todo(struct xen + + static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) + { +- struct xen_9pfs_front_priv *priv = NULL; ++ struct xen_9pfs_front_priv *priv; + RING_IDX cons, prod, masked_cons, masked_prod; + unsigned long flags; + u32 size = p9_req->tc.size; +@@ -151,7 +151,7 @@ static int p9_xen_request(struct p9_clie + break; + } + read_unlock(&xen_9pfs_lock); +- if (!priv || priv->client != client) ++ if (list_entry_is_head(priv, &xen_9pfs_devs, list)) + return -EINVAL; + + num = p9_req->tc.tag % priv->num_rings; diff --git a/queue-5.14/blk-zoned-allow-blkreportzone-without-cap_sys_admin.patch b/queue-5.14/blk-zoned-allow-blkreportzone-without-cap_sys_admin.patch new file mode 100644 index 00000000000..6f72d8da17b --- /dev/null +++ b/queue-5.14/blk-zoned-allow-blkreportzone-without-cap_sys_admin.patch @@ -0,0 +1,45 @@ +From 4d643b66089591b4769bcdb6fd1bfeff2fe301b8 Mon Sep 17 00:00:00 2001 +From: Niklas Cassel +Date: Wed, 11 Aug 2021 11:05:19 +0000 +Subject: blk-zoned: allow BLKREPORTZONE without CAP_SYS_ADMIN + +From: Niklas Cassel + +commit 4d643b66089591b4769bcdb6fd1bfeff2fe301b8 upstream. + +A user space process should not need the CAP_SYS_ADMIN capability set +in order to perform a BLKREPORTZONE ioctl. 
+ +Getting the zone report is required in order to get the write pointer. +Neither read() nor write() requires CAP_SYS_ADMIN, so it is reasonable +that a user space process that can read/write from/to the device, also +can get the write pointer. (Since e.g. writes have to be at the write +pointer.) + +Fixes: 3ed05a987e0f ("blk-zoned: implement ioctls") +Signed-off-by: Niklas Cassel +Reviewed-by: Damien Le Moal +Reviewed-by: Aravind Ramesh +Reviewed-by: Adam Manzanares +Reviewed-by: Himanshu Madhani +Reviewed-by: Johannes Thumshirn +Cc: stable@vger.kernel.org # v4.10+ +Link: https://lore.kernel.org/r/20210811110505.29649-3-Niklas.Cassel@wdc.com +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + block/blk-zoned.c | 3 --- + 1 file changed, 3 deletions(-) + +--- a/block/blk-zoned.c ++++ b/block/blk-zoned.c +@@ -360,9 +360,6 @@ int blkdev_report_zones_ioctl(struct blo + if (!blk_queue_is_zoned(q)) + return -ENOTTY; + +- if (!capable(CAP_SYS_ADMIN)) +- return -EACCES; +- + if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) + return -EFAULT; + diff --git a/queue-5.14/blk-zoned-allow-zone-management-send-operations-without-cap_sys_admin.patch b/queue-5.14/blk-zoned-allow-zone-management-send-operations-without-cap_sys_admin.patch new file mode 100644 index 00000000000..b64a7326d44 --- /dev/null +++ b/queue-5.14/blk-zoned-allow-zone-management-send-operations-without-cap_sys_admin.patch @@ -0,0 +1,51 @@ +From ead3b768bb51259e3a5f2287ff5fc9041eb6f450 Mon Sep 17 00:00:00 2001 +From: Niklas Cassel +Date: Wed, 11 Aug 2021 11:05:18 +0000 +Subject: blk-zoned: allow zone management send operations without CAP_SYS_ADMIN + +From: Niklas Cassel + +commit ead3b768bb51259e3a5f2287ff5fc9041eb6f450 upstream. + +Zone management send operations (BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE +and BLKFINISHZONE) should be allowed under the same permissions as write(). +(write() does not require CAP_SYS_ADMIN). 
+ +Additionally, other ioctls like BLKSECDISCARD and BLKZEROOUT only check if +the fd was successfully opened with FMODE_WRITE. +(They do not require CAP_SYS_ADMIN). + +Currently, zone management send operations require both CAP_SYS_ADMIN +and that the fd was successfully opened with FMODE_WRITE. + +Remove the CAP_SYS_ADMIN requirement, so that zone management send +operations match the access control requirement of write(), BLKSECDISCARD +and BLKZEROOUT. + +Fixes: 3ed05a987e0f ("blk-zoned: implement ioctls") +Signed-off-by: Niklas Cassel +Reviewed-by: Damien Le Moal +Reviewed-by: Aravind Ramesh +Reviewed-by: Adam Manzanares +Reviewed-by: Himanshu Madhani +Reviewed-by: Johannes Thumshirn +Cc: stable@vger.kernel.org # v4.10+ +Link: https://lore.kernel.org/r/20210811110505.29649-2-Niklas.Cassel@wdc.com +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + block/blk-zoned.c | 3 --- + 1 file changed, 3 deletions(-) + +--- a/block/blk-zoned.c ++++ b/block/blk-zoned.c +@@ -421,9 +421,6 @@ int blkdev_zone_mgmt_ioctl(struct block_ + if (!blk_queue_is_zoned(q)) + return -ENOTTY; + +- if (!capable(CAP_SYS_ADMIN)) +- return -EACCES; +- + if (!(mode & FMODE_WRITE)) + return -EBADF; + diff --git a/queue-5.14/btrfs-do-not-do-preemptive-flushing-if-the-majority-is-global-rsv.patch b/queue-5.14/btrfs-do-not-do-preemptive-flushing-if-the-majority-is-global-rsv.patch new file mode 100644 index 00000000000..777de276ee4 --- /dev/null +++ b/queue-5.14/btrfs-do-not-do-preemptive-flushing-if-the-majority-is-global-rsv.patch @@ -0,0 +1,49 @@ +From 114623979405abf0b143f9c6688b3ff00ee48338 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 11 Aug 2021 14:37:16 -0400 +Subject: btrfs: do not do preemptive flushing if the majority is global rsv + +From: Josef Bacik + +commit 114623979405abf0b143f9c6688b3ff00ee48338 upstream. 
+ +A common characteristic of the bug report where preemptive flushing was +going full tilt was the fact that the vast majority of the free metadata +space was used up by the global reserve. The hard 90% threshold would +cover the majority of these cases, but to be even smarter we should take +into account how much of the outstanding reservations are covered by the +global block reserve. If the global block reserve accounts for the vast +majority of outstanding reservations, skip preemptive flushing, as it +will likely just cause churn and pain. + +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=212185 +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/space-info.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +--- a/fs/btrfs/space-info.c ++++ b/fs/btrfs/space-info.c +@@ -741,6 +741,20 @@ static bool need_preemptive_reclaim(stru + global_rsv_size) >= thresh) + return false; + ++ used = space_info->bytes_may_use + space_info->bytes_pinned; ++ ++ /* The total flushable belongs to the global rsv, don't flush. */ ++ if (global_rsv_size >= used) ++ return false; ++ ++ /* ++ * 128MiB is 1/4 of the maximum global rsv size. If we have less than ++ * that devoted to other reservations then there's no sense in flushing, ++ * we don't have a lot of things that need flushing. ++ */ ++ if (used - global_rsv_size <= SZ_128M) ++ return false; ++ + /* + * We have tickets queued, bail so we don't compete with the async + * flushers. 
diff --git a/queue-5.14/btrfs-fix-upper-limit-for-max_inline-for-page-size-64k.patch b/queue-5.14/btrfs-fix-upper-limit-for-max_inline-for-page-size-64k.patch new file mode 100644 index 00000000000..1f1f499f1c2 --- /dev/null +++ b/queue-5.14/btrfs-fix-upper-limit-for-max_inline-for-page-size-64k.patch @@ -0,0 +1,93 @@ +From 6f93e834fa7c5faa0372e46828b4b2a966ac61d7 Mon Sep 17 00:00:00 2001 +From: Anand Jain +Date: Tue, 10 Aug 2021 23:23:44 +0800 +Subject: btrfs: fix upper limit for max_inline for page size 64K + +From: Anand Jain + +commit 6f93e834fa7c5faa0372e46828b4b2a966ac61d7 upstream. + +The mount option max_inline ranges from 0 to the sectorsize (which is +now equal to page size). But we parse the mount options too early and +before the actual sectorsize is read from the superblock. So the upper +limit of max_inline is unaware of the actual sectorsize and is limited +by the temporary sectorsize 4096, even on a system where the default +sectorsize is 64K. + +Fix this by reading the superblock sectorsize before the mount option +parse. + +Reported-by: Alexander Tsvetkov +CC: stable@vger.kernel.org # 5.4+ +Signed-off-by: Anand Jain +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/disk-io.c | 48 ++++++++++++++++++++++++------------------------ + 1 file changed, 24 insertions(+), 24 deletions(-) + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -3314,6 +3314,30 @@ int __cold open_ctree(struct super_block + */ + fs_info->compress_type = BTRFS_COMPRESS_ZLIB; + ++ /* ++ * Flag our filesystem as having big metadata blocks if they are bigger ++ * than the page size. 
++ */ ++ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { ++ if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) ++ btrfs_info(fs_info, ++ "flagging fs with big metadata feature"); ++ features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; ++ } ++ ++ /* Set up fs_info before parsing mount options */ ++ nodesize = btrfs_super_nodesize(disk_super); ++ sectorsize = btrfs_super_sectorsize(disk_super); ++ stripesize = sectorsize; ++ fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); ++ fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); ++ ++ fs_info->nodesize = nodesize; ++ fs_info->sectorsize = sectorsize; ++ fs_info->sectorsize_bits = ilog2(sectorsize); ++ fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; ++ fs_info->stripesize = stripesize; ++ + ret = btrfs_parse_options(fs_info, options, sb->s_flags); + if (ret) { + err = ret; +@@ -3341,30 +3365,6 @@ int __cold open_ctree(struct super_block + btrfs_info(fs_info, "has skinny extents"); + + /* +- * flag our filesystem as having big metadata blocks if +- * they are bigger than the page size +- */ +- if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { +- if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) +- btrfs_info(fs_info, +- "flagging fs with big metadata feature"); +- features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; +- } +- +- nodesize = btrfs_super_nodesize(disk_super); +- sectorsize = btrfs_super_sectorsize(disk_super); +- stripesize = sectorsize; +- fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); +- fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); +- +- /* Cache block sizes */ +- fs_info->nodesize = nodesize; +- fs_info->sectorsize = sectorsize; +- fs_info->sectorsize_bits = ilog2(sectorsize); +- fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; +- fs_info->stripesize = stripesize; +- +- /* + * mixed block groups end up with duplicate but slightly offset + * extent buffers for 
the same range. It leads to corruptions + */ diff --git a/queue-5.14/btrfs-reduce-the-preemptive-flushing-threshold-to-90.patch b/queue-5.14/btrfs-reduce-the-preemptive-flushing-threshold-to-90.patch new file mode 100644 index 00000000000..d89e77c9993 --- /dev/null +++ b/queue-5.14/btrfs-reduce-the-preemptive-flushing-threshold-to-90.patch @@ -0,0 +1,39 @@ +From 93c60b17f2b5fca2c5931d7944788d1ef5f25528 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 11 Aug 2021 14:37:15 -0400 +Subject: btrfs: reduce the preemptive flushing threshold to 90% + +From: Josef Bacik + +commit 93c60b17f2b5fca2c5931d7944788d1ef5f25528 upstream. + +The preemptive flushing code was added in order to avoid needing to +synchronously wait for ENOSPC flushing to recover space. Once we're +almost full however we can essentially flush constantly. We were using +98% as a threshold to determine if we were simply full, however in +practice this is a really high bar to hit. For example reports of +systems running into this problem had around 94% usage and thus +continued to flush. Fix this by lowering the threshold to 90%, which is +a more sane value, especially for smaller file systems. + +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=212185 +CC: stable@vger.kernel.org # 5.12+ +Fixes: 576fa34830af ("btrfs: improve preemptive background space flushing") +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/space-info.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/btrfs/space-info.c ++++ b/fs/btrfs/space-info.c +@@ -733,7 +733,7 @@ static bool need_preemptive_reclaim(stru + { + u64 global_rsv_size = fs_info->global_block_rsv.reserved; + u64 ordered, delalloc; +- u64 thresh = div_factor_fine(space_info->total_bytes, 98); ++ u64 thresh = div_factor_fine(space_info->total_bytes, 90); + u64 used; + + /* If we're just plain full then async reclaim just slows us down. 
*/ diff --git a/queue-5.14/btrfs-reset-replace-target-device-to-allocation-state-on-close.patch b/queue-5.14/btrfs-reset-replace-target-device-to-allocation-state-on-close.patch new file mode 100644 index 00000000000..4798af24cc0 --- /dev/null +++ b/queue-5.14/btrfs-reset-replace-target-device-to-allocation-state-on-close.patch @@ -0,0 +1,125 @@ +From 0d977e0eba234e01a60bdde27314dc21374201b3 Mon Sep 17 00:00:00 2001 +From: Desmond Cheong Zhi Xi +Date: Sat, 21 Aug 2021 01:50:40 +0800 +Subject: btrfs: reset replace target device to allocation state on close + +From: Desmond Cheong Zhi Xi + +commit 0d977e0eba234e01a60bdde27314dc21374201b3 upstream. + +This crash was observed with a failed assertion on device close: + + BTRFS: Transaction aborted (error -28) + WARNING: CPU: 1 PID: 3902 at fs/btrfs/extent-tree.c:2150 btrfs_run_delayed_refs+0x1d2/0x1e0 [btrfs] + Modules linked in: btrfs blake2b_generic libcrc32c crc32c_intel xor zstd_decompress zstd_compress xxhash lzo_compress lzo_decompress raid6_pq loop + CPU: 1 PID: 3902 Comm: kworker/u8:4 Not tainted 5.14.0-rc5-default+ #1532 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014 + Workqueue: events_unbound btrfs_async_reclaim_metadata_space [btrfs] + RIP: 0010:btrfs_run_delayed_refs+0x1d2/0x1e0 [btrfs] + RSP: 0018:ffffb7a5452d7d80 EFLAGS: 00010282 + RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000000 + RDX: 0000000000000001 RSI: ffffffffabee13c4 RDI: 00000000ffffffff + RBP: ffff97834176a378 R08: 0000000000000001 R09: 0000000000000001 + R10: 0000000000000000 R11: 0000000000000001 R12: ffff97835195d388 + R13: 0000000005b08000 R14: ffff978385484000 R15: 000000000000016c + FS: 0000000000000000(0000) GS:ffff9783bd800000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 000056190d003fe8 CR3: 000000002a81e005 CR4: 0000000000170ea0 + Call Trace: + flush_space+0x197/0x2f0 [btrfs] + 
btrfs_async_reclaim_metadata_space+0x139/0x300 [btrfs] + process_one_work+0x262/0x5e0 + worker_thread+0x4c/0x320 + ? process_one_work+0x5e0/0x5e0 + kthread+0x144/0x170 + ? set_kthread_struct+0x40/0x40 + ret_from_fork+0x1f/0x30 + irq event stamp: 19334989 + hardirqs last enabled at (19334997): [] console_unlock+0x2b7/0x400 + hardirqs last disabled at (19335006): [] console_unlock+0x33d/0x400 + softirqs last enabled at (19334900): [] __do_softirq+0x30d/0x574 + softirqs last disabled at (19334893): [] irq_exit_rcu+0x12c/0x140 + ---[ end trace 45939e308e0dd3c7 ]--- + BTRFS: error (device vdd) in btrfs_run_delayed_refs:2150: errno=-28 No space left + BTRFS info (device vdd): forced readonly + BTRFS warning (device vdd): failed setting block group ro: -30 + BTRFS info (device vdd): suspending dev_replace for unmount + assertion failed: !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state), in fs/btrfs/volumes.c:1150 + ------------[ cut here ]------------ + kernel BUG at fs/btrfs/ctree.h:3431! 
+ invalid opcode: 0000 [#1] PREEMPT SMP + CPU: 1 PID: 3982 Comm: umount Tainted: G W 5.14.0-rc5-default+ #1532 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014 + RIP: 0010:assertfail.constprop.0+0x18/0x1a [btrfs] + RSP: 0018:ffffb7a5454c7db8 EFLAGS: 00010246 + RAX: 0000000000000068 RBX: ffff978364b91c00 RCX: 0000000000000000 + RDX: 0000000000000000 RSI: ffffffffabee13c4 RDI: 00000000ffffffff + RBP: ffff9783523a4c00 R08: 0000000000000001 R09: 0000000000000001 + R10: 0000000000000000 R11: 0000000000000001 R12: ffff9783523a4d18 + R13: 0000000000000000 R14: 0000000000000004 R15: 0000000000000003 + FS: 00007f61c8f42800(0000) GS:ffff9783bd800000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 000056190cffa810 CR3: 0000000030b96002 CR4: 0000000000170ea0 + Call Trace: + btrfs_close_one_device.cold+0x11/0x55 [btrfs] + close_fs_devices+0x44/0xb0 [btrfs] + btrfs_close_devices+0x48/0x160 [btrfs] + generic_shutdown_super+0x69/0x100 + kill_anon_super+0x14/0x30 + btrfs_kill_super+0x12/0x20 [btrfs] + deactivate_locked_super+0x2c/0xa0 + cleanup_mnt+0x144/0x1b0 + task_work_run+0x59/0xa0 + exit_to_user_mode_loop+0xe7/0xf0 + exit_to_user_mode_prepare+0xaf/0xf0 + syscall_exit_to_user_mode+0x19/0x50 + do_syscall_64+0x4a/0x90 + entry_SYSCALL_64_after_hwframe+0x44/0xae + +This happens when close_ctree is called while a dev_replace hasn't +completed. In close_ctree, we suspend the dev_replace, but keep the +replace target around so that we can resume the dev_replace procedure +when we mount the root again. 
This is the call trace: + + close_ctree(): + btrfs_dev_replace_suspend_for_unmount(); + btrfs_close_devices(): + btrfs_close_fs_devices(): + btrfs_close_one_device(): + ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, + &device->dev_state)); + +However, since the replace target sticks around, there is a device +with BTRFS_DEV_STATE_REPLACE_TGT set on close, and we fail the +assertion in btrfs_close_one_device. + +To fix this, if we come across the replace target device when +closing, we should properly reset it back to allocation state. This +fix also ensures that if a non-target device has a corrupted state and +has the BTRFS_DEV_STATE_REPLACE_TGT bit set, the assertion will still +catch the error. + +Reported-by: David Sterba +Fixes: b2a616676839 ("btrfs: fix rw device counting in __btrfs_free_extra_devids") +CC: stable@vger.kernel.org # 4.19+ +Reviewed-by: Anand Jain +Signed-off-by: Desmond Cheong Zhi Xi +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/volumes.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -1130,6 +1130,9 @@ static void btrfs_close_one_device(struc + fs_devices->rw_devices--; + } + ++ if (device->devid == BTRFS_DEV_REPLACE_DEVID) ++ clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); ++ + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) + fs_devices->missing_devices--; + diff --git a/queue-5.14/btrfs-use-delalloc_bytes-to-determine-flush-amount-for-shrink_delalloc.patch b/queue-5.14/btrfs-use-delalloc_bytes-to-determine-flush-amount-for-shrink_delalloc.patch new file mode 100644 index 00000000000..7d82edd52ae --- /dev/null +++ b/queue-5.14/btrfs-use-delalloc_bytes-to-determine-flush-amount-for-shrink_delalloc.patch @@ -0,0 +1,176 @@ +From 03fe78cc2942c55cc13be5ca42578750f17204a1 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 14 Jul 2021 14:47:20 -0400 +Subject: btrfs: use delalloc_bytes to determine flush 
amount for shrink_delalloc + +From: Josef Bacik + +commit 03fe78cc2942c55cc13be5ca42578750f17204a1 upstream. + +We have been hitting some early ENOSPC issues in production with more +recent kernels, and I tracked it down to us simply not flushing delalloc +as aggressively as we should be. With tracing I was seeing us failing +all tickets with all of the block rsvs at or around 0, with very little +pinned space, but still around 120MiB of outstanding bytes_may_used. +Upon further investigation I saw that we were flushing around 14 pages +per shrink call for delalloc, despite having around 2GiB of delalloc +outstanding. + +Consider the example of a 8 way machine, all CPUs trying to create a +file in parallel, which at the time of this commit requires 5 items to +do. Assuming a 16k leaf size, we have 10MiB of total metadata reclaim +size waiting on reservations. Now assume we have 128MiB of delalloc +outstanding. With our current math we would set items to 20, and then +set to_reclaim to 20 * 256k, or 5MiB. + +Assuming that we went through this loop all 3 times, for both +FLUSH_DELALLOC and FLUSH_DELALLOC_WAIT, and then did the full loop +twice, we'd only flush 60MiB of the 128MiB delalloc space. This could +leave a fair bit of delalloc reservations still hanging around by the +time we go to ENOSPC out all the remaining tickets. + +Fix this two ways. First, change the calculations to be a fraction of +the total delalloc bytes on the system. Prior to this change we were +calculating based on dirty inodes so our math made more sense, now it's +just completely unrelated to what we're actually doing. + +Second add a FLUSH_DELALLOC_FULL state, that we hold off until we've +gone through the flush states at least once. This will empty the system +of all delalloc so we're sure to be truly out of space when we start +failing tickets. + +I'm tagging stable 5.10 and forward, because this is where we started +using the page stuff heavily again. 
This affects earlier kernel +versions as well, but would be a pain to backport to them as the +flushing mechanisms aren't the same. + +CC: stable@vger.kernel.org # 5.10+ +Reviewed-by: Nikolay Borisov +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/ctree.h | 9 +++++---- + fs/btrfs/space-info.c | 40 ++++++++++++++++++++++++++++------------ + include/trace/events/btrfs.h | 1 + + 3 files changed, 34 insertions(+), 16 deletions(-) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -2781,10 +2781,11 @@ enum btrfs_flush_state { + FLUSH_DELAYED_REFS = 4, + FLUSH_DELALLOC = 5, + FLUSH_DELALLOC_WAIT = 6, +- ALLOC_CHUNK = 7, +- ALLOC_CHUNK_FORCE = 8, +- RUN_DELAYED_IPUTS = 9, +- COMMIT_TRANS = 10, ++ FLUSH_DELALLOC_FULL = 7, ++ ALLOC_CHUNK = 8, ++ ALLOC_CHUNK_FORCE = 9, ++ RUN_DELAYED_IPUTS = 10, ++ COMMIT_TRANS = 11, + }; + + int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, +--- a/fs/btrfs/space-info.c ++++ b/fs/btrfs/space-info.c +@@ -493,6 +493,11 @@ static void shrink_delalloc(struct btrfs + long time_left; + int loops; + ++ delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes); ++ ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes); ++ if (delalloc_bytes == 0 && ordered_bytes == 0) ++ return; ++ + /* Calc the number of the pages we need flush for space reservation */ + if (to_reclaim == U64_MAX) { + items = U64_MAX; +@@ -500,22 +505,21 @@ static void shrink_delalloc(struct btrfs + /* + * to_reclaim is set to however much metadata we need to + * reclaim, but reclaiming that much data doesn't really track +- * exactly, so increase the amount to reclaim by 2x in order to +- * make sure we're flushing enough delalloc to hopefully reclaim +- * some metadata reservations. ++ * exactly. What we really want to do is reclaim full inode's ++ * worth of reservations, however that's not available to us ++ * here. 
We will take a fraction of the delalloc bytes for our ++ * flushing loops and hope for the best. Delalloc will expand ++ * the amount we write to cover an entire dirty extent, which ++ * will reclaim the metadata reservation for that range. If ++ * it's not enough subsequent flush stages will be more ++ * aggressive. + */ ++ to_reclaim = max(to_reclaim, delalloc_bytes >> 3); + items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; +- to_reclaim = items * EXTENT_SIZE_PER_ITEM; + } + + trans = (struct btrfs_trans_handle *)current->journal_info; + +- delalloc_bytes = percpu_counter_sum_positive( +- &fs_info->delalloc_bytes); +- ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes); +- if (delalloc_bytes == 0 && ordered_bytes == 0) +- return; +- + /* + * If we are doing more ordered than delalloc we need to just wait on + * ordered extents, otherwise we'll waste time trying to flush delalloc +@@ -595,8 +599,11 @@ static void flush_space(struct btrfs_fs_ + break; + case FLUSH_DELALLOC: + case FLUSH_DELALLOC_WAIT: ++ case FLUSH_DELALLOC_FULL: ++ if (state == FLUSH_DELALLOC_FULL) ++ num_bytes = U64_MAX; + shrink_delalloc(fs_info, space_info, num_bytes, +- state == FLUSH_DELALLOC_WAIT, for_preempt); ++ state != FLUSH_DELALLOC, for_preempt); + break; + case FLUSH_DELAYED_REFS_NR: + case FLUSH_DELAYED_REFS: +@@ -905,6 +912,14 @@ static void btrfs_async_reclaim_metadata + } + + /* ++ * We do not want to empty the system of delalloc unless we're ++ * under heavy pressure, so allow one trip through the flushing ++ * logic before we start doing a FLUSH_DELALLOC_FULL. ++ */ ++ if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles) ++ flush_state++; ++ ++ /* + * We don't want to force a chunk allocation until we've tried + * pretty hard to reclaim space. 
Think of the case where we + * freed up a bunch of space and so have a lot of pinned space +@@ -1067,7 +1082,7 @@ static void btrfs_preempt_reclaim_metada + * so if we now have space to allocate do the force chunk allocation. + */ + static const enum btrfs_flush_state data_flush_states[] = { +- FLUSH_DELALLOC_WAIT, ++ FLUSH_DELALLOC_FULL, + RUN_DELAYED_IPUTS, + COMMIT_TRANS, + ALLOC_CHUNK_FORCE, +@@ -1156,6 +1171,7 @@ static const enum btrfs_flush_state evic + FLUSH_DELAYED_REFS, + FLUSH_DELALLOC, + FLUSH_DELALLOC_WAIT, ++ FLUSH_DELALLOC_FULL, + ALLOC_CHUNK, + COMMIT_TRANS, + }; +--- a/include/trace/events/btrfs.h ++++ b/include/trace/events/btrfs.h +@@ -94,6 +94,7 @@ struct btrfs_space_info; + EM( FLUSH_DELAYED_ITEMS, "FLUSH_DELAYED_ITEMS") \ + EM( FLUSH_DELALLOC, "FLUSH_DELALLOC") \ + EM( FLUSH_DELALLOC_WAIT, "FLUSH_DELALLOC_WAIT") \ ++ EM( FLUSH_DELALLOC_FULL, "FLUSH_DELALLOC_FULL") \ + EM( FLUSH_DELAYED_REFS_NR, "FLUSH_DELAYED_REFS_NR") \ + EM( FLUSH_DELAYED_REFS, "FLUSH_ELAYED_REFS") \ + EM( ALLOC_CHUNK, "ALLOC_CHUNK") \ diff --git a/queue-5.14/btrfs-wait-on-async-extents-when-flushing-delalloc.patch b/queue-5.14/btrfs-wait-on-async-extents-when-flushing-delalloc.patch new file mode 100644 index 00000000000..5facef5ea80 --- /dev/null +++ b/queue-5.14/btrfs-wait-on-async-extents-when-flushing-delalloc.patch @@ -0,0 +1,108 @@ +From e16460707e94c3d4c1b5418cb68b28b8efa903b2 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 14 Jul 2021 14:47:21 -0400 +Subject: btrfs: wait on async extents when flushing delalloc + +From: Josef Bacik + +commit e16460707e94c3d4c1b5418cb68b28b8efa903b2 upstream. + +I've been debugging an early ENOSPC problem in production and finally +root caused it to this problem. When we switched to the per-inode in +38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in +shrink_delalloc") I pulled out the async extent handling, because we +were doing the correct thing by calling filemap_flush() if we had async +extents set. 
This would properly wait on any async extents by locking +the page in the second flush, thus making sure our ordered extents were +properly set up. + +However when I switched us back to page based flushing, I used +sync_inode(), which allows us to pass in our own wbc. The problem here +is that sync_inode() is smarter than the filemap_* helpers, it tries to +avoid calling writepages at all. This means that our second call could +skip calling do_writepages altogether, and thus not wait on the pagelock +for the async helpers. This means we could come back before any ordered +extents were created and then simply continue on in our flushing +mechanisms and ENOSPC out when we have plenty of space to use. + +Fix this by putting back the async pages logic in shrink_delalloc. This +allows us to bulk write out everything that we need to, and then we can +wait in one place for the async helpers to catch up, and then wait on +any ordered extents that are created. + +Fixes: e076ab2a2ca7 ("btrfs: shrink delalloc pages instead of full inodes") +CC: stable@vger.kernel.org # 5.10+ +Reviewed-by: Nikolay Borisov +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/inode.c | 4 ---- + fs/btrfs/space-info.c | 40 ++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 40 insertions(+), 4 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -9809,10 +9809,6 @@ static int start_delalloc_inodes(struct + &work->work); + } else { + ret = sync_inode(inode, wbc); +- if (!ret && +- test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, +- &BTRFS_I(inode)->runtime_flags)) +- ret = sync_inode(inode, wbc); + btrfs_add_delayed_iput(inode); + if (ret || wbc->nr_to_write <= 0) + goto out; +--- a/fs/btrfs/space-info.c ++++ b/fs/btrfs/space-info.c +@@ -532,9 +532,49 @@ static void shrink_delalloc(struct btrfs + while ((delalloc_bytes || ordered_bytes) && loops < 3) { + u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; + long nr_pages = 
min_t(u64, temp, LONG_MAX); ++ int async_pages; + + btrfs_start_delalloc_roots(fs_info, nr_pages, true); + ++ /* ++ * We need to make sure any outstanding async pages are now ++ * processed before we continue. This is because things like ++ * sync_inode() try to be smart and skip writing if the inode is ++ * marked clean. We don't use filemap_fwrite for flushing ++ * because we want to control how many pages we write out at a ++ * time, thus this is the only safe way to make sure we've ++ * waited for outstanding compressed workers to have started ++ * their jobs and thus have ordered extents set up properly. ++ * ++ * This exists because we do not want to wait for each ++ * individual inode to finish its async work, we simply want to ++ * start the IO on everybody, and then come back here and wait ++ * for all of the async work to catch up. Once we're done with ++ * that we know we'll have ordered extents for everything and we ++ * can decide if we wait for that or not. ++ * ++ * If we choose to replace this in the future, make absolutely ++ * sure that the proper waiting is being done in the async case, ++ * as there have been bugs in that area before. ++ */ ++ async_pages = atomic_read(&fs_info->async_delalloc_pages); ++ if (!async_pages) ++ goto skip_async; ++ ++ /* ++ * We don't want to wait forever, if we wrote less pages in this ++ * loop than we have outstanding, only wait for that number of ++ * pages, otherwise we can wait for all async pages to finish ++ * before continuing. 
++		 */
++		if (async_pages > nr_pages)
++			async_pages -= nr_pages;
++		else
++			async_pages = 0;
++		wait_event(fs_info->async_submit_wait,
++			   atomic_read(&fs_info->async_delalloc_pages) <=
++			   async_pages);
++skip_async:
+ 		loops++;
+ 		if (wait_ordered && !trans) {
+ 			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
diff --git a/queue-5.14/btrfs-wake-up-async_delalloc_pages-waiters-after-submit.patch b/queue-5.14/btrfs-wake-up-async_delalloc_pages-waiters-after-submit.patch
new file mode 100644
index 00000000000..367c67b9751
--- /dev/null
+++ b/queue-5.14/btrfs-wake-up-async_delalloc_pages-waiters-after-submit.patch
@@ -0,0 +1,54 @@
+From ac98141d140444fe93e26471d3074c603b70e2ca Mon Sep 17 00:00:00 2001
+From: Josef Bacik
+Date: Wed, 14 Jul 2021 14:47:17 -0400
+Subject: btrfs: wake up async_delalloc_pages waiters after submit
+
+From: Josef Bacik
+
+commit ac98141d140444fe93e26471d3074c603b70e2ca upstream.
+
+We use the async_delalloc_pages mechanism to make sure that we've
+completed our async work before trying to continue our delalloc
+flushing. The reason for this is we need to see any ordered extents
+that were created by our delalloc flushing. However we're waking up
+before we do the submit work, which is before we create the ordered
+extents. This is a pretty wide race window where we could potentially
+think there are no ordered extents and thus exit shrink_delalloc
+prematurely. Fix this by waking us up after we've done the work to
+create ordered extents.
+
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-by: Nikolay Borisov
+Signed-off-by: Josef Bacik
+Signed-off-by: David Sterba
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/btrfs/inode.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -1290,11 +1290,6 @@ static noinline void async_cow_submit(st
+ 	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
+ 		PAGE_SHIFT;
+ 
+-	/* atomic_sub_return implies a barrier */
+-	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
+-	    5 * SZ_1M)
+-		cond_wake_up_nomb(&fs_info->async_submit_wait);
+-
+ 	/*
+ 	 * ->inode could be NULL if async_chunk_start has failed to compress,
+ 	 * in which case we don't have anything to submit, yet we need to
+@@ -1303,6 +1298,11 @@ static noinline void async_cow_submit(st
+ 	 */
+ 	if (async_chunk->inode)
+ 		submit_compressed_extents(async_chunk);
++
++	/* atomic_sub_return implies a barrier */
++	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
++	    5 * SZ_1M)
++		cond_wake_up_nomb(&fs_info->async_submit_wait);
+ }
+ 
+ static noinline void async_cow_free(struct btrfs_work *work)
diff --git a/queue-5.14/btrfs-zoned-fix-block-group-alloc_offset-calculation.patch b/queue-5.14/btrfs-zoned-fix-block-group-alloc_offset-calculation.patch
new file mode 100644
index 00000000000..d5739134f47
--- /dev/null
+++ b/queue-5.14/btrfs-zoned-fix-block-group-alloc_offset-calculation.patch
@@ -0,0 +1,38 @@
+From 0ae79c6fe70d5c5c645733b7ed39d5e6021d8c9a Mon Sep 17 00:00:00 2001
+From: Naohiro Aota
+Date: Mon, 9 Aug 2021 13:13:44 +0900
+Subject: btrfs: zoned: fix block group alloc_offset calculation
+
+From: Naohiro Aota
+
+commit 0ae79c6fe70d5c5c645733b7ed39d5e6021d8c9a upstream.
+
+alloc_offset is offset from the start of a block group and @offset is
+actually an address in logical space. Thus, we need to consider
+block_group->start when calculating them.
+
+Fixes: 011b41bffa3d ("btrfs: zoned: advance allocation pointer after tree log node")
+CC: stable@vger.kernel.org # 5.12+
+Signed-off-by: Naohiro Aota
+Signed-off-by: David Sterba
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/btrfs/free-space-cache.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/free-space-cache.c
++++ b/fs/btrfs/free-space-cache.c
+@@ -2652,8 +2652,11 @@ int btrfs_remove_free_space(struct btrfs
+ 	 * btrfs_pin_extent_for_log_replay() when replaying the log.
+ 	 * Advance the pointer not to overwrite the tree-log nodes.
+ 	 */
+-	if (block_group->alloc_offset < offset + bytes)
+-		block_group->alloc_offset = offset + bytes;
++	if (block_group->start + block_group->alloc_offset <
++	    offset + bytes) {
++		block_group->alloc_offset =
++			offset + bytes - block_group->start;
++	}
+ 	return 0;
+ }
+ 
diff --git a/queue-5.14/btrfs-zoned-fix-double-counting-of-split-ordered-extent.patch b/queue-5.14/btrfs-zoned-fix-double-counting-of-split-ordered-extent.patch
new file mode 100644
index 00000000000..1969da52268
--- /dev/null
+++ b/queue-5.14/btrfs-zoned-fix-double-counting-of-split-ordered-extent.patch
@@ -0,0 +1,54 @@
+From f79645df806565a03abb2847a1d20e6930b25e7e Mon Sep 17 00:00:00 2001
+From: Naohiro Aota
+Date: Tue, 7 Sep 2021 00:04:28 +0900
+Subject: btrfs: zoned: fix double counting of split ordered extent
+
+From: Naohiro Aota
+
+commit f79645df806565a03abb2847a1d20e6930b25e7e upstream.
+
+btrfs_add_ordered_extent_*() add num_bytes to fs_info->ordered_bytes.
+Then, splitting an ordered extent will call btrfs_add_ordered_extent_*()
+again for split extents, leading to double counting of the region of
+a split extent. These leaked bytes are finally reported at unmount time
+as follows:
+
+  BTRFS info (device dm-1): at unmount dio bytes count 364544
+
+Fix the double counting by subtracting the split extent's size from
+fs_info->ordered_bytes.
+
+Fixes: d22002fd37bd ("btrfs: zoned: split ordered extent when bio is sent")
+CC: stable@vger.kernel.org # 5.12+
+Reviewed-by: Johannes Thumshirn
+Signed-off-by: Naohiro Aota
+Signed-off-by: David Sterba
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/btrfs/ordered-data.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/btrfs/ordered-data.c
++++ b/fs/btrfs/ordered-data.c
+@@ -1052,6 +1052,7 @@ static int clone_ordered_extent(struct b
+ 				 u64 len)
+ {
+ 	struct inode *inode = ordered->inode;
++	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ 	u64 file_offset = ordered->file_offset + pos;
+ 	u64 disk_bytenr = ordered->disk_bytenr + pos;
+ 	u64 num_bytes = len;
+@@ -1069,6 +1070,13 @@ static int clone_ordered_extent(struct b
+ 	else
+ 		type = __ffs(flags_masked);
+ 
++	/*
++	 * The splitting extent is already counted and will be added again
++	 * in btrfs_add_ordered_extent_*(). Subtract num_bytes to avoid
++	 * double counting.
++	 */
++	percpu_counter_add_batch(&fs_info->ordered_bytes, -num_bytes,
++				 fs_info->delalloc_batch);
+ 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) {
+ 		WARN_ON_ONCE(1);
+ 		ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode),
diff --git a/queue-5.14/btrfs-zoned-suppress-reclaim-error-message-on-eagain.patch b/queue-5.14/btrfs-zoned-suppress-reclaim-error-message-on-eagain.patch
new file mode 100644
index 00000000000..013c7feeac1
--- /dev/null
+++ b/queue-5.14/btrfs-zoned-suppress-reclaim-error-message-on-eagain.patch
@@ -0,0 +1,42 @@
+From ba86dd9fe60e5853fbff96f2658212908b83f271 Mon Sep 17 00:00:00 2001
+From: Naohiro Aota
+Date: Mon, 9 Aug 2021 13:32:30 +0900
+Subject: btrfs: zoned: suppress reclaim error message on EAGAIN
+
+From: Naohiro Aota
+
+commit ba86dd9fe60e5853fbff96f2658212908b83f271 upstream.
+
+btrfs_relocate_chunk() can fail with -EAGAIN when e.g. send operations are
+running. The error message can cause fstests btrfs/187 to fail, and it is
+unnecessary because we add the block group back to the reclaim list
+anyway.
+
+btrfs_reclaim_bgs_work()
+`-> btrfs_relocate_chunk()
+    `-> btrfs_relocate_block_group()
+        `-> reloc_chunk_start()
+            `-> if (fs_info->send_in_progress)
+                `-> return -EAGAIN
+
+CC: stable@vger.kernel.org # 5.13+
+Fixes: 18bb8bbf13c1 ("btrfs: zoned: automatically reclaim zones")
+Reviewed-by: Johannes Thumshirn
+Signed-off-by: Naohiro Aota
+Reviewed-by: David Sterba
+Signed-off-by: David Sterba
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/btrfs/block-group.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -1561,7 +1561,7 @@ void btrfs_reclaim_bgs_work(struct work_
+ 				 div64_u64(zone_unusable * 100, bg->length));
+ 		trace_btrfs_reclaim_block_group(bg);
+ 		ret = btrfs_relocate_chunk(fs_info, bg->start);
+-		if (ret)
++		if (ret && ret != -EAGAIN)
+ 			btrfs_err(fs_info, "error relocating chunk %llu",
+ 				  bg->start);
+ 
diff --git a/queue-5.14/ceph-fix-dereference-of-null-pointer-cf.patch b/queue-5.14/ceph-fix-dereference-of-null-pointer-cf.patch
new file mode 100644
index 00000000000..6ab41c6b104
--- /dev/null
+++ b/queue-5.14/ceph-fix-dereference-of-null-pointer-cf.patch
@@ -0,0 +1,36 @@
+From 05a444d3f90a3c3e6362e88a1bf13e1a60f8cace Mon Sep 17 00:00:00 2001
+From: Colin Ian King
+Date: Sun, 29 Aug 2021 19:18:24 +0100
+Subject: ceph: fix dereference of null pointer cf
+
+From: Colin Ian King
+
+commit 05a444d3f90a3c3e6362e88a1bf13e1a60f8cace upstream.
+
+Currently, in the case where kmem_cache_alloc fails, the null pointer
+cf is dereferenced when assigning cf->is_capsnap = false. Fix this
+by adding a null pointer check and return path.
+
+Cc: stable@vger.kernel.org
+Addresses-Coverity: ("Dereference null return")
+Fixes: b2f9fa1f3bd8 ("ceph: correctly handle releasing an embedded cap flush")
+Signed-off-by: Colin Ian King
+Reviewed-by: Ilya Dryomov
+Signed-off-by: Ilya Dryomov
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/ceph/caps.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/fs/ceph/caps.c
++++ b/fs/ceph/caps.c
+@@ -1746,6 +1746,9 @@ struct ceph_cap_flush *ceph_alloc_cap_fl
+ 	struct ceph_cap_flush *cf;
+ 
+ 	cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
++	if (!cf)
++		return NULL;
++
+ 	cf->is_capsnap = false;
+ 	return cf;
+ }
diff --git a/queue-5.14/input-elan_i2c-reduce-the-resume-time-for-controller-in-whitebox.patch b/queue-5.14/input-elan_i2c-reduce-the-resume-time-for-controller-in-whitebox.patch
new file mode 100644
index 00000000000..f6402ae2c4e
--- /dev/null
+++ b/queue-5.14/input-elan_i2c-reduce-the-resume-time-for-controller-in-whitebox.patch
@@ -0,0 +1,46 @@
+From d198b8273e3006818ea287a93eb4d8fd2543e512 Mon Sep 17 00:00:00 2001
+From: "jingle.wu"
+Date: Mon, 6 Sep 2021 21:52:05 -0700
+Subject: Input: elan_i2c - reduce the resume time for controller in Whitebox
+
+From: jingle.wu
+
+commit d198b8273e3006818ea287a93eb4d8fd2543e512 upstream.
+
+Similar to controllers found in Voxel, Delbin, Magpie and Bobba, the one
+found in Whitebox does not need to be reset after issuing power-on
+command, and skipping reset saves resume time.
+
+Signed-off-by: Jingle Wu
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210907012924.11391-1-jingle.wu@emc.com.tw
+Signed-off-by: Dmitry Torokhov
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/input/mouse/elan_i2c.h | 3 ++-
+ drivers/input/mouse/elan_i2c_core.c | 1 +
+ 2 files changed, 3 insertions(+), 1 deletion(-)
+
+--- a/drivers/input/mouse/elan_i2c.h
++++ b/drivers/input/mouse/elan_i2c.h
+@@ -55,8 +55,9 @@
+ #define ETP_FW_PAGE_SIZE_512	512
+ #define ETP_FW_SIGNATURE_SIZE	6
+ 
+-#define ETP_PRODUCT_ID_DELBIN	0x00C2
++#define ETP_PRODUCT_ID_WHITEBOX	0x00B8
+ #define ETP_PRODUCT_ID_VOXEL	0x00BF
++#define ETP_PRODUCT_ID_DELBIN	0x00C2
+ #define ETP_PRODUCT_ID_MAGPIE	0x0120
+ #define ETP_PRODUCT_ID_BOBBA	0x0121
+ 
+--- a/drivers/input/mouse/elan_i2c_core.c
++++ b/drivers/input/mouse/elan_i2c_core.c
+@@ -105,6 +105,7 @@ static u32 elan_i2c_lookup_quirks(u16 ic
+ 		u32 quirks;
+ 	} elan_i2c_quirks[] = {
+ 		{ 0x0D, ETP_PRODUCT_ID_DELBIN, ETP_QUIRK_QUICK_WAKEUP },
++		{ 0x0D, ETP_PRODUCT_ID_WHITEBOX, ETP_QUIRK_QUICK_WAKEUP },
+ 		{ 0x10, ETP_PRODUCT_ID_VOXEL, ETP_QUIRK_QUICK_WAKEUP },
+ 		{ 0x14, ETP_PRODUCT_ID_MAGPIE, ETP_QUIRK_QUICK_WAKEUP },
+ 		{ 0x14, ETP_PRODUCT_ID_BOBBA, ETP_QUIRK_QUICK_WAKEUP },
diff --git a/queue-5.14/powerpc-perf-hv-gpci-fix-counter-value-parsing.patch b/queue-5.14/powerpc-perf-hv-gpci-fix-counter-value-parsing.patch
new file mode 100644
index 00000000000..d6cf2ab385b
--- /dev/null
+++ b/queue-5.14/powerpc-perf-hv-gpci-fix-counter-value-parsing.patch
@@ -0,0 +1,67 @@
+From f9addd85fbfacf0d155e83dbee8696d6df5ed0c7 Mon Sep 17 00:00:00 2001
+From: Kajol Jain
+Date: Fri, 13 Aug 2021 13:51:58 +0530
+Subject: powerpc/perf/hv-gpci: Fix counter value parsing
+
+From: Kajol Jain
+
+commit f9addd85fbfacf0d155e83dbee8696d6df5ed0c7 upstream.
+
+H_GetPerformanceCounterInfo (0xF080) hcall returns the counter data in
+the result buffer. The result buffer has a specific format defined in the PAPR
+specification. 
One of the fields is counter offset and width of the
+counter data returned.
+
+Counter data are returned in an unsigned char array in big endian byte
+order. To get the final counter data, the values must be left shifted
+a byte at a time. But commit 220a0c609ad17 ("powerpc/perf: Add support for
+the hv gpci (get performance counter info) interface") made the shifting
+bitwise and also assumed little endian order. Because of that, hcall
+counter values are reported incorrectly.
+
+In particular this can lead to counters going backwards which messes up the
+counter prev vs now calculation and leads to huge counter value
+reporting:
+
+  #: perf stat -e hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/
+     -C 0 -I 1000
+        time             counts unit events
+ 1.000078854 18,446,744,073,709,535,232 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/
+ 2.000213293 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/
+ 3.000320107 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/
+ 4.000428392 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/
+ 5.000537864 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/
+ 6.000649087 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/
+ 7.000760312 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/
+ 8.000865218 16,448 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/
+ 9.000978985 18,446,744,073,709,535,232 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/
+ 10.001088891 16,384 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/
+ 11.001201435 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/
+ 12.001307937 18,446,744,073,709,535,232 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/
+
+Fix the shifting logic to correctly match the format, i.e. read bytes in
+big endian order. 
+
+Fixes: e4f226b1580b ("powerpc/perf/hv-gpci: Increase request buffer size")
+Cc: stable@vger.kernel.org # v4.6+
+Reported-by: Nageswara R Sastry
+Signed-off-by: Kajol Jain
+Tested-by: Nageswara R Sastry
+Signed-off-by: Michael Ellerman
+Link: https://lore.kernel.org/r/20210813082158.429023-1-kjain@linux.ibm.com
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/powerpc/perf/hv-gpci.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/powerpc/perf/hv-gpci.c
++++ b/arch/powerpc/perf/hv-gpci.c
+@@ -175,7 +175,7 @@ static unsigned long single_gpci_request
+ 	 */
+ 	count = 0;
+ 	for (i = offset; i < offset + length; i++)
+-		count |= arg->bytes[i] << (i - offset);
++		count |= (u64)(arg->bytes[i]) << ((length - 1 - (i - offset)) * 8);
+ 
+ 	*value = count;
+ out:
diff --git a/queue-5.14/series b/queue-5.14/series
index bee6ed1757a..72568d1a6df 100644
--- a/queue-5.14/series
+++ b/queue-5.14/series
@@ -5,3 +5,20 @@ io_uring-add-splice_fd_in-checks.patch
 io_uring-fix-io_try_cancel_userdata-race-for-iowq.patch
 io-wq-fix-wakeup-race-when-adding-new-work.patch
 io-wq-fix-race-between-adding-work-and-activating-a-free-worker.patch
+btrfs-use-delalloc_bytes-to-determine-flush-amount-for-shrink_delalloc.patch
+btrfs-wake-up-async_delalloc_pages-waiters-after-submit.patch
+btrfs-wait-on-async-extents-when-flushing-delalloc.patch
+btrfs-reduce-the-preemptive-flushing-threshold-to-90.patch
+btrfs-do-not-do-preemptive-flushing-if-the-majority-is-global-rsv.patch
+btrfs-zoned-fix-block-group-alloc_offset-calculation.patch
+btrfs-zoned-suppress-reclaim-error-message-on-eagain.patch
+btrfs-fix-upper-limit-for-max_inline-for-page-size-64k.patch
+btrfs-reset-replace-target-device-to-allocation-state-on-close.patch
+btrfs-zoned-fix-double-counting-of-split-ordered-extent.patch
+blk-zoned-allow-zone-management-send-operations-without-cap_sys_admin.patch
+blk-zoned-allow-blkreportzone-without-cap_sys_admin.patch
+powerpc-perf-hv-gpci-fix-counter-value-parsing.patch 
+xen-fix-setting-of-max_pfn-in-shared_info.patch
+9p-xen-fix-end-of-loop-tests-for-list_for_each_entry.patch
+ceph-fix-dereference-of-null-pointer-cf.patch
+input-elan_i2c-reduce-the-resume-time-for-controller-in-whitebox.patch
diff --git a/queue-5.14/xen-fix-setting-of-max_pfn-in-shared_info.patch b/queue-5.14/xen-fix-setting-of-max_pfn-in-shared_info.patch
new file mode 100644
index 00000000000..cea4a72de87
--- /dev/null
+++ b/queue-5.14/xen-fix-setting-of-max_pfn-in-shared_info.patch
@@ -0,0 +1,51 @@
+From 4b511d5bfa74b1926daefd1694205c7f1bcf677f Mon Sep 17 00:00:00 2001
+From: Juergen Gross
+Date: Fri, 30 Jul 2021 11:26:21 +0200
+Subject: xen: fix setting of max_pfn in shared_info
+
+From: Juergen Gross
+
+commit 4b511d5bfa74b1926daefd1694205c7f1bcf677f upstream.
+
+Xen PV guests are specifying the highest used PFN via the max_pfn
+field in shared_info. This value is used by the Xen tools when saving
+or migrating the guest.
+
+Unfortunately this field is misnamed, as in reality it is specifying
+the number of pages (including any memory holes) of the guest, so it
+is the highest used PFN + 1. Renaming isn't possible, as this is a
+public Xen hypervisor interface which needs to be kept stable.
+
+The kernel will set the value correctly initially at boot time, but
+when adding more pages (e.g. due to memory hotplug or ballooning) a
+real PFN number is stored in max_pfn. This is done when expanding the
+p2m array, and the PFN stored there is even possibly wrong, as it
+should be the last possible PFN of the just added P2M frame, and not
+one which led to the P2M expansion.
+
+Fix that by setting shared_info->max_pfn to the last possible PFN + 1. 
+
+Fixes: 98dd166ea3a3c3 ("x86/xen/p2m: hint at the last populated P2M entry")
+Cc: stable@vger.kernel.org
+Signed-off-by: Juergen Gross
+Reviewed-by: Jan Beulich
+Link: https://lore.kernel.org/r/20210730092622.9973-2-jgross@suse.com
+Signed-off-by: Juergen Gross
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/x86/xen/p2m.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/xen/p2m.c
++++ b/arch/x86/xen/p2m.c
+@@ -618,8 +618,8 @@ int xen_alloc_p2m_entry(unsigned long pf
+ 	}
+ 
+ 	/* Expanded the p2m? */
+-	if (pfn > xen_p2m_last_pfn) {
+-		xen_p2m_last_pfn = pfn;
++	if (pfn >= xen_p2m_last_pfn) {
++		xen_p2m_last_pfn = ALIGN(pfn + 1, P2M_PER_PAGE);
+ 		HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn;
+ 	}
+ 