From 435f9920f94d8bd937cbba7dbb61d3f68925b1e6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 19 Jul 2021 14:16:28 +0200 Subject: [PATCH] 5.13-stable patches added patches: arm64-avoid-premature-usercopy-failure.patch btrfs-don-t-block-if-we-can-t-acquire-the-reclaim-lock.patch btrfs-fix-deadlock-with-concurrent-chunk-allocations-involving-system-chunks.patch btrfs-properly-split-extent_map-for-req_op_zone_append.patch btrfs-rework-chunk-allocation-to-avoid-exhaustion-of-the-system-chunk-array.patch btrfs-zoned-fix-types-for-u64-division-in-btrfs_reclaim_bgs_work.patch btrfs-zoned-fix-wrong-mutex-unlock-on-failure-to-allocate-log-root-tree.patch cgroup-verify-that-source-is-a-string.patch drm-amdgpu-add-another-renoir-did.patch drm-i915-gt-fix-edeadlk-handling-regression.patch drm-i915-gtt-drop-the-page-table-optimisation.patch edac-igen6-fix-core-dependency-again.patch fbmem-do-not-delete-the-mode-that-is-still-in-use.patch io_uring-use-right-task-for-exiting-checks.patch iommu-vt-d-fix-clearing-real-dma-device-s-scalable-mode-context-entries.patch iommu-vt-d-global-devtlb-flush-when-present-context-entry-changed.patch mm-hugetlb-fix-refs-calculation-from-unaligned-vaddr.patch scsi-core-fix-bad-pointer-dereference-when-ehandler-kthread-is-invalid.patch scsi-zfcp-report-port-fc_security-as-unknown-early-during-remote-cable-pull.patch tracing-do-not-reference-char-as-a-string-in-histograms.patch --- ...m64-avoid-premature-usercopy-failure.patch | 199 +++ ...if-we-can-t-acquire-the-reclaim-lock.patch | 44 + ...-allocations-involving-system-chunks.patch | 212 +++ ...it-extent_map-for-req_op_zone_append.patch | 248 ++++ ...exhaustion-of-the-system-chunk-array.patch | 1280 +++++++++++++++++ ...4-division-in-btrfs_reclaim_bgs_work.patch | 35 + ...on-failure-to-allocate-log-root-tree.patch | 43 + ...group-verify-that-source-is-a-string.patch | 64 + .../drm-amdgpu-add-another-renoir-did.patch | 31 + ...5-gt-fix-edeadlk-handling-regression.patch | 60 + ...gtt-drop-the-page-table-optimisation.patch | 55 + ...edac-igen6-fix-core-dependency-again.patch | 38 + ...delete-the-mode-that-is-still-in-use.patch | 85 ++ ...ng-use-right-task-for-exiting-checks.patch | 35 + ...vice-s-scalable-mode-context-entries.patch | 47 + ...h-when-present-context-entry-changed.patch | 107 ++ ...efs-calculation-from-unaligned-vaddr.patch | 45 + ...nce-when-ehandler-kthread-is-invalid.patch | 96 ++ ...known-early-during-remote-cable-pull.patch | 38 + queue-5.13/series | 20 + ...rence-char-as-a-string-in-histograms.patch | 105 ++ 21 files changed, 2887 insertions(+) create mode 100644 queue-5.13/arm64-avoid-premature-usercopy-failure.patch create mode 100644 queue-5.13/btrfs-don-t-block-if-we-can-t-acquire-the-reclaim-lock.patch create mode 100644 queue-5.13/btrfs-fix-deadlock-with-concurrent-chunk-allocations-involving-system-chunks.patch create mode 100644 queue-5.13/btrfs-properly-split-extent_map-for-req_op_zone_append.patch create mode 100644 queue-5.13/btrfs-rework-chunk-allocation-to-avoid-exhaustion-of-the-system-chunk-array.patch create mode 100644 queue-5.13/btrfs-zoned-fix-types-for-u64-division-in-btrfs_reclaim_bgs_work.patch create mode 100644 queue-5.13/btrfs-zoned-fix-wrong-mutex-unlock-on-failure-to-allocate-log-root-tree.patch create mode 100644 queue-5.13/cgroup-verify-that-source-is-a-string.patch create mode 100644 queue-5.13/drm-amdgpu-add-another-renoir-did.patch create mode 100644 queue-5.13/drm-i915-gt-fix-edeadlk-handling-regression.patch create mode 100644 
queue-5.13/drm-i915-gtt-drop-the-page-table-optimisation.patch create mode 100644 queue-5.13/edac-igen6-fix-core-dependency-again.patch create mode 100644 queue-5.13/fbmem-do-not-delete-the-mode-that-is-still-in-use.patch create mode 100644 queue-5.13/io_uring-use-right-task-for-exiting-checks.patch create mode 100644 queue-5.13/iommu-vt-d-fix-clearing-real-dma-device-s-scalable-mode-context-entries.patch create mode 100644 queue-5.13/iommu-vt-d-global-devtlb-flush-when-present-context-entry-changed.patch create mode 100644 queue-5.13/mm-hugetlb-fix-refs-calculation-from-unaligned-vaddr.patch create mode 100644 queue-5.13/scsi-core-fix-bad-pointer-dereference-when-ehandler-kthread-is-invalid.patch create mode 100644 queue-5.13/scsi-zfcp-report-port-fc_security-as-unknown-early-during-remote-cable-pull.patch create mode 100644 queue-5.13/tracing-do-not-reference-char-as-a-string-in-histograms.patch diff --git a/queue-5.13/arm64-avoid-premature-usercopy-failure.patch b/queue-5.13/arm64-avoid-premature-usercopy-failure.patch new file mode 100644 index 00000000000..5e7f1cf46a0 --- /dev/null +++ b/queue-5.13/arm64-avoid-premature-usercopy-failure.patch @@ -0,0 +1,199 @@ +From 295cf156231ca3f9e3a66bde7fab5e09c41835e0 Mon Sep 17 00:00:00 2001 +From: Robin Murphy +Date: Mon, 12 Jul 2021 15:27:46 +0100 +Subject: arm64: Avoid premature usercopy failure + +From: Robin Murphy + +commit 295cf156231ca3f9e3a66bde7fab5e09c41835e0 upstream. + +Al reminds us that the usercopy API must only return complete failure +if absolutely nothing could be copied. Currently, if userspace does +something silly like giving us an unaligned pointer to Device memory, +or a size which overruns MTE tag bounds, we may fail to honour that +requirement when faulting on a multi-byte access even though a smaller +access could have succeeded. + +Add a mitigation to the fixup routines to fall back to a single-byte +copy if we faulted on a larger access before anything has been written +to the destination, to guarantee making *some* forward progress. We +needn't be too concerned about the overall performance since this should +only occur when callers are doing something a bit dodgy in the first +place. Particularly broken userspace might still be able to trick +generic_perform_write() into an infinite loop by targeting write() at +an mmap() of some read-only device register where the fault-in load +succeeds but any store synchronously aborts such that copy_to_user() is +genuinely unable to make progress, but, well, don't do that... 
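[To make the usercopy contract concrete, here is a minimal user-space sketch. It is illustrative only, not kernel code: copy_with_progress(), access_fallible() and ok_bytes are invented for the demo, standing in for the assembly fast path, a faulting access and the fault boundary. It models the rule Al points out above: total failure may only be reported if not even the first byte could be copied.]

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    /* Demo-only fault model: any access touching bytes beyond ok_bytes of
       the source "faults" (standing in for a bad user address or an MTE
       tag bound). */
    static size_t ok_bytes;

    static int access_fallible(char *dst, const char *src, size_t off, size_t len)
    {
            if (off + len > ok_bytes)
                    return -1;              /* simulated fault */
            memcpy(dst + off, src + off, len);
            return 0;
    }

    /* Returns the number of bytes NOT copied, as copy_from_user() does. */
    static size_t copy_with_progress(char *dst, const char *src, size_t n)
    {
            size_t done = 0;

            /* Fast path: 8 bytes at a time; a fault aborts the whole access. */
            while (done < n) {
                    size_t step = (n - done < 8) ? n - done : 8;

                    if (access_fallible(dst, src, done, step))
                            break;
                    done += step;
            }
            /* The fixup this patch adds: if nothing was copied yet, try
               harder with a single byte, so callers looping on partial
               progress (e.g. generic_perform_write()) are never told that
               zero bytes were copied when the first byte was accessible. */
            if (done == 0 && n != 0 && access_fallible(dst, src, 0, 1) == 0)
                    done = 1;
            return n - done;
    }

    int main(void)
    {
            char src[16] = "usercopy demo!", dst[16] = { 0 };

            ok_bytes = 3;   /* only the first 3 source bytes are accessible */
            /* The 8-byte access faults, yet we still make progress: */
            printf("bytes not copied: %zu\n",
                   copy_with_progress(dst, src, sizeof(src)));
            return 0;
    }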
+ +CC: stable@vger.kernel.org +Reported-by: Chen Huang +Suggested-by: Al Viro +Reviewed-by: Catalin Marinas +Signed-off-by: Robin Murphy +Link: https://lore.kernel.org/r/dc03d5c675731a1f24a62417dba5429ad744234e.1626098433.git.robin.murphy@arm.com +Signed-off-by: Will Deacon +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/lib/copy_from_user.S | 13 ++++++++++--- + arch/arm64/lib/copy_in_user.S | 21 ++++++++++++++------- + arch/arm64/lib/copy_to_user.S | 14 +++++++++++--- + 3 files changed, 35 insertions(+), 13 deletions(-) + +--- a/arch/arm64/lib/copy_from_user.S ++++ b/arch/arm64/lib/copy_from_user.S +@@ -29,7 +29,7 @@ + .endm + + .macro ldrh1 reg, ptr, val +- user_ldst 9998f, ldtrh, \reg, \ptr, \val ++ user_ldst 9997f, ldtrh, \reg, \ptr, \val + .endm + + .macro strh1 reg, ptr, val +@@ -37,7 +37,7 @@ + .endm + + .macro ldr1 reg, ptr, val +- user_ldst 9998f, ldtr, \reg, \ptr, \val ++ user_ldst 9997f, ldtr, \reg, \ptr, \val + .endm + + .macro str1 reg, ptr, val +@@ -45,7 +45,7 @@ + .endm + + .macro ldp1 reg1, reg2, ptr, val +- user_ldp 9998f, \reg1, \reg2, \ptr, \val ++ user_ldp 9997f, \reg1, \reg2, \ptr, \val + .endm + + .macro stp1 reg1, reg2, ptr, val +@@ -53,8 +53,10 @@ + .endm + + end .req x5 ++srcin .req x15 + SYM_FUNC_START(__arch_copy_from_user) + add end, x0, x2 ++ mov srcin, x1 + #include "copy_template.S" + mov x0, #0 // Nothing to copy + ret +@@ -63,6 +65,11 @@ EXPORT_SYMBOL(__arch_copy_from_user) + + .section .fixup,"ax" + .align 2 ++9997: cmp dst, dstin ++ b.ne 9998f ++ // Before being absolutely sure we couldn't copy anything, try harder ++USER(9998f, ldtrb tmp1w, [srcin]) ++ strb tmp1w, [dst], #1 + 9998: sub x0, end, dst // bytes not copied + ret + .previous +--- a/arch/arm64/lib/copy_in_user.S ++++ b/arch/arm64/lib/copy_in_user.S +@@ -30,33 +30,34 @@ + .endm + + .macro ldrh1 reg, ptr, val +- user_ldst 9998f, ldtrh, \reg, \ptr, \val ++ user_ldst 9997f, ldtrh, \reg, \ptr, \val + .endm + + .macro strh1 reg, ptr, val +- user_ldst 9998f, sttrh, \reg, \ptr, \val ++ user_ldst 9997f, sttrh, \reg, \ptr, \val + .endm + + .macro ldr1 reg, ptr, val +- user_ldst 9998f, ldtr, \reg, \ptr, \val ++ user_ldst 9997f, ldtr, \reg, \ptr, \val + .endm + + .macro str1 reg, ptr, val +- user_ldst 9998f, sttr, \reg, \ptr, \val ++ user_ldst 9997f, sttr, \reg, \ptr, \val + .endm + + .macro ldp1 reg1, reg2, ptr, val +- user_ldp 9998f, \reg1, \reg2, \ptr, \val ++ user_ldp 9997f, \reg1, \reg2, \ptr, \val + .endm + + .macro stp1 reg1, reg2, ptr, val +- user_stp 9998f, \reg1, \reg2, \ptr, \val ++ user_stp 9997f, \reg1, \reg2, \ptr, \val + .endm + + end .req x5 +- ++srcin .req x15 + SYM_FUNC_START(__arch_copy_in_user) + add end, x0, x2 ++ mov srcin, x1 + #include "copy_template.S" + mov x0, #0 + ret +@@ -65,6 +66,12 @@ EXPORT_SYMBOL(__arch_copy_in_user) + + .section .fixup,"ax" + .align 2 ++9997: cmp dst, dstin ++ b.ne 9998f ++ // Before being absolutely sure we couldn't copy anything, try harder ++USER(9998f, ldtrb tmp1w, [srcin]) ++USER(9998f, sttrb tmp1w, [dst]) ++ add dst, dst, #1 + 9998: sub x0, end, dst // bytes not copied + ret + .previous +--- a/arch/arm64/lib/copy_to_user.S ++++ b/arch/arm64/lib/copy_to_user.S +@@ -32,7 +32,7 @@ + .endm + + .macro strh1 reg, ptr, val +- user_ldst 9998f, sttrh, \reg, \ptr, \val ++ user_ldst 9997f, sttrh, \reg, \ptr, \val + .endm + + .macro ldr1 reg, ptr, val +@@ -40,7 +40,7 @@ + .endm + + .macro str1 reg, ptr, val +- user_ldst 9998f, sttr, \reg, \ptr, \val ++ user_ldst 9997f, sttr, \reg, \ptr, \val + .endm + + .macro ldp1 reg1, reg2, ptr, val +@@ -48,12 +48,14 @@ 
+ .endm + + .macro stp1 reg1, reg2, ptr, val +- user_stp 9998f, \reg1, \reg2, \ptr, \val ++ user_stp 9997f, \reg1, \reg2, \ptr, \val + .endm + + end .req x5 ++srcin .req x15 + SYM_FUNC_START(__arch_copy_to_user) + add end, x0, x2 ++ mov srcin, x1 + #include "copy_template.S" + mov x0, #0 + ret +@@ -62,6 +64,12 @@ EXPORT_SYMBOL(__arch_copy_to_user) + + .section .fixup,"ax" + .align 2 ++9997: cmp dst, dstin ++ b.ne 9998f ++ // Before being absolutely sure we couldn't copy anything, try harder ++ ldrb tmp1w, [srcin] ++USER(9998f, sttrb tmp1w, [dst]) ++ add dst, dst, #1 + 9998: sub x0, end, dst // bytes not copied + ret + .previous diff --git a/queue-5.13/btrfs-don-t-block-if-we-can-t-acquire-the-reclaim-lock.patch b/queue-5.13/btrfs-don-t-block-if-we-can-t-acquire-the-reclaim-lock.patch new file mode 100644 index 00000000000..62f8bb25fda --- /dev/null +++ b/queue-5.13/btrfs-don-t-block-if-we-can-t-acquire-the-reclaim-lock.patch @@ -0,0 +1,44 @@ +From 9cc0b837e14ae913581ec1ea6e979a738f71b0fd Mon Sep 17 00:00:00 2001 +From: Johannes Thumshirn +Date: Tue, 6 Jul 2021 01:32:38 +0900 +Subject: btrfs: don't block if we can't acquire the reclaim lock + +From: Johannes Thumshirn + +commit 9cc0b837e14ae913581ec1ea6e979a738f71b0fd upstream. + +If we can't acquire the reclaim_bgs_lock on block group reclaim, we +block until it is free. This can potentially stall for a long time. + +While reclaim of block groups is necessary for a good user experience on +a zoned file system, there still is no need to block as it is best +effort only, just like when we're deleting unused block groups. + +CC: stable@vger.kernel.org # 5.13 +Signed-off-by: Johannes Thumshirn +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/block-group.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -1499,7 +1499,15 @@ void btrfs_reclaim_bgs_work(struct work_ + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) + return; + +- mutex_lock(&fs_info->reclaim_bgs_lock); ++ /* ++ * Long running balances can keep us blocked here for eternity, so ++ * simply skip reclaim if we're unable to get the mutex. ++ */ ++ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) { ++ btrfs_exclop_finish(fs_info); ++ return; ++ } ++ + spin_lock(&fs_info->unused_bgs_lock); + while (!list_empty(&fs_info->reclaim_bgs)) { + bg = list_first_entry(&fs_info->reclaim_bgs, diff --git a/queue-5.13/btrfs-fix-deadlock-with-concurrent-chunk-allocations-involving-system-chunks.patch b/queue-5.13/btrfs-fix-deadlock-with-concurrent-chunk-allocations-involving-system-chunks.patch new file mode 100644 index 00000000000..559851213e7 --- /dev/null +++ b/queue-5.13/btrfs-fix-deadlock-with-concurrent-chunk-allocations-involving-system-chunks.patch @@ -0,0 +1,212 @@ +From 1cb3db1cf383a3c7dbda1aa0ce748b0958759947 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 29 Jun 2021 14:43:05 +0100 +Subject: btrfs: fix deadlock with concurrent chunk allocations involving system chunks + +From: Filipe Manana + +commit 1cb3db1cf383a3c7dbda1aa0ce748b0958759947 upstream. + +When a task attempting to allocate a new chunk verifies that there is not +currently enough free space in the system space_info and there is another +task that allocated a new system chunk but it did not finish yet the +creation of the respective block group, it waits for that other task to +finish creating the block group. 
This is to avoid exhaustion of the system +chunk array in the superblock, which is limited, when we have a thundering +herd of tasks allocating new chunks. This problem was described and fixed +by commit eafa4fd0ad0607 ("btrfs: fix exhaustion of the system chunk array +due to concurrent allocations"). + +However there are two very similar scenarios where this can lead to a +deadlock: + +1) Task B allocated a new system chunk and task A is waiting on task B + to finish creation of the respective system block group. However before + task B ends its transaction handle and finishes the creation of the + system block group, it attempts to allocate another chunk (like a data + chunk for a fallocate operation for a very large range). Task B will + be unable to progress and allocate the new chunk, because task A set + space_info->chunk_alloc to 1 and therefore it loops at + btrfs_chunk_alloc() waiting for task A to finish its chunk allocation + and set space_info->chunk_alloc to 0, but task A is waiting on task B + to finish creation of the new system block group, therefore resulting + in a deadlock; + +2) Task B allocated a new system chunk and task A is waiting on task B to + finish creation of the respective system block group. By the time that + task B enters the final phase of block group allocation, which happens + at btrfs_create_pending_block_groups(), when it modifies the extent + tree, the device tree or the chunk tree to insert the items for some + new block group, it needs to allocate a new chunk, so it ends up at + btrfs_chunk_alloc() and keeps looping there because task A has set + space_info->chunk_alloc to 1, but task A is waiting for task B to + finish creation of the new system block group and release the reserved + system space, therefore resulting in a deadlock. + +In short, the problem arises if a task B needs to allocate a new chunk after +it previously allocated a new system chunk and another task A is +currently waiting for task B to complete the allocation of the new system +chunk. + +Unfortunately this deadlock scenario introduced by the previous fix for +the system chunk array exhaustion problem does not have a simple and short +fix, and requires a big change to rework the chunk allocation code so that +chunk btree updates are all made in the first phase of chunk allocation. +And since this deadlock regression is being frequently hit on zoned +filesystems and the system chunk array exhaustion problem is triggered +in more extreme cases (originally observed on PowerPC with a node size +of 64K when running the fallocate tests from stress-ng), revert the +changes from that commit. The next patch in the series, with the subject +"btrfs: rework chunk allocation to avoid exhaustion of the system +chunk array", makes the necessary changes to fix the system chunk array +exhaustion problem.
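[The circular wait is the same in both scenarios and can be reduced to a few lines. The following is a deliberately simplified user-space sketch, not btrfs code: alloc_in_progress and sys_bg_created are invented analogues of space_info->chunk_alloc and the pending system block group. Running it hangs forever, which is exactly the point.]

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int alloc_in_progress;  /* analogue of space_info->chunk_alloc */
    static atomic_int sys_bg_created;     /* analogue of the pending system block group */

    /* Task A: already inside chunk allocation (flag set in main below),
       waiting for task B to finish creating the system block group. */
    static void *task_a(void *arg)
    {
            (void)arg;
            while (!atomic_load(&sys_bg_created))
                    ;       /* B is stuck behind our flag: deadlock */
            atomic_store(&alloc_in_progress, 0);
            return NULL;
    }

    /* Task B: needs another chunk to finish its block group, so it loops
       until no allocation is in progress, as btrfs_chunk_alloc() did. */
    static void *task_b(void *arg)
    {
            (void)arg;
            while (atomic_load(&alloc_in_progress))
                    ;       /* never clears, because A waits on us */
            atomic_store(&sys_bg_created, 1);
            return NULL;
    }

    int main(void)
    {
            pthread_t a, b;

            atomic_store(&alloc_in_progress, 1);    /* A entered allocation first */
            pthread_create(&a, NULL, task_a, NULL);
            pthread_create(&b, NULL, task_b, NULL);
            pthread_join(a, NULL);                  /* never returns */
            pthread_join(b, NULL);
            puts("unreachable");
            return 0;
    }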
+ +Reported-by: Naohiro Aota +Link: https://lore.kernel.org/linux-btrfs/20210621015922.ewgbffxuawia7liz@naota-xeon/ +Fixes: eafa4fd0ad0607 ("btrfs: fix exhaustion of the system chunk array due to concurrent allocations") +CC: stable@vger.kernel.org # 5.12+ +Tested-by: Shin'ichiro Kawasaki +Tested-by: Naohiro Aota +Signed-off-by: Filipe Manana +Tested-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/block-group.c | 58 ------------------------------------------------- + fs/btrfs/transaction.c | 5 ---- + fs/btrfs/transaction.h | 7 ----- + 3 files changed, 1 insertion(+), 69 deletions(-) + +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -3364,7 +3364,6 @@ static u64 get_profile_num_devs(struct b + */ + void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) + { +- struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_space_info *info; + u64 left; +@@ -3379,7 +3378,6 @@ void check_system_chunk(struct btrfs_tra + lockdep_assert_held(&fs_info->chunk_mutex); + + info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); +-again: + spin_lock(&info->lock); + left = info->total_bytes - btrfs_space_info_used(info, true); + spin_unlock(&info->lock); +@@ -3398,58 +3396,6 @@ again: + + if (left < thresh) { + u64 flags = btrfs_system_alloc_profile(fs_info); +- u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved); +- +- /* +- * If there's not available space for the chunk tree (system +- * space) and there are other tasks that reserved space for +- * creating a new system block group, wait for them to complete +- * the creation of their system block group and release excess +- * reserved space. We do this because: +- * +- * *) We can end up allocating more system chunks than necessary +- * when there are multiple tasks that are concurrently +- * allocating block groups, which can lead to exhaustion of +- * the system array in the superblock; +- * +- * *) If we allocate extra and unnecessary system block groups, +- * despite being empty for a long time, and possibly forever, +- * they end not being added to the list of unused block groups +- * because that typically happens only when deallocating the +- * last extent from a block group - which never happens since +- * we never allocate from them in the first place. The few +- * exceptions are when mounting a filesystem or running scrub, +- * which add unused block groups to the list of unused block +- * groups, to be deleted by the cleaner kthread. +- * And even when they are added to the list of unused block +- * groups, it can take a long time until they get deleted, +- * since the cleaner kthread might be sleeping or busy with +- * other work (deleting subvolumes, running delayed iputs, +- * defrag scheduling, etc); +- * +- * This is rare in practice, but can happen when too many tasks +- * are allocating blocks groups in parallel (via fallocate()) +- * and before the one that reserved space for a new system block +- * group finishes the block group creation and releases the space +- * reserved in excess (at btrfs_create_pending_block_groups()), +- * other tasks end up here and see free system space temporarily +- * not enough for updating the chunk tree. +- * +- * We unlock the chunk mutex before waiting for such tasks and +- * lock it again after the wait, otherwise we would deadlock. 
+- * It is safe to do so because allocating a system chunk is the +- * first thing done while allocating a new block group. +- */ +- if (reserved > trans->chunk_bytes_reserved) { +- const u64 min_needed = reserved - thresh; +- +- mutex_unlock(&fs_info->chunk_mutex); +- wait_event(cur_trans->chunk_reserve_wait, +- atomic64_read(&cur_trans->chunk_bytes_reserved) <= +- min_needed); +- mutex_lock(&fs_info->chunk_mutex); +- goto again; +- } + + /* + * Ignore failure to create system chunk. We might end up not +@@ -3464,10 +3410,8 @@ again: + ret = btrfs_block_rsv_add(fs_info->chunk_root, + &fs_info->chunk_block_rsv, + thresh, BTRFS_RESERVE_NO_FLUSH); +- if (!ret) { +- atomic64_add(thresh, &cur_trans->chunk_bytes_reserved); ++ if (!ret) + trans->chunk_bytes_reserved += thresh; +- } + } + } + +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -260,7 +260,6 @@ static inline int extwriter_counter_read + void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) + { + struct btrfs_fs_info *fs_info = trans->fs_info; +- struct btrfs_transaction *cur_trans = trans->transaction; + + if (!trans->chunk_bytes_reserved) + return; +@@ -269,8 +268,6 @@ void btrfs_trans_release_chunk_metadata( + + btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, + trans->chunk_bytes_reserved, NULL); +- atomic64_sub(trans->chunk_bytes_reserved, &cur_trans->chunk_bytes_reserved); +- cond_wake_up(&cur_trans->chunk_reserve_wait); + trans->chunk_bytes_reserved = 0; + } + +@@ -386,8 +383,6 @@ loop: + spin_lock_init(&cur_trans->dropped_roots_lock); + INIT_LIST_HEAD(&cur_trans->releasing_ebs); + spin_lock_init(&cur_trans->releasing_ebs_lock); +- atomic64_set(&cur_trans->chunk_bytes_reserved, 0); +- init_waitqueue_head(&cur_trans->chunk_reserve_wait); + list_add_tail(&cur_trans->list, &fs_info->trans_list); + extent_io_tree_init(fs_info, &cur_trans->dirty_pages, + IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode); +--- a/fs/btrfs/transaction.h ++++ b/fs/btrfs/transaction.h +@@ -96,13 +96,6 @@ struct btrfs_transaction { + + spinlock_t releasing_ebs_lock; + struct list_head releasing_ebs; +- +- /* +- * The number of bytes currently reserved, by all transaction handles +- * attached to this transaction, for metadata extents of the chunk tree. +- */ +- atomic64_t chunk_bytes_reserved; +- wait_queue_head_t chunk_reserve_wait; + }; + + #define __TRANS_FREEZABLE (1U << 0) diff --git a/queue-5.13/btrfs-properly-split-extent_map-for-req_op_zone_append.patch b/queue-5.13/btrfs-properly-split-extent_map-for-req_op_zone_append.patch new file mode 100644 index 00000000000..818d59e24d8 --- /dev/null +++ b/queue-5.13/btrfs-properly-split-extent_map-for-req_op_zone_append.patch @@ -0,0 +1,248 @@ +From abb99cfdaf0759f8a619e5fecf52ccccdf310c8c Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Mon, 28 Jun 2021 17:57:28 +0900 +Subject: btrfs: properly split extent_map for REQ_OP_ZONE_APPEND + +From: Naohiro Aota + +commit abb99cfdaf0759f8a619e5fecf52ccccdf310c8c upstream. + +Damien reported a test failure with btrfs/209. The test itself ran fine, +but the fsck ran afterwards reported a corrupted filesystem. + +The filesystem corruption happens because we're splitting an extent and +then writing the extent twice. We have to split the extent though, because +we're creating too large extents for a REQ_OP_ZONE_APPEND operation. + +When dumping the extent tree, we can see two EXTENT_ITEMs at the same +start address but different lengths. + +$ btrfs inspect dump-tree /dev/nullb1 -t extent +... 
+ item 19 key (269484032 EXTENT_ITEM 126976) itemoff 15470 itemsize 53 + refs 1 gen 7 flags DATA + extent data backref root FS_TREE objectid 257 offset 786432 count 1 + item 20 key (269484032 EXTENT_ITEM 262144) itemoff 15417 itemsize 53 + refs 1 gen 7 flags DATA + extent data backref root FS_TREE objectid 257 offset 786432 count 1 + +The duplicated EXTENT_ITEMs originally come from a wrongly split extent_map in +extract_ordered_extent(). Since extract_ordered_extent() uses +create_io_em() to split an existing extent_map, we will have +split->orig_start != split->start. Then, it will be logged with a non-zero +"extent data offset". Finally, the logged entries are replayed into +a duplicated EXTENT_ITEM. + +Introduce and use a proper splitting function for extent_map. The function is +intended to be simple and specific to the usage in extract_ordered_extent(), +e.g. it does not support the compression case (we do not allow splitting +compressed extent_map anyway). + +There was a question raised by Qu, in summary, why we want to split the +extent map (and not the bio): + +The problem is not the limit on the zone end, which as you mention is +the same as the block group end. The problem is that data writes use zone +append (ZA) operations. ZA BIOs cannot be split, so a large extent may +need to be processed with multiple ZA BIOs. While that is also true for +regular writes, the major difference is that ZA operations are "nameless" +writes giving back the written sectors on completion. And ZA +operations may be reordered by the block layer (not intentionally +though). Combine both of these characteristics and you can see that the +data for a large extent may end up being shuffled when written, resulting +in data corruption and making it impossible to map the extent to some start +sector. + +To avoid this problem, zoned btrfs uses the principle "one data extent +== one ZA BIO". So large extents need to be split. This is unfortunate, +but we can revisit this later and optimize, e.g. merge back together the +fragments of an extent once written if they actually were written +sequentially in the zone. + +Reported-by: Damien Le Moal +Fixes: d22002fd37bd ("btrfs: zoned: split ordered extent when bio is sent") +CC: stable@vger.kernel.org # 5.12+ +CC: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/inode.c | 147 ++++++++++++++++++++++++++++++++++++++++++++----------- + 1 file changed, 118 insertions(+), 29 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -2260,13 +2260,127 @@ bool btrfs_bio_fits_in_ordered_extent(st + return ret; + } + ++/* ++ * Split an extent_map at [start, start + len] ++ * ++ * This function is intended to be used only for extract_ordered_extent().
++ */ ++static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, ++ u64 pre, u64 post) ++{ ++ struct extent_map_tree *em_tree = &inode->extent_tree; ++ struct extent_map *em; ++ struct extent_map *split_pre = NULL; ++ struct extent_map *split_mid = NULL; ++ struct extent_map *split_post = NULL; ++ int ret = 0; ++ int modified; ++ unsigned long flags; ++ ++ /* Sanity check */ ++ if (pre == 0 && post == 0) ++ return 0; ++ ++ split_pre = alloc_extent_map(); ++ if (pre) ++ split_mid = alloc_extent_map(); ++ if (post) ++ split_post = alloc_extent_map(); ++ if (!split_pre || (pre && !split_mid) || (post && !split_post)) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ ASSERT(pre + post < len); ++ ++ lock_extent(&inode->io_tree, start, start + len - 1); ++ write_lock(&em_tree->lock); ++ em = lookup_extent_mapping(em_tree, start, len); ++ if (!em) { ++ ret = -EIO; ++ goto out_unlock; ++ } ++ ++ ASSERT(em->len == len); ++ ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); ++ ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); ++ ++ flags = em->flags; ++ clear_bit(EXTENT_FLAG_PINNED, &em->flags); ++ clear_bit(EXTENT_FLAG_LOGGING, &flags); ++ modified = !list_empty(&em->list); ++ ++ /* First, replace the em with a new extent_map starting from * em->start */ ++ split_pre->start = em->start; ++ split_pre->len = (pre ? pre : em->len - post); ++ split_pre->orig_start = split_pre->start; ++ split_pre->block_start = em->block_start; ++ split_pre->block_len = split_pre->len; ++ split_pre->orig_block_len = split_pre->block_len; ++ split_pre->ram_bytes = split_pre->len; ++ split_pre->flags = flags; ++ split_pre->compress_type = em->compress_type; ++ split_pre->generation = em->generation; ++ ++ replace_extent_mapping(em_tree, em, split_pre, modified); ++ ++ /* ++ * Now we only have an extent_map at: ++ * [em->start, em->start + pre] if pre != 0 ++ * [em->start, em->start + em->len - post] if pre == 0 ++ */ ++ ++ if (pre) { ++ /* Insert the middle extent_map */ ++ split_mid->start = em->start + pre; ++ split_mid->len = em->len - pre - post; ++ split_mid->orig_start = split_mid->start; ++ split_mid->block_start = em->block_start + pre; ++ split_mid->block_len = split_mid->len; ++ split_mid->orig_block_len = split_mid->block_len; ++ split_mid->ram_bytes = split_mid->len; ++ split_mid->flags = flags; ++ split_mid->compress_type = em->compress_type; ++ split_mid->generation = em->generation; ++ add_extent_mapping(em_tree, split_mid, modified); ++ } ++ ++ if (post) { ++ split_post->start = em->start + em->len - post; ++ split_post->len = post; ++ split_post->orig_start = split_post->start; ++ split_post->block_start = em->block_start + em->len - post; ++ split_post->block_len = split_post->len; ++ split_post->orig_block_len = split_post->block_len; ++ split_post->ram_bytes = split_post->len; ++ split_post->flags = flags; ++ split_post->compress_type = em->compress_type; ++ split_post->generation = em->generation; ++ add_extent_mapping(em_tree, split_post, modified); ++ } ++ ++ /* Once for us */ ++ free_extent_map(em); ++ /* Once for the tree */ ++ free_extent_map(em); ++ ++out_unlock: ++ write_unlock(&em_tree->lock); ++ unlock_extent(&inode->io_tree, start, start + len - 1); ++out: ++ free_extent_map(split_pre); ++ free_extent_map(split_mid); ++ free_extent_map(split_post); ++ ++ return ret; ++} ++ + static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, + struct bio *bio, loff_t file_offset) + { + struct btrfs_ordered_extent *ordered; +- struct extent_map *em = NULL, *em_new = NULL; +- 
struct extent_map_tree *em_tree = &inode->extent_tree; + u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; ++ u64 file_len; + u64 len = bio->bi_iter.bi_size; + u64 end = start + len; + u64 ordered_end; +@@ -2306,41 +2420,16 @@ static blk_status_t extract_ordered_exte + goto out; + } + ++ file_len = ordered->num_bytes; + pre = start - ordered->disk_bytenr; + post = ordered_end - end; + + ret = btrfs_split_ordered_extent(ordered, pre, post); + if (ret) + goto out; +- +- read_lock(&em_tree->lock); +- em = lookup_extent_mapping(em_tree, ordered->file_offset, len); +- if (!em) { +- read_unlock(&em_tree->lock); +- ret = -EIO; +- goto out; +- } +- read_unlock(&em_tree->lock); +- +- ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); +- /* +- * We cannot reuse em_new here but have to create a new one, as +- * unpin_extent_cache() expects the start of the extent map to be the +- * logical offset of the file, which does not hold true anymore after +- * splitting. +- */ +- em_new = create_io_em(inode, em->start + pre, len, +- em->start + pre, em->block_start + pre, len, +- len, len, BTRFS_COMPRESS_NONE, +- BTRFS_ORDERED_REGULAR); +- if (IS_ERR(em_new)) { +- ret = PTR_ERR(em_new); +- goto out; +- } +- free_extent_map(em_new); ++ ret = split_zoned_em(inode, file_offset, file_len, pre, post); + + out: +- free_extent_map(em); + btrfs_put_ordered_extent(ordered); + + return errno_to_blk_status(ret); diff --git a/queue-5.13/btrfs-rework-chunk-allocation-to-avoid-exhaustion-of-the-system-chunk-array.patch b/queue-5.13/btrfs-rework-chunk-allocation-to-avoid-exhaustion-of-the-system-chunk-array.patch new file mode 100644 index 00000000000..4d3e35e1454 --- /dev/null +++ b/queue-5.13/btrfs-rework-chunk-allocation-to-avoid-exhaustion-of-the-system-chunk-array.patch @@ -0,0 +1,1280 @@ +From 79bd37120b149532af5b21953643ed74af69654f Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 29 Jun 2021 14:43:06 +0100 +Subject: btrfs: rework chunk allocation to avoid exhaustion of the system chunk array + +From: Filipe Manana + +commit 79bd37120b149532af5b21953643ed74af69654f upstream. + +Commit eafa4fd0ad0607 ("btrfs: fix exhaustion of the system chunk array +due to concurrent allocations") fixed a problem that resulted in +exhausting the system chunk array in the superblock when there are many +tasks allocating chunks in parallel. Basically too many tasks enter the +first phase of chunk allocation without previous tasks having finished +their second phase of allocation, resulting in too many system chunks +being allocated. That was originally observed when running the fallocate +tests of stress-ng on a PowerPC machine, using a node size of 64K. + +However that commit also introduced a deadlock where a task in phase 1 of +the chunk allocation waited for another task that had allocated a system +chunk to finish its phase 2, but that other task was waiting on an extent +buffer lock held by the first task, therefore resulting in both tasks not +making any progress. That change was later reverted by a patch with the +subject "btrfs: fix deadlock with concurrent chunk allocations involving +system chunks", since there is no simple and short solution to address it +and the deadlock is relatively easy to trigger on zoned filesystems, while +the system chunk array exhaustion is not so common. + +This change reworks the chunk allocation to avoid the system chunk array +exhaustion. 
It accomplishes that by making the first phase of chunk +allocation do the updates of the device items in the chunk btree and the +insertion of the new chunk item in the chunk btree. This is done while +under the protection of the chunk mutex (fs_info->chunk_mutex), in the +same critical section that checks for available system space, allocates +a new system chunk if needed and reserves system chunk space. This way +we do not have chunk space reserved until the second phase completes. + +The same logic is applied to chunk removal as well, since it keeps +reserved system space long after it is done updating the chunk btree. + +For direct allocation of system chunks, the previous behaviour remains, +because otherwise we would deadlock on extent buffers of the chunk btree. +Changes to the chunk btree are by and large done by chunk allocation and +chunk removal, which first reserve chunk system space and then later do +changes to the chunk btree. The other remaining cases are uncommon and +correspond to adding a device, removing a device and resizing a device. +All these other cases do not pre-reserve system space; they modify the +chunk btree right away, so they don't hold reserved space for a long period +like chunk allocation and chunk removal do. + +The diff of this change is huge, but more than half of it is just addition +of comments describing both how things work regarding chunk allocation and +removal, including both the new behavior and the parts of the old behavior +that did not change. + +CC: stable@vger.kernel.org # 5.12+ +Tested-by: Shin'ichiro Kawasaki +Tested-by: Naohiro Aota +Signed-off-by: Filipe Manana +Tested-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/block-group.c | 285 ++++++++++++++++++++++++++++++++++----- + fs/btrfs/block-group.h | 6 + fs/btrfs/ctree.c | 67 +-------- + fs/btrfs/transaction.c | 10 - + fs/btrfs/transaction.h | 2 + fs/btrfs/volumes.c | 355 +++++++++++++++++++++++++++++++++++++------------ + fs/btrfs/volumes.h | 5 + 7 files changed, 546 insertions(+), 184 deletions(-) + +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -2192,6 +2192,13 @@ error: + return ret; + } + ++/* ++ * This function, insert_block_group_item(), belongs to the phase 2 of chunk ++ * allocation. ++ * ++ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation ++ * phases.
++ */ + void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) + { + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group *block_group; + int ret = 0; + +- if (!trans->can_flush_pending_bgs) +- return; +- + while (!list_empty(&trans->new_bgs)) { + int index; + +@@ -2237,6 +2248,13 @@ void btrfs_create_pending_block_groups(s + ret = insert_block_group_item(trans, block_group); + if (ret) + btrfs_abort_transaction(trans, ret); ++ if (!block_group->chunk_item_inserted) { ++ mutex_lock(&fs_info->chunk_mutex); ++ ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group); ++ mutex_unlock(&fs_info->chunk_mutex); ++ if (ret) ++ btrfs_abort_transaction(trans, ret); ++ } + ret = btrfs_finish_chunk_alloc(trans, block_group->start, + block_group->length); + if (ret) +@@ -2260,8 +2278,9 @@ next: + btrfs_trans_release_chunk_metadata(trans); + } + +-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, +- u64 type, u64 chunk_offset, u64 size) ++struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, ++ u64 bytes_used, u64 type, ++ u64 chunk_offset, u64 size) + { + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group *cache; +@@ -2271,7 +2290,7 @@ int btrfs_make_block_group(struct btrfs_ + + cache = btrfs_create_block_group_cache(fs_info, chunk_offset); + if (!cache) +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + + cache->length = size; + set_free_space_tree_thresholds(cache); +@@ -2285,7 +2304,7 @@ int btrfs_make_block_group(struct btrfs_ + ret = btrfs_load_block_group_zone_info(cache, true); + if (ret) { + btrfs_put_block_group(cache); +- return ret; ++ return ERR_PTR(ret); + } + + ret = exclude_super_stripes(cache); +@@ -2293,7 +2312,7 @@ int btrfs_make_block_group(struct btrfs_ + /* We may have excluded something, so call this just in case */ + btrfs_free_excluded_extents(cache); + btrfs_put_block_group(cache); +- return ret; ++ return ERR_PTR(ret); + } + + add_new_free_space(cache, chunk_offset, chunk_offset + size); +@@ -2320,7 +2339,7 @@ int btrfs_make_block_group(struct btrfs_ + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); +- return ret; ++ return ERR_PTR(ret); + } + + /* +@@ -2339,7 +2358,7 @@ int btrfs_make_block_group(struct btrfs_ + btrfs_update_delayed_refs_rsv(trans); + + set_avail_alloc_bits(fs_info, type); +- return 0; ++ return cache; + } + + /* +@@ -3219,11 +3238,203 @@ int btrfs_force_chunk_alloc(struct btrfs + return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + } + ++static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) ++{ ++ struct btrfs_block_group *bg; ++ int ret; ++ ++ /* ++ * Check if we have enough space in the system space info because we ++ * will need to update device items in the chunk btree and insert a new ++ * chunk item in the chunk btree as well. This will allocate a new ++ * system block group if needed. ++ */ ++ check_system_chunk(trans, flags); ++ ++ bg = btrfs_alloc_chunk(trans, flags); ++ if (IS_ERR(bg)) { ++ ret = PTR_ERR(bg); ++ goto out; ++ } ++ ++ /* ++ * If this is a system chunk allocation then stop right here and do not ++ * add the chunk item to the chunk btree. 
This is to prevent a deadlock ++ * because this system chunk allocation can be triggered while COWing ++ * some extent buffer of the chunk btree and while holding a lock on a ++ * parent extent buffer, in which case attempting to insert the chunk ++ * item (or update the device item) would result in a deadlock on that ++ * parent extent buffer. In this case defer the chunk btree updates to ++ * the second phase of chunk allocation and keep our reservation until ++ * the second phase completes. ++ * ++ * This is a rare case and can only be triggered by the very few cases ++ * we have where we need to touch the chunk btree outside chunk allocation ++ * and chunk removal. These cases are basically adding a device, removing ++ * a device or resizing a device. ++ */ ++ if (flags & BTRFS_BLOCK_GROUP_SYSTEM) ++ return 0; ++ ++ ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); ++ /* ++ * Normally we are not expected to fail with -ENOSPC here, since we have ++ * previously reserved space in the system space_info and allocated one ++ * new system chunk if necessary. However there are two exceptions: ++ * ++ * 1) We may have enough free space in the system space_info but all the ++ * existing system block groups have a profile which can not be used ++ * for extent allocation. ++ * ++ * This happens when mounting in degraded mode. For example we have a ++ * RAID1 filesystem with 2 devices, lose one device and mount the fs ++ * using the other device in degraded mode. If we then allocate a chunk, ++ * we may have enough free space in the existing system space_info, but ++ * none of the block groups can be used for extent allocation since they ++ * have a RAID1 profile, and because we are in degraded mode with a ++ * single device, we are forced to allocate a new system chunk with a ++ * SINGLE profile. Making check_system_chunk() iterate over all system ++ * block groups and check if they have a usable profile and enough space ++ * can be slow on very large filesystems, so we tolerate the -ENOSPC and ++ * try again after forcing allocation of a new system chunk. Like this ++ * we avoid paying the cost of that search in normal circumstances, when ++ * we were not mounted in degraded mode; ++ * ++ * 2) We had enough free space info the system space_info, and one suitable ++ * block group to allocate from when we called check_system_chunk() ++ * above. However right after we called it, the only system block group ++ * with enough free space got turned into RO mode by a running scrub, ++ * and in this case we have to allocate a new one and retry. We only ++ * need do this allocate and retry once, since we have a transaction ++ * handle and scrub uses the commit root to search for block groups. 
++ */ ++ if (ret == -ENOSPC) { ++ const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info); ++ struct btrfs_block_group *sys_bg; ++ ++ sys_bg = btrfs_alloc_chunk(trans, sys_flags); ++ if (IS_ERR(sys_bg)) { ++ ret = PTR_ERR(sys_bg); ++ btrfs_abort_transaction(trans, ret); ++ goto out; ++ } ++ ++ ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); ++ if (ret) { ++ btrfs_abort_transaction(trans, ret); ++ goto out; ++ } ++ ++ ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); ++ if (ret) { ++ btrfs_abort_transaction(trans, ret); ++ goto out; ++ } ++ } else if (ret) { ++ btrfs_abort_transaction(trans, ret); ++ goto out; ++ } ++out: ++ btrfs_trans_release_chunk_metadata(trans); ++ ++ return ret; ++} ++ + /* +- * If force is CHUNK_ALLOC_FORCE: ++ * Chunk allocation is done in 2 phases: ++ * ++ * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for ++ * the chunk, the chunk mapping, create its block group and add the items ++ * that belong in the chunk btree to it - more specifically, we need to ++ * update device items in the chunk btree and add a new chunk item to it. ++ * ++ * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block ++ * group item to the extent btree and the device extent items to the devices ++ * btree. ++ * ++ * This is done to prevent deadlocks. For example when COWing a node from the ++ * extent btree we are holding a write lock on the node's parent and if we ++ * trigger chunk allocation and attempted to insert the new block group item ++ * in the extent btree right way, we could deadlock because the path for the ++ * insertion can include that parent node. At first glance it seems impossible ++ * to trigger chunk allocation after starting a transaction since tasks should ++ * reserve enough transaction units (metadata space), however while that is true ++ * most of the time, chunk allocation may still be triggered for several reasons: ++ * ++ * 1) When reserving metadata, we check if there is enough free space in the ++ * metadata space_info and therefore don't trigger allocation of a new chunk. ++ * However later when the task actually tries to COW an extent buffer from ++ * the extent btree or from the device btree for example, it is forced to ++ * allocate a new block group (chunk) because the only one that had enough ++ * free space was just turned to RO mode by a running scrub for example (or ++ * device replace, block group reclaim thread, etc), so we can not use it ++ * for allocating an extent and end up being forced to allocate a new one; ++ * ++ * 2) Because we only check that the metadata space_info has enough free bytes, ++ * we end up not allocating a new metadata chunk in that case. However if ++ * the filesystem was mounted in degraded mode, none of the existing block ++ * groups might be suitable for extent allocation due to their incompatible ++ * profile (for e.g. mounting a 2 devices filesystem, where all block groups ++ * use a RAID1 profile, in degraded mode using a single device). 
In this case ++ * when the task attempts to COW some extent buffer of the extent btree for ++ * example, it will trigger allocation of a new metadata block group with a ++ * suitable profile (SINGLE profile in the example of the degraded mount of ++ * the RAID1 filesystem); ++ * ++ * 3) The task has reserved enough transaction units / metadata space, but when ++ * it attempts to COW an extent buffer from the extent or device btree for ++ * example, it does not find any free extent in any metadata block group, ++ * therefore forced to try to allocate a new metadata block group. ++ * This is because some other task allocated all available extents in the ++ * meanwhile - this typically happens with tasks that don't reserve space ++ * properly, either intentionally or as a bug. One example where this is ++ * done intentionally is fsync, as it does not reserve any transaction units ++ * and ends up allocating a variable number of metadata extents for log ++ * tree extent buffers. ++ * ++ * We also need this 2 phases setup when adding a device to a filesystem with ++ * a seed device - we must create new metadata and system chunks without adding ++ * any of the block group items to the chunk, extent and device btrees. If we ++ * did not do it this way, we would get ENOSPC when attempting to update those ++ * btrees, since all the chunks from the seed device are read-only. ++ * ++ * Phase 1 does the updates and insertions to the chunk btree because if we had ++ * it done in phase 2 and have a thundering herd of tasks allocating chunks in ++ * parallel, we risk having too many system chunks allocated by many tasks if ++ * many tasks reach phase 1 without the previous ones completing phase 2. In the ++ * extreme case this leads to exhaustion of the system chunk array in the ++ * superblock. This is easier to trigger if using a btree node/leaf size of 64K ++ * and with RAID filesystems (so we have more device items in the chunk btree). ++ * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of ++ * the system chunk array due to concurrent allocations") provides more details. ++ * ++ * For allocation of system chunks, we defer the updates and insertions into the ++ * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because ++ * if the chunk allocation is triggered while COWing an extent buffer of the ++ * chunk btree, we are holding a lock on the parent of that extent buffer and ++ * doing the chunk btree updates and insertions can require locking that parent. ++ * This is for the very few and rare cases where we update the chunk btree that ++ * are not chunk allocation or chunk removal: adding a device, removing a device ++ * or resizing a device. ++ * ++ * The reservation of system space, done through check_system_chunk(), as well ++ * as all the updates and insertions into the chunk btree must be done while ++ * holding fs_info->chunk_mutex. This is important to guarantee that while COWing ++ * an extent buffer from the chunks btree we never trigger allocation of a new ++ * system chunk, which would result in a deadlock (trying to lock twice an ++ * extent buffer of the chunk btree, first time before triggering the chunk ++ * allocation and the second time during chunk allocation while attempting to ++ * update the chunks btree). The system chunk array is also updated while holding ++ * that mutex. 
The same logic applies to removing chunks - we must reserve system ++ * space, update the chunk btree and the system chunk array in the superblock ++ * while holding fs_info->chunk_mutex. ++ * ++ * This function, btrfs_chunk_alloc(), belongs to phase 1. ++ * ++ * If @force is CHUNK_ALLOC_FORCE: + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. +- * If force is NOT CHUNK_ALLOC_FORCE: ++ * If @force is NOT CHUNK_ALLOC_FORCE: + * - return 0 if it doesn't need to allocate a new chunk, + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. +@@ -3240,6 +3451,13 @@ int btrfs_chunk_alloc(struct btrfs_trans + /* Don't re-enter if we're already allocating a chunk */ + if (trans->allocating_chunk) + return -ENOSPC; ++ /* ++ * If we are removing a chunk, don't re-enter or we would deadlock. ++ * System space reservation and system chunk allocation is done by the ++ * chunk remove operation (btrfs_remove_chunk()). ++ */ ++ if (trans->removing_chunk) ++ return -ENOSPC; + + space_info = btrfs_find_space_info(fs_info, flags); + ASSERT(space_info); +@@ -3303,13 +3521,7 @@ int btrfs_chunk_alloc(struct btrfs_trans + force_metadata_allocation(fs_info); + } + +- /* +- * Check if we have enough space in SYSTEM chunk because we may need +- * to update devices. +- */ +- check_system_chunk(trans, flags); +- +- ret = btrfs_alloc_chunk(trans, flags); ++ ret = do_chunk_alloc(trans, flags); + trans->allocating_chunk = false; + + spin_lock(&space_info->lock); +@@ -3328,22 +3540,6 @@ out: + space_info->chunk_alloc = 0; + spin_unlock(&space_info->lock); + mutex_unlock(&fs_info->chunk_mutex); +- /* +- * When we allocate a new chunk we reserve space in the chunk block +- * reserve to make sure we can COW nodes/leafs in the chunk tree or +- * add new nodes/leafs to it if we end up needing to do it when +- * inserting the chunk item and updating device items as part of the +- * second phase of chunk allocation, performed by +- * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a +- * large number of new block groups to create in our transaction +- * handle's new_bgs list to avoid exhausting the chunk block reserve +- * in extreme cases - like having a single transaction create many new +- * block groups when starting to write out the free space caches of all +- * the block groups that were made dirty during the lifetime of the +- * transaction. +- */ +- if (trans->chunk_bytes_reserved >= (u64)SZ_2M) +- btrfs_create_pending_block_groups(trans); + + return ret; + } +@@ -3396,14 +3592,31 @@ void check_system_chunk(struct btrfs_tra + + if (left < thresh) { + u64 flags = btrfs_system_alloc_profile(fs_info); ++ struct btrfs_block_group *bg; + + /* + * Ignore failure to create system chunk. We might end up not + * needing it, as we might not need to COW all nodes/leafs from + * the paths we visit in the chunk tree (they were already COWed + * or created in the current transaction for example). ++ * ++ * Also, if our caller is allocating a system chunk, do not ++ * attempt to insert the chunk item in the chunk btree, as we ++ * could deadlock on an extent buffer since our caller may be ++ * COWing an extent buffer from the chunk btree. 
+ */ +- ret = btrfs_alloc_chunk(trans, flags); ++ bg = btrfs_alloc_chunk(trans, flags); ++ if (IS_ERR(bg)) { ++ ret = PTR_ERR(bg); ++ } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) { ++ /* ++ * If we fail to add the chunk item here, we end up ++ * trying again at phase 2 of chunk allocation, at ++ * btrfs_create_pending_block_groups(). So ignore ++ * any error here. ++ */ ++ btrfs_chunk_alloc_add_chunk_item(trans, bg); ++ } + } + + if (!ret) { +--- a/fs/btrfs/block-group.h ++++ b/fs/btrfs/block-group.h +@@ -97,6 +97,7 @@ struct btrfs_block_group { + unsigned int removed:1; + unsigned int to_copy:1; + unsigned int relocating_repair:1; ++ unsigned int chunk_item_inserted:1; + + int disk_cache_state; + +@@ -268,8 +269,9 @@ void btrfs_reclaim_bgs_work(struct work_ + void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); + void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg); + int btrfs_read_block_groups(struct btrfs_fs_info *info); +-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, +- u64 type, u64 chunk_offset, u64 size); ++struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, ++ u64 bytes_used, u64 type, ++ u64 chunk_offset, u64 size); + void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); + int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, + bool do_chunk_alloc); +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -364,49 +364,6 @@ static noinline int update_ref_for_cow(s + return 0; + } + +-static struct extent_buffer *alloc_tree_block_no_bg_flush( +- struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- u64 parent_start, +- const struct btrfs_disk_key *disk_key, +- int level, +- u64 hint, +- u64 empty_size, +- enum btrfs_lock_nesting nest) +-{ +- struct btrfs_fs_info *fs_info = root->fs_info; +- struct extent_buffer *ret; +- +- /* +- * If we are COWing a node/leaf from the extent, chunk, device or free +- * space trees, make sure that we do not finish block group creation of +- * pending block groups. We do this to avoid a deadlock. +- * COWing can result in allocation of a new chunk, and flushing pending +- * block groups (btrfs_create_pending_block_groups()) can be triggered +- * when finishing allocation of a new chunk. Creation of a pending block +- * group modifies the extent, chunk, device and free space trees, +- * therefore we could deadlock with ourselves since we are holding a +- * lock on an extent buffer that btrfs_create_pending_block_groups() may +- * try to COW later. +- * For similar reasons, we also need to delay flushing pending block +- * groups when splitting a leaf or node, from one of those trees, since +- * we are holding a write lock on it and its parent or when inserting a +- * new root node for one of those trees. +- */ +- if (root == fs_info->extent_root || +- root == fs_info->chunk_root || +- root == fs_info->dev_root || +- root == fs_info->free_space_root) +- trans->can_flush_pending_bgs = false; +- +- ret = btrfs_alloc_tree_block(trans, root, parent_start, +- root->root_key.objectid, disk_key, level, +- hint, empty_size, nest); +- trans->can_flush_pending_bgs = true; +- +- return ret; +-} +- + /* + * does the dirty work in cow of a single block. The parent block (if + * supplied) is updated to point to the new cow copy. 
The new buffer is marked +@@ -455,8 +412,9 @@ static noinline int __btrfs_cow_block(st + if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent) + parent_start = parent->start; + +- cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key, +- level, search_start, empty_size, nest); ++ cow = btrfs_alloc_tree_block(trans, root, parent_start, ++ root->root_key.objectid, &disk_key, level, ++ search_start, empty_size, nest); + if (IS_ERR(cow)) + return PTR_ERR(cow); + +@@ -2458,9 +2416,9 @@ static noinline int insert_new_root(stru + else + btrfs_node_key(lower, &lower_key, 0); + +- c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level, +- root->node->start, 0, +- BTRFS_NESTING_NEW_ROOT); ++ c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, ++ &lower_key, level, root->node->start, 0, ++ BTRFS_NESTING_NEW_ROOT); + if (IS_ERR(c)) + return PTR_ERR(c); + +@@ -2589,8 +2547,9 @@ static noinline int split_node(struct bt + mid = (c_nritems + 1) / 2; + btrfs_node_key(c, &disk_key, mid); + +- split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level, +- c->start, 0, BTRFS_NESTING_SPLIT); ++ split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, ++ &disk_key, level, c->start, 0, ++ BTRFS_NESTING_SPLIT); + if (IS_ERR(split)) + return PTR_ERR(split); + +@@ -3381,10 +3340,10 @@ again: + * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just + * use BTRFS_NESTING_NEW_ROOT. + */ +- right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0, +- l->start, 0, num_doubles ? +- BTRFS_NESTING_NEW_ROOT : +- BTRFS_NESTING_SPLIT); ++ right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, ++ &disk_key, 0, l->start, 0, ++ num_doubles ? BTRFS_NESTING_NEW_ROOT : ++ BTRFS_NESTING_SPLIT); + if (IS_ERR(right)) + return PTR_ERR(right); + +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -254,8 +254,11 @@ static inline int extwriter_counter_read + } + + /* +- * To be called after all the new block groups attached to the transaction +- * handle have been created (btrfs_create_pending_block_groups()). ++ * To be called after doing the chunk btree updates right after allocating a new ++ * chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a ++ * chunk after all chunk btree updates and after finishing the second phase of ++ * chunk allocation (btrfs_create_pending_block_groups()) in case some block ++ * group had its chunk item insertion delayed to the second phase. 
+ */ + void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) + { +@@ -264,8 +267,6 @@ void btrfs_trans_release_chunk_metadata( + if (!trans->chunk_bytes_reserved) + return; + +- WARN_ON_ONCE(!list_empty(&trans->new_bgs)); +- + btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, + trans->chunk_bytes_reserved, NULL); + trans->chunk_bytes_reserved = 0; +@@ -699,7 +700,6 @@ again: + h->fs_info = root->fs_info; + + h->type = type; +- h->can_flush_pending_bgs = true; + INIT_LIST_HEAD(&h->new_bgs); + + smp_mb(); +--- a/fs/btrfs/transaction.h ++++ b/fs/btrfs/transaction.h +@@ -134,7 +134,7 @@ struct btrfs_trans_handle { + short aborted; + bool adding_csums; + bool allocating_chunk; +- bool can_flush_pending_bgs; ++ bool removing_chunk; + bool reloc_reserved; + bool in_fsync; + struct btrfs_root *root; +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -1745,19 +1745,14 @@ again: + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + } else { +- btrfs_handle_fs_error(fs_info, ret, "Slot search failed"); + goto out; + } + + *dev_extent_len = btrfs_dev_extent_length(leaf, extent); + + ret = btrfs_del_item(trans, root, path); +- if (ret) { +- btrfs_handle_fs_error(fs_info, ret, +- "Failed to remove dev extent item"); +- } else { ++ if (ret == 0) + set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); +- } + out: + btrfs_free_path(path); + return ret; +@@ -2942,7 +2937,7 @@ static int btrfs_del_sys_chunk(struct bt + u32 cur; + struct btrfs_key key; + +- mutex_lock(&fs_info->chunk_mutex); ++ lockdep_assert_held(&fs_info->chunk_mutex); + array_size = btrfs_super_sys_array_size(super_copy); + + ptr = super_copy->sys_chunk_array; +@@ -2972,7 +2967,6 @@ static int btrfs_del_sys_chunk(struct bt + cur += len; + } + } +- mutex_unlock(&fs_info->chunk_mutex); + return ret; + } + +@@ -3012,6 +3006,29 @@ struct extent_map *btrfs_get_chunk_map(s + return em; + } + ++static int remove_chunk_item(struct btrfs_trans_handle *trans, ++ struct map_lookup *map, u64 chunk_offset) ++{ ++ int i; ++ ++ /* ++ * Removing chunk items and updating the device items in the chunks btree ++ * requires holding the chunk_mutex. ++ * See the comment at btrfs_chunk_alloc() for the details. ++ */ ++ lockdep_assert_held(&trans->fs_info->chunk_mutex); ++ ++ for (i = 0; i < map->num_stripes; i++) { ++ int ret; ++ ++ ret = btrfs_update_device(trans, map->stripes[i].dev); ++ if (ret) ++ return ret; ++ } ++ ++ return btrfs_free_chunk(trans, chunk_offset); ++} ++ + int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) + { + struct btrfs_fs_info *fs_info = trans->fs_info; +@@ -3032,14 +3049,16 @@ int btrfs_remove_chunk(struct btrfs_tran + return PTR_ERR(em); + } + map = em->map_lookup; +- mutex_lock(&fs_info->chunk_mutex); +- check_system_chunk(trans, map->type); +- mutex_unlock(&fs_info->chunk_mutex); + + /* +- * Take the device list mutex to prevent races with the final phase of +- * a device replace operation that replaces the device object associated +- * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()). ++ * First delete the device extent items from the devices btree. ++ * We take the device_list_mutex to avoid racing with the finishing phase ++ * of a device replace operation. See the comment below before acquiring ++ * fs_info->chunk_mutex. 
Note that here we do not acquire the chunk_mutex ++ * because that can result in a deadlock when deleting the device extent ++ * items from the devices btree - COWing an extent buffer from the btree ++ * may result in allocating a new metadata chunk, which would attempt to ++ * lock again fs_info->chunk_mutex. + */ + mutex_lock(&fs_devices->device_list_mutex); + for (i = 0; i < map->num_stripes; i++) { +@@ -3061,18 +3080,73 @@ int btrfs_remove_chunk(struct btrfs_tran + btrfs_clear_space_info_full(fs_info); + mutex_unlock(&fs_info->chunk_mutex); + } ++ } ++ mutex_unlock(&fs_devices->device_list_mutex); + +- ret = btrfs_update_device(trans, device); ++ /* ++ * We acquire fs_info->chunk_mutex for 2 reasons: ++ * ++ * 1) Just like with the first phase of the chunk allocation, we must ++ * reserve system space, do all chunk btree updates and deletions, and ++ * update the system chunk array in the superblock while holding this ++ * mutex. This is for similar reasons as explained on the comment at ++ * the top of btrfs_chunk_alloc(); ++ * ++ * 2) Prevent races with the final phase of a device replace operation ++ * that replaces the device object associated with the map's stripes, ++ * because the device object's id can change at any time during that ++ * final phase of the device replace operation ++ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the ++ * replaced device and then see it with an ID of ++ * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating ++ * the device item, which does not exists on the chunk btree. ++ * The finishing phase of device replace acquires both the ++ * device_list_mutex and the chunk_mutex, in that order, so we are ++ * safe by just acquiring the chunk_mutex. ++ */ ++ trans->removing_chunk = true; ++ mutex_lock(&fs_info->chunk_mutex); ++ ++ check_system_chunk(trans, map->type); ++ ++ ret = remove_chunk_item(trans, map, chunk_offset); ++ /* ++ * Normally we should not get -ENOSPC since we reserved space before ++ * through the call to check_system_chunk(). ++ * ++ * Despite our system space_info having enough free space, we may not ++ * be able to allocate extents from its block groups, because all have ++ * an incompatible profile, which will force us to allocate a new system ++ * block group with the right profile, or right after we called ++ * check_system_space() above, a scrub turned the only system block group ++ * with enough free space into RO mode. ++ * This is explained with more detail at do_chunk_alloc(). ++ * ++ * So if we get -ENOSPC, allocate a new system chunk and retry once. 
++ */ ++ if (ret == -ENOSPC) { ++ const u64 sys_flags = btrfs_system_alloc_profile(fs_info); ++ struct btrfs_block_group *sys_bg; ++ ++ sys_bg = btrfs_alloc_chunk(trans, sys_flags); ++ if (IS_ERR(sys_bg)) { ++ ret = PTR_ERR(sys_bg); ++ btrfs_abort_transaction(trans, ret); ++ goto out; ++ } ++ ++ ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); + if (ret) { +- mutex_unlock(&fs_devices->device_list_mutex); + btrfs_abort_transaction(trans, ret); + goto out; + } +- } +- mutex_unlock(&fs_devices->device_list_mutex); + +- ret = btrfs_free_chunk(trans, chunk_offset); +- if (ret) { ++ ret = remove_chunk_item(trans, map, chunk_offset); ++ if (ret) { ++ btrfs_abort_transaction(trans, ret); ++ goto out; ++ } ++ } else if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } +@@ -3087,6 +3161,15 @@ int btrfs_remove_chunk(struct btrfs_tran + } + } + ++ mutex_unlock(&fs_info->chunk_mutex); ++ trans->removing_chunk = false; ++ ++ /* ++ * We are done with chunk btree updates and deletions, so release the ++ * system space we previously reserved (with check_system_chunk()). ++ */ ++ btrfs_trans_release_chunk_metadata(trans); ++ + ret = btrfs_remove_block_group(trans, chunk_offset, em); + if (ret) { + btrfs_abort_transaction(trans, ret); +@@ -3094,6 +3177,10 @@ int btrfs_remove_chunk(struct btrfs_tran + } + + out: ++ if (trans->removing_chunk) { ++ mutex_unlock(&fs_info->chunk_mutex); ++ trans->removing_chunk = false; ++ } + /* once for us */ + free_extent_map(em); + return ret; +@@ -4868,13 +4955,12 @@ static int btrfs_add_system_chunk(struct + u32 array_size; + u8 *ptr; + +- mutex_lock(&fs_info->chunk_mutex); ++ lockdep_assert_held(&fs_info->chunk_mutex); ++ + array_size = btrfs_super_sys_array_size(super_copy); + if (array_size + item_size + sizeof(disk_key) +- > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { +- mutex_unlock(&fs_info->chunk_mutex); ++ > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) + return -EFBIG; +- } + + ptr = super_copy->sys_chunk_array + array_size; + btrfs_cpu_key_to_disk(&disk_key, key); +@@ -4883,7 +4969,6 @@ static int btrfs_add_system_chunk(struct + memcpy(ptr, chunk, item_size); + item_size += sizeof(disk_key); + btrfs_set_super_sys_array_size(super_copy, array_size + item_size); +- mutex_unlock(&fs_info->chunk_mutex); + + return 0; + } +@@ -5233,13 +5318,14 @@ static int decide_stripe_size(struct btr + } + } + +-static int create_chunk(struct btrfs_trans_handle *trans, ++static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, + struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) + { + struct btrfs_fs_info *info = trans->fs_info; + struct map_lookup *map = NULL; + struct extent_map_tree *em_tree; ++ struct btrfs_block_group *block_group; + struct extent_map *em; + u64 start = ctl->start; + u64 type = ctl->type; +@@ -5249,7 +5335,7 @@ static int create_chunk(struct btrfs_tra + + map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); + if (!map) +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + map->num_stripes = ctl->num_stripes; + + for (i = 0; i < ctl->ndevs; ++i) { +@@ -5271,7 +5357,7 @@ static int create_chunk(struct btrfs_tra + em = alloc_extent_map(); + if (!em) { + kfree(map); +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + } + set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); + em->map_lookup = map; +@@ -5287,12 +5373,12 @@ static int create_chunk(struct btrfs_tra + if (ret) { + write_unlock(&em_tree->lock); + free_extent_map(em); +- return ret; ++ return ERR_PTR(ret); + } + write_unlock(&em_tree->lock); + +- ret = 
btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); +- if (ret) ++ block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); ++ if (IS_ERR(block_group)) + goto error_del_extent; + + for (i = 0; i < map->num_stripes; i++) { +@@ -5312,7 +5398,7 @@ static int create_chunk(struct btrfs_tra + check_raid56_incompat_flag(info, type); + check_raid1c34_incompat_flag(info, type); + +- return 0; ++ return block_group; + + error_del_extent: + write_lock(&em_tree->lock); +@@ -5324,34 +5410,36 @@ error_del_extent: + /* One for the tree reference */ + free_extent_map(em); + +- return ret; ++ return block_group; + } + +-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) ++struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ++ u64 type) + { + struct btrfs_fs_info *info = trans->fs_info; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct btrfs_device_info *devices_info = NULL; + struct alloc_chunk_ctl ctl; ++ struct btrfs_block_group *block_group; + int ret; + + lockdep_assert_held(&info->chunk_mutex); + + if (!alloc_profile_is_valid(type, 0)) { + ASSERT(0); +- return -EINVAL; ++ return ERR_PTR(-EINVAL); + } + + if (list_empty(&fs_devices->alloc_list)) { + if (btrfs_test_opt(info, ENOSPC_DEBUG)) + btrfs_debug(info, "%s: no writable device", __func__); +- return -ENOSPC; ++ return ERR_PTR(-ENOSPC); + } + + if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(info, "invalid chunk type 0x%llx requested", type); + ASSERT(0); +- return -EINVAL; ++ return ERR_PTR(-EINVAL); + } + + ctl.start = find_next_chunk(info); +@@ -5361,46 +5449,43 @@ int btrfs_alloc_chunk(struct btrfs_trans + devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), + GFP_NOFS); + if (!devices_info) +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + + ret = gather_device_info(fs_devices, &ctl, devices_info); +- if (ret < 0) ++ if (ret < 0) { ++ block_group = ERR_PTR(ret); + goto out; ++ } + + ret = decide_stripe_size(fs_devices, &ctl, devices_info); +- if (ret < 0) ++ if (ret < 0) { ++ block_group = ERR_PTR(ret); + goto out; ++ } + +- ret = create_chunk(trans, &ctl, devices_info); ++ block_group = create_chunk(trans, &ctl, devices_info); + + out: + kfree(devices_info); +- return ret; ++ return block_group; + } + + /* +- * Chunk allocation falls into two parts. The first part does work +- * that makes the new allocated chunk usable, but does not do any operation +- * that modifies the chunk tree. The second part does the work that +- * requires modifying the chunk tree. This division is important for the +- * bootstrap process of adding storage to a seed btrfs. ++ * This function, btrfs_finish_chunk_alloc(), belongs to phase 2. ++ * ++ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation ++ * phases. 
+ */ + int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, + u64 chunk_offset, u64 chunk_size) + { + struct btrfs_fs_info *fs_info = trans->fs_info; +- struct btrfs_root *extent_root = fs_info->extent_root; +- struct btrfs_root *chunk_root = fs_info->chunk_root; +- struct btrfs_key key; + struct btrfs_device *device; +- struct btrfs_chunk *chunk; +- struct btrfs_stripe *stripe; + struct extent_map *em; + struct map_lookup *map; +- size_t item_size; + u64 dev_offset; + u64 stripe_size; +- int i = 0; ++ int i; + int ret = 0; + + em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); +@@ -5408,53 +5493,117 @@ int btrfs_finish_chunk_alloc(struct btrf + return PTR_ERR(em); + + map = em->map_lookup; +- item_size = btrfs_chunk_item_size(map->num_stripes); + stripe_size = em->orig_block_len; + +- chunk = kzalloc(item_size, GFP_NOFS); +- if (!chunk) { +- ret = -ENOMEM; +- goto out; +- } +- + /* + * Take the device list mutex to prevent races with the final phase of + * a device replace operation that replaces the device object associated + * with the map's stripes, because the device object's id can change + * at any time during that final phase of the device replace operation +- * (dev-replace.c:btrfs_dev_replace_finishing()). ++ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the ++ * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, ++ * resulting in persisting a device extent item with such ID. + */ + mutex_lock(&fs_info->fs_devices->device_list_mutex); + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + dev_offset = map->stripes[i].physical; + +- ret = btrfs_update_device(trans, device); +- if (ret) +- break; + ret = btrfs_alloc_dev_extent(trans, device, chunk_offset, + dev_offset, stripe_size); + if (ret) + break; + } +- if (ret) { +- mutex_unlock(&fs_info->fs_devices->device_list_mutex); ++ mutex_unlock(&fs_info->fs_devices->device_list_mutex); ++ ++ free_extent_map(em); ++ return ret; ++} ++ ++/* ++ * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the ++ * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system ++ * chunks. ++ * ++ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation ++ * phases. ++ */ ++int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, ++ struct btrfs_block_group *bg) ++{ ++ struct btrfs_fs_info *fs_info = trans->fs_info; ++ struct btrfs_root *extent_root = fs_info->extent_root; ++ struct btrfs_root *chunk_root = fs_info->chunk_root; ++ struct btrfs_key key; ++ struct btrfs_chunk *chunk; ++ struct btrfs_stripe *stripe; ++ struct extent_map *em; ++ struct map_lookup *map; ++ size_t item_size; ++ int i; ++ int ret; ++ ++ /* ++ * We take the chunk_mutex for 2 reasons: ++ * ++ * 1) Updates and insertions in the chunk btree must be done while holding ++ * the chunk_mutex, as well as updating the system chunk array in the ++ * superblock. 
See the comment on top of btrfs_chunk_alloc() for the ++ * details; ++ * ++ * 2) To prevent races with the final phase of a device replace operation ++ * that replaces the device object associated with the map's stripes, ++ * because the device object's id can change at any time during that ++ * final phase of the device replace operation ++ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the ++ * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, ++ * which would cause a failure when updating the device item, which does ++ * not exists, or persisting a stripe of the chunk item with such ID. ++ * Here we can't use the device_list_mutex because our caller already ++ * has locked the chunk_mutex, and the final phase of device replace ++ * acquires both mutexes - first the device_list_mutex and then the ++ * chunk_mutex. Using any of those two mutexes protects us from a ++ * concurrent device replace. ++ */ ++ lockdep_assert_held(&fs_info->chunk_mutex); ++ ++ em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); ++ if (IS_ERR(em)) { ++ ret = PTR_ERR(em); ++ btrfs_abort_transaction(trans, ret); ++ return ret; ++ } ++ ++ map = em->map_lookup; ++ item_size = btrfs_chunk_item_size(map->num_stripes); ++ ++ chunk = kzalloc(item_size, GFP_NOFS); ++ if (!chunk) { ++ ret = -ENOMEM; ++ btrfs_abort_transaction(trans, ret); + goto out; + } + ++ for (i = 0; i < map->num_stripes; i++) { ++ struct btrfs_device *device = map->stripes[i].dev; ++ ++ ret = btrfs_update_device(trans, device); ++ if (ret) ++ goto out; ++ } ++ + stripe = &chunk->stripe; + for (i = 0; i < map->num_stripes; i++) { +- device = map->stripes[i].dev; +- dev_offset = map->stripes[i].physical; ++ struct btrfs_device *device = map->stripes[i].dev; ++ const u64 dev_offset = map->stripes[i].physical; + + btrfs_set_stack_stripe_devid(stripe, device->devid); + btrfs_set_stack_stripe_offset(stripe, dev_offset); + memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); + stripe++; + } +- mutex_unlock(&fs_info->fs_devices->device_list_mutex); + +- btrfs_set_stack_chunk_length(chunk, chunk_size); ++ btrfs_set_stack_chunk_length(chunk, bg->length); + btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); + btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); + btrfs_set_stack_chunk_type(chunk, map->type); +@@ -5466,15 +5615,18 @@ int btrfs_finish_chunk_alloc(struct btrf + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.type = BTRFS_CHUNK_ITEM_KEY; +- key.offset = chunk_offset; ++ key.offset = bg->start; + + ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); +- if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { +- /* +- * TODO: Cleanup of inserted chunk root in case of +- * failure. +- */ ++ if (ret) ++ goto out; ++ ++ bg->chunk_item_inserted = 1; ++ ++ if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); ++ if (ret) ++ goto out; + } + + out: +@@ -5487,16 +5639,41 @@ static noinline int init_first_rw_device + { + struct btrfs_fs_info *fs_info = trans->fs_info; + u64 alloc_profile; +- int ret; ++ struct btrfs_block_group *meta_bg; ++ struct btrfs_block_group *sys_bg; ++ ++ /* ++ * When adding a new device for sprouting, the seed device is read-only ++ * so we must first allocate a metadata and a system chunk. 
But before ++ * adding the block group items to the extent, device and chunk btrees, ++ * we must first: ++ * ++ * 1) Create both chunks without doing any changes to the btrees, as ++ * otherwise we would get -ENOSPC since the block groups from the ++ * seed device are read-only; ++ * ++ * 2) Add the device item for the new sprout device - finishing the setup ++ * of a new block group requires updating the device item in the chunk ++ * btree, so it must exist when we attempt to do it. The previous step ++ * ensures this does not fail with -ENOSPC. ++ * ++ * After that we can add the block group items to their btrees: ++ * update existing device item in the chunk btree, add a new block group ++ * item to the extent btree, add a new chunk item to the chunk btree and ++ * finally add the new device extent items to the devices btree. ++ */ + + alloc_profile = btrfs_metadata_alloc_profile(fs_info); +- ret = btrfs_alloc_chunk(trans, alloc_profile); +- if (ret) +- return ret; ++ meta_bg = btrfs_alloc_chunk(trans, alloc_profile); ++ if (IS_ERR(meta_bg)) ++ return PTR_ERR(meta_bg); + + alloc_profile = btrfs_system_alloc_profile(fs_info); +- ret = btrfs_alloc_chunk(trans, alloc_profile); +- return ret; ++ sys_bg = btrfs_alloc_chunk(trans, alloc_profile); ++ if (IS_ERR(sys_bg)) ++ return PTR_ERR(sys_bg); ++ ++ return 0; + } + + static inline int btrfs_chunk_max_errors(struct map_lookup *map) +@@ -7425,10 +7602,18 @@ int btrfs_read_chunk_tree(struct btrfs_f + total_dev++; + } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { + struct btrfs_chunk *chunk; ++ ++ /* ++ * We are only called at mount time, so no need to take ++ * fs_info->chunk_mutex. Plus, to avoid lockdep warnings, ++ * we always lock first fs_info->chunk_mutex before ++ * acquiring any locks on the chunk tree. This is a ++ * requirement for chunk allocation, see the comment on ++ * top of btrfs_chunk_alloc() for details. 
++ */ ++ ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); +- mutex_lock(&fs_info->chunk_mutex); + ret = read_one_chunk(&found_key, leaf, chunk); +- mutex_unlock(&fs_info->chunk_mutex); + if (ret) + goto error; + } +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -447,7 +447,8 @@ int btrfs_get_io_geometry(struct btrfs_f + struct btrfs_io_geometry *io_geom); + int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); + int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); +-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type); ++struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ++ u64 type); + void btrfs_mapping_tree_free(struct extent_map_tree *tree); + blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num); +@@ -506,6 +507,8 @@ unsigned long btrfs_full_stripe_len(stru + u64 logical); + int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, + u64 chunk_offset, u64 chunk_size); ++int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, ++ struct btrfs_block_group *bg); + int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); + struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); diff --git a/queue-5.13/btrfs-zoned-fix-types-for-u64-division-in-btrfs_reclaim_bgs_work.patch b/queue-5.13/btrfs-zoned-fix-types-for-u64-division-in-btrfs_reclaim_bgs_work.patch new file mode 100644 index 00000000000..c6f78bff116 --- /dev/null +++ b/queue-5.13/btrfs-zoned-fix-types-for-u64-division-in-btrfs_reclaim_bgs_work.patch @@ -0,0 +1,35 @@ +From 54afaae34ee49e98c1c902b444b42832551d090c Mon Sep 17 00:00:00 2001 +From: David Sterba +Date: Wed, 23 Jun 2021 17:54:54 +0200 +Subject: btrfs: zoned: fix types for u64 division in btrfs_reclaim_bgs_work + +From: David Sterba + +commit 54afaae34ee49e98c1c902b444b42832551d090c upstream. + +The types in calculation of the used percentage in the reclaiming +messages are both u64, though bg->length is either 1GiB (non-zoned) or +the zone size in the zoned mode. The upper limit on zone size is 8GiB so +this could theoretically overflow in the future, right now the values +fit. 
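+
+To make the truncation hazard concrete, here is a small userspace sketch
+(not part of the fix; div_u64() and div64_u64() are re-implemented below
+with the same signatures they have in the kernel's <linux/math64.h>, and
+the sizes are made up for illustration):
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  /* Userspace stand-ins mirroring the kernel helpers' signatures. */
+  static uint64_t div_u64(uint64_t dividend, uint32_t divisor)
+  {
+          return dividend / divisor;  /* divisor silently truncated to 32 bits */
+  }
+
+  static uint64_t div64_u64(uint64_t dividend, uint64_t divisor)
+  {
+          return dividend / divisor;  /* full 64-bit divisor */
+  }
+
+  int main(void)
+  {
+          uint64_t used   = 2ULL << 30;  /* 2 GiB used */
+          uint64_t length = 5ULL << 30;  /* 5 GiB zone; low 32 bits are 1 GiB */
+
+          /* Passing the u64 length as a u32 divisor drops the high bits. */
+          printf("div_u64:   %llu%%\n",
+                 (unsigned long long)div_u64(used * 100, length));
+          printf("div64_u64: %llu%%\n",
+                 (unsigned long long)div64_u64(used * 100, length));
+          return 0;
+  }
+
+With a zone size above 4 GiB the truncated divisor makes the first call
+report 200% used instead of the correct 40%, which is exactly why the
+patch switches to div64_u64().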
+
+Fixes: 18bb8bbf13c1 ("btrfs: zoned: automatically reclaim zones")
+CC: stable@vger.kernel.org # 5.13
+Reviewed-by: Johannes Thumshirn
+Signed-off-by: David Sterba
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/btrfs/block-group.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -1539,7 +1539,7 @@ void btrfs_reclaim_bgs_work(struct work_
+ 			goto next;
+
+ 		btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used",
+-				bg->start, div_u64(bg->used * 100, bg->length));
++				bg->start, div64_u64(bg->used * 100, bg->length));
+ 		trace_btrfs_reclaim_block_group(bg);
+ 		ret = btrfs_relocate_chunk(fs_info, bg->start);
+ 		if (ret)
diff --git a/queue-5.13/btrfs-zoned-fix-wrong-mutex-unlock-on-failure-to-allocate-log-root-tree.patch b/queue-5.13/btrfs-zoned-fix-wrong-mutex-unlock-on-failure-to-allocate-log-root-tree.patch
new file mode 100644
index 00000000000..2f9cbca7072
--- /dev/null
+++ b/queue-5.13/btrfs-zoned-fix-wrong-mutex-unlock-on-failure-to-allocate-log-root-tree.patch
@@ -0,0 +1,43 @@
+From ea32af47f00a046a1f953370514d6d946efe0152 Mon Sep 17 00:00:00 2001
+From: Filipe Manana
+Date: Wed, 7 Jul 2021 12:23:45 +0100
+Subject: btrfs: zoned: fix wrong mutex unlock on failure to allocate log root tree
+
+From: Filipe Manana
+
+commit ea32af47f00a046a1f953370514d6d946efe0152 upstream.
+
+When syncing the log, if we fail to allocate the root node for the log
+root tree:
+
+1) We are unlocking fs_info->tree_log_mutex, but at this point we have
+   not yet locked this mutex;
+
+2) We have locked fs_info->tree_root->log_mutex, but we end up not
+   unlocking it;
+
+So fix this by unlocking fs_info->tree_root->log_mutex instead of
+fs_info->tree_log_mutex.
+
+Fixes: e75f9fd194090e ("btrfs: zoned: move log tree node allocation out of log_root_tree->log_mutex")
+CC: stable@vger.kernel.org # 5.13+
+Reviewed-by: Nikolay Borisov
+Reviewed-by: Johannes Thumshirn
+Signed-off-by: Filipe Manana
+Signed-off-by: David Sterba
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/btrfs/tree-log.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -3173,7 +3173,7 @@ int btrfs_sync_log(struct btrfs_trans_ha
+ 	if (!log_root_tree->node) {
+ 		ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
+ 		if (ret) {
+-			mutex_unlock(&fs_info->tree_log_mutex);
++			mutex_unlock(&fs_info->tree_root->log_mutex);
+ 			goto out;
+ 		}
+ 	}
diff --git a/queue-5.13/cgroup-verify-that-source-is-a-string.patch b/queue-5.13/cgroup-verify-that-source-is-a-string.patch
new file mode 100644
index 00000000000..36955203579
--- /dev/null
+++ b/queue-5.13/cgroup-verify-that-source-is-a-string.patch
@@ -0,0 +1,64 @@
+From 3b0462726e7ef281c35a7a4ae33e93ee2bc9975b Mon Sep 17 00:00:00 2001
+From: Christian Brauner
+Date: Wed, 14 Jul 2021 15:47:49 +0200
+Subject: cgroup: verify that source is a string
+
+From: Christian Brauner
+
+commit 3b0462726e7ef281c35a7a4ae33e93ee2bc9975b upstream.
+
+The following sequence can be used to trigger a UAF:
+
+  int fscontext_fd = fsopen("cgroup", 0);
+  int fd_null = open("/dev/null", O_RDONLY);
+  fsconfig(fscontext_fd, FSCONFIG_SET_FD, "source", NULL, fd_null);
+  close_range(3, ~0U, 0);
+
+The cgroup v1 specific fs parser expects a string for the "source"
+parameter. However, it is perfectly legitimate to e.g. specify a file
+descriptor for the "source" parameter. The fs parser doesn't know what
+a filesystem allows there.
So it's a bug to assume that "source" is
+always of type fs_value_is_string when it can reasonably also be
+fs_value_is_file.
+
+This assumption in the cgroup code causes a UAF because struct
+fs_parameter uses a union for the actual value. Access to that union is
+guarded by the param->type member. Since the cgroup parameter parser
+didn't check param->type but unconditionally moved param->string into
+fc->source, a close on the fscontext_fd would trigger a UAF during
+put_fs_context(), which frees fc->source, thereby freeing the file stashed
+in param->file and causing a UAF during a close of the fd_null.
+
+Fix this by verifying that param->type is actually a string and reporting
+an error if not.
+
+In follow-up patches I'll add a new generic helper that can be used here
+and by other filesystems instead of this error-prone copy-pasta fix.
+But fixing it in here first makes backporting it to stable a lot easier.
+
+Fixes: 8d2451f4994f ("cgroup1: switch to option-by-option parsing")
+Reported-by: syzbot+283ce5a46486d6acdbaf@syzkaller.appspotmail.com
+Cc: Christoph Hellwig
+Cc: Alexander Viro
+Cc: Dmitry Vyukov
+Cc: 
+Cc: syzkaller-bugs
+Signed-off-by: Christian Brauner
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+---
+ kernel/cgroup/cgroup-v1.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/kernel/cgroup/cgroup-v1.c
++++ b/kernel/cgroup/cgroup-v1.c
+@@ -912,6 +912,8 @@ int cgroup1_parse_param(struct fs_contex
+ 	opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
+ 	if (opt == -ENOPARAM) {
+ 		if (strcmp(param->key, "source") == 0) {
++			if (param->type != fs_value_is_string)
++				return invalf(fc, "Non-string source");
+ 			if (fc->source)
+ 				return invalf(fc, "Multiple sources not supported");
+ 			fc->source = param->string;
diff --git a/queue-5.13/drm-amdgpu-add-another-renoir-did.patch b/queue-5.13/drm-amdgpu-add-another-renoir-did.patch
new file mode 100644
index 00000000000..462ac881d26
--- /dev/null
+++ b/queue-5.13/drm-amdgpu-add-another-renoir-did.patch
@@ -0,0 +1,31 @@
+From 775da83005cb61d4c213c636df9337da05714ff1 Mon Sep 17 00:00:00 2001
+From: Jinzhou Su
+Date: Tue, 13 Jul 2021 09:26:11 +0800
+Subject: drm/amdgpu: add another Renoir DID
+
+From: Jinzhou Su
+
+commit 775da83005cb61d4c213c636df9337da05714ff1 upstream.
+
+Add a new PCI device ID.
+ +Signed-off-by: Jinzhou Su +Reviewed-by: Huang Rui +Reviewed-by: Alex Deucher +Signed-off-by: Alex Deucher +Cc: stable@vger.kernel.org # 5.11.x +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +@@ -1148,6 +1148,7 @@ static const struct pci_device_id pciidl + {0x1002, 0x734F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14}, + + /* Renoir */ ++ {0x1002, 0x15E7, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU}, + {0x1002, 0x1636, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU}, + {0x1002, 0x1638, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU}, + {0x1002, 0x164C, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU}, diff --git a/queue-5.13/drm-i915-gt-fix-edeadlk-handling-regression.patch b/queue-5.13/drm-i915-gt-fix-edeadlk-handling-regression.patch new file mode 100644 index 00000000000..e1f0c7ff978 --- /dev/null +++ b/queue-5.13/drm-i915-gt-fix-edeadlk-handling-regression.patch @@ -0,0 +1,60 @@ +From 2feeb52859fc1ab94cd35b61ada3a6ac4ff24243 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= +Date: Wed, 30 Jun 2021 19:44:13 +0300 +Subject: drm/i915/gt: Fix -EDEADLK handling regression +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Ville Syrjälä + +commit 2feeb52859fc1ab94cd35b61ada3a6ac4ff24243 upstream. + +The conversion to ww mutexes failed to address the fence code which +already returns -EDEADLK when we run out of fences. Ww mutexes on +the other hand treat -EDEADLK as an internal errno value indicating +a need to restart the operation due to a deadlock. So now when the +fence code returns -EDEADLK the higher level code erroneously +restarts everything instead of returning the error to userspace +as is expected. + +To remedy this let's switch the fence code to use a different errno +value for this. -ENOBUFS seems like a semi-reasonable unique choice. +Apart from igt the only user of this I could find is sna, and even +there all we do is dump the current fence registers from debugfs +into the X server log. So no user visible functionality is affected. +If we really cared about preserving this we could of course convert +back to -EDEADLK higher up, but doesn't seem like that's worth +the hassle here. + +Not quite sure which commit specifically broke this, but I'll +just attribute it to the general gem ww mutex work. 
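+
+For context, here is a minimal sketch of the usual ww_mutex acquire and
+backoff loop (the generic pattern from the kernel's
+Documentation/locking/ww-mutex-design.rst, not actual i915 code;
+demo_ww_class and lock_pair() are made up for illustration). To this
+machinery -EDEADLK only ever means "back off and retry", so a
+fence-exhaustion -EDEADLK leaking into such a loop restarts it instead
+of reaching userspace:
+
+  static DEFINE_WW_CLASS(demo_ww_class);
+
+  static void lock_pair(struct ww_mutex *a, struct ww_mutex *b)
+  {
+          struct ww_acquire_ctx ctx;
+
+          ww_acquire_init(&ctx, &demo_ww_class);
+
+          /* The first lock cannot deadlock: nothing is held yet. */
+          ww_mutex_lock(a, &ctx);
+          while (ww_mutex_lock(b, &ctx) == -EDEADLK) {
+                  /*
+                   * Back off: drop what we hold, sleep on the contended
+                   * lock, then retry in the opposite order.
+                   */
+                  ww_mutex_unlock(a);
+                  ww_mutex_lock_slow(b, &ctx);
+                  swap(a, b);
+          }
+          ww_acquire_done(&ctx);
+
+          /* ... use the locked objects ... */
+
+          ww_mutex_unlock(a);
+          ww_mutex_unlock(b);
+          ww_acquire_fini(&ctx);
+  }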
+
+Cc: stable@vger.kernel.org
+Cc: Maarten Lankhorst
+Cc: Thomas Hellström
+Testcase: igt/gem_pread/exhaustion
+Testcase: igt/gem_pwrite/basic-exhaustion
+Testcase: igt/gem_fenced_exec_thrash/too-many-fences
+Fixes: 80f0b679d6f0 ("drm/i915: Add an implementation for i915_gem_ww_ctx locking, v2.")
+Signed-off-by: Ville Syrjälä
+Link: https://patchwork.freedesktop.org/patch/msgid/20210630164413.25481-1-ville.syrjala@linux.intel.com
+Reviewed-by: Maarten Lankhorst
+(cherry picked from commit 78d2ad7eb4e1f0e9cd5d79788446b6092c21d3e0)
+Signed-off-by: Rodrigo Vivi
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c
++++ b/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c
+@@ -348,7 +348,7 @@ static struct i915_fence_reg *fence_find
+ 	if (intel_has_pending_fb_unpin(ggtt->vm.i915))
+ 		return ERR_PTR(-EAGAIN);
+
+-	return ERR_PTR(-EDEADLK);
++	return ERR_PTR(-ENOBUFS);
+ }
+
+ int __i915_vma_pin_fence(struct i915_vma *vma)
diff --git a/queue-5.13/drm-i915-gtt-drop-the-page-table-optimisation.patch b/queue-5.13/drm-i915-gtt-drop-the-page-table-optimisation.patch
new file mode 100644
index 00000000000..8dfb4b68d38
--- /dev/null
+++ b/queue-5.13/drm-i915-gtt-drop-the-page-table-optimisation.patch
@@ -0,0 +1,55 @@
+From 0abb33bfca0fb74df76aac03e90ce685016ef7be Mon Sep 17 00:00:00 2001
+From: Matthew Auld
+Date: Tue, 13 Jul 2021 14:04:31 +0100
+Subject: drm/i915/gtt: drop the page table optimisation
+
+From: Matthew Auld
+
+commit 0abb33bfca0fb74df76aac03e90ce685016ef7be upstream.
+
+We skip filling out the pt with scratch entries if the va range covers
+the entire pt, since we later have to fill it with the PTEs for the
+object pages anyway. However this might leave open a small window where
+the PTEs don't point to anything valid for the HW to consume.
+
+When for example using 2M GTT pages this fill_px() showed up as being
+quite significant in perf measurements, and ends up being completely
+wasted since we ignore the pt and just use the pde directly.
+
+Anyway, currently we have our PTE construction split between alloc and
+insert, which is probably slightly iffy nowadays, since the alloc
+doesn't actually allocate anything anymore, instead it just sets up the
+page directories and points the PTEs at the scratch page. Later when we
+do the insert step we re-program the PTEs again. Better might be to
+squash the alloc and insert into a single step, then bringing back this
+optimisation (along with some others) should be possible.
+ +Fixes: 14826673247e ("drm/i915: Only initialize partially filled pagetables") +Signed-off-by: Matthew Auld +Cc: Jon Bloomfield +Cc: Chris Wilson +Cc: Daniel Vetter +Cc: # v4.15+ +Reviewed-by: Daniel Vetter +Link: https://patchwork.freedesktop.org/patch/msgid/20210713130431.2392740-1-matthew.auld@intel.com +(cherry picked from commit 8f88ca76b3942d82e2c1cea8735ec368d89ecc15) +Signed-off-by: Rodrigo Vivi +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +--- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c ++++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c +@@ -304,10 +304,7 @@ static void __gen8_ppgtt_alloc(struct i9 + __i915_gem_object_pin_pages(pt->base); + i915_gem_object_make_unshrinkable(pt->base); + +- if (lvl || +- gen8_pt_count(*start, end) < I915_PDES || +- intel_vgpu_active(vm->i915)) +- fill_px(pt, vm->scratch[lvl]->encode); ++ fill_px(pt, vm->scratch[lvl]->encode); + + spin_lock(&pd->lock); + if (likely(!pd->entry[idx])) { diff --git a/queue-5.13/edac-igen6-fix-core-dependency-again.patch b/queue-5.13/edac-igen6-fix-core-dependency-again.patch new file mode 100644 index 00000000000..18ec5268bca --- /dev/null +++ b/queue-5.13/edac-igen6-fix-core-dependency-again.patch @@ -0,0 +1,38 @@ +From a1c9ca5f65c9acfd7c02474b9d5cacbd7ea288df Mon Sep 17 00:00:00 2001 +From: Randy Dunlap +Date: Thu, 15 Jul 2021 11:55:31 -0700 +Subject: EDAC/igen6: fix core dependency AGAIN + +From: Randy Dunlap + +commit a1c9ca5f65c9acfd7c02474b9d5cacbd7ea288df upstream. + +My previous patch had a typo/thinko which prevents this driver +from being enabled: change X64_64 to X86_64. + +Fixes: 0a9ece9ba154 ("EDAC/igen6: fix core dependency") +Signed-off-by: Randy Dunlap +Cc: Qiuxu Zhuo +Cc: Borislav Petkov +Cc: Mauro Carvalho Chehab +Cc: linux-edac@vger.kernel.org +Cc: bowsingbetee +Cc: stable@vger.kernel.org +Signed-off-by: Tony Luck +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + drivers/edac/Kconfig | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/edac/Kconfig ++++ b/drivers/edac/Kconfig +@@ -271,7 +271,7 @@ config EDAC_PND2 + config EDAC_IGEN6 + tristate "Intel client SoC Integrated MC" + depends on PCI && PCI_MMCONFIG && ARCH_HAVE_NMI_SAFE_CMPXCHG +- depends on X64_64 && X86_MCE_INTEL ++ depends on X86_64 && X86_MCE_INTEL + help + Support for error detection and correction on the Intel + client SoC Integrated Memory Controller using In-Band ECC IP. diff --git a/queue-5.13/fbmem-do-not-delete-the-mode-that-is-still-in-use.patch b/queue-5.13/fbmem-do-not-delete-the-mode-that-is-still-in-use.patch new file mode 100644 index 00000000000..4bb50b45e21 --- /dev/null +++ b/queue-5.13/fbmem-do-not-delete-the-mode-that-is-still-in-use.patch @@ -0,0 +1,85 @@ +From 0af778269a522c988ef0b4188556aba97fb420cc Mon Sep 17 00:00:00 2001 +From: Zhen Lei +Date: Mon, 12 Jul 2021 16:55:44 +0800 +Subject: fbmem: Do not delete the mode that is still in use + +From: Zhen Lei + +commit 0af778269a522c988ef0b4188556aba97fb420cc upstream. + +The execution of fb_delete_videomode() is not based on the result of the +previous fbcon_mode_deleted(). As a result, the mode is directly deleted, +regardless of whether it is still in use, which may cause UAF. 
+ +================================================================== +BUG: KASAN: use-after-free in fb_mode_is_equal+0x36e/0x5e0 \ +drivers/video/fbdev/core/modedb.c:924 +Read of size 4 at addr ffff88807e0ddb1c by task syz-executor.0/18962 + +CPU: 2 PID: 18962 Comm: syz-executor.0 Not tainted 5.10.45-rc1+ #3 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ... +Call Trace: + __dump_stack lib/dump_stack.c:77 [inline] + dump_stack+0x137/0x1be lib/dump_stack.c:118 + print_address_description+0x6c/0x640 mm/kasan/report.c:385 + __kasan_report mm/kasan/report.c:545 [inline] + kasan_report+0x13d/0x1e0 mm/kasan/report.c:562 + fb_mode_is_equal+0x36e/0x5e0 drivers/video/fbdev/core/modedb.c:924 + fbcon_mode_deleted+0x16a/0x220 drivers/video/fbdev/core/fbcon.c:2746 + fb_set_var+0x1e1/0xdb0 drivers/video/fbdev/core/fbmem.c:975 + do_fb_ioctl+0x4d9/0x6e0 drivers/video/fbdev/core/fbmem.c:1108 + vfs_ioctl fs/ioctl.c:48 [inline] + __do_sys_ioctl fs/ioctl.c:753 [inline] + __se_sys_ioctl+0xfb/0x170 fs/ioctl.c:739 + do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +Freed by task 18960: + kasan_save_stack mm/kasan/common.c:48 [inline] + kasan_set_track+0x3d/0x70 mm/kasan/common.c:56 + kasan_set_free_info+0x17/0x30 mm/kasan/generic.c:355 + __kasan_slab_free+0x108/0x140 mm/kasan/common.c:422 + slab_free_hook mm/slub.c:1541 [inline] + slab_free_freelist_hook+0xd6/0x1a0 mm/slub.c:1574 + slab_free mm/slub.c:3139 [inline] + kfree+0xca/0x3d0 mm/slub.c:4121 + fb_delete_videomode+0x56a/0x820 drivers/video/fbdev/core/modedb.c:1104 + fb_set_var+0x1f3/0xdb0 drivers/video/fbdev/core/fbmem.c:978 + do_fb_ioctl+0x4d9/0x6e0 drivers/video/fbdev/core/fbmem.c:1108 + vfs_ioctl fs/ioctl.c:48 [inline] + __do_sys_ioctl fs/ioctl.c:753 [inline] + __se_sys_ioctl+0xfb/0x170 fs/ioctl.c:739 + do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +Fixes: 13ff178ccd6d ("fbcon: Call fbcon_mode_deleted/new_modelist directly") +Signed-off-by: Zhen Lei +Cc: # v5.3+ +Signed-off-by: Daniel Vetter +Link: https://patchwork.freedesktop.org/patch/msgid/20210712085544.2828-1-thunder.leizhen@huawei.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/video/fbdev/core/fbmem.c | 12 +++++------- + 1 file changed, 5 insertions(+), 7 deletions(-) + +--- a/drivers/video/fbdev/core/fbmem.c ++++ b/drivers/video/fbdev/core/fbmem.c +@@ -970,13 +970,11 @@ fb_set_var(struct fb_info *info, struct + fb_var_to_videomode(&mode2, &info->var); + /* make sure we don't delete the videomode of current var */ + ret = fb_mode_is_equal(&mode1, &mode2); +- +- if (!ret) +- fbcon_mode_deleted(info, &mode1); +- +- if (!ret) +- fb_delete_videomode(&mode1, &info->modelist); +- ++ if (!ret) { ++ ret = fbcon_mode_deleted(info, &mode1); ++ if (!ret) ++ fb_delete_videomode(&mode1, &info->modelist); ++ } + + return ret ? -EINVAL : 0; + } diff --git a/queue-5.13/io_uring-use-right-task-for-exiting-checks.patch b/queue-5.13/io_uring-use-right-task-for-exiting-checks.patch new file mode 100644 index 00000000000..65979ccdfbc --- /dev/null +++ b/queue-5.13/io_uring-use-right-task-for-exiting-checks.patch @@ -0,0 +1,35 @@ +From 9c6882608bce249a8918744ecdb65748534e3f17 Mon Sep 17 00:00:00 2001 +From: Pavel Begunkov +Date: Sat, 10 Jul 2021 02:45:59 +0100 +Subject: io_uring: use right task for exiting checks + +From: Pavel Begunkov + +commit 9c6882608bce249a8918744ecdb65748534e3f17 upstream. 
+
+When we use delayed_work for fallback execution of requests, current
+will not be the submitter task, and so checks in io_req_task_submit()
+may not behave as expected. Currently, it leaves inline completions not
+flushed, making io_ring_exit_work() hang. Use the submitter task for
+all those checks.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Pavel Begunkov
+Link: https://lore.kernel.org/r/cb413c715bed0bc9c98b169059ea9c8a2c770715.1625881431.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/io_uring.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -2040,7 +2040,7 @@ static void __io_req_task_submit(struct
+
+ 	/* ctx stays valid until unlock, even if we drop all ours ctx->refs */
+ 	mutex_lock(&ctx->uring_lock);
+-	if (!(current->flags & PF_EXITING) && !current->in_execve)
++	if (!(req->task->flags & PF_EXITING) && !req->task->in_execve)
+ 		__io_queue_sqe(req);
+ 	else
+ 		io_req_complete_failed(req, -EFAULT);
diff --git a/queue-5.13/iommu-vt-d-fix-clearing-real-dma-device-s-scalable-mode-context-entries.patch b/queue-5.13/iommu-vt-d-fix-clearing-real-dma-device-s-scalable-mode-context-entries.patch
new file mode 100644
index 00000000000..a81432bf801
--- /dev/null
+++ b/queue-5.13/iommu-vt-d-fix-clearing-real-dma-device-s-scalable-mode-context-entries.patch
@@ -0,0 +1,47 @@
+From 474dd1c6506411752a9b2f2233eec11f1733a099 Mon Sep 17 00:00:00 2001
+From: Lu Baolu
+Date: Mon, 12 Jul 2021 15:17:12 +0800
+Subject: iommu/vt-d: Fix clearing real DMA device's scalable-mode context entries
+
+From: Lu Baolu
+
+commit 474dd1c6506411752a9b2f2233eec11f1733a099 upstream.
+
+Commit 2b0140c69637e ("iommu/vt-d: Use pci_real_dma_dev() for mapping")
+fixes an issue of "sub-device is removed where the context entry is cleared
+for all aliases". But this commit didn't consider the PASID entry and PASID
+table in VT-d scalable mode. This fix increases the coverage of scalable
+mode.
+ +Suggested-by: Sanjay Kumar +Fixes: 8038bdb855331 ("iommu/vt-d: Only clear real DMA device's context entries") +Fixes: 2b0140c69637e ("iommu/vt-d: Use pci_real_dma_dev() for mapping") +Cc: stable@vger.kernel.org # v5.6+ +Cc: Jon Derrick +Signed-off-by: Lu Baolu +Link: https://lore.kernel.org/r/20210712071712.3416949-1-baolu.lu@linux.intel.com +Signed-off-by: Joerg Roedel +Signed-off-by: Greg Kroah-Hartman +--- + drivers/iommu/intel/iommu.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +--- a/drivers/iommu/intel/iommu.c ++++ b/drivers/iommu/intel/iommu.c +@@ -4483,14 +4483,13 @@ static void __dmar_remove_one_dev_info(s + iommu = info->iommu; + domain = info->domain; + +- if (info->dev) { ++ if (info->dev && !dev_is_real_dma_subdevice(info->dev)) { + if (dev_is_pci(info->dev) && sm_supported(iommu)) + intel_pasid_tear_down_entry(iommu, info->dev, + PASID_RID2PASID, false); + + iommu_disable_dev_iotlb(info); +- if (!dev_is_real_dma_subdevice(info->dev)) +- domain_context_clear(info); ++ domain_context_clear(info); + intel_pasid_free_table(info->dev); + } + diff --git a/queue-5.13/iommu-vt-d-global-devtlb-flush-when-present-context-entry-changed.patch b/queue-5.13/iommu-vt-d-global-devtlb-flush-when-present-context-entry-changed.patch new file mode 100644 index 00000000000..9b89fd32a3f --- /dev/null +++ b/queue-5.13/iommu-vt-d-global-devtlb-flush-when-present-context-entry-changed.patch @@ -0,0 +1,107 @@ +From 37764b952e1b39053defc7ebe5dcd8c4e3e78de9 Mon Sep 17 00:00:00 2001 +From: Sanjay Kumar +Date: Mon, 12 Jul 2021 15:13:15 +0800 +Subject: iommu/vt-d: Global devTLB flush when present context entry changed + +From: Sanjay Kumar + +commit 37764b952e1b39053defc7ebe5dcd8c4e3e78de9 upstream. + +This fixes a bug in context cache clear operation. The code was not +following the correct invalidation flow. A global device TLB invalidation +should be added after the IOTLB invalidation. At the same time, it +uses the domain ID from the context entry. But in scalable mode, the +domain ID is in PASID table entry, not context entry. 
+ +Fixes: 7373a8cc38197 ("iommu/vt-d: Setup context and enable RID2PASID support") +Cc: stable@vger.kernel.org # v5.0+ +Signed-off-by: Sanjay Kumar +Signed-off-by: Lu Baolu +Link: https://lore.kernel.org/r/20210712071315.3416543-1-baolu.lu@linux.intel.com +Signed-off-by: Joerg Roedel +Signed-off-by: Greg Kroah-Hartman +--- + drivers/iommu/intel/iommu.c | 31 ++++++++++++++++++++++--------- + 1 file changed, 22 insertions(+), 9 deletions(-) + +--- a/drivers/iommu/intel/iommu.c ++++ b/drivers/iommu/intel/iommu.c +@@ -2434,10 +2434,11 @@ __domain_mapping(struct dmar_domain *dom + return 0; + } + +-static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn) ++static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) + { +- unsigned long flags; ++ struct intel_iommu *iommu = info->iommu; + struct context_entry *context; ++ unsigned long flags; + u16 did_old; + + if (!iommu) +@@ -2449,7 +2450,16 @@ static void domain_context_clear_one(str + spin_unlock_irqrestore(&iommu->lock, flags); + return; + } +- did_old = context_domain_id(context); ++ ++ if (sm_supported(iommu)) { ++ if (hw_pass_through && domain_type_is_si(info->domain)) ++ did_old = FLPT_DEFAULT_DID; ++ else ++ did_old = info->domain->iommu_did[iommu->seq_id]; ++ } else { ++ did_old = context_domain_id(context); ++ } ++ + context_clear_entry(context); + __iommu_flush_cache(iommu, context, sizeof(*context)); + spin_unlock_irqrestore(&iommu->lock, flags); +@@ -2467,6 +2477,8 @@ static void domain_context_clear_one(str + 0, + 0, + DMA_TLB_DSI_FLUSH); ++ ++ __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); + } + + static inline void unlink_domain_info(struct device_domain_info *info) +@@ -4436,9 +4448,9 @@ out_free_dmar: + + static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) + { +- struct intel_iommu *iommu = opaque; ++ struct device_domain_info *info = opaque; + +- domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff); ++ domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); + return 0; + } + +@@ -4448,12 +4460,13 @@ static int domain_context_clear_one_cb(s + * devices, unbinding the driver from any one of them will possibly leave + * the others unable to operate. 
+ */
+-static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
++static void domain_context_clear(struct device_domain_info *info)
+ {
+-	if (!iommu || !dev || !dev_is_pci(dev))
++	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
+ 		return;
+
+-	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
++	pci_for_each_dma_alias(to_pci_dev(info->dev),
++			       &domain_context_clear_one_cb, info);
+ }
+
+ static void __dmar_remove_one_dev_info(struct device_domain_info *info)
+@@ -4477,7 +4490,7 @@ static void __dmar_remove_one_dev_info(s
+
+ 	iommu_disable_dev_iotlb(info);
+ 	if (!dev_is_real_dma_subdevice(info->dev))
+-		domain_context_clear(iommu, info->dev);
++		domain_context_clear(info);
+ 	intel_pasid_free_table(info->dev);
+ }
+
diff --git a/queue-5.13/mm-hugetlb-fix-refs-calculation-from-unaligned-vaddr.patch b/queue-5.13/mm-hugetlb-fix-refs-calculation-from-unaligned-vaddr.patch
new file mode 100644
index 00000000000..92ff94cf323
--- /dev/null
+++ b/queue-5.13/mm-hugetlb-fix-refs-calculation-from-unaligned-vaddr.patch
@@ -0,0 +1,45 @@
+From d08af0a59684e18a51aa4bfd24c658994ea3fc5b Mon Sep 17 00:00:00 2001
+From: Joao Martins
+Date: Wed, 14 Jul 2021 21:27:11 -0700
+Subject: mm/hugetlb: fix refs calculation from unaligned @vaddr
+
+From: Joao Martins
+
+commit d08af0a59684e18a51aa4bfd24c658994ea3fc5b upstream.
+
+Commit 82e5d378b0e47 ("mm/hugetlb: refactor subpage recording")
+refactored the count of subpages but missed an edge case when @vaddr is
+not aligned to PAGE_SIZE e.g. when close to vma->vm_end. It would then
+erroneously set @refs to 0 and record_subpages_vmas() wouldn't set the
+@pages array element to its value, consequently causing the reported
+null-deref by syzbot.
+
+Fix it by aligning down @vaddr by PAGE_SIZE in @refs calculation.
+
+Link: https://lkml.kernel.org/r/20210713152440.28650-1-joao.m.martins@oracle.com
+Fixes: 82e5d378b0e47 ("mm/hugetlb: refactor subpage recording")
+Reported-by: syzbot+a3fcd59df1b372066f5a@syzkaller.appspotmail.com
+Signed-off-by: Joao Martins
+Reviewed-by: Mike Kravetz
+Cc: 
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+---
+ mm/hugetlb.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -5216,8 +5216,9 @@ long follow_hugetlb_page(struct mm_struc
+ 			continue;
+ 		}
+
+-		refs = min3(pages_per_huge_page(h) - pfn_offset,
+-			    (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
++		/* vaddr may not be aligned to PAGE_SIZE */
++		refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
++			    (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
+
+ 		if (pages || vmas)
+ 			record_subpages_vmas(mem_map_offset(page, pfn_offset),
diff --git a/queue-5.13/scsi-core-fix-bad-pointer-dereference-when-ehandler-kthread-is-invalid.patch b/queue-5.13/scsi-core-fix-bad-pointer-dereference-when-ehandler-kthread-is-invalid.patch
new file mode 100644
index 00000000000..a3c101b8943
--- /dev/null
+++ b/queue-5.13/scsi-core-fix-bad-pointer-dereference-when-ehandler-kthread-is-invalid.patch
@@ -0,0 +1,96 @@
+From 93aa71ad7379900e61c8adff6a710a4c18c7c99b Mon Sep 17 00:00:00 2001
+From: Tyrel Datwyler
+Date: Thu, 1 Jul 2021 13:56:59 -0600
+Subject: scsi: core: Fix bad pointer dereference when ehandler kthread is invalid
+
+From: Tyrel Datwyler
+
+commit 93aa71ad7379900e61c8adff6a710a4c18c7c99b upstream.
+ +Commit 66a834d09293 ("scsi: core: Fix error handling of scsi_host_alloc()") +changed the allocation logic to call put_device() to perform host cleanup +with the assumption that IDA removal and stopping the kthread would +properly be performed in scsi_host_dev_release(). However, in the unlikely +case that the error handler thread fails to spawn, shost->ehandler is set +to ERR_PTR(-ENOMEM). + +The error handler cleanup code in scsi_host_dev_release() will call +kthread_stop() if shost->ehandler != NULL which will always be the case +whether the kthread was successfully spawned or not. In the case that it +failed to spawn this has the nasty side effect of trying to dereference an +invalid pointer when kthread_stop() is called. The following splat provides +an example of this behavior in the wild: + +scsi host11: error handler thread failed to spawn, error = -4 +Kernel attempted to read user page (10c) - exploit attempt? (uid: 0) +BUG: Kernel NULL pointer dereference on read at 0x0000010c +Faulting instruction address: 0xc00000000818e9a8 +Oops: Kernel access of bad area, sig: 11 [#1] +LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries +Modules linked in: ibmvscsi(+) scsi_transport_srp dm_multipath dm_mirror dm_region + hash dm_log dm_mod fuse overlay squashfs loop +CPU: 12 PID: 274 Comm: systemd-udevd Not tainted 5.13.0-rc7 #1 +NIP: c00000000818e9a8 LR: c0000000089846e8 CTR: 0000000000007ee8 +REGS: c000000037d12ea0 TRAP: 0300 Not tainted (5.13.0-rc7) +MSR: 800000000280b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> CR: 28228228 +XER: 20040001 +CFAR: c0000000089846e4 DAR: 000000000000010c DSISR: 40000000 IRQMASK: 0 +GPR00: c0000000089846e8 c000000037d13140 c000000009cc1100 fffffffffffffffc +GPR04: 0000000000000001 0000000000000000 0000000000000000 c000000037dc0000 +GPR08: 0000000000000000 c000000037dc0000 0000000000000001 00000000fffff7ff +GPR12: 0000000000008000 c00000000a049000 c000000037d13d00 000000011134d5a0 +GPR16: 0000000000001740 c0080000190d0000 c0080000190d1740 c000000009129288 +GPR20: c000000037d13bc0 0000000000000001 c000000037d13bc0 c0080000190b7898 +GPR24: c0080000190b7708 0000000000000000 c000000033bb2c48 0000000000000000 +GPR28: c000000046b28280 0000000000000000 000000000000010c fffffffffffffffc +NIP [c00000000818e9a8] kthread_stop+0x38/0x230 +LR [c0000000089846e8] scsi_host_dev_release+0x98/0x160 +Call Trace: +[c000000033bb2c48] 0xc000000033bb2c48 (unreliable) +[c0000000089846e8] scsi_host_dev_release+0x98/0x160 +[c00000000891e960] device_release+0x60/0x100 +[c0000000087e55c4] kobject_release+0x84/0x210 +[c00000000891ec78] put_device+0x28/0x40 +[c000000008984ea4] scsi_host_alloc+0x314/0x430 +[c0080000190b38bc] ibmvscsi_probe+0x54/0xad0 [ibmvscsi] +[c000000008110104] vio_bus_probe+0xa4/0x4b0 +[c00000000892a860] really_probe+0x140/0x680 +[c00000000892aefc] driver_probe_device+0x15c/0x200 +[c00000000892b63c] device_driver_attach+0xcc/0xe0 +[c00000000892b740] __driver_attach+0xf0/0x200 +[c000000008926f28] bus_for_each_dev+0xa8/0x130 +[c000000008929ce4] driver_attach+0x34/0x50 +[c000000008928fc0] bus_add_driver+0x1b0/0x300 +[c00000000892c798] driver_register+0x98/0x1a0 +[c00000000810eb60] __vio_register_driver+0x80/0xe0 +[c0080000190b4a30] ibmvscsi_module_init+0x9c/0xdc [ibmvscsi] +[c0000000080121d0] do_one_initcall+0x60/0x2d0 +[c000000008261abc] do_init_module+0x7c/0x320 +[c000000008265700] load_module+0x2350/0x25b0 +[c000000008265cb4] __do_sys_finit_module+0xd4/0x160 +[c000000008031110] system_call_exception+0x150/0x2d0 +[c00000000800d35c] system_call_common+0xec/0x278 + +Fix this 
by nulling shost->ehandler when the kthread fails to spawn.
+
+Link: https://lore.kernel.org/r/20210701195659.3185475-1-tyreld@linux.ibm.com
+Fixes: 66a834d09293 ("scsi: core: Fix error handling of scsi_host_alloc()")
+Cc: stable@vger.kernel.org
+Reviewed-by: Ming Lei
+Signed-off-by: Tyrel Datwyler
+Signed-off-by: Martin K. Petersen
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/scsi/hosts.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/scsi/hosts.c
++++ b/drivers/scsi/hosts.c
+@@ -485,6 +485,7 @@ struct Scsi_Host *scsi_host_alloc(struct
+ 		shost_printk(KERN_WARNING, shost,
+ 			"error handler thread failed to spawn, error = %ld\n",
+ 			PTR_ERR(shost->ehandler));
++		shost->ehandler = NULL;
+ 		goto fail;
+ 	}
+
diff --git a/queue-5.13/scsi-zfcp-report-port-fc_security-as-unknown-early-during-remote-cable-pull.patch b/queue-5.13/scsi-zfcp-report-port-fc_security-as-unknown-early-during-remote-cable-pull.patch
new file mode 100644
index 00000000000..c0b4beadf8c
--- /dev/null
+++ b/queue-5.13/scsi-zfcp-report-port-fc_security-as-unknown-early-during-remote-cable-pull.patch
@@ -0,0 +1,38 @@
+From 8b3bdd99c092bbaeaa7d9eecb1a3e5dc9112002b Mon Sep 17 00:00:00 2001
+From: Steffen Maier
+Date: Fri, 2 Jul 2021 18:09:22 +0200
+Subject: scsi: zfcp: Report port fc_security as unknown early during remote cable pull
+
+From: Steffen Maier
+
+commit 8b3bdd99c092bbaeaa7d9eecb1a3e5dc9112002b upstream.
+
+On remote cable pull, a zfcp_port keeps its status and only gets
+ZFCP_STATUS_PORT_LINK_TEST added. Only after an ADISC timeout, we would
+actually start port recovery and remove ZFCP_STATUS_COMMON_UNBLOCKED which
+zfcp_sysfs_port_fc_security_show() detected and reported as "unknown"
+instead of the old and possibly stale zfcp_port->connection_info.
+
+Add a check for ZFCP_STATUS_PORT_LINK_TEST for a timely "unknown" report.
+
+Link: https://lore.kernel.org/r/20210702160922.2667874-1-maier@linux.ibm.com
+Fixes: a17c78460093 ("scsi: zfcp: report FC Endpoint Security in sysfs")
+Cc: #5.7+
+Reviewed-by: Benjamin Block
+Signed-off-by: Steffen Maier
+Signed-off-by: Martin K.
Petersen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/s390/scsi/zfcp_sysfs.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/s390/scsi/zfcp_sysfs.c ++++ b/drivers/s390/scsi/zfcp_sysfs.c +@@ -487,6 +487,7 @@ static ssize_t zfcp_sysfs_port_fc_securi + if (0 == (status & ZFCP_STATUS_COMMON_OPEN) || + 0 == (status & ZFCP_STATUS_COMMON_UNBLOCKED) || + 0 == (status & ZFCP_STATUS_PORT_PHYS_OPEN) || ++ 0 != (status & ZFCP_STATUS_PORT_LINK_TEST) || + 0 != (status & ZFCP_STATUS_COMMON_ERP_FAILED) || + 0 != (status & ZFCP_STATUS_COMMON_ACCESS_BOXED)) + i = sprintf(buf, "unknown\n"); diff --git a/queue-5.13/series b/queue-5.13/series index 2799920be80..532b10d0c0b 100644 --- a/queue-5.13/series +++ b/queue-5.13/series @@ -11,3 +11,23 @@ kvm-nsvm-check-the-value-written-to-msr_vm_hsave_pa.patch kvm-x86-disable-hardware-breakpoints-unconditionally-before-kvm_x86-run.patch kvm-svm-smi-interception-must-not-skip-the-instruction.patch kvm-svm-remove-init-intercept-handler.patch +scsi-core-fix-bad-pointer-dereference-when-ehandler-kthread-is-invalid.patch +scsi-zfcp-report-port-fc_security-as-unknown-early-during-remote-cable-pull.patch +iommu-vt-d-global-devtlb-flush-when-present-context-entry-changed.patch +iommu-vt-d-fix-clearing-real-dma-device-s-scalable-mode-context-entries.patch +tracing-do-not-reference-char-as-a-string-in-histograms.patch +drm-amdgpu-add-another-renoir-did.patch +drm-i915-gtt-drop-the-page-table-optimisation.patch +drm-i915-gt-fix-edeadlk-handling-regression.patch +cgroup-verify-that-source-is-a-string.patch +fbmem-do-not-delete-the-mode-that-is-still-in-use.patch +edac-igen6-fix-core-dependency-again.patch +mm-hugetlb-fix-refs-calculation-from-unaligned-vaddr.patch +arm64-avoid-premature-usercopy-failure.patch +io_uring-use-right-task-for-exiting-checks.patch +btrfs-properly-split-extent_map-for-req_op_zone_append.patch +btrfs-zoned-fix-types-for-u64-division-in-btrfs_reclaim_bgs_work.patch +btrfs-fix-deadlock-with-concurrent-chunk-allocations-involving-system-chunks.patch +btrfs-rework-chunk-allocation-to-avoid-exhaustion-of-the-system-chunk-array.patch +btrfs-don-t-block-if-we-can-t-acquire-the-reclaim-lock.patch +btrfs-zoned-fix-wrong-mutex-unlock-on-failure-to-allocate-log-root-tree.patch diff --git a/queue-5.13/tracing-do-not-reference-char-as-a-string-in-histograms.patch b/queue-5.13/tracing-do-not-reference-char-as-a-string-in-histograms.patch new file mode 100644 index 00000000000..5b36d4fdefe --- /dev/null +++ b/queue-5.13/tracing-do-not-reference-char-as-a-string-in-histograms.patch @@ -0,0 +1,105 @@ +From 704adfb5a9978462cd861f170201ae2b5e3d3a80 Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (VMware)" +Date: Thu, 15 Jul 2021 00:02:06 -0400 +Subject: tracing: Do not reference char * as a string in histograms + +From: Steven Rostedt (VMware) + +commit 704adfb5a9978462cd861f170201ae2b5e3d3a80 upstream. + +The histogram logic was allowing events with char * pointers to be used as +normal strings. But it was easy to crash the kernel with: + + # echo 'hist:keys=filename' > events/syscalls/sys_enter_openat/trigger + +And open some files, and boom! 
+ + BUG: unable to handle page fault for address: 00007f2ced0c3280 + #PF: supervisor read access in kernel mode + #PF: error_code(0x0000) - not-present page + PGD 1173fa067 P4D 1173fa067 PUD 1171b6067 PMD 1171dd067 PTE 0 + Oops: 0000 [#1] PREEMPT SMP + CPU: 6 PID: 1810 Comm: cat Not tainted 5.13.0-rc5-test+ #61 + Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 +v03.03 07/14/2016 + RIP: 0010:strlen+0x0/0x20 + Code: f6 82 80 2a 0b a9 20 74 11 0f b6 50 01 48 83 c0 01 f6 82 80 2a 0b +a9 20 75 ef c3 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 <80> 3f 00 74 +10 48 89 f8 48 83 c0 01 80 38 00 75 f7 48 29 f8 c3 + + RSP: 0018:ffffbdbf81567b50 EFLAGS: 00010246 + RAX: 0000000000000003 RBX: ffff93815cdb3800 RCX: ffff9382401a22d0 + RDX: 0000000000000100 RSI: 0000000000000000 RDI: 00007f2ced0c3280 + RBP: 0000000000000100 R08: ffff9382409ff074 R09: ffffbdbf81567c98 + R10: ffff9382409ff074 R11: 0000000000000000 R12: ffff9382409ff074 + R13: 0000000000000001 R14: ffff93815a744f00 R15: 00007f2ced0c3280 + FS: 00007f2ced0f8580(0000) GS:ffff93825a800000(0000) +knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 00007f2ced0c3280 CR3: 0000000107069005 CR4: 00000000001706e0 + Call Trace: + event_hist_trigger+0x463/0x5f0 + ? find_held_lock+0x32/0x90 + ? sched_clock_cpu+0xe/0xd0 + ? lock_release+0x155/0x440 + ? kernel_init_free_pages+0x6d/0x90 + ? preempt_count_sub+0x9b/0xd0 + ? kernel_init_free_pages+0x6d/0x90 + ? get_page_from_freelist+0x12c4/0x1680 + ? __rb_reserve_next+0xe5/0x460 + ? ring_buffer_lock_reserve+0x12a/0x3f0 + event_triggers_call+0x52/0xe0 + ftrace_syscall_enter+0x264/0x2c0 + syscall_trace_enter.constprop.0+0x1ee/0x210 + do_syscall_64+0x1c/0x80 + entry_SYSCALL_64_after_hwframe+0x44/0xae + +Where it triggered a fault on strlen(key) where key was the filename. + +The reason is that filename is a char * to user space, and the histogram +code just blindly dereferenced it, with obvious bad results. + +I originally tried to use strncpy_from_user/kernel_nofault() but found +that there's other places that its dereferenced and not worth the effort. + +Just do not allow "char *" to act like strings. + +Link: https://lkml.kernel.org/r/20210715000206.025df9d2@rorschach.local.home + +Cc: Ingo Molnar +Cc: Andrew Morton +Cc: Masami Hiramatsu +Cc: Tzvetomir Stoyanov +Cc: stable@vger.kernel.org +Acked-by: Namhyung Kim +Acked-by: Tom Zanussi +Fixes: 79e577cbce4c4 ("tracing: Support string type key properly") +Fixes: 5967bd5c4239 ("tracing: Let filter_assign_type() detect FILTER_PTR_STRING") +Signed-off-by: Steven Rostedt (VMware) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/trace_events_hist.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/kernel/trace/trace_events_hist.c ++++ b/kernel/trace/trace_events_hist.c +@@ -1689,7 +1689,9 @@ static struct hist_field *create_hist_fi + if (WARN_ON_ONCE(!field)) + goto out; + +- if (is_string_field(field)) { ++ /* Pointers to strings are just pointers and dangerous to dereference */ ++ if (is_string_field(field) && ++ (field->filter_type != FILTER_PTR_STRING)) { + flags |= HIST_FIELD_FL_STRING; + + hist_field->size = MAX_FILTER_STR_VAL; +@@ -4495,8 +4497,6 @@ static inline void add_to_key(char *comp + field = key_field->field; + if (field->filter_type == FILTER_DYN_STRING) + size = *(u32 *)(rec + field->offset) >> 16; +- else if (field->filter_type == FILTER_PTR_STRING) +- size = strlen(key); + else if (field->filter_type == FILTER_STATIC_STRING) + size = field->size; + -- 2.47.3