--- /dev/null
+From 295cf156231ca3f9e3a66bde7fab5e09c41835e0 Mon Sep 17 00:00:00 2001
+From: Robin Murphy <robin.murphy@arm.com>
+Date: Mon, 12 Jul 2021 15:27:46 +0100
+Subject: arm64: Avoid premature usercopy failure
+
+From: Robin Murphy <robin.murphy@arm.com>
+
+commit 295cf156231ca3f9e3a66bde7fab5e09c41835e0 upstream.
+
+Al reminds us that the usercopy API must only return complete failure
+if absolutely nothing could be copied. Currently, if userspace does
+something silly like giving us an unaligned pointer to Device memory,
+or a size which overruns MTE tag bounds, we may fail to honour that
+requirement when faulting on a multi-byte access even though a smaller
+access could have succeeded.
+
+Add a mitigation to the fixup routines to fall back to a single-byte
+copy if we faulted on a larger access before anything has been written
+to the destination, to guarantee making *some* forward progress. We
+needn't be too concerned about the overall performance since this should
+only occur when callers are doing something a bit dodgy in the first
+place. Particularly broken userspace might still be able to trick
+generic_perform_write() into an infinite loop by targeting write() at
+an mmap() of some read-only device register where the fault-in load
+succeeds but any store synchronously aborts such that copy_to_user() is
+genuinely unable to make progress, but, well, don't do that...
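+
+A minimal userspace sketch of the contract being restored (illustration
+only, not kernel code; the fault boundary and helper names are made up):
+an access wider than one byte that crosses the boundary fails, but when
+nothing has been copied yet we retry a single byte, so a caller that
+loops on partial progress keeps advancing right up to the boundary.
+
+#include <stdio.h>
+#include <string.h>
+
+/* Model of the fixup: a wide access crossing 'fault_at' faults; if that
+ * happens before any byte was copied, fall back to a single-byte copy. */
+static size_t toy_copy(char *dst, const char *src, size_t off, size_t len,
+		       size_t fault_at)
+{
+	size_t done = 0;
+
+	while (done < len) {
+		size_t width = (len - done >= 8) ? 8 : 1;
+
+		if (off + done + width > fault_at) {
+			if (done == 0 && off < fault_at) {
+				dst[done] = src[done];	/* try harder */
+				done++;
+			}
+			break;
+		}
+		memcpy(dst + done, src + done, width);
+		done += width;
+	}
+	return len - done;			/* bytes not copied */
+}
+
+int main(void)
+{
+	char src[32] = "0123456789abcdef0123456789abcde", dst[32] = { 0 };
+	const size_t fault_at = 13;		/* e.g. an MTE tag boundary */
+	size_t off = 0, left = sizeof(src);
+
+	while (left) {		/* generic_perform_write()-style retry loop */
+		size_t not_copied = toy_copy(dst + off, src + off, off, left, fault_at);
+		size_t copied = left - not_copied;
+
+		if (!copied)		/* genuinely no progress possible */
+			break;
+		off += copied;
+		left -= copied;
+	}
+	printf("stopped after copying %zu of %zu bytes\n", off, sizeof(src));
+	return 0;
+}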
+
+CC: stable@vger.kernel.org
+Reported-by: Chen Huang <chenhuang5@huawei.com>
+Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
+Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Robin Murphy <robin.murphy@arm.com>
+Link: https://lore.kernel.org/r/dc03d5c675731a1f24a62417dba5429ad744234e.1626098433.git.robin.murphy@arm.com
+Signed-off-by: Will Deacon <will@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/lib/copy_from_user.S | 13 ++++++++++---
+ arch/arm64/lib/copy_in_user.S | 21 ++++++++++++++-------
+ arch/arm64/lib/copy_to_user.S | 14 +++++++++++---
+ 3 files changed, 35 insertions(+), 13 deletions(-)
+
+--- a/arch/arm64/lib/copy_from_user.S
++++ b/arch/arm64/lib/copy_from_user.S
+@@ -29,7 +29,7 @@
+ .endm
+
+ .macro ldrh1 reg, ptr, val
+- user_ldst 9998f, ldtrh, \reg, \ptr, \val
++ user_ldst 9997f, ldtrh, \reg, \ptr, \val
+ .endm
+
+ .macro strh1 reg, ptr, val
+@@ -37,7 +37,7 @@
+ .endm
+
+ .macro ldr1 reg, ptr, val
+- user_ldst 9998f, ldtr, \reg, \ptr, \val
++ user_ldst 9997f, ldtr, \reg, \ptr, \val
+ .endm
+
+ .macro str1 reg, ptr, val
+@@ -45,7 +45,7 @@
+ .endm
+
+ .macro ldp1 reg1, reg2, ptr, val
+- user_ldp 9998f, \reg1, \reg2, \ptr, \val
++ user_ldp 9997f, \reg1, \reg2, \ptr, \val
+ .endm
+
+ .macro stp1 reg1, reg2, ptr, val
+@@ -53,8 +53,10 @@
+ .endm
+
+ end .req x5
++srcin .req x15
+ SYM_FUNC_START(__arch_copy_from_user)
+ add end, x0, x2
++ mov srcin, x1
+ #include "copy_template.S"
+ mov x0, #0 // Nothing to copy
+ ret
+@@ -63,6 +65,11 @@ EXPORT_SYMBOL(__arch_copy_from_user)
+
+ .section .fixup,"ax"
+ .align 2
++9997: cmp dst, dstin
++ b.ne 9998f
++ // Before being absolutely sure we couldn't copy anything, try harder
++USER(9998f, ldtrb tmp1w, [srcin])
++ strb tmp1w, [dst], #1
+ 9998: sub x0, end, dst // bytes not copied
+ ret
+ .previous
+--- a/arch/arm64/lib/copy_in_user.S
++++ b/arch/arm64/lib/copy_in_user.S
+@@ -30,33 +30,34 @@
+ .endm
+
+ .macro ldrh1 reg, ptr, val
+- user_ldst 9998f, ldtrh, \reg, \ptr, \val
++ user_ldst 9997f, ldtrh, \reg, \ptr, \val
+ .endm
+
+ .macro strh1 reg, ptr, val
+- user_ldst 9998f, sttrh, \reg, \ptr, \val
++ user_ldst 9997f, sttrh, \reg, \ptr, \val
+ .endm
+
+ .macro ldr1 reg, ptr, val
+- user_ldst 9998f, ldtr, \reg, \ptr, \val
++ user_ldst 9997f, ldtr, \reg, \ptr, \val
+ .endm
+
+ .macro str1 reg, ptr, val
+- user_ldst 9998f, sttr, \reg, \ptr, \val
++ user_ldst 9997f, sttr, \reg, \ptr, \val
+ .endm
+
+ .macro ldp1 reg1, reg2, ptr, val
+- user_ldp 9998f, \reg1, \reg2, \ptr, \val
++ user_ldp 9997f, \reg1, \reg2, \ptr, \val
+ .endm
+
+ .macro stp1 reg1, reg2, ptr, val
+- user_stp 9998f, \reg1, \reg2, \ptr, \val
++ user_stp 9997f, \reg1, \reg2, \ptr, \val
+ .endm
+
+ end .req x5
+-
++srcin .req x15
+ SYM_FUNC_START(__arch_copy_in_user)
+ add end, x0, x2
++ mov srcin, x1
+ #include "copy_template.S"
+ mov x0, #0
+ ret
+@@ -65,6 +66,12 @@ EXPORT_SYMBOL(__arch_copy_in_user)
+
+ .section .fixup,"ax"
+ .align 2
++9997: cmp dst, dstin
++ b.ne 9998f
++ // Before being absolutely sure we couldn't copy anything, try harder
++USER(9998f, ldtrb tmp1w, [srcin])
++USER(9998f, sttrb tmp1w, [dst])
++ add dst, dst, #1
+ 9998: sub x0, end, dst // bytes not copied
+ ret
+ .previous
+--- a/arch/arm64/lib/copy_to_user.S
++++ b/arch/arm64/lib/copy_to_user.S
+@@ -32,7 +32,7 @@
+ .endm
+
+ .macro strh1 reg, ptr, val
+- user_ldst 9998f, sttrh, \reg, \ptr, \val
++ user_ldst 9997f, sttrh, \reg, \ptr, \val
+ .endm
+
+ .macro ldr1 reg, ptr, val
+@@ -40,7 +40,7 @@
+ .endm
+
+ .macro str1 reg, ptr, val
+- user_ldst 9998f, sttr, \reg, \ptr, \val
++ user_ldst 9997f, sttr, \reg, \ptr, \val
+ .endm
+
+ .macro ldp1 reg1, reg2, ptr, val
+@@ -48,12 +48,14 @@
+ .endm
+
+ .macro stp1 reg1, reg2, ptr, val
+- user_stp 9998f, \reg1, \reg2, \ptr, \val
++ user_stp 9997f, \reg1, \reg2, \ptr, \val
+ .endm
+
+ end .req x5
++srcin .req x15
+ SYM_FUNC_START(__arch_copy_to_user)
+ add end, x0, x2
++ mov srcin, x1
+ #include "copy_template.S"
+ mov x0, #0
+ ret
+@@ -62,6 +64,12 @@ EXPORT_SYMBOL(__arch_copy_to_user)
+
+ .section .fixup,"ax"
+ .align 2
++9997: cmp dst, dstin
++ b.ne 9998f
++ // Before being absolutely sure we couldn't copy anything, try harder
++ ldrb tmp1w, [srcin]
++USER(9998f, sttrb tmp1w, [dst])
++ add dst, dst, #1
+ 9998: sub x0, end, dst // bytes not copied
+ ret
+ .previous
--- /dev/null
+From 1cb3db1cf383a3c7dbda1aa0ce748b0958759947 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 29 Jun 2021 14:43:05 +0100
+Subject: btrfs: fix deadlock with concurrent chunk allocations involving system chunks
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 1cb3db1cf383a3c7dbda1aa0ce748b0958759947 upstream.
+
+When a task attempting to allocate a new chunk verifies that there is not
+currently enough free space in the system space_info and there is another
+task that allocated a new system chunk but has not yet finished the
+creation of the respective block group, it waits for that other task to
+finish creating the block group. This is to avoid exhaustion of the system
+chunk array in the superblock, which is limited, when we have a thundering
+herd of tasks allocating new chunks. This problem was described and fixed
+by commit eafa4fd0ad0607 ("btrfs: fix exhaustion of the system chunk array
+due to concurrent allocations").
+
+However there are two very similar scenarios where this can lead to a
+deadlock:
+
+1) Task B allocated a new system chunk and task A is waiting on task B
+ to finish creation of the respective system block group. However before
+ task B ends its transaction handle and finishes the creation of the
+ system block group, it attempts to allocate another chunk (like a data
+   chunk for a fallocate operation on a very large range). Task B will
+ be unable to progress and allocate the new chunk, because task A set
+ space_info->chunk_alloc to 1 and therefore it loops at
+ btrfs_chunk_alloc() waiting for task A to finish its chunk allocation
+ and set space_info->chunk_alloc to 0, but task A is waiting on task B
+ to finish creation of the new system block group, therefore resulting
+ in a deadlock;
+
+2) Task B allocated a new system chunk and task A is waiting on task B to
+ finish creation of the respective system block group. By the time that
+   task B enters the final phase of block group allocation, which happens
+ at btrfs_create_pending_block_groups(), when it modifies the extent
+ tree, the device tree or the chunk tree to insert the items for some
+ new block group, it needs to allocate a new chunk, so it ends up at
+ btrfs_chunk_alloc() and keeps looping there because task A has set
+ space_info->chunk_alloc to 1, but task A is waiting for task B to
+ finish creation of the new system block group and release the reserved
+ system space, therefore resulting in a deadlock.
+
+In short, the problem is if a task B needs to allocate a new chunk after
+it previously allocated a new system chunk and if another task A is
+currently waiting for task B to complete the allocation of the new system
+chunk.
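+
+A single-threaded model of scenario 1, stepping both tasks until neither
+can make progress (names mirror the description above and are purely
+illustrative, not kernel code):
+
+#include <stdbool.h>
+#include <stdio.h>
+
+int main(void)
+{
+	bool chunk_alloc = true;	/* set by task A in btrfs_chunk_alloc() */
+	bool sys_bg_done = false;	/* task B's pending system block group  */
+	int step;
+
+	for (step = 0; step < 10; step++) {
+		bool progress = false;
+
+		/* Task A: waits for B to finish creating its system block
+		 * group before clearing space_info->chunk_alloc. */
+		if (sys_bg_done) {
+			chunk_alloc = false;
+			progress = true;
+		}
+		/* Task B: must allocate another chunk first, but loops at
+		 * btrfs_chunk_alloc() while chunk_alloc is set by task A. */
+		if (!chunk_alloc) {
+			sys_bg_done = true;
+			progress = true;
+		}
+		if (!progress) {
+			printf("no progress possible: A waits on B, B waits on A\n");
+			return 0;
+		}
+	}
+	return 0;
+}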
+
+Unfortunately this deadlock scenario introduced by the previous fix for
+the system chunk array exhaustion problem does not have a simple and short
+fix, and requires a big change to rework the chunk allocation code so that
+chunk btree updates are all made in the first phase of chunk allocation.
+And since this deadlock regression is being frequently hit on zoned
+filesystems and the system chunk array exhaustion problem is triggered
+in more extreme cases (originally observed on PowerPC with a node size
+of 64K when running the fallocate tests from stress-ng), revert the
+changes from that commit. The next patch in the series, with the subject
+"btrfs: rework chunk allocation to avoid exhaustion of the system
+chunk array", makes the necessary changes to fix the system chunk array
+exhaustion problem.
+
+Reported-by: Naohiro Aota <naohiro.aota@wdc.com>
+Link: https://lore.kernel.org/linux-btrfs/20210621015922.ewgbffxuawia7liz@naota-xeon/
+Fixes: eafa4fd0ad0607 ("btrfs: fix exhaustion of the system chunk array due to concurrent allocations")
+CC: stable@vger.kernel.org # 5.12+
+Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
+Tested-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Tested-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/block-group.c | 58 -------------------------------------------------
+ fs/btrfs/transaction.c | 5 ----
+ fs/btrfs/transaction.h | 7 -----
+ 3 files changed, 1 insertion(+), 69 deletions(-)
+
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -3269,7 +3269,6 @@ static u64 get_profile_num_devs(struct b
+ */
+ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+ {
+- struct btrfs_transaction *cur_trans = trans->transaction;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_space_info *info;
+ u64 left;
+@@ -3284,7 +3283,6 @@ void check_system_chunk(struct btrfs_tra
+ lockdep_assert_held(&fs_info->chunk_mutex);
+
+ info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+-again:
+ spin_lock(&info->lock);
+ left = info->total_bytes - btrfs_space_info_used(info, true);
+ spin_unlock(&info->lock);
+@@ -3303,58 +3301,6 @@ again:
+
+ if (left < thresh) {
+ u64 flags = btrfs_system_alloc_profile(fs_info);
+- u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved);
+-
+- /*
+- * If there's not available space for the chunk tree (system
+- * space) and there are other tasks that reserved space for
+- * creating a new system block group, wait for them to complete
+- * the creation of their system block group and release excess
+- * reserved space. We do this because:
+- *
+- * *) We can end up allocating more system chunks than necessary
+- * when there are multiple tasks that are concurrently
+- * allocating block groups, which can lead to exhaustion of
+- * the system array in the superblock;
+- *
+- * *) If we allocate extra and unnecessary system block groups,
+- * despite being empty for a long time, and possibly forever,
+- * they end not being added to the list of unused block groups
+- * because that typically happens only when deallocating the
+- * last extent from a block group - which never happens since
+- * we never allocate from them in the first place. The few
+- * exceptions are when mounting a filesystem or running scrub,
+- * which add unused block groups to the list of unused block
+- * groups, to be deleted by the cleaner kthread.
+- * And even when they are added to the list of unused block
+- * groups, it can take a long time until they get deleted,
+- * since the cleaner kthread might be sleeping or busy with
+- * other work (deleting subvolumes, running delayed iputs,
+- * defrag scheduling, etc);
+- *
+- * This is rare in practice, but can happen when too many tasks
+- * are allocating blocks groups in parallel (via fallocate())
+- * and before the one that reserved space for a new system block
+- * group finishes the block group creation and releases the space
+- * reserved in excess (at btrfs_create_pending_block_groups()),
+- * other tasks end up here and see free system space temporarily
+- * not enough for updating the chunk tree.
+- *
+- * We unlock the chunk mutex before waiting for such tasks and
+- * lock it again after the wait, otherwise we would deadlock.
+- * It is safe to do so because allocating a system chunk is the
+- * first thing done while allocating a new block group.
+- */
+- if (reserved > trans->chunk_bytes_reserved) {
+- const u64 min_needed = reserved - thresh;
+-
+- mutex_unlock(&fs_info->chunk_mutex);
+- wait_event(cur_trans->chunk_reserve_wait,
+- atomic64_read(&cur_trans->chunk_bytes_reserved) <=
+- min_needed);
+- mutex_lock(&fs_info->chunk_mutex);
+- goto again;
+- }
+
+ /*
+ * Ignore failure to create system chunk. We might end up not
+@@ -3369,10 +3315,8 @@ again:
+ ret = btrfs_block_rsv_add(fs_info->chunk_root,
+ &fs_info->chunk_block_rsv,
+ thresh, BTRFS_RESERVE_NO_FLUSH);
+- if (!ret) {
+- atomic64_add(thresh, &cur_trans->chunk_bytes_reserved);
++ if (!ret)
+ trans->chunk_bytes_reserved += thresh;
+- }
+ }
+ }
+
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -260,7 +260,6 @@ static inline int extwriter_counter_read
+ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
+ {
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+- struct btrfs_transaction *cur_trans = trans->transaction;
+
+ if (!trans->chunk_bytes_reserved)
+ return;
+@@ -269,8 +268,6 @@ void btrfs_trans_release_chunk_metadata(
+
+ btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
+ trans->chunk_bytes_reserved, NULL);
+- atomic64_sub(trans->chunk_bytes_reserved, &cur_trans->chunk_bytes_reserved);
+- cond_wake_up(&cur_trans->chunk_reserve_wait);
+ trans->chunk_bytes_reserved = 0;
+ }
+
+@@ -386,8 +383,6 @@ loop:
+ spin_lock_init(&cur_trans->dropped_roots_lock);
+ INIT_LIST_HEAD(&cur_trans->releasing_ebs);
+ spin_lock_init(&cur_trans->releasing_ebs_lock);
+- atomic64_set(&cur_trans->chunk_bytes_reserved, 0);
+- init_waitqueue_head(&cur_trans->chunk_reserve_wait);
+ list_add_tail(&cur_trans->list, &fs_info->trans_list);
+ extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
+ IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
+--- a/fs/btrfs/transaction.h
++++ b/fs/btrfs/transaction.h
+@@ -96,13 +96,6 @@ struct btrfs_transaction {
+
+ spinlock_t releasing_ebs_lock;
+ struct list_head releasing_ebs;
+-
+- /*
+- * The number of bytes currently reserved, by all transaction handles
+- * attached to this transaction, for metadata extents of the chunk tree.
+- */
+- atomic64_t chunk_bytes_reserved;
+- wait_queue_head_t chunk_reserve_wait;
+ };
+
+ #define __TRANS_FREEZABLE (1U << 0)
--- /dev/null
+From abb99cfdaf0759f8a619e5fecf52ccccdf310c8c Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Mon, 28 Jun 2021 17:57:28 +0900
+Subject: btrfs: properly split extent_map for REQ_OP_ZONE_APPEND
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit abb99cfdaf0759f8a619e5fecf52ccccdf310c8c upstream.
+
+Damien reported a test failure with btrfs/209. The test itself ran fine,
+but the fsck run afterwards reported a corrupted filesystem.
+
+The filesystem corruption happens because we're splitting an extent and
+then writing the extent twice. We have to split the extent though, because
+we're creating extents that are too large for a REQ_OP_ZONE_APPEND operation.
+
+When dumping the extent tree, we can see two EXTENT_ITEMs at the same
+start address but different lengths.
+
+$ btrfs inspect dump-tree /dev/nullb1 -t extent
+...
+ item 19 key (269484032 EXTENT_ITEM 126976) itemoff 15470 itemsize 53
+ refs 1 gen 7 flags DATA
+ extent data backref root FS_TREE objectid 257 offset 786432 count 1
+ item 20 key (269484032 EXTENT_ITEM 262144) itemoff 15417 itemsize 53
+ refs 1 gen 7 flags DATA
+ extent data backref root FS_TREE objectid 257 offset 786432 count 1
+
+The duplicated EXTENT_ITEMs originally come from a wrongly split
+extent_map in extract_ordered_extent(). Since extract_ordered_extent() uses
+create_io_em() to split an existing extent_map, we will have
+split->orig_start != split->start. Then, it will be logged with a non-zero
+"extent data offset". Finally, the logged entries are replayed into
+a duplicated EXTENT_ITEM.
+
+Introduce and use a proper splitting function for extent_map. The function
+is intended to be simple and specific to extract_ordered_extent(), e.g. it
+does not support the compression case (we do not allow splitting a
+compressed extent_map anyway).
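+
+The range bookkeeping of that split, mirrored in plain userspace C with
+made-up sample values (illustration only, not the kernel helper):
+
+#include <stdio.h>
+
+int main(void)
+{
+	/* extent_map covers [start, start + len); 'pre' bytes sit before
+	 * the bio and 'post' bytes after it */
+	unsigned long long start = 786432, len = 393216;
+	unsigned long long pre = 131072, post = 135168;
+	unsigned long long pre_len = pre ? pre : len - post;
+
+	printf("pre : [%llu, %llu)\n", start, start + pre_len);
+	if (pre)
+		printf("mid : [%llu, %llu)\n", start + pre, start + len - post);
+	if (post)
+		printf("post: [%llu, %llu)\n", start + len - post, start + len);
+	return 0;
+}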
+
+There was a question raised by Qu, in summary why we want to split the
+extent map (and not the bio):
+
+The problem is not the limit on the zone end, which as you mention is
+the same as the block group end. The problem is that data writes use zone
+append (ZA) operations. ZA BIOs cannot be split, so a large extent may
+need to be processed with multiple ZA BIOs. While that is also true for
+regular writes, the major difference is that ZA operations are "nameless"
+writes that give back the written sectors on completion. And ZA
+operations may be reordered by the block layer (not intentionally
+though). Combine both of these characteristics and you can see that the
+data for a large extent may end up being shuffled when written, resulting
+in data corruption and making it impossible to map the extent to a single
+start sector.
+
+To avoid this problem, zoned btrfs uses the principle "one data extent
+== one ZA BIO". So large extents need to be split. This is unfortunate,
+but we can revisit this later and optimize, e.g. merge back together the
+fragments of an extent once written if they actually were written
+sequentially in the zone.
+
+Reported-by: Damien Le Moal <damien.lemoal@wdc.com>
+Fixes: d22002fd37bd ("btrfs: zoned: split ordered extent when bio is sent")
+CC: stable@vger.kernel.org # 5.12+
+CC: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/inode.c | 147 ++++++++++++++++++++++++++++++++++++++++++++-----------
+ 1 file changed, 118 insertions(+), 29 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -2271,13 +2271,127 @@ bool btrfs_bio_fits_in_ordered_extent(st
+ return ret;
+ }
+
++/*
++ * Split an extent_map at [start, start + len]
++ *
++ * This function is intended to be used only for extract_ordered_extent().
++ */
++static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
++ u64 pre, u64 post)
++{
++ struct extent_map_tree *em_tree = &inode->extent_tree;
++ struct extent_map *em;
++ struct extent_map *split_pre = NULL;
++ struct extent_map *split_mid = NULL;
++ struct extent_map *split_post = NULL;
++ int ret = 0;
++ int modified;
++ unsigned long flags;
++
++ /* Sanity check */
++ if (pre == 0 && post == 0)
++ return 0;
++
++ split_pre = alloc_extent_map();
++ if (pre)
++ split_mid = alloc_extent_map();
++ if (post)
++ split_post = alloc_extent_map();
++ if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
++ ret = -ENOMEM;
++ goto out;
++ }
++
++ ASSERT(pre + post < len);
++
++ lock_extent(&inode->io_tree, start, start + len - 1);
++ write_lock(&em_tree->lock);
++ em = lookup_extent_mapping(em_tree, start, len);
++ if (!em) {
++ ret = -EIO;
++ goto out_unlock;
++ }
++
++ ASSERT(em->len == len);
++ ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
++ ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
++
++ flags = em->flags;
++ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
++ clear_bit(EXTENT_FLAG_LOGGING, &flags);
++ modified = !list_empty(&em->list);
++
++ /* First, replace the em with a new extent_map starting from * em->start */
++ split_pre->start = em->start;
++ split_pre->len = (pre ? pre : em->len - post);
++ split_pre->orig_start = split_pre->start;
++ split_pre->block_start = em->block_start;
++ split_pre->block_len = split_pre->len;
++ split_pre->orig_block_len = split_pre->block_len;
++ split_pre->ram_bytes = split_pre->len;
++ split_pre->flags = flags;
++ split_pre->compress_type = em->compress_type;
++ split_pre->generation = em->generation;
++
++ replace_extent_mapping(em_tree, em, split_pre, modified);
++
++ /*
++ * Now we only have an extent_map at:
++ * [em->start, em->start + pre] if pre != 0
++ * [em->start, em->start + em->len - post] if pre == 0
++ */
++
++ if (pre) {
++ /* Insert the middle extent_map */
++ split_mid->start = em->start + pre;
++ split_mid->len = em->len - pre - post;
++ split_mid->orig_start = split_mid->start;
++ split_mid->block_start = em->block_start + pre;
++ split_mid->block_len = split_mid->len;
++ split_mid->orig_block_len = split_mid->block_len;
++ split_mid->ram_bytes = split_mid->len;
++ split_mid->flags = flags;
++ split_mid->compress_type = em->compress_type;
++ split_mid->generation = em->generation;
++ add_extent_mapping(em_tree, split_mid, modified);
++ }
++
++ if (post) {
++ split_post->start = em->start + em->len - post;
++ split_post->len = post;
++ split_post->orig_start = split_post->start;
++ split_post->block_start = em->block_start + em->len - post;
++ split_post->block_len = split_post->len;
++ split_post->orig_block_len = split_post->block_len;
++ split_post->ram_bytes = split_post->len;
++ split_post->flags = flags;
++ split_post->compress_type = em->compress_type;
++ split_post->generation = em->generation;
++ add_extent_mapping(em_tree, split_post, modified);
++ }
++
++ /* Once for us */
++ free_extent_map(em);
++ /* Once for the tree */
++ free_extent_map(em);
++
++out_unlock:
++ write_unlock(&em_tree->lock);
++ unlock_extent(&inode->io_tree, start, start + len - 1);
++out:
++ free_extent_map(split_pre);
++ free_extent_map(split_mid);
++ free_extent_map(split_post);
++
++ return ret;
++}
++
+ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
+ struct bio *bio, loff_t file_offset)
+ {
+ struct btrfs_ordered_extent *ordered;
+- struct extent_map *em = NULL, *em_new = NULL;
+- struct extent_map_tree *em_tree = &inode->extent_tree;
+ u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
++ u64 file_len;
+ u64 len = bio->bi_iter.bi_size;
+ u64 end = start + len;
+ u64 ordered_end;
+@@ -2317,41 +2431,16 @@ static blk_status_t extract_ordered_exte
+ goto out;
+ }
+
++ file_len = ordered->num_bytes;
+ pre = start - ordered->disk_bytenr;
+ post = ordered_end - end;
+
+ ret = btrfs_split_ordered_extent(ordered, pre, post);
+ if (ret)
+ goto out;
+-
+- read_lock(&em_tree->lock);
+- em = lookup_extent_mapping(em_tree, ordered->file_offset, len);
+- if (!em) {
+- read_unlock(&em_tree->lock);
+- ret = -EIO;
+- goto out;
+- }
+- read_unlock(&em_tree->lock);
+-
+- ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+- /*
+- * We cannot reuse em_new here but have to create a new one, as
+- * unpin_extent_cache() expects the start of the extent map to be the
+- * logical offset of the file, which does not hold true anymore after
+- * splitting.
+- */
+- em_new = create_io_em(inode, em->start + pre, len,
+- em->start + pre, em->block_start + pre, len,
+- len, len, BTRFS_COMPRESS_NONE,
+- BTRFS_ORDERED_REGULAR);
+- if (IS_ERR(em_new)) {
+- ret = PTR_ERR(em_new);
+- goto out;
+- }
+- free_extent_map(em_new);
++ ret = split_zoned_em(inode, file_offset, file_len, pre, post);
+
+ out:
+- free_extent_map(em);
+ btrfs_put_ordered_extent(ordered);
+
+ return errno_to_blk_status(ret);
--- /dev/null
+From 79bd37120b149532af5b21953643ed74af69654f Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 29 Jun 2021 14:43:06 +0100
+Subject: btrfs: rework chunk allocation to avoid exhaustion of the system chunk array
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 79bd37120b149532af5b21953643ed74af69654f upstream.
+
+Commit eafa4fd0ad0607 ("btrfs: fix exhaustion of the system chunk array
+due to concurrent allocations") fixed a problem that resulted in
+exhausting the system chunk array in the superblock when there are many
+tasks allocating chunks in parallel. Basically too many tasks enter the
+first phase of chunk allocation without previous tasks having finished
+their second phase of allocation, resulting in too many system chunks
+being allocated. That was originally observed when running the fallocate
+tests of stress-ng on a PowerPC machine, using a node size of 64K.
+
+However that commit also introduced a deadlock where a task in phase 1 of
+the chunk allocation waited for another task that had allocated a system
+chunk to finish its phase 2, but that other task was waiting on an extent
+buffer lock held by the first task, therefore resulting in both tasks not
+making any progress. That change was later reverted by a patch with the
+subject "btrfs: fix deadlock with concurrent chunk allocations involving
+system chunks", since there is no simple and short solution to address it
+and the deadlock is relatively easy to trigger on zoned filesystems, while
+the system chunk array exhaustion is not so common.
+
+This change reworks the chunk allocation to avoid the system chunk array
+exhaustion. It accomplishes that by making the first phase of chunk
+allocation do the updates of the device items in the chunk btree and the
+insertion of the new chunk item in the chunk btree. This is done while
+under the protection of the chunk mutex (fs_info->chunk_mutex), in the
+same critical section that checks for available system space, allocates
+a new system chunk if needed and reserves system chunk space. This way
+we no longer need to keep system chunk space reserved until the second
+phase completes.
+
+The same logic is applied to chunk removal as well, since it keeps
+reserved system space long after it is done updating the chunk btree.
+
+For direct allocation of system chunks, the previous behaviour remains,
+because otherwise we would deadlock on extent buffers of the chunk btree.
+Changes to the chunk btree are by and large done by chunk allocation and chunk
+removal, which first reserve chunk system space and then later do changes
+to the chunk btree. The other remaining cases are uncommon and correspond
+to adding a device, removing a device and resizing a device. All these
+other cases do not pre-reserve system space, they modify the chunk btree
+right away, so they don't hold reserved space for a long period like chunk
+allocation and chunk removal do.
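+
+A toy outline of the resulting two-phase flow (illustrative stubs only;
+the real functions live in fs/btrfs/ and operate on a transaction handle):
+
+#include <stdio.h>
+
+#define TOY_SYSTEM 0x1	/* stands in for BTRFS_BLOCK_GROUP_SYSTEM */
+
+/* Phase 1, all under fs_info->chunk_mutex. System chunks skip the chunk
+ * btree update here and defer it to phase 2, to avoid deadlocking on a
+ * chunk btree extent buffer the caller may already hold locked. */
+static void phase1(int flags)
+{
+	printf("phase 1 (chunk_mutex held):\n");
+	printf("  check_system_chunk(): reserve system space\n");
+	printf("  btrfs_alloc_chunk(): device extents, mapping, block group\n");
+	if (!(flags & TOY_SYSTEM))
+		printf("  btrfs_chunk_alloc_add_chunk_item(): device + chunk items\n");
+}
+
+/* Phase 2: btrfs_create_pending_block_groups() adds the block group item
+ * and device extent items, plus the deferred chunk item for system chunks. */
+static void phase2(int flags)
+{
+	printf("phase 2:\n  block group item + device extent items%s\n",
+	       (flags & TOY_SYSTEM) ? " + deferred chunk item" : "");
+}
+
+int main(void)
+{
+	phase1(0);		/* a data or metadata chunk */
+	phase2(0);
+	phase1(TOY_SYSTEM);	/* a system chunk */
+	phase2(TOY_SYSTEM);
+	return 0;
+}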
+
+The diff of this change is huge, but more than half of it is just addition
+of comments describing both how things work regarding chunk allocation and
+removal, including both the new behavior and the parts of the old behavior
+that did not change.
+
+CC: stable@vger.kernel.org # 5.12+
+Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
+Tested-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Tested-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/block-group.c | 285 ++++++++++++++++++++++++++++++++++-----
+ fs/btrfs/block-group.h | 6
+ fs/btrfs/ctree.c | 67 +--------
+ fs/btrfs/transaction.c | 10 -
+ fs/btrfs/transaction.h | 2
+ fs/btrfs/volumes.c | 355 +++++++++++++++++++++++++++++++++++++------------
+ fs/btrfs/volumes.h | 5
+ 7 files changed, 546 insertions(+), 184 deletions(-)
+
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -2101,6 +2101,13 @@ error:
+ return ret;
+ }
+
++/*
++ * This function, insert_block_group_item(), belongs to the phase 2 of chunk
++ * allocation.
++ *
++ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
++ * phases.
++ */
+ static int insert_block_group_item(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group)
+ {
+@@ -2123,15 +2130,19 @@ static int insert_block_group_item(struc
+ return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
+ }
+
++/*
++ * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
++ * chunk allocation.
++ *
++ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
++ * phases.
++ */
+ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
+ {
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group *block_group;
+ int ret = 0;
+
+- if (!trans->can_flush_pending_bgs)
+- return;
+-
+ while (!list_empty(&trans->new_bgs)) {
+ int index;
+
+@@ -2146,6 +2157,13 @@ void btrfs_create_pending_block_groups(s
+ ret = insert_block_group_item(trans, block_group);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
++ if (!block_group->chunk_item_inserted) {
++ mutex_lock(&fs_info->chunk_mutex);
++ ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
++ mutex_unlock(&fs_info->chunk_mutex);
++ if (ret)
++ btrfs_abort_transaction(trans, ret);
++ }
+ ret = btrfs_finish_chunk_alloc(trans, block_group->start,
+ block_group->length);
+ if (ret)
+@@ -2169,8 +2187,9 @@ next:
+ btrfs_trans_release_chunk_metadata(trans);
+ }
+
+-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
+- u64 type, u64 chunk_offset, u64 size)
++struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
++ u64 bytes_used, u64 type,
++ u64 chunk_offset, u64 size)
+ {
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group *cache;
+@@ -2180,7 +2199,7 @@ int btrfs_make_block_group(struct btrfs_
+
+ cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
+ if (!cache)
+- return -ENOMEM;
++ return ERR_PTR(-ENOMEM);
+
+ cache->length = size;
+ set_free_space_tree_thresholds(cache);
+@@ -2194,7 +2213,7 @@ int btrfs_make_block_group(struct btrfs_
+ ret = btrfs_load_block_group_zone_info(cache, true);
+ if (ret) {
+ btrfs_put_block_group(cache);
+- return ret;
++ return ERR_PTR(ret);
+ }
+
+ ret = exclude_super_stripes(cache);
+@@ -2202,7 +2221,7 @@ int btrfs_make_block_group(struct btrfs_
+ /* We may have excluded something, so call this just in case */
+ btrfs_free_excluded_extents(cache);
+ btrfs_put_block_group(cache);
+- return ret;
++ return ERR_PTR(ret);
+ }
+
+ add_new_free_space(cache, chunk_offset, chunk_offset + size);
+@@ -2229,7 +2248,7 @@ int btrfs_make_block_group(struct btrfs_
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ btrfs_put_block_group(cache);
+- return ret;
++ return ERR_PTR(ret);
+ }
+
+ /*
+@@ -2248,7 +2267,7 @@ int btrfs_make_block_group(struct btrfs_
+ btrfs_update_delayed_refs_rsv(trans);
+
+ set_avail_alloc_bits(fs_info, type);
+- return 0;
++ return cache;
+ }
+
+ /*
+@@ -3124,11 +3143,203 @@ int btrfs_force_chunk_alloc(struct btrfs
+ return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+ }
+
++static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
++{
++ struct btrfs_block_group *bg;
++ int ret;
++
++ /*
++ * Check if we have enough space in the system space info because we
++ * will need to update device items in the chunk btree and insert a new
++ * chunk item in the chunk btree as well. This will allocate a new
++ * system block group if needed.
++ */
++ check_system_chunk(trans, flags);
++
++ bg = btrfs_alloc_chunk(trans, flags);
++ if (IS_ERR(bg)) {
++ ret = PTR_ERR(bg);
++ goto out;
++ }
++
++ /*
++ * If this is a system chunk allocation then stop right here and do not
++ * add the chunk item to the chunk btree. This is to prevent a deadlock
++ * because this system chunk allocation can be triggered while COWing
++ * some extent buffer of the chunk btree and while holding a lock on a
++ * parent extent buffer, in which case attempting to insert the chunk
++ * item (or update the device item) would result in a deadlock on that
++ * parent extent buffer. In this case defer the chunk btree updates to
++ * the second phase of chunk allocation and keep our reservation until
++ * the second phase completes.
++ *
++ * This is a rare case and can only be triggered by the very few cases
++ * we have where we need to touch the chunk btree outside chunk allocation
++ * and chunk removal. These cases are basically adding a device, removing
++ * a device or resizing a device.
++ */
++ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
++ return 0;
++
++ ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
++ /*
++ * Normally we are not expected to fail with -ENOSPC here, since we have
++ * previously reserved space in the system space_info and allocated one
++ * new system chunk if necessary. However there are two exceptions:
++ *
++ * 1) We may have enough free space in the system space_info but all the
++ * existing system block groups have a profile which can not be used
++ * for extent allocation.
++ *
++ * This happens when mounting in degraded mode. For example we have a
++ * RAID1 filesystem with 2 devices, lose one device and mount the fs
++ * using the other device in degraded mode. If we then allocate a chunk,
++ * we may have enough free space in the existing system space_info, but
++ * none of the block groups can be used for extent allocation since they
++ * have a RAID1 profile, and because we are in degraded mode with a
++ * single device, we are forced to allocate a new system chunk with a
++ * SINGLE profile. Making check_system_chunk() iterate over all system
++ * block groups and check if they have a usable profile and enough space
++ * can be slow on very large filesystems, so we tolerate the -ENOSPC and
++ * try again after forcing allocation of a new system chunk. Like this
++ * we avoid paying the cost of that search in normal circumstances, when
++ * we were not mounted in degraded mode;
++ *
++ * 2) We had enough free space info the system space_info, and one suitable
++ * block group to allocate from when we called check_system_chunk()
++ * above. However right after we called it, the only system block group
++ * with enough free space got turned into RO mode by a running scrub,
++ * and in this case we have to allocate a new one and retry. We only
++ * need do this allocate and retry once, since we have a transaction
++ * handle and scrub uses the commit root to search for block groups.
++ */
++ if (ret == -ENOSPC) {
++ const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
++ struct btrfs_block_group *sys_bg;
++
++ sys_bg = btrfs_alloc_chunk(trans, sys_flags);
++ if (IS_ERR(sys_bg)) {
++ ret = PTR_ERR(sys_bg);
++ btrfs_abort_transaction(trans, ret);
++ goto out;
++ }
++
++ ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
++ if (ret) {
++ btrfs_abort_transaction(trans, ret);
++ goto out;
++ }
++
++ ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
++ if (ret) {
++ btrfs_abort_transaction(trans, ret);
++ goto out;
++ }
++ } else if (ret) {
++ btrfs_abort_transaction(trans, ret);
++ goto out;
++ }
++out:
++ btrfs_trans_release_chunk_metadata(trans);
++
++ return ret;
++}
++
+ /*
+- * If force is CHUNK_ALLOC_FORCE:
++ * Chunk allocation is done in 2 phases:
++ *
++ * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
++ * the chunk, the chunk mapping, create its block group and add the items
++ * that belong in the chunk btree to it - more specifically, we need to
++ * update device items in the chunk btree and add a new chunk item to it.
++ *
++ * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
++ * group item to the extent btree and the device extent items to the devices
++ * btree.
++ *
++ * This is done to prevent deadlocks. For example when COWing a node from the
++ * extent btree we are holding a write lock on the node's parent and if we
++ * trigger chunk allocation and attempted to insert the new block group item
++ * in the extent btree right way, we could deadlock because the path for the
++ * insertion can include that parent node. At first glance it seems impossible
++ * to trigger chunk allocation after starting a transaction since tasks should
++ * reserve enough transaction units (metadata space), however while that is true
++ * most of the time, chunk allocation may still be triggered for several reasons:
++ *
++ * 1) When reserving metadata, we check if there is enough free space in the
++ * metadata space_info and therefore don't trigger allocation of a new chunk.
++ * However later when the task actually tries to COW an extent buffer from
++ * the extent btree or from the device btree for example, it is forced to
++ * allocate a new block group (chunk) because the only one that had enough
++ * free space was just turned to RO mode by a running scrub for example (or
++ * device replace, block group reclaim thread, etc), so we can not use it
++ * for allocating an extent and end up being forced to allocate a new one;
++ *
++ * 2) Because we only check that the metadata space_info has enough free bytes,
++ * we end up not allocating a new metadata chunk in that case. However if
++ * the filesystem was mounted in degraded mode, none of the existing block
++ * groups might be suitable for extent allocation due to their incompatible
++ * profile (for e.g. mounting a 2 devices filesystem, where all block groups
++ * use a RAID1 profile, in degraded mode using a single device). In this case
++ * when the task attempts to COW some extent buffer of the extent btree for
++ * example, it will trigger allocation of a new metadata block group with a
++ * suitable profile (SINGLE profile in the example of the degraded mount of
++ * the RAID1 filesystem);
++ *
++ * 3) The task has reserved enough transaction units / metadata space, but when
++ * it attempts to COW an extent buffer from the extent or device btree for
++ * example, it does not find any free extent in any metadata block group,
++ * therefore forced to try to allocate a new metadata block group.
++ * This is because some other task allocated all available extents in the
++ * meanwhile - this typically happens with tasks that don't reserve space
++ * properly, either intentionally or as a bug. One example where this is
++ * done intentionally is fsync, as it does not reserve any transaction units
++ * and ends up allocating a variable number of metadata extents for log
++ * tree extent buffers.
++ *
++ * We also need this 2 phases setup when adding a device to a filesystem with
++ * a seed device - we must create new metadata and system chunks without adding
++ * any of the block group items to the chunk, extent and device btrees. If we
++ * did not do it this way, we would get ENOSPC when attempting to update those
++ * btrees, since all the chunks from the seed device are read-only.
++ *
++ * Phase 1 does the updates and insertions to the chunk btree because if we had
++ * it done in phase 2 and have a thundering herd of tasks allocating chunks in
++ * parallel, we risk having too many system chunks allocated by many tasks if
++ * many tasks reach phase 1 without the previous ones completing phase 2. In the
++ * extreme case this leads to exhaustion of the system chunk array in the
++ * superblock. This is easier to trigger if using a btree node/leaf size of 64K
++ * and with RAID filesystems (so we have more device items in the chunk btree).
++ * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
++ * the system chunk array due to concurrent allocations") provides more details.
++ *
++ * For allocation of system chunks, we defer the updates and insertions into the
++ * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because
++ * if the chunk allocation is triggered while COWing an extent buffer of the
++ * chunk btree, we are holding a lock on the parent of that extent buffer and
++ * doing the chunk btree updates and insertions can require locking that parent.
++ * This is for the very few and rare cases where we update the chunk btree that
++ * are not chunk allocation or chunk removal: adding a device, removing a device
++ * or resizing a device.
++ *
++ * The reservation of system space, done through check_system_chunk(), as well
++ * as all the updates and insertions into the chunk btree must be done while
++ * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
++ * an extent buffer from the chunks btree we never trigger allocation of a new
++ * system chunk, which would result in a deadlock (trying to lock twice an
++ * extent buffer of the chunk btree, first time before triggering the chunk
++ * allocation and the second time during chunk allocation while attempting to
++ * update the chunks btree). The system chunk array is also updated while holding
++ * that mutex. The same logic applies to removing chunks - we must reserve system
++ * space, update the chunk btree and the system chunk array in the superblock
++ * while holding fs_info->chunk_mutex.
++ *
++ * This function, btrfs_chunk_alloc(), belongs to phase 1.
++ *
++ * If @force is CHUNK_ALLOC_FORCE:
+ * - return 1 if it successfully allocates a chunk,
+ * - return errors including -ENOSPC otherwise.
+- * If force is NOT CHUNK_ALLOC_FORCE:
++ * If @force is NOT CHUNK_ALLOC_FORCE:
+ * - return 0 if it doesn't need to allocate a new chunk,
+ * - return 1 if it successfully allocates a chunk,
+ * - return errors including -ENOSPC otherwise.
+@@ -3145,6 +3356,13 @@ int btrfs_chunk_alloc(struct btrfs_trans
+ /* Don't re-enter if we're already allocating a chunk */
+ if (trans->allocating_chunk)
+ return -ENOSPC;
++ /*
++ * If we are removing a chunk, don't re-enter or we would deadlock.
++ * System space reservation and system chunk allocation is done by the
++ * chunk remove operation (btrfs_remove_chunk()).
++ */
++ if (trans->removing_chunk)
++ return -ENOSPC;
+
+ space_info = btrfs_find_space_info(fs_info, flags);
+ ASSERT(space_info);
+@@ -3208,13 +3426,7 @@ int btrfs_chunk_alloc(struct btrfs_trans
+ force_metadata_allocation(fs_info);
+ }
+
+- /*
+- * Check if we have enough space in SYSTEM chunk because we may need
+- * to update devices.
+- */
+- check_system_chunk(trans, flags);
+-
+- ret = btrfs_alloc_chunk(trans, flags);
++ ret = do_chunk_alloc(trans, flags);
+ trans->allocating_chunk = false;
+
+ spin_lock(&space_info->lock);
+@@ -3233,22 +3445,6 @@ out:
+ space_info->chunk_alloc = 0;
+ spin_unlock(&space_info->lock);
+ mutex_unlock(&fs_info->chunk_mutex);
+- /*
+- * When we allocate a new chunk we reserve space in the chunk block
+- * reserve to make sure we can COW nodes/leafs in the chunk tree or
+- * add new nodes/leafs to it if we end up needing to do it when
+- * inserting the chunk item and updating device items as part of the
+- * second phase of chunk allocation, performed by
+- * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
+- * large number of new block groups to create in our transaction
+- * handle's new_bgs list to avoid exhausting the chunk block reserve
+- * in extreme cases - like having a single transaction create many new
+- * block groups when starting to write out the free space caches of all
+- * the block groups that were made dirty during the lifetime of the
+- * transaction.
+- */
+- if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
+- btrfs_create_pending_block_groups(trans);
+
+ return ret;
+ }
+@@ -3301,14 +3497,31 @@ void check_system_chunk(struct btrfs_tra
+
+ if (left < thresh) {
+ u64 flags = btrfs_system_alloc_profile(fs_info);
++ struct btrfs_block_group *bg;
+
+ /*
+ * Ignore failure to create system chunk. We might end up not
+ * needing it, as we might not need to COW all nodes/leafs from
+ * the paths we visit in the chunk tree (they were already COWed
+ * or created in the current transaction for example).
++ *
++ * Also, if our caller is allocating a system chunk, do not
++ * attempt to insert the chunk item in the chunk btree, as we
++ * could deadlock on an extent buffer since our caller may be
++ * COWing an extent buffer from the chunk btree.
+ */
+- ret = btrfs_alloc_chunk(trans, flags);
++ bg = btrfs_alloc_chunk(trans, flags);
++ if (IS_ERR(bg)) {
++ ret = PTR_ERR(bg);
++ } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
++ /*
++ * If we fail to add the chunk item here, we end up
++ * trying again at phase 2 of chunk allocation, at
++ * btrfs_create_pending_block_groups(). So ignore
++ * any error here.
++ */
++ btrfs_chunk_alloc_add_chunk_item(trans, bg);
++ }
+ }
+
+ if (!ret) {
+--- a/fs/btrfs/block-group.h
++++ b/fs/btrfs/block-group.h
+@@ -97,6 +97,7 @@ struct btrfs_block_group {
+ unsigned int removed:1;
+ unsigned int to_copy:1;
+ unsigned int relocating_repair:1;
++ unsigned int chunk_item_inserted:1;
+
+ int disk_cache_state;
+
+@@ -265,8 +266,9 @@ int btrfs_remove_block_group(struct btrf
+ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
+ void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
+ int btrfs_read_block_groups(struct btrfs_fs_info *info);
+-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
+- u64 type, u64 chunk_offset, u64 size);
++struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
++ u64 bytes_used, u64 type,
++ u64 chunk_offset, u64 size);
+ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
+ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
+ bool do_chunk_alloc);
+--- a/fs/btrfs/ctree.c
++++ b/fs/btrfs/ctree.c
+@@ -954,49 +954,6 @@ static noinline int update_ref_for_cow(s
+ return 0;
+ }
+
+-static struct extent_buffer *alloc_tree_block_no_bg_flush(
+- struct btrfs_trans_handle *trans,
+- struct btrfs_root *root,
+- u64 parent_start,
+- const struct btrfs_disk_key *disk_key,
+- int level,
+- u64 hint,
+- u64 empty_size,
+- enum btrfs_lock_nesting nest)
+-{
+- struct btrfs_fs_info *fs_info = root->fs_info;
+- struct extent_buffer *ret;
+-
+- /*
+- * If we are COWing a node/leaf from the extent, chunk, device or free
+- * space trees, make sure that we do not finish block group creation of
+- * pending block groups. We do this to avoid a deadlock.
+- * COWing can result in allocation of a new chunk, and flushing pending
+- * block groups (btrfs_create_pending_block_groups()) can be triggered
+- * when finishing allocation of a new chunk. Creation of a pending block
+- * group modifies the extent, chunk, device and free space trees,
+- * therefore we could deadlock with ourselves since we are holding a
+- * lock on an extent buffer that btrfs_create_pending_block_groups() may
+- * try to COW later.
+- * For similar reasons, we also need to delay flushing pending block
+- * groups when splitting a leaf or node, from one of those trees, since
+- * we are holding a write lock on it and its parent or when inserting a
+- * new root node for one of those trees.
+- */
+- if (root == fs_info->extent_root ||
+- root == fs_info->chunk_root ||
+- root == fs_info->dev_root ||
+- root == fs_info->free_space_root)
+- trans->can_flush_pending_bgs = false;
+-
+- ret = btrfs_alloc_tree_block(trans, root, parent_start,
+- root->root_key.objectid, disk_key, level,
+- hint, empty_size, nest);
+- trans->can_flush_pending_bgs = true;
+-
+- return ret;
+-}
+-
+ /*
+ * does the dirty work in cow of a single block. The parent block (if
+ * supplied) is updated to point to the new cow copy. The new buffer is marked
+@@ -1045,8 +1002,9 @@ static noinline int __btrfs_cow_block(st
+ if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
+ parent_start = parent->start;
+
+- cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
+- level, search_start, empty_size, nest);
++ cow = btrfs_alloc_tree_block(trans, root, parent_start,
++ root->root_key.objectid, &disk_key, level,
++ search_start, empty_size, nest);
+ if (IS_ERR(cow))
+ return PTR_ERR(cow);
+
+@@ -3340,9 +3298,9 @@ static noinline int insert_new_root(stru
+ else
+ btrfs_node_key(lower, &lower_key, 0);
+
+- c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
+- root->node->start, 0,
+- BTRFS_NESTING_NEW_ROOT);
++ c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
++ &lower_key, level, root->node->start, 0,
++ BTRFS_NESTING_NEW_ROOT);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+
+@@ -3471,8 +3429,9 @@ static noinline int split_node(struct bt
+ mid = (c_nritems + 1) / 2;
+ btrfs_node_key(c, &disk_key, mid);
+
+- split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
+- c->start, 0, BTRFS_NESTING_SPLIT);
++ split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
++ &disk_key, level, c->start, 0,
++ BTRFS_NESTING_SPLIT);
+ if (IS_ERR(split))
+ return PTR_ERR(split);
+
+@@ -4263,10 +4222,10 @@ again:
+ * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
+ * use BTRFS_NESTING_NEW_ROOT.
+ */
+- right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
+- l->start, 0, num_doubles ?
+- BTRFS_NESTING_NEW_ROOT :
+- BTRFS_NESTING_SPLIT);
++ right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
++ &disk_key, 0, l->start, 0,
++ num_doubles ? BTRFS_NESTING_NEW_ROOT :
++ BTRFS_NESTING_SPLIT);
+ if (IS_ERR(right))
+ return PTR_ERR(right);
+
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -254,8 +254,11 @@ static inline int extwriter_counter_read
+ }
+
+ /*
+- * To be called after all the new block groups attached to the transaction
+- * handle have been created (btrfs_create_pending_block_groups()).
++ * To be called after doing the chunk btree updates right after allocating a new
++ * chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a
++ * chunk after all chunk btree updates and after finishing the second phase of
++ * chunk allocation (btrfs_create_pending_block_groups()) in case some block
++ * group had its chunk item insertion delayed to the second phase.
+ */
+ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
+ {
+@@ -264,8 +267,6 @@ void btrfs_trans_release_chunk_metadata(
+ if (!trans->chunk_bytes_reserved)
+ return;
+
+- WARN_ON_ONCE(!list_empty(&trans->new_bgs));
+-
+ btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
+ trans->chunk_bytes_reserved, NULL);
+ trans->chunk_bytes_reserved = 0;
+@@ -697,7 +698,6 @@ again:
+ h->fs_info = root->fs_info;
+
+ h->type = type;
+- h->can_flush_pending_bgs = true;
+ INIT_LIST_HEAD(&h->new_bgs);
+
+ smp_mb();
+--- a/fs/btrfs/transaction.h
++++ b/fs/btrfs/transaction.h
+@@ -134,7 +134,7 @@ struct btrfs_trans_handle {
+ short aborted;
+ bool adding_csums;
+ bool allocating_chunk;
+- bool can_flush_pending_bgs;
++ bool removing_chunk;
+ bool reloc_reserved;
+ bool in_fsync;
+ struct btrfs_root *root;
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -1744,19 +1744,14 @@ again:
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_dev_extent);
+ } else {
+- btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
+ goto out;
+ }
+
+ *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
+
+ ret = btrfs_del_item(trans, root, path);
+- if (ret) {
+- btrfs_handle_fs_error(fs_info, ret,
+- "Failed to remove dev extent item");
+- } else {
++ if (ret == 0)
+ set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
+- }
+ out:
+ btrfs_free_path(path);
+ return ret;
+@@ -2941,7 +2936,7 @@ static int btrfs_del_sys_chunk(struct bt
+ u32 cur;
+ struct btrfs_key key;
+
+- mutex_lock(&fs_info->chunk_mutex);
++ lockdep_assert_held(&fs_info->chunk_mutex);
+ array_size = btrfs_super_sys_array_size(super_copy);
+
+ ptr = super_copy->sys_chunk_array;
+@@ -2971,7 +2966,6 @@ static int btrfs_del_sys_chunk(struct bt
+ cur += len;
+ }
+ }
+- mutex_unlock(&fs_info->chunk_mutex);
+ return ret;
+ }
+
+@@ -3011,6 +3005,29 @@ struct extent_map *btrfs_get_chunk_map(s
+ return em;
+ }
+
++static int remove_chunk_item(struct btrfs_trans_handle *trans,
++ struct map_lookup *map, u64 chunk_offset)
++{
++ int i;
++
++ /*
++ * Removing chunk items and updating the device items in the chunks btree
++ * requires holding the chunk_mutex.
++ * See the comment at btrfs_chunk_alloc() for the details.
++ */
++ lockdep_assert_held(&trans->fs_info->chunk_mutex);
++
++ for (i = 0; i < map->num_stripes; i++) {
++ int ret;
++
++ ret = btrfs_update_device(trans, map->stripes[i].dev);
++ if (ret)
++ return ret;
++ }
++
++ return btrfs_free_chunk(trans, chunk_offset);
++}
++
+ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
+ {
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+@@ -3031,14 +3048,16 @@ int btrfs_remove_chunk(struct btrfs_tran
+ return PTR_ERR(em);
+ }
+ map = em->map_lookup;
+- mutex_lock(&fs_info->chunk_mutex);
+- check_system_chunk(trans, map->type);
+- mutex_unlock(&fs_info->chunk_mutex);
+
+ /*
+- * Take the device list mutex to prevent races with the final phase of
+- * a device replace operation that replaces the device object associated
+- * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
++ * First delete the device extent items from the devices btree.
++ * We take the device_list_mutex to avoid racing with the finishing phase
++ * of a device replace operation. See the comment below before acquiring
++ * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
++ * because that can result in a deadlock when deleting the device extent
++ * items from the devices btree - COWing an extent buffer from the btree
++ * may result in allocating a new metadata chunk, which would attempt to
++ * lock again fs_info->chunk_mutex.
+ */
+ mutex_lock(&fs_devices->device_list_mutex);
+ for (i = 0; i < map->num_stripes; i++) {
+@@ -3060,18 +3079,73 @@ int btrfs_remove_chunk(struct btrfs_tran
+ btrfs_clear_space_info_full(fs_info);
+ mutex_unlock(&fs_info->chunk_mutex);
+ }
++ }
++ mutex_unlock(&fs_devices->device_list_mutex);
+
+- ret = btrfs_update_device(trans, device);
++ /*
++ * We acquire fs_info->chunk_mutex for 2 reasons:
++ *
++ * 1) Just like with the first phase of the chunk allocation, we must
++ * reserve system space, do all chunk btree updates and deletions, and
++ * update the system chunk array in the superblock while holding this
++ * mutex. This is for similar reasons as explained on the comment at
++ * the top of btrfs_chunk_alloc();
++ *
++ * 2) Prevent races with the final phase of a device replace operation
++ * that replaces the device object associated with the map's stripes,
++ * because the device object's id can change at any time during that
++ * final phase of the device replace operation
++ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
++ * replaced device and then see it with an ID of
++ * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
++ * the device item, which does not exists on the chunk btree.
++ * The finishing phase of device replace acquires both the
++ * device_list_mutex and the chunk_mutex, in that order, so we are
++ * safe by just acquiring the chunk_mutex.
++ */
++ trans->removing_chunk = true;
++ mutex_lock(&fs_info->chunk_mutex);
++
++ check_system_chunk(trans, map->type);
++
++ ret = remove_chunk_item(trans, map, chunk_offset);
++ /*
++ * Normally we should not get -ENOSPC since we reserved space before
++ * through the call to check_system_chunk().
++ *
++ * Despite our system space_info having enough free space, we may not
++ * be able to allocate extents from its block groups, because all have
++ * an incompatible profile, which will force us to allocate a new system
++ * block group with the right profile, or right after we called
++ * check_system_space() above, a scrub turned the only system block group
++ * with enough free space into RO mode.
++ * This is explained with more detail at do_chunk_alloc().
++ *
++ * So if we get -ENOSPC, allocate a new system chunk and retry once.
++ */
++ if (ret == -ENOSPC) {
++ const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
++ struct btrfs_block_group *sys_bg;
++
++ sys_bg = btrfs_alloc_chunk(trans, sys_flags);
++ if (IS_ERR(sys_bg)) {
++ ret = PTR_ERR(sys_bg);
++ btrfs_abort_transaction(trans, ret);
++ goto out;
++ }
++
++ ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
+ if (ret) {
+- mutex_unlock(&fs_devices->device_list_mutex);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+- }
+- mutex_unlock(&fs_devices->device_list_mutex);
+
+- ret = btrfs_free_chunk(trans, chunk_offset);
+- if (ret) {
++ ret = remove_chunk_item(trans, map, chunk_offset);
++ if (ret) {
++ btrfs_abort_transaction(trans, ret);
++ goto out;
++ }
++ } else if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+@@ -3086,6 +3160,15 @@ int btrfs_remove_chunk(struct btrfs_tran
+ }
+ }
+
++ mutex_unlock(&fs_info->chunk_mutex);
++ trans->removing_chunk = false;
++
++ /*
++ * We are done with chunk btree updates and deletions, so release the
++ * system space we previously reserved (with check_system_chunk()).
++ */
++ btrfs_trans_release_chunk_metadata(trans);
++
+ ret = btrfs_remove_block_group(trans, chunk_offset, em);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+@@ -3093,6 +3176,10 @@ int btrfs_remove_chunk(struct btrfs_tran
+ }
+
+ out:
++ if (trans->removing_chunk) {
++ mutex_unlock(&fs_info->chunk_mutex);
++ trans->removing_chunk = false;
++ }
+ /* once for us */
+ free_extent_map(em);
+ return ret;
+@@ -4851,13 +4938,12 @@ static int btrfs_add_system_chunk(struct
+ u32 array_size;
+ u8 *ptr;
+
+- mutex_lock(&fs_info->chunk_mutex);
++ lockdep_assert_held(&fs_info->chunk_mutex);
++
+ array_size = btrfs_super_sys_array_size(super_copy);
+ if (array_size + item_size + sizeof(disk_key)
+- > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+- mutex_unlock(&fs_info->chunk_mutex);
++ > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
+ return -EFBIG;
+- }
+
+ ptr = super_copy->sys_chunk_array + array_size;
+ btrfs_cpu_key_to_disk(&disk_key, key);
+@@ -4866,7 +4952,6 @@ static int btrfs_add_system_chunk(struct
+ memcpy(ptr, chunk, item_size);
+ item_size += sizeof(disk_key);
+ btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
+- mutex_unlock(&fs_info->chunk_mutex);
+
+ return 0;
+ }
+@@ -5216,13 +5301,14 @@ static int decide_stripe_size(struct btr
+ }
+ }
+
+-static int create_chunk(struct btrfs_trans_handle *trans,
++static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
+ struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+ {
+ struct btrfs_fs_info *info = trans->fs_info;
+ struct map_lookup *map = NULL;
+ struct extent_map_tree *em_tree;
++ struct btrfs_block_group *block_group;
+ struct extent_map *em;
+ u64 start = ctl->start;
+ u64 type = ctl->type;
+@@ -5232,7 +5318,7 @@ static int create_chunk(struct btrfs_tra
+
+ map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
+ if (!map)
+- return -ENOMEM;
++ return ERR_PTR(-ENOMEM);
+ map->num_stripes = ctl->num_stripes;
+
+ for (i = 0; i < ctl->ndevs; ++i) {
+@@ -5254,7 +5340,7 @@ static int create_chunk(struct btrfs_tra
+ em = alloc_extent_map();
+ if (!em) {
+ kfree(map);
+- return -ENOMEM;
++ return ERR_PTR(-ENOMEM);
+ }
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
+ em->map_lookup = map;
+@@ -5270,12 +5356,12 @@ static int create_chunk(struct btrfs_tra
+ if (ret) {
+ write_unlock(&em_tree->lock);
+ free_extent_map(em);
+- return ret;
++ return ERR_PTR(ret);
+ }
+ write_unlock(&em_tree->lock);
+
+- ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
+- if (ret)
++ block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
++ if (IS_ERR(block_group))
+ goto error_del_extent;
+
+ for (i = 0; i < map->num_stripes; i++) {
+@@ -5295,7 +5381,7 @@ static int create_chunk(struct btrfs_tra
+ check_raid56_incompat_flag(info, type);
+ check_raid1c34_incompat_flag(info, type);
+
+- return 0;
++ return block_group;
+
+ error_del_extent:
+ write_lock(&em_tree->lock);
+@@ -5307,34 +5393,36 @@ error_del_extent:
+ /* One for the tree reference */
+ free_extent_map(em);
+
+- return ret;
++ return block_group;
+ }
+
+-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
++struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
++ u64 type)
+ {
+ struct btrfs_fs_info *info = trans->fs_info;
+ struct btrfs_fs_devices *fs_devices = info->fs_devices;
+ struct btrfs_device_info *devices_info = NULL;
+ struct alloc_chunk_ctl ctl;
++ struct btrfs_block_group *block_group;
+ int ret;
+
+ lockdep_assert_held(&info->chunk_mutex);
+
+ if (!alloc_profile_is_valid(type, 0)) {
+ ASSERT(0);
+- return -EINVAL;
++ return ERR_PTR(-EINVAL);
+ }
+
+ if (list_empty(&fs_devices->alloc_list)) {
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
+ btrfs_debug(info, "%s: no writable device", __func__);
+- return -ENOSPC;
++ return ERR_PTR(-ENOSPC);
+ }
+
+ if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(info, "invalid chunk type 0x%llx requested", type);
+ ASSERT(0);
+- return -EINVAL;
++ return ERR_PTR(-EINVAL);
+ }
+
+ ctl.start = find_next_chunk(info);
+@@ -5344,46 +5432,43 @@ int btrfs_alloc_chunk(struct btrfs_trans
+ devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
+ GFP_NOFS);
+ if (!devices_info)
+- return -ENOMEM;
++ return ERR_PTR(-ENOMEM);
+
+ ret = gather_device_info(fs_devices, &ctl, devices_info);
+- if (ret < 0)
++ if (ret < 0) {
++ block_group = ERR_PTR(ret);
+ goto out;
++ }
+
+ ret = decide_stripe_size(fs_devices, &ctl, devices_info);
+- if (ret < 0)
++ if (ret < 0) {
++ block_group = ERR_PTR(ret);
+ goto out;
++ }
+
+- ret = create_chunk(trans, &ctl, devices_info);
++ block_group = create_chunk(trans, &ctl, devices_info);
+
+ out:
+ kfree(devices_info);
+- return ret;
++ return block_group;
+ }
+
+ /*
+- * Chunk allocation falls into two parts. The first part does work
+- * that makes the new allocated chunk usable, but does not do any operation
+- * that modifies the chunk tree. The second part does the work that
+- * requires modifying the chunk tree. This division is important for the
+- * bootstrap process of adding storage to a seed btrfs.
++ * This function, btrfs_finish_chunk_alloc(), belongs to phase 2.
++ *
++ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
++ * phases.
+ */
+ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
+ u64 chunk_offset, u64 chunk_size)
+ {
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+- struct btrfs_root *extent_root = fs_info->extent_root;
+- struct btrfs_root *chunk_root = fs_info->chunk_root;
+- struct btrfs_key key;
+ struct btrfs_device *device;
+- struct btrfs_chunk *chunk;
+- struct btrfs_stripe *stripe;
+ struct extent_map *em;
+ struct map_lookup *map;
+- size_t item_size;
+ u64 dev_offset;
+ u64 stripe_size;
+- int i = 0;
++ int i;
+ int ret = 0;
+
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
+@@ -5391,53 +5476,117 @@ int btrfs_finish_chunk_alloc(struct btrf
+ return PTR_ERR(em);
+
+ map = em->map_lookup;
+- item_size = btrfs_chunk_item_size(map->num_stripes);
+ stripe_size = em->orig_block_len;
+
+- chunk = kzalloc(item_size, GFP_NOFS);
+- if (!chunk) {
+- ret = -ENOMEM;
+- goto out;
+- }
+-
+ /*
+ * Take the device list mutex to prevent races with the final phase of
+ * a device replace operation that replaces the device object associated
+ * with the map's stripes, because the device object's id can change
+ * at any time during that final phase of the device replace operation
+- * (dev-replace.c:btrfs_dev_replace_finishing()).
++ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
++ * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
++ * resulting in persisting a device extent item with such ID.
+ */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
+
+- ret = btrfs_update_device(trans, device);
+- if (ret)
+- break;
+ ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
+ dev_offset, stripe_size);
+ if (ret)
+ break;
+ }
+- if (ret) {
+- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
++ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
++
++ free_extent_map(em);
++ return ret;
++}
++
++/*
++ * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
++ * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
++ * chunks.
++ *
++ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
++ * phases.
++ */
++int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
++ struct btrfs_block_group *bg)
++{
++ struct btrfs_fs_info *fs_info = trans->fs_info;
++ struct btrfs_root *extent_root = fs_info->extent_root;
++ struct btrfs_root *chunk_root = fs_info->chunk_root;
++ struct btrfs_key key;
++ struct btrfs_chunk *chunk;
++ struct btrfs_stripe *stripe;
++ struct extent_map *em;
++ struct map_lookup *map;
++ size_t item_size;
++ int i;
++ int ret;
++
++ /*
++ * We take the chunk_mutex for 2 reasons:
++ *
++ * 1) Updates and insertions in the chunk btree must be done while holding
++ * the chunk_mutex, as well as updating the system chunk array in the
++ * superblock. See the comment on top of btrfs_chunk_alloc() for the
++ * details;
++ *
++ * 2) To prevent races with the final phase of a device replace operation
++ * that replaces the device object associated with the map's stripes,
++ * because the device object's id can change at any time during that
++ * final phase of the device replace operation
++ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
++ * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
++ * which would cause a failure when updating the device item, which does
++	 * not exist, or persisting a stripe of the chunk item with such ID.
++ * Here we can't use the device_list_mutex because our caller already
++ * has locked the chunk_mutex, and the final phase of device replace
++ * acquires both mutexes - first the device_list_mutex and then the
++ * chunk_mutex. Using any of those two mutexes protects us from a
++ * concurrent device replace.
++ */
++ lockdep_assert_held(&fs_info->chunk_mutex);
++
++ em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
++ if (IS_ERR(em)) {
++ ret = PTR_ERR(em);
++ btrfs_abort_transaction(trans, ret);
++ return ret;
++ }
++
++ map = em->map_lookup;
++ item_size = btrfs_chunk_item_size(map->num_stripes);
++
++ chunk = kzalloc(item_size, GFP_NOFS);
++ if (!chunk) {
++ ret = -ENOMEM;
++ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
++ for (i = 0; i < map->num_stripes; i++) {
++ struct btrfs_device *device = map->stripes[i].dev;
++
++ ret = btrfs_update_device(trans, device);
++ if (ret)
++ goto out;
++ }
++
+ stripe = &chunk->stripe;
+ for (i = 0; i < map->num_stripes; i++) {
+- device = map->stripes[i].dev;
+- dev_offset = map->stripes[i].physical;
++ struct btrfs_device *device = map->stripes[i].dev;
++ const u64 dev_offset = map->stripes[i].physical;
+
+ btrfs_set_stack_stripe_devid(stripe, device->devid);
+ btrfs_set_stack_stripe_offset(stripe, dev_offset);
+ memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
+ stripe++;
+ }
+- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+- btrfs_set_stack_chunk_length(chunk, chunk_size);
++ btrfs_set_stack_chunk_length(chunk, bg->length);
+ btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
+ btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_type(chunk, map->type);
+@@ -5449,15 +5598,18 @@ int btrfs_finish_chunk_alloc(struct btrf
+
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+- key.offset = chunk_offset;
++ key.offset = bg->start;
+
+ ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
+- if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+- /*
+- * TODO: Cleanup of inserted chunk root in case of
+- * failure.
+- */
++ if (ret)
++ goto out;
++
++ bg->chunk_item_inserted = 1;
++
++ if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
++ if (ret)
++ goto out;
+ }
+
+ out:
+@@ -5470,16 +5622,41 @@ static noinline int init_first_rw_device
+ {
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u64 alloc_profile;
+- int ret;
++ struct btrfs_block_group *meta_bg;
++ struct btrfs_block_group *sys_bg;
++
++ /*
++ * When adding a new device for sprouting, the seed device is read-only
++ * so we must first allocate a metadata and a system chunk. But before
++ * adding the block group items to the extent, device and chunk btrees,
++ * we must first:
++ *
++ * 1) Create both chunks without doing any changes to the btrees, as
++ * otherwise we would get -ENOSPC since the block groups from the
++ * seed device are read-only;
++ *
++ * 2) Add the device item for the new sprout device - finishing the setup
++ * of a new block group requires updating the device item in the chunk
++ * btree, so it must exist when we attempt to do it. The previous step
++ * ensures this does not fail with -ENOSPC.
++ *
++ * After that we can add the block group items to their btrees:
++ * update existing device item in the chunk btree, add a new block group
++ * item to the extent btree, add a new chunk item to the chunk btree and
++ * finally add the new device extent items to the devices btree.
++ */
+
+ alloc_profile = btrfs_metadata_alloc_profile(fs_info);
+- ret = btrfs_alloc_chunk(trans, alloc_profile);
+- if (ret)
+- return ret;
++ meta_bg = btrfs_alloc_chunk(trans, alloc_profile);
++ if (IS_ERR(meta_bg))
++ return PTR_ERR(meta_bg);
+
+ alloc_profile = btrfs_system_alloc_profile(fs_info);
+- ret = btrfs_alloc_chunk(trans, alloc_profile);
+- return ret;
++ sys_bg = btrfs_alloc_chunk(trans, alloc_profile);
++ if (IS_ERR(sys_bg))
++ return PTR_ERR(sys_bg);
++
++ return 0;
+ }
+
+ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
+@@ -7359,10 +7536,18 @@ int btrfs_read_chunk_tree(struct btrfs_f
+ total_dev++;
+ } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
+ struct btrfs_chunk *chunk;
++
++ /*
++ * We are only called at mount time, so no need to take
++ * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
++ * we always lock first fs_info->chunk_mutex before
++ * acquiring any locks on the chunk tree. This is a
++ * requirement for chunk allocation, see the comment on
++ * top of btrfs_chunk_alloc() for details.
++ */
++ ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
+ chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+- mutex_lock(&fs_info->chunk_mutex);
+ ret = read_one_chunk(&found_key, leaf, chunk);
+- mutex_unlock(&fs_info->chunk_mutex);
+ if (ret)
+ goto error;
+ }
+--- a/fs/btrfs/volumes.h
++++ b/fs/btrfs/volumes.h
+@@ -447,7 +447,8 @@ int btrfs_get_io_geometry(struct btrfs_f
+ struct btrfs_io_geometry *io_geom);
+ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
+ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
+-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type);
++struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
++ u64 type);
+ void btrfs_mapping_tree_free(struct extent_map_tree *tree);
+ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+ int mirror_num);
+@@ -505,6 +506,8 @@ unsigned long btrfs_full_stripe_len(stru
+ u64 logical);
+ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
+ u64 chunk_offset, u64 chunk_size);
++int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
++ struct btrfs_block_group *bg);
+ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
+ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
--- /dev/null
+From ea32af47f00a046a1f953370514d6d946efe0152 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 7 Jul 2021 12:23:45 +0100
+Subject: btrfs: zoned: fix wrong mutex unlock on failure to allocate log root tree
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit ea32af47f00a046a1f953370514d6d946efe0152 upstream.
+
+When syncing the log, if we fail to allocate the root node for the log
+root tree:
+
+1) We are unlocking fs_info->tree_log_mutex, but at this point we have
+ not yet locked this mutex;
+
+2) We have locked fs_info->tree_root->log_mutex, but we end up not
+ unlocking it;
+
+So fix this by unlocking fs_info->tree_root->log_mutex instead of
+fs_info->tree_log_mutex.
+
+Fixes: e75f9fd194090e ("btrfs: zoned: move log tree node allocation out of log_root_tree->log_mutex")
+CC: stable@vger.kernel.org # 5.13+
+Reviewed-by: Nikolay Borisov <nborisov@suse.com>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -3173,7 +3173,7 @@ int btrfs_sync_log(struct btrfs_trans_ha
+ if (!log_root_tree->node) {
+ ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
+ if (ret) {
+- mutex_unlock(&fs_info->tree_log_mutex);
++ mutex_unlock(&fs_info->tree_root->log_mutex);
+ goto out;
+ }
+ }
--- /dev/null
+From 3b0462726e7ef281c35a7a4ae33e93ee2bc9975b Mon Sep 17 00:00:00 2001
+From: Christian Brauner <christian.brauner@ubuntu.com>
+Date: Wed, 14 Jul 2021 15:47:49 +0200
+Subject: cgroup: verify that source is a string
+
+From: Christian Brauner <christian.brauner@ubuntu.com>
+
+commit 3b0462726e7ef281c35a7a4ae33e93ee2bc9975b upstream.
+
+The following sequence can be used to trigger a UAF:
+
+ int fscontext_fd = fsopen("cgroup", 0);
+ int fd_null = open("/dev/null", O_RDONLY);
+ fsconfig(fscontext_fd, FSCONFIG_SET_FD, "source", NULL, fd_null);
+ close_range(3, ~0U, 0);
+
+The cgroup v1 specific fs parser expects a string for the "source"
+parameter. However, it is perfectly legitimate to e.g. specify a file
+descriptor for the "source" parameter. The fs parser doesn't know what
+a filesystem allows there. So it's a bug to assume that "source" is
+always of type fs_value_is_string when it can reasonably also be
+fs_value_is_file.
+
+This assumption in the cgroup code causes a UAF because struct
+fs_parameter uses a union for the actual value. Access to that union is
+guarded by the param->type member. Since the cgroup parameter parser
+didn't check param->type but unconditionally moved param->string into
+fc->source, a close on the fscontext_fd would trigger a UAF during
+put_fs_context(), which frees fc->source and thereby the file stashed
+in param->file, causing a UAF during a close of fd_null.
+
+Fix this by verifying that param->type is actually a string and report
+an error if not.
+
+In follow up patches I'll add a new generic helper that can be used here
+and by other filesystems instead of this error-prone copy-pasta fix.
+But fixing it here first makes backporting it to stable a lot easier.
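+
+For reference, a compilable version of the trigger sequence above could
+look like the sketch below. This is an illustration only, to be run on a
+test kernel: it assumes headers new enough to define __NR_fsopen,
+__NR_fsconfig and __NR_close_range, and uses raw syscall() since glibc
+does not provide wrappers for the new mount API syscalls:
+
+  #define _GNU_SOURCE
+  #include <fcntl.h>
+  #include <linux/mount.h>   /* FSCONFIG_SET_FD */
+  #include <sys/syscall.h>
+  #include <unistd.h>
+
+  int main(void)
+  {
+          int fscontext_fd = syscall(__NR_fsopen, "cgroup", 0);
+          int fd_null = open("/dev/null", O_RDONLY);
+
+          /* FSCONFIG_SET_FD: value must be NULL, aux carries the fd */
+          syscall(__NR_fsconfig, fscontext_fd, FSCONFIG_SET_FD, "source",
+                  NULL, fd_null);
+
+          /* close every fd >= 3, including fscontext_fd and fd_null */
+          syscall(__NR_close_range, 3, ~0U, 0);
+          return 0;
+  }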
+
+Fixes: 8d2451f4994f ("cgroup1: switch to option-by-option parsing")
+Reported-by: syzbot+283ce5a46486d6acdbaf@syzkaller.appspotmail.com
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Alexander Viro <viro@zeniv.linux.org.uk>
+Cc: Dmitry Vyukov <dvyukov@google.com>
+Cc: <stable@kernel.org>
+Cc: syzkaller-bugs <syzkaller-bugs@googlegroups.com>
+Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/cgroup/cgroup-v1.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/kernel/cgroup/cgroup-v1.c
++++ b/kernel/cgroup/cgroup-v1.c
+@@ -912,6 +912,8 @@ int cgroup1_parse_param(struct fs_contex
+ opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
+ if (opt == -ENOPARAM) {
+ if (strcmp(param->key, "source") == 0) {
++ if (param->type != fs_value_is_string)
++ return invalf(fc, "Non-string source");
+ if (fc->source)
+ return invalf(fc, "Multiple sources not supported");
+ fc->source = param->string;
--- /dev/null
+From 775da83005cb61d4c213c636df9337da05714ff1 Mon Sep 17 00:00:00 2001
+From: Jinzhou Su <Jinzhou.Su@amd.com>
+Date: Tue, 13 Jul 2021 09:26:11 +0800
+Subject: drm/amdgpu: add another Renoir DID
+
+From: Jinzhou Su <Jinzhou.Su@amd.com>
+
+commit 775da83005cb61d4c213c636df9337da05714ff1 upstream.
+
+Add new PCI device id.
+
+Signed-off-by: Jinzhou Su <Jinzhou.Su@amd.com>
+Reviewed-by: Huang Rui <ray.huang@amd.com>
+Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+Cc: stable@vger.kernel.org # 5.11.x
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+@@ -1092,6 +1092,7 @@ static const struct pci_device_id pciidl
+ {0x1002, 0x734F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14},
+
+ /* Renoir */
++ {0x1002, 0x15E7, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU},
+ {0x1002, 0x1636, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU},
+ {0x1002, 0x1638, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU},
+ {0x1002, 0x164C, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU},
--- /dev/null
+From 2feeb52859fc1ab94cd35b61ada3a6ac4ff24243 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= <ville.syrjala@linux.intel.com>
+Date: Wed, 30 Jun 2021 19:44:13 +0300
+Subject: drm/i915/gt: Fix -EDEADLK handling regression
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Ville Syrjälä <ville.syrjala@linux.intel.com>
+
+commit 2feeb52859fc1ab94cd35b61ada3a6ac4ff24243 upstream.
+
+The conversion to ww mutexes failed to address the fence code which
+already returns -EDEADLK when we run out of fences. Ww mutexes on
+the other hand treat -EDEADLK as an internal errno value indicating
+a need to restart the operation due to a deadlock. So now when the
+fence code returns -EDEADLK the higher level code erroneously
+restarts everything instead of returning the error to userspace
+as is expected.
+
+To remedy this let's switch the fence code to use a different errno
+value for this. -ENOBUFS seems like a semi-reasonable unique choice.
+Apart from igt the only user of this I could find is sna, and even
+there all we do is dump the current fence registers from debugfs
+into the X server log. So no user visible functionality is affected.
+If we really cared about preserving this we could of course convert
+back to -EDEADLK higher up, but it doesn't seem like that's worth
+the hassle here.
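+
+As a rough illustration of the errno clash (a standalone toy model, not
+the actual i915 code; the function names below are made up):
+
+  #include <errno.h>
+  #include <stdio.h>
+
+  /* Toy fence allocator: no fences left, report it with some errno. */
+  static int fence_find(int old_errno)
+  {
+          return old_errno ? -EDEADLK : -ENOBUFS;
+  }
+
+  /* Toy higher level: -EDEADLK is reserved as "back off and restart",
+   * anything else is a real error for the caller. */
+  static int submit(int old_errno)
+  {
+          for (int attempt = 0; attempt < 5; attempt++) {
+                  int err = fence_find(old_errno);
+                  if (err != -EDEADLK)
+                          return err;
+                  /* back off and retry; with -EDEADLK this never ends */
+          }
+          return -EDEADLK;
+  }
+
+  int main(void)
+  {
+          printf("old errno: %d (kept restarting)\n", submit(1));
+          printf("new errno: %d (reported to caller)\n", submit(0));
+          return 0;
+  }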
+
+Not quite sure which commit specifically broke this, but I'll
+just attribute it to the general gem ww mutex work.
+
+Cc: stable@vger.kernel.org
+Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
+Cc: Thomas Hellström <thomas.hellstrom@intel.com>
+Testcase: igt/gem_pread/exhaustion
+Testcase: igt/gem_pwrite/basic-exhaustion
+Testcase: igt/gem_fenced_exec_thrash/too-many-fences
+Fixes: 80f0b679d6f0 ("drm/i915: Add an implementation for i915_gem_ww_ctx locking, v2.")
+Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20210630164413.25481-1-ville.syrjala@linux.intel.com
+Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
+(cherry picked from commit 78d2ad7eb4e1f0e9cd5d79788446b6092c21d3e0)
+Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c
++++ b/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c
+@@ -366,7 +366,7 @@ static struct i915_fence_reg *fence_find
+ if (intel_has_pending_fb_unpin(ggtt->vm.i915))
+ return ERR_PTR(-EAGAIN);
+
+- return ERR_PTR(-EDEADLK);
++ return ERR_PTR(-ENOBUFS);
+ }
+
+ int __i915_vma_pin_fence(struct i915_vma *vma)
--- /dev/null
+From 0abb33bfca0fb74df76aac03e90ce685016ef7be Mon Sep 17 00:00:00 2001
+From: Matthew Auld <matthew.auld@intel.com>
+Date: Tue, 13 Jul 2021 14:04:31 +0100
+Subject: drm/i915/gtt: drop the page table optimisation
+
+From: Matthew Auld <matthew.auld@intel.com>
+
+commit 0abb33bfca0fb74df76aac03e90ce685016ef7be upstream.
+
+We skip filling out the pt with scratch entries if the va range covers
+the entire pt, since we later have to fill it with the PTEs for the
+object pages anyway. However this might leave open a small window where
+the PTEs don't point to anything valid for the HW to consume.
+
+When for example using 2M GTT pages this fill_px() showed up as being
+quite significant in perf measurements, and ends up being completely
+wasted since we ignore the pt and just use the pde directly.
+
+Anyway, currently we have our PTE construction split between alloc and
+insert, which is probably slightly iffy nowadays, since the alloc
+doesn't actually allocate anything anymore, instead it just sets up the
+page directories and points the PTEs at the scratch page. Later when we
+do the insert step we re-program the PTEs again. Better might be to
+squash the alloc and insert into a single step, then bringing back this
+optimisation (along with some others) should be possible.
+
+Fixes: 14826673247e ("drm/i915: Only initialize partially filled pagetables")
+Signed-off-by: Matthew Auld <matthew.auld@intel.com>
+Cc: Jon Bloomfield <jon.bloomfield@intel.com>
+Cc: Chris Wilson <chris.p.wilson@intel.com>
+Cc: Daniel Vetter <daniel@ffwll.ch>
+Cc: <stable@vger.kernel.org> # v4.15+
+Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
+Link: https://patchwork.freedesktop.org/patch/msgid/20210713130431.2392740-1-matthew.auld@intel.com
+(cherry picked from commit 8f88ca76b3942d82e2c1cea8735ec368d89ecc15)
+Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+--- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
++++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
+@@ -298,10 +298,7 @@ static void __gen8_ppgtt_alloc(struct i9
+ __i915_gem_object_pin_pages(pt->base);
+ i915_gem_object_make_unshrinkable(pt->base);
+
+- if (lvl ||
+- gen8_pt_count(*start, end) < I915_PDES ||
+- intel_vgpu_active(vm->i915))
+- fill_px(pt, vm->scratch[lvl]->encode);
++ fill_px(pt, vm->scratch[lvl]->encode);
+
+ spin_lock(&pd->lock);
+ if (likely(!pd->entry[idx])) {
--- /dev/null
+From a1c9ca5f65c9acfd7c02474b9d5cacbd7ea288df Mon Sep 17 00:00:00 2001
+From: Randy Dunlap <rdunlap@infradead.org>
+Date: Thu, 15 Jul 2021 11:55:31 -0700
+Subject: EDAC/igen6: fix core dependency AGAIN
+
+From: Randy Dunlap <rdunlap@infradead.org>
+
+commit a1c9ca5f65c9acfd7c02474b9d5cacbd7ea288df upstream.
+
+My previous patch had a typo/thinko which prevents this driver
+from being enabled: change X64_64 to X86_64.
+
+Fixes: 0a9ece9ba154 ("EDAC/igen6: fix core dependency")
+Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
+Cc: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
+Cc: linux-edac@vger.kernel.org
+Cc: bowsingbetee <bowsingbetee@protonmail.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Tony Luck <tony.luck@intel.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/edac/Kconfig | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/edac/Kconfig
++++ b/drivers/edac/Kconfig
+@@ -271,7 +271,7 @@ config EDAC_PND2
+ config EDAC_IGEN6
+ tristate "Intel client SoC Integrated MC"
+ depends on PCI && PCI_MMCONFIG && ARCH_HAVE_NMI_SAFE_CMPXCHG
+- depends on X64_64 && X86_MCE_INTEL
++ depends on X86_64 && X86_MCE_INTEL
+ help
+ Support for error detection and correction on the Intel
+ client SoC Integrated Memory Controller using In-Band ECC IP.
--- /dev/null
+From 0af778269a522c988ef0b4188556aba97fb420cc Mon Sep 17 00:00:00 2001
+From: Zhen Lei <thunder.leizhen@huawei.com>
+Date: Mon, 12 Jul 2021 16:55:44 +0800
+Subject: fbmem: Do not delete the mode that is still in use
+
+From: Zhen Lei <thunder.leizhen@huawei.com>
+
+commit 0af778269a522c988ef0b4188556aba97fb420cc upstream.
+
+The execution of fb_delete_videomode() is not based on the result of the
+previous fbcon_mode_deleted(). As a result, the mode is directly deleted,
+regardless of whether it is still in use, which may cause a UAF.
+
+==================================================================
+BUG: KASAN: use-after-free in fb_mode_is_equal+0x36e/0x5e0 \
+drivers/video/fbdev/core/modedb.c:924
+Read of size 4 at addr ffff88807e0ddb1c by task syz-executor.0/18962
+
+CPU: 2 PID: 18962 Comm: syz-executor.0 Not tainted 5.10.45-rc1+ #3
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ...
+Call Trace:
+ __dump_stack lib/dump_stack.c:77 [inline]
+ dump_stack+0x137/0x1be lib/dump_stack.c:118
+ print_address_description+0x6c/0x640 mm/kasan/report.c:385
+ __kasan_report mm/kasan/report.c:545 [inline]
+ kasan_report+0x13d/0x1e0 mm/kasan/report.c:562
+ fb_mode_is_equal+0x36e/0x5e0 drivers/video/fbdev/core/modedb.c:924
+ fbcon_mode_deleted+0x16a/0x220 drivers/video/fbdev/core/fbcon.c:2746
+ fb_set_var+0x1e1/0xdb0 drivers/video/fbdev/core/fbmem.c:975
+ do_fb_ioctl+0x4d9/0x6e0 drivers/video/fbdev/core/fbmem.c:1108
+ vfs_ioctl fs/ioctl.c:48 [inline]
+ __do_sys_ioctl fs/ioctl.c:753 [inline]
+ __se_sys_ioctl+0xfb/0x170 fs/ioctl.c:739
+ do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+Freed by task 18960:
+ kasan_save_stack mm/kasan/common.c:48 [inline]
+ kasan_set_track+0x3d/0x70 mm/kasan/common.c:56
+ kasan_set_free_info+0x17/0x30 mm/kasan/generic.c:355
+ __kasan_slab_free+0x108/0x140 mm/kasan/common.c:422
+ slab_free_hook mm/slub.c:1541 [inline]
+ slab_free_freelist_hook+0xd6/0x1a0 mm/slub.c:1574
+ slab_free mm/slub.c:3139 [inline]
+ kfree+0xca/0x3d0 mm/slub.c:4121
+ fb_delete_videomode+0x56a/0x820 drivers/video/fbdev/core/modedb.c:1104
+ fb_set_var+0x1f3/0xdb0 drivers/video/fbdev/core/fbmem.c:978
+ do_fb_ioctl+0x4d9/0x6e0 drivers/video/fbdev/core/fbmem.c:1108
+ vfs_ioctl fs/ioctl.c:48 [inline]
+ __do_sys_ioctl fs/ioctl.c:753 [inline]
+ __se_sys_ioctl+0xfb/0x170 fs/ioctl.c:739
+ do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+Fixes: 13ff178ccd6d ("fbcon: Call fbcon_mode_deleted/new_modelist directly")
+Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
+Cc: <stable@vger.kernel.org> # v5.3+
+Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
+Link: https://patchwork.freedesktop.org/patch/msgid/20210712085544.2828-1-thunder.leizhen@huawei.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/video/fbdev/core/fbmem.c | 12 +++++-------
+ 1 file changed, 5 insertions(+), 7 deletions(-)
+
+--- a/drivers/video/fbdev/core/fbmem.c
++++ b/drivers/video/fbdev/core/fbmem.c
+@@ -970,13 +970,11 @@ fb_set_var(struct fb_info *info, struct
+ fb_var_to_videomode(&mode2, &info->var);
+ /* make sure we don't delete the videomode of current var */
+ ret = fb_mode_is_equal(&mode1, &mode2);
+-
+- if (!ret)
+- fbcon_mode_deleted(info, &mode1);
+-
+- if (!ret)
+- fb_delete_videomode(&mode1, &info->modelist);
+-
++ if (!ret) {
++ ret = fbcon_mode_deleted(info, &mode1);
++ if (!ret)
++ fb_delete_videomode(&mode1, &info->modelist);
++ }
+
+ return ret ? -EINVAL : 0;
+ }
--- /dev/null
+From 9c6882608bce249a8918744ecdb65748534e3f17 Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Sat, 10 Jul 2021 02:45:59 +0100
+Subject: io_uring: use right task for exiting checks
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+commit 9c6882608bce249a8918744ecdb65748534e3f17 upstream.
+
+When we use delayed_work for fallback execution of requests, current
+will not be the submitter task, and so checks in io_req_task_submit()
+may not behave as expected. Currently, it leaves inline completions not
+flushed, making io_ring_exit_work() hang. Use the submitter task for
+all those checks.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/cb413c715bed0bc9c98b169059ea9c8a2c770715.1625881431.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io_uring.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -2023,7 +2023,7 @@ static void __io_req_task_submit(struct
+
+ /* ctx stays valid until unlock, even if we drop all ours ctx->refs */
+ mutex_lock(&ctx->uring_lock);
+- if (!(current->flags & PF_EXITING) && !current->in_execve)
++ if (!(req->task->flags & PF_EXITING) && !req->task->in_execve)
+ __io_queue_sqe(req);
+ else
+ __io_req_task_cancel(req, -EFAULT);
--- /dev/null
+From 474dd1c6506411752a9b2f2233eec11f1733a099 Mon Sep 17 00:00:00 2001
+From: Lu Baolu <baolu.lu@linux.intel.com>
+Date: Mon, 12 Jul 2021 15:17:12 +0800
+Subject: iommu/vt-d: Fix clearing real DMA device's scalable-mode context entries
+
+From: Lu Baolu <baolu.lu@linux.intel.com>
+
+commit 474dd1c6506411752a9b2f2233eec11f1733a099 upstream.
+
+Commit 2b0140c69637e ("iommu/vt-d: Use pci_real_dma_dev() for mapping")
+fixed an issue where, when a sub-device is removed, the context entry is
+cleared for all aliases. But that commit didn't consider the PASID entry
+and PASID table in VT-d scalable mode. This fix increases the coverage of
+scalable mode.
+
+Suggested-by: Sanjay Kumar <sanjay.k.kumar@intel.com>
+Fixes: 8038bdb855331 ("iommu/vt-d: Only clear real DMA device's context entries")
+Fixes: 2b0140c69637e ("iommu/vt-d: Use pci_real_dma_dev() for mapping")
+Cc: stable@vger.kernel.org # v5.6+
+Cc: Jon Derrick <jonathan.derrick@intel.com>
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Link: https://lore.kernel.org/r/20210712071712.3416949-1-baolu.lu@linux.intel.com
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/iommu/intel/iommu.c | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+--- a/drivers/iommu/intel/iommu.c
++++ b/drivers/iommu/intel/iommu.c
+@@ -4503,14 +4503,13 @@ static void __dmar_remove_one_dev_info(s
+ iommu = info->iommu;
+ domain = info->domain;
+
+- if (info->dev) {
++ if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
+ if (dev_is_pci(info->dev) && sm_supported(iommu))
+ intel_pasid_tear_down_entry(iommu, info->dev,
+ PASID_RID2PASID, false);
+
+ iommu_disable_dev_iotlb(info);
+- if (!dev_is_real_dma_subdevice(info->dev))
+- domain_context_clear(info);
++ domain_context_clear(info);
+ intel_pasid_free_table(info->dev);
+ }
+
--- /dev/null
+From 37764b952e1b39053defc7ebe5dcd8c4e3e78de9 Mon Sep 17 00:00:00 2001
+From: Sanjay Kumar <sanjay.k.kumar@intel.com>
+Date: Mon, 12 Jul 2021 15:13:15 +0800
+Subject: iommu/vt-d: Global devTLB flush when present context entry changed
+
+From: Sanjay Kumar <sanjay.k.kumar@intel.com>
+
+commit 37764b952e1b39053defc7ebe5dcd8c4e3e78de9 upstream.
+
+This fixes a bug in the context cache clear operation. The code was not
+following the correct invalidation flow: a global device TLB invalidation
+should be added after the IOTLB invalidation. At the same time, it uses
+the domain ID from the context entry, but in scalable mode the domain ID
+is in the PASID table entry, not the context entry.
+
+Fixes: 7373a8cc38197 ("iommu/vt-d: Setup context and enable RID2PASID support")
+Cc: stable@vger.kernel.org # v5.0+
+Signed-off-by: Sanjay Kumar <sanjay.k.kumar@intel.com>
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Link: https://lore.kernel.org/r/20210712071315.3416543-1-baolu.lu@linux.intel.com
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/iommu/intel/iommu.c | 31 ++++++++++++++++++++++---------
+ 1 file changed, 22 insertions(+), 9 deletions(-)
+
+--- a/drivers/iommu/intel/iommu.c
++++ b/drivers/iommu/intel/iommu.c
+@@ -2434,10 +2434,11 @@ __domain_mapping(struct dmar_domain *dom
+ return 0;
+ }
+
+-static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
++static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
+ {
+- unsigned long flags;
++ struct intel_iommu *iommu = info->iommu;
+ struct context_entry *context;
++ unsigned long flags;
+ u16 did_old;
+
+ if (!iommu)
+@@ -2449,7 +2450,16 @@ static void domain_context_clear_one(str
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ return;
+ }
+- did_old = context_domain_id(context);
++
++ if (sm_supported(iommu)) {
++ if (hw_pass_through && domain_type_is_si(info->domain))
++ did_old = FLPT_DEFAULT_DID;
++ else
++ did_old = info->domain->iommu_did[iommu->seq_id];
++ } else {
++ did_old = context_domain_id(context);
++ }
++
+ context_clear_entry(context);
+ __iommu_flush_cache(iommu, context, sizeof(*context));
+ spin_unlock_irqrestore(&iommu->lock, flags);
+@@ -2467,6 +2477,8 @@ static void domain_context_clear_one(str
+ 0,
+ 0,
+ DMA_TLB_DSI_FLUSH);
++
++ __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
+ }
+
+ static inline void unlink_domain_info(struct device_domain_info *info)
+@@ -4456,9 +4468,9 @@ out_free_dmar:
+
+ static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
+ {
+- struct intel_iommu *iommu = opaque;
++ struct device_domain_info *info = opaque;
+
+- domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
++ domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
+ return 0;
+ }
+
+@@ -4468,12 +4480,13 @@ static int domain_context_clear_one_cb(s
+ * devices, unbinding the driver from any one of them will possibly leave
+ * the others unable to operate.
+ */
+-static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
++static void domain_context_clear(struct device_domain_info *info)
+ {
+- if (!iommu || !dev || !dev_is_pci(dev))
++ if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
+ return;
+
+- pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
++ pci_for_each_dma_alias(to_pci_dev(info->dev),
++ &domain_context_clear_one_cb, info);
+ }
+
+ static void __dmar_remove_one_dev_info(struct device_domain_info *info)
+@@ -4497,7 +4510,7 @@ static void __dmar_remove_one_dev_info(s
+
+ iommu_disable_dev_iotlb(info);
+ if (!dev_is_real_dma_subdevice(info->dev))
+- domain_context_clear(iommu, info->dev);
++ domain_context_clear(info);
+ intel_pasid_free_table(info->dev);
+ }
+
--- /dev/null
+From d08af0a59684e18a51aa4bfd24c658994ea3fc5b Mon Sep 17 00:00:00 2001
+From: Joao Martins <joao.m.martins@oracle.com>
+Date: Wed, 14 Jul 2021 21:27:11 -0700
+Subject: mm/hugetlb: fix refs calculation from unaligned @vaddr
+
+From: Joao Martins <joao.m.martins@oracle.com>
+
+commit d08af0a59684e18a51aa4bfd24c658994ea3fc5b upstream.
+
+Commit 82e5d378b0e47 ("mm/hugetlb: refactor subpage recording")
+refactored the count of subpages but missed an edge case when @vaddr is
+not aligned to PAGE_SIZE, e.g. when close to vma->vm_end. It would then
+erroneously set @refs to 0 and record_subpages_vmas() wouldn't set the
+@pages array element to its value, consequently causing the null-deref
+reported by syzbot.
+
+Fix it by aligning down @vaddr by PAGE_SIZE in @refs calculation.
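+
+To illustrate the arithmetic with concrete numbers, here is a standalone
+sketch (PAGE_SIZE, PAGE_SHIFT and ALIGN_DOWN are re-defined locally, and
+only the term that changed is shown; the real code takes the min3() of
+this term, the remainder and the huge page bounds):
+
+  #include <stdio.h>
+
+  #define PAGE_SHIFT 12
+  #define PAGE_SIZE  (1UL << PAGE_SHIFT)
+  #define ALIGN_DOWN(x, a) ((x) & ~((unsigned long)(a) - 1))
+
+  int main(void)
+  {
+          unsigned long vm_end = 0x7f0000200000UL;  /* page aligned */
+          unsigned long vaddr  = vm_end - 100;      /* not page aligned */
+
+          /* old: (vm_end - vaddr) >> PAGE_SHIFT == 0, so @refs could be 0 */
+          printf("old refs term: %lu\n", (vm_end - vaddr) >> PAGE_SHIFT);
+
+          /* new: the partial page before vm_end still counts as one page */
+          printf("new refs term: %lu\n",
+                 (vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
+          return 0;
+  }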
+
+Link: https://lkml.kernel.org/r/20210713152440.28650-1-joao.m.martins@oracle.com
+Fixes: 82e5d378b0e47 ("mm/hugetlb: refactor subpage recording")
+Reported-by: syzbot+a3fcd59df1b372066f5a@syzkaller.appspotmail.com
+Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
+Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/hugetlb.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -5029,8 +5029,9 @@ long follow_hugetlb_page(struct mm_struc
+ continue;
+ }
+
+- refs = min3(pages_per_huge_page(h) - pfn_offset,
+- (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
++ /* vaddr may not be aligned to PAGE_SIZE */
++ refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
++ (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
+
+ if (pages || vmas)
+ record_subpages_vmas(mem_map_offset(page, pfn_offset),
--- /dev/null
+From 93aa71ad7379900e61c8adff6a710a4c18c7c99b Mon Sep 17 00:00:00 2001
+From: Tyrel Datwyler <tyreld@linux.ibm.com>
+Date: Thu, 1 Jul 2021 13:56:59 -0600
+Subject: scsi: core: Fix bad pointer dereference when ehandler kthread is invalid
+
+From: Tyrel Datwyler <tyreld@linux.ibm.com>
+
+commit 93aa71ad7379900e61c8adff6a710a4c18c7c99b upstream.
+
+Commit 66a834d09293 ("scsi: core: Fix error handling of scsi_host_alloc()")
+changed the allocation logic to call put_device() to perform host cleanup
+with the assumption that IDA removal and stopping the kthread would
+properly be performed in scsi_host_dev_release(). However, in the unlikely
+case that the error handler thread fails to spawn, shost->ehandler is set
+to ERR_PTR(-ENOMEM).
+
+The error handler cleanup code in scsi_host_dev_release() will call
+kthread_stop() if shost->ehandler != NULL, which will always be the case
+whether or not the kthread was successfully spawned. In the case that it
+failed to spawn, this has the nasty side effect of trying to dereference an
+invalid pointer when kthread_stop() is called. The following splat provides
+an example of this behavior in the wild:
+
+scsi host11: error handler thread failed to spawn, error = -4
+Kernel attempted to read user page (10c) - exploit attempt? (uid: 0)
+BUG: Kernel NULL pointer dereference on read at 0x0000010c
+Faulting instruction address: 0xc00000000818e9a8
+Oops: Kernel access of bad area, sig: 11 [#1]
+LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
+Modules linked in: ibmvscsi(+) scsi_transport_srp dm_multipath dm_mirror dm_region
+ hash dm_log dm_mod fuse overlay squashfs loop
+CPU: 12 PID: 274 Comm: systemd-udevd Not tainted 5.13.0-rc7 #1
+NIP: c00000000818e9a8 LR: c0000000089846e8 CTR: 0000000000007ee8
+REGS: c000000037d12ea0 TRAP: 0300 Not tainted (5.13.0-rc7)
+MSR: 800000000280b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> CR: 28228228
+XER: 20040001
+CFAR: c0000000089846e4 DAR: 000000000000010c DSISR: 40000000 IRQMASK: 0
+GPR00: c0000000089846e8 c000000037d13140 c000000009cc1100 fffffffffffffffc
+GPR04: 0000000000000001 0000000000000000 0000000000000000 c000000037dc0000
+GPR08: 0000000000000000 c000000037dc0000 0000000000000001 00000000fffff7ff
+GPR12: 0000000000008000 c00000000a049000 c000000037d13d00 000000011134d5a0
+GPR16: 0000000000001740 c0080000190d0000 c0080000190d1740 c000000009129288
+GPR20: c000000037d13bc0 0000000000000001 c000000037d13bc0 c0080000190b7898
+GPR24: c0080000190b7708 0000000000000000 c000000033bb2c48 0000000000000000
+GPR28: c000000046b28280 0000000000000000 000000000000010c fffffffffffffffc
+NIP [c00000000818e9a8] kthread_stop+0x38/0x230
+LR [c0000000089846e8] scsi_host_dev_release+0x98/0x160
+Call Trace:
+[c000000033bb2c48] 0xc000000033bb2c48 (unreliable)
+[c0000000089846e8] scsi_host_dev_release+0x98/0x160
+[c00000000891e960] device_release+0x60/0x100
+[c0000000087e55c4] kobject_release+0x84/0x210
+[c00000000891ec78] put_device+0x28/0x40
+[c000000008984ea4] scsi_host_alloc+0x314/0x430
+[c0080000190b38bc] ibmvscsi_probe+0x54/0xad0 [ibmvscsi]
+[c000000008110104] vio_bus_probe+0xa4/0x4b0
+[c00000000892a860] really_probe+0x140/0x680
+[c00000000892aefc] driver_probe_device+0x15c/0x200
+[c00000000892b63c] device_driver_attach+0xcc/0xe0
+[c00000000892b740] __driver_attach+0xf0/0x200
+[c000000008926f28] bus_for_each_dev+0xa8/0x130
+[c000000008929ce4] driver_attach+0x34/0x50
+[c000000008928fc0] bus_add_driver+0x1b0/0x300
+[c00000000892c798] driver_register+0x98/0x1a0
+[c00000000810eb60] __vio_register_driver+0x80/0xe0
+[c0080000190b4a30] ibmvscsi_module_init+0x9c/0xdc [ibmvscsi]
+[c0000000080121d0] do_one_initcall+0x60/0x2d0
+[c000000008261abc] do_init_module+0x7c/0x320
+[c000000008265700] load_module+0x2350/0x25b0
+[c000000008265cb4] __do_sys_finit_module+0xd4/0x160
+[c000000008031110] system_call_exception+0x150/0x2d0
+[c00000000800d35c] system_call_common+0xec/0x278
+
+Fix this by nulling shost->ehandler when the kthread fails to spawn.
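+
+For background, ERR_PTR(-ENOMEM) is a non-NULL encoded pointer, so a plain
+NULL check cannot distinguish it from a valid task pointer. A minimal
+userspace sketch, with the kernel's ERR_PTR()/IS_ERR() encoding re-defined
+locally for illustration:
+
+  #include <errno.h>
+  #include <stdio.h>
+
+  #define MAX_ERRNO 4095
+  #define ERR_PTR(err) ((void *)(long)(err))
+  #define IS_ERR(ptr)  ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)
+
+  int main(void)
+  {
+          /* what a failed spawn leaves in shost->ehandler, per the above */
+          void *ehandler = ERR_PTR(-ENOMEM);
+
+          /* non-NULL, so a bare "if (ehandler)" check happily passes it on */
+          printf("ehandler != NULL: %d\n", ehandler != NULL);
+          /* the check that tells an encoded errno apart from a real pointer */
+          printf("IS_ERR(ehandler): %d\n", (int)IS_ERR(ehandler));
+          return 0;
+  }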
+
+Link: https://lore.kernel.org/r/20210701195659.3185475-1-tyreld@linux.ibm.com
+Fixes: 66a834d09293 ("scsi: core: Fix error handling of scsi_host_alloc()")
+Cc: stable@vger.kernel.org
+Reviewed-by: Ming Lei <ming.lei@redhat.com>
+Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/scsi/hosts.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/scsi/hosts.c
++++ b/drivers/scsi/hosts.c
+@@ -490,6 +490,7 @@ struct Scsi_Host *scsi_host_alloc(struct
+ shost_printk(KERN_WARNING, shost,
+ "error handler thread failed to spawn, error = %ld\n",
+ PTR_ERR(shost->ehandler));
++ shost->ehandler = NULL;
+ goto fail;
+ }
+
--- /dev/null
+From 8b3bdd99c092bbaeaa7d9eecb1a3e5dc9112002b Mon Sep 17 00:00:00 2001
+From: Steffen Maier <maier@linux.ibm.com>
+Date: Fri, 2 Jul 2021 18:09:22 +0200
+Subject: scsi: zfcp: Report port fc_security as unknown early during remote cable pull
+
+From: Steffen Maier <maier@linux.ibm.com>
+
+commit 8b3bdd99c092bbaeaa7d9eecb1a3e5dc9112002b upstream.
+
+On remote cable pull, a zfcp_port keeps its status and only gets
+ZFCP_STATUS_PORT_LINK_TEST added. Only after an ADISC timeout would we
+actually start port recovery and remove ZFCP_STATUS_COMMON_UNBLOCKED, which
+zfcp_sysfs_port_fc_security_show() detects and reports as "unknown"
+instead of the old and possibly stale zfcp_port->connection_info.
+
+Add check for ZFCP_STATUS_PORT_LINK_TEST for timely "unknown" report.
+
+Link: https://lore.kernel.org/r/20210702160922.2667874-1-maier@linux.ibm.com
+Fixes: a17c78460093 ("scsi: zfcp: report FC Endpoint Security in sysfs")
+Cc: <stable@vger.kernel.org> #5.7+
+Reviewed-by: Benjamin Block <bblock@linux.ibm.com>
+Signed-off-by: Steffen Maier <maier@linux.ibm.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/s390/scsi/zfcp_sysfs.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/s390/scsi/zfcp_sysfs.c
++++ b/drivers/s390/scsi/zfcp_sysfs.c
+@@ -487,6 +487,7 @@ static ssize_t zfcp_sysfs_port_fc_securi
+ if (0 == (status & ZFCP_STATUS_COMMON_OPEN) ||
+ 0 == (status & ZFCP_STATUS_COMMON_UNBLOCKED) ||
+ 0 == (status & ZFCP_STATUS_PORT_PHYS_OPEN) ||
++ 0 != (status & ZFCP_STATUS_PORT_LINK_TEST) ||
+ 0 != (status & ZFCP_STATUS_COMMON_ERP_FAILED) ||
+ 0 != (status & ZFCP_STATUS_COMMON_ACCESS_BOXED))
+ i = sprintf(buf, "unknown\n");
kvm-x86-mmu-do-not-apply-hpa-memory-encryption-mask-to-gpas.patch
kvm-nsvm-check-the-value-written-to-msr_vm_hsave_pa.patch
kvm-x86-disable-hardware-breakpoints-unconditionally-before-kvm_x86-run.patch
+scsi-core-fix-bad-pointer-dereference-when-ehandler-kthread-is-invalid.patch
+scsi-zfcp-report-port-fc_security-as-unknown-early-during-remote-cable-pull.patch
+iommu-vt-d-global-devtlb-flush-when-present-context-entry-changed.patch
+iommu-vt-d-fix-clearing-real-dma-device-s-scalable-mode-context-entries.patch
+tracing-do-not-reference-char-as-a-string-in-histograms.patch
+drm-amdgpu-add-another-renoir-did.patch
+drm-i915-gtt-drop-the-page-table-optimisation.patch
+drm-i915-gt-fix-edeadlk-handling-regression.patch
+cgroup-verify-that-source-is-a-string.patch
+fbmem-do-not-delete-the-mode-that-is-still-in-use.patch
+edac-igen6-fix-core-dependency-again.patch
+mm-hugetlb-fix-refs-calculation-from-unaligned-vaddr.patch
+arm64-avoid-premature-usercopy-failure.patch
+io_uring-use-right-task-for-exiting-checks.patch
+btrfs-properly-split-extent_map-for-req_op_zone_append.patch
+btrfs-fix-deadlock-with-concurrent-chunk-allocations-involving-system-chunks.patch
+btrfs-rework-chunk-allocation-to-avoid-exhaustion-of-the-system-chunk-array.patch
+btrfs-zoned-fix-wrong-mutex-unlock-on-failure-to-allocate-log-root-tree.patch
--- /dev/null
+From 704adfb5a9978462cd861f170201ae2b5e3d3a80 Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
+Date: Thu, 15 Jul 2021 00:02:06 -0400
+Subject: tracing: Do not reference char * as a string in histograms
+
+From: Steven Rostedt (VMware) <rostedt@goodmis.org>
+
+commit 704adfb5a9978462cd861f170201ae2b5e3d3a80 upstream.
+
+The histogram logic was allowing events with char * pointers to be used as
+normal strings. But it was easy to crash the kernel with:
+
+ # echo 'hist:keys=filename' > events/syscalls/sys_enter_openat/trigger
+
+And open some files, and boom!
+
+ BUG: unable to handle page fault for address: 00007f2ced0c3280
+ #PF: supervisor read access in kernel mode
+ #PF: error_code(0x0000) - not-present page
+ PGD 1173fa067 P4D 1173fa067 PUD 1171b6067 PMD 1171dd067 PTE 0
+ Oops: 0000 [#1] PREEMPT SMP
+ CPU: 6 PID: 1810 Comm: cat Not tainted 5.13.0-rc5-test+ #61
+ Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01
+v03.03 07/14/2016
+ RIP: 0010:strlen+0x0/0x20
+ Code: f6 82 80 2a 0b a9 20 74 11 0f b6 50 01 48 83 c0 01 f6 82 80 2a 0b
+a9 20 75 ef c3 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 <80> 3f 00 74
+10 48 89 f8 48 83 c0 01 80 38 00 75 f7 48 29 f8 c3
+
+ RSP: 0018:ffffbdbf81567b50 EFLAGS: 00010246
+ RAX: 0000000000000003 RBX: ffff93815cdb3800 RCX: ffff9382401a22d0
+ RDX: 0000000000000100 RSI: 0000000000000000 RDI: 00007f2ced0c3280
+ RBP: 0000000000000100 R08: ffff9382409ff074 R09: ffffbdbf81567c98
+ R10: ffff9382409ff074 R11: 0000000000000000 R12: ffff9382409ff074
+ R13: 0000000000000001 R14: ffff93815a744f00 R15: 00007f2ced0c3280
+ FS: 00007f2ced0f8580(0000) GS:ffff93825a800000(0000)
+knlGS:0000000000000000
+ CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ CR2: 00007f2ced0c3280 CR3: 0000000107069005 CR4: 00000000001706e0
+ Call Trace:
+ event_hist_trigger+0x463/0x5f0
+ ? find_held_lock+0x32/0x90
+ ? sched_clock_cpu+0xe/0xd0
+ ? lock_release+0x155/0x440
+ ? kernel_init_free_pages+0x6d/0x90
+ ? preempt_count_sub+0x9b/0xd0
+ ? kernel_init_free_pages+0x6d/0x90
+ ? get_page_from_freelist+0x12c4/0x1680
+ ? __rb_reserve_next+0xe5/0x460
+ ? ring_buffer_lock_reserve+0x12a/0x3f0
+ event_triggers_call+0x52/0xe0
+ ftrace_syscall_enter+0x264/0x2c0
+ syscall_trace_enter.constprop.0+0x1ee/0x210
+ do_syscall_64+0x1c/0x80
+ entry_SYSCALL_64_after_hwframe+0x44/0xae
+
+Where it triggered a fault on strlen(key) where key was the filename.
+
+The reason is that filename is a char * to user space, and the histogram
+code just blindly dereferenced it, with obvious bad results.
+
+I originally tried to use strncpy_from_user/kernel_nofault() but found
+that there are other places where it's dereferenced, so it's not worth the
+effort.
+
+Just do not allow "char *" to act like strings.
+
+Link: https://lkml.kernel.org/r/20210715000206.025df9d2@rorschach.local.home
+
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Tzvetomir Stoyanov <tz.stoyanov@gmail.com>
+Cc: stable@vger.kernel.org
+Acked-by: Namhyung Kim <namhyung@kernel.org>
+Acked-by: Tom Zanussi <zanussi@kernel.org>
+Fixes: 79e577cbce4c4 ("tracing: Support string type key properly")
+Fixes: 5967bd5c4239 ("tracing: Let filter_assign_type() detect FILTER_PTR_STRING")
+Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/trace_events_hist.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/kernel/trace/trace_events_hist.c
++++ b/kernel/trace/trace_events_hist.c
+@@ -1673,7 +1673,9 @@ static struct hist_field *create_hist_fi
+ if (WARN_ON_ONCE(!field))
+ goto out;
+
+- if (is_string_field(field)) {
++ /* Pointers to strings are just pointers and dangerous to dereference */
++ if (is_string_field(field) &&
++ (field->filter_type != FILTER_PTR_STRING)) {
+ flags |= HIST_FIELD_FL_STRING;
+
+ hist_field->size = MAX_FILTER_STR_VAL;
+@@ -4469,8 +4471,6 @@ static inline void add_to_key(char *comp
+ field = key_field->field;
+ if (field->filter_type == FILTER_DYN_STRING)
+ size = *(u32 *)(rec + field->offset) >> 16;
+- else if (field->filter_type == FILTER_PTR_STRING)
+- size = strlen(key);
+ else if (field->filter_type == FILTER_STATIC_STRING)
+ size = field->size;
+