--- /dev/null
+From 295cf156231ca3f9e3a66bde7fab5e09c41835e0 Mon Sep 17 00:00:00 2001
+From: Robin Murphy <robin.murphy@arm.com>
+Date: Mon, 12 Jul 2021 15:27:46 +0100
+Subject: arm64: Avoid premature usercopy failure
+
+From: Robin Murphy <robin.murphy@arm.com>
+
+commit 295cf156231ca3f9e3a66bde7fab5e09c41835e0 upstream.
+
+Al reminds us that the usercopy API must only return complete failure
+if absolutely nothing could be copied. Currently, if userspace does
+something silly like giving us an unaligned pointer to Device memory,
+or a size which overruns MTE tag bounds, we may fail to honour that
+requirement when faulting on a multi-byte access even though a smaller
+access could have succeeded.
+
+Add a mitigation to the fixup routines to fall back to a single-byte
+copy if we faulted on a larger access before anything has been written
+to the destination, to guarantee making *some* forward progress. We
+needn't be too concerned about the overall performance since this should
+only occur when callers are doing something a bit dodgy in the first
+place. Particularly broken userspace might still be able to trick
+generic_perform_write() into an infinite loop by targeting write() at
+an mmap() of some read-only device register where the fault-in load
+succeeds but any store synchronously aborts such that copy_to_user() is
+genuinely unable to make progress, but, well, don't do that...
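+
+A minimal userspace sketch of the contract being restored (illustration
+only, not kernel code; the fault boundary and helper names are made up):
+an access wider than one byte that crosses the boundary fails, but when
+nothing has been copied yet we retry a single byte, so a caller that
+loops on partial progress keeps advancing right up to the boundary.
+
+#include <stdio.h>
+#include <string.h>
+
+/* Model of the fixup: a wide access crossing 'fault_at' faults; if that
+ * happens before any byte was copied, fall back to a single-byte copy. */
+static size_t toy_copy(char *dst, const char *src, size_t off, size_t len,
+		       size_t fault_at)
+{
+	size_t done = 0;
+
+	while (done < len) {
+		size_t width = (len - done >= 8) ? 8 : 1;
+
+		if (off + done + width > fault_at) {
+			if (done == 0 && off < fault_at) {
+				dst[done] = src[done];	/* try harder */
+				done++;
+			}
+			break;
+		}
+		memcpy(dst + done, src + done, width);
+		done += width;
+	}
+	return len - done;			/* bytes not copied */
+}
+
+int main(void)
+{
+	char src[32] = "0123456789abcdef0123456789abcde", dst[32] = { 0 };
+	const size_t fault_at = 13;		/* e.g. an MTE tag boundary */
+	size_t off = 0, left = sizeof(src);
+
+	while (left) {		/* generic_perform_write()-style retry loop */
+		size_t not_copied = toy_copy(dst + off, src + off, off, left, fault_at);
+		size_t copied = left - not_copied;
+
+		if (!copied)		/* genuinely no progress possible */
+			break;
+		off += copied;
+		left -= copied;
+	}
+	printf("stopped after copying %zu of %zu bytes\n", off, sizeof(src));
+	return 0;
+}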
+
+CC: stable@vger.kernel.org
+Reported-by: Chen Huang <chenhuang5@huawei.com>
+Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
+Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Robin Murphy <robin.murphy@arm.com>
+Link: https://lore.kernel.org/r/dc03d5c675731a1f24a62417dba5429ad744234e.1626098433.git.robin.murphy@arm.com
+Signed-off-by: Will Deacon <will@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/lib/copy_from_user.S | 13 ++++++++++---
+ arch/arm64/lib/copy_in_user.S | 21 ++++++++++++++-------
+ arch/arm64/lib/copy_to_user.S | 14 +++++++++++---
+ 3 files changed, 35 insertions(+), 13 deletions(-)
+
+--- a/arch/arm64/lib/copy_from_user.S
++++ b/arch/arm64/lib/copy_from_user.S
+@@ -29,7 +29,7 @@
+ .endm
+
+ .macro ldrh1 reg, ptr, val
+- user_ldst 9998f, ldtrh, \reg, \ptr, \val
++ user_ldst 9997f, ldtrh, \reg, \ptr, \val
+ .endm
+
+ .macro strh1 reg, ptr, val
+@@ -37,7 +37,7 @@
+ .endm
+
+ .macro ldr1 reg, ptr, val
+- user_ldst 9998f, ldtr, \reg, \ptr, \val
++ user_ldst 9997f, ldtr, \reg, \ptr, \val
+ .endm
+
+ .macro str1 reg, ptr, val
+@@ -45,7 +45,7 @@
+ .endm
+
+ .macro ldp1 reg1, reg2, ptr, val
+- user_ldp 9998f, \reg1, \reg2, \ptr, \val
++ user_ldp 9997f, \reg1, \reg2, \ptr, \val
+ .endm
+
+ .macro stp1 reg1, reg2, ptr, val
+@@ -53,8 +53,10 @@
+ .endm
+
+ end .req x5
++srcin .req x15
+ SYM_FUNC_START(__arch_copy_from_user)
+ add end, x0, x2
++ mov srcin, x1
+ #include "copy_template.S"
+ mov x0, #0 // Nothing to copy
+ ret
+@@ -63,6 +65,11 @@ EXPORT_SYMBOL(__arch_copy_from_user)
+
+ .section .fixup,"ax"
+ .align 2
++9997: cmp dst, dstin
++ b.ne 9998f
++ // Before being absolutely sure we couldn't copy anything, try harder
++USER(9998f, ldtrb tmp1w, [srcin])
++ strb tmp1w, [dst], #1
+ 9998: sub x0, end, dst // bytes not copied
+ ret
+ .previous
+--- a/arch/arm64/lib/copy_in_user.S
++++ b/arch/arm64/lib/copy_in_user.S
+@@ -30,33 +30,34 @@
+ .endm
+
+ .macro ldrh1 reg, ptr, val
+- user_ldst 9998f, ldtrh, \reg, \ptr, \val
++ user_ldst 9997f, ldtrh, \reg, \ptr, \val
+ .endm
+
+ .macro strh1 reg, ptr, val
+- user_ldst 9998f, sttrh, \reg, \ptr, \val
++ user_ldst 9997f, sttrh, \reg, \ptr, \val
+ .endm
+
+ .macro ldr1 reg, ptr, val
+- user_ldst 9998f, ldtr, \reg, \ptr, \val
++ user_ldst 9997f, ldtr, \reg, \ptr, \val
+ .endm
+
+ .macro str1 reg, ptr, val
+- user_ldst 9998f, sttr, \reg, \ptr, \val
++ user_ldst 9997f, sttr, \reg, \ptr, \val
+ .endm
+
+ .macro ldp1 reg1, reg2, ptr, val
+- user_ldp 9998f, \reg1, \reg2, \ptr, \val
++ user_ldp 9997f, \reg1, \reg2, \ptr, \val
+ .endm
+
+ .macro stp1 reg1, reg2, ptr, val
+- user_stp 9998f, \reg1, \reg2, \ptr, \val
++ user_stp 9997f, \reg1, \reg2, \ptr, \val
+ .endm
+
+ end .req x5
+-
++srcin .req x15
+ SYM_FUNC_START(__arch_copy_in_user)
+ add end, x0, x2
++ mov srcin, x1
+ #include "copy_template.S"
+ mov x0, #0
+ ret
+@@ -65,6 +66,12 @@ EXPORT_SYMBOL(__arch_copy_in_user)
+
+ .section .fixup,"ax"
+ .align 2
++9997: cmp dst, dstin
++ b.ne 9998f
++ // Before being absolutely sure we couldn't copy anything, try harder
++USER(9998f, ldtrb tmp1w, [srcin])
++USER(9998f, sttrb tmp1w, [dst])
++ add dst, dst, #1
+ 9998: sub x0, end, dst // bytes not copied
+ ret
+ .previous
+--- a/arch/arm64/lib/copy_to_user.S
++++ b/arch/arm64/lib/copy_to_user.S
+@@ -32,7 +32,7 @@
+ .endm
+
+ .macro strh1 reg, ptr, val
+- user_ldst 9998f, sttrh, \reg, \ptr, \val
++ user_ldst 9997f, sttrh, \reg, \ptr, \val
+ .endm
+
+ .macro ldr1 reg, ptr, val
+@@ -40,7 +40,7 @@
+ .endm
+
+ .macro str1 reg, ptr, val
+- user_ldst 9998f, sttr, \reg, \ptr, \val
++ user_ldst 9997f, sttr, \reg, \ptr, \val
+ .endm
+
+ .macro ldp1 reg1, reg2, ptr, val
+@@ -48,12 +48,14 @@
+ .endm
+
+ .macro stp1 reg1, reg2, ptr, val
+- user_stp 9998f, \reg1, \reg2, \ptr, \val
++ user_stp 9997f, \reg1, \reg2, \ptr, \val
+ .endm
+
+ end .req x5
++srcin .req x15
+ SYM_FUNC_START(__arch_copy_to_user)
+ add end, x0, x2
++ mov srcin, x1
+ #include "copy_template.S"
+ mov x0, #0
+ ret
+@@ -62,6 +64,12 @@ EXPORT_SYMBOL(__arch_copy_to_user)
+
+ .section .fixup,"ax"
+ .align 2
++9997: cmp dst, dstin
++ b.ne 9998f
++ // Before being absolutely sure we couldn't copy anything, try harder
++ ldrb tmp1w, [srcin]
++USER(9998f, sttrb tmp1w, [dst])
++ add dst, dst, #1
+ 9998: sub x0, end, dst // bytes not copied
+ ret
+ .previous
--- /dev/null
+From 1cb3db1cf383a3c7dbda1aa0ce748b0958759947 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 29 Jun 2021 14:43:05 +0100
+Subject: btrfs: fix deadlock with concurrent chunk allocations involving system chunks
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 1cb3db1cf383a3c7dbda1aa0ce748b0958759947 upstream.
+
+When a task attempting to allocate a new chunk verifies that there is not
+currently enough free space in the system space_info and there is another
+task that allocated a new system chunk but has not yet finished the
+creation of the respective block group, it waits for that other task to
+finish creating the block group. This is to avoid exhaustion of the system
+chunk array in the superblock, which is limited, when we have a thundering
+herd of tasks allocating new chunks. This problem was described and fixed
+by commit eafa4fd0ad0607 ("btrfs: fix exhaustion of the system chunk array
+due to concurrent allocations").
+
+However there are two very similar scenarios where this can lead to a
+deadlock:
+
+1) Task B allocated a new system chunk and task A is waiting on task B
+ to finish creation of the respective system block group. However before
+ task B ends its transaction handle and finishes the creation of the
+ system block group, it attempts to allocate another chunk (like a data
+   chunk for a fallocate operation on a very large range). Task B will
+ be unable to progress and allocate the new chunk, because task A set
+ space_info->chunk_alloc to 1 and therefore it loops at
+ btrfs_chunk_alloc() waiting for task A to finish its chunk allocation
+ and set space_info->chunk_alloc to 0, but task A is waiting on task B
+ to finish creation of the new system block group, therefore resulting
+ in a deadlock;
+
+2) Task B allocated a new system chunk and task A is waiting on task B to
+ finish creation of the respective system block group. By the time that
+   task B enters the final phase of block group allocation, which happens
+ at btrfs_create_pending_block_groups(), when it modifies the extent
+ tree, the device tree or the chunk tree to insert the items for some
+ new block group, it needs to allocate a new chunk, so it ends up at
+ btrfs_chunk_alloc() and keeps looping there because task A has set
+ space_info->chunk_alloc to 1, but task A is waiting for task B to
+ finish creation of the new system block group and release the reserved
+ system space, therefore resulting in a deadlock.
+
+In short, the problem is if a task B needs to allocate a new chunk after
+it previously allocated a new system chunk and if another task A is
+currently waiting for task B to complete the allocation of the new system
+chunk.
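+
+A single-threaded model of scenario 1, stepping both tasks until neither
+can make progress (names mirror the description above and are purely
+illustrative, not kernel code):
+
+#include <stdbool.h>
+#include <stdio.h>
+
+int main(void)
+{
+	bool chunk_alloc = true;	/* set by task A in btrfs_chunk_alloc() */
+	bool sys_bg_done = false;	/* task B's pending system block group  */
+	int step;
+
+	for (step = 0; step < 10; step++) {
+		bool progress = false;
+
+		/* Task A: waits for B to finish creating its system block
+		 * group before clearing space_info->chunk_alloc. */
+		if (sys_bg_done) {
+			chunk_alloc = false;
+			progress = true;
+		}
+		/* Task B: must allocate another chunk first, but loops at
+		 * btrfs_chunk_alloc() while chunk_alloc is set by task A. */
+		if (!chunk_alloc) {
+			sys_bg_done = true;
+			progress = true;
+		}
+		if (!progress) {
+			printf("no progress possible: A waits on B, B waits on A\n");
+			return 0;
+		}
+	}
+	return 0;
+}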
+
+Unfortunately this deadlock scenario introduced by the previous fix for
+the system chunk array exhaustion problem does not have a simple and short
+fix, and requires a big change to rework the chunk allocation code so that
+chunk btree updates are all made in the first phase of chunk allocation.
+And since this deadlock regression is being frequently hit on zoned
+filesystems and the system chunk array exhaustion problem is triggered
+in more extreme cases (originally observed on PowerPC with a node size
+of 64K when running the fallocate tests from stress-ng), revert the
+changes from that commit. The next patch in the series, with the subject
+"btrfs: rework chunk allocation to avoid exhaustion of the system
+chunk array", makes the necessary changes to fix the system chunk array
+exhaustion problem.
+
+Reported-by: Naohiro Aota <naohiro.aota@wdc.com>
+Link: https://lore.kernel.org/linux-btrfs/20210621015922.ewgbffxuawia7liz@naota-xeon/
+Fixes: eafa4fd0ad0607 ("btrfs: fix exhaustion of the system chunk array due to concurrent allocations")
+CC: stable@vger.kernel.org # 5.12+
+Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
+Tested-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Tested-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/block-group.c | 58 -------------------------------------------------
+ fs/btrfs/transaction.c | 5 ----
+ fs/btrfs/transaction.h | 7 -----
+ 3 files changed, 1 insertion(+), 69 deletions(-)
+
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -3269,7 +3269,6 @@ static u64 get_profile_num_devs(struct b
+ */
+ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+ {
+- struct btrfs_transaction *cur_trans = trans->transaction;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_space_info *info;
+ u64 left;
+@@ -3284,7 +3283,6 @@ void check_system_chunk(struct btrfs_tra
+ lockdep_assert_held(&fs_info->chunk_mutex);
+
+ info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+-again:
+ spin_lock(&info->lock);
+ left = info->total_bytes - btrfs_space_info_used(info, true);
+ spin_unlock(&info->lock);
+@@ -3303,58 +3301,6 @@ again:
+
+ if (left < thresh) {
+ u64 flags = btrfs_system_alloc_profile(fs_info);
+- u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved);
+-
+- /*
+- * If there's not available space for the chunk tree (system
+- * space) and there are other tasks that reserved space for
+- * creating a new system block group, wait for them to complete
+- * the creation of their system block group and release excess
+- * reserved space. We do this because:
+- *
+- * *) We can end up allocating more system chunks than necessary
+- * when there are multiple tasks that are concurrently
+- * allocating block groups, which can lead to exhaustion of
+- * the system array in the superblock;
+- *
+- * *) If we allocate extra and unnecessary system block groups,
+- * despite being empty for a long time, and possibly forever,
+- * they end not being added to the list of unused block groups
+- * because that typically happens only when deallocating the
+- * last extent from a block group - which never happens since
+- * we never allocate from them in the first place. The few
+- * exceptions are when mounting a filesystem or running scrub,
+- * which add unused block groups to the list of unused block
+- * groups, to be deleted by the cleaner kthread.
+- * And even when they are added to the list of unused block
+- * groups, it can take a long time until they get deleted,
+- * since the cleaner kthread might be sleeping or busy with
+- * other work (deleting subvolumes, running delayed iputs,
+- * defrag scheduling, etc);
+- *
+- * This is rare in practice, but can happen when too many tasks
+- * are allocating blocks groups in parallel (via fallocate())
+- * and before the one that reserved space for a new system block
+- * group finishes the block group creation and releases the space
+- * reserved in excess (at btrfs_create_pending_block_groups()),
+- * other tasks end up here and see free system space temporarily
+- * not enough for updating the chunk tree.
+- *
+- * We unlock the chunk mutex before waiting for such tasks and
+- * lock it again after the wait, otherwise we would deadlock.
+- * It is safe to do so because allocating a system chunk is the
+- * first thing done while allocating a new block group.
+- */
+- if (reserved > trans->chunk_bytes_reserved) {
+- const u64 min_needed = reserved - thresh;
+-
+- mutex_unlock(&fs_info->chunk_mutex);
+- wait_event(cur_trans->chunk_reserve_wait,
+- atomic64_read(&cur_trans->chunk_bytes_reserved) <=
+- min_needed);
+- mutex_lock(&fs_info->chunk_mutex);
+- goto again;
+- }
+
+ /*
+ * Ignore failure to create system chunk. We might end up not
+@@ -3369,10 +3315,8 @@ again:
+ ret = btrfs_block_rsv_add(fs_info->chunk_root,
+ &fs_info->chunk_block_rsv,
+ thresh, BTRFS_RESERVE_NO_FLUSH);
+- if (!ret) {
+- atomic64_add(thresh, &cur_trans->chunk_bytes_reserved);
++ if (!ret)
+ trans->chunk_bytes_reserved += thresh;
+- }
+ }
+ }
+
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -260,7 +260,6 @@ static inline int extwriter_counter_read
+ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
+ {
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+- struct btrfs_transaction *cur_trans = trans->transaction;
+
+ if (!trans->chunk_bytes_reserved)
+ return;
+@@ -269,8 +268,6 @@ void btrfs_trans_release_chunk_metadata(
+
+ btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
+ trans->chunk_bytes_reserved, NULL);
+- atomic64_sub(trans->chunk_bytes_reserved, &cur_trans->chunk_bytes_reserved);
+- cond_wake_up(&cur_trans->chunk_reserve_wait);
+ trans->chunk_bytes_reserved = 0;
+ }
+
+@@ -386,8 +383,6 @@ loop:
+ spin_lock_init(&cur_trans->dropped_roots_lock);
+ INIT_LIST_HEAD(&cur_trans->releasing_ebs);
+ spin_lock_init(&cur_trans->releasing_ebs_lock);
+- atomic64_set(&cur_trans->chunk_bytes_reserved, 0);
+- init_waitqueue_head(&cur_trans->chunk_reserve_wait);
+ list_add_tail(&cur_trans->list, &fs_info->trans_list);
+ extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
+ IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
+--- a/fs/btrfs/transaction.h
++++ b/fs/btrfs/transaction.h
+@@ -96,13 +96,6 @@ struct btrfs_transaction {
+
+ spinlock_t releasing_ebs_lock;
+ struct list_head releasing_ebs;
+-
+- /*
+- * The number of bytes currently reserved, by all transaction handles
+- * attached to this transaction, for metadata extents of the chunk tree.
+- */
+- atomic64_t chunk_bytes_reserved;
+- wait_queue_head_t chunk_reserve_wait;
+ };
+
+ #define __TRANS_FREEZABLE (1U << 0)
--- /dev/null
+From abb99cfdaf0759f8a619e5fecf52ccccdf310c8c Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Mon, 28 Jun 2021 17:57:28 +0900
+Subject: btrfs: properly split extent_map for REQ_OP_ZONE_APPEND
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit abb99cfdaf0759f8a619e5fecf52ccccdf310c8c upstream.
+
+Damien reported a test failure with btrfs/209. The test itself ran fine,
+but the fsck run afterwards reported a corrupted filesystem.
+
+The filesystem corruption happens because we're splitting an extent and
+then writing the extent twice. We have to split the extent though, because
+we're creating extents that are too large for a REQ_OP_ZONE_APPEND operation.
+
+When dumping the extent tree, we can see two EXTENT_ITEMs at the same
+start address but different lengths.
+
+$ btrfs inspect dump-tree /dev/nullb1 -t extent
+...
+ item 19 key (269484032 EXTENT_ITEM 126976) itemoff 15470 itemsize 53
+ refs 1 gen 7 flags DATA
+ extent data backref root FS_TREE objectid 257 offset 786432 count 1
+ item 20 key (269484032 EXTENT_ITEM 262144) itemoff 15417 itemsize 53
+ refs 1 gen 7 flags DATA
+ extent data backref root FS_TREE objectid 257 offset 786432 count 1
+
+The duplicated EXTENT_ITEMs originally come from a wrongly split
+extent_map in extract_ordered_extent(). Since extract_ordered_extent() uses
+create_io_em() to split an existing extent_map, we will have
+split->orig_start != split->start. Then, it will be logged with a non-zero
+"extent data offset". Finally, the logged entries are replayed into
+a duplicated EXTENT_ITEM.
+
+Introduce and use a proper splitting function for extent_map. The function
+is intended to be simple and specific to extract_ordered_extent(), e.g. it
+does not support the compression case (we do not allow splitting a
+compressed extent_map anyway).
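+
+The range bookkeeping of that split, mirrored in plain userspace C with
+made-up sample values (illustration only, not the kernel helper):
+
+#include <stdio.h>
+
+int main(void)
+{
+	/* extent_map covers [start, start + len); 'pre' bytes sit before
+	 * the bio and 'post' bytes after it */
+	unsigned long long start = 786432, len = 393216;
+	unsigned long long pre = 131072, post = 135168;
+	unsigned long long pre_len = pre ? pre : len - post;
+
+	printf("pre : [%llu, %llu)\n", start, start + pre_len);
+	if (pre)
+		printf("mid : [%llu, %llu)\n", start + pre, start + len - post);
+	if (post)
+		printf("post: [%llu, %llu)\n", start + len - post, start + len);
+	return 0;
+}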
+
+There was a question raised by Qu, in summary why we want to split the
+extent map (and not the bio):
+
+The problem is not the limit on the zone end, which as you mention is
+the same as the block group end. The problem is that data writes use zone
+append (ZA) operations. ZA BIOs cannot be split, so a large extent may
+need to be processed with multiple ZA BIOs. While that is also true for
+regular writes, the major difference is that ZA operations are "nameless"
+writes that give back the written sectors on completion. And ZA
+operations may be reordered by the block layer (not intentionally
+though). Combine both of these characteristics and you can see that the
+data for a large extent may end up being shuffled when written, resulting
+in data corruption and making it impossible to map the extent to a single
+start sector.
+
+To avoid this problem, zoned btrfs uses the principle "one data extent
+== one ZA BIO". So large extents need to be split. This is unfortunate,
+but we can revisit this later and optimize, e.g. merge back together the
+fragments of an extent once written if they actually were written
+sequentially in the zone.
+
+Reported-by: Damien Le Moal <damien.lemoal@wdc.com>
+Fixes: d22002fd37bd ("btrfs: zoned: split ordered extent when bio is sent")
+CC: stable@vger.kernel.org # 5.12+
+CC: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/inode.c | 147 ++++++++++++++++++++++++++++++++++++++++++++-----------
+ 1 file changed, 118 insertions(+), 29 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -2271,13 +2271,127 @@ bool btrfs_bio_fits_in_ordered_extent(st
+ return ret;
+ }
+
++/*
++ * Split an extent_map at [start, start + len]
++ *
++ * This function is intended to be used only for extract_ordered_extent().
++ */
++static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
++ u64 pre, u64 post)
++{
++ struct extent_map_tree *em_tree = &inode->extent_tree;
++ struct extent_map *em;
++ struct extent_map *split_pre = NULL;
++ struct extent_map *split_mid = NULL;
++ struct extent_map *split_post = NULL;
++ int ret = 0;
++ int modified;
++ unsigned long flags;
++
++ /* Sanity check */
++ if (pre == 0 && post == 0)
++ return 0;
++
++ split_pre = alloc_extent_map();
++ if (pre)
++ split_mid = alloc_extent_map();
++ if (post)
++ split_post = alloc_extent_map();
++ if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
++ ret = -ENOMEM;
++ goto out;
++ }
++
++ ASSERT(pre + post < len);
++
++ lock_extent(&inode->io_tree, start, start + len - 1);
++ write_lock(&em_tree->lock);
++ em = lookup_extent_mapping(em_tree, start, len);
++ if (!em) {
++ ret = -EIO;
++ goto out_unlock;
++ }
++
++ ASSERT(em->len == len);
++ ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
++ ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
++
++ flags = em->flags;
++ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
++ clear_bit(EXTENT_FLAG_LOGGING, &flags);
++ modified = !list_empty(&em->list);
++
++ /* First, replace the em with a new extent_map starting from * em->start */
++ split_pre->start = em->start;
++ split_pre->len = (pre ? pre : em->len - post);
++ split_pre->orig_start = split_pre->start;
++ split_pre->block_start = em->block_start;
++ split_pre->block_len = split_pre->len;
++ split_pre->orig_block_len = split_pre->block_len;
++ split_pre->ram_bytes = split_pre->len;
++ split_pre->flags = flags;
++ split_pre->compress_type = em->compress_type;
++ split_pre->generation = em->generation;
++
++ replace_extent_mapping(em_tree, em, split_pre, modified);
++
++ /*
++ * Now we only have an extent_map at:
++ * [em->start, em->start + pre] if pre != 0
++ * [em->start, em->start + em->len - post] if pre == 0
++ */
++
++ if (pre) {
++ /* Insert the middle extent_map */
++ split_mid->start = em->start + pre;
++ split_mid->len = em->len - pre - post;
++ split_mid->orig_start = split_mid->start;
++ split_mid->block_start = em->block_start + pre;
++ split_mid->block_len = split_mid->len;
++ split_mid->orig_block_len = split_mid->block_len;
++ split_mid->ram_bytes = split_mid->len;
++ split_mid->flags = flags;
++ split_mid->compress_type = em->compress_type;
++ split_mid->generation = em->generation;
++ add_extent_mapping(em_tree, split_mid, modified);
++ }
++
++ if (post) {
++ split_post->start = em->start + em->len - post;
++ split_post->len = post;
++ split_post->orig_start = split_post->start;
++ split_post->block_start = em->block_start + em->len - post;
++ split_post->block_len = split_post->len;
++ split_post->orig_block_len = split_post->block_len;
++ split_post->ram_bytes = split_post->len;
++ split_post->flags = flags;
++ split_post->compress_type = em->compress_type;
++ split_post->generation = em->generation;
++ add_extent_mapping(em_tree, split_post, modified);
++ }
++
++ /* Once for us */
++ free_extent_map(em);
++ /* Once for the tree */
++ free_extent_map(em);
++
++out_unlock:
++ write_unlock(&em_tree->lock);
++ unlock_extent(&inode->io_tree, start, start + len - 1);
++out:
++ free_extent_map(split_pre);
++ free_extent_map(split_mid);
++ free_extent_map(split_post);
++
++ return ret;
++}
++
+ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
+ struct bio *bio, loff_t file_offset)
+ {
+ struct btrfs_ordered_extent *ordered;
+- struct extent_map *em = NULL, *em_new = NULL;
+- struct extent_map_tree *em_tree = &inode->extent_tree;
+ u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
++ u64 file_len;
+ u64 len = bio->bi_iter.bi_size;
+ u64 end = start + len;
+ u64 ordered_end;
+@@ -2317,41 +2431,16 @@ static blk_status_t extract_ordered_exte
+ goto out;
+ }
+
++ file_len = ordered->num_bytes;
+ pre = start - ordered->disk_bytenr;
+ post = ordered_end - end;
+
+ ret = btrfs_split_ordered_extent(ordered, pre, post);
+ if (ret)
+ goto out;
+-
+- read_lock(&em_tree->lock);
+- em = lookup_extent_mapping(em_tree, ordered->file_offset, len);
+- if (!em) {
+- read_unlock(&em_tree->lock);
+- ret = -EIO;
+- goto out;
+- }
+- read_unlock(&em_tree->lock);
+-
+- ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+- /*
+- * We cannot reuse em_new here but have to create a new one, as
+- * unpin_extent_cache() expects the start of the extent map to be the
+- * logical offset of the file, which does not hold true anymore after
+- * splitting.
+- */
+- em_new = create_io_em(inode, em->start + pre, len,
+- em->start + pre, em->block_start + pre, len,
+- len, len, BTRFS_COMPRESS_NONE,
+- BTRFS_ORDERED_REGULAR);
+- if (IS_ERR(em_new)) {
+- ret = PTR_ERR(em_new);
+- goto out;
+- }
+- free_extent_map(em_new);
++ ret = split_zoned_em(inode, file_offset, file_len, pre, post);
+
+ out:
+- free_extent_map(em);
+ btrfs_put_ordered_extent(ordered);
+
+ return errno_to_blk_status(ret);
--- /dev/null
+From 79bd37120b149532af5b21953643ed74af69654f Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 29 Jun 2021 14:43:06 +0100
+Subject: btrfs: rework chunk allocation to avoid exhaustion of the system chunk array
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 79bd37120b149532af5b21953643ed74af69654f upstream.
+
+Commit eafa4fd0ad0607 ("btrfs: fix exhaustion of the system chunk array
+due to concurrent allocations") fixed a problem that resulted in
+exhausting the system chunk array in the superblock when there are many
+tasks allocating chunks in parallel. Basically too many tasks enter the
+first phase of chunk allocation without previous tasks having finished
+their second phase of allocation, resulting in too many system chunks
+being allocated. That was originally observed when running the fallocate
+tests of stress-ng on a PowerPC machine, using a node size of 64K.
+
+However that commit also introduced a deadlock where a task in phase 1 of
+the chunk allocation waited for another task that had allocated a system
+chunk to finish its phase 2, but that other task was waiting on an extent
+buffer lock held by the first task, therefore resulting in both tasks not
+making any progress. That change was later reverted by a patch with the
+subject "btrfs: fix deadlock with concurrent chunk allocations involving
+system chunks", since there is no simple and short solution to address it
+and the deadlock is relatively easy to trigger on zoned filesystems, while
+the system chunk array exhaustion is not so common.
+
+This change reworks the chunk allocation to avoid the system chunk array
+exhaustion. It accomplishes that by making the first phase of chunk
+allocation do the updates of the device items in the chunk btree and the
+insertion of the new chunk item in the chunk btree. This is done while
+under the protection of the chunk mutex (fs_info->chunk_mutex), in the
+same critical section that checks for available system space, allocates
+a new system chunk if needed and reserves system chunk space. This way
+we no longer need to keep system chunk space reserved until the second
+phase completes.
+
+The same logic is applied to chunk removal as well, since it keeps
+reserved system space long after it is done updating the chunk btree.
+
+For direct allocation of system chunks, the previous behaviour remains,
+because otherwise we would deadlock on extent buffers of the chunk btree.
+Changes to the chunk btree are by and large done by chunk allocation and chunk
+removal, which first reserve chunk system space and then later do changes
+to the chunk btree. The other remaining cases are uncommon and correspond
+to adding a device, removing a device and resizing a device. All these
+other cases do not pre-reserve system space, they modify the chunk btree
+right away, so they don't hold reserved space for a long period like chunk
+allocation and chunk removal do.
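+
+A toy outline of the resulting two-phase flow (illustrative stubs only;
+the real functions live in fs/btrfs/ and operate on a transaction handle):
+
+#include <stdio.h>
+
+#define TOY_SYSTEM 0x1	/* stands in for BTRFS_BLOCK_GROUP_SYSTEM */
+
+/* Phase 1, all under fs_info->chunk_mutex. System chunks skip the chunk
+ * btree update here and defer it to phase 2, to avoid deadlocking on a
+ * chunk btree extent buffer the caller may already hold locked. */
+static void phase1(int flags)
+{
+	printf("phase 1 (chunk_mutex held):\n");
+	printf("  check_system_chunk(): reserve system space\n");
+	printf("  btrfs_alloc_chunk(): device extents, mapping, block group\n");
+	if (!(flags & TOY_SYSTEM))
+		printf("  btrfs_chunk_alloc_add_chunk_item(): device + chunk items\n");
+}
+
+/* Phase 2: btrfs_create_pending_block_groups() adds the block group item
+ * and device extent items, plus the deferred chunk item for system chunks. */
+static void phase2(int flags)
+{
+	printf("phase 2:\n  block group item + device extent items%s\n",
+	       (flags & TOY_SYSTEM) ? " + deferred chunk item" : "");
+}
+
+int main(void)
+{
+	phase1(0);		/* a data or metadata chunk */
+	phase2(0);
+	phase1(TOY_SYSTEM);	/* a system chunk */
+	phase2(TOY_SYSTEM);
+	return 0;
+}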
+
+The diff of this change is huge, but more than half of it is just addition
+of comments describing both how things work regarding chunk allocation and
+removal, including both the new behavior and the parts of the old behavior
+that did not change.
+
+CC: stable@vger.kernel.org # 5.12+
+Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
+Tested-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Tested-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/block-group.c | 285 ++++++++++++++++++++++++++++++++++-----
+ fs/btrfs/block-group.h | 6
+ fs/btrfs/ctree.c | 67 +--------
+ fs/btrfs/transaction.c | 10 -
+ fs/btrfs/transaction.h | 2
+ fs/btrfs/volumes.c | 355 +++++++++++++++++++++++++++++++++++++------------
+ fs/btrfs/volumes.h | 5
+ 7 files changed, 546 insertions(+), 184 deletions(-)
+
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -2101,6 +2101,13 @@ error:
+ return ret;
+ }
+
++/*
++ * This function, insert_block_group_item(), belongs to the phase 2 of chunk
++ * allocation.
++ *
++ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
++ * phases.
++ */
+ static int insert_block_group_item(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group)
+ {
+@@ -2123,15 +2130,19 @@ static int insert_block_group_item(struc
+ return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
+ }
+
++/*
++ * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
++ * chunk allocation.
++ *
++ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
++ * phases.
++ */
+ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
+ {
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group *block_group;
+ int ret = 0;
+
+- if (!trans->can_flush_pending_bgs)
+- return;
+-
+ while (!list_empty(&trans->new_bgs)) {
+ int index;
+
+@@ -2146,6 +2157,13 @@ void btrfs_create_pending_block_groups(s
+ ret = insert_block_group_item(trans, block_group);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
++ if (!block_group->chunk_item_inserted) {
++ mutex_lock(&fs_info->chunk_mutex);
++ ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
++ mutex_unlock(&fs_info->chunk_mutex);
++ if (ret)
++ btrfs_abort_transaction(trans, ret);
++ }
+ ret = btrfs_finish_chunk_alloc(trans, block_group->start,
+ block_group->length);
+ if (ret)
+@@ -2169,8 +2187,9 @@ next:
+ btrfs_trans_release_chunk_metadata(trans);
+ }
+
+-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
+- u64 type, u64 chunk_offset, u64 size)
++struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
++ u64 bytes_used, u64 type,
++ u64 chunk_offset, u64 size)
+ {
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group *cache;
+@@ -2180,7 +2199,7 @@ int btrfs_make_block_group(struct btrfs_
+
+ cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
+ if (!cache)
+- return -ENOMEM;
++ return ERR_PTR(-ENOMEM);
+
+ cache->length = size;
+ set_free_space_tree_thresholds(cache);
+@@ -2194,7 +2213,7 @@ int btrfs_make_block_group(struct btrfs_
+ ret = btrfs_load_block_group_zone_info(cache, true);
+ if (ret) {
+ btrfs_put_block_group(cache);
+- return ret;
++ return ERR_PTR(ret);
+ }
+
+ ret = exclude_super_stripes(cache);
+@@ -2202,7 +2221,7 @@ int btrfs_make_block_group(struct btrfs_
+ /* We may have excluded something, so call this just in case */
+ btrfs_free_excluded_extents(cache);
+ btrfs_put_block_group(cache);
+- return ret;
++ return ERR_PTR(ret);
+ }
+
+ add_new_free_space(cache, chunk_offset, chunk_offset + size);
+@@ -2229,7 +2248,7 @@ int btrfs_make_block_group(struct btrfs_
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ btrfs_put_block_group(cache);
+- return ret;
++ return ERR_PTR(ret);
+ }
+
+ /*
+@@ -2248,7 +2267,7 @@ int btrfs_make_block_group(struct btrfs_
+ btrfs_update_delayed_refs_rsv(trans);
+
+ set_avail_alloc_bits(fs_info, type);
+- return 0;
++ return cache;
+ }
+
+ /*
+@@ -3124,11 +3143,203 @@ int btrfs_force_chunk_alloc(struct btrfs
+ return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+ }
+
++static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
++{
++ struct btrfs_block_group *bg;
++ int ret;
++
++ /*
++ * Check if we have enough space in the system space info because we
++ * will need to update device items in the chunk btree and insert a new
++ * chunk item in the chunk btree as well. This will allocate a new
++ * system block group if needed.
++ */
++ check_system_chunk(trans, flags);
++
++ bg = btrfs_alloc_chunk(trans, flags);
++ if (IS_ERR(bg)) {
++ ret = PTR_ERR(bg);
++ goto out;
++ }
++
++ /*
++ * If this is a system chunk allocation then stop right here and do not
++ * add the chunk item to the chunk btree. This is to prevent a deadlock
++ * because this system chunk allocation can be triggered while COWing
++ * some extent buffer of the chunk btree and while holding a lock on a
++ * parent extent buffer, in which case attempting to insert the chunk
++ * item (or update the device item) would result in a deadlock on that
++ * parent extent buffer. In this case defer the chunk btree updates to
++ * the second phase of chunk allocation and keep our reservation until
++ * the second phase completes.
++ *
++ * This is a rare case and can only be triggered by the very few cases
++ * we have where we need to touch the chunk btree outside chunk allocation
++ * and chunk removal. These cases are basically adding a device, removing
++ * a device or resizing a device.
++ */
++ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
++ return 0;
++
++ ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
++ /*
++ * Normally we are not expected to fail with -ENOSPC here, since we have
++ * previously reserved space in the system space_info and allocated one
++ * new system chunk if necessary. However there are two exceptions:
++ *
++ * 1) We may have enough free space in the system space_info but all the
++ * existing system block groups have a profile which can not be used
++ * for extent allocation.
++ *
++ * This happens when mounting in degraded mode. For example we have a
++ * RAID1 filesystem with 2 devices, lose one device and mount the fs
++ * using the other device in degraded mode. If we then allocate a chunk,
++ * we may have enough free space in the existing system space_info, but
++ * none of the block groups can be used for extent allocation since they
++ * have a RAID1 profile, and because we are in degraded mode with a
++ * single device, we are forced to allocate a new system chunk with a
++ * SINGLE profile. Making check_system_chunk() iterate over all system
++ * block groups and check if they have a usable profile and enough space
++ * can be slow on very large filesystems, so we tolerate the -ENOSPC and
++ * try again after forcing allocation of a new system chunk. Like this
++ * we avoid paying the cost of that search in normal circumstances, when
++ * we were not mounted in degraded mode;
++ *
++ * 2) We had enough free space info the system space_info, and one suitable
++ * block group to allocate from when we called check_system_chunk()
++ * above. However right after we called it, the only system block group
++ * with enough free space got turned into RO mode by a running scrub,
++ * and in this case we have to allocate a new one and retry. We only
++ * need do this allocate and retry once, since we have a transaction
++ * handle and scrub uses the commit root to search for block groups.
++ */
++ if (ret == -ENOSPC) {
++ const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
++ struct btrfs_block_group *sys_bg;
++
++ sys_bg = btrfs_alloc_chunk(trans, sys_flags);
++ if (IS_ERR(sys_bg)) {
++ ret = PTR_ERR(sys_bg);
++ btrfs_abort_transaction(trans, ret);
++ goto out;
++ }
++
++ ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
++ if (ret) {
++ btrfs_abort_transaction(trans, ret);
++ goto out;
++ }
++
++ ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
++ if (ret) {
++ btrfs_abort_transaction(trans, ret);
++ goto out;
++ }
++ } else if (ret) {
++ btrfs_abort_transaction(trans, ret);
++ goto out;
++ }
++out:
++ btrfs_trans_release_chunk_metadata(trans);
++
++ return ret;
++}
++
+ /*
+- * If force is CHUNK_ALLOC_FORCE:
++ * Chunk allocation is done in 2 phases:
++ *
++ * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
++ * the chunk, the chunk mapping, create its block group and add the items
++ * that belong in the chunk btree to it - more specifically, we need to
++ * update device items in the chunk btree and add a new chunk item to it.
++ *
++ * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
++ * group item to the extent btree and the device extent items to the devices
++ * btree.
++ *
++ * This is done to prevent deadlocks. For example when COWing a node from the
++ * extent btree we are holding a write lock on the node's parent and if we
++ * trigger chunk allocation and attempted to insert the new block group item
++ * in the extent btree right way, we could deadlock because the path for the
++ * insertion can include that parent node. At first glance it seems impossible
++ * to trigger chunk allocation after starting a transaction since tasks should
++ * reserve enough transaction units (metadata space), however while that is true
++ * most of the time, chunk allocation may still be triggered for several reasons:
++ *
++ * 1) When reserving metadata, we check if there is enough free space in the
++ * metadata space_info and therefore don't trigger allocation of a new chunk.
++ * However later when the task actually tries to COW an extent buffer from
++ * the extent btree or from the device btree for example, it is forced to
++ * allocate a new block group (chunk) because the only one that had enough
++ * free space was just turned to RO mode by a running scrub for example (or
++ * device replace, block group reclaim thread, etc), so we can not use it
++ * for allocating an extent and end up being forced to allocate a new one;
++ *
++ * 2) Because we only check that the metadata space_info has enough free bytes,
++ * we end up not allocating a new metadata chunk in that case. However if
++ * the filesystem was mounted in degraded mode, none of the existing block
++ * groups might be suitable for extent allocation due to their incompatible
++ * profile (for e.g. mounting a 2 devices filesystem, where all block groups
++ * use a RAID1 profile, in degraded mode using a single device). In this case
++ * when the task attempts to COW some extent buffer of the extent btree for
++ * example, it will trigger allocation of a new metadata block group with a
++ * suitable profile (SINGLE profile in the example of the degraded mount of
++ * the RAID1 filesystem);
++ *
++ * 3) The task has reserved enough transaction units / metadata space, but when
++ * it attempts to COW an extent buffer from the extent or device btree for
++ * example, it does not find any free extent in any metadata block group,
++ * therefore forced to try to allocate a new metadata block group.
++ * This is because some other task allocated all available extents in the
++ * meanwhile - this typically happens with tasks that don't reserve space
++ * properly, either intentionally or as a bug. One example where this is
++ * done intentionally is fsync, as it does not reserve any transaction units
++ * and ends up allocating a variable number of metadata extents for log
++ * tree extent buffers.
++ *
++ * We also need this 2 phases setup when adding a device to a filesystem with
++ * a seed device - we must create new metadata and system chunks without adding
++ * any of the block group items to the chunk, extent and device btrees. If we
++ * did not do it this way, we would get ENOSPC when attempting to update those
++ * btrees, since all the chunks from the seed device are read-only.
++ *
++ * Phase 1 does the updates and insertions to the chunk btree because if we had
++ * it done in phase 2 and have a thundering herd of tasks allocating chunks in
++ * parallel, we risk having too many system chunks allocated by many tasks if
++ * many tasks reach phase 1 without the previous ones completing phase 2. In the
++ * extreme case this leads to exhaustion of the system chunk array in the
++ * superblock. This is easier to trigger if using a btree node/leaf size of 64K
++ * and with RAID filesystems (so we have more device items in the chunk btree).
++ * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
++ * the system chunk array due to concurrent allocations") provides more details.
++ *
++ * For allocation of system chunks, we defer the updates and insertions into the
++ * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because
++ * if the chunk allocation is triggered while COWing an extent buffer of the
++ * chunk btree, we are holding a lock on the parent of that extent buffer and
++ * doing the chunk btree updates and insertions can require locking that parent.
++ * This is for the very few and rare cases where we update the chunk btree that
++ * are not chunk allocation or chunk removal: adding a device, removing a device
++ * or resizing a device.
++ *
++ * The reservation of system space, done through check_system_chunk(), as well
++ * as all the updates and insertions into the chunk btree must be done while
++ * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
++ * an extent buffer from the chunks btree we never trigger allocation of a new
++ * system chunk, which would result in a deadlock (trying to lock twice an
++ * extent buffer of the chunk btree, first time before triggering the chunk
++ * allocation and the second time during chunk allocation while attempting to
++ * update the chunks btree). The system chunk array is also updated while holding
++ * that mutex. The same logic applies to removing chunks - we must reserve system
++ * space, update the chunk btree and the system chunk array in the superblock
++ * while holding fs_info->chunk_mutex.
++ *
++ * This function, btrfs_chunk_alloc(), belongs to phase 1.
++ *
++ * If @force is CHUNK_ALLOC_FORCE:
+ * - return 1 if it successfully allocates a chunk,
+ * - return errors including -ENOSPC otherwise.
+- * If force is NOT CHUNK_ALLOC_FORCE:
++ * If @force is NOT CHUNK_ALLOC_FORCE:
+ * - return 0 if it doesn't need to allocate a new chunk,
+ * - return 1 if it successfully allocates a chunk,
+ * - return errors including -ENOSPC otherwise.
+@@ -3145,6 +3356,13 @@ int btrfs_chunk_alloc(struct btrfs_trans
+ /* Don't re-enter if we're already allocating a chunk */
+ if (trans->allocating_chunk)
+ return -ENOSPC;
++ /*
++ * If we are removing a chunk, don't re-enter or we would deadlock.
++ * System space reservation and system chunk allocation is done by the
++ * chunk remove operation (btrfs_remove_chunk()).
++ */
++ if (trans->removing_chunk)
++ return -ENOSPC;
+
+ space_info = btrfs_find_space_info(fs_info, flags);
+ ASSERT(space_info);
+@@ -3208,13 +3426,7 @@ int btrfs_chunk_alloc(struct btrfs_trans
+ force_metadata_allocation(fs_info);
+ }
+
+- /*
+- * Check if we have enough space in SYSTEM chunk because we may need
+- * to update devices.
+- */
+- check_system_chunk(trans, flags);
+-
+- ret = btrfs_alloc_chunk(trans, flags);
++ ret = do_chunk_alloc(trans, flags);
+ trans->allocating_chunk = false;
+
+ spin_lock(&space_info->lock);
+@@ -3233,22 +3445,6 @@ out:
+ space_info->chunk_alloc = 0;
+ spin_unlock(&space_info->lock);
+ mutex_unlock(&fs_info->chunk_mutex);
+- /*
+- * When we allocate a new chunk we reserve space in the chunk block
+- * reserve to make sure we can COW nodes/leafs in the chunk tree or
+- * add new nodes/leafs to it if we end up needing to do it when
+- * inserting the chunk item and updating device items as part of the
+- * second phase of chunk allocation, performed by
+- * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
+- * large number of new block groups to create in our transaction
+- * handle's new_bgs list to avoid exhausting the chunk block reserve
+- * in extreme cases - like having a single transaction create many new
+- * block groups when starting to write out the free space caches of all
+- * the block groups that were made dirty during the lifetime of the
+- * transaction.
+- */
+- if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
+- btrfs_create_pending_block_groups(trans);
+
+ return ret;
+ }
+@@ -3301,14 +3497,31 @@ void check_system_chunk(struct btrfs_tra
+
+ if (left < thresh) {
+ u64 flags = btrfs_system_alloc_profile(fs_info);
++ struct btrfs_block_group *bg;
+
+ /*
+ * Ignore failure to create system chunk. We might end up not
+ * needing it, as we might not need to COW all nodes/leafs from
+ * the paths we visit in the chunk tree (they were already COWed
+ * or created in the current transaction for example).
++ *
++ * Also, if our caller is allocating a system chunk, do not
++ * attempt to insert the chunk item in the chunk btree, as we
++ * could deadlock on an extent buffer since our caller may be
++ * COWing an extent buffer from the chunk btree.
+ */
+- ret = btrfs_alloc_chunk(trans, flags);
++ bg = btrfs_alloc_chunk(trans, flags);
++ if (IS_ERR(bg)) {
++ ret = PTR_ERR(bg);
++ } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
++ /*
++ * If we fail to add the chunk item here, we end up
++ * trying again at phase 2 of chunk allocation, at
++ * btrfs_create_pending_block_groups(). So ignore
++ * any error here.
++ */
++ btrfs_chunk_alloc_add_chunk_item(trans, bg);
++ }
+ }
+
+ if (!ret) {
+--- a/fs/btrfs/block-group.h
++++ b/fs/btrfs/block-group.h
+@@ -97,6 +97,7 @@ struct btrfs_block_group {
+ unsigned int removed:1;
+ unsigned int to_copy:1;
+ unsigned int relocating_repair:1;
++ unsigned int chunk_item_inserted:1;
+
+ int disk_cache_state;
+
+@@ -265,8 +266,9 @@ int btrfs_remove_block_group(struct btrf
+ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
+ void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
+ int btrfs_read_block_groups(struct btrfs_fs_info *info);
+-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
+- u64 type, u64 chunk_offset, u64 size);
++struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
++ u64 bytes_used, u64 type,
++ u64 chunk_offset, u64 size);
+ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
+ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
+ bool do_chunk_alloc);
+--- a/fs/btrfs/ctree.c
++++ b/fs/btrfs/ctree.c
+@@ -954,49 +954,6 @@ static noinline int update_ref_for_cow(s
+ return 0;
+ }
+
+-static struct extent_buffer *alloc_tree_block_no_bg_flush(
+- struct btrfs_trans_handle *trans,
+- struct btrfs_root *root,
+- u64 parent_start,
+- const struct btrfs_disk_key *disk_key,
+- int level,
+- u64 hint,
+- u64 empty_size,
+- enum btrfs_lock_nesting nest)
+-{
+- struct btrfs_fs_info *fs_info = root->fs_info;
+- struct extent_buffer *ret;
+-
+- /*
+- * If we are COWing a node/leaf from the extent, chunk, device or free
+- * space trees, make sure that we do not finish block group creation of
+- * pending block groups. We do this to avoid a deadlock.
+- * COWing can result in allocation of a new chunk, and flushing pending
+- * block groups (btrfs_create_pending_block_groups()) can be triggered
+- * when finishing allocation of a new chunk. Creation of a pending block
+- * group modifies the extent, chunk, device and free space trees,
+- * therefore we could deadlock with ourselves since we are holding a
+- * lock on an extent buffer that btrfs_create_pending_block_groups() may
+- * try to COW later.
+- * For similar reasons, we also need to delay flushing pending block
+- * groups when splitting a leaf or node, from one of those trees, since
+- * we are holding a write lock on it and its parent or when inserting a
+- * new root node for one of those trees.
+- */
+- if (root == fs_info->extent_root ||
+- root == fs_info->chunk_root ||
+- root == fs_info->dev_root ||
+- root == fs_info->free_space_root)
+- trans->can_flush_pending_bgs = false;
+-
+- ret = btrfs_alloc_tree_block(trans, root, parent_start,
+- root->root_key.objectid, disk_key, level,
+- hint, empty_size, nest);
+- trans->can_flush_pending_bgs = true;
+-
+- return ret;
+-}
+-
+ /*
+ * does the dirty work in cow of a single block. The parent block (if
+ * supplied) is updated to point to the new cow copy. The new buffer is marked
+@@ -1045,8 +1002,9 @@ static noinline int __btrfs_cow_block(st
+ if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
+ parent_start = parent->start;
+
+- cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
+- level, search_start, empty_size, nest);
++ cow = btrfs_alloc_tree_block(trans, root, parent_start,
++ root->root_key.objectid, &disk_key, level,
++ search_start, empty_size, nest);
+ if (IS_ERR(cow))
+ return PTR_ERR(cow);
+
+@@ -3340,9 +3298,9 @@ static noinline int insert_new_root(stru
+ else
+ btrfs_node_key(lower, &lower_key, 0);
+
+- c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
+- root->node->start, 0,
+- BTRFS_NESTING_NEW_ROOT);
++ c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
++ &lower_key, level, root->node->start, 0,
++ BTRFS_NESTING_NEW_ROOT);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+
+@@ -3471,8 +3429,9 @@ static noinline int split_node(struct bt
+ mid = (c_nritems + 1) / 2;
+ btrfs_node_key(c, &disk_key, mid);
+
+- split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
+- c->start, 0, BTRFS_NESTING_SPLIT);
++ split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
++ &disk_key, level, c->start, 0,
++ BTRFS_NESTING_SPLIT);
+ if (IS_ERR(split))
+ return PTR_ERR(split);
+
+@@ -4263,10 +4222,10 @@ again:
+ * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
+ * use BTRFS_NESTING_NEW_ROOT.
+ */
+- right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
+- l->start, 0, num_doubles ?
+- BTRFS_NESTING_NEW_ROOT :
+- BTRFS_NESTING_SPLIT);
++ right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
++ &disk_key, 0, l->start, 0,
++ num_doubles ? BTRFS_NESTING_NEW_ROOT :
++ BTRFS_NESTING_SPLIT);
+ if (IS_ERR(right))
+ return PTR_ERR(right);
+
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -254,8 +254,11 @@ static inline int extwriter_counter_read
+ }
+
+ /*
+- * To be called after all the new block groups attached to the transaction
+- * handle have been created (btrfs_create_pending_block_groups()).
++ * To be called after doing the chunk btree updates right after allocating a new
++ * chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a
++ * chunk after all chunk btree updates and after finishing the second phase of
++ * chunk allocation (btrfs_create_pending_block_groups()) in case some block
++ * group had its chunk item insertion delayed to the second phase.
+ */
+ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
+ {
+@@ -264,8 +267,6 @@ void btrfs_trans_release_chunk_metadata(
+ if (!trans->chunk_bytes_reserved)
+ return;
+
+- WARN_ON_ONCE(!list_empty(&trans->new_bgs));
+-
+ btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
+ trans->chunk_bytes_reserved, NULL);
+ trans->chunk_bytes_reserved = 0;
+@@ -697,7 +698,6 @@ again:
+ h->fs_info = root->fs_info;
+
+ h->type = type;
+- h->can_flush_pending_bgs = true;
+ INIT_LIST_HEAD(&h->new_bgs);
+
+ smp_mb();
+--- a/fs/btrfs/transaction.h
++++ b/fs/btrfs/transaction.h
+@@ -134,7 +134,7 @@ struct btrfs_trans_handle {
+ short aborted;
+ bool adding_csums;
+ bool allocating_chunk;
+- bool can_flush_pending_bgs;
++ bool removing_chunk;
+ bool reloc_reserved;
+ bool in_fsync;
+ struct btrfs_root *root;
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -1744,19 +1744,14 @@ again:
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_dev_extent);
+ } else {
+- btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
+ goto out;
+ }
+
+ *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
+
+ ret = btrfs_del_item(trans, root, path);
+- if (ret) {
+- btrfs_handle_fs_error(fs_info, ret,
+- "Failed to remove dev extent item");
+- } else {
++ if (ret == 0)
+ set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
+- }
+ out:
+ btrfs_free_path(path);
+ return ret;
+@@ -2941,7 +2936,7 @@ static int btrfs_del_sys_chunk(struct bt
+ u32 cur;
+ struct btrfs_key key;
+
+- mutex_lock(&fs_info->chunk_mutex);
++ lockdep_assert_held(&fs_info->chunk_mutex);
+ array_size = btrfs_super_sys_array_size(super_copy);
+
+ ptr = super_copy->sys_chunk_array;
+@@ -2971,7 +2966,6 @@ static int btrfs_del_sys_chunk(struct bt
+ cur += len;
+ }
+ }
+- mutex_unlock(&fs_info->chunk_mutex);
+ return ret;
+ }
+
+@@ -3011,6 +3005,29 @@ struct extent_map *btrfs_get_chunk_map(s
+ return em;
+ }
+
++static int remove_chunk_item(struct btrfs_trans_handle *trans,
++ struct map_lookup *map, u64 chunk_offset)
++{
++ int i;
++
++ /*
++ * Removing chunk items and updating the device items in the chunks btree
++ * requires holding the chunk_mutex.
++ * See the comment at btrfs_chunk_alloc() for the details.
++ */
++ lockdep_assert_held(&trans->fs_info->chunk_mutex);
++
++ for (i = 0; i < map->num_stripes; i++) {
++ int ret;
++
++ ret = btrfs_update_device(trans, map->stripes[i].dev);
++ if (ret)
++ return ret;
++ }
++
++ return btrfs_free_chunk(trans, chunk_offset);
++}
++
+ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
+ {
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+@@ -3031,14 +3048,16 @@ int btrfs_remove_chunk(struct btrfs_tran
+ return PTR_ERR(em);
+ }
+ map = em->map_lookup;
+- mutex_lock(&fs_info->chunk_mutex);
+- check_system_chunk(trans, map->type);
+- mutex_unlock(&fs_info->chunk_mutex);
+
+ /*
+- * Take the device list mutex to prevent races with the final phase of
+- * a device replace operation that replaces the device object associated
+- * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
++ * First delete the device extent items from the devices btree.
++ * We take the device_list_mutex to avoid racing with the finishing phase
++ * of a device replace operation. See the comment below before acquiring
++ * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
++ * because that can result in a deadlock when deleting the device extent
++ * items from the devices btree - COWing an extent buffer from the btree
++ * may result in allocating a new metadata chunk, which would attempt to
++ * lock again fs_info->chunk_mutex.
+ */
+ mutex_lock(&fs_devices->device_list_mutex);
+ for (i = 0; i < map->num_stripes; i++) {
+@@ -3060,18 +3079,73 @@ int btrfs_remove_chunk(struct btrfs_tran
+ btrfs_clear_space_info_full(fs_info);
+ mutex_unlock(&fs_info->chunk_mutex);
+ }
++ }
++ mutex_unlock(&fs_devices->device_list_mutex);
+
+- ret = btrfs_update_device(trans, device);
++ /*
++ * We acquire fs_info->chunk_mutex for 2 reasons:
++ *
++ * 1) Just like with the first phase of the chunk allocation, we must
++ * reserve system space, do all chunk btree updates and deletions, and
++ * update the system chunk array in the superblock while holding this
++ * mutex. This is for similar reasons as explained on the comment at
++ * the top of btrfs_chunk_alloc();
++ *
++ * 2) Prevent races with the final phase of a device replace operation
++ * that replaces the device object associated with the map's stripes,
++ * because the device object's id can change at any time during that
++ * final phase of the device replace operation
++ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
++ * replaced device and then see it with an ID of
++ * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
++ * the device item, which does not exists on the chunk btree.
++ * The finishing phase of device replace acquires both the
++ * device_list_mutex and the chunk_mutex, in that order, so we are
++ * safe by just acquiring the chunk_mutex.
++ */
++ trans->removing_chunk = true;
++ mutex_lock(&fs_info->chunk_mutex);
++
++ check_system_chunk(trans, map->type);
++
++ ret = remove_chunk_item(trans, map, chunk_offset);
++ /*
++ * Normally we should not get -ENOSPC since we reserved space before
++ * through the call to check_system_chunk().
++ *
++ * Despite our system space_info having enough free space, we may not
++ * be able to allocate extents from its block groups, because all have
++ * an incompatible profile, which will force us to allocate a new system
++ * block group with the right profile, or right after we called
++ * check_system_space() above, a scrub turned the only system block group
++ * with enough free space into RO mode.
++ * This is explained with more detail at do_chunk_alloc().
++ *
++ * So if we get -ENOSPC, allocate a new system chunk and retry once.
++ */
++ if (ret == -ENOSPC) {
++ const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
++ struct btrfs_block_group *sys_bg;
++
++ sys_bg = btrfs_alloc_chunk(trans, sys_flags);
++ if (IS_ERR(sys_bg)) {
++ ret = PTR_ERR(sys_bg);
++ btrfs_abort_transaction(trans, ret);
++ goto out;
++ }
++
++ ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
+ if (ret) {
+- mutex_unlock(&fs_devices->device_list_mutex);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+- }
+- mutex_unlock(&fs_devices->device_list_mutex);
+
+- ret = btrfs_free_chunk(trans, chunk_offset);
+- if (ret) {
++ ret = remove_chunk_item(trans, map, chunk_offset);
++ if (ret) {
++ btrfs_abort_transaction(trans, ret);
++ goto out;
++ }
++ } else if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+@@ -3086,6 +3160,15 @@ int btrfs_remove_chunk(struct btrfs_tran
+ }
+ }
+
++ mutex_unlock(&fs_info->chunk_mutex);
++ trans->removing_chunk = false;
++
++ /*
++ * We are done with chunk btree updates and deletions, so release the
++ * system space we previously reserved (with check_system_chunk()).
++ */
++ btrfs_trans_release_chunk_metadata(trans);
++
+ ret = btrfs_remove_block_group(trans, chunk_offset, em);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+@@ -3093,6 +3176,10 @@ int btrfs_remove_chunk(struct btrfs_tran
+ }
+
+ out:
++ if (trans->removing_chunk) {
++ mutex_unlock(&fs_info->chunk_mutex);
++ trans->removing_chunk = false;
++ }
+ /* once for us */
+ free_extent_map(em);
+ return ret;
+@@ -4851,13 +4938,12 @@ static int btrfs_add_system_chunk(struct
+ u32 array_size;
+ u8 *ptr;
+
+- mutex_lock(&fs_info->chunk_mutex);
++ lockdep_assert_held(&fs_info->chunk_mutex);
++
+ array_size = btrfs_super_sys_array_size(super_copy);
+ if (array_size + item_size + sizeof(disk_key)
+- > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+- mutex_unlock(&fs_info->chunk_mutex);
++ > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
+ return -EFBIG;
+- }
+
+ ptr = super_copy->sys_chunk_array + array_size;
+ btrfs_cpu_key_to_disk(&disk_key, key);
+@@ -4866,7 +4952,6 @@ static int btrfs_add_system_chunk(struct
+ memcpy(ptr, chunk, item_size);
+ item_size += sizeof(disk_key);
+ btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
+- mutex_unlock(&fs_info->chunk_mutex);
+
+ return 0;
+ }
+@@ -5216,13 +5301,14 @@ static int decide_stripe_size(struct btr
+ }
+ }
+
+-static int create_chunk(struct btrfs_trans_handle *trans,
++static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
+ struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+ {
+ struct btrfs_fs_info *info = trans->fs_info;
+ struct map_lookup *map = NULL;
+ struct extent_map_tree *em_tree;
++ struct btrfs_block_group *block_group;
+ struct extent_map *em;
+ u64 start = ctl->start;
+ u64 type = ctl->type;
+@@ -5232,7 +5318,7 @@ static int create_chunk(struct btrfs_tra
+
+ map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
+ if (!map)
+- return -ENOMEM;
++ return ERR_PTR(-ENOMEM);
+ map->num_stripes = ctl->num_stripes;
+
+ for (i = 0; i < ctl->ndevs; ++i) {
+@@ -5254,7 +5340,7 @@ static int create_chunk(struct btrfs_tra
+ em = alloc_extent_map();
+ if (!em) {
+ kfree(map);
+- return -ENOMEM;
++ return ERR_PTR(-ENOMEM);
+ }
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
+ em->map_lookup = map;
+@@ -5270,12 +5356,12 @@ static int create_chunk(struct btrfs_tra
+ if (ret) {
+ write_unlock(&em_tree->lock);
+ free_extent_map(em);
+- return ret;
++ return ERR_PTR(ret);
+ }
+ write_unlock(&em_tree->lock);
+
+- ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
+- if (ret)
++ block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
++ if (IS_ERR(block_group))
+ goto error_del_extent;
+
+ for (i = 0; i < map->num_stripes; i++) {
+@@ -5295,7 +5381,7 @@ static int create_chunk(struct btrfs_tra
+ check_raid56_incompat_flag(info, type);
+ check_raid1c34_incompat_flag(info, type);
+
+- return 0;
++ return block_group;
+
+ error_del_extent:
+ write_lock(&em_tree->lock);
+@@ -5307,34 +5393,36 @@ error_del_extent:
+ /* One for the tree reference */
+ free_extent_map(em);
+
+- return ret;
++ return block_group;
+ }
+
+-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
++struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
++ u64 type)
+ {
+ struct btrfs_fs_info *info = trans->fs_info;
+ struct btrfs_fs_devices *fs_devices = info->fs_devices;
+ struct btrfs_device_info *devices_info = NULL;
+ struct alloc_chunk_ctl ctl;
++ struct btrfs_block_group *block_group;
+ int ret;
+
+ lockdep_assert_held(&info->chunk_mutex);
+
+ if (!alloc_profile_is_valid(type, 0)) {
+ ASSERT(0);
+- return -EINVAL;
++ return ERR_PTR(-EINVAL);
+ }
+
+ if (list_empty(&fs_devices->alloc_list)) {
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
+ btrfs_debug(info, "%s: no writable device", __func__);
+- return -ENOSPC;
++ return ERR_PTR(-ENOSPC);
+ }
+
+ if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(info, "invalid chunk type 0x%llx requested", type);
+ ASSERT(0);
+- return -EINVAL;
++ return ERR_PTR(-EINVAL);
+ }
+
+ ctl.start = find_next_chunk(info);
+@@ -5344,46 +5432,43 @@ int btrfs_alloc_chunk(struct btrfs_trans
+ devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
+ GFP_NOFS);
+ if (!devices_info)
+- return -ENOMEM;
++ return ERR_PTR(-ENOMEM);
+
+ ret = gather_device_info(fs_devices, &ctl, devices_info);
+- if (ret < 0)
++ if (ret < 0) {
++ block_group = ERR_PTR(ret);
+ goto out;
++ }
+
+ ret = decide_stripe_size(fs_devices, &ctl, devices_info);
+- if (ret < 0)
++ if (ret < 0) {
++ block_group = ERR_PTR(ret);
+ goto out;
++ }
+
+- ret = create_chunk(trans, &ctl, devices_info);
++ block_group = create_chunk(trans, &ctl, devices_info);
+
+ out:
+ kfree(devices_info);
+- return ret;
++ return block_group;
+ }
+
+ /*
+- * Chunk allocation falls into two parts. The first part does work
+- * that makes the new allocated chunk usable, but does not do any operation
+- * that modifies the chunk tree. The second part does the work that
+- * requires modifying the chunk tree. This division is important for the
+- * bootstrap process of adding storage to a seed btrfs.
++ * This function, btrfs_finish_chunk_alloc(), belongs to phase 2.
++ *
++ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
++ * phases.
+ */
+ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
+ u64 chunk_offset, u64 chunk_size)
+ {
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+- struct btrfs_root *extent_root = fs_info->extent_root;
+- struct btrfs_root *chunk_root = fs_info->chunk_root;
+- struct btrfs_key key;
+ struct btrfs_device *device;
+- struct btrfs_chunk *chunk;
+- struct btrfs_stripe *stripe;
+ struct extent_map *em;
+ struct map_lookup *map;
+- size_t item_size;
+ u64 dev_offset;
+ u64 stripe_size;
+- int i = 0;
++ int i;
+ int ret = 0;
+
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
+@@ -5391,53 +5476,117 @@ int btrfs_finish_chunk_alloc(struct btrf
+ return PTR_ERR(em);
+
+ map = em->map_lookup;
+- item_size = btrfs_chunk_item_size(map->num_stripes);
+ stripe_size = em->orig_block_len;
+
+- chunk = kzalloc(item_size, GFP_NOFS);
+- if (!chunk) {
+- ret = -ENOMEM;
+- goto out;
+- }
+-
+ /*
+ * Take the device list mutex to prevent races with the final phase of
+ * a device replace operation that replaces the device object associated
+ * with the map's stripes, because the device object's id can change
+ * at any time during that final phase of the device replace operation
+- * (dev-replace.c:btrfs_dev_replace_finishing()).
++ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
++ * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
++ * resulting in persisting a device extent item with such ID.
+ */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
+
+- ret = btrfs_update_device(trans, device);
+- if (ret)
+- break;
+ ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
+ dev_offset, stripe_size);
+ if (ret)
+ break;
+ }
+- if (ret) {
+- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
++ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
++
++ free_extent_map(em);
++ return ret;
++}
++
++/*
++ * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
++ * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
++ * chunks.
++ *
++ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
++ * phases.
++ */
++int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
++ struct btrfs_block_group *bg)
++{
++ struct btrfs_fs_info *fs_info = trans->fs_info;
++ struct btrfs_root *extent_root = fs_info->extent_root;
++ struct btrfs_root *chunk_root = fs_info->chunk_root;
++ struct btrfs_key key;
++ struct btrfs_chunk *chunk;
++ struct btrfs_stripe *stripe;
++ struct extent_map *em;
++ struct map_lookup *map;
++ size_t item_size;
++ int i;
++ int ret;
++
++ /*
++ * We take the chunk_mutex for 2 reasons:
++ *
++ * 1) Updates and insertions in the chunk btree must be done while holding
++ * the chunk_mutex, as well as updating the system chunk array in the
++ * superblock. See the comment on top of btrfs_chunk_alloc() for the
++ * details;
++ *
++ * 2) To prevent races with the final phase of a device replace operation
++ * that replaces the device object associated with the map's stripes,
++ * because the device object's id can change at any time during that
++ * final phase of the device replace operation
++ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
++ * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
++ * which would cause a failure when updating the device item, which does
++	 * not exist, or persisting a stripe of the chunk item with such ID.
++ * Here we can't use the device_list_mutex because our caller already
++ * has locked the chunk_mutex, and the final phase of device replace
++ * acquires both mutexes - first the device_list_mutex and then the
++ * chunk_mutex. Using any of those two mutexes protects us from a
++ * concurrent device replace.
++ */
++ lockdep_assert_held(&fs_info->chunk_mutex);
++
++ em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
++ if (IS_ERR(em)) {
++ ret = PTR_ERR(em);
++ btrfs_abort_transaction(trans, ret);
++ return ret;
++ }
++
++ map = em->map_lookup;
++ item_size = btrfs_chunk_item_size(map->num_stripes);
++
++ chunk = kzalloc(item_size, GFP_NOFS);
++ if (!chunk) {
++ ret = -ENOMEM;
++ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
++ for (i = 0; i < map->num_stripes; i++) {
++ struct btrfs_device *device = map->stripes[i].dev;
++
++ ret = btrfs_update_device(trans, device);
++ if (ret)
++ goto out;
++ }
++
+ stripe = &chunk->stripe;
+ for (i = 0; i < map->num_stripes; i++) {
+- device = map->stripes[i].dev;
+- dev_offset = map->stripes[i].physical;
++ struct btrfs_device *device = map->stripes[i].dev;
++ const u64 dev_offset = map->stripes[i].physical;
+
+ btrfs_set_stack_stripe_devid(stripe, device->devid);
+ btrfs_set_stack_stripe_offset(stripe, dev_offset);
+ memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
+ stripe++;
+ }
+- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+- btrfs_set_stack_chunk_length(chunk, chunk_size);
++ btrfs_set_stack_chunk_length(chunk, bg->length);
+ btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
+ btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_type(chunk, map->type);
+@@ -5449,15 +5598,18 @@ int btrfs_finish_chunk_alloc(struct btrf
+
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+- key.offset = chunk_offset;
++ key.offset = bg->start;
+
+ ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
+- if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+- /*
+- * TODO: Cleanup of inserted chunk root in case of
+- * failure.
+- */
++ if (ret)
++ goto out;
++
++ bg->chunk_item_inserted = 1;
++
++ if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
++ if (ret)
++ goto out;
+ }
+
+ out:
+@@ -5470,16 +5622,41 @@ static noinline int init_first_rw_device
+ {
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u64 alloc_profile;
+- int ret;
++ struct btrfs_block_group *meta_bg;
++ struct btrfs_block_group *sys_bg;
++
++ /*
++ * When adding a new device for sprouting, the seed device is read-only
++ * so we must first allocate a metadata and a system chunk. But before
++ * adding the block group items to the extent, device and chunk btrees,
++ * we must first:
++ *
++ * 1) Create both chunks without doing any changes to the btrees, as
++ * otherwise we would get -ENOSPC since the block groups from the
++ * seed device are read-only;
++ *
++ * 2) Add the device item for the new sprout device - finishing the setup
++ * of a new block group requires updating the device item in the chunk
++ * btree, so it must exist when we attempt to do it. The previous step
++ * ensures this does not fail with -ENOSPC.
++ *
++ * After that we can add the block group items to their btrees:
++ * update existing device item in the chunk btree, add a new block group
++ * item to the extent btree, add a new chunk item to the chunk btree and
++ * finally add the new device extent items to the devices btree.
++ */
+
+ alloc_profile = btrfs_metadata_alloc_profile(fs_info);
+- ret = btrfs_alloc_chunk(trans, alloc_profile);
+- if (ret)
+- return ret;
++ meta_bg = btrfs_alloc_chunk(trans, alloc_profile);
++ if (IS_ERR(meta_bg))
++ return PTR_ERR(meta_bg);
+
+ alloc_profile = btrfs_system_alloc_profile(fs_info);
+- ret = btrfs_alloc_chunk(trans, alloc_profile);
+- return ret;
++ sys_bg = btrfs_alloc_chunk(trans, alloc_profile);
++ if (IS_ERR(sys_bg))
++ return PTR_ERR(sys_bg);
++
++ return 0;
+ }
+
+ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
+@@ -7359,10 +7536,18 @@ int btrfs_read_chunk_tree(struct btrfs_f
+ total_dev++;
+ } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
+ struct btrfs_chunk *chunk;
++
++ /*
++ * We are only called at mount time, so no need to take
++ * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
++ * we always lock first fs_info->chunk_mutex before
++ * acquiring any locks on the chunk tree. This is a
++ * requirement for chunk allocation, see the comment on
++ * top of btrfs_chunk_alloc() for details.
++ */
++ ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
+ chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+- mutex_lock(&fs_info->chunk_mutex);
+ ret = read_one_chunk(&found_key, leaf, chunk);
+- mutex_unlock(&fs_info->chunk_mutex);
+ if (ret)
+ goto error;
+ }
+--- a/fs/btrfs/volumes.h
++++ b/fs/btrfs/volumes.h
+@@ -447,7 +447,8 @@ int btrfs_get_io_geometry(struct btrfs_f
+ struct btrfs_io_geometry *io_geom);
+ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
+ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
+-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type);
++struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
++ u64 type);
+ void btrfs_mapping_tree_free(struct extent_map_tree *tree);
+ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+ int mirror_num);
+@@ -505,6 +506,8 @@ unsigned long btrfs_full_stripe_len(stru
+ u64 logical);
+ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
+ u64 chunk_offset, u64 chunk_size);
++int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
++ struct btrfs_block_group *bg);
+ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
+ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
--- /dev/null
+From ea32af47f00a046a1f953370514d6d946efe0152 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 7 Jul 2021 12:23:45 +0100
+Subject: btrfs: zoned: fix wrong mutex unlock on failure to allocate log root tree
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit ea32af47f00a046a1f953370514d6d946efe0152 upstream.
+
+When syncing the log, if we fail to allocate the root node for the log
+root tree:
+
+1) We are unlocking fs_info->tree_log_mutex, but at this point we have
+ not yet locked this mutex;
+
+2) We have locked fs_info->tree_root->log_mutex, but we end up not
+ unlocking it;
+
+So fix this by unlocking fs_info->tree_root->log_mutex instead of
+fs_info->tree_log_mutex.
+
+Fixes: e75f9fd194090e ("btrfs: zoned: move log tree node allocation out of log_root_tree->log_mutex")
+CC: stable@vger.kernel.org # 5.13+
+Reviewed-by: Nikolay Borisov <nborisov@suse.com>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -3173,7 +3173,7 @@ int btrfs_sync_log(struct btrfs_trans_ha
+ if (!log_root_tree->node) {
+ ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
+ if (ret) {
+- mutex_unlock(&fs_info->tree_log_mutex);
++ mutex_unlock(&fs_info->tree_root->log_mutex);
+ goto out;
+ }
+ }
--- /dev/null
+From 3b0462726e7ef281c35a7a4ae33e93ee2bc9975b Mon Sep 17 00:00:00 2001
+From: Christian Brauner <christian.brauner@ubuntu.com>
+Date: Wed, 14 Jul 2021 15:47:49 +0200
+Subject: cgroup: verify that source is a string
+
+From: Christian Brauner <christian.brauner@ubuntu.com>
+
+commit 3b0462726e7ef281c35a7a4ae33e93ee2bc9975b upstream.
+
+The following sequence can be used to trigger a UAF:
+
+ int fscontext_fd = fsopen("cgroup", 0);
+ int fd_null = open("/dev/null", O_RDONLY);
+ fsconfig(fscontext_fd, FSCONFIG_SET_FD, "source", NULL, fd_null);
+ close_range(3, ~0U, 0);
+
+The cgroup v1 specific fs parser expects a string for the "source"
+parameter. However, it is perfectly legitimate to e.g. specify a file
+descriptor for the "source" parameter. The fs parser doesn't know what
+a filesystem allows there. So it's a bug to assume that "source" is
+always of type fs_value_is_string when it can reasonably also be
+fs_value_is_file.
+
+This assumption in the cgroup code causes a UAF because struct
+fs_parameter uses a union for the actual value. Access to that union is
+guarded by the param->type member. Since the cgroup parameter parser
+didn't check param->type but unconditionally moved param->string into
+fc->source, a close on the fscontext_fd would trigger a UAF during
+put_fs_context(), which frees fc->source and thereby the file stashed
+in param->file, causing a UAF during a close of fd_null.
+
+Fix this by verifying that param->type is actually a string and report
+an error if not.
+
+In follow up patches I'll add a new generic helper that can be used here
+and by other filesystems instead of this error-prone copy-pasta fix.
+But fixing it here first makes backporting it to stable a lot easier.
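+
+For reference, a compilable version of the trigger sequence above could
+look like the sketch below. This is an illustration only, to be run on a
+test kernel: it assumes headers new enough to define __NR_fsopen,
+__NR_fsconfig and __NR_close_range, and uses raw syscall() since glibc
+does not provide wrappers for the new mount API syscalls:
+
+  #define _GNU_SOURCE
+  #include <fcntl.h>
+  #include <linux/mount.h>   /* FSCONFIG_SET_FD */
+  #include <sys/syscall.h>
+  #include <unistd.h>
+
+  int main(void)
+  {
+          int fscontext_fd = syscall(__NR_fsopen, "cgroup", 0);
+          int fd_null = open("/dev/null", O_RDONLY);
+
+          /* FSCONFIG_SET_FD: value must be NULL, aux carries the fd */
+          syscall(__NR_fsconfig, fscontext_fd, FSCONFIG_SET_FD, "source",
+                  NULL, fd_null);
+
+          /* close every fd >= 3, including fscontext_fd and fd_null */
+          syscall(__NR_close_range, 3, ~0U, 0);
+          return 0;
+  }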
+
+Fixes: 8d2451f4994f ("cgroup1: switch to option-by-option parsing")
+Reported-by: syzbot+283ce5a46486d6acdbaf@syzkaller.appspotmail.com
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Alexander Viro <viro@zeniv.linux.org.uk>
+Cc: Dmitry Vyukov <dvyukov@google.com>
+Cc: <stable@kernel.org>
+Cc: syzkaller-bugs <syzkaller-bugs@googlegroups.com>
+Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/cgroup/cgroup-v1.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/kernel/cgroup/cgroup-v1.c
++++ b/kernel/cgroup/cgroup-v1.c
+@@ -912,6 +912,8 @@ int cgroup1_parse_param(struct fs_contex
+ opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
+ if (opt == -ENOPARAM) {
+ if (strcmp(param->key, "source") == 0) {
++ if (param->type != fs_value_is_string)
++ return invalf(fc, "Non-string source");
+ if (fc->source)
+ return invalf(fc, "Multiple sources not supported");
+ fc->source = param->string;
--- /dev/null
+From 775da83005cb61d4c213c636df9337da05714ff1 Mon Sep 17 00:00:00 2001
+From: Jinzhou Su <Jinzhou.Su@amd.com>
+Date: Tue, 13 Jul 2021 09:26:11 +0800
+Subject: drm/amdgpu: add another Renoir DID
+
+From: Jinzhou Su <Jinzhou.Su@amd.com>
+
+commit 775da83005cb61d4c213c636df9337da05714ff1 upstream.
+
+Add new PCI device id.
+
+Signed-off-by: Jinzhou Su <Jinzhou.Su@amd.com>
+Reviewed-by: Huang Rui <ray.huang@amd.com>
+Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+Cc: stable@vger.kernel.org # 5.11.x
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+@@ -1092,6 +1092,7 @@ static const struct pci_device_id pciidl
+ {0x1002, 0x734F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14},
+
+ /* Renoir */
++ {0x1002, 0x15E7, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU},
+ {0x1002, 0x1636, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU},
+ {0x1002, 0x1638, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU},
+ {0x1002, 0x164C, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU},
--- /dev/null
+From 2feeb52859fc1ab94cd35b61ada3a6ac4ff24243 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= <ville.syrjala@linux.intel.com>
+Date: Wed, 30 Jun 2021 19:44:13 +0300
+Subject: drm/i915/gt: Fix -EDEADLK handling regression
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Ville Syrjälä <ville.syrjala@linux.intel.com>
+
+commit 2feeb52859fc1ab94cd35b61ada3a6ac4ff24243 upstream.
+
+The conversion to ww mutexes failed to address the fence code which
+already returns -EDEADLK when we run out of fences. Ww mutexes on
+the other hand treat -EDEADLK as an internal errno value indicating
+a need to restart the operation due to a deadlock. So now when the
+fence code returns -EDEADLK the higher level code erroneously
+restarts everything instead of returning the error to userspace
+as is expected.
+
+To remedy this let's switch the fence code to use a different errno
+value for this. -ENOBUFS seems like a semi-reasonable unique choice.
+Apart from igt the only user of this I could find is sna, and even
+there all we do is dump the current fence registers from debugfs
+into the X server log. So no user visible functionality is affected.
+If we really cared about preserving this we could of course convert
+back to -EDEADLK higher up, but it doesn't seem like that's worth
+the hassle here.
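+
+As a rough illustration of the errno clash (a standalone toy model, not
+the actual i915 code; the function names below are made up):
+
+  #include <errno.h>
+  #include <stdio.h>
+
+  /* Toy fence allocator: no fences left, report it with some errno. */
+  static int fence_find(int old_errno)
+  {
+          return old_errno ? -EDEADLK : -ENOBUFS;
+  }
+
+  /* Toy higher level: -EDEADLK is reserved as "back off and restart",
+   * anything else is a real error for the caller. */
+  static int submit(int old_errno)
+  {
+          for (int attempt = 0; attempt < 5; attempt++) {
+                  int err = fence_find(old_errno);
+                  if (err != -EDEADLK)
+                          return err;
+                  /* back off and retry; with -EDEADLK this never ends */
+          }
+          return -EDEADLK;
+  }
+
+  int main(void)
+  {
+          printf("old errno: %d (kept restarting)\n", submit(1));
+          printf("new errno: %d (reported to caller)\n", submit(0));
+          return 0;
+  }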
+
+Not quite sure which commit specifically broke this, but I'll
+just attribute it to the general gem ww mutex work.
+
+Cc: stable@vger.kernel.org
+Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
+Cc: Thomas Hellström <thomas.hellstrom@intel.com>
+Testcase: igt/gem_pread/exhaustion
+Testcase: igt/gem_pwrite/basic-exhaustion
+Testcase: igt/gem_fenced_exec_thrash/too-many-fences
+Fixes: 80f0b679d6f0 ("drm/i915: Add an implementation for i915_gem_ww_ctx locking, v2.")
+Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20210630164413.25481-1-ville.syrjala@linux.intel.com
+Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
+(cherry picked from commit 78d2ad7eb4e1f0e9cd5d79788446b6092c21d3e0)
+Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c
++++ b/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c
+@@ -366,7 +366,7 @@ static struct i915_fence_reg *fence_find
+ if (intel_has_pending_fb_unpin(ggtt->vm.i915))
+ return ERR_PTR(-EAGAIN);
+
+- return ERR_PTR(-EDEADLK);
++ return ERR_PTR(-ENOBUFS);
+ }
+
+ int __i915_vma_pin_fence(struct i915_vma *vma)
--- /dev/null
+From 0abb33bfca0fb74df76aac03e90ce685016ef7be Mon Sep 17 00:00:00 2001
+From: Matthew Auld <matthew.auld@intel.com>
+Date: Tue, 13 Jul 2021 14:04:31 +0100
+Subject: drm/i915/gtt: drop the page table optimisation
+
+From: Matthew Auld <matthew.auld@intel.com>
+
+commit 0abb33bfca0fb74df76aac03e90ce685016ef7be upstream.
+
+We skip filling out the pt with scratch entries if the va range covers
+the entire pt, since we later have to fill it with the PTEs for the
+object pages anyway. However this might leave open a small window where
+the PTEs don't point to anything valid for the HW to consume.
+
+When for example using 2M GTT pages this fill_px() showed up as being
+quite significant in perf measurements, and ends up being completely
+wasted since we ignore the pt and just use the pde directly.
+
+Anyway, currently we have our PTE construction split between alloc and
+insert, which is probably slightly iffy nowadays, since the alloc
+doesn't actually allocate anything anymore, instead it just sets up the
+page directories and points the PTEs at the scratch page. Later when we
+do the insert step we re-program the PTEs again. Better might be to
+squash the alloc and insert into a single step, then bringing back this
+optimisation (along with some others) should be possible.
+
+Fixes: 14826673247e ("drm/i915: Only initialize partially filled pagetables")
+Signed-off-by: Matthew Auld <matthew.auld@intel.com>
+Cc: Jon Bloomfield <jon.bloomfield@intel.com>
+Cc: Chris Wilson <chris.p.wilson@intel.com>
+Cc: Daniel Vetter <daniel@ffwll.ch>
+Cc: <stable@vger.kernel.org> # v4.15+
+Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
+Link: https://patchwork.freedesktop.org/patch/msgid/20210713130431.2392740-1-matthew.auld@intel.com
+(cherry picked from commit 8f88ca76b3942d82e2c1cea8735ec368d89ecc15)
+Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+--- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
++++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
+@@ -298,10 +298,7 @@ static void __gen8_ppgtt_alloc(struct i9
+ __i915_gem_object_pin_pages(pt->base);
+ i915_gem_object_make_unshrinkable(pt->base);
+
+- if (lvl ||
+- gen8_pt_count(*start, end) < I915_PDES ||
+- intel_vgpu_active(vm->i915))
+- fill_px(pt, vm->scratch[lvl]->encode);
++ fill_px(pt, vm->scratch[lvl]->encode);
+
+ spin_lock(&pd->lock);
+ if (likely(!pd->entry[idx])) {
--- /dev/null
+From a1c9ca5f65c9acfd7c02474b9d5cacbd7ea288df Mon Sep 17 00:00:00 2001
+From: Randy Dunlap <rdunlap@infradead.org>
+Date: Thu, 15 Jul 2021 11:55:31 -0700
+Subject: EDAC/igen6: fix core dependency AGAIN
+
+From: Randy Dunlap <rdunlap@infradead.org>
+
+commit a1c9ca5f65c9acfd7c02474b9d5cacbd7ea288df upstream.
+
+My previous patch had a typo/thinko which prevents this driver
+from being enabled: change X64_64 to X86_64.
+
+Fixes: 0a9ece9ba154 ("EDAC/igen6: fix core dependency")
+Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
+Cc: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
+Cc: linux-edac@vger.kernel.org
+Cc: bowsingbetee <bowsingbetee@protonmail.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Tony Luck <tony.luck@intel.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/edac/Kconfig | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/edac/Kconfig
++++ b/drivers/edac/Kconfig
+@@ -271,7 +271,7 @@ config EDAC_PND2
+ config EDAC_IGEN6
+ tristate "Intel client SoC Integrated MC"
+ depends on PCI && PCI_MMCONFIG && ARCH_HAVE_NMI_SAFE_CMPXCHG
+- depends on X64_64 && X86_MCE_INTEL
++ depends on X86_64 && X86_MCE_INTEL
+ help
+ Support for error detection and correction on the Intel
+ client SoC Integrated Memory Controller using In-Band ECC IP.
--- /dev/null
+From 0af778269a522c988ef0b4188556aba97fb420cc Mon Sep 17 00:00:00 2001
+From: Zhen Lei <thunder.leizhen@huawei.com>
+Date: Mon, 12 Jul 2021 16:55:44 +0800
+Subject: fbmem: Do not delete the mode that is still in use
+
+From: Zhen Lei <thunder.leizhen@huawei.com>
+
+commit 0af778269a522c988ef0b4188556aba97fb420cc upstream.
+
+The execution of fb_delete_videomode() is not based on the result of the
+previous fbcon_mode_deleted(). As a result, the mode is directly deleted,
+regardless of whether it is still in use, which may cause a UAF.
+
+==================================================================
+BUG: KASAN: use-after-free in fb_mode_is_equal+0x36e/0x5e0 \
+drivers/video/fbdev/core/modedb.c:924
+Read of size 4 at addr ffff88807e0ddb1c by task syz-executor.0/18962
+
+CPU: 2 PID: 18962 Comm: syz-executor.0 Not tainted 5.10.45-rc1+ #3
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ...
+Call Trace:
+ __dump_stack lib/dump_stack.c:77 [inline]
+ dump_stack+0x137/0x1be lib/dump_stack.c:118
+ print_address_description+0x6c/0x640 mm/kasan/report.c:385
+ __kasan_report mm/kasan/report.c:545 [inline]
+ kasan_report+0x13d/0x1e0 mm/kasan/report.c:562
+ fb_mode_is_equal+0x36e/0x5e0 drivers/video/fbdev/core/modedb.c:924
+ fbcon_mode_deleted+0x16a/0x220 drivers/video/fbdev/core/fbcon.c:2746
+ fb_set_var+0x1e1/0xdb0 drivers/video/fbdev/core/fbmem.c:975
+ do_fb_ioctl+0x4d9/0x6e0 drivers/video/fbdev/core/fbmem.c:1108
+ vfs_ioctl fs/ioctl.c:48 [inline]
+ __do_sys_ioctl fs/ioctl.c:753 [inline]
+ __se_sys_ioctl+0xfb/0x170 fs/ioctl.c:739
+ do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+Freed by task 18960:
+ kasan_save_stack mm/kasan/common.c:48 [inline]
+ kasan_set_track+0x3d/0x70 mm/kasan/common.c:56
+ kasan_set_free_info+0x17/0x30 mm/kasan/generic.c:355
+ __kasan_slab_free+0x108/0x140 mm/kasan/common.c:422
+ slab_free_hook mm/slub.c:1541 [inline]
+ slab_free_freelist_hook+0xd6/0x1a0 mm/slub.c:1574
+ slab_free mm/slub.c:3139 [inline]
+ kfree+0xca/0x3d0 mm/slub.c:4121
+ fb_delete_videomode+0x56a/0x820 drivers/video/fbdev/core/modedb.c:1104
+ fb_set_var+0x1f3/0xdb0 drivers/video/fbdev/core/fbmem.c:978
+ do_fb_ioctl+0x4d9/0x6e0 drivers/video/fbdev/core/fbmem.c:1108
+ vfs_ioctl fs/ioctl.c:48 [inline]
+ __do_sys_ioctl fs/ioctl.c:753 [inline]
+ __se_sys_ioctl+0xfb/0x170 fs/ioctl.c:739
+ do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+Fixes: 13ff178ccd6d ("fbcon: Call fbcon_mode_deleted/new_modelist directly")
+Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
+Cc: <stable@vger.kernel.org> # v5.3+
+Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
+Link: https://patchwork.freedesktop.org/patch/msgid/20210712085544.2828-1-thunder.leizhen@huawei.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/video/fbdev/core/fbmem.c | 12 +++++-------
+ 1 file changed, 5 insertions(+), 7 deletions(-)
+
+--- a/drivers/video/fbdev/core/fbmem.c
++++ b/drivers/video/fbdev/core/fbmem.c
+@@ -970,13 +970,11 @@ fb_set_var(struct fb_info *info, struct
+ fb_var_to_videomode(&mode2, &info->var);
+ /* make sure we don't delete the videomode of current var */
+ ret = fb_mode_is_equal(&mode1, &mode2);
+-
+- if (!ret)
+- fbcon_mode_deleted(info, &mode1);
+-
+- if (!ret)
+- fb_delete_videomode(&mode1, &info->modelist);
+-
++ if (!ret) {
++ ret = fbcon_mode_deleted(info, &mode1);
++ if (!ret)
++ fb_delete_videomode(&mode1, &info->modelist);
++ }
+
+ return ret ? -EINVAL : 0;
+ }
--- /dev/null
+From 9c6882608bce249a8918744ecdb65748534e3f17 Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Sat, 10 Jul 2021 02:45:59 +0100
+Subject: io_uring: use right task for exiting checks
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+commit 9c6882608bce249a8918744ecdb65748534e3f17 upstream.
+
+When we use delayed_work for fallback execution of requests, current
+will not be the submitter task, and so checks in io_req_task_submit()
+may not behave as expected. Currently, it leaves inline completions not
+flushed, making io_ring_exit_work() hang. Use the submitter task for
+all those checks.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/cb413c715bed0bc9c98b169059ea9c8a2c770715.1625881431.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io_uring.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -2023,7 +2023,7 @@ static void __io_req_task_submit(struct
+
+ /* ctx stays valid until unlock, even if we drop all ours ctx->refs */
+ mutex_lock(&ctx->uring_lock);
+- if (!(current->flags & PF_EXITING) && !current->in_execve)
++ if (!(req->task->flags & PF_EXITING) && !req->task->in_execve)
+ __io_queue_sqe(req);
+ else
+ __io_req_task_cancel(req, -EFAULT);
--- /dev/null
+From 474dd1c6506411752a9b2f2233eec11f1733a099 Mon Sep 17 00:00:00 2001
+From: Lu Baolu <baolu.lu@linux.intel.com>
+Date: Mon, 12 Jul 2021 15:17:12 +0800
+Subject: iommu/vt-d: Fix clearing real DMA device's scalable-mode context entries
+
+From: Lu Baolu <baolu.lu@linux.intel.com>
+
+commit 474dd1c6506411752a9b2f2233eec11f1733a099 upstream.
+
+Commit 2b0140c69637e ("iommu/vt-d: Use pci_real_dma_dev() for mapping")
+fixed an issue where, when a sub-device is removed, the context entry is
+cleared for all aliases. But that commit didn't consider the PASID entry
+and PASID table in VT-d scalable mode. This fix increases the coverage of
+scalable mode.
+
+Suggested-by: Sanjay Kumar <sanjay.k.kumar@intel.com>
+Fixes: 8038bdb855331 ("iommu/vt-d: Only clear real DMA device's context entries")
+Fixes: 2b0140c69637e ("iommu/vt-d: Use pci_real_dma_dev() for mapping")
+Cc: stable@vger.kernel.org # v5.6+
+Cc: Jon Derrick <jonathan.derrick@intel.com>
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Link: https://lore.kernel.org/r/20210712071712.3416949-1-baolu.lu@linux.intel.com
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/iommu/intel/iommu.c | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+--- a/drivers/iommu/intel/iommu.c
++++ b/drivers/iommu/intel/iommu.c
+@@ -4503,14 +4503,13 @@ static void __dmar_remove_one_dev_info(s
+ iommu = info->iommu;
+ domain = info->domain;
+
+- if (info->dev) {
++ if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
+ if (dev_is_pci(info->dev) && sm_supported(iommu))
+ intel_pasid_tear_down_entry(iommu, info->dev,
+ PASID_RID2PASID, false);
+
+ iommu_disable_dev_iotlb(info);
+- if (!dev_is_real_dma_subdevice(info->dev))
+- domain_context_clear(info);
++ domain_context_clear(info);
+ intel_pasid_free_table(info->dev);
+ }
+
--- /dev/null
+From 37764b952e1b39053defc7ebe5dcd8c4e3e78de9 Mon Sep 17 00:00:00 2001
+From: Sanjay Kumar <sanjay.k.kumar@intel.com>
+Date: Mon, 12 Jul 2021 15:13:15 +0800
+Subject: iommu/vt-d: Global devTLB flush when present context entry changed
+
+From: Sanjay Kumar <sanjay.k.kumar@intel.com>
+
+commit 37764b952e1b39053defc7ebe5dcd8c4e3e78de9 upstream.
+
+This fixes a bug in the context cache clear operation. The code was not
+following the correct invalidation flow: a global device TLB invalidation
+should be added after the IOTLB invalidation. At the same time, it uses
+the domain ID from the context entry, but in scalable mode the domain ID
+is in the PASID table entry, not the context entry.
+
+Fixes: 7373a8cc38197 ("iommu/vt-d: Setup context and enable RID2PASID support")
+Cc: stable@vger.kernel.org # v5.0+
+Signed-off-by: Sanjay Kumar <sanjay.k.kumar@intel.com>
+Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+Link: https://lore.kernel.org/r/20210712071315.3416543-1-baolu.lu@linux.intel.com
+Signed-off-by: Joerg Roedel <jroedel@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/iommu/intel/iommu.c | 31 ++++++++++++++++++++++---------
+ 1 file changed, 22 insertions(+), 9 deletions(-)
+
+--- a/drivers/iommu/intel/iommu.c
++++ b/drivers/iommu/intel/iommu.c
+@@ -2434,10 +2434,11 @@ __domain_mapping(struct dmar_domain *dom
+ return 0;
+ }
+
+-static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
++static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
+ {
+- unsigned long flags;
++ struct intel_iommu *iommu = info->iommu;
+ struct context_entry *context;
++ unsigned long flags;
+ u16 did_old;
+
+ if (!iommu)
+@@ -2449,7 +2450,16 @@ static void domain_context_clear_one(str
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ return;
+ }
+- did_old = context_domain_id(context);
++
++ if (sm_supported(iommu)) {
++ if (hw_pass_through && domain_type_is_si(info->domain))
++ did_old = FLPT_DEFAULT_DID;
++ else
++ did_old = info->domain->iommu_did[iommu->seq_id];
++ } else {
++ did_old = context_domain_id(context);
++ }
++
+ context_clear_entry(context);
+ __iommu_flush_cache(iommu, context, sizeof(*context));
+ spin_unlock_irqrestore(&iommu->lock, flags);
+@@ -2467,6 +2477,8 @@ static void domain_context_clear_one(str
+ 0,
+ 0,
+ DMA_TLB_DSI_FLUSH);
++
++ __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
+ }
+
+ static inline void unlink_domain_info(struct device_domain_info *info)
+@@ -4456,9 +4468,9 @@ out_free_dmar:
+
+ static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
+ {
+- struct intel_iommu *iommu = opaque;
++ struct device_domain_info *info = opaque;
+
+- domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
++ domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
+ return 0;
+ }
+
+@@ -4468,12 +4480,13 @@ static int domain_context_clear_one_cb(s
+ * devices, unbinding the driver from any one of them will possibly leave
+ * the others unable to operate.
+ */
+-static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
++static void domain_context_clear(struct device_domain_info *info)
+ {
+- if (!iommu || !dev || !dev_is_pci(dev))
++ if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
+ return;
+
+- pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
++ pci_for_each_dma_alias(to_pci_dev(info->dev),
++ &domain_context_clear_one_cb, info);
+ }
+
+ static void __dmar_remove_one_dev_info(struct device_domain_info *info)
+@@ -4497,7 +4510,7 @@ static void __dmar_remove_one_dev_info(s
+
+ iommu_disable_dev_iotlb(info);
+ if (!dev_is_real_dma_subdevice(info->dev))
+- domain_context_clear(iommu, info->dev);
++ domain_context_clear(info);
+ intel_pasid_free_table(info->dev);
+ }
+
--- /dev/null
+From d08af0a59684e18a51aa4bfd24c658994ea3fc5b Mon Sep 17 00:00:00 2001
+From: Joao Martins <joao.m.martins@oracle.com>
+Date: Wed, 14 Jul 2021 21:27:11 -0700
+Subject: mm/hugetlb: fix refs calculation from unaligned @vaddr
+
+From: Joao Martins <joao.m.martins@oracle.com>
+
+commit d08af0a59684e18a51aa4bfd24c658994ea3fc5b upstream.
+
+Commit 82e5d378b0e47 ("mm/hugetlb: refactor subpage recording")
+refactored the count of subpages but missed an edge case when @vaddr is
+not aligned to PAGE_SIZE, e.g. when close to vma->vm_end. It would then
+erroneously set @refs to 0 and record_subpages_vmas() wouldn't set the
+@pages array element to its value, consequently causing the null-deref
+reported by syzbot.
+
+Fix it by aligning down @vaddr by PAGE_SIZE in @refs calculation.
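+
+To illustrate the arithmetic with concrete numbers, here is a standalone
+sketch (PAGE_SIZE, PAGE_SHIFT and ALIGN_DOWN are re-defined locally, and
+only the term that changed is shown; the real code takes the min3() of
+this term, the remainder and the huge page bounds):
+
+  #include <stdio.h>
+
+  #define PAGE_SHIFT 12
+  #define PAGE_SIZE  (1UL << PAGE_SHIFT)
+  #define ALIGN_DOWN(x, a) ((x) & ~((unsigned long)(a) - 1))
+
+  int main(void)
+  {
+          unsigned long vm_end = 0x7f0000200000UL;  /* page aligned */
+          unsigned long vaddr  = vm_end - 100;      /* not page aligned */
+
+          /* old: (vm_end - vaddr) >> PAGE_SHIFT == 0, so @refs could be 0 */
+          printf("old refs term: %lu\n", (vm_end - vaddr) >> PAGE_SHIFT);
+
+          /* new: the partial page before vm_end still counts as one page */
+          printf("new refs term: %lu\n",
+                 (vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
+          return 0;
+  }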
+
+Link: https://lkml.kernel.org/r/20210713152440.28650-1-joao.m.martins@oracle.com
+Fixes: 82e5d378b0e47 ("mm/hugetlb: refactor subpage recording")
+Reported-by: syzbot+a3fcd59df1b372066f5a@syzkaller.appspotmail.com
+Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
+Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/hugetlb.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -5029,8 +5029,9 @@ long follow_hugetlb_page(struct mm_struc
+ continue;
+ }
+
+- refs = min3(pages_per_huge_page(h) - pfn_offset,
+- (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
++ /* vaddr may not be aligned to PAGE_SIZE */
++ refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
++ (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
+
+ if (pages || vmas)
+ record_subpages_vmas(mem_map_offset(page, pfn_offset),
--- /dev/null
+From 93aa71ad7379900e61c8adff6a710a4c18c7c99b Mon Sep 17 00:00:00 2001
+From: Tyrel Datwyler <tyreld@linux.ibm.com>
+Date: Thu, 1 Jul 2021 13:56:59 -0600
+Subject: scsi: core: Fix bad pointer dereference when ehandler kthread is invalid
+
+From: Tyrel Datwyler <tyreld@linux.ibm.com>
+
+commit 93aa71ad7379900e61c8adff6a710a4c18c7c99b upstream.
+
+Commit 66a834d09293 ("scsi: core: Fix error handling of scsi_host_alloc()")
+changed the allocation logic to call put_device() to perform host cleanup
+with the assumption that IDA removal and stopping the kthread would
+properly be performed in scsi_host_dev_release(). However, in the unlikely
+case that the error handler thread fails to spawn, shost->ehandler is set
+to ERR_PTR(-ENOMEM).
+
+The error handler cleanup code in scsi_host_dev_release() will call
+kthread_stop() if shost->ehandler != NULL, which will always be the case
+whether or not the kthread was successfully spawned. In the case that it
+failed to spawn, this has the nasty side effect of trying to dereference an
+invalid pointer when kthread_stop() is called. The following splat provides
+an example of this behavior in the wild:
+
+scsi host11: error handler thread failed to spawn, error = -4
+Kernel attempted to read user page (10c) - exploit attempt? (uid: 0)
+BUG: Kernel NULL pointer dereference on read at 0x0000010c
+Faulting instruction address: 0xc00000000818e9a8
+Oops: Kernel access of bad area, sig: 11 [#1]
+LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
+Modules linked in: ibmvscsi(+) scsi_transport_srp dm_multipath dm_mirror dm_region
+ hash dm_log dm_mod fuse overlay squashfs loop
+CPU: 12 PID: 274 Comm: systemd-udevd Not tainted 5.13.0-rc7 #1
+NIP: c00000000818e9a8 LR: c0000000089846e8 CTR: 0000000000007ee8
+REGS: c000000037d12ea0 TRAP: 0300 Not tainted (5.13.0-rc7)
+MSR: 800000000280b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> CR: 28228228
+XER: 20040001
+CFAR: c0000000089846e4 DAR: 000000000000010c DSISR: 40000000 IRQMASK: 0
+GPR00: c0000000089846e8 c000000037d13140 c000000009cc1100 fffffffffffffffc
+GPR04: 0000000000000001 0000000000000000 0000000000000000 c000000037dc0000
+GPR08: 0000000000000000 c000000037dc0000 0000000000000001 00000000fffff7ff
+GPR12: 0000000000008000 c00000000a049000 c000000037d13d00 000000011134d5a0
+GPR16: 0000000000001740 c0080000190d0000 c0080000190d1740 c000000009129288
+GPR20: c000000037d13bc0 0000000000000001 c000000037d13bc0 c0080000190b7898
+GPR24: c0080000190b7708 0000000000000000 c000000033bb2c48 0000000000000000
+GPR28: c000000046b28280 0000000000000000 000000000000010c fffffffffffffffc
+NIP [c00000000818e9a8] kthread_stop+0x38/0x230
+LR [c0000000089846e8] scsi_host_dev_release+0x98/0x160
+Call Trace:
+[c000000033bb2c48] 0xc000000033bb2c48 (unreliable)
+[c0000000089846e8] scsi_host_dev_release+0x98/0x160
+[c00000000891e960] device_release+0x60/0x100
+[c0000000087e55c4] kobject_release+0x84/0x210
+[c00000000891ec78] put_device+0x28/0x40
+[c000000008984ea4] scsi_host_alloc+0x314/0x430
+[c0080000190b38bc] ibmvscsi_probe+0x54/0xad0 [ibmvscsi]
+[c000000008110104] vio_bus_probe+0xa4/0x4b0
+[c00000000892a860] really_probe+0x140/0x680
+[c00000000892aefc] driver_probe_device+0x15c/0x200
+[c00000000892b63c] device_driver_attach+0xcc/0xe0
+[c00000000892b740] __driver_attach+0xf0/0x200
+[c000000008926f28] bus_for_each_dev+0xa8/0x130
+[c000000008929ce4] driver_attach+0x34/0x50
+[c000000008928fc0] bus_add_driver+0x1b0/0x300
+[c00000000892c798] driver_register+0x98/0x1a0
+[c00000000810eb60] __vio_register_driver+0x80/0xe0
+[c0080000190b4a30] ibmvscsi_module_init+0x9c/0xdc [ibmvscsi]
+[c0000000080121d0] do_one_initcall+0x60/0x2d0
+[c000000008261abc] do_init_module+0x7c/0x320
+[c000000008265700] load_module+0x2350/0x25b0
+[c000000008265cb4] __do_sys_finit_module+0xd4/0x160
+[c000000008031110] system_call_exception+0x150/0x2d0
+[c00000000800d35c] system_call_common+0xec/0x278
+
+Fix this by nulling shost->ehandler when the kthread fails to spawn.
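+
+For background, ERR_PTR(-ENOMEM) is a non-NULL encoded pointer, so a plain
+NULL check cannot distinguish it from a valid task pointer. A minimal
+userspace sketch, with the kernel's ERR_PTR()/IS_ERR() encoding re-defined
+locally for illustration:
+
+  #include <errno.h>
+  #include <stdio.h>
+
+  #define MAX_ERRNO 4095
+  #define ERR_PTR(err) ((void *)(long)(err))
+  #define IS_ERR(ptr)  ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)
+
+  int main(void)
+  {
+          /* what a failed spawn leaves in shost->ehandler, per the above */
+          void *ehandler = ERR_PTR(-ENOMEM);
+
+          /* non-NULL, so a bare "if (ehandler)" check happily passes it on */
+          printf("ehandler != NULL: %d\n", ehandler != NULL);
+          /* the check that tells an encoded errno apart from a real pointer */
+          printf("IS_ERR(ehandler): %d\n", (int)IS_ERR(ehandler));
+          return 0;
+  }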
+
+Link: https://lore.kernel.org/r/20210701195659.3185475-1-tyreld@linux.ibm.com
+Fixes: 66a834d09293 ("scsi: core: Fix error handling of scsi_host_alloc()")
+Cc: stable@vger.kernel.org
+Reviewed-by: Ming Lei <ming.lei@redhat.com>
+Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/scsi/hosts.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/scsi/hosts.c
++++ b/drivers/scsi/hosts.c
+@@ -490,6 +490,7 @@ struct Scsi_Host *scsi_host_alloc(struct
+ shost_printk(KERN_WARNING, shost,
+ "error handler thread failed to spawn, error = %ld\n",
+ PTR_ERR(shost->ehandler));
++ shost->ehandler = NULL;
+ goto fail;
+ }
+
--- /dev/null
+From 8b3bdd99c092bbaeaa7d9eecb1a3e5dc9112002b Mon Sep 17 00:00:00 2001
+From: Steffen Maier <maier@linux.ibm.com>
+Date: Fri, 2 Jul 2021 18:09:22 +0200
+Subject: scsi: zfcp: Report port fc_security as unknown early during remote cable pull
+
+From: Steffen Maier <maier@linux.ibm.com>
+
+commit 8b3bdd99c092bbaeaa7d9eecb1a3e5dc9112002b upstream.
+
+On remote cable pull, a zfcp_port keeps its status and only gets
+ZFCP_STATUS_PORT_LINK_TEST added. Only after an ADISC timeout would we
+actually start port recovery and remove ZFCP_STATUS_COMMON_UNBLOCKED, which
+zfcp_sysfs_port_fc_security_show() detects and reports as "unknown"
+instead of the old and possibly stale zfcp_port->connection_info.
+
+Add check for ZFCP_STATUS_PORT_LINK_TEST for timely "unknown" report.
+
+Link: https://lore.kernel.org/r/20210702160922.2667874-1-maier@linux.ibm.com
+Fixes: a17c78460093 ("scsi: zfcp: report FC Endpoint Security in sysfs")
+Cc: <stable@vger.kernel.org> #5.7+
+Reviewed-by: Benjamin Block <bblock@linux.ibm.com>
+Signed-off-by: Steffen Maier <maier@linux.ibm.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/s390/scsi/zfcp_sysfs.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/s390/scsi/zfcp_sysfs.c
++++ b/drivers/s390/scsi/zfcp_sysfs.c
+@@ -487,6 +487,7 @@ static ssize_t zfcp_sysfs_port_fc_securi
+ if (0 == (status & ZFCP_STATUS_COMMON_OPEN) ||
+ 0 == (status & ZFCP_STATUS_COMMON_UNBLOCKED) ||
+ 0 == (status & ZFCP_STATUS_PORT_PHYS_OPEN) ||
++ 0 != (status & ZFCP_STATUS_PORT_LINK_TEST) ||
+ 0 != (status & ZFCP_STATUS_COMMON_ERP_FAILED) ||
+ 0 != (status & ZFCP_STATUS_COMMON_ACCESS_BOXED))
+ i = sprintf(buf, "unknown\n");
kvm-x86-mmu-do-not-apply-hpa-memory-encryption-mask-to-gpas.patch
kvm-nsvm-check-the-value-written-to-msr_vm_hsave_pa.patch
kvm-x86-disable-hardware-breakpoints-unconditionally-before-kvm_x86-run.patch
+scsi-core-fix-bad-pointer-dereference-when-ehandler-kthread-is-invalid.patch
+scsi-zfcp-report-port-fc_security-as-unknown-early-during-remote-cable-pull.patch
+iommu-vt-d-global-devtlb-flush-when-present-context-entry-changed.patch
+iommu-vt-d-fix-clearing-real-dma-device-s-scalable-mode-context-entries.patch
+tracing-do-not-reference-char-as-a-string-in-histograms.patch
+drm-amdgpu-add-another-renoir-did.patch
+drm-i915-gtt-drop-the-page-table-optimisation.patch
+drm-i915-gt-fix-edeadlk-handling-regression.patch
+cgroup-verify-that-source-is-a-string.patch
+fbmem-do-not-delete-the-mode-that-is-still-in-use.patch
+edac-igen6-fix-core-dependency-again.patch
+mm-hugetlb-fix-refs-calculation-from-unaligned-vaddr.patch
+arm64-avoid-premature-usercopy-failure.patch
+io_uring-use-right-task-for-exiting-checks.patch
+btrfs-properly-split-extent_map-for-req_op_zone_append.patch
+btrfs-fix-deadlock-with-concurrent-chunk-allocations-involving-system-chunks.patch
+btrfs-rework-chunk-allocation-to-avoid-exhaustion-of-the-system-chunk-array.patch
+btrfs-zoned-fix-wrong-mutex-unlock-on-failure-to-allocate-log-root-tree.patch
--- /dev/null
+From 704adfb5a9978462cd861f170201ae2b5e3d3a80 Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
+Date: Thu, 15 Jul 2021 00:02:06 -0400
+Subject: tracing: Do not reference char * as a string in histograms
+
+From: Steven Rostedt (VMware) <rostedt@goodmis.org>
+
+commit 704adfb5a9978462cd861f170201ae2b5e3d3a80 upstream.
+
+The histogram logic was allowing events with char * pointers to be used as
+normal strings. But it was easy to crash the kernel with:
+
+ # echo 'hist:keys=filename' > events/syscalls/sys_enter_openat/trigger
+
+And open some files, and boom!
+
+ BUG: unable to handle page fault for address: 00007f2ced0c3280
+ #PF: supervisor read access in kernel mode
+ #PF: error_code(0x0000) - not-present page
+ PGD 1173fa067 P4D 1173fa067 PUD 1171b6067 PMD 1171dd067 PTE 0
+ Oops: 0000 [#1] PREEMPT SMP
+ CPU: 6 PID: 1810 Comm: cat Not tainted 5.13.0-rc5-test+ #61
+ Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01
+v03.03 07/14/2016
+ RIP: 0010:strlen+0x0/0x20
+ Code: f6 82 80 2a 0b a9 20 74 11 0f b6 50 01 48 83 c0 01 f6 82 80 2a 0b
+a9 20 75 ef c3 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 <80> 3f 00 74
+10 48 89 f8 48 83 c0 01 80 38 00 75 f7 48 29 f8 c3
+
+ RSP: 0018:ffffbdbf81567b50 EFLAGS: 00010246
+ RAX: 0000000000000003 RBX: ffff93815cdb3800 RCX: ffff9382401a22d0
+ RDX: 0000000000000100 RSI: 0000000000000000 RDI: 00007f2ced0c3280
+ RBP: 0000000000000100 R08: ffff9382409ff074 R09: ffffbdbf81567c98
+ R10: ffff9382409ff074 R11: 0000000000000000 R12: ffff9382409ff074
+ R13: 0000000000000001 R14: ffff93815a744f00 R15: 00007f2ced0c3280
+ FS: 00007f2ced0f8580(0000) GS:ffff93825a800000(0000)
+knlGS:0000000000000000
+ CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ CR2: 00007f2ced0c3280 CR3: 0000000107069005 CR4: 00000000001706e0
+ Call Trace:
+ event_hist_trigger+0x463/0x5f0
+ ? find_held_lock+0x32/0x90
+ ? sched_clock_cpu+0xe/0xd0
+ ? lock_release+0x155/0x440
+ ? kernel_init_free_pages+0x6d/0x90
+ ? preempt_count_sub+0x9b/0xd0
+ ? kernel_init_free_pages+0x6d/0x90
+ ? get_page_from_freelist+0x12c4/0x1680
+ ? __rb_reserve_next+0xe5/0x460
+ ? ring_buffer_lock_reserve+0x12a/0x3f0
+ event_triggers_call+0x52/0xe0
+ ftrace_syscall_enter+0x264/0x2c0
+ syscall_trace_enter.constprop.0+0x1ee/0x210
+ do_syscall_64+0x1c/0x80
+ entry_SYSCALL_64_after_hwframe+0x44/0xae
+
+Where it triggered a fault on strlen(key) where key was the filename.
+
+The reason is that filename is a char * to user space, and the histogram
+code just blindly dereferenced it, with obvious bad results.
+
+I originally tried to use strncpy_from_user/kernel_nofault() but found
+that there are other places where it's dereferenced, so it's not worth the
+effort.
+
+Just do not allow "char *" to act like strings.
+
+Link: https://lkml.kernel.org/r/20210715000206.025df9d2@rorschach.local.home
+
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Tzvetomir Stoyanov <tz.stoyanov@gmail.com>
+Cc: stable@vger.kernel.org
+Acked-by: Namhyung Kim <namhyung@kernel.org>
+Acked-by: Tom Zanussi <zanussi@kernel.org>
+Fixes: 79e577cbce4c4 ("tracing: Support string type key properly")
+Fixes: 5967bd5c4239 ("tracing: Let filter_assign_type() detect FILTER_PTR_STRING")
+Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/trace_events_hist.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/kernel/trace/trace_events_hist.c
++++ b/kernel/trace/trace_events_hist.c
+@@ -1673,7 +1673,9 @@ static struct hist_field *create_hist_fi
+ if (WARN_ON_ONCE(!field))
+ goto out;
+
+- if (is_string_field(field)) {
++ /* Pointers to strings are just pointers and dangerous to dereference */
++ if (is_string_field(field) &&
++ (field->filter_type != FILTER_PTR_STRING)) {
+ flags |= HIST_FIELD_FL_STRING;
+
+ hist_field->size = MAX_FILTER_STR_VAL;
+@@ -4469,8 +4471,6 @@ static inline void add_to_key(char *comp
+ field = key_field->field;
+ if (field->filter_type == FILTER_DYN_STRING)
+ size = *(u32 *)(rec + field->offset) >> 16;
+- else if (field->filter_type == FILTER_PTR_STRING)
+- size = strlen(key);
+ else if (field->filter_type == FILTER_STATIC_STRING)
+ size = field->size;
+