From 435f9920f94d8bd937cbba7dbb61d3f68925b1e6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 19 Jul 2021 14:16:28 +0200 Subject: [PATCH] 5.13-stable patches added patches: arm64-avoid-premature-usercopy-failure.patch btrfs-don-t-block-if-we-can-t-acquire-the-reclaim-lock.patch btrfs-fix-deadlock-with-concurrent-chunk-allocations-involving-system-chunks.patch btrfs-properly-split-extent_map-for-req_op_zone_append.patch btrfs-rework-chunk-allocation-to-avoid-exhaustion-of-the-system-chunk-array.patch btrfs-zoned-fix-types-for-u64-division-in-btrfs_reclaim_bgs_work.patch btrfs-zoned-fix-wrong-mutex-unlock-on-failure-to-allocate-log-root-tree.patch cgroup-verify-that-source-is-a-string.patch drm-amdgpu-add-another-renoir-did.patch drm-i915-gt-fix-edeadlk-handling-regression.patch drm-i915-gtt-drop-the-page-table-optimisation.patch edac-igen6-fix-core-dependency-again.patch fbmem-do-not-delete-the-mode-that-is-still-in-use.patch io_uring-use-right-task-for-exiting-checks.patch iommu-vt-d-fix-clearing-real-dma-device-s-scalable-mode-context-entries.patch iommu-vt-d-global-devtlb-flush-when-present-context-entry-changed.patch mm-hugetlb-fix-refs-calculation-from-unaligned-vaddr.patch scsi-core-fix-bad-pointer-dereference-when-ehandler-kthread-is-invalid.patch scsi-zfcp-report-port-fc_security-as-unknown-early-during-remote-cable-pull.patch tracing-do-not-reference-char-as-a-string-in-histograms.patch --- ...m64-avoid-premature-usercopy-failure.patch | 199 +++ ...if-we-can-t-acquire-the-reclaim-lock.patch | 44 + ...-allocations-involving-system-chunks.patch | 212 +++ ...it-extent_map-for-req_op_zone_append.patch | 248 ++++ ...exhaustion-of-the-system-chunk-array.patch | 1280 +++++++++++++++++ ...4-division-in-btrfs_reclaim_bgs_work.patch | 35 + ...on-failure-to-allocate-log-root-tree.patch | 43 + ...group-verify-that-source-is-a-string.patch | 64 + .../drm-amdgpu-add-another-renoir-did.patch | 31 + ...5-gt-fix-edeadlk-handling-regression.patch | 60 + ...gtt-drop-the-page-table-optimisation.patch | 55 + ...edac-igen6-fix-core-dependency-again.patch | 38 + ...delete-the-mode-that-is-still-in-use.patch | 85 ++ ...ng-use-right-task-for-exiting-checks.patch | 35 + ...vice-s-scalable-mode-context-entries.patch | 47 + ...h-when-present-context-entry-changed.patch | 107 ++ ...efs-calculation-from-unaligned-vaddr.patch | 45 + ...nce-when-ehandler-kthread-is-invalid.patch | 96 ++ ...known-early-during-remote-cable-pull.patch | 38 + queue-5.13/series | 20 + ...rence-char-as-a-string-in-histograms.patch | 105 ++ 21 files changed, 2887 insertions(+) create mode 100644 queue-5.13/arm64-avoid-premature-usercopy-failure.patch create mode 100644 queue-5.13/btrfs-don-t-block-if-we-can-t-acquire-the-reclaim-lock.patch create mode 100644 queue-5.13/btrfs-fix-deadlock-with-concurrent-chunk-allocations-involving-system-chunks.patch create mode 100644 queue-5.13/btrfs-properly-split-extent_map-for-req_op_zone_append.patch create mode 100644 queue-5.13/btrfs-rework-chunk-allocation-to-avoid-exhaustion-of-the-system-chunk-array.patch create mode 100644 queue-5.13/btrfs-zoned-fix-types-for-u64-division-in-btrfs_reclaim_bgs_work.patch create mode 100644 queue-5.13/btrfs-zoned-fix-wrong-mutex-unlock-on-failure-to-allocate-log-root-tree.patch create mode 100644 queue-5.13/cgroup-verify-that-source-is-a-string.patch create mode 100644 queue-5.13/drm-amdgpu-add-another-renoir-did.patch create mode 100644 queue-5.13/drm-i915-gt-fix-edeadlk-handling-regression.patch create mode 100644 
queue-5.13/drm-i915-gtt-drop-the-page-table-optimisation.patch create mode 100644 queue-5.13/edac-igen6-fix-core-dependency-again.patch create mode 100644 queue-5.13/fbmem-do-not-delete-the-mode-that-is-still-in-use.patch create mode 100644 queue-5.13/io_uring-use-right-task-for-exiting-checks.patch create mode 100644 queue-5.13/iommu-vt-d-fix-clearing-real-dma-device-s-scalable-mode-context-entries.patch create mode 100644 queue-5.13/iommu-vt-d-global-devtlb-flush-when-present-context-entry-changed.patch create mode 100644 queue-5.13/mm-hugetlb-fix-refs-calculation-from-unaligned-vaddr.patch create mode 100644 queue-5.13/scsi-core-fix-bad-pointer-dereference-when-ehandler-kthread-is-invalid.patch create mode 100644 queue-5.13/scsi-zfcp-report-port-fc_security-as-unknown-early-during-remote-cable-pull.patch create mode 100644 queue-5.13/tracing-do-not-reference-char-as-a-string-in-histograms.patch diff --git a/queue-5.13/arm64-avoid-premature-usercopy-failure.patch b/queue-5.13/arm64-avoid-premature-usercopy-failure.patch new file mode 100644 index 00000000000..5e7f1cf46a0 --- /dev/null +++ b/queue-5.13/arm64-avoid-premature-usercopy-failure.patch @@ -0,0 +1,199 @@ +From 295cf156231ca3f9e3a66bde7fab5e09c41835e0 Mon Sep 17 00:00:00 2001 +From: Robin Murphy +Date: Mon, 12 Jul 2021 15:27:46 +0100 +Subject: arm64: Avoid premature usercopy failure + +From: Robin Murphy + +commit 295cf156231ca3f9e3a66bde7fab5e09c41835e0 upstream. + +Al reminds us that the usercopy API must only return complete failure +if absolutely nothing could be copied. Currently, if userspace does +something silly like giving us an unaligned pointer to Device memory, +or a size which overruns MTE tag bounds, we may fail to honour that +requirement when faulting on a multi-byte access even though a smaller +access could have succeeded. + +Add a mitigation to the fixup routines to fall back to a single-byte +copy if we faulted on a larger access before anything has been written +to the destination, to guarantee making *some* forward progress. We +needn't be too concerned about the overall performance since this should +only occur when callers are doing something a bit dodgy in the first +place. Particularly broken userspace might still be able to trick +generic_perform_write() into an infinite loop by targeting write() at +an mmap() of some read-only device register where the fault-in load +succeeds but any store synchronously aborts such that copy_to_user() is +genuinely unable to make progress, but, well, don't do that... 
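[To make the usercopy contract concrete, here is a minimal user-space sketch. It is illustrative only, not kernel code: copy_with_progress(), access_fallible() and ok_bytes are invented for the demo, standing in for the assembly fast path, a faulting access and the fault boundary. It models the rule Al points out above: total failure may only be reported if not even the first byte could be copied.]

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    /* Demo-only fault model: any access touching bytes beyond ok_bytes of
       the source "faults" (standing in for a bad user address or an MTE
       tag bound). */
    static size_t ok_bytes;

    static int access_fallible(char *dst, const char *src, size_t off, size_t len)
    {
            if (off + len > ok_bytes)
                    return -1;              /* simulated fault */
            memcpy(dst + off, src + off, len);
            return 0;
    }

    /* Returns the number of bytes NOT copied, as copy_from_user() does. */
    static size_t copy_with_progress(char *dst, const char *src, size_t n)
    {
            size_t done = 0;

            /* Fast path: 8 bytes at a time; a fault aborts the whole access. */
            while (done < n) {
                    size_t step = (n - done < 8) ? n - done : 8;

                    if (access_fallible(dst, src, done, step))
                            break;
                    done += step;
            }
            /* The fixup this patch adds: if nothing was copied yet, try
               harder with a single byte, so callers looping on partial
               progress (e.g. generic_perform_write()) are never told that
               zero bytes were copied when the first byte was accessible. */
            if (done == 0 && n != 0 && access_fallible(dst, src, 0, 1) == 0)
                    done = 1;
            return n - done;
    }

    int main(void)
    {
            char src[16] = "usercopy demo!", dst[16] = { 0 };

            ok_bytes = 3;   /* only the first 3 source bytes are accessible */
            /* The 8-byte access faults, yet we still make progress: */
            printf("bytes not copied: %zu\n",
                   copy_with_progress(dst, src, sizeof(src)));
            return 0;
    }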
+ +CC: stable@vger.kernel.org +Reported-by: Chen Huang +Suggested-by: Al Viro +Reviewed-by: Catalin Marinas +Signed-off-by: Robin Murphy +Link: https://lore.kernel.org/r/dc03d5c675731a1f24a62417dba5429ad744234e.1626098433.git.robin.murphy@arm.com +Signed-off-by: Will Deacon +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/lib/copy_from_user.S | 13 ++++++++++--- + arch/arm64/lib/copy_in_user.S | 21 ++++++++++++++------- + arch/arm64/lib/copy_to_user.S | 14 +++++++++++--- + 3 files changed, 35 insertions(+), 13 deletions(-) + +--- a/arch/arm64/lib/copy_from_user.S ++++ b/arch/arm64/lib/copy_from_user.S +@@ -29,7 +29,7 @@ + .endm + + .macro ldrh1 reg, ptr, val +- user_ldst 9998f, ldtrh, \reg, \ptr, \val ++ user_ldst 9997f, ldtrh, \reg, \ptr, \val + .endm + + .macro strh1 reg, ptr, val +@@ -37,7 +37,7 @@ + .endm + + .macro ldr1 reg, ptr, val +- user_ldst 9998f, ldtr, \reg, \ptr, \val ++ user_ldst 9997f, ldtr, \reg, \ptr, \val + .endm + + .macro str1 reg, ptr, val +@@ -45,7 +45,7 @@ + .endm + + .macro ldp1 reg1, reg2, ptr, val +- user_ldp 9998f, \reg1, \reg2, \ptr, \val ++ user_ldp 9997f, \reg1, \reg2, \ptr, \val + .endm + + .macro stp1 reg1, reg2, ptr, val +@@ -53,8 +53,10 @@ + .endm + + end .req x5 ++srcin .req x15 + SYM_FUNC_START(__arch_copy_from_user) + add end, x0, x2 ++ mov srcin, x1 + #include "copy_template.S" + mov x0, #0 // Nothing to copy + ret +@@ -63,6 +65,11 @@ EXPORT_SYMBOL(__arch_copy_from_user) + + .section .fixup,"ax" + .align 2 ++9997: cmp dst, dstin ++ b.ne 9998f ++ // Before being absolutely sure we couldn't copy anything, try harder ++USER(9998f, ldtrb tmp1w, [srcin]) ++ strb tmp1w, [dst], #1 + 9998: sub x0, end, dst // bytes not copied + ret + .previous +--- a/arch/arm64/lib/copy_in_user.S ++++ b/arch/arm64/lib/copy_in_user.S +@@ -30,33 +30,34 @@ + .endm + + .macro ldrh1 reg, ptr, val +- user_ldst 9998f, ldtrh, \reg, \ptr, \val ++ user_ldst 9997f, ldtrh, \reg, \ptr, \val + .endm + + .macro strh1 reg, ptr, val +- user_ldst 9998f, sttrh, \reg, \ptr, \val ++ user_ldst 9997f, sttrh, \reg, \ptr, \val + .endm + + .macro ldr1 reg, ptr, val +- user_ldst 9998f, ldtr, \reg, \ptr, \val ++ user_ldst 9997f, ldtr, \reg, \ptr, \val + .endm + + .macro str1 reg, ptr, val +- user_ldst 9998f, sttr, \reg, \ptr, \val ++ user_ldst 9997f, sttr, \reg, \ptr, \val + .endm + + .macro ldp1 reg1, reg2, ptr, val +- user_ldp 9998f, \reg1, \reg2, \ptr, \val ++ user_ldp 9997f, \reg1, \reg2, \ptr, \val + .endm + + .macro stp1 reg1, reg2, ptr, val +- user_stp 9998f, \reg1, \reg2, \ptr, \val ++ user_stp 9997f, \reg1, \reg2, \ptr, \val + .endm + + end .req x5 +- ++srcin .req x15 + SYM_FUNC_START(__arch_copy_in_user) + add end, x0, x2 ++ mov srcin, x1 + #include "copy_template.S" + mov x0, #0 + ret +@@ -65,6 +66,12 @@ EXPORT_SYMBOL(__arch_copy_in_user) + + .section .fixup,"ax" + .align 2 ++9997: cmp dst, dstin ++ b.ne 9998f ++ // Before being absolutely sure we couldn't copy anything, try harder ++USER(9998f, ldtrb tmp1w, [srcin]) ++USER(9998f, sttrb tmp1w, [dst]) ++ add dst, dst, #1 + 9998: sub x0, end, dst // bytes not copied + ret + .previous +--- a/arch/arm64/lib/copy_to_user.S ++++ b/arch/arm64/lib/copy_to_user.S +@@ -32,7 +32,7 @@ + .endm + + .macro strh1 reg, ptr, val +- user_ldst 9998f, sttrh, \reg, \ptr, \val ++ user_ldst 9997f, sttrh, \reg, \ptr, \val + .endm + + .macro ldr1 reg, ptr, val +@@ -40,7 +40,7 @@ + .endm + + .macro str1 reg, ptr, val +- user_ldst 9998f, sttr, \reg, \ptr, \val ++ user_ldst 9997f, sttr, \reg, \ptr, \val + .endm + + .macro ldp1 reg1, reg2, ptr, val +@@ -48,12 +48,14 @@ 
+ .endm + + .macro stp1 reg1, reg2, ptr, val +- user_stp 9998f, \reg1, \reg2, \ptr, \val ++ user_stp 9997f, \reg1, \reg2, \ptr, \val + .endm + + end .req x5 ++srcin .req x15 + SYM_FUNC_START(__arch_copy_to_user) + add end, x0, x2 ++ mov srcin, x1 + #include "copy_template.S" + mov x0, #0 + ret +@@ -62,6 +64,12 @@ EXPORT_SYMBOL(__arch_copy_to_user) + + .section .fixup,"ax" + .align 2 ++9997: cmp dst, dstin ++ b.ne 9998f ++ // Before being absolutely sure we couldn't copy anything, try harder ++ ldrb tmp1w, [srcin] ++USER(9998f, sttrb tmp1w, [dst]) ++ add dst, dst, #1 + 9998: sub x0, end, dst // bytes not copied + ret + .previous diff --git a/queue-5.13/btrfs-don-t-block-if-we-can-t-acquire-the-reclaim-lock.patch b/queue-5.13/btrfs-don-t-block-if-we-can-t-acquire-the-reclaim-lock.patch new file mode 100644 index 00000000000..62f8bb25fda --- /dev/null +++ b/queue-5.13/btrfs-don-t-block-if-we-can-t-acquire-the-reclaim-lock.patch @@ -0,0 +1,44 @@ +From 9cc0b837e14ae913581ec1ea6e979a738f71b0fd Mon Sep 17 00:00:00 2001 +From: Johannes Thumshirn +Date: Tue, 6 Jul 2021 01:32:38 +0900 +Subject: btrfs: don't block if we can't acquire the reclaim lock + +From: Johannes Thumshirn + +commit 9cc0b837e14ae913581ec1ea6e979a738f71b0fd upstream. + +If we can't acquire the reclaim_bgs_lock on block group reclaim, we +block until it is free. This can potentially stall for a long time. + +While reclaim of block groups is necessary for a good user experience on +a zoned file system, there still is no need to block as it is best +effort only, just like when we're deleting unused block groups. + +CC: stable@vger.kernel.org # 5.13 +Signed-off-by: Johannes Thumshirn +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/block-group.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -1499,7 +1499,15 @@ void btrfs_reclaim_bgs_work(struct work_ + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) + return; + +- mutex_lock(&fs_info->reclaim_bgs_lock); ++ /* ++ * Long running balances can keep us blocked here for eternity, so ++ * simply skip reclaim if we're unable to get the mutex. ++ */ ++ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) { ++ btrfs_exclop_finish(fs_info); ++ return; ++ } ++ + spin_lock(&fs_info->unused_bgs_lock); + while (!list_empty(&fs_info->reclaim_bgs)) { + bg = list_first_entry(&fs_info->reclaim_bgs, diff --git a/queue-5.13/btrfs-fix-deadlock-with-concurrent-chunk-allocations-involving-system-chunks.patch b/queue-5.13/btrfs-fix-deadlock-with-concurrent-chunk-allocations-involving-system-chunks.patch new file mode 100644 index 00000000000..559851213e7 --- /dev/null +++ b/queue-5.13/btrfs-fix-deadlock-with-concurrent-chunk-allocations-involving-system-chunks.patch @@ -0,0 +1,212 @@ +From 1cb3db1cf383a3c7dbda1aa0ce748b0958759947 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 29 Jun 2021 14:43:05 +0100 +Subject: btrfs: fix deadlock with concurrent chunk allocations involving system chunks + +From: Filipe Manana + +commit 1cb3db1cf383a3c7dbda1aa0ce748b0958759947 upstream. + +When a task attempting to allocate a new chunk verifies that there is not +currently enough free space in the system space_info and there is another +task that allocated a new system chunk but it did not finish yet the +creation of the respective block group, it waits for that other task to +finish creating the block group. 
This is to avoid exhaustion of the system +chunk array in the superblock, which is limited, when we have a thundering +herd of tasks allocating new chunks. This problem was described and fixed +by commit eafa4fd0ad0607 ("btrfs: fix exhaustion of the system chunk array +due to concurrent allocations"). + +However there are two very similar scenarios where this can lead to a +deadlock: + +1) Task B allocated a new system chunk and task A is waiting on task B + to finish creation of the respective system block group. However before + task B ends its transaction handle and finishes the creation of the + system block group, it attempts to allocate another chunk (like a data + chunk for a fallocate operation for a very large range). Task B will + be unable to progress and allocate the new chunk, because task A set + space_info->chunk_alloc to 1 and therefore it loops at + btrfs_chunk_alloc() waiting for task A to finish its chunk allocation + and set space_info->chunk_alloc to 0, but task A is waiting on task B + to finish creation of the new system block group, therefore resulting + in a deadlock; + +2) Task B allocated a new system chunk and task A is waiting on task B to + finish creation of the respective system block group. By the time that + task B enters the final phase of block group allocation, which happens + at btrfs_create_pending_block_groups(), when it modifies the extent + tree, the device tree or the chunk tree to insert the items for some + new block group, it needs to allocate a new chunk, so it ends up at + btrfs_chunk_alloc() and keeps looping there because task A has set + space_info->chunk_alloc to 1, but task A is waiting for task B to + finish creation of the new system block group and release the reserved + system space, therefore resulting in a deadlock. + +In short, the problem arises if a task B needs to allocate a new chunk after +it previously allocated a new system chunk and another task A is +currently waiting for task B to complete the allocation of the new system +chunk. + +Unfortunately this deadlock scenario introduced by the previous fix for +the system chunk array exhaustion problem does not have a simple and short +fix, and requires a big change to rework the chunk allocation code so that +chunk btree updates are all made in the first phase of chunk allocation. +And since this deadlock regression is being frequently hit on zoned +filesystems and the system chunk array exhaustion problem is triggered +in more extreme cases (originally observed on PowerPC with a node size +of 64K when running the fallocate tests from stress-ng), revert the +changes from that commit. The next patch in the series, with the subject +"btrfs: rework chunk allocation to avoid exhaustion of the system +chunk array", makes the necessary changes to fix the system chunk array +exhaustion problem.
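[The circular wait is the same in both scenarios and can be reduced to a few lines. The following is a deliberately simplified user-space sketch, not btrfs code: alloc_in_progress and sys_bg_created are invented analogues of space_info->chunk_alloc and the pending system block group. Running it hangs forever, which is exactly the point.]

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int alloc_in_progress;  /* analogue of space_info->chunk_alloc */
    static atomic_int sys_bg_created;     /* analogue of the pending system block group */

    /* Task A: already inside chunk allocation (flag set in main below),
       waiting for task B to finish creating the system block group. */
    static void *task_a(void *arg)
    {
            (void)arg;
            while (!atomic_load(&sys_bg_created))
                    ;       /* B is stuck behind our flag: deadlock */
            atomic_store(&alloc_in_progress, 0);
            return NULL;
    }

    /* Task B: needs another chunk to finish its block group, so it loops
       until no allocation is in progress, as btrfs_chunk_alloc() did. */
    static void *task_b(void *arg)
    {
            (void)arg;
            while (atomic_load(&alloc_in_progress))
                    ;       /* never clears, because A waits on us */
            atomic_store(&sys_bg_created, 1);
            return NULL;
    }

    int main(void)
    {
            pthread_t a, b;

            atomic_store(&alloc_in_progress, 1);    /* A entered allocation first */
            pthread_create(&a, NULL, task_a, NULL);
            pthread_create(&b, NULL, task_b, NULL);
            pthread_join(a, NULL);                  /* never returns */
            pthread_join(b, NULL);
            puts("unreachable");
            return 0;
    }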
+ +Reported-by: Naohiro Aota +Link: https://lore.kernel.org/linux-btrfs/20210621015922.ewgbffxuawia7liz@naota-xeon/ +Fixes: eafa4fd0ad0607 ("btrfs: fix exhaustion of the system chunk array due to concurrent allocations") +CC: stable@vger.kernel.org # 5.12+ +Tested-by: Shin'ichiro Kawasaki +Tested-by: Naohiro Aota +Signed-off-by: Filipe Manana +Tested-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/block-group.c | 58 ------------------------------------------------- + fs/btrfs/transaction.c | 5 ---- + fs/btrfs/transaction.h | 7 ----- + 3 files changed, 1 insertion(+), 69 deletions(-) + +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -3364,7 +3364,6 @@ static u64 get_profile_num_devs(struct b + */ + void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) + { +- struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_space_info *info; + u64 left; +@@ -3379,7 +3378,6 @@ void check_system_chunk(struct btrfs_tra + lockdep_assert_held(&fs_info->chunk_mutex); + + info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); +-again: + spin_lock(&info->lock); + left = info->total_bytes - btrfs_space_info_used(info, true); + spin_unlock(&info->lock); +@@ -3398,58 +3396,6 @@ again: + + if (left < thresh) { + u64 flags = btrfs_system_alloc_profile(fs_info); +- u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved); +- +- /* +- * If there's not available space for the chunk tree (system +- * space) and there are other tasks that reserved space for +- * creating a new system block group, wait for them to complete +- * the creation of their system block group and release excess +- * reserved space. We do this because: +- * +- * *) We can end up allocating more system chunks than necessary +- * when there are multiple tasks that are concurrently +- * allocating block groups, which can lead to exhaustion of +- * the system array in the superblock; +- * +- * *) If we allocate extra and unnecessary system block groups, +- * despite being empty for a long time, and possibly forever, +- * they end not being added to the list of unused block groups +- * because that typically happens only when deallocating the +- * last extent from a block group - which never happens since +- * we never allocate from them in the first place. The few +- * exceptions are when mounting a filesystem or running scrub, +- * which add unused block groups to the list of unused block +- * groups, to be deleted by the cleaner kthread. +- * And even when they are added to the list of unused block +- * groups, it can take a long time until they get deleted, +- * since the cleaner kthread might be sleeping or busy with +- * other work (deleting subvolumes, running delayed iputs, +- * defrag scheduling, etc); +- * +- * This is rare in practice, but can happen when too many tasks +- * are allocating blocks groups in parallel (via fallocate()) +- * and before the one that reserved space for a new system block +- * group finishes the block group creation and releases the space +- * reserved in excess (at btrfs_create_pending_block_groups()), +- * other tasks end up here and see free system space temporarily +- * not enough for updating the chunk tree. +- * +- * We unlock the chunk mutex before waiting for such tasks and +- * lock it again after the wait, otherwise we would deadlock. 
+- * It is safe to do so because allocating a system chunk is the +- * first thing done while allocating a new block group. +- */ +- if (reserved > trans->chunk_bytes_reserved) { +- const u64 min_needed = reserved - thresh; +- +- mutex_unlock(&fs_info->chunk_mutex); +- wait_event(cur_trans->chunk_reserve_wait, +- atomic64_read(&cur_trans->chunk_bytes_reserved) <= +- min_needed); +- mutex_lock(&fs_info->chunk_mutex); +- goto again; +- } + + /* + * Ignore failure to create system chunk. We might end up not +@@ -3464,10 +3410,8 @@ again: + ret = btrfs_block_rsv_add(fs_info->chunk_root, + &fs_info->chunk_block_rsv, + thresh, BTRFS_RESERVE_NO_FLUSH); +- if (!ret) { +- atomic64_add(thresh, &cur_trans->chunk_bytes_reserved); ++ if (!ret) + trans->chunk_bytes_reserved += thresh; +- } + } + } + +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -260,7 +260,6 @@ static inline int extwriter_counter_read + void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) + { + struct btrfs_fs_info *fs_info = trans->fs_info; +- struct btrfs_transaction *cur_trans = trans->transaction; + + if (!trans->chunk_bytes_reserved) + return; +@@ -269,8 +268,6 @@ void btrfs_trans_release_chunk_metadata( + + btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, + trans->chunk_bytes_reserved, NULL); +- atomic64_sub(trans->chunk_bytes_reserved, &cur_trans->chunk_bytes_reserved); +- cond_wake_up(&cur_trans->chunk_reserve_wait); + trans->chunk_bytes_reserved = 0; + } + +@@ -386,8 +383,6 @@ loop: + spin_lock_init(&cur_trans->dropped_roots_lock); + INIT_LIST_HEAD(&cur_trans->releasing_ebs); + spin_lock_init(&cur_trans->releasing_ebs_lock); +- atomic64_set(&cur_trans->chunk_bytes_reserved, 0); +- init_waitqueue_head(&cur_trans->chunk_reserve_wait); + list_add_tail(&cur_trans->list, &fs_info->trans_list); + extent_io_tree_init(fs_info, &cur_trans->dirty_pages, + IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode); +--- a/fs/btrfs/transaction.h ++++ b/fs/btrfs/transaction.h +@@ -96,13 +96,6 @@ struct btrfs_transaction { + + spinlock_t releasing_ebs_lock; + struct list_head releasing_ebs; +- +- /* +- * The number of bytes currently reserved, by all transaction handles +- * attached to this transaction, for metadata extents of the chunk tree. +- */ +- atomic64_t chunk_bytes_reserved; +- wait_queue_head_t chunk_reserve_wait; + }; + + #define __TRANS_FREEZABLE (1U << 0) diff --git a/queue-5.13/btrfs-properly-split-extent_map-for-req_op_zone_append.patch b/queue-5.13/btrfs-properly-split-extent_map-for-req_op_zone_append.patch new file mode 100644 index 00000000000..818d59e24d8 --- /dev/null +++ b/queue-5.13/btrfs-properly-split-extent_map-for-req_op_zone_append.patch @@ -0,0 +1,248 @@ +From abb99cfdaf0759f8a619e5fecf52ccccdf310c8c Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Mon, 28 Jun 2021 17:57:28 +0900 +Subject: btrfs: properly split extent_map for REQ_OP_ZONE_APPEND + +From: Naohiro Aota + +commit abb99cfdaf0759f8a619e5fecf52ccccdf310c8c upstream. + +Damien reported a test failure with btrfs/209. The test itself ran fine, +but the fsck ran afterwards reported a corrupted filesystem. + +The filesystem corruption happens because we're splitting an extent and +then writing the extent twice. We have to split the extent though, because +we're creating too large extents for a REQ_OP_ZONE_APPEND operation. + +When dumping the extent tree, we can see two EXTENT_ITEMs at the same +start address but different lengths. + +$ btrfs inspect dump-tree /dev/nullb1 -t extent +... 
+ item 19 key (269484032 EXTENT_ITEM 126976) itemoff 15470 itemsize 53 + refs 1 gen 7 flags DATA + extent data backref root FS_TREE objectid 257 offset 786432 count 1 + item 20 key (269484032 EXTENT_ITEM 262144) itemoff 15417 itemsize 53 + refs 1 gen 7 flags DATA + extent data backref root FS_TREE objectid 257 offset 786432 count 1 + +The duplicated EXTENT_ITEMs originally come from a wrongly split extent_map in +extract_ordered_extent(). Since extract_ordered_extent() uses +create_io_em() to split an existing extent_map, we will have +split->orig_start != split->start. Then, it will be logged with a non-zero +"extent data offset". Finally, the logged entries are replayed into +a duplicated EXTENT_ITEM. + +Introduce and use a proper splitting function for extent_map. The function is +intended to be simple and specific to the usage in extract_ordered_extent(), +e.g. it does not support the compression case (we do not allow splitting +compressed extent_map anyway). + +There was a question raised by Qu, in summary, why we want to split the +extent map (and not the bio): + +The problem is not the limit on the zone end, which as you mention is +the same as the block group end. The problem is that data writes use zone +append (ZA) operations. ZA BIOs cannot be split, so a large extent may +need to be processed with multiple ZA BIOs. While that is also true for +regular writes, the major difference is that ZA operations are "nameless" +writes giving back the written sectors on completion. And ZA +operations may be reordered by the block layer (not intentionally +though). Combine both of these characteristics and you can see that the +data for a large extent may end up being shuffled when written, resulting +in data corruption and making it impossible to map the extent to some start +sector. + +To avoid this problem, zoned btrfs uses the principle "one data extent +== one ZA BIO". So large extents need to be split. This is unfortunate, +but we can revisit this later and optimize, e.g. merge back together the +fragments of an extent once written if they actually were written +sequentially in the zone. + +Reported-by: Damien Le Moal +Fixes: d22002fd37bd ("btrfs: zoned: split ordered extent when bio is sent") +CC: stable@vger.kernel.org # 5.12+ +CC: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/inode.c | 147 ++++++++++++++++++++++++++++++++++++++++++++----------- + 1 file changed, 118 insertions(+), 29 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -2260,13 +2260,127 @@ bool btrfs_bio_fits_in_ordered_extent(st + return ret; + } + ++/* ++ * Split an extent_map at [start, start + len] ++ * ++ * This function is intended to be used only for extract_ordered_extent().
++ */ ++static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, ++ u64 pre, u64 post) ++{ ++ struct extent_map_tree *em_tree = &inode->extent_tree; ++ struct extent_map *em; ++ struct extent_map *split_pre = NULL; ++ struct extent_map *split_mid = NULL; ++ struct extent_map *split_post = NULL; ++ int ret = 0; ++ int modified; ++ unsigned long flags; ++ ++ /* Sanity check */ ++ if (pre == 0 && post == 0) ++ return 0; ++ ++ split_pre = alloc_extent_map(); ++ if (pre) ++ split_mid = alloc_extent_map(); ++ if (post) ++ split_post = alloc_extent_map(); ++ if (!split_pre || (pre && !split_mid) || (post && !split_post)) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ ASSERT(pre + post < len); ++ ++ lock_extent(&inode->io_tree, start, start + len - 1); ++ write_lock(&em_tree->lock); ++ em = lookup_extent_mapping(em_tree, start, len); ++ if (!em) { ++ ret = -EIO; ++ goto out_unlock; ++ } ++ ++ ASSERT(em->len == len); ++ ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); ++ ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); ++ ++ flags = em->flags; ++ clear_bit(EXTENT_FLAG_PINNED, &em->flags); ++ clear_bit(EXTENT_FLAG_LOGGING, &flags); ++ modified = !list_empty(&em->list); ++ ++ /* First, replace the em with a new extent_map starting from * em->start */ ++ split_pre->start = em->start; ++ split_pre->len = (pre ? pre : em->len - post); ++ split_pre->orig_start = split_pre->start; ++ split_pre->block_start = em->block_start; ++ split_pre->block_len = split_pre->len; ++ split_pre->orig_block_len = split_pre->block_len; ++ split_pre->ram_bytes = split_pre->len; ++ split_pre->flags = flags; ++ split_pre->compress_type = em->compress_type; ++ split_pre->generation = em->generation; ++ ++ replace_extent_mapping(em_tree, em, split_pre, modified); ++ ++ /* ++ * Now we only have an extent_map at: ++ * [em->start, em->start + pre] if pre != 0 ++ * [em->start, em->start + em->len - post] if pre == 0 ++ */ ++ ++ if (pre) { ++ /* Insert the middle extent_map */ ++ split_mid->start = em->start + pre; ++ split_mid->len = em->len - pre - post; ++ split_mid->orig_start = split_mid->start; ++ split_mid->block_start = em->block_start + pre; ++ split_mid->block_len = split_mid->len; ++ split_mid->orig_block_len = split_mid->block_len; ++ split_mid->ram_bytes = split_mid->len; ++ split_mid->flags = flags; ++ split_mid->compress_type = em->compress_type; ++ split_mid->generation = em->generation; ++ add_extent_mapping(em_tree, split_mid, modified); ++ } ++ ++ if (post) { ++ split_post->start = em->start + em->len - post; ++ split_post->len = post; ++ split_post->orig_start = split_post->start; ++ split_post->block_start = em->block_start + em->len - post; ++ split_post->block_len = split_post->len; ++ split_post->orig_block_len = split_post->block_len; ++ split_post->ram_bytes = split_post->len; ++ split_post->flags = flags; ++ split_post->compress_type = em->compress_type; ++ split_post->generation = em->generation; ++ add_extent_mapping(em_tree, split_post, modified); ++ } ++ ++ /* Once for us */ ++ free_extent_map(em); ++ /* Once for the tree */ ++ free_extent_map(em); ++ ++out_unlock: ++ write_unlock(&em_tree->lock); ++ unlock_extent(&inode->io_tree, start, start + len - 1); ++out: ++ free_extent_map(split_pre); ++ free_extent_map(split_mid); ++ free_extent_map(split_post); ++ ++ return ret; ++} ++ + static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, + struct bio *bio, loff_t file_offset) + { + struct btrfs_ordered_extent *ordered; +- struct extent_map *em = NULL, *em_new = NULL; +- 
struct extent_map_tree *em_tree = &inode->extent_tree; + u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; ++ u64 file_len; + u64 len = bio->bi_iter.bi_size; + u64 end = start + len; + u64 ordered_end; +@@ -2306,41 +2420,16 @@ static blk_status_t extract_ordered_exte + goto out; + } + ++ file_len = ordered->num_bytes; + pre = start - ordered->disk_bytenr; + post = ordered_end - end; + + ret = btrfs_split_ordered_extent(ordered, pre, post); + if (ret) + goto out; +- +- read_lock(&em_tree->lock); +- em = lookup_extent_mapping(em_tree, ordered->file_offset, len); +- if (!em) { +- read_unlock(&em_tree->lock); +- ret = -EIO; +- goto out; +- } +- read_unlock(&em_tree->lock); +- +- ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); +- /* +- * We cannot reuse em_new here but have to create a new one, as +- * unpin_extent_cache() expects the start of the extent map to be the +- * logical offset of the file, which does not hold true anymore after +- * splitting. +- */ +- em_new = create_io_em(inode, em->start + pre, len, +- em->start + pre, em->block_start + pre, len, +- len, len, BTRFS_COMPRESS_NONE, +- BTRFS_ORDERED_REGULAR); +- if (IS_ERR(em_new)) { +- ret = PTR_ERR(em_new); +- goto out; +- } +- free_extent_map(em_new); ++ ret = split_zoned_em(inode, file_offset, file_len, pre, post); + + out: +- free_extent_map(em); + btrfs_put_ordered_extent(ordered); + + return errno_to_blk_status(ret); diff --git a/queue-5.13/btrfs-rework-chunk-allocation-to-avoid-exhaustion-of-the-system-chunk-array.patch b/queue-5.13/btrfs-rework-chunk-allocation-to-avoid-exhaustion-of-the-system-chunk-array.patch new file mode 100644 index 00000000000..4d3e35e1454 --- /dev/null +++ b/queue-5.13/btrfs-rework-chunk-allocation-to-avoid-exhaustion-of-the-system-chunk-array.patch @@ -0,0 +1,1280 @@ +From 79bd37120b149532af5b21953643ed74af69654f Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 29 Jun 2021 14:43:06 +0100 +Subject: btrfs: rework chunk allocation to avoid exhaustion of the system chunk array + +From: Filipe Manana + +commit 79bd37120b149532af5b21953643ed74af69654f upstream. + +Commit eafa4fd0ad0607 ("btrfs: fix exhaustion of the system chunk array +due to concurrent allocations") fixed a problem that resulted in +exhausting the system chunk array in the superblock when there are many +tasks allocating chunks in parallel. Basically too many tasks enter the +first phase of chunk allocation without previous tasks having finished +their second phase of allocation, resulting in too many system chunks +being allocated. That was originally observed when running the fallocate +tests of stress-ng on a PowerPC machine, using a node size of 64K. + +However that commit also introduced a deadlock where a task in phase 1 of +the chunk allocation waited for another task that had allocated a system +chunk to finish its phase 2, but that other task was waiting on an extent +buffer lock held by the first task, therefore resulting in both tasks not +making any progress. That change was later reverted by a patch with the +subject "btrfs: fix deadlock with concurrent chunk allocations involving +system chunks", since there is no simple and short solution to address it +and the deadlock is relatively easy to trigger on zoned filesystems, while +the system chunk array exhaustion is not so common. + +This change reworks the chunk allocation to avoid the system chunk array +exhaustion. 
It accomplishes that by making the first phase of chunk +allocation do the updates of the device items in the chunk btree and the +insertion of the new chunk item in the chunk btree. This is done while +under the protection of the chunk mutex (fs_info->chunk_mutex), in the +same critical section that checks for available system space, allocates +a new system chunk if needed and reserves system chunk space. This way +we do not have chunk space reserved until the second phase completes. + +The same logic is applied to chunk removal as well, since it keeps +reserved system space long after it is done updating the chunk btree. + +For direct allocation of system chunks, the previous behaviour remains, +because otherwise we would deadlock on extent buffers of the chunk btree. +Changes to the chunk btree are by and large done by chunk allocation and +chunk removal, which first reserve chunk system space and then later do +changes to the chunk btree. The other remaining cases are uncommon and +correspond to adding a device, removing a device and resizing a device. +All these other cases do not pre-reserve system space; they modify the +chunk btree right away, so they don't hold reserved space for a long period +like chunk allocation and chunk removal do. + +The diff of this change is huge, but more than half of it is just addition +of comments describing both how things work regarding chunk allocation and +removal, including both the new behavior and the parts of the old behavior +that did not change. + +CC: stable@vger.kernel.org # 5.12+ +Tested-by: Shin'ichiro Kawasaki +Tested-by: Naohiro Aota +Signed-off-by: Filipe Manana +Tested-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/block-group.c | 285 ++++++++++++++++++++++++++++++++++----- + fs/btrfs/block-group.h | 6 + fs/btrfs/ctree.c | 67 +-------- + fs/btrfs/transaction.c | 10 - + fs/btrfs/transaction.h | 2 + fs/btrfs/volumes.c | 355 +++++++++++++++++++++++++++++++++++++------------ + fs/btrfs/volumes.h | 5 + 7 files changed, 546 insertions(+), 184 deletions(-) + +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -2192,6 +2192,13 @@ error: + return ret; + } + ++/* ++ * This function, insert_block_group_item(), belongs to the phase 2 of chunk ++ * allocation. ++ * ++ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation ++ * phases.
++ */ + void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) + { + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group *block_group; + int ret = 0; + +- if (!trans->can_flush_pending_bgs) +- return; +- + while (!list_empty(&trans->new_bgs)) { + int index; + +@@ -2237,6 +2248,13 @@ void btrfs_create_pending_block_groups(s + ret = insert_block_group_item(trans, block_group); + if (ret) + btrfs_abort_transaction(trans, ret); ++ if (!block_group->chunk_item_inserted) { ++ mutex_lock(&fs_info->chunk_mutex); ++ ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group); ++ mutex_unlock(&fs_info->chunk_mutex); ++ if (ret) ++ btrfs_abort_transaction(trans, ret); ++ } + ret = btrfs_finish_chunk_alloc(trans, block_group->start, + block_group->length); + if (ret) +@@ -2260,8 +2278,9 @@ next: + btrfs_trans_release_chunk_metadata(trans); + } + +-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, +- u64 type, u64 chunk_offset, u64 size) ++struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, ++ u64 bytes_used, u64 type, ++ u64 chunk_offset, u64 size) + { + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group *cache; +@@ -2271,7 +2290,7 @@ int btrfs_make_block_group(struct btrfs_ + + cache = btrfs_create_block_group_cache(fs_info, chunk_offset); + if (!cache) +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + + cache->length = size; + set_free_space_tree_thresholds(cache); +@@ -2285,7 +2304,7 @@ int btrfs_make_block_group(struct btrfs_ + ret = btrfs_load_block_group_zone_info(cache, true); + if (ret) { + btrfs_put_block_group(cache); +- return ret; ++ return ERR_PTR(ret); + } + + ret = exclude_super_stripes(cache); +@@ -2293,7 +2312,7 @@ int btrfs_make_block_group(struct btrfs_ + /* We may have excluded something, so call this just in case */ + btrfs_free_excluded_extents(cache); + btrfs_put_block_group(cache); +- return ret; ++ return ERR_PTR(ret); + } + + add_new_free_space(cache, chunk_offset, chunk_offset + size); +@@ -2320,7 +2339,7 @@ int btrfs_make_block_group(struct btrfs_ + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); +- return ret; ++ return ERR_PTR(ret); + } + + /* +@@ -2339,7 +2358,7 @@ int btrfs_make_block_group(struct btrfs_ + btrfs_update_delayed_refs_rsv(trans); + + set_avail_alloc_bits(fs_info, type); +- return 0; ++ return cache; + } + + /* +@@ -3219,11 +3238,203 @@ int btrfs_force_chunk_alloc(struct btrfs + return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + } + ++static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) ++{ ++ struct btrfs_block_group *bg; ++ int ret; ++ ++ /* ++ * Check if we have enough space in the system space info because we ++ * will need to update device items in the chunk btree and insert a new ++ * chunk item in the chunk btree as well. This will allocate a new ++ * system block group if needed. ++ */ ++ check_system_chunk(trans, flags); ++ ++ bg = btrfs_alloc_chunk(trans, flags); ++ if (IS_ERR(bg)) { ++ ret = PTR_ERR(bg); ++ goto out; ++ } ++ ++ /* ++ * If this is a system chunk allocation then stop right here and do not ++ * add the chunk item to the chunk btree. 
This is to prevent a deadlock ++ * because this system chunk allocation can be triggered while COWing ++ * some extent buffer of the chunk btree and while holding a lock on a ++ * parent extent buffer, in which case attempting to insert the chunk ++ * item (or update the device item) would result in a deadlock on that ++ * parent extent buffer. In this case defer the chunk btree updates to ++ * the second phase of chunk allocation and keep our reservation until ++ * the second phase completes. ++ * ++ * This is a rare case and can only be triggered by the very few cases ++ * we have where we need to touch the chunk btree outside chunk allocation ++ * and chunk removal. These cases are basically adding a device, removing ++ * a device or resizing a device. ++ */ ++ if (flags & BTRFS_BLOCK_GROUP_SYSTEM) ++ return 0; ++ ++ ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); ++ /* ++ * Normally we are not expected to fail with -ENOSPC here, since we have ++ * previously reserved space in the system space_info and allocated one ++ * new system chunk if necessary. However there are two exceptions: ++ * ++ * 1) We may have enough free space in the system space_info but all the ++ * existing system block groups have a profile which can not be used ++ * for extent allocation. ++ * ++ * This happens when mounting in degraded mode. For example we have a ++ * RAID1 filesystem with 2 devices, lose one device and mount the fs ++ * using the other device in degraded mode. If we then allocate a chunk, ++ * we may have enough free space in the existing system space_info, but ++ * none of the block groups can be used for extent allocation since they ++ * have a RAID1 profile, and because we are in degraded mode with a ++ * single device, we are forced to allocate a new system chunk with a ++ * SINGLE profile. Making check_system_chunk() iterate over all system ++ * block groups and check if they have a usable profile and enough space ++ * can be slow on very large filesystems, so we tolerate the -ENOSPC and ++ * try again after forcing allocation of a new system chunk. Like this ++ * we avoid paying the cost of that search in normal circumstances, when ++ * we were not mounted in degraded mode; ++ * ++ * 2) We had enough free space info the system space_info, and one suitable ++ * block group to allocate from when we called check_system_chunk() ++ * above. However right after we called it, the only system block group ++ * with enough free space got turned into RO mode by a running scrub, ++ * and in this case we have to allocate a new one and retry. We only ++ * need do this allocate and retry once, since we have a transaction ++ * handle and scrub uses the commit root to search for block groups. 
++ */ ++ if (ret == -ENOSPC) { ++ const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info); ++ struct btrfs_block_group *sys_bg; ++ ++ sys_bg = btrfs_alloc_chunk(trans, sys_flags); ++ if (IS_ERR(sys_bg)) { ++ ret = PTR_ERR(sys_bg); ++ btrfs_abort_transaction(trans, ret); ++ goto out; ++ } ++ ++ ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); ++ if (ret) { ++ btrfs_abort_transaction(trans, ret); ++ goto out; ++ } ++ ++ ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); ++ if (ret) { ++ btrfs_abort_transaction(trans, ret); ++ goto out; ++ } ++ } else if (ret) { ++ btrfs_abort_transaction(trans, ret); ++ goto out; ++ } ++out: ++ btrfs_trans_release_chunk_metadata(trans); ++ ++ return ret; ++} ++ + /* +- * If force is CHUNK_ALLOC_FORCE: ++ * Chunk allocation is done in 2 phases: ++ * ++ * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for ++ * the chunk, the chunk mapping, create its block group and add the items ++ * that belong in the chunk btree to it - more specifically, we need to ++ * update device items in the chunk btree and add a new chunk item to it. ++ * ++ * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block ++ * group item to the extent btree and the device extent items to the devices ++ * btree. ++ * ++ * This is done to prevent deadlocks. For example when COWing a node from the ++ * extent btree we are holding a write lock on the node's parent and if we ++ * trigger chunk allocation and attempted to insert the new block group item ++ * in the extent btree right way, we could deadlock because the path for the ++ * insertion can include that parent node. At first glance it seems impossible ++ * to trigger chunk allocation after starting a transaction since tasks should ++ * reserve enough transaction units (metadata space), however while that is true ++ * most of the time, chunk allocation may still be triggered for several reasons: ++ * ++ * 1) When reserving metadata, we check if there is enough free space in the ++ * metadata space_info and therefore don't trigger allocation of a new chunk. ++ * However later when the task actually tries to COW an extent buffer from ++ * the extent btree or from the device btree for example, it is forced to ++ * allocate a new block group (chunk) because the only one that had enough ++ * free space was just turned to RO mode by a running scrub for example (or ++ * device replace, block group reclaim thread, etc), so we can not use it ++ * for allocating an extent and end up being forced to allocate a new one; ++ * ++ * 2) Because we only check that the metadata space_info has enough free bytes, ++ * we end up not allocating a new metadata chunk in that case. However if ++ * the filesystem was mounted in degraded mode, none of the existing block ++ * groups might be suitable for extent allocation due to their incompatible ++ * profile (for e.g. mounting a 2 devices filesystem, where all block groups ++ * use a RAID1 profile, in degraded mode using a single device). 
In this case ++ * when the task attempts to COW some extent buffer of the extent btree for ++ * example, it will trigger allocation of a new metadata block group with a ++ * suitable profile (SINGLE profile in the example of the degraded mount of ++ * the RAID1 filesystem); ++ * ++ * 3) The task has reserved enough transaction units / metadata space, but when ++ * it attempts to COW an extent buffer from the extent or device btree for ++ * example, it does not find any free extent in any metadata block group, ++ * therefore forced to try to allocate a new metadata block group. ++ * This is because some other task allocated all available extents in the ++ * meanwhile - this typically happens with tasks that don't reserve space ++ * properly, either intentionally or as a bug. One example where this is ++ * done intentionally is fsync, as it does not reserve any transaction units ++ * and ends up allocating a variable number of metadata extents for log ++ * tree extent buffers. ++ * ++ * We also need this 2 phases setup when adding a device to a filesystem with ++ * a seed device - we must create new metadata and system chunks without adding ++ * any of the block group items to the chunk, extent and device btrees. If we ++ * did not do it this way, we would get ENOSPC when attempting to update those ++ * btrees, since all the chunks from the seed device are read-only. ++ * ++ * Phase 1 does the updates and insertions to the chunk btree because if we had ++ * it done in phase 2 and have a thundering herd of tasks allocating chunks in ++ * parallel, we risk having too many system chunks allocated by many tasks if ++ * many tasks reach phase 1 without the previous ones completing phase 2. In the ++ * extreme case this leads to exhaustion of the system chunk array in the ++ * superblock. This is easier to trigger if using a btree node/leaf size of 64K ++ * and with RAID filesystems (so we have more device items in the chunk btree). ++ * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of ++ * the system chunk array due to concurrent allocations") provides more details. ++ * ++ * For allocation of system chunks, we defer the updates and insertions into the ++ * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because ++ * if the chunk allocation is triggered while COWing an extent buffer of the ++ * chunk btree, we are holding a lock on the parent of that extent buffer and ++ * doing the chunk btree updates and insertions can require locking that parent. ++ * This is for the very few and rare cases where we update the chunk btree that ++ * are not chunk allocation or chunk removal: adding a device, removing a device ++ * or resizing a device. ++ * ++ * The reservation of system space, done through check_system_chunk(), as well ++ * as all the updates and insertions into the chunk btree must be done while ++ * holding fs_info->chunk_mutex. This is important to guarantee that while COWing ++ * an extent buffer from the chunks btree we never trigger allocation of a new ++ * system chunk, which would result in a deadlock (trying to lock twice an ++ * extent buffer of the chunk btree, first time before triggering the chunk ++ * allocation and the second time during chunk allocation while attempting to ++ * update the chunks btree). The system chunk array is also updated while holding ++ * that mutex. 
The same logic applies to removing chunks - we must reserve system ++ * space, update the chunk btree and the system chunk array in the superblock ++ * while holding fs_info->chunk_mutex. ++ * ++ * This function, btrfs_chunk_alloc(), belongs to phase 1. ++ * ++ * If @force is CHUNK_ALLOC_FORCE: + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. +- * If force is NOT CHUNK_ALLOC_FORCE: ++ * If @force is NOT CHUNK_ALLOC_FORCE: + * - return 0 if it doesn't need to allocate a new chunk, + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. +@@ -3240,6 +3451,13 @@ int btrfs_chunk_alloc(struct btrfs_trans + /* Don't re-enter if we're already allocating a chunk */ + if (trans->allocating_chunk) + return -ENOSPC; ++ /* ++ * If we are removing a chunk, don't re-enter or we would deadlock. ++ * System space reservation and system chunk allocation is done by the ++ * chunk remove operation (btrfs_remove_chunk()). ++ */ ++ if (trans->removing_chunk) ++ return -ENOSPC; + + space_info = btrfs_find_space_info(fs_info, flags); + ASSERT(space_info); +@@ -3303,13 +3521,7 @@ int btrfs_chunk_alloc(struct btrfs_trans + force_metadata_allocation(fs_info); + } + +- /* +- * Check if we have enough space in SYSTEM chunk because we may need +- * to update devices. +- */ +- check_system_chunk(trans, flags); +- +- ret = btrfs_alloc_chunk(trans, flags); ++ ret = do_chunk_alloc(trans, flags); + trans->allocating_chunk = false; + + spin_lock(&space_info->lock); +@@ -3328,22 +3540,6 @@ out: + space_info->chunk_alloc = 0; + spin_unlock(&space_info->lock); + mutex_unlock(&fs_info->chunk_mutex); +- /* +- * When we allocate a new chunk we reserve space in the chunk block +- * reserve to make sure we can COW nodes/leafs in the chunk tree or +- * add new nodes/leafs to it if we end up needing to do it when +- * inserting the chunk item and updating device items as part of the +- * second phase of chunk allocation, performed by +- * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a +- * large number of new block groups to create in our transaction +- * handle's new_bgs list to avoid exhausting the chunk block reserve +- * in extreme cases - like having a single transaction create many new +- * block groups when starting to write out the free space caches of all +- * the block groups that were made dirty during the lifetime of the +- * transaction. +- */ +- if (trans->chunk_bytes_reserved >= (u64)SZ_2M) +- btrfs_create_pending_block_groups(trans); + + return ret; + } +@@ -3396,14 +3592,31 @@ void check_system_chunk(struct btrfs_tra + + if (left < thresh) { + u64 flags = btrfs_system_alloc_profile(fs_info); ++ struct btrfs_block_group *bg; + + /* + * Ignore failure to create system chunk. We might end up not + * needing it, as we might not need to COW all nodes/leafs from + * the paths we visit in the chunk tree (they were already COWed + * or created in the current transaction for example). ++ * ++ * Also, if our caller is allocating a system chunk, do not ++ * attempt to insert the chunk item in the chunk btree, as we ++ * could deadlock on an extent buffer since our caller may be ++ * COWing an extent buffer from the chunk btree. 
+ */ +- ret = btrfs_alloc_chunk(trans, flags); ++ bg = btrfs_alloc_chunk(trans, flags); ++ if (IS_ERR(bg)) { ++ ret = PTR_ERR(bg); ++ } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) { ++ /* ++ * If we fail to add the chunk item here, we end up ++ * trying again at phase 2 of chunk allocation, at ++ * btrfs_create_pending_block_groups(). So ignore ++ * any error here. ++ */ ++ btrfs_chunk_alloc_add_chunk_item(trans, bg); ++ } + } + + if (!ret) { +--- a/fs/btrfs/block-group.h ++++ b/fs/btrfs/block-group.h +@@ -97,6 +97,7 @@ struct btrfs_block_group { + unsigned int removed:1; + unsigned int to_copy:1; + unsigned int relocating_repair:1; ++ unsigned int chunk_item_inserted:1; + + int disk_cache_state; + +@@ -268,8 +269,9 @@ void btrfs_reclaim_bgs_work(struct work_ + void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); + void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg); + int btrfs_read_block_groups(struct btrfs_fs_info *info); +-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, +- u64 type, u64 chunk_offset, u64 size); ++struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, ++ u64 bytes_used, u64 type, ++ u64 chunk_offset, u64 size); + void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); + int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, + bool do_chunk_alloc); +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -364,49 +364,6 @@ static noinline int update_ref_for_cow(s + return 0; + } + +-static struct extent_buffer *alloc_tree_block_no_bg_flush( +- struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- u64 parent_start, +- const struct btrfs_disk_key *disk_key, +- int level, +- u64 hint, +- u64 empty_size, +- enum btrfs_lock_nesting nest) +-{ +- struct btrfs_fs_info *fs_info = root->fs_info; +- struct extent_buffer *ret; +- +- /* +- * If we are COWing a node/leaf from the extent, chunk, device or free +- * space trees, make sure that we do not finish block group creation of +- * pending block groups. We do this to avoid a deadlock. +- * COWing can result in allocation of a new chunk, and flushing pending +- * block groups (btrfs_create_pending_block_groups()) can be triggered +- * when finishing allocation of a new chunk. Creation of a pending block +- * group modifies the extent, chunk, device and free space trees, +- * therefore we could deadlock with ourselves since we are holding a +- * lock on an extent buffer that btrfs_create_pending_block_groups() may +- * try to COW later. +- * For similar reasons, we also need to delay flushing pending block +- * groups when splitting a leaf or node, from one of those trees, since +- * we are holding a write lock on it and its parent or when inserting a +- * new root node for one of those trees. +- */ +- if (root == fs_info->extent_root || +- root == fs_info->chunk_root || +- root == fs_info->dev_root || +- root == fs_info->free_space_root) +- trans->can_flush_pending_bgs = false; +- +- ret = btrfs_alloc_tree_block(trans, root, parent_start, +- root->root_key.objectid, disk_key, level, +- hint, empty_size, nest); +- trans->can_flush_pending_bgs = true; +- +- return ret; +-} +- + /* + * does the dirty work in cow of a single block. The parent block (if + * supplied) is updated to point to the new cow copy. 
The new buffer is marked +@@ -455,8 +412,9 @@ static noinline int __btrfs_cow_block(st + if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent) + parent_start = parent->start; + +- cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key, +- level, search_start, empty_size, nest); ++ cow = btrfs_alloc_tree_block(trans, root, parent_start, ++ root->root_key.objectid, &disk_key, level, ++ search_start, empty_size, nest); + if (IS_ERR(cow)) + return PTR_ERR(cow); + +@@ -2458,9 +2416,9 @@ static noinline int insert_new_root(stru + else + btrfs_node_key(lower, &lower_key, 0); + +- c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level, +- root->node->start, 0, +- BTRFS_NESTING_NEW_ROOT); ++ c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, ++ &lower_key, level, root->node->start, 0, ++ BTRFS_NESTING_NEW_ROOT); + if (IS_ERR(c)) + return PTR_ERR(c); + +@@ -2589,8 +2547,9 @@ static noinline int split_node(struct bt + mid = (c_nritems + 1) / 2; + btrfs_node_key(c, &disk_key, mid); + +- split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level, +- c->start, 0, BTRFS_NESTING_SPLIT); ++ split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, ++ &disk_key, level, c->start, 0, ++ BTRFS_NESTING_SPLIT); + if (IS_ERR(split)) + return PTR_ERR(split); + +@@ -3381,10 +3340,10 @@ again: + * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just + * use BTRFS_NESTING_NEW_ROOT. + */ +- right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0, +- l->start, 0, num_doubles ? +- BTRFS_NESTING_NEW_ROOT : +- BTRFS_NESTING_SPLIT); ++ right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, ++ &disk_key, 0, l->start, 0, ++ num_doubles ? BTRFS_NESTING_NEW_ROOT : ++ BTRFS_NESTING_SPLIT); + if (IS_ERR(right)) + return PTR_ERR(right); + +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -254,8 +254,11 @@ static inline int extwriter_counter_read + } + + /* +- * To be called after all the new block groups attached to the transaction +- * handle have been created (btrfs_create_pending_block_groups()). ++ * To be called after doing the chunk btree updates right after allocating a new ++ * chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a ++ * chunk after all chunk btree updates and after finishing the second phase of ++ * chunk allocation (btrfs_create_pending_block_groups()) in case some block ++ * group had its chunk item insertion delayed to the second phase. 
+ */ + void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) + { +@@ -264,8 +267,6 @@ void btrfs_trans_release_chunk_metadata( + if (!trans->chunk_bytes_reserved) + return; + +- WARN_ON_ONCE(!list_empty(&trans->new_bgs)); +- + btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, + trans->chunk_bytes_reserved, NULL); + trans->chunk_bytes_reserved = 0; +@@ -699,7 +700,6 @@ again: + h->fs_info = root->fs_info; + + h->type = type; +- h->can_flush_pending_bgs = true; + INIT_LIST_HEAD(&h->new_bgs); + + smp_mb(); +--- a/fs/btrfs/transaction.h ++++ b/fs/btrfs/transaction.h +@@ -134,7 +134,7 @@ struct btrfs_trans_handle { + short aborted; + bool adding_csums; + bool allocating_chunk; +- bool can_flush_pending_bgs; ++ bool removing_chunk; + bool reloc_reserved; + bool in_fsync; + struct btrfs_root *root; +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -1745,19 +1745,14 @@ again: + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + } else { +- btrfs_handle_fs_error(fs_info, ret, "Slot search failed"); + goto out; + } + + *dev_extent_len = btrfs_dev_extent_length(leaf, extent); + + ret = btrfs_del_item(trans, root, path); +- if (ret) { +- btrfs_handle_fs_error(fs_info, ret, +- "Failed to remove dev extent item"); +- } else { ++ if (ret == 0) + set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); +- } + out: + btrfs_free_path(path); + return ret; +@@ -2942,7 +2937,7 @@ static int btrfs_del_sys_chunk(struct bt + u32 cur; + struct btrfs_key key; + +- mutex_lock(&fs_info->chunk_mutex); ++ lockdep_assert_held(&fs_info->chunk_mutex); + array_size = btrfs_super_sys_array_size(super_copy); + + ptr = super_copy->sys_chunk_array; +@@ -2972,7 +2967,6 @@ static int btrfs_del_sys_chunk(struct bt + cur += len; + } + } +- mutex_unlock(&fs_info->chunk_mutex); + return ret; + } + +@@ -3012,6 +3006,29 @@ struct extent_map *btrfs_get_chunk_map(s + return em; + } + ++static int remove_chunk_item(struct btrfs_trans_handle *trans, ++ struct map_lookup *map, u64 chunk_offset) ++{ ++ int i; ++ ++ /* ++ * Removing chunk items and updating the device items in the chunks btree ++ * requires holding the chunk_mutex. ++ * See the comment at btrfs_chunk_alloc() for the details. ++ */ ++ lockdep_assert_held(&trans->fs_info->chunk_mutex); ++ ++ for (i = 0; i < map->num_stripes; i++) { ++ int ret; ++ ++ ret = btrfs_update_device(trans, map->stripes[i].dev); ++ if (ret) ++ return ret; ++ } ++ ++ return btrfs_free_chunk(trans, chunk_offset); ++} ++ + int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) + { + struct btrfs_fs_info *fs_info = trans->fs_info; +@@ -3032,14 +3049,16 @@ int btrfs_remove_chunk(struct btrfs_tran + return PTR_ERR(em); + } + map = em->map_lookup; +- mutex_lock(&fs_info->chunk_mutex); +- check_system_chunk(trans, map->type); +- mutex_unlock(&fs_info->chunk_mutex); + + /* +- * Take the device list mutex to prevent races with the final phase of +- * a device replace operation that replaces the device object associated +- * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()). ++ * First delete the device extent items from the devices btree. ++ * We take the device_list_mutex to avoid racing with the finishing phase ++ * of a device replace operation. See the comment below before acquiring ++ * fs_info->chunk_mutex. 
Note that here we do not acquire the chunk_mutex ++ * because that can result in a deadlock when deleting the device extent ++ * items from the devices btree - COWing an extent buffer from the btree ++ * may result in allocating a new metadata chunk, which would attempt to ++ * lock again fs_info->chunk_mutex. + */ + mutex_lock(&fs_devices->device_list_mutex); + for (i = 0; i < map->num_stripes; i++) { +@@ -3061,18 +3080,73 @@ int btrfs_remove_chunk(struct btrfs_tran + btrfs_clear_space_info_full(fs_info); + mutex_unlock(&fs_info->chunk_mutex); + } ++ } ++ mutex_unlock(&fs_devices->device_list_mutex); + +- ret = btrfs_update_device(trans, device); ++ /* ++ * We acquire fs_info->chunk_mutex for 2 reasons: ++ * ++ * 1) Just like with the first phase of the chunk allocation, we must ++ * reserve system space, do all chunk btree updates and deletions, and ++ * update the system chunk array in the superblock while holding this ++ * mutex. This is for similar reasons as explained on the comment at ++ * the top of btrfs_chunk_alloc(); ++ * ++ * 2) Prevent races with the final phase of a device replace operation ++ * that replaces the device object associated with the map's stripes, ++ * because the device object's id can change at any time during that ++ * final phase of the device replace operation ++ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the ++ * replaced device and then see it with an ID of ++ * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating ++ * the device item, which does not exists on the chunk btree. ++ * The finishing phase of device replace acquires both the ++ * device_list_mutex and the chunk_mutex, in that order, so we are ++ * safe by just acquiring the chunk_mutex. ++ */ ++ trans->removing_chunk = true; ++ mutex_lock(&fs_info->chunk_mutex); ++ ++ check_system_chunk(trans, map->type); ++ ++ ret = remove_chunk_item(trans, map, chunk_offset); ++ /* ++ * Normally we should not get -ENOSPC since we reserved space before ++ * through the call to check_system_chunk(). ++ * ++ * Despite our system space_info having enough free space, we may not ++ * be able to allocate extents from its block groups, because all have ++ * an incompatible profile, which will force us to allocate a new system ++ * block group with the right profile, or right after we called ++ * check_system_space() above, a scrub turned the only system block group ++ * with enough free space into RO mode. ++ * This is explained with more detail at do_chunk_alloc(). ++ * ++ * So if we get -ENOSPC, allocate a new system chunk and retry once. 
++ */ ++ if (ret == -ENOSPC) { ++ const u64 sys_flags = btrfs_system_alloc_profile(fs_info); ++ struct btrfs_block_group *sys_bg; ++ ++ sys_bg = btrfs_alloc_chunk(trans, sys_flags); ++ if (IS_ERR(sys_bg)) { ++ ret = PTR_ERR(sys_bg); ++ btrfs_abort_transaction(trans, ret); ++ goto out; ++ } ++ ++ ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); + if (ret) { +- mutex_unlock(&fs_devices->device_list_mutex); + btrfs_abort_transaction(trans, ret); + goto out; + } +- } +- mutex_unlock(&fs_devices->device_list_mutex); + +- ret = btrfs_free_chunk(trans, chunk_offset); +- if (ret) { ++ ret = remove_chunk_item(trans, map, chunk_offset); ++ if (ret) { ++ btrfs_abort_transaction(trans, ret); ++ goto out; ++ } ++ } else if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } +@@ -3087,6 +3161,15 @@ int btrfs_remove_chunk(struct btrfs_tran + } + } + ++ mutex_unlock(&fs_info->chunk_mutex); ++ trans->removing_chunk = false; ++ ++ /* ++ * We are done with chunk btree updates and deletions, so release the ++ * system space we previously reserved (with check_system_chunk()). ++ */ ++ btrfs_trans_release_chunk_metadata(trans); ++ + ret = btrfs_remove_block_group(trans, chunk_offset, em); + if (ret) { + btrfs_abort_transaction(trans, ret); +@@ -3094,6 +3177,10 @@ int btrfs_remove_chunk(struct btrfs_tran + } + + out: ++ if (trans->removing_chunk) { ++ mutex_unlock(&fs_info->chunk_mutex); ++ trans->removing_chunk = false; ++ } + /* once for us */ + free_extent_map(em); + return ret; +@@ -4868,13 +4955,12 @@ static int btrfs_add_system_chunk(struct + u32 array_size; + u8 *ptr; + +- mutex_lock(&fs_info->chunk_mutex); ++ lockdep_assert_held(&fs_info->chunk_mutex); ++ + array_size = btrfs_super_sys_array_size(super_copy); + if (array_size + item_size + sizeof(disk_key) +- > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { +- mutex_unlock(&fs_info->chunk_mutex); ++ > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) + return -EFBIG; +- } + + ptr = super_copy->sys_chunk_array + array_size; + btrfs_cpu_key_to_disk(&disk_key, key); +@@ -4883,7 +4969,6 @@ static int btrfs_add_system_chunk(struct + memcpy(ptr, chunk, item_size); + item_size += sizeof(disk_key); + btrfs_set_super_sys_array_size(super_copy, array_size + item_size); +- mutex_unlock(&fs_info->chunk_mutex); + + return 0; + } +@@ -5233,13 +5318,14 @@ static int decide_stripe_size(struct btr + } + } + +-static int create_chunk(struct btrfs_trans_handle *trans, ++static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, + struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) + { + struct btrfs_fs_info *info = trans->fs_info; + struct map_lookup *map = NULL; + struct extent_map_tree *em_tree; ++ struct btrfs_block_group *block_group; + struct extent_map *em; + u64 start = ctl->start; + u64 type = ctl->type; +@@ -5249,7 +5335,7 @@ static int create_chunk(struct btrfs_tra + + map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); + if (!map) +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + map->num_stripes = ctl->num_stripes; + + for (i = 0; i < ctl->ndevs; ++i) { +@@ -5271,7 +5357,7 @@ static int create_chunk(struct btrfs_tra + em = alloc_extent_map(); + if (!em) { + kfree(map); +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + } + set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); + em->map_lookup = map; +@@ -5287,12 +5373,12 @@ static int create_chunk(struct btrfs_tra + if (ret) { + write_unlock(&em_tree->lock); + free_extent_map(em); +- return ret; ++ return ERR_PTR(ret); + } + write_unlock(&em_tree->lock); + +- ret = 
btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); +- if (ret) ++ block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); ++ if (IS_ERR(block_group)) + goto error_del_extent; + + for (i = 0; i < map->num_stripes; i++) { +@@ -5312,7 +5398,7 @@ static int create_chunk(struct btrfs_tra + check_raid56_incompat_flag(info, type); + check_raid1c34_incompat_flag(info, type); + +- return 0; ++ return block_group; + + error_del_extent: + write_lock(&em_tree->lock); +@@ -5324,34 +5410,36 @@ error_del_extent: + /* One for the tree reference */ + free_extent_map(em); + +- return ret; ++ return block_group; + } + +-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) ++struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ++ u64 type) + { + struct btrfs_fs_info *info = trans->fs_info; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct btrfs_device_info *devices_info = NULL; + struct alloc_chunk_ctl ctl; ++ struct btrfs_block_group *block_group; + int ret; + + lockdep_assert_held(&info->chunk_mutex); + + if (!alloc_profile_is_valid(type, 0)) { + ASSERT(0); +- return -EINVAL; ++ return ERR_PTR(-EINVAL); + } + + if (list_empty(&fs_devices->alloc_list)) { + if (btrfs_test_opt(info, ENOSPC_DEBUG)) + btrfs_debug(info, "%s: no writable device", __func__); +- return -ENOSPC; ++ return ERR_PTR(-ENOSPC); + } + + if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(info, "invalid chunk type 0x%llx requested", type); + ASSERT(0); +- return -EINVAL; ++ return ERR_PTR(-EINVAL); + } + + ctl.start = find_next_chunk(info); +@@ -5361,46 +5449,43 @@ int btrfs_alloc_chunk(struct btrfs_trans + devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), + GFP_NOFS); + if (!devices_info) +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + + ret = gather_device_info(fs_devices, &ctl, devices_info); +- if (ret < 0) ++ if (ret < 0) { ++ block_group = ERR_PTR(ret); + goto out; ++ } + + ret = decide_stripe_size(fs_devices, &ctl, devices_info); +- if (ret < 0) ++ if (ret < 0) { ++ block_group = ERR_PTR(ret); + goto out; ++ } + +- ret = create_chunk(trans, &ctl, devices_info); ++ block_group = create_chunk(trans, &ctl, devices_info); + + out: + kfree(devices_info); +- return ret; ++ return block_group; + } + + /* +- * Chunk allocation falls into two parts. The first part does work +- * that makes the new allocated chunk usable, but does not do any operation +- * that modifies the chunk tree. The second part does the work that +- * requires modifying the chunk tree. This division is important for the +- * bootstrap process of adding storage to a seed btrfs. ++ * This function, btrfs_finish_chunk_alloc(), belongs to phase 2. ++ * ++ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation ++ * phases. 
+ */ + int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, + u64 chunk_offset, u64 chunk_size) + { + struct btrfs_fs_info *fs_info = trans->fs_info; +- struct btrfs_root *extent_root = fs_info->extent_root; +- struct btrfs_root *chunk_root = fs_info->chunk_root; +- struct btrfs_key key; + struct btrfs_device *device; +- struct btrfs_chunk *chunk; +- struct btrfs_stripe *stripe; + struct extent_map *em; + struct map_lookup *map; +- size_t item_size; + u64 dev_offset; + u64 stripe_size; +- int i = 0; ++ int i; + int ret = 0; + + em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); +@@ -5408,53 +5493,117 @@ int btrfs_finish_chunk_alloc(struct btrf + return PTR_ERR(em); + + map = em->map_lookup; +- item_size = btrfs_chunk_item_size(map->num_stripes); + stripe_size = em->orig_block_len; + +- chunk = kzalloc(item_size, GFP_NOFS); +- if (!chunk) { +- ret = -ENOMEM; +- goto out; +- } +- + /* + * Take the device list mutex to prevent races with the final phase of + * a device replace operation that replaces the device object associated + * with the map's stripes, because the device object's id can change + * at any time during that final phase of the device replace operation +- * (dev-replace.c:btrfs_dev_replace_finishing()). ++ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the ++ * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, ++ * resulting in persisting a device extent item with such ID. + */ + mutex_lock(&fs_info->fs_devices->device_list_mutex); + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + dev_offset = map->stripes[i].physical; + +- ret = btrfs_update_device(trans, device); +- if (ret) +- break; + ret = btrfs_alloc_dev_extent(trans, device, chunk_offset, + dev_offset, stripe_size); + if (ret) + break; + } +- if (ret) { +- mutex_unlock(&fs_info->fs_devices->device_list_mutex); ++ mutex_unlock(&fs_info->fs_devices->device_list_mutex); ++ ++ free_extent_map(em); ++ return ret; ++} ++ ++/* ++ * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the ++ * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system ++ * chunks. ++ * ++ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation ++ * phases. ++ */ ++int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, ++ struct btrfs_block_group *bg) ++{ ++ struct btrfs_fs_info *fs_info = trans->fs_info; ++ struct btrfs_root *extent_root = fs_info->extent_root; ++ struct btrfs_root *chunk_root = fs_info->chunk_root; ++ struct btrfs_key key; ++ struct btrfs_chunk *chunk; ++ struct btrfs_stripe *stripe; ++ struct extent_map *em; ++ struct map_lookup *map; ++ size_t item_size; ++ int i; ++ int ret; ++ ++ /* ++ * We take the chunk_mutex for 2 reasons: ++ * ++ * 1) Updates and insertions in the chunk btree must be done while holding ++ * the chunk_mutex, as well as updating the system chunk array in the ++ * superblock. 
See the comment on top of btrfs_chunk_alloc() for the ++ * details; ++ * ++ * 2) To prevent races with the final phase of a device replace operation ++ * that replaces the device object associated with the map's stripes, ++ * because the device object's id can change at any time during that ++ * final phase of the device replace operation ++ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the ++ * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, ++ * which would cause a failure when updating the device item, which does ++ * not exists, or persisting a stripe of the chunk item with such ID. ++ * Here we can't use the device_list_mutex because our caller already ++ * has locked the chunk_mutex, and the final phase of device replace ++ * acquires both mutexes - first the device_list_mutex and then the ++ * chunk_mutex. Using any of those two mutexes protects us from a ++ * concurrent device replace. ++ */ ++ lockdep_assert_held(&fs_info->chunk_mutex); ++ ++ em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); ++ if (IS_ERR(em)) { ++ ret = PTR_ERR(em); ++ btrfs_abort_transaction(trans, ret); ++ return ret; ++ } ++ ++ map = em->map_lookup; ++ item_size = btrfs_chunk_item_size(map->num_stripes); ++ ++ chunk = kzalloc(item_size, GFP_NOFS); ++ if (!chunk) { ++ ret = -ENOMEM; ++ btrfs_abort_transaction(trans, ret); + goto out; + } + ++ for (i = 0; i < map->num_stripes; i++) { ++ struct btrfs_device *device = map->stripes[i].dev; ++ ++ ret = btrfs_update_device(trans, device); ++ if (ret) ++ goto out; ++ } ++ + stripe = &chunk->stripe; + for (i = 0; i < map->num_stripes; i++) { +- device = map->stripes[i].dev; +- dev_offset = map->stripes[i].physical; ++ struct btrfs_device *device = map->stripes[i].dev; ++ const u64 dev_offset = map->stripes[i].physical; + + btrfs_set_stack_stripe_devid(stripe, device->devid); + btrfs_set_stack_stripe_offset(stripe, dev_offset); + memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); + stripe++; + } +- mutex_unlock(&fs_info->fs_devices->device_list_mutex); + +- btrfs_set_stack_chunk_length(chunk, chunk_size); ++ btrfs_set_stack_chunk_length(chunk, bg->length); + btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); + btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); + btrfs_set_stack_chunk_type(chunk, map->type); +@@ -5466,15 +5615,18 @@ int btrfs_finish_chunk_alloc(struct btrf + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.type = BTRFS_CHUNK_ITEM_KEY; +- key.offset = chunk_offset; ++ key.offset = bg->start; + + ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); +- if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { +- /* +- * TODO: Cleanup of inserted chunk root in case of +- * failure. +- */ ++ if (ret) ++ goto out; ++ ++ bg->chunk_item_inserted = 1; ++ ++ if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); ++ if (ret) ++ goto out; + } + + out: +@@ -5487,16 +5639,41 @@ static noinline int init_first_rw_device + { + struct btrfs_fs_info *fs_info = trans->fs_info; + u64 alloc_profile; +- int ret; ++ struct btrfs_block_group *meta_bg; ++ struct btrfs_block_group *sys_bg; ++ ++ /* ++ * When adding a new device for sprouting, the seed device is read-only ++ * so we must first allocate a metadata and a system chunk. 
But before ++ * adding the block group items to the extent, device and chunk btrees, ++ * we must first: ++ * ++ * 1) Create both chunks without doing any changes to the btrees, as ++ * otherwise we would get -ENOSPC since the block groups from the ++ * seed device are read-only; ++ * ++ * 2) Add the device item for the new sprout device - finishing the setup ++ * of a new block group requires updating the device item in the chunk ++ * btree, so it must exist when we attempt to do it. The previous step ++ * ensures this does not fail with -ENOSPC. ++ * ++ * After that we can add the block group items to their btrees: ++ * update existing device item in the chunk btree, add a new block group ++ * item to the extent btree, add a new chunk item to the chunk btree and ++ * finally add the new device extent items to the devices btree. ++ */ + + alloc_profile = btrfs_metadata_alloc_profile(fs_info); +- ret = btrfs_alloc_chunk(trans, alloc_profile); +- if (ret) +- return ret; ++ meta_bg = btrfs_alloc_chunk(trans, alloc_profile); ++ if (IS_ERR(meta_bg)) ++ return PTR_ERR(meta_bg); + + alloc_profile = btrfs_system_alloc_profile(fs_info); +- ret = btrfs_alloc_chunk(trans, alloc_profile); +- return ret; ++ sys_bg = btrfs_alloc_chunk(trans, alloc_profile); ++ if (IS_ERR(sys_bg)) ++ return PTR_ERR(sys_bg); ++ ++ return 0; + } + + static inline int btrfs_chunk_max_errors(struct map_lookup *map) +@@ -7425,10 +7602,18 @@ int btrfs_read_chunk_tree(struct btrfs_f + total_dev++; + } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { + struct btrfs_chunk *chunk; ++ ++ /* ++ * We are only called at mount time, so no need to take ++ * fs_info->chunk_mutex. Plus, to avoid lockdep warnings, ++ * we always lock first fs_info->chunk_mutex before ++ * acquiring any locks on the chunk tree. This is a ++ * requirement for chunk allocation, see the comment on ++ * top of btrfs_chunk_alloc() for details. 
++ */ ++ ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); +- mutex_lock(&fs_info->chunk_mutex); + ret = read_one_chunk(&found_key, leaf, chunk); +- mutex_unlock(&fs_info->chunk_mutex); + if (ret) + goto error; + } +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -447,7 +447,8 @@ int btrfs_get_io_geometry(struct btrfs_f + struct btrfs_io_geometry *io_geom); + int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); + int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); +-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type); ++struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ++ u64 type); + void btrfs_mapping_tree_free(struct extent_map_tree *tree); + blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num); +@@ -506,6 +507,8 @@ unsigned long btrfs_full_stripe_len(stru + u64 logical); + int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, + u64 chunk_offset, u64 chunk_size); ++int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, ++ struct btrfs_block_group *bg); + int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); + struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); diff --git a/queue-5.13/btrfs-zoned-fix-types-for-u64-division-in-btrfs_reclaim_bgs_work.patch b/queue-5.13/btrfs-zoned-fix-types-for-u64-division-in-btrfs_reclaim_bgs_work.patch new file mode 100644 index 00000000000..c6f78bff116 --- /dev/null +++ b/queue-5.13/btrfs-zoned-fix-types-for-u64-division-in-btrfs_reclaim_bgs_work.patch @@ -0,0 +1,35 @@ +From 54afaae34ee49e98c1c902b444b42832551d090c Mon Sep 17 00:00:00 2001 +From: David Sterba +Date: Wed, 23 Jun 2021 17:54:54 +0200 +Subject: btrfs: zoned: fix types for u64 division in btrfs_reclaim_bgs_work + +From: David Sterba + +commit 54afaae34ee49e98c1c902b444b42832551d090c upstream. + +The types in calculation of the used percentage in the reclaiming +messages are both u64, though bg->length is either 1GiB (non-zoned) or +the zone size in the zoned mode. The upper limit on zone size is 8GiB so +this could theoretically overflow in the future, right now the values +fit. 
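+
+To make the truncation hazard concrete, here is a small userspace sketch
+(not part of the fix; div_u64() and div64_u64() are re-implemented below
+with the same signatures they have in the kernel's <linux/math64.h>, and
+the sizes are made up for illustration):
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  /* Userspace stand-ins mirroring the kernel helpers' signatures. */
+  static uint64_t div_u64(uint64_t dividend, uint32_t divisor)
+  {
+          return dividend / divisor;  /* divisor silently truncated to 32 bits */
+  }
+
+  static uint64_t div64_u64(uint64_t dividend, uint64_t divisor)
+  {
+          return dividend / divisor;  /* full 64-bit divisor */
+  }
+
+  int main(void)
+  {
+          uint64_t used   = 2ULL << 30;  /* 2 GiB used */
+          uint64_t length = 5ULL << 30;  /* 5 GiB zone; low 32 bits are 1 GiB */
+
+          /* Passing the u64 length as a u32 divisor drops the high bits. */
+          printf("div_u64:   %llu%%\n",
+                 (unsigned long long)div_u64(used * 100, length));
+          printf("div64_u64: %llu%%\n",
+                 (unsigned long long)div64_u64(used * 100, length));
+          return 0;
+  }
+
+With a zone size above 4 GiB the truncated divisor makes the first call
+report 200% used instead of the correct 40%, which is exactly why the
+patch switches to div64_u64().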
+
+Fixes: 18bb8bbf13c1 ("btrfs: zoned: automatically reclaim zones")
+CC: stable@vger.kernel.org # 5.13
+Reviewed-by: Johannes Thumshirn
+Signed-off-by: David Sterba
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/btrfs/block-group.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -1539,7 +1539,7 @@ void btrfs_reclaim_bgs_work(struct work_
+ 			goto next;
+
+ 		btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used",
+-				bg->start, div_u64(bg->used * 100, bg->length));
++				bg->start, div64_u64(bg->used * 100, bg->length));
+ 		trace_btrfs_reclaim_block_group(bg);
+ 		ret = btrfs_relocate_chunk(fs_info, bg->start);
+ 		if (ret)
diff --git a/queue-5.13/btrfs-zoned-fix-wrong-mutex-unlock-on-failure-to-allocate-log-root-tree.patch b/queue-5.13/btrfs-zoned-fix-wrong-mutex-unlock-on-failure-to-allocate-log-root-tree.patch
new file mode 100644
index 00000000000..2f9cbca7072
--- /dev/null
+++ b/queue-5.13/btrfs-zoned-fix-wrong-mutex-unlock-on-failure-to-allocate-log-root-tree.patch
@@ -0,0 +1,43 @@
+From ea32af47f00a046a1f953370514d6d946efe0152 Mon Sep 17 00:00:00 2001
+From: Filipe Manana
+Date: Wed, 7 Jul 2021 12:23:45 +0100
+Subject: btrfs: zoned: fix wrong mutex unlock on failure to allocate log root tree
+
+From: Filipe Manana
+
+commit ea32af47f00a046a1f953370514d6d946efe0152 upstream.
+
+When syncing the log, if we fail to allocate the root node for the log
+root tree:
+
+1) We are unlocking fs_info->tree_log_mutex, but at this point we have
+   not yet locked this mutex;
+
+2) We have locked fs_info->tree_root->log_mutex, but we end up not
+   unlocking it;
+
+So fix this by unlocking fs_info->tree_root->log_mutex instead of
+fs_info->tree_log_mutex.
+
+Fixes: e75f9fd194090e ("btrfs: zoned: move log tree node allocation out of log_root_tree->log_mutex")
+CC: stable@vger.kernel.org # 5.13+
+Reviewed-by: Nikolay Borisov
+Reviewed-by: Johannes Thumshirn
+Signed-off-by: Filipe Manana
+Signed-off-by: David Sterba
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/btrfs/tree-log.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -3173,7 +3173,7 @@ int btrfs_sync_log(struct btrfs_trans_ha
+ 	if (!log_root_tree->node) {
+ 		ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
+ 		if (ret) {
+-			mutex_unlock(&fs_info->tree_log_mutex);
++			mutex_unlock(&fs_info->tree_root->log_mutex);
+ 			goto out;
+ 		}
+ 	}
diff --git a/queue-5.13/cgroup-verify-that-source-is-a-string.patch b/queue-5.13/cgroup-verify-that-source-is-a-string.patch
new file mode 100644
index 00000000000..36955203579
--- /dev/null
+++ b/queue-5.13/cgroup-verify-that-source-is-a-string.patch
@@ -0,0 +1,64 @@
+From 3b0462726e7ef281c35a7a4ae33e93ee2bc9975b Mon Sep 17 00:00:00 2001
+From: Christian Brauner
+Date: Wed, 14 Jul 2021 15:47:49 +0200
+Subject: cgroup: verify that source is a string
+
+From: Christian Brauner
+
+commit 3b0462726e7ef281c35a7a4ae33e93ee2bc9975b upstream.
+
+The following sequence can be used to trigger a UAF:
+
+  int fscontext_fd = fsopen("cgroup", 0);
+  int fd_null = open("/dev/null", O_RDONLY);
+  fsconfig(fscontext_fd, FSCONFIG_SET_FD, "source", NULL, fd_null);
+  close_range(3, ~0U, 0);
+
+The cgroup v1 specific fs parser expects a string for the "source"
+parameter. However, it is perfectly legitimate to e.g. specify a file
+descriptor for the "source" parameter. The fs parser doesn't know what
+a filesystem allows there.
So it's a bug to assume that "source" is
+always of type fs_value_is_string when it can reasonably also be
+fs_value_is_file.
+
+This assumption in the cgroup code causes a UAF because struct
+fs_parameter uses a union for the actual value. Access to that union is
+guarded by the param->type member. Since the cgroup parameter parser
+didn't check param->type but unconditionally moved param->string into
+fc->source, a close on the fscontext_fd would trigger a UAF during
+put_fs_context(), which frees fc->source, thereby freeing the file stashed
+in param->file and causing a UAF during a close of the fd_null.
+
+Fix this by verifying that param->type is actually a string and reporting
+an error if not.
+
+In follow-up patches I'll add a new generic helper that can be used here
+and by other filesystems instead of this error-prone copy-pasta fix.
+But fixing it in here first makes backporting it to stable a lot easier.
+
+Fixes: 8d2451f4994f ("cgroup1: switch to option-by-option parsing")
+Reported-by: syzbot+283ce5a46486d6acdbaf@syzkaller.appspotmail.com
+Cc: Christoph Hellwig
+Cc: Alexander Viro
+Cc: Dmitry Vyukov
+Cc: 
+Cc: syzkaller-bugs
+Signed-off-by: Christian Brauner
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+---
+ kernel/cgroup/cgroup-v1.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/kernel/cgroup/cgroup-v1.c
++++ b/kernel/cgroup/cgroup-v1.c
+@@ -912,6 +912,8 @@ int cgroup1_parse_param(struct fs_contex
+ 	opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
+ 	if (opt == -ENOPARAM) {
+ 		if (strcmp(param->key, "source") == 0) {
++			if (param->type != fs_value_is_string)
++				return invalf(fc, "Non-string source");
+ 			if (fc->source)
+ 				return invalf(fc, "Multiple sources not supported");
+ 			fc->source = param->string;
diff --git a/queue-5.13/drm-amdgpu-add-another-renoir-did.patch b/queue-5.13/drm-amdgpu-add-another-renoir-did.patch
new file mode 100644
index 00000000000..462ac881d26
--- /dev/null
+++ b/queue-5.13/drm-amdgpu-add-another-renoir-did.patch
@@ -0,0 +1,31 @@
+From 775da83005cb61d4c213c636df9337da05714ff1 Mon Sep 17 00:00:00 2001
+From: Jinzhou Su
+Date: Tue, 13 Jul 2021 09:26:11 +0800
+Subject: drm/amdgpu: add another Renoir DID
+
+From: Jinzhou Su
+
+commit 775da83005cb61d4c213c636df9337da05714ff1 upstream.
+
+Add a new PCI device ID.
+ +Signed-off-by: Jinzhou Su +Reviewed-by: Huang Rui +Reviewed-by: Alex Deucher +Signed-off-by: Alex Deucher +Cc: stable@vger.kernel.org # 5.11.x +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +@@ -1148,6 +1148,7 @@ static const struct pci_device_id pciidl + {0x1002, 0x734F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14}, + + /* Renoir */ ++ {0x1002, 0x15E7, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU}, + {0x1002, 0x1636, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU}, + {0x1002, 0x1638, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU}, + {0x1002, 0x164C, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU}, diff --git a/queue-5.13/drm-i915-gt-fix-edeadlk-handling-regression.patch b/queue-5.13/drm-i915-gt-fix-edeadlk-handling-regression.patch new file mode 100644 index 00000000000..e1f0c7ff978 --- /dev/null +++ b/queue-5.13/drm-i915-gt-fix-edeadlk-handling-regression.patch @@ -0,0 +1,60 @@ +From 2feeb52859fc1ab94cd35b61ada3a6ac4ff24243 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= +Date: Wed, 30 Jun 2021 19:44:13 +0300 +Subject: drm/i915/gt: Fix -EDEADLK handling regression +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Ville Syrjälä + +commit 2feeb52859fc1ab94cd35b61ada3a6ac4ff24243 upstream. + +The conversion to ww mutexes failed to address the fence code which +already returns -EDEADLK when we run out of fences. Ww mutexes on +the other hand treat -EDEADLK as an internal errno value indicating +a need to restart the operation due to a deadlock. So now when the +fence code returns -EDEADLK the higher level code erroneously +restarts everything instead of returning the error to userspace +as is expected. + +To remedy this let's switch the fence code to use a different errno +value for this. -ENOBUFS seems like a semi-reasonable unique choice. +Apart from igt the only user of this I could find is sna, and even +there all we do is dump the current fence registers from debugfs +into the X server log. So no user visible functionality is affected. +If we really cared about preserving this we could of course convert +back to -EDEADLK higher up, but doesn't seem like that's worth +the hassle here. + +Not quite sure which commit specifically broke this, but I'll +just attribute it to the general gem ww mutex work. 
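+
+For context, here is a minimal sketch of the usual ww_mutex acquire and
+backoff loop (the generic pattern from the kernel's
+Documentation/locking/ww-mutex-design.rst, not actual i915 code;
+demo_ww_class and lock_pair() are made up for illustration). To this
+machinery -EDEADLK only ever means "back off and retry", so a
+fence-exhaustion -EDEADLK leaking into such a loop restarts it instead
+of reaching userspace:
+
+  static DEFINE_WW_CLASS(demo_ww_class);
+
+  static void lock_pair(struct ww_mutex *a, struct ww_mutex *b)
+  {
+          struct ww_acquire_ctx ctx;
+
+          ww_acquire_init(&ctx, &demo_ww_class);
+
+          /* The first lock cannot deadlock: nothing is held yet. */
+          ww_mutex_lock(a, &ctx);
+          while (ww_mutex_lock(b, &ctx) == -EDEADLK) {
+                  /*
+                   * Back off: drop what we hold, sleep on the contended
+                   * lock, then retry in the opposite order.
+                   */
+                  ww_mutex_unlock(a);
+                  ww_mutex_lock_slow(b, &ctx);
+                  swap(a, b);
+          }
+          ww_acquire_done(&ctx);
+
+          /* ... use the locked objects ... */
+
+          ww_mutex_unlock(a);
+          ww_mutex_unlock(b);
+          ww_acquire_fini(&ctx);
+  }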
+
+Cc: stable@vger.kernel.org
+Cc: Maarten Lankhorst
+Cc: Thomas Hellström
+Testcase: igt/gem_pread/exhaustion
+Testcase: igt/gem_pwrite/basic-exhaustion
+Testcase: igt/gem_fenced_exec_thrash/too-many-fences
+Fixes: 80f0b679d6f0 ("drm/i915: Add an implementation for i915_gem_ww_ctx locking, v2.")
+Signed-off-by: Ville Syrjälä
+Link: https://patchwork.freedesktop.org/patch/msgid/20210630164413.25481-1-ville.syrjala@linux.intel.com
+Reviewed-by: Maarten Lankhorst
+(cherry picked from commit 78d2ad7eb4e1f0e9cd5d79788446b6092c21d3e0)
+Signed-off-by: Rodrigo Vivi
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c
++++ b/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c
+@@ -348,7 +348,7 @@ static struct i915_fence_reg *fence_find
+ 	if (intel_has_pending_fb_unpin(ggtt->vm.i915))
+ 		return ERR_PTR(-EAGAIN);
+
+-	return ERR_PTR(-EDEADLK);
++	return ERR_PTR(-ENOBUFS);
+ }
+
+ int __i915_vma_pin_fence(struct i915_vma *vma)
diff --git a/queue-5.13/drm-i915-gtt-drop-the-page-table-optimisation.patch b/queue-5.13/drm-i915-gtt-drop-the-page-table-optimisation.patch
new file mode 100644
index 00000000000..8dfb4b68d38
--- /dev/null
+++ b/queue-5.13/drm-i915-gtt-drop-the-page-table-optimisation.patch
@@ -0,0 +1,55 @@
+From 0abb33bfca0fb74df76aac03e90ce685016ef7be Mon Sep 17 00:00:00 2001
+From: Matthew Auld
+Date: Tue, 13 Jul 2021 14:04:31 +0100
+Subject: drm/i915/gtt: drop the page table optimisation
+
+From: Matthew Auld
+
+commit 0abb33bfca0fb74df76aac03e90ce685016ef7be upstream.
+
+We skip filling out the pt with scratch entries if the va range covers
+the entire pt, since we later have to fill it with the PTEs for the
+object pages anyway. However this might leave open a small window where
+the PTEs don't point to anything valid for the HW to consume.
+
+When for example using 2M GTT pages this fill_px() showed up as being
+quite significant in perf measurements, and ends up being completely
+wasted since we ignore the pt and just use the pde directly.
+
+Anyway, currently we have our PTE construction split between alloc and
+insert, which is probably slightly iffy nowadays, since the alloc
+doesn't actually allocate anything anymore, instead it just sets up the
+page directories and points the PTEs at the scratch page. Later when we
+do the insert step we re-program the PTEs again. Better might be to
+squash the alloc and insert into a single step, then bringing back this
+optimisation (along with some others) should be possible.
+ +Fixes: 14826673247e ("drm/i915: Only initialize partially filled pagetables") +Signed-off-by: Matthew Auld +Cc: Jon Bloomfield +Cc: Chris Wilson +Cc: Daniel Vetter +Cc: # v4.15+ +Reviewed-by: Daniel Vetter +Link: https://patchwork.freedesktop.org/patch/msgid/20210713130431.2392740-1-matthew.auld@intel.com +(cherry picked from commit 8f88ca76b3942d82e2c1cea8735ec368d89ecc15) +Signed-off-by: Rodrigo Vivi +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +--- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c ++++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c +@@ -304,10 +304,7 @@ static void __gen8_ppgtt_alloc(struct i9 + __i915_gem_object_pin_pages(pt->base); + i915_gem_object_make_unshrinkable(pt->base); + +- if (lvl || +- gen8_pt_count(*start, end) < I915_PDES || +- intel_vgpu_active(vm->i915)) +- fill_px(pt, vm->scratch[lvl]->encode); ++ fill_px(pt, vm->scratch[lvl]->encode); + + spin_lock(&pd->lock); + if (likely(!pd->entry[idx])) { diff --git a/queue-5.13/edac-igen6-fix-core-dependency-again.patch b/queue-5.13/edac-igen6-fix-core-dependency-again.patch new file mode 100644 index 00000000000..18ec5268bca --- /dev/null +++ b/queue-5.13/edac-igen6-fix-core-dependency-again.patch @@ -0,0 +1,38 @@ +From a1c9ca5f65c9acfd7c02474b9d5cacbd7ea288df Mon Sep 17 00:00:00 2001 +From: Randy Dunlap +Date: Thu, 15 Jul 2021 11:55:31 -0700 +Subject: EDAC/igen6: fix core dependency AGAIN + +From: Randy Dunlap + +commit a1c9ca5f65c9acfd7c02474b9d5cacbd7ea288df upstream. + +My previous patch had a typo/thinko which prevents this driver +from being enabled: change X64_64 to X86_64. + +Fixes: 0a9ece9ba154 ("EDAC/igen6: fix core dependency") +Signed-off-by: Randy Dunlap +Cc: Qiuxu Zhuo +Cc: Borislav Petkov +Cc: Mauro Carvalho Chehab +Cc: linux-edac@vger.kernel.org +Cc: bowsingbetee +Cc: stable@vger.kernel.org +Signed-off-by: Tony Luck +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + drivers/edac/Kconfig | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/edac/Kconfig ++++ b/drivers/edac/Kconfig +@@ -271,7 +271,7 @@ config EDAC_PND2 + config EDAC_IGEN6 + tristate "Intel client SoC Integrated MC" + depends on PCI && PCI_MMCONFIG && ARCH_HAVE_NMI_SAFE_CMPXCHG +- depends on X64_64 && X86_MCE_INTEL ++ depends on X86_64 && X86_MCE_INTEL + help + Support for error detection and correction on the Intel + client SoC Integrated Memory Controller using In-Band ECC IP. diff --git a/queue-5.13/fbmem-do-not-delete-the-mode-that-is-still-in-use.patch b/queue-5.13/fbmem-do-not-delete-the-mode-that-is-still-in-use.patch new file mode 100644 index 00000000000..4bb50b45e21 --- /dev/null +++ b/queue-5.13/fbmem-do-not-delete-the-mode-that-is-still-in-use.patch @@ -0,0 +1,85 @@ +From 0af778269a522c988ef0b4188556aba97fb420cc Mon Sep 17 00:00:00 2001 +From: Zhen Lei +Date: Mon, 12 Jul 2021 16:55:44 +0800 +Subject: fbmem: Do not delete the mode that is still in use + +From: Zhen Lei + +commit 0af778269a522c988ef0b4188556aba97fb420cc upstream. + +The execution of fb_delete_videomode() is not based on the result of the +previous fbcon_mode_deleted(). As a result, the mode is directly deleted, +regardless of whether it is still in use, which may cause UAF. 
+ +================================================================== +BUG: KASAN: use-after-free in fb_mode_is_equal+0x36e/0x5e0 \ +drivers/video/fbdev/core/modedb.c:924 +Read of size 4 at addr ffff88807e0ddb1c by task syz-executor.0/18962 + +CPU: 2 PID: 18962 Comm: syz-executor.0 Not tainted 5.10.45-rc1+ #3 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ... +Call Trace: + __dump_stack lib/dump_stack.c:77 [inline] + dump_stack+0x137/0x1be lib/dump_stack.c:118 + print_address_description+0x6c/0x640 mm/kasan/report.c:385 + __kasan_report mm/kasan/report.c:545 [inline] + kasan_report+0x13d/0x1e0 mm/kasan/report.c:562 + fb_mode_is_equal+0x36e/0x5e0 drivers/video/fbdev/core/modedb.c:924 + fbcon_mode_deleted+0x16a/0x220 drivers/video/fbdev/core/fbcon.c:2746 + fb_set_var+0x1e1/0xdb0 drivers/video/fbdev/core/fbmem.c:975 + do_fb_ioctl+0x4d9/0x6e0 drivers/video/fbdev/core/fbmem.c:1108 + vfs_ioctl fs/ioctl.c:48 [inline] + __do_sys_ioctl fs/ioctl.c:753 [inline] + __se_sys_ioctl+0xfb/0x170 fs/ioctl.c:739 + do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +Freed by task 18960: + kasan_save_stack mm/kasan/common.c:48 [inline] + kasan_set_track+0x3d/0x70 mm/kasan/common.c:56 + kasan_set_free_info+0x17/0x30 mm/kasan/generic.c:355 + __kasan_slab_free+0x108/0x140 mm/kasan/common.c:422 + slab_free_hook mm/slub.c:1541 [inline] + slab_free_freelist_hook+0xd6/0x1a0 mm/slub.c:1574 + slab_free mm/slub.c:3139 [inline] + kfree+0xca/0x3d0 mm/slub.c:4121 + fb_delete_videomode+0x56a/0x820 drivers/video/fbdev/core/modedb.c:1104 + fb_set_var+0x1f3/0xdb0 drivers/video/fbdev/core/fbmem.c:978 + do_fb_ioctl+0x4d9/0x6e0 drivers/video/fbdev/core/fbmem.c:1108 + vfs_ioctl fs/ioctl.c:48 [inline] + __do_sys_ioctl fs/ioctl.c:753 [inline] + __se_sys_ioctl+0xfb/0x170 fs/ioctl.c:739 + do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +Fixes: 13ff178ccd6d ("fbcon: Call fbcon_mode_deleted/new_modelist directly") +Signed-off-by: Zhen Lei +Cc: # v5.3+ +Signed-off-by: Daniel Vetter +Link: https://patchwork.freedesktop.org/patch/msgid/20210712085544.2828-1-thunder.leizhen@huawei.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/video/fbdev/core/fbmem.c | 12 +++++------- + 1 file changed, 5 insertions(+), 7 deletions(-) + +--- a/drivers/video/fbdev/core/fbmem.c ++++ b/drivers/video/fbdev/core/fbmem.c +@@ -970,13 +970,11 @@ fb_set_var(struct fb_info *info, struct + fb_var_to_videomode(&mode2, &info->var); + /* make sure we don't delete the videomode of current var */ + ret = fb_mode_is_equal(&mode1, &mode2); +- +- if (!ret) +- fbcon_mode_deleted(info, &mode1); +- +- if (!ret) +- fb_delete_videomode(&mode1, &info->modelist); +- ++ if (!ret) { ++ ret = fbcon_mode_deleted(info, &mode1); ++ if (!ret) ++ fb_delete_videomode(&mode1, &info->modelist); ++ } + + return ret ? -EINVAL : 0; + } diff --git a/queue-5.13/io_uring-use-right-task-for-exiting-checks.patch b/queue-5.13/io_uring-use-right-task-for-exiting-checks.patch new file mode 100644 index 00000000000..65979ccdfbc --- /dev/null +++ b/queue-5.13/io_uring-use-right-task-for-exiting-checks.patch @@ -0,0 +1,35 @@ +From 9c6882608bce249a8918744ecdb65748534e3f17 Mon Sep 17 00:00:00 2001 +From: Pavel Begunkov +Date: Sat, 10 Jul 2021 02:45:59 +0100 +Subject: io_uring: use right task for exiting checks + +From: Pavel Begunkov + +commit 9c6882608bce249a8918744ecdb65748534e3f17 upstream. 
+
+When we use delayed_work for fallback execution of requests, current
+will not be the submitter task, and so checks in io_req_task_submit()
+may not behave as expected. Currently, it leaves inline completions not
+flushed, making io_ring_exit_work() hang. Use the submitter task for
+all those checks.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Pavel Begunkov
+Link: https://lore.kernel.org/r/cb413c715bed0bc9c98b169059ea9c8a2c770715.1625881431.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/io_uring.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -2040,7 +2040,7 @@ static void __io_req_task_submit(struct
+
+ 	/* ctx stays valid until unlock, even if we drop all ours ctx->refs */
+ 	mutex_lock(&ctx->uring_lock);
+-	if (!(current->flags & PF_EXITING) && !current->in_execve)
++	if (!(req->task->flags & PF_EXITING) && !req->task->in_execve)
+ 		__io_queue_sqe(req);
+ 	else
+ 		io_req_complete_failed(req, -EFAULT);
diff --git a/queue-5.13/iommu-vt-d-fix-clearing-real-dma-device-s-scalable-mode-context-entries.patch b/queue-5.13/iommu-vt-d-fix-clearing-real-dma-device-s-scalable-mode-context-entries.patch
new file mode 100644
index 00000000000..a81432bf801
--- /dev/null
+++ b/queue-5.13/iommu-vt-d-fix-clearing-real-dma-device-s-scalable-mode-context-entries.patch
@@ -0,0 +1,47 @@
+From 474dd1c6506411752a9b2f2233eec11f1733a099 Mon Sep 17 00:00:00 2001
+From: Lu Baolu
+Date: Mon, 12 Jul 2021 15:17:12 +0800
+Subject: iommu/vt-d: Fix clearing real DMA device's scalable-mode context entries
+
+From: Lu Baolu
+
+commit 474dd1c6506411752a9b2f2233eec11f1733a099 upstream.
+
+Commit 2b0140c69637e ("iommu/vt-d: Use pci_real_dma_dev() for mapping")
+fixes an issue of "sub-device is removed where the context entry is cleared
+for all aliases". But this commit didn't consider the PASID entry and PASID
+table in VT-d scalable mode. This fix increases the coverage of scalable
+mode.
+ +Suggested-by: Sanjay Kumar +Fixes: 8038bdb855331 ("iommu/vt-d: Only clear real DMA device's context entries") +Fixes: 2b0140c69637e ("iommu/vt-d: Use pci_real_dma_dev() for mapping") +Cc: stable@vger.kernel.org # v5.6+ +Cc: Jon Derrick +Signed-off-by: Lu Baolu +Link: https://lore.kernel.org/r/20210712071712.3416949-1-baolu.lu@linux.intel.com +Signed-off-by: Joerg Roedel +Signed-off-by: Greg Kroah-Hartman +--- + drivers/iommu/intel/iommu.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +--- a/drivers/iommu/intel/iommu.c ++++ b/drivers/iommu/intel/iommu.c +@@ -4483,14 +4483,13 @@ static void __dmar_remove_one_dev_info(s + iommu = info->iommu; + domain = info->domain; + +- if (info->dev) { ++ if (info->dev && !dev_is_real_dma_subdevice(info->dev)) { + if (dev_is_pci(info->dev) && sm_supported(iommu)) + intel_pasid_tear_down_entry(iommu, info->dev, + PASID_RID2PASID, false); + + iommu_disable_dev_iotlb(info); +- if (!dev_is_real_dma_subdevice(info->dev)) +- domain_context_clear(info); ++ domain_context_clear(info); + intel_pasid_free_table(info->dev); + } + diff --git a/queue-5.13/iommu-vt-d-global-devtlb-flush-when-present-context-entry-changed.patch b/queue-5.13/iommu-vt-d-global-devtlb-flush-when-present-context-entry-changed.patch new file mode 100644 index 00000000000..9b89fd32a3f --- /dev/null +++ b/queue-5.13/iommu-vt-d-global-devtlb-flush-when-present-context-entry-changed.patch @@ -0,0 +1,107 @@ +From 37764b952e1b39053defc7ebe5dcd8c4e3e78de9 Mon Sep 17 00:00:00 2001 +From: Sanjay Kumar +Date: Mon, 12 Jul 2021 15:13:15 +0800 +Subject: iommu/vt-d: Global devTLB flush when present context entry changed + +From: Sanjay Kumar + +commit 37764b952e1b39053defc7ebe5dcd8c4e3e78de9 upstream. + +This fixes a bug in context cache clear operation. The code was not +following the correct invalidation flow. A global device TLB invalidation +should be added after the IOTLB invalidation. At the same time, it +uses the domain ID from the context entry. But in scalable mode, the +domain ID is in PASID table entry, not context entry. 
+ +Fixes: 7373a8cc38197 ("iommu/vt-d: Setup context and enable RID2PASID support") +Cc: stable@vger.kernel.org # v5.0+ +Signed-off-by: Sanjay Kumar +Signed-off-by: Lu Baolu +Link: https://lore.kernel.org/r/20210712071315.3416543-1-baolu.lu@linux.intel.com +Signed-off-by: Joerg Roedel +Signed-off-by: Greg Kroah-Hartman +--- + drivers/iommu/intel/iommu.c | 31 ++++++++++++++++++++++--------- + 1 file changed, 22 insertions(+), 9 deletions(-) + +--- a/drivers/iommu/intel/iommu.c ++++ b/drivers/iommu/intel/iommu.c +@@ -2434,10 +2434,11 @@ __domain_mapping(struct dmar_domain *dom + return 0; + } + +-static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn) ++static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) + { +- unsigned long flags; ++ struct intel_iommu *iommu = info->iommu; + struct context_entry *context; ++ unsigned long flags; + u16 did_old; + + if (!iommu) +@@ -2449,7 +2450,16 @@ static void domain_context_clear_one(str + spin_unlock_irqrestore(&iommu->lock, flags); + return; + } +- did_old = context_domain_id(context); ++ ++ if (sm_supported(iommu)) { ++ if (hw_pass_through && domain_type_is_si(info->domain)) ++ did_old = FLPT_DEFAULT_DID; ++ else ++ did_old = info->domain->iommu_did[iommu->seq_id]; ++ } else { ++ did_old = context_domain_id(context); ++ } ++ + context_clear_entry(context); + __iommu_flush_cache(iommu, context, sizeof(*context)); + spin_unlock_irqrestore(&iommu->lock, flags); +@@ -2467,6 +2477,8 @@ static void domain_context_clear_one(str + 0, + 0, + DMA_TLB_DSI_FLUSH); ++ ++ __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); + } + + static inline void unlink_domain_info(struct device_domain_info *info) +@@ -4436,9 +4448,9 @@ out_free_dmar: + + static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) + { +- struct intel_iommu *iommu = opaque; ++ struct device_domain_info *info = opaque; + +- domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff); ++ domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); + return 0; + } + +@@ -4448,12 +4460,13 @@ static int domain_context_clear_one_cb(s + * devices, unbinding the driver from any one of them will possibly leave + * the others unable to operate. 
+ */
+-static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
++static void domain_context_clear(struct device_domain_info *info)
+ {
+-	if (!iommu || !dev || !dev_is_pci(dev))
++	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
+ 		return;
+
+-	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
++	pci_for_each_dma_alias(to_pci_dev(info->dev),
++			       &domain_context_clear_one_cb, info);
+ }
+
+ static void __dmar_remove_one_dev_info(struct device_domain_info *info)
+@@ -4477,7 +4490,7 @@ static void __dmar_remove_one_dev_info(s
+
+ 	iommu_disable_dev_iotlb(info);
+ 	if (!dev_is_real_dma_subdevice(info->dev))
+-		domain_context_clear(iommu, info->dev);
++		domain_context_clear(info);
+ 	intel_pasid_free_table(info->dev);
+ }
+
diff --git a/queue-5.13/mm-hugetlb-fix-refs-calculation-from-unaligned-vaddr.patch b/queue-5.13/mm-hugetlb-fix-refs-calculation-from-unaligned-vaddr.patch
new file mode 100644
index 00000000000..92ff94cf323
--- /dev/null
+++ b/queue-5.13/mm-hugetlb-fix-refs-calculation-from-unaligned-vaddr.patch
@@ -0,0 +1,45 @@
+From d08af0a59684e18a51aa4bfd24c658994ea3fc5b Mon Sep 17 00:00:00 2001
+From: Joao Martins
+Date: Wed, 14 Jul 2021 21:27:11 -0700
+Subject: mm/hugetlb: fix refs calculation from unaligned @vaddr
+
+From: Joao Martins
+
+commit d08af0a59684e18a51aa4bfd24c658994ea3fc5b upstream.
+
+Commit 82e5d378b0e47 ("mm/hugetlb: refactor subpage recording")
+refactored the count of subpages but missed an edge case when @vaddr is
+not aligned to PAGE_SIZE e.g. when close to vma->vm_end. It would then
+erroneously set @refs to 0 and record_subpages_vmas() wouldn't set the
+@pages array element to its value, consequently causing the reported
+null-deref by syzbot.
+
+Fix it by aligning down @vaddr by PAGE_SIZE in @refs calculation.
+
+Link: https://lkml.kernel.org/r/20210713152440.28650-1-joao.m.martins@oracle.com
+Fixes: 82e5d378b0e47 ("mm/hugetlb: refactor subpage recording")
+Reported-by: syzbot+a3fcd59df1b372066f5a@syzkaller.appspotmail.com
+Signed-off-by: Joao Martins
+Reviewed-by: Mike Kravetz
+Cc: 
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+---
+ mm/hugetlb.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -5216,8 +5216,9 @@ long follow_hugetlb_page(struct mm_struc
+ 			continue;
+ 		}
+
+-		refs = min3(pages_per_huge_page(h) - pfn_offset,
+-			    (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
++		/* vaddr may not be aligned to PAGE_SIZE */
++		refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
++			    (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
+
+ 		if (pages || vmas)
+ 			record_subpages_vmas(mem_map_offset(page, pfn_offset),
diff --git a/queue-5.13/scsi-core-fix-bad-pointer-dereference-when-ehandler-kthread-is-invalid.patch b/queue-5.13/scsi-core-fix-bad-pointer-dereference-when-ehandler-kthread-is-invalid.patch
new file mode 100644
index 00000000000..a3c101b8943
--- /dev/null
+++ b/queue-5.13/scsi-core-fix-bad-pointer-dereference-when-ehandler-kthread-is-invalid.patch
@@ -0,0 +1,96 @@
+From 93aa71ad7379900e61c8adff6a710a4c18c7c99b Mon Sep 17 00:00:00 2001
+From: Tyrel Datwyler
+Date: Thu, 1 Jul 2021 13:56:59 -0600
+Subject: scsi: core: Fix bad pointer dereference when ehandler kthread is invalid
+
+From: Tyrel Datwyler
+
+commit 93aa71ad7379900e61c8adff6a710a4c18c7c99b upstream.
+ +Commit 66a834d09293 ("scsi: core: Fix error handling of scsi_host_alloc()") +changed the allocation logic to call put_device() to perform host cleanup +with the assumption that IDA removal and stopping the kthread would +properly be performed in scsi_host_dev_release(). However, in the unlikely +case that the error handler thread fails to spawn, shost->ehandler is set +to ERR_PTR(-ENOMEM). + +The error handler cleanup code in scsi_host_dev_release() will call +kthread_stop() if shost->ehandler != NULL which will always be the case +whether the kthread was successfully spawned or not. In the case that it +failed to spawn this has the nasty side effect of trying to dereference an +invalid pointer when kthread_stop() is called. The following splat provides +an example of this behavior in the wild: + +scsi host11: error handler thread failed to spawn, error = -4 +Kernel attempted to read user page (10c) - exploit attempt? (uid: 0) +BUG: Kernel NULL pointer dereference on read at 0x0000010c +Faulting instruction address: 0xc00000000818e9a8 +Oops: Kernel access of bad area, sig: 11 [#1] +LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries +Modules linked in: ibmvscsi(+) scsi_transport_srp dm_multipath dm_mirror dm_region + hash dm_log dm_mod fuse overlay squashfs loop +CPU: 12 PID: 274 Comm: systemd-udevd Not tainted 5.13.0-rc7 #1 +NIP: c00000000818e9a8 LR: c0000000089846e8 CTR: 0000000000007ee8 +REGS: c000000037d12ea0 TRAP: 0300 Not tainted (5.13.0-rc7) +MSR: 800000000280b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> CR: 28228228 +XER: 20040001 +CFAR: c0000000089846e4 DAR: 000000000000010c DSISR: 40000000 IRQMASK: 0 +GPR00: c0000000089846e8 c000000037d13140 c000000009cc1100 fffffffffffffffc +GPR04: 0000000000000001 0000000000000000 0000000000000000 c000000037dc0000 +GPR08: 0000000000000000 c000000037dc0000 0000000000000001 00000000fffff7ff +GPR12: 0000000000008000 c00000000a049000 c000000037d13d00 000000011134d5a0 +GPR16: 0000000000001740 c0080000190d0000 c0080000190d1740 c000000009129288 +GPR20: c000000037d13bc0 0000000000000001 c000000037d13bc0 c0080000190b7898 +GPR24: c0080000190b7708 0000000000000000 c000000033bb2c48 0000000000000000 +GPR28: c000000046b28280 0000000000000000 000000000000010c fffffffffffffffc +NIP [c00000000818e9a8] kthread_stop+0x38/0x230 +LR [c0000000089846e8] scsi_host_dev_release+0x98/0x160 +Call Trace: +[c000000033bb2c48] 0xc000000033bb2c48 (unreliable) +[c0000000089846e8] scsi_host_dev_release+0x98/0x160 +[c00000000891e960] device_release+0x60/0x100 +[c0000000087e55c4] kobject_release+0x84/0x210 +[c00000000891ec78] put_device+0x28/0x40 +[c000000008984ea4] scsi_host_alloc+0x314/0x430 +[c0080000190b38bc] ibmvscsi_probe+0x54/0xad0 [ibmvscsi] +[c000000008110104] vio_bus_probe+0xa4/0x4b0 +[c00000000892a860] really_probe+0x140/0x680 +[c00000000892aefc] driver_probe_device+0x15c/0x200 +[c00000000892b63c] device_driver_attach+0xcc/0xe0 +[c00000000892b740] __driver_attach+0xf0/0x200 +[c000000008926f28] bus_for_each_dev+0xa8/0x130 +[c000000008929ce4] driver_attach+0x34/0x50 +[c000000008928fc0] bus_add_driver+0x1b0/0x300 +[c00000000892c798] driver_register+0x98/0x1a0 +[c00000000810eb60] __vio_register_driver+0x80/0xe0 +[c0080000190b4a30] ibmvscsi_module_init+0x9c/0xdc [ibmvscsi] +[c0000000080121d0] do_one_initcall+0x60/0x2d0 +[c000000008261abc] do_init_module+0x7c/0x320 +[c000000008265700] load_module+0x2350/0x25b0 +[c000000008265cb4] __do_sys_finit_module+0xd4/0x160 +[c000000008031110] system_call_exception+0x150/0x2d0 +[c00000000800d35c] system_call_common+0xec/0x278 + +Fix this 
by nulling shost->ehandler when the kthread fails to spawn.
+
+Link: https://lore.kernel.org/r/20210701195659.3185475-1-tyreld@linux.ibm.com
+Fixes: 66a834d09293 ("scsi: core: Fix error handling of scsi_host_alloc()")
+Cc: stable@vger.kernel.org
+Reviewed-by: Ming Lei
+Signed-off-by: Tyrel Datwyler
+Signed-off-by: Martin K. Petersen
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/scsi/hosts.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/scsi/hosts.c
++++ b/drivers/scsi/hosts.c
+@@ -485,6 +485,7 @@ struct Scsi_Host *scsi_host_alloc(struct
+ 		shost_printk(KERN_WARNING, shost,
+ 			"error handler thread failed to spawn, error = %ld\n",
+ 			PTR_ERR(shost->ehandler));
++		shost->ehandler = NULL;
+ 		goto fail;
+ 	}
+
diff --git a/queue-5.13/scsi-zfcp-report-port-fc_security-as-unknown-early-during-remote-cable-pull.patch b/queue-5.13/scsi-zfcp-report-port-fc_security-as-unknown-early-during-remote-cable-pull.patch
new file mode 100644
index 00000000000..c0b4beadf8c
--- /dev/null
+++ b/queue-5.13/scsi-zfcp-report-port-fc_security-as-unknown-early-during-remote-cable-pull.patch
@@ -0,0 +1,38 @@
+From 8b3bdd99c092bbaeaa7d9eecb1a3e5dc9112002b Mon Sep 17 00:00:00 2001
+From: Steffen Maier
+Date: Fri, 2 Jul 2021 18:09:22 +0200
+Subject: scsi: zfcp: Report port fc_security as unknown early during remote cable pull
+
+From: Steffen Maier
+
+commit 8b3bdd99c092bbaeaa7d9eecb1a3e5dc9112002b upstream.
+
+On remote cable pull, a zfcp_port keeps its status and only gets
+ZFCP_STATUS_PORT_LINK_TEST added. Only after an ADISC timeout, we would
+actually start port recovery and remove ZFCP_STATUS_COMMON_UNBLOCKED which
+zfcp_sysfs_port_fc_security_show() detected and reported as "unknown"
+instead of the old and possibly stale zfcp_port->connection_info.
+
+Add a check for ZFCP_STATUS_PORT_LINK_TEST for a timely "unknown" report.
+
+Link: https://lore.kernel.org/r/20210702160922.2667874-1-maier@linux.ibm.com
+Fixes: a17c78460093 ("scsi: zfcp: report FC Endpoint Security in sysfs")
+Cc: #5.7+
+Reviewed-by: Benjamin Block
+Signed-off-by: Steffen Maier
+Signed-off-by: Martin K.
Petersen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/s390/scsi/zfcp_sysfs.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/s390/scsi/zfcp_sysfs.c ++++ b/drivers/s390/scsi/zfcp_sysfs.c +@@ -487,6 +487,7 @@ static ssize_t zfcp_sysfs_port_fc_securi + if (0 == (status & ZFCP_STATUS_COMMON_OPEN) || + 0 == (status & ZFCP_STATUS_COMMON_UNBLOCKED) || + 0 == (status & ZFCP_STATUS_PORT_PHYS_OPEN) || ++ 0 != (status & ZFCP_STATUS_PORT_LINK_TEST) || + 0 != (status & ZFCP_STATUS_COMMON_ERP_FAILED) || + 0 != (status & ZFCP_STATUS_COMMON_ACCESS_BOXED)) + i = sprintf(buf, "unknown\n"); diff --git a/queue-5.13/series b/queue-5.13/series index 2799920be80..532b10d0c0b 100644 --- a/queue-5.13/series +++ b/queue-5.13/series @@ -11,3 +11,23 @@ kvm-nsvm-check-the-value-written-to-msr_vm_hsave_pa.patch kvm-x86-disable-hardware-breakpoints-unconditionally-before-kvm_x86-run.patch kvm-svm-smi-interception-must-not-skip-the-instruction.patch kvm-svm-remove-init-intercept-handler.patch +scsi-core-fix-bad-pointer-dereference-when-ehandler-kthread-is-invalid.patch +scsi-zfcp-report-port-fc_security-as-unknown-early-during-remote-cable-pull.patch +iommu-vt-d-global-devtlb-flush-when-present-context-entry-changed.patch +iommu-vt-d-fix-clearing-real-dma-device-s-scalable-mode-context-entries.patch +tracing-do-not-reference-char-as-a-string-in-histograms.patch +drm-amdgpu-add-another-renoir-did.patch +drm-i915-gtt-drop-the-page-table-optimisation.patch +drm-i915-gt-fix-edeadlk-handling-regression.patch +cgroup-verify-that-source-is-a-string.patch +fbmem-do-not-delete-the-mode-that-is-still-in-use.patch +edac-igen6-fix-core-dependency-again.patch +mm-hugetlb-fix-refs-calculation-from-unaligned-vaddr.patch +arm64-avoid-premature-usercopy-failure.patch +io_uring-use-right-task-for-exiting-checks.patch +btrfs-properly-split-extent_map-for-req_op_zone_append.patch +btrfs-zoned-fix-types-for-u64-division-in-btrfs_reclaim_bgs_work.patch +btrfs-fix-deadlock-with-concurrent-chunk-allocations-involving-system-chunks.patch +btrfs-rework-chunk-allocation-to-avoid-exhaustion-of-the-system-chunk-array.patch +btrfs-don-t-block-if-we-can-t-acquire-the-reclaim-lock.patch +btrfs-zoned-fix-wrong-mutex-unlock-on-failure-to-allocate-log-root-tree.patch diff --git a/queue-5.13/tracing-do-not-reference-char-as-a-string-in-histograms.patch b/queue-5.13/tracing-do-not-reference-char-as-a-string-in-histograms.patch new file mode 100644 index 00000000000..5b36d4fdefe --- /dev/null +++ b/queue-5.13/tracing-do-not-reference-char-as-a-string-in-histograms.patch @@ -0,0 +1,105 @@ +From 704adfb5a9978462cd861f170201ae2b5e3d3a80 Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (VMware)" +Date: Thu, 15 Jul 2021 00:02:06 -0400 +Subject: tracing: Do not reference char * as a string in histograms + +From: Steven Rostedt (VMware) + +commit 704adfb5a9978462cd861f170201ae2b5e3d3a80 upstream. + +The histogram logic was allowing events with char * pointers to be used as +normal strings. But it was easy to crash the kernel with: + + # echo 'hist:keys=filename' > events/syscalls/sys_enter_openat/trigger + +And open some files, and boom! 
+ + BUG: unable to handle page fault for address: 00007f2ced0c3280 + #PF: supervisor read access in kernel mode + #PF: error_code(0x0000) - not-present page + PGD 1173fa067 P4D 1173fa067 PUD 1171b6067 PMD 1171dd067 PTE 0 + Oops: 0000 [#1] PREEMPT SMP + CPU: 6 PID: 1810 Comm: cat Not tainted 5.13.0-rc5-test+ #61 + Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 +v03.03 07/14/2016 + RIP: 0010:strlen+0x0/0x20 + Code: f6 82 80 2a 0b a9 20 74 11 0f b6 50 01 48 83 c0 01 f6 82 80 2a 0b +a9 20 75 ef c3 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 <80> 3f 00 74 +10 48 89 f8 48 83 c0 01 80 38 00 75 f7 48 29 f8 c3 + + RSP: 0018:ffffbdbf81567b50 EFLAGS: 00010246 + RAX: 0000000000000003 RBX: ffff93815cdb3800 RCX: ffff9382401a22d0 + RDX: 0000000000000100 RSI: 0000000000000000 RDI: 00007f2ced0c3280 + RBP: 0000000000000100 R08: ffff9382409ff074 R09: ffffbdbf81567c98 + R10: ffff9382409ff074 R11: 0000000000000000 R12: ffff9382409ff074 + R13: 0000000000000001 R14: ffff93815a744f00 R15: 00007f2ced0c3280 + FS: 00007f2ced0f8580(0000) GS:ffff93825a800000(0000) +knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 00007f2ced0c3280 CR3: 0000000107069005 CR4: 00000000001706e0 + Call Trace: + event_hist_trigger+0x463/0x5f0 + ? find_held_lock+0x32/0x90 + ? sched_clock_cpu+0xe/0xd0 + ? lock_release+0x155/0x440 + ? kernel_init_free_pages+0x6d/0x90 + ? preempt_count_sub+0x9b/0xd0 + ? kernel_init_free_pages+0x6d/0x90 + ? get_page_from_freelist+0x12c4/0x1680 + ? __rb_reserve_next+0xe5/0x460 + ? ring_buffer_lock_reserve+0x12a/0x3f0 + event_triggers_call+0x52/0xe0 + ftrace_syscall_enter+0x264/0x2c0 + syscall_trace_enter.constprop.0+0x1ee/0x210 + do_syscall_64+0x1c/0x80 + entry_SYSCALL_64_after_hwframe+0x44/0xae + +Where it triggered a fault on strlen(key) where key was the filename. + +The reason is that filename is a char * to user space, and the histogram +code just blindly dereferenced it, with obvious bad results. + +I originally tried to use strncpy_from_user/kernel_nofault() but found +that there's other places that its dereferenced and not worth the effort. + +Just do not allow "char *" to act like strings. + +Link: https://lkml.kernel.org/r/20210715000206.025df9d2@rorschach.local.home + +Cc: Ingo Molnar +Cc: Andrew Morton +Cc: Masami Hiramatsu +Cc: Tzvetomir Stoyanov +Cc: stable@vger.kernel.org +Acked-by: Namhyung Kim +Acked-by: Tom Zanussi +Fixes: 79e577cbce4c4 ("tracing: Support string type key properly") +Fixes: 5967bd5c4239 ("tracing: Let filter_assign_type() detect FILTER_PTR_STRING") +Signed-off-by: Steven Rostedt (VMware) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/trace_events_hist.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/kernel/trace/trace_events_hist.c ++++ b/kernel/trace/trace_events_hist.c +@@ -1689,7 +1689,9 @@ static struct hist_field *create_hist_fi + if (WARN_ON_ONCE(!field)) + goto out; + +- if (is_string_field(field)) { ++ /* Pointers to strings are just pointers and dangerous to dereference */ ++ if (is_string_field(field) && ++ (field->filter_type != FILTER_PTR_STRING)) { + flags |= HIST_FIELD_FL_STRING; + + hist_field->size = MAX_FILTER_STR_VAL; +@@ -4495,8 +4497,6 @@ static inline void add_to_key(char *comp + field = key_field->field; + if (field->filter_type == FILTER_DYN_STRING) + size = *(u32 *)(rec + field->offset) >> 16; +- else if (field->filter_type == FILTER_PTR_STRING) +- size = strlen(key); + else if (field->filter_type == FILTER_STATIC_STRING) + size = field->size; + -- 2.47.3