From 177ca55679feb064fd55a43f358664127ff05f3e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 8 Jun 2021 16:06:11 +0200 Subject: [PATCH] 5.10-stable patches added patches: btrfs-abort-in-rename_exchange-if-we-fail-to-insert-the-second-ref.patch btrfs-fix-deadlock-when-cloning-inline-extents-and-low-on-available-space.patch btrfs-fix-error-handling-in-btrfs_del_csums.patch btrfs-fix-fsync-failure-and-transaction-abort-after-writes-to-prealloc-extents.patch btrfs-fixup-error-handling-in-fixup_inode_link_counts.patch btrfs-mark-ordered-extent-and-inode-with-error-if-we-fail-to-finish.patch btrfs-return-errors-from-btrfs_del_csums-in-cleanup_ref_head.patch drm-amdgpu-don-t-query-ce-and-ue-errors.patch drm-amdgpu-make-sure-we-unpin-the-uvd-bo.patch mm-debug_vm_pgtable-fix-alignment-for-pmd-pud_advanced_tests.patch mm-page_alloc-fix-counting-of-free-pages-after-take-off-from-buddy.patch nfc-fix-null-ptr-dereference-in-llcp_sock_getname-after-failed-connect.patch ocfs2-fix-data-corruption-by-fallocate.patch powerpc-kprobes-fix-validation-of-prefixed-instructions-across-page-boundary.patch x86-apic-mark-_all_-legacy-interrupts-when-io-apic-is-missing.patch x86-cpufeatures-force-disable-x86_feature_enqcmd-and-remove-update_pasid.patch x86-sev-check-sme-sev-support-in-cpuid-first.patch --- ...-if-we-fail-to-insert-the-second-ref.patch | 56 +++ ...e-extents-and-low-on-available-space.patch | 123 ++++++ ...ix-error-handling-in-btrfs_del_csums.patch | 93 ++++ ...ort-after-writes-to-prealloc-extents.patch | 407 ++++++++++++++++++ ...-handling-in-fixup_inode_link_counts.patch | 85 ++++ ...node-with-error-if-we-fail-to-finish.patch | 57 +++ ...-btrfs_del_csums-in-cleanup_ref_head.patch | 35 ++ ...-amdgpu-don-t-query-ce-and-ue-errors.patch | 63 +++ ...amdgpu-make-sure-we-unpin-the-uvd-bo.patch | 35 ++ ...alignment-for-pmd-pud_advanced_tests.patch | 64 +++ ...free-pages-after-take-off-from-buddy.patch | 60 +++ ...cp_sock_getname-after-failed-connect.patch | 59 +++ 
...fs2-fix-data-corruption-by-fallocate.patch | 148 +++++++ ...ed-instructions-across-page-boundary.patch | 53 +++ queue-5.10/series | 17 + ...y-interrupts-when-io-apic-is-missing.patch | 95 ++++ ...ature_enqcmd-and-remove-update_pasid.patch | 178 ++++++++ ...check-sme-sev-support-in-cpuid-first.patch | 69 +++ 18 files changed, 1697 insertions(+) create mode 100644 queue-5.10/btrfs-abort-in-rename_exchange-if-we-fail-to-insert-the-second-ref.patch create mode 100644 queue-5.10/btrfs-fix-deadlock-when-cloning-inline-extents-and-low-on-available-space.patch create mode 100644 queue-5.10/btrfs-fix-error-handling-in-btrfs_del_csums.patch create mode 100644 queue-5.10/btrfs-fix-fsync-failure-and-transaction-abort-after-writes-to-prealloc-extents.patch create mode 100644 queue-5.10/btrfs-fixup-error-handling-in-fixup_inode_link_counts.patch create mode 100644 queue-5.10/btrfs-mark-ordered-extent-and-inode-with-error-if-we-fail-to-finish.patch create mode 100644 queue-5.10/btrfs-return-errors-from-btrfs_del_csums-in-cleanup_ref_head.patch create mode 100644 queue-5.10/drm-amdgpu-don-t-query-ce-and-ue-errors.patch create mode 100644 queue-5.10/drm-amdgpu-make-sure-we-unpin-the-uvd-bo.patch create mode 100644 queue-5.10/mm-debug_vm_pgtable-fix-alignment-for-pmd-pud_advanced_tests.patch create mode 100644 queue-5.10/mm-page_alloc-fix-counting-of-free-pages-after-take-off-from-buddy.patch create mode 100644 queue-5.10/nfc-fix-null-ptr-dereference-in-llcp_sock_getname-after-failed-connect.patch create mode 100644 queue-5.10/ocfs2-fix-data-corruption-by-fallocate.patch create mode 100644 queue-5.10/powerpc-kprobes-fix-validation-of-prefixed-instructions-across-page-boundary.patch create mode 100644 queue-5.10/x86-apic-mark-_all_-legacy-interrupts-when-io-apic-is-missing.patch create mode 100644 queue-5.10/x86-cpufeatures-force-disable-x86_feature_enqcmd-and-remove-update_pasid.patch create mode 100644 queue-5.10/x86-sev-check-sme-sev-support-in-cpuid-first.patch diff --git 
a/queue-5.10/btrfs-abort-in-rename_exchange-if-we-fail-to-insert-the-second-ref.patch b/queue-5.10/btrfs-abort-in-rename_exchange-if-we-fail-to-insert-the-second-ref.patch new file mode 100644 index 00000000000..0e01a84dad3 --- /dev/null +++ b/queue-5.10/btrfs-abort-in-rename_exchange-if-we-fail-to-insert-the-second-ref.patch @@ -0,0 +1,56 @@ +From dc09ef3562726cd520c8338c1640872a60187af5 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 19 May 2021 14:04:21 -0400 +Subject: btrfs: abort in rename_exchange if we fail to insert the second ref + +From: Josef Bacik + +commit dc09ef3562726cd520c8338c1640872a60187af5 upstream. + +Error injection stress uncovered a problem where we'd leave a dangling +inode ref if we failed during a rename_exchange. This happens because +we insert the inode ref for one side of the rename, and then for the +other side. If this second inode ref insert fails we'll leave the first +one dangling and leave a corrupt file system behind. Fix this by +aborting if we did the insert for the first inode ref. + +CC: stable@vger.kernel.org # 4.9+ +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/inode.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -8890,6 +8890,7 @@ static int btrfs_rename_exchange(struct + int ret2; + bool root_log_pinned = false; + bool dest_log_pinned = false; ++ bool need_abort = false; + + /* we only allow rename subvolume link between subvolumes */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) +@@ -8946,6 +8947,7 @@ static int btrfs_rename_exchange(struct + old_idx); + if (ret) + goto out_fail; ++ need_abort = true; + } + + /* And now for the dest. 
*/ +@@ -8961,8 +8963,11 @@ static int btrfs_rename_exchange(struct + new_ino, + btrfs_ino(BTRFS_I(old_dir)), + new_idx); +- if (ret) ++ if (ret) { ++ if (need_abort) ++ btrfs_abort_transaction(trans, ret); + goto out_fail; ++ } + } + + /* Update inode version and ctime/mtime. */ diff --git a/queue-5.10/btrfs-fix-deadlock-when-cloning-inline-extents-and-low-on-available-space.patch b/queue-5.10/btrfs-fix-deadlock-when-cloning-inline-extents-and-low-on-available-space.patch new file mode 100644 index 00000000000..3d2ed7107e9 --- /dev/null +++ b/queue-5.10/btrfs-fix-deadlock-when-cloning-inline-extents-and-low-on-available-space.patch @@ -0,0 +1,123 @@ +From 76a6d5cd74479e7ec8a7f9a29bce63d5549b6b2e Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 25 May 2021 11:05:28 +0100 +Subject: btrfs: fix deadlock when cloning inline extents and low on available space + +From: Filipe Manana + +commit 76a6d5cd74479e7ec8a7f9a29bce63d5549b6b2e upstream. + +There are a few cases where cloning an inline extent requires copying data +into a page of the destination inode. For these cases we are allocating +the required data and metadata space while holding a leaf locked. This can +result in a deadlock when we are low on available space because allocating +the space may flush delalloc and two deadlock scenarios can happen: + +1) When starting writeback for an inode with a very small dirty range that + fits in an inline extent, we deadlock during the writeback when trying + to insert the inline extent, at cow_file_range_inline(), if the extent + is going to be located in the leaf for which we are already holding a + read lock; + +2) After successfully starting writeback, for non-inline extent cases, + the async reclaim thread will hang waiting for an ordered extent to + complete if the ordered extent completion needs to modify the leaf + for which the clone task is holding a read lock (for adding or + replacing file extent items). 
So the cloning task will wait forever + on the async reclaim thread to make progress, which in turn is + waiting for the ordered extent completion which in turn is waiting + to acquire a write lock on the same leaf. + +So fix this by making sure we release the path (and therefore the leaf) +every time we need to copy the inline extent's data into a page of the +destination inode, as by that time we do not need to have the leaf locked. + +Fixes: 05a5a7621ce66c ("Btrfs: implement full reflink support for inline extents") +CC: stable@vger.kernel.org # 5.10+ +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/reflink.c | 38 ++++++++++++++++++++++---------------- + 1 file changed, 22 insertions(+), 16 deletions(-) + +--- a/fs/btrfs/reflink.c ++++ b/fs/btrfs/reflink.c +@@ -207,10 +207,7 @@ static int clone_copy_inline_extent(stru + * inline extent's data to the page. + */ + ASSERT(key.offset > 0); +- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, +- inline_data, size, datal, +- comp_type); +- goto out; ++ goto copy_to_page; + } + } else if (i_size_read(dst) <= datal) { + struct btrfs_file_extent_item *ei; +@@ -226,13 +223,10 @@ static int clone_copy_inline_extent(stru + BTRFS_FILE_EXTENT_INLINE) + goto copy_inline_extent; + +- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, +- inline_data, size, datal, comp_type); +- goto out; ++ goto copy_to_page; + } + + copy_inline_extent: +- ret = 0; + /* + * We have no extent items, or we have an extent at offset 0 which may + * or may not be inlined. All these cases are dealt the same way. +@@ -244,11 +238,13 @@ copy_inline_extent: + * clone. Deal with all these cases by copying the inline extent + * data into the respective page at the destination inode. 
+ */ +- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, +- inline_data, size, datal, comp_type); +- goto out; ++ goto copy_to_page; + } + ++ /* ++ * Release path before starting a new transaction so we don't hold locks ++ * that would confuse lockdep. ++ */ + btrfs_release_path(path); + /* + * If we end up here it means were copy the inline extent into a leaf +@@ -282,11 +278,6 @@ copy_inline_extent: + out: + if (!ret && !trans) { + /* +- * Release path before starting a new transaction so we don't +- * hold locks that would confuse lockdep. +- */ +- btrfs_release_path(path); +- /* + * No transaction here means we copied the inline extent into a + * page of the destination inode. + * +@@ -306,6 +297,21 @@ out: + *trans_out = trans; + + return ret; ++ ++copy_to_page: ++ /* ++ * Release our path because we don't need it anymore and also because ++ * copy_inline_to_page() needs to reserve data and metadata, which may ++ * need to flush delalloc when we are low on available space and ++ * therefore cause a deadlock if writeback of an inline extent needs to ++ * write to the same leaf or an ordered extent completion needs to write ++ * to the same leaf. ++ */ ++ btrfs_release_path(path); ++ ++ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, ++ inline_data, size, datal, comp_type); ++ goto out; + } + + /** diff --git a/queue-5.10/btrfs-fix-error-handling-in-btrfs_del_csums.patch b/queue-5.10/btrfs-fix-error-handling-in-btrfs_del_csums.patch new file mode 100644 index 00000000000..bb1d67fbd0a --- /dev/null +++ b/queue-5.10/btrfs-fix-error-handling-in-btrfs_del_csums.patch @@ -0,0 +1,93 @@ +From b86652be7c83f70bf406bed18ecf55adb9bfb91b Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 19 May 2021 10:52:45 -0400 +Subject: btrfs: fix error handling in btrfs_del_csums + +From: Josef Bacik + +commit b86652be7c83f70bf406bed18ecf55adb9bfb91b upstream. 
+ +Error injection stress would sometimes fail with checksums on disk that +did not have a corresponding extent. This occurred because the pattern +in btrfs_del_csums was + + while (1) { + ret = btrfs_search_slot(); + if (ret < 0) + break; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; + +If we got an error from btrfs_search_slot we'd clear the error because +we were breaking instead of goto out. Instead of using goto out, simply +handle the cases where we may leave a random value in ret, and get rid +of the + + ret = 0; +out: + +pattern and simply allow break to have the proper error reporting. With +this fix we properly abort the transaction and do not commit thinking we +successfully deleted the csum. + +Reviewed-by: Qu Wenruo +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/file-item.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +--- a/fs/btrfs/file-item.c ++++ b/fs/btrfs/file-item.c +@@ -690,7 +690,7 @@ int btrfs_del_csums(struct btrfs_trans_h + u64 end_byte = bytenr + len; + u64 csum_end; + struct extent_buffer *leaf; +- int ret; ++ int ret = 0; + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + int blocksize_bits = fs_info->sb->s_blocksize_bits; + +@@ -709,6 +709,7 @@ int btrfs_del_csums(struct btrfs_trans_h + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { ++ ret = 0; + if (path->slots[0] == 0) + break; + path->slots[0]--; +@@ -765,7 +766,7 @@ int btrfs_del_csums(struct btrfs_trans_h + ret = btrfs_del_items(trans, root, path, + path->slots[0], del_nr); + if (ret) +- goto out; ++ break; + if (key.offset == bytenr) + break; + } else if (key.offset < bytenr && csum_end > end_byte) { +@@ -809,8 +810,9 @@ int btrfs_del_csums(struct btrfs_trans_h + ret = btrfs_split_item(trans, root, path, &key, offset); + if (ret && ret != -EAGAIN) { + 
btrfs_abort_transaction(trans, ret); +- goto out; ++ break; + } ++ ret = 0; + + key.offset = end_byte - 1; + } else { +@@ -820,8 +822,6 @@ int btrfs_del_csums(struct btrfs_trans_h + } + btrfs_release_path(path); + } +- ret = 0; +-out: + btrfs_free_path(path); + return ret; + } diff --git a/queue-5.10/btrfs-fix-fsync-failure-and-transaction-abort-after-writes-to-prealloc-extents.patch b/queue-5.10/btrfs-fix-fsync-failure-and-transaction-abort-after-writes-to-prealloc-extents.patch new file mode 100644 index 00000000000..dcf24130d1c --- /dev/null +++ b/queue-5.10/btrfs-fix-fsync-failure-and-transaction-abort-after-writes-to-prealloc-extents.patch @@ -0,0 +1,407 @@ +From ea7036de0d36c4e6c9508f68789e9567d514333a Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 24 May 2021 11:35:53 +0100 +Subject: btrfs: fix fsync failure and transaction abort after writes to prealloc extents + +From: Filipe Manana + +commit ea7036de0d36c4e6c9508f68789e9567d514333a upstream. + +When doing a series of partial writes to different ranges of preallocated +extents with transaction commits and fsyncs in between, we can end up with +checksum items with overlapping ranges in a log tree. This causes an fsync to fail with -EIO and +abort the transaction, turning the filesystem to RO mode, when syncing the +log. + +For this to happen, we need to have a full fsync of a file following one +or more fast fsyncs. + +The following example reproduces the problem and explains how it happens: + + $ mkfs.btrfs -f /dev/sdc + $ mount /dev/sdc /mnt + + # Create our test file with 2 preallocated extents. Leave a 1M hole + # between them to ensure that we get two file extent items that will + # never be merged into a single one. The extents are contiguous on disk, + # which will later result in the checksums for their data to be merged + # into a single checksum item in the csums btree. 
+ # + $ xfs_io -f \ + -c "falloc 0 1M" \ + -c "falloc 3M 3M" \ + /mnt/foobar + + # Now write to the second extent and leave only 1M of it as unwritten, + # which corresponds to the file range [4M, 5M[. + # + # Then fsync the file to flush delalloc and to clear full sync flag from + # the inode, so that a future fsync will use the fast code path. + # + # After the writeback triggered by the fsync we have 3 file extent items + # that point to the second extent we previously allocated: + # + # 1) One file extent item of type BTRFS_FILE_EXTENT_REG that covers the + # file range [3M, 4M[ + # + # 2) One file extent item of type BTRFS_FILE_EXTENT_PREALLOC that covers + # the file range [4M, 5M[ + # + # 3) One file extent item of type BTRFS_FILE_EXTENT_REG that covers the + # file range [5M, 6M[ + # + # All these file extent items have a generation of 6, which is the ID of + # the transaction where they were created. The split of the original file + # extent item is done at btrfs_mark_extent_written() when ordered extents + # complete for the file ranges [3M, 4M[ and [5M, 6M[. + # + $ xfs_io -c "pwrite -S 0xab 3M 1M" \ + -c "pwrite -S 0xef 5M 1M" \ + -c "fsync" \ + /mnt/foobar + + # Commit the current transaction. This wipes out the log tree created by + # the previous fsync. + sync + + # Now write to the unwritten range of the second extent we allocated, + # corresponding to the file range [4M, 5M[, and fsync the file, which + # triggers the fast fsync code path. + # + # The fast fsync code path sees that there is a new extent map covering + # the file range [4M, 5M[ and therefore it will log a checksum item + # covering the range [1M, 2M[ of the second extent we allocated. + # + # Also, after the fsync finishes we no longer have the 3 file extent + # items that pointed to 3 sections of the second extent we allocated. 
+ # Instead we end up with a single file extent item pointing to the whole + # extent, with a type of BTRFS_FILE_EXTENT_REG and a generation of 7 (the + # current transaction ID). This is due to the file extent item merging we + # do when completing ordered extents into ranges that point to unwritten + # (preallocated) extents. This merging is done at + # btrfs_mark_extent_written(). + # + $ xfs_io -c "pwrite -S 0xcd 4M 1M" \ + -c "fsync" \ + /mnt/foobar + + # Now do some write to our file outside the range of the second extent + # that we allocated with fallocate() and truncate the file size from 6M + # down to 5M. + # + # The truncate operation sets the full sync runtime flag on the inode, + # forcing the next fsync to use the slow code path. It also changes the + # length of the second file extent item so that it represents the file + # range [3M, 5M[ and not the range [3M, 6M[ anymore. + # + # Finally fsync the file. Since this is a fsync that triggers the slow + # code path, it will remove all items associated to the inode from the + # log tree and then it will scan for file extent items in the + # fs/subvolume tree that have a generation matching the current + # transaction ID, which is 7. This means it will log 2 file extent + # items: + # + # 1) One for the first extent we allocated, covering the file range + # [0, 1M[ + # + # 2) Another for the first 2M of the second extent we allocated, + # covering the file range [3M, 5M[ + # + # When logging the first file extent item we log a single checksum item + # that has all the checksums for the entire extent. + # + # When logging the second file extent item, we also lookup for the + # checksums that are associated with the range [0, 2M[ of the second + # extent we allocated (file range [3M, 5M[), and then we log them with + # btrfs_csum_file_blocks(). 
However that results in ending up with a log + # that has two checksum items with ranges that overlap: + # + # 1) One for the range [1M, 2M[ of the second extent we allocated, + # corresponding to the file range [4M, 5M[, which we logged in the + # previous fsync that used the fast code path; + # + # 2) One for the ranges [0, 1M[ and [0, 2M[ of the first and second + # extents, respectively, corresponding to the file ranges [0, 1M[ + # and [3M, 5M[. This one was added during this last fsync that uses + # the slow code path and overlaps with the previous one logged by + # the previous fast fsync. + # + # This happens because when logging the checksums for the second + # extent, we notice they start at an offset that matches the end of the + # checksums item that we logged for the first extent, and because both + # extents are contiguous on disk, btrfs_csum_file_blocks() decides to + # extend that existing checksums item and append the checksums for the + # second extent to this item. The end result is we end up with two + # checksum items in the log tree that have overlapping ranges, as + # listed before, resulting in the fsync failing with -EIO and aborting + # the transaction, turning the filesystem into RO mode. + # + $ xfs_io -c "pwrite -S 0xff 0 1M" \ + -c "truncate 5M" \ + -c "fsync" \ + /mnt/foobar + fsync: Input/output error + +After running the example, dmesg/syslog shows the tree checker complained +about the checksum items with overlapping ranges and we aborted the +transaction: + + $ dmesg + (...) 
+ [756289.557487] BTRFS critical (device sdc): corrupt leaf: root=18446744073709551610 block=30720000 slot=5, csum end range (16777216) goes beyond the start range (15728640) of the next csum item + [756289.560583] BTRFS info (device sdc): leaf 30720000 gen 7 total ptrs 7 free space 11677 owner 18446744073709551610 + [756289.562435] BTRFS info (device sdc): refs 2 lock_owner 0 current 2303929 + [756289.563654] item 0 key (257 1 0) itemoff 16123 itemsize 160 + [756289.564649] inode generation 6 size 5242880 mode 100600 + [756289.565636] item 1 key (257 12 256) itemoff 16107 itemsize 16 + [756289.566694] item 2 key (257 108 0) itemoff 16054 itemsize 53 + [756289.567725] extent data disk bytenr 13631488 nr 1048576 + [756289.568697] extent data offset 0 nr 1048576 ram 1048576 + [756289.569689] item 3 key (257 108 1048576) itemoff 16001 itemsize 53 + [756289.570682] extent data disk bytenr 0 nr 0 + [756289.571363] extent data offset 0 nr 2097152 ram 2097152 + [756289.572213] item 4 key (257 108 3145728) itemoff 15948 itemsize 53 + [756289.573246] extent data disk bytenr 14680064 nr 3145728 + [756289.574121] extent data offset 0 nr 2097152 ram 3145728 + [756289.574993] item 5 key (18446744073709551606 128 13631488) itemoff 12876 itemsize 3072 + [756289.576113] item 6 key (18446744073709551606 128 15728640) itemoff 11852 itemsize 1024 + [756289.577286] BTRFS error (device sdc): block=30720000 write time tree block corruption detected + [756289.578644] ------------[ cut here ]------------ + [756289.579376] WARNING: CPU: 0 PID: 2303929 at fs/btrfs/disk-io.c:465 csum_one_extent_buffer+0xed/0x100 [btrfs] + [756289.580857] Modules linked in: btrfs dm_zero dm_dust loop dm_snapshot (...) 
+ [756289.591534] CPU: 0 PID: 2303929 Comm: xfs_io Tainted: G W 5.12.0-rc8-btrfs-next-87 #1 + [756289.592580] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 + [756289.594161] RIP: 0010:csum_one_extent_buffer+0xed/0x100 [btrfs] + [756289.595122] Code: 5d c3 e8 76 60 (...) + [756289.597509] RSP: 0018:ffffb51b416cb898 EFLAGS: 00010282 + [756289.598142] RAX: 0000000000000000 RBX: fffff02b8a365bc0 RCX: 0000000000000000 + [756289.598970] RDX: 0000000000000000 RSI: ffffffffa9112421 RDI: 00000000ffffffff + [756289.599798] RBP: ffffa06500880000 R08: 0000000000000000 R09: 0000000000000000 + [756289.600619] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 + [756289.601456] R13: ffffa0652b1d8980 R14: ffffa06500880000 R15: 0000000000000000 + [756289.602278] FS: 00007f08b23c9800(0000) GS:ffffa0682be00000(0000) knlGS:0000000000000000 + [756289.603217] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + [756289.603892] CR2: 00005652f32d0138 CR3: 000000025d616003 CR4: 0000000000370ef0 + [756289.604725] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + [756289.605563] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + [756289.606400] Call Trace: + [756289.606704] btree_csum_one_bio+0x244/0x2b0 [btrfs] + [756289.607313] btrfs_submit_metadata_bio+0xb7/0x100 [btrfs] + [756289.608040] submit_one_bio+0x61/0x70 [btrfs] + [756289.608587] btree_write_cache_pages+0x587/0x610 [btrfs] + [756289.609258] ? free_debug_processing+0x1d5/0x240 + [756289.609812] ? __module_address+0x28/0xf0 + [756289.610298] ? lock_acquire+0x1a0/0x3e0 + [756289.610754] ? lock_acquired+0x19f/0x430 + [756289.611220] ? lock_acquire+0x1a0/0x3e0 + [756289.611675] do_writepages+0x43/0xf0 + [756289.612101] ? 
__filemap_fdatawrite_range+0xa4/0x100 + [756289.612800] __filemap_fdatawrite_range+0xc5/0x100 + [756289.613393] btrfs_write_marked_extents+0x68/0x160 [btrfs] + [756289.614085] btrfs_sync_log+0x21c/0xf20 [btrfs] + [756289.614661] ? finish_wait+0x90/0x90 + [756289.615096] ? __mutex_unlock_slowpath+0x45/0x2a0 + [756289.615661] ? btrfs_log_inode_parent+0x3c9/0xdc0 [btrfs] + [756289.616338] ? lock_acquire+0x1a0/0x3e0 + [756289.616801] ? lock_acquired+0x19f/0x430 + [756289.617284] ? lock_acquire+0x1a0/0x3e0 + [756289.617750] ? lock_release+0x214/0x470 + [756289.618221] ? lock_acquired+0x19f/0x430 + [756289.618704] ? dput+0x20/0x4a0 + [756289.619079] ? dput+0x20/0x4a0 + [756289.619452] ? lockref_put_or_lock+0x9/0x30 + [756289.619969] ? lock_release+0x214/0x470 + [756289.620445] ? lock_release+0x214/0x470 + [756289.620924] ? lock_release+0x214/0x470 + [756289.621415] btrfs_sync_file+0x46a/0x5b0 [btrfs] + [756289.621982] do_fsync+0x38/0x70 + [756289.622395] __x64_sys_fsync+0x10/0x20 + [756289.622907] do_syscall_64+0x33/0x80 + [756289.623438] entry_SYSCALL_64_after_hwframe+0x44/0xae + [756289.624063] RIP: 0033:0x7f08b27fbb7b + [756289.624588] Code: 0f 05 48 3d 00 (...) 
+ [756289.626760] RSP: 002b:00007ffe2583f940 EFLAGS: 00000293 ORIG_RAX: 000000000000004a + [756289.627639] RAX: ffffffffffffffda RBX: 00005652f32cd0f0 RCX: 00007f08b27fbb7b + [756289.628464] RDX: 00005652f32cbca0 RSI: 00005652f32cd110 RDI: 0000000000000003 + [756289.629323] RBP: 00005652f32cd110 R08: 0000000000000000 R09: 00007f08b28c4be0 + [756289.630172] R10: fffffffffffff39a R11: 0000000000000293 R12: 0000000000000001 + [756289.631007] R13: 00005652f32cd0f0 R14: 0000000000000001 R15: 00005652f32cc480 + [756289.631819] irq event stamp: 0 + [756289.632188] hardirqs last enabled at (0): [<0000000000000000>] 0x0 + [756289.632911] hardirqs last disabled at (0): [] copy_process+0x879/0x1cc0 + [756289.633893] softirqs last enabled at (0): [] copy_process+0x879/0x1cc0 + [756289.634871] softirqs last disabled at (0): [<0000000000000000>] 0x0 + [756289.635606] ---[ end trace 0a039fdc16ff3fef ]--- + [756289.636179] BTRFS: error (device sdc) in btrfs_sync_log:3136: errno=-5 IO failure + [756289.637082] BTRFS info (device sdc): forced readonly + +Having checksum items covering ranges that overlap is dangerous as in some +cases it can lead to having extent ranges for which we miss checksums +after log replay or getting the wrong checksum item. 
There were some fixes +in the past for bugs that resulted in this problem, and were explained and +fixed by the following commits: + + 27b9a8122ff71a ("Btrfs: fix csum tree corruption, duplicate and outdated checksums") + b84b8390d6009c ("Btrfs: fix file read corruption after extent cloning and fsync") + 40e046acbd2f36 ("Btrfs: fix missing data checksums after replaying a log tree") + e289f03ea79bbc ("btrfs: fix corrupt log due to concurrent fsync of inodes with shared extents") + +Fix the issue by making btrfs_csum_file_blocks() take into account the +start offset of the next checksum item when it decides to extend an +existing checksum item, so that it never extends the checksum to end at a +range that goes beyond the start range of the next checksum item. + +When we cannot access the next checksum item without releasing the path, +simply drop the optimization of extending the previous checksum item and +fall back to inserting a new checksum item - this happens rarely and the +optimization is not significant enough for a log tree in order to justify +the extra complexity, as it would only save a few bytes (the size of a +struct btrfs_item) of leaf space. + +This behaviour is only needed when inserting into a log tree because +for the regular checksums tree we never have a case where we try to +insert a range of checksums that overlap with a range that was previously +inserted. + +A test case for fstests will follow soon. 
+ +Reported-by: Philipp Fent +Link: https://lore.kernel.org/linux-btrfs/93c4600e-5263-5cba-adf0-6f47526e7561@in.tum.de/ +CC: stable@vger.kernel.org # 5.4+ +Tested-by: Anand Jain +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/file-item.c | 98 +++++++++++++++++++++++++++++++++++++++------------ + 1 file changed, 76 insertions(+), 22 deletions(-) + +--- a/fs/btrfs/file-item.c ++++ b/fs/btrfs/file-item.c +@@ -826,6 +826,37 @@ int btrfs_del_csums(struct btrfs_trans_h + return ret; + } + ++static int find_next_csum_offset(struct btrfs_root *root, ++ struct btrfs_path *path, ++ u64 *next_offset) ++{ ++ const u32 nritems = btrfs_header_nritems(path->nodes[0]); ++ struct btrfs_key found_key; ++ int slot = path->slots[0] + 1; ++ int ret; ++ ++ if (nritems == 0 || slot >= nritems) { ++ ret = btrfs_next_leaf(root, path); ++ if (ret < 0) { ++ return ret; ++ } else if (ret > 0) { ++ *next_offset = (u64)-1; ++ return 0; ++ } ++ slot = path->slots[0]; ++ } ++ ++ btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); ++ ++ if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || ++ found_key.type != BTRFS_EXTENT_CSUM_KEY) ++ *next_offset = (u64)-1; ++ else ++ *next_offset = found_key.offset; ++ ++ return 0; ++} ++ + int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums) +@@ -841,7 +872,6 @@ int btrfs_csum_file_blocks(struct btrfs_ + u64 total_bytes = 0; + u64 csum_offset; + u64 bytenr; +- u32 nritems; + u32 ins_size; + int index = 0; + int found_next; +@@ -884,26 +914,10 @@ again: + goto insert; + } + } else { +- int slot = path->slots[0] + 1; +- /* we didn't find a csum item, insert one */ +- nritems = btrfs_header_nritems(path->nodes[0]); +- if (!nritems || (path->slots[0] >= nritems - 1)) { +- ret = btrfs_next_leaf(root, path); +- if (ret < 0) { +- goto out; +- } else if (ret > 0) { +- found_next = 1; +- goto insert; +- } +- slot = path->slots[0]; +- 
} +- btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); +- if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || +- found_key.type != BTRFS_EXTENT_CSUM_KEY) { +- found_next = 1; +- goto insert; +- } +- next_offset = found_key.offset; ++ /* We didn't find a csum item, insert one. */ ++ ret = find_next_csum_offset(root, path, &next_offset); ++ if (ret < 0) ++ goto out; + found_next = 1; + goto insert; + } +@@ -958,8 +972,48 @@ extend_csum: + tmp = sums->len - total_bytes; + tmp >>= fs_info->sb->s_blocksize_bits; + WARN_ON(tmp < 1); ++ extend_nr = max_t(int, 1, tmp); ++ ++ /* ++ * A log tree can already have checksum items with a subset of ++ * the checksums we are trying to log. This can happen after ++ * doing a sequence of partial writes into prealloc extents and ++ * fsyncs in between, with a full fsync logging a larger subrange ++ * of an extent for which a previous fast fsync logged a smaller ++ * subrange. And this happens in particular due to merging file ++ * extent items when we complete an ordered extent for a range ++ * covered by a prealloc extent - this is done at ++ * btrfs_mark_extent_written(). ++ * ++ * So if we try to extend the previous checksum item, which has ++ * a range that ends at the start of the range we want to insert, ++ * make sure we don't extend beyond the start offset of the next ++ * checksum item. If we are at the last item in the leaf, then ++ * forget the optimization of extending and add a new checksum ++ * item - it is not worth the complexity of releasing the path, ++ * getting the first key for the next leaf, repeat the btree ++ * search, etc, because log trees are temporary anyway and it ++ * would only save a few bytes of leaf space. 
++ */ ++ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { ++ if (path->slots[0] + 1 >= ++ btrfs_header_nritems(path->nodes[0])) { ++ ret = find_next_csum_offset(root, path, &next_offset); ++ if (ret < 0) ++ goto out; ++ found_next = 1; ++ goto insert; ++ } ++ ++ ret = find_next_csum_offset(root, path, &next_offset); ++ if (ret < 0) ++ goto out; ++ ++ tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits; ++ if (tmp <= INT_MAX) ++ extend_nr = min_t(int, extend_nr, tmp); ++ } + +- extend_nr = max_t(int, 1, (int)tmp); + diff = (csum_offset + extend_nr) * csum_size; + diff = min(diff, + MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size); diff --git a/queue-5.10/btrfs-fixup-error-handling-in-fixup_inode_link_counts.patch b/queue-5.10/btrfs-fixup-error-handling-in-fixup_inode_link_counts.patch new file mode 100644 index 00000000000..c1008c1a6f3 --- /dev/null +++ b/queue-5.10/btrfs-fixup-error-handling-in-fixup_inode_link_counts.patch @@ -0,0 +1,85 @@ +From 011b28acf940eb61c000059dd9e2cfcbf52ed96b Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 19 May 2021 13:13:15 -0400 +Subject: btrfs: fixup error handling in fixup_inode_link_counts + +From: Josef Bacik + +commit 011b28acf940eb61c000059dd9e2cfcbf52ed96b upstream. + +This function has the following pattern + + while (1) { + ret = whatever(); + if (ret) + goto out; + } + ret = 0 +out: + return ret; + +However several places in this while loop we simply break; when there's +a problem, thus clearing the return value, and in one case we do a +return -EIO, and leak the memory for the path. + +Fix this by re-arranging the loop to deal with ret == 1 coming from +btrfs_search_slot, and then simply delete the + + ret = 0; +out: + +bit so everybody can break if there is an error, which will allow for +proper error handling to occur. 
+ +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-log.c | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -1752,6 +1752,7 @@ static noinline int fixup_inode_link_cou + break; + + if (ret == 1) { ++ ret = 0; + if (path->slots[0] == 0) + break; + path->slots[0]--; +@@ -1764,17 +1765,19 @@ static noinline int fixup_inode_link_cou + + ret = btrfs_del_item(trans, root, path); + if (ret) +- goto out; ++ break; + + btrfs_release_path(path); + inode = read_one_inode(root, key.offset); +- if (!inode) +- return -EIO; ++ if (!inode) { ++ ret = -EIO; ++ break; ++ } + + ret = fixup_inode_link_count(trans, root, inode); + iput(inode); + if (ret) +- goto out; ++ break; + + /* + * fixup on a directory may create new entries, +@@ -1783,8 +1786,6 @@ static noinline int fixup_inode_link_cou + */ + key.offset = (u64)-1; + } +- ret = 0; +-out: + btrfs_release_path(path); + return ret; + } diff --git a/queue-5.10/btrfs-mark-ordered-extent-and-inode-with-error-if-we-fail-to-finish.patch b/queue-5.10/btrfs-mark-ordered-extent-and-inode-with-error-if-we-fail-to-finish.patch new file mode 100644 index 00000000000..34437a8b3a2 --- /dev/null +++ b/queue-5.10/btrfs-mark-ordered-extent-and-inode-with-error-if-we-fail-to-finish.patch @@ -0,0 +1,57 @@ +From d61bec08b904cf171835db98168f82bc338e92e4 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 19 May 2021 09:38:27 -0400 +Subject: btrfs: mark ordered extent and inode with error if we fail to finish + +From: Josef Bacik + +commit d61bec08b904cf171835db98168f82bc338e92e4 upstream. + +While doing error injection testing I saw that sometimes we'd get an +abort that wouldn't stop the current transaction commit from completing. 
+This abort was coming from finish ordered IO, but at this point in the +transaction commit we should have gotten an error and stopped. + +It turns out the abort came from finish ordered io while trying to write +out the free space cache. It occurred to me that any failure inside of +finish_ordered_io isn't actually raised to the person doing the writing, +so we could have any number of failures in this path and think the +ordered extent completed successfully and the inode was fine. + +Fix this by marking the ordered extent with BTRFS_ORDERED_IOERR, and +marking the mapping of the inode with mapping_set_error, so any callers +that simply call fdatawait will also get the error. + +With this we're seeing the IO error on the free space inode when we fail +to do the finish_ordered_io. + +CC: stable@vger.kernel.org # 4.19+ +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/inode.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -2760,6 +2760,18 @@ out: + if (ret || truncated) { + u64 unwritten_start = start; + ++ /* ++ * If we failed to finish this ordered extent for any reason we ++ * need to make sure BTRFS_ORDERED_IOERR is set on the ordered ++ * extent, and mark the inode with the error if it wasn't ++ * already set. Any error during writeback would have already ++ * set the mapping error, so we need to set it if we're the ones ++ * marking this ordered extent as failed. 
++ */ ++ if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR, ++ &ordered_extent->flags)) ++ mapping_set_error(ordered_extent->inode->i_mapping, -EIO); ++ + if (truncated) + unwritten_start += logical_len; + clear_extent_uptodate(io_tree, unwritten_start, end, NULL); diff --git a/queue-5.10/btrfs-return-errors-from-btrfs_del_csums-in-cleanup_ref_head.patch b/queue-5.10/btrfs-return-errors-from-btrfs_del_csums-in-cleanup_ref_head.patch new file mode 100644 index 00000000000..7c23f394be2 --- /dev/null +++ b/queue-5.10/btrfs-return-errors-from-btrfs_del_csums-in-cleanup_ref_head.patch @@ -0,0 +1,35 @@ +From 856bd270dc4db209c779ce1e9555c7641ffbc88e Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 19 May 2021 10:52:46 -0400 +Subject: btrfs: return errors from btrfs_del_csums in cleanup_ref_head + +From: Josef Bacik + +commit 856bd270dc4db209c779ce1e9555c7641ffbc88e upstream. + +We are unconditionally returning 0 in cleanup_ref_head, despite the fact +that btrfs_del_csums could fail. We need to return the error so the +transaction gets aborted properly, fix this by returning ret from +btrfs_del_csums in cleanup_ref_head. 
+ +Reviewed-by: Qu Wenruo +CC: stable@vger.kernel.org # 4.19+ +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent-tree.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -1830,7 +1830,7 @@ static int cleanup_ref_head(struct btrfs + trace_run_delayed_ref_head(fs_info, head, 0); + btrfs_delayed_ref_unlock(head); + btrfs_put_delayed_ref_head(head); +- return 0; ++ return ret; + } + + static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( diff --git a/queue-5.10/drm-amdgpu-don-t-query-ce-and-ue-errors.patch b/queue-5.10/drm-amdgpu-don-t-query-ce-and-ue-errors.patch new file mode 100644 index 00000000000..5265dfe3317 --- /dev/null +++ b/queue-5.10/drm-amdgpu-don-t-query-ce-and-ue-errors.patch @@ -0,0 +1,63 @@ +From dce3d8e1d070900e0feeb06787a319ff9379212c Mon Sep 17 00:00:00 2001 +From: Luben Tuikov +Date: Wed, 12 May 2021 12:33:23 -0400 +Subject: drm/amdgpu: Don't query CE and UE errors +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Luben Tuikov + +commit dce3d8e1d070900e0feeb06787a319ff9379212c upstream. + +On QUERY2 IOCTL don't query counts of correctable +and uncorrectable errors, since when RAS is +enabled and supported on Vega20 server boards, +this takes insurmountably long time, in O(n^3), +which slows the system down to the point of it +being unusable when we have GUI up. 
+ +Fixes: ae363a212b14 ("drm/amdgpu: Add a new flag to AMDGPU_CTX_OP_QUERY_STATE2") +Cc: Alexander Deucher +Cc: stable@vger.kernel.org +Signed-off-by: Luben Tuikov +Reviewed-by: Alexander Deucher +Reviewed-by: Christian König +Signed-off-by: Alex Deucher +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 16 ---------------- + 1 file changed, 16 deletions(-) + +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +@@ -337,7 +337,6 @@ static int amdgpu_ctx_query2(struct amdg + { + struct amdgpu_ctx *ctx; + struct amdgpu_ctx_mgr *mgr; +- unsigned long ras_counter; + + if (!fpriv) + return -EINVAL; +@@ -362,21 +361,6 @@ static int amdgpu_ctx_query2(struct amdg + if (atomic_read(&ctx->guilty)) + out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY; + +- /*query ue count*/ +- ras_counter = amdgpu_ras_query_error_count(adev, false); +- /*ras counter is monotonic increasing*/ +- if (ras_counter != ctx->ras_counter_ue) { +- out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; +- ctx->ras_counter_ue = ras_counter; +- } +- +- /*query ce count*/ +- ras_counter = amdgpu_ras_query_error_count(adev, true); +- if (ras_counter != ctx->ras_counter_ce) { +- out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; +- ctx->ras_counter_ce = ras_counter; +- } +- + mutex_unlock(&mgr->lock); + return 0; + } diff --git a/queue-5.10/drm-amdgpu-make-sure-we-unpin-the-uvd-bo.patch b/queue-5.10/drm-amdgpu-make-sure-we-unpin-the-uvd-bo.patch new file mode 100644 index 00000000000..4526c133fd9 --- /dev/null +++ b/queue-5.10/drm-amdgpu-make-sure-we-unpin-the-uvd-bo.patch @@ -0,0 +1,35 @@ +From 07438603a07e52f1c6aa731842bd298d2725b7be Mon Sep 17 00:00:00 2001 +From: Nirmoy Das +Date: Fri, 28 May 2021 16:54:16 +0200 +Subject: drm/amdgpu: make sure we unpin the UVD BO +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Nirmoy Das + +commit 07438603a07e52f1c6aa731842bd298d2725b7be 
upstream. + +Releasing pinned BOs is illegal now. UVD 6 was missing from: +commit 2f40801dc553 ("drm/amdgpu: make sure we unpin the UVD BO") + +Fixes: 2f40801dc553 ("drm/amdgpu: make sure we unpin the UVD BO") +Cc: stable@vger.kernel.org +Signed-off-by: Nirmoy Das +Reviewed-by: Christian König +Signed-off-by: Alex Deucher +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c +@@ -356,6 +356,7 @@ static int uvd_v6_0_enc_ring_test_ib(str + + error: + dma_fence_put(fence); ++ amdgpu_bo_unpin(bo); + amdgpu_bo_unreserve(bo); + amdgpu_bo_unref(&bo); + return r; diff --git a/queue-5.10/mm-debug_vm_pgtable-fix-alignment-for-pmd-pud_advanced_tests.patch b/queue-5.10/mm-debug_vm_pgtable-fix-alignment-for-pmd-pud_advanced_tests.patch new file mode 100644 index 00000000000..42fd7027b7a --- /dev/null +++ b/queue-5.10/mm-debug_vm_pgtable-fix-alignment-for-pmd-pud_advanced_tests.patch @@ -0,0 +1,64 @@ +From 04f7ce3f07ce39b1a3ca03a56b238a53acc52cfd Mon Sep 17 00:00:00 2001 +From: Gerald Schaefer +Date: Fri, 4 Jun 2021 20:01:18 -0700 +Subject: mm/debug_vm_pgtable: fix alignment for pmd/pud_advanced_tests() + +From: Gerald Schaefer + +commit 04f7ce3f07ce39b1a3ca03a56b238a53acc52cfd upstream. + +In pmd/pud_advanced_tests(), the vaddr is aligned up to the next pmd/pud +entry, and so it does not match the given pmdp/pudp and (aligned down) +pfn any more. + +For s390, this results in memory corruption, because the IDTE +instruction used e.g. in xxx_get_and_clear() will take the vaddr for +some calculations, in combination with the given pmdp. It will then end +up with a wrong table origin, ending on ...ff8, and some of those +wrongly set low-order bits will also select a wrong pagetable level for +the index addition. 
IDTE could therefore invalidate (or 0x20) something +outside of the page tables, depending on the wrongly picked index, which +in turn depends on the random vaddr. + +As result, we sometimes see "BUG task_struct (Not tainted): Padding +overwritten" on s390, where one 0x5a padding value got overwritten with +0x7a. + +Fix this by aligning down, similar to how the pmd/pud_aligned pfns are +calculated. + +Link: https://lkml.kernel.org/r/20210525130043.186290-2-gerald.schaefer@linux.ibm.com +Fixes: a5c3b9ffb0f40 ("mm/debug_vm_pgtable: add tests validating advanced arch page table helpers") +Signed-off-by: Gerald Schaefer +Reviewed-by: Anshuman Khandual +Cc: Vineet Gupta +Cc: Palmer Dabbelt +Cc: Paul Walmsley +Cc: [5.9+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/debug_vm_pgtable.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/mm/debug_vm_pgtable.c ++++ b/mm/debug_vm_pgtable.c +@@ -163,7 +163,7 @@ static void __init pmd_advanced_tests(st + + pr_debug("Validating PMD advanced\n"); + /* Align the address wrt HPAGE_PMD_SIZE */ +- vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE; ++ vaddr &= HPAGE_PMD_MASK; + + pgtable_trans_huge_deposit(mm, pmdp, pgtable); + +@@ -285,7 +285,7 @@ static void __init pud_advanced_tests(st + + pr_debug("Validating PUD advanced\n"); + /* Align the address wrt HPAGE_PUD_SIZE */ +- vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE; ++ vaddr &= HPAGE_PUD_MASK; + + set_pud_at(mm, vaddr, pudp, pud); + pudp_set_wrprotect(mm, vaddr, pudp); diff --git a/queue-5.10/mm-page_alloc-fix-counting-of-free-pages-after-take-off-from-buddy.patch b/queue-5.10/mm-page_alloc-fix-counting-of-free-pages-after-take-off-from-buddy.patch new file mode 100644 index 00000000000..3874713b55f --- /dev/null +++ b/queue-5.10/mm-page_alloc-fix-counting-of-free-pages-after-take-off-from-buddy.patch @@ -0,0 +1,60 @@ +From bac9c6fa1f929213bbd0ac9cdf21e8e2f0916828 Mon Sep 17 00:00:00 2001 +From: 
Ding Hui +Date: Fri, 4 Jun 2021 20:01:21 -0700 +Subject: mm/page_alloc: fix counting of free pages after take off from buddy + +From: Ding Hui + +commit bac9c6fa1f929213bbd0ac9cdf21e8e2f0916828 upstream. + +Recently we found that there is a lot MemFree left in /proc/meminfo +after do a lot of pages soft offline, it's not quite correct. + +Before Oscar's rework of soft offline for free pages [1], if we soft +offline free pages, these pages are left in buddy with HWPoison flag, +and NR_FREE_PAGES is not updated immediately. So the difference between +NR_FREE_PAGES and real number of available free pages is also even big +at the beginning. + +However, with the workload running, when we catch HWPoison page in any +alloc functions subsequently, we will remove it from buddy, meanwhile +update the NR_FREE_PAGES and try again, so the NR_FREE_PAGES will get +more and more closer to the real number of available free pages. +(regardless of unpoison_memory()) + +Now, for offline free pages, after a successful call +take_page_off_buddy(), the page is no longer belong to buddy allocator, +and will not be used any more, but we missed accounting NR_FREE_PAGES in +this situation, and there is no chance to be updated later. + +Do update in take_page_off_buddy() like rmqueue() does, but avoid double +counting if some one already set_migratetype_isolate() on the page. 
+ +[1]: commit 06be6ff3d2ec ("mm,hwpoison: rework soft offline for free pages") + +Link: https://lkml.kernel.org/r/20210526075247.11130-1-dinghui@sangfor.com.cn +Fixes: 06be6ff3d2ec ("mm,hwpoison: rework soft offline for free pages") +Signed-off-by: Ding Hui +Suggested-by: Naoya Horiguchi +Reviewed-by: Oscar Salvador +Acked-by: David Hildenbrand +Acked-by: Naoya Horiguchi +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/page_alloc.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -8870,6 +8870,8 @@ bool take_page_off_buddy(struct page *pa + del_page_from_free_list(page_head, zone, page_order); + break_down_buddy_pages(zone, page_head, page, 0, + page_order, migratetype); ++ if (!is_migrate_isolate(migratetype)) ++ __mod_zone_freepage_state(zone, -1, migratetype); + ret = true; + break; + } diff --git a/queue-5.10/nfc-fix-null-ptr-dereference-in-llcp_sock_getname-after-failed-connect.patch b/queue-5.10/nfc-fix-null-ptr-dereference-in-llcp_sock_getname-after-failed-connect.patch new file mode 100644 index 00000000000..4c628ccec9a --- /dev/null +++ b/queue-5.10/nfc-fix-null-ptr-dereference-in-llcp_sock_getname-after-failed-connect.patch @@ -0,0 +1,59 @@ +From 4ac06a1e013cf5fdd963317ffd3b968560f33bba Mon Sep 17 00:00:00 2001 +From: Krzysztof Kozlowski +Date: Mon, 31 May 2021 09:21:38 +0200 +Subject: nfc: fix NULL ptr dereference in llcp_sock_getname() after failed connect + +From: Krzysztof Kozlowski + +commit 4ac06a1e013cf5fdd963317ffd3b968560f33bba upstream. + +It's possible to trigger NULL pointer dereference by local unprivileged +user, when calling getsockname() after failed bind() (e.g. 
the bind +fails because LLCP_SAP_MAX used as SAP): + + BUG: kernel NULL pointer dereference, address: 0000000000000000 + CPU: 1 PID: 426 Comm: llcp_sock_getna Not tainted 5.13.0-rc2-next-20210521+ #9 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1 04/01/2014 + Call Trace: + llcp_sock_getname+0xb1/0xe0 + __sys_getpeername+0x95/0xc0 + ? lockdep_hardirqs_on_prepare+0xd5/0x180 + ? syscall_enter_from_user_mode+0x1c/0x40 + __x64_sys_getpeername+0x11/0x20 + do_syscall_64+0x36/0x70 + entry_SYSCALL_64_after_hwframe+0x44/0xae + +This can be reproduced with Syzkaller C repro (bind followed by +getpeername): +https://syzkaller.appspot.com/x/repro.c?x=14def446e00000 + +Cc: +Fixes: d646960f7986 ("NFC: Initial LLCP support") +Reported-by: syzbot+80fb126e7f7d8b1a5914@syzkaller.appspotmail.com +Reported-by: butt3rflyh4ck +Signed-off-by: Krzysztof Kozlowski +Link: https://lore.kernel.org/r/20210531072138.5219-1-krzysztof.kozlowski@canonical.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/nfc/llcp_sock.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/net/nfc/llcp_sock.c ++++ b/net/nfc/llcp_sock.c +@@ -110,6 +110,7 @@ static int llcp_sock_bind(struct socket + if (!llcp_sock->service_name) { + nfc_llcp_local_put(llcp_sock->local); + llcp_sock->local = NULL; ++ llcp_sock->dev = NULL; + ret = -ENOMEM; + goto put_dev; + } +@@ -119,6 +120,7 @@ static int llcp_sock_bind(struct socket + llcp_sock->local = NULL; + kfree(llcp_sock->service_name); + llcp_sock->service_name = NULL; ++ llcp_sock->dev = NULL; + ret = -EADDRINUSE; + goto put_dev; + } diff --git a/queue-5.10/ocfs2-fix-data-corruption-by-fallocate.patch b/queue-5.10/ocfs2-fix-data-corruption-by-fallocate.patch new file mode 100644 index 00000000000..4aa41cfaab2 --- /dev/null +++ b/queue-5.10/ocfs2-fix-data-corruption-by-fallocate.patch @@ -0,0 +1,148 @@ +From 6bba4471f0cc1296fe3c2089b9e52442d3074b2e Mon Sep 17 00:00:00 2001 +From: Junxiao Bi +Date: Fri, 4 Jun 2021 20:01:42 
-0700 +Subject: ocfs2: fix data corruption by fallocate + +From: Junxiao Bi + +commit 6bba4471f0cc1296fe3c2089b9e52442d3074b2e upstream. + +When fallocate punches holes out of inode size, if original isize is in +the middle of last cluster, then the part from isize to the end of the +cluster will be zeroed with buffer write, at that time isize is not yet +updated to match the new size, if writeback is kicked in, it will invoke +ocfs2_writepage()->block_write_full_page() where the pages out of inode +size will be dropped. That will cause file corruption. Fix this by +zero out eof blocks when extending the inode size. + +Running the following command with qemu-image 4.2.1 can get a corrupted +coverted image file easily. + + qemu-img convert -p -t none -T none -f qcow2 $qcow_image \ + -O qcow2 -o compat=1.1 $qcow_image.conv + +The usage of fallocate in qemu is like this, it first punches holes out +of inode size, then extend the inode size. + + fallocate(11, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 2276196352, 65536) = 0 + fallocate(11, 0, 2276196352, 65536) = 0 + +v1: https://www.spinics.net/lists/linux-fsdevel/msg193999.html +v2: https://lore.kernel.org/linux-fsdevel/20210525093034.GB4112@quack2.suse.cz/T/ + +Link: https://lkml.kernel.org/r/20210528210648.9124-1-junxiao.bi@oracle.com +Signed-off-by: Junxiao Bi +Reviewed-by: Joseph Qi +Cc: Jan Kara +Cc: Mark Fasheh +Cc: Joel Becker +Cc: Changwei Ge +Cc: Gang He +Cc: Jun Piao +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + fs/ocfs2/file.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 50 insertions(+), 5 deletions(-) + +--- a/fs/ocfs2/file.c ++++ b/fs/ocfs2/file.c +@@ -1856,6 +1856,45 @@ out: + } + + /* ++ * zero out partial blocks of one cluster. ++ * ++ * start: file offset where zero starts, will be made upper block aligned. ++ * len: it will be trimmed to the end of current cluster if "start + len" ++ * is bigger than it. 
++ */ ++static int ocfs2_zeroout_partial_cluster(struct inode *inode, ++ u64 start, u64 len) ++{ ++ int ret; ++ u64 start_block, end_block, nr_blocks; ++ u64 p_block, offset; ++ u32 cluster, p_cluster, nr_clusters; ++ struct super_block *sb = inode->i_sb; ++ u64 end = ocfs2_align_bytes_to_clusters(sb, start); ++ ++ if (start + len < end) ++ end = start + len; ++ ++ start_block = ocfs2_blocks_for_bytes(sb, start); ++ end_block = ocfs2_blocks_for_bytes(sb, end); ++ nr_blocks = end_block - start_block; ++ if (!nr_blocks) ++ return 0; ++ ++ cluster = ocfs2_bytes_to_clusters(sb, start); ++ ret = ocfs2_get_clusters(inode, cluster, &p_cluster, ++ &nr_clusters, NULL); ++ if (ret) ++ return ret; ++ if (!p_cluster) ++ return 0; ++ ++ offset = start_block - ocfs2_clusters_to_blocks(sb, cluster); ++ p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset; ++ return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS); ++} ++ ++/* + * Parts of this function taken from xfs_change_file_space() + */ + static int __ocfs2_change_file_space(struct file *file, struct inode *inode, +@@ -1865,7 +1904,7 @@ static int __ocfs2_change_file_space(str + { + int ret; + s64 llen; +- loff_t size; ++ loff_t size, orig_isize; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + handle_t *handle; +@@ -1896,6 +1935,7 @@ static int __ocfs2_change_file_space(str + goto out_inode_unlock; + } + ++ orig_isize = i_size_read(inode); + switch (sr->l_whence) { + case 0: /*SEEK_SET*/ + break; +@@ -1903,7 +1943,7 @@ static int __ocfs2_change_file_space(str + sr->l_start += f_pos; + break; + case 2: /*SEEK_END*/ +- sr->l_start += i_size_read(inode); ++ sr->l_start += orig_isize; + break; + default: + ret = -EINVAL; +@@ -1957,6 +1997,14 @@ static int __ocfs2_change_file_space(str + default: + ret = -EINVAL; + } ++ ++ /* zeroout eof blocks in the cluster. 
*/ ++ if (!ret && change_size && orig_isize < size) { ++ ret = ocfs2_zeroout_partial_cluster(inode, orig_isize, ++ size - orig_isize); ++ if (!ret) ++ i_size_write(inode, size); ++ } + up_write(&OCFS2_I(inode)->ip_alloc_sem); + if (ret) { + mlog_errno(ret); +@@ -1973,9 +2021,6 @@ static int __ocfs2_change_file_space(str + goto out_inode_unlock; + } + +- if (change_size && i_size_read(inode) < size) +- i_size_write(inode, size); +- + inode->i_ctime = inode->i_mtime = current_time(inode); + ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); + if (ret < 0) diff --git a/queue-5.10/powerpc-kprobes-fix-validation-of-prefixed-instructions-across-page-boundary.patch b/queue-5.10/powerpc-kprobes-fix-validation-of-prefixed-instructions-across-page-boundary.patch new file mode 100644 index 00000000000..9b161a09e1e --- /dev/null +++ b/queue-5.10/powerpc-kprobes-fix-validation-of-prefixed-instructions-across-page-boundary.patch @@ -0,0 +1,53 @@ +From 82123a3d1d5a306fdf50c968a474cc60fe43a80f Mon Sep 17 00:00:00 2001 +From: "Naveen N. Rao" +Date: Wed, 19 May 2021 16:17:17 +0530 +Subject: powerpc/kprobes: Fix validation of prefixed instructions across page boundary + +From: Naveen N. Rao + +commit 82123a3d1d5a306fdf50c968a474cc60fe43a80f upstream. + +When checking if the probed instruction is the suffix of a prefixed +instruction, we access the instruction at the previous word. If the +probed instruction is the very first word of a module, we can end up +trying to access an invalid page. + +Fix this by skipping the check for all instructions at the beginning of +a page. Prefixed instructions cannot cross a 64-byte boundary and as +such, we don't expect to encounter a suffix as the very first word in a +page for kernel text. Even if there are prefixed instructions crossing +a page boundary (from a module, for instance), the instruction will be +illegal, so preventing probing on the suffix of such prefix instructions +isn't worthwhile. 
+ +Fixes: b4657f7650ba ("powerpc/kprobes: Don't allow breakpoints on suffixes") +Cc: stable@vger.kernel.org # v5.8+ +Reported-by: Christophe Leroy +Signed-off-by: Naveen N. Rao +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/0df9a032a05576a2fa8e97d1b769af2ff0eafbd6.1621416666.git.naveen.n.rao@linux.vnet.ibm.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/kernel/kprobes.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/powerpc/kernel/kprobes.c ++++ b/arch/powerpc/kernel/kprobes.c +@@ -108,7 +108,6 @@ int arch_prepare_kprobe(struct kprobe *p + int ret = 0; + struct kprobe *prev; + struct ppc_inst insn = ppc_inst_read((struct ppc_inst *)p->addr); +- struct ppc_inst prefix = ppc_inst_read((struct ppc_inst *)(p->addr - 1)); + + if ((unsigned long)p->addr & 0x03) { + printk("Attempt to register kprobe at an unaligned address\n"); +@@ -116,7 +115,8 @@ int arch_prepare_kprobe(struct kprobe *p + } else if (IS_MTMSRD(insn) || IS_RFID(insn) || IS_RFI(insn)) { + printk("Cannot register a kprobe on rfi/rfid or mtmsr[d]\n"); + ret = -EINVAL; +- } else if (ppc_inst_prefixed(prefix)) { ++ } else if ((unsigned long)p->addr & ~PAGE_MASK && ++ ppc_inst_prefixed(ppc_inst_read((struct ppc_inst *)(p->addr - 1)))) { + printk("Cannot register a kprobe on the second word of prefixed instruction\n"); + ret = -EINVAL; + } diff --git a/queue-5.10/series b/queue-5.10/series index 45562442de7..4eae7559cd1 100644 --- a/queue-5.10/series +++ b/queue-5.10/series @@ -106,3 +106,20 @@ ext4-fix-memory-leak-in-ext4_mb_init_backend-on-error-path.patch ext4-fix-accessing-uninit-percpu-counter-variable-with-fast_commit.patch usb-dwc2-fix-build-in-periphal-only-mode.patch pid-take-a-reference-when-initializing-cad_pid.patch +ocfs2-fix-data-corruption-by-fallocate.patch +mm-debug_vm_pgtable-fix-alignment-for-pmd-pud_advanced_tests.patch +mm-page_alloc-fix-counting-of-free-pages-after-take-off-from-buddy.patch 
+x86-cpufeatures-force-disable-x86_feature_enqcmd-and-remove-update_pasid.patch +x86-sev-check-sme-sev-support-in-cpuid-first.patch +nfc-fix-null-ptr-dereference-in-llcp_sock_getname-after-failed-connect.patch +drm-amdgpu-don-t-query-ce-and-ue-errors.patch +drm-amdgpu-make-sure-we-unpin-the-uvd-bo.patch +x86-apic-mark-_all_-legacy-interrupts-when-io-apic-is-missing.patch +powerpc-kprobes-fix-validation-of-prefixed-instructions-across-page-boundary.patch +btrfs-mark-ordered-extent-and-inode-with-error-if-we-fail-to-finish.patch +btrfs-fix-error-handling-in-btrfs_del_csums.patch +btrfs-return-errors-from-btrfs_del_csums-in-cleanup_ref_head.patch +btrfs-fix-fsync-failure-and-transaction-abort-after-writes-to-prealloc-extents.patch +btrfs-fixup-error-handling-in-fixup_inode_link_counts.patch +btrfs-abort-in-rename_exchange-if-we-fail-to-insert-the-second-ref.patch +btrfs-fix-deadlock-when-cloning-inline-extents-and-low-on-available-space.patch diff --git a/queue-5.10/x86-apic-mark-_all_-legacy-interrupts-when-io-apic-is-missing.patch b/queue-5.10/x86-apic-mark-_all_-legacy-interrupts-when-io-apic-is-missing.patch new file mode 100644 index 00000000000..03dc99e9c02 --- /dev/null +++ b/queue-5.10/x86-apic-mark-_all_-legacy-interrupts-when-io-apic-is-missing.patch @@ -0,0 +1,95 @@ +From 7d65f9e80646c595e8c853640a9d0768a33e204c Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Tue, 25 May 2021 13:08:41 +0200 +Subject: x86/apic: Mark _all_ legacy interrupts when IO/APIC is missing + +From: Thomas Gleixner + +commit 7d65f9e80646c595e8c853640a9d0768a33e204c upstream. + +PIC interrupts do not support affinity setting and they can end up on +any online CPU. Therefore, it's required to mark the associated vectors +as system-wide reserved. Otherwise, the corresponding irq descriptors +are copied to the secondary CPUs but the vectors are not marked as +assigned or reserved. This works correctly for the IO/APIC case. 
+ +When the IO/APIC is disabled via config, kernel command line or lack of +enumeration then all legacy interrupts are routed through the PIC, but +nothing marks them as system-wide reserved vectors. + +As a consequence, a subsequent allocation on a secondary CPU can result in +allocating one of these vectors, which triggers the BUG() in +apic_update_vector() because the interrupt descriptor slot is not empty. + +Imran tried to work around that by marking those interrupts as allocated +when a CPU comes online. But that's wrong in case that the IO/APIC is +available and one of the legacy interrupts, e.g. IRQ0, has been switched to +PIC mode because then marking them as allocated will fail as they are +already marked as system vectors. + +Stay consistent and update the legacy vectors after attempting IO/APIC +initialization and mark them as system vectors in case that no IO/APIC is +available. + +Fixes: 69cde0004a4b ("x86/vector: Use matrix allocator for vector assignment") +Reported-by: Imran Khan +Signed-off-by: Thomas Gleixner +Signed-off-by: Borislav Petkov +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/20210519233928.2157496-1-imran.f.khan@oracle.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/include/asm/apic.h | 1 + + arch/x86/kernel/apic/apic.c | 1 + + arch/x86/kernel/apic/vector.c | 20 ++++++++++++++++++++ + 3 files changed, 22 insertions(+) + +--- a/arch/x86/include/asm/apic.h ++++ b/arch/x86/include/asm/apic.h +@@ -174,6 +174,7 @@ static inline int apic_is_clustered_box( + extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); + extern void lapic_assign_system_vectors(void); + extern void lapic_assign_legacy_vector(unsigned int isairq, bool replace); ++extern void lapic_update_legacy_vectors(void); + extern void lapic_online(void); + extern void lapic_offline(void); + extern bool apic_needs_pit(void); +--- a/arch/x86/kernel/apic/apic.c ++++ b/arch/x86/kernel/apic/apic.c +@@ -2539,6 +2539,7 @@ static void __init 
apic_bsp_setup(bool u + end_local_APIC_setup(); + irq_remap_enable_fault_handling(); + setup_IO_APIC(); ++ lapic_update_legacy_vectors(); + } + + #ifdef CONFIG_UP_LATE_INIT +--- a/arch/x86/kernel/apic/vector.c ++++ b/arch/x86/kernel/apic/vector.c +@@ -687,6 +687,26 @@ void lapic_assign_legacy_vector(unsigned + irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace); + } + ++void __init lapic_update_legacy_vectors(void) ++{ ++ unsigned int i; ++ ++ if (IS_ENABLED(CONFIG_X86_IO_APIC) && nr_ioapics > 0) ++ return; ++ ++ /* ++ * If the IO/APIC is disabled via config, kernel command line or ++ * lack of enumeration then all legacy interrupts are routed ++ * through the PIC. Make sure that they are marked as legacy ++ * vectors. PIC_CASCADE_IRQ has already been marked in ++ * lapic_assign_system_vectors(). ++ */ ++ for (i = 0; i < nr_legacy_irqs(); i++) { ++ if (i != PIC_CASCADE_IR) ++ lapic_assign_legacy_vector(i, true); ++ } ++} ++ + void __init lapic_assign_system_vectors(void) + { + unsigned int i, vector = 0; diff --git a/queue-5.10/x86-cpufeatures-force-disable-x86_feature_enqcmd-and-remove-update_pasid.patch b/queue-5.10/x86-cpufeatures-force-disable-x86_feature_enqcmd-and-remove-update_pasid.patch new file mode 100644 index 00000000000..bfedc9eff42 --- /dev/null +++ b/queue-5.10/x86-cpufeatures-force-disable-x86_feature_enqcmd-and-remove-update_pasid.patch @@ -0,0 +1,178 @@ +From 9bfecd05833918526cc7357d55e393393440c5fa Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Sat, 29 May 2021 11:17:30 +0200 +Subject: x86/cpufeatures: Force disable X86_FEATURE_ENQCMD and remove update_pasid() + +From: Thomas Gleixner + +commit 9bfecd05833918526cc7357d55e393393440c5fa upstream. + +While digesting the XSAVE-related horrors which got introduced with +the supervisor/user split, the recent addition of ENQCMD-related +functionality got on the radar and turned out to be similarly broken. 
+ +update_pasid(), which is only required when X86_FEATURE_ENQCMD is +available, is invoked from two places: + + 1) From switch_to() for the incoming task + + 2) Via a SMP function call from the IOMMU/SMV code + +#1 is half-ways correct as it hacks around the brokenness of get_xsave_addr() + by enforcing the state to be 'present', but all the conditionals in that + code are completely pointless for that. + + Also the invocation is just useless overhead because at that point + it's guaranteed that TIF_NEED_FPU_LOAD is set on the incoming task + and all of this can be handled at return to user space. + +#2 is broken beyond repair. The comment in the code claims that it is safe + to invoke this in an IPI, but that's just wishful thinking. + + FPU state of a running task is protected by fregs_lock() which is + nothing else than a local_bh_disable(). As BH-disabled regions run + usually with interrupts enabled the IPI can hit a code section which + modifies FPU state and there is absolutely no guarantee that any of the + assumptions which are made for the IPI case is true. + + Also the IPI is sent to all CPUs in mm_cpumask(mm), but the IPI is + invoked with a NULL pointer argument, so it can hit a completely + unrelated task and unconditionally force an update for nothing. + Worse, it can hit a kernel thread which operates on a user space + address space and set a random PASID for it. + +The offending commit does not cleanly revert, but it's sufficient to +force disable X86_FEATURE_ENQCMD and to remove the broken update_pasid() +code to make this dysfunctional all over the place. Anything more +complex would require more surgery and none of the related functions +outside of the x86 core code are blatantly wrong, so removing those +would be overkill. + +As nothing enables the PASID bit in the IA32_XSS MSR yet, which is +required to make this actually work, this cannot result in a regression +except for related out of tree train-wrecks, but they are broken already +today. 
+ +Fixes: 20f0afd1fb3d ("x86/mmu: Allocate/free a PASID") +Signed-off-by: Thomas Gleixner +Signed-off-by: Borislav Petkov +Acked-by: Andy Lutomirski +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/87mtsd6gr9.ffs@nanos.tec.linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/include/asm/disabled-features.h | 7 +-- + arch/x86/include/asm/fpu/api.h | 6 --- + arch/x86/include/asm/fpu/internal.h | 7 --- + arch/x86/kernel/fpu/xstate.c | 57 ------------------------------- + 4 files changed, 3 insertions(+), 74 deletions(-) + +--- a/arch/x86/include/asm/disabled-features.h ++++ b/arch/x86/include/asm/disabled-features.h +@@ -56,11 +56,8 @@ + # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) + #endif + +-#ifdef CONFIG_IOMMU_SUPPORT +-# define DISABLE_ENQCMD 0 +-#else +-# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) +-#endif ++/* Force disable because it's broken beyond repair */ ++#define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) + + /* + * Make sure to add features to the correct mask +--- a/arch/x86/include/asm/fpu/api.h ++++ b/arch/x86/include/asm/fpu/api.h +@@ -79,10 +79,6 @@ extern int cpu_has_xfeatures(u64 xfeatur + */ + #define PASID_DISABLED 0 + +-#ifdef CONFIG_IOMMU_SUPPORT +-/* Update current's PASID MSR/state by mm's PASID. */ +-void update_pasid(void); +-#else + static inline void update_pasid(void) { } +-#endif ++ + #endif /* _ASM_X86_FPU_API_H */ +--- a/arch/x86/include/asm/fpu/internal.h ++++ b/arch/x86/include/asm/fpu/internal.h +@@ -584,13 +584,6 @@ static inline void switch_fpu_finish(str + pkru_val = pk->pkru; + } + __write_pkru(pkru_val); +- +- /* +- * Expensive PASID MSR write will be avoided in update_pasid() because +- * TIF_NEED_FPU_LOAD was set. And the PASID state won't be updated +- * unless it's different from mm->pasid to reduce overhead. 
+- */ +- update_pasid(); + } + + #endif /* _ASM_X86_FPU_INTERNAL_H */ +--- a/arch/x86/kernel/fpu/xstate.c ++++ b/arch/x86/kernel/fpu/xstate.c +@@ -1402,60 +1402,3 @@ int proc_pid_arch_status(struct seq_file + return 0; + } + #endif /* CONFIG_PROC_PID_ARCH_STATUS */ +- +-#ifdef CONFIG_IOMMU_SUPPORT +-void update_pasid(void) +-{ +- u64 pasid_state; +- u32 pasid; +- +- if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) +- return; +- +- if (!current->mm) +- return; +- +- pasid = READ_ONCE(current->mm->pasid); +- /* Set the valid bit in the PASID MSR/state only for valid pasid. */ +- pasid_state = pasid == PASID_DISABLED ? +- pasid : pasid | MSR_IA32_PASID_VALID; +- +- /* +- * No need to hold fregs_lock() since the task's fpstate won't +- * be changed by others (e.g. ptrace) while the task is being +- * switched to or is in IPI. +- */ +- if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { +- /* The MSR is active and can be directly updated. */ +- wrmsrl(MSR_IA32_PASID, pasid_state); +- } else { +- struct fpu *fpu = &current->thread.fpu; +- struct ia32_pasid_state *ppasid_state; +- struct xregs_state *xsave; +- +- /* +- * The CPU's xstate registers are not currently active. Just +- * update the PASID state in the memory buffer here. The +- * PASID MSR will be loaded when returning to user mode. +- */ +- xsave = &fpu->state.xsave; +- xsave->header.xfeatures |= XFEATURE_MASK_PASID; +- ppasid_state = get_xsave_addr(xsave, XFEATURE_PASID); +- /* +- * Since XFEATURE_MASK_PASID is set in xfeatures, ppasid_state +- * won't be NULL and no need to check its value. +- * +- * Only update the task's PASID state when it's different +- * from the mm's pasid. +- */ +- if (ppasid_state->pasid != pasid_state) { +- /* +- * Invalid fpregs so that state restoring will pick up +- * the PASID state.
+- */ +- __fpu_invalidate_fpregs_state(fpu); +- ppasid_state->pasid = pasid_state; +- } +- } +-} +-#endif /* CONFIG_IOMMU_SUPPORT */ diff --git a/queue-5.10/x86-sev-check-sme-sev-support-in-cpuid-first.patch b/queue-5.10/x86-sev-check-sme-sev-support-in-cpuid-first.patch new file mode 100644 index 00000000000..25a84cb76a4 --- /dev/null +++ b/queue-5.10/x86-sev-check-sme-sev-support-in-cpuid-first.patch @@ -0,0 +1,69 @@ +From 009767dbf42ac0dbe3cf48c1ee224f6b778aa85a Mon Sep 17 00:00:00 2001 +From: Pu Wen +Date: Wed, 2 Jun 2021 15:02:07 +0800 +Subject: x86/sev: Check SME/SEV support in CPUID first + +From: Pu Wen + +commit 009767dbf42ac0dbe3cf48c1ee224f6b778aa85a upstream. + +The first two bits of the CPUID leaf 0x8000001F EAX indicate whether SEV +or SME is supported, respectively. It's better to check whether SEV or +SME is actually supported before accessing the MSR_AMD64_SEV to check +whether SEV or SME is enabled. + +This is both a bare-metal issue and a guest/VM issue. Since the first +generation Hygon Dhyana CPU doesn't support the MSR_AMD64_SEV, reading that +MSR results in a #GP - either directly from hardware in the bare-metal +case or via the hypervisor (because the RDMSR is actually intercepted) +in the guest/VM case, resulting in a failed boot. And since this is very +early in the boot phase, rdmsrl_safe()/native_read_msr_safe() can't be +used. + +So check the CPUID bits first, before accessing the MSR. + + [ tlendacky: Expand and improve commit message. ] + [ bp: Massage commit message. 
] + +Fixes: eab696d8e8b9 ("x86/sev: Do not require Hypervisor CPUID bit for SEV guests") +Signed-off-by: Pu Wen +Signed-off-by: Borislav Petkov +Acked-by: Tom Lendacky +Cc: # v5.10+ +Link: https://lkml.kernel.org/r/20210602070207.2480-1-puwen@hygon.cn +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/mm/mem_encrypt_identity.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +--- a/arch/x86/mm/mem_encrypt_identity.c ++++ b/arch/x86/mm/mem_encrypt_identity.c +@@ -504,10 +504,6 @@ void __init sme_enable(struct boot_param + #define AMD_SME_BIT BIT(0) + #define AMD_SEV_BIT BIT(1) + +- /* Check the SEV MSR whether SEV or SME is enabled */ +- sev_status = __rdmsr(MSR_AMD64_SEV); +- feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT; +- + /* + * Check for the SME/SEV feature: + * CPUID Fn8000_001F[EAX] +@@ -519,11 +515,16 @@ void __init sme_enable(struct boot_param + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); +- if (!(eax & feature_mask)) ++ /* Check whether SEV or SME is supported */ ++ if (!(eax & (AMD_SEV_BIT | AMD_SME_BIT))) + return; + + me_mask = 1UL << (ebx & 0x3f); + ++ /* Check the SEV MSR whether SEV or SME is enabled */ ++ sev_status = __rdmsr(MSR_AMD64_SEV); ++ feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT; ++ + /* Check if memory encryption is enabled */ + if (feature_mask == AMD_SME_BIT) { + /* -- 2.47.3