--- /dev/null
+From dc09ef3562726cd520c8338c1640872a60187af5 Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Wed, 19 May 2021 14:04:21 -0400
+Subject: btrfs: abort in rename_exchange if we fail to insert the second ref
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit dc09ef3562726cd520c8338c1640872a60187af5 upstream.
+
+Error injection stress uncovered a problem where we'd leave a dangling
+inode ref if we failed during a rename_exchange. This happens because
+we insert the inode ref for one side of the rename, and then for the
+other side. If this second inode ref insert fails we'll leave the first
+one dangling and leave a corrupt file system behind. Fix this by
+aborting if we did the insert for the first inode ref.
+
+CC: stable@vger.kernel.org # 4.9+
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/inode.c | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -9088,6 +9088,7 @@ static int btrfs_rename_exchange(struct
+ int ret2;
+ bool root_log_pinned = false;
+ bool dest_log_pinned = false;
++ bool need_abort = false;
+
+ /* we only allow rename subvolume link between subvolumes */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+@@ -9144,6 +9145,7 @@ static int btrfs_rename_exchange(struct
+ old_idx);
+ if (ret)
+ goto out_fail;
++ need_abort = true;
+ }
+
+ /* And now for the dest. */
+@@ -9159,8 +9161,11 @@ static int btrfs_rename_exchange(struct
+ new_ino,
+ btrfs_ino(BTRFS_I(old_dir)),
+ new_idx);
+- if (ret)
++ if (ret) {
++ if (need_abort)
++ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
++ }
+ }
+
+ /* Update inode version and ctime/mtime. */
--- /dev/null
+From f96d44743a44e3332f75d23d2075bb8270900e1d Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Wed, 19 May 2021 11:26:25 -0400
+Subject: btrfs: check error value from btrfs_update_inode in tree log
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit f96d44743a44e3332f75d23d2075bb8270900e1d upstream.
+
+Error injection testing uncovered a case where we ended up with invalid
+link counts on an inode. This happened because we failed to notice an
+error when updating the inode while replaying the tree log, and
+committed the transaction with an invalid file system.
+
+Fix this by checking the return value of btrfs_update_inode. This
+resolved the link count errors I was seeing, and we already properly
+handle passing up the error values in these paths.
+
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -1574,7 +1574,9 @@ static noinline int add_inode_ref(struct
+ if (ret)
+ goto out;
+
+- btrfs_update_inode(trans, root, BTRFS_I(inode));
++ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
++ if (ret)
++ goto out;
+ }
+
+ ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
+@@ -1749,7 +1751,9 @@ static noinline int fixup_inode_link_cou
+
+ if (nlink != inode->i_nlink) {
+ set_nlink(inode, nlink);
+- btrfs_update_inode(trans, root, BTRFS_I(inode));
++ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
++ if (ret)
++ goto out;
+ }
+ BTRFS_I(inode)->index_cnt = (u64)-1;
+
--- /dev/null
+From 76a6d5cd74479e7ec8a7f9a29bce63d5549b6b2e Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 25 May 2021 11:05:28 +0100
+Subject: btrfs: fix deadlock when cloning inline extents and low on available space
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 76a6d5cd74479e7ec8a7f9a29bce63d5549b6b2e upstream.
+
+There are a few cases where cloning an inline extent requires copying data
+into a page of the destination inode. For these cases we are allocating
+the required data and metadata space while holding a leaf locked. This can
+result in a deadlock when we are low on available space because allocating
+the space may flush delalloc and two deadlock scenarios can happen:
+
+1) When starting writeback for an inode with a very small dirty range that
+ fits in an inline extent, we deadlock during the writeback when trying
+ to insert the inline extent, at cow_file_range_inline(), if the extent
+ is going to be located in the leaf for which we are already holding a
+ read lock;
+
+2) After successfully starting writeback, for non-inline extent cases,
+ the async reclaim thread will hang waiting for an ordered extent to
+ complete if the ordered extent completion needs to modify the leaf
+ for which the clone task is holding a read lock (for adding or
+ replacing file extent items). So the cloning task will wait forever
+ on the async reclaim thread to make progress, which in turn is
+ waiting for the ordered extent completion which in turn is waiting
+ to acquire a write lock on the same leaf.
+
+So fix this by making sure we release the path (and therefore the leaf)
+every time we need to copy the inline extent's data into a page of the
+destination inode, as by that time we do not need to have the leaf locked.
+
+Fixes: 05a5a7621ce66c ("Btrfs: implement full reflink support for inline extents")
+CC: stable@vger.kernel.org # 5.10+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/reflink.c | 38 ++++++++++++++++++++++----------------
+ 1 file changed, 22 insertions(+), 16 deletions(-)
+
+--- a/fs/btrfs/reflink.c
++++ b/fs/btrfs/reflink.c
+@@ -207,10 +207,7 @@ static int clone_copy_inline_extent(stru
+ * inline extent's data to the page.
+ */
+ ASSERT(key.offset > 0);
+- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+- inline_data, size, datal,
+- comp_type);
+- goto out;
++ goto copy_to_page;
+ }
+ } else if (i_size_read(dst) <= datal) {
+ struct btrfs_file_extent_item *ei;
+@@ -226,13 +223,10 @@ static int clone_copy_inline_extent(stru
+ BTRFS_FILE_EXTENT_INLINE)
+ goto copy_inline_extent;
+
+- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+- inline_data, size, datal, comp_type);
+- goto out;
++ goto copy_to_page;
+ }
+
+ copy_inline_extent:
+- ret = 0;
+ /*
+ * We have no extent items, or we have an extent at offset 0 which may
+ * or may not be inlined. All these cases are dealt the same way.
+@@ -244,11 +238,13 @@ copy_inline_extent:
+ * clone. Deal with all these cases by copying the inline extent
+ * data into the respective page at the destination inode.
+ */
+- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+- inline_data, size, datal, comp_type);
+- goto out;
++ goto copy_to_page;
+ }
+
++ /*
++ * Release path before starting a new transaction so we don't hold locks
++ * that would confuse lockdep.
++ */
+ btrfs_release_path(path);
+ /*
+ * If we end up here it means were copy the inline extent into a leaf
+@@ -286,11 +282,6 @@ copy_inline_extent:
+ out:
+ if (!ret && !trans) {
+ /*
+- * Release path before starting a new transaction so we don't
+- * hold locks that would confuse lockdep.
+- */
+- btrfs_release_path(path);
+- /*
+ * No transaction here means we copied the inline extent into a
+ * page of the destination inode.
+ *
+@@ -310,6 +301,21 @@ out:
+ *trans_out = trans;
+
+ return ret;
++
++copy_to_page:
++ /*
++ * Release our path because we don't need it anymore and also because
++ * copy_inline_to_page() needs to reserve data and metadata, which may
++ * need to flush delalloc when we are low on available space and
++ * therefore cause a deadlock if writeback of an inline extent needs to
++ * write to the same leaf or an ordered extent completion needs to write
++ * to the same leaf.
++ */
++ btrfs_release_path(path);
++
++ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
++ inline_data, size, datal, comp_type);
++ goto out;
+ }
+
+ /**
--- /dev/null
+From b86652be7c83f70bf406bed18ecf55adb9bfb91b Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Wed, 19 May 2021 10:52:45 -0400
+Subject: btrfs: fix error handling in btrfs_del_csums
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit b86652be7c83f70bf406bed18ecf55adb9bfb91b upstream.
+
+Error injection stress would sometimes fail with checksums on disk that
+did not have a corresponding extent. This occurred because the pattern
+in btrfs_del_csums was
+
+ while (1) {
+ ret = btrfs_search_slot();
+ if (ret < 0)
+ break;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+
+If we got an error from btrfs_search_slot we'd clear the error because
+we were breaking instead of goto out. Instead of using goto out, simply
+handle the cases where we may leave a random value in ret, and get rid
+of the
+
+ ret = 0;
+out:
+
+pattern and simply allow break to have the proper error reporting. With
+this fix we properly abort the transaction and do not commit thinking we
+successfully deleted the csum.
+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+CC: stable@vger.kernel.org # 4.4+
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/file-item.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+--- a/fs/btrfs/file-item.c
++++ b/fs/btrfs/file-item.c
+@@ -787,7 +787,7 @@ int btrfs_del_csums(struct btrfs_trans_h
+ u64 end_byte = bytenr + len;
+ u64 csum_end;
+ struct extent_buffer *leaf;
+- int ret;
++ int ret = 0;
+ const u32 csum_size = fs_info->csum_size;
+ u32 blocksize_bits = fs_info->sectorsize_bits;
+
+@@ -805,6 +805,7 @@ int btrfs_del_csums(struct btrfs_trans_h
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
++ ret = 0;
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+@@ -861,7 +862,7 @@ int btrfs_del_csums(struct btrfs_trans_h
+ ret = btrfs_del_items(trans, root, path,
+ path->slots[0], del_nr);
+ if (ret)
+- goto out;
++ break;
+ if (key.offset == bytenr)
+ break;
+ } else if (key.offset < bytenr && csum_end > end_byte) {
+@@ -905,8 +906,9 @@ int btrfs_del_csums(struct btrfs_trans_h
+ ret = btrfs_split_item(trans, root, path, &key, offset);
+ if (ret && ret != -EAGAIN) {
+ btrfs_abort_transaction(trans, ret);
+- goto out;
++ break;
+ }
++ ret = 0;
+
+ key.offset = end_byte - 1;
+ } else {
+@@ -916,8 +918,6 @@ int btrfs_del_csums(struct btrfs_trans_h
+ }
+ btrfs_release_path(path);
+ }
+- ret = 0;
+-out:
+ btrfs_free_path(path);
+ return ret;
+ }
--- /dev/null
+From ea7036de0d36c4e6c9508f68789e9567d514333a Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 24 May 2021 11:35:53 +0100
+Subject: btrfs: fix fsync failure and transaction abort after writes to prealloc extents
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit ea7036de0d36c4e6c9508f68789e9567d514333a upstream.
+
+When doing a series of partial writes to different ranges of preallocated
+extents with transaction commits and fsyncs in between, we can end up with
+checksum items in a log tree. This causes an fsync to fail with -EIO and
+abort the transaction, turning the filesystem to RO mode, when syncing the
+log.
+
+For this to happen, we need to have a full fsync of a file following one
+or more fast fsyncs.
+
+The following example reproduces the problem and explains how it happens:
+
+ $ mkfs.btrfs -f /dev/sdc
+ $ mount /dev/sdc /mnt
+
+ # Create our test file with 2 preallocated extents. Leave a 1M hole
+ # between them to ensure that we get two file extent items that will
+ # never be merged into a single one. The extents are contiguous on disk,
+ # which will later result in the checksums for their data to be merged
+ # into a single checksum item in the csums btree.
+ #
+ $ xfs_io -f \
+ -c "falloc 0 1M" \
+ -c "falloc 3M 3M" \
+ /mnt/foobar
+
+ # Now write to the second extent and leave only 1M of it as unwritten,
+ # which corresponds to the file range [4M, 5M[.
+ #
+ # Then fsync the file to flush delalloc and to clear full sync flag from
+ # the inode, so that a future fsync will use the fast code path.
+ #
+ # After the writeback triggered by the fsync we have 3 file extent items
+ # that point to the second extent we previously allocated:
+ #
+ # 1) One file extent item of type BTRFS_FILE_EXTENT_REG that covers the
+ # file range [3M, 4M[
+ #
+ # 2) One file extent item of type BTRFS_FILE_EXTENT_PREALLOC that covers
+ # the file range [4M, 5M[
+ #
+ # 3) One file extent item of type BTRFS_FILE_EXTENT_REG that covers the
+ # file range [5M, 6M[
+ #
+ # All these file extent items have a generation of 6, which is the ID of
+ # the transaction where they were created. The split of the original file
+ # extent item is done at btrfs_mark_extent_written() when ordered extents
+ # complete for the file ranges [3M, 4M[ and [5M, 6M[.
+ #
+ $ xfs_io -c "pwrite -S 0xab 3M 1M" \
+ -c "pwrite -S 0xef 5M 1M" \
+ -c "fsync" \
+ /mnt/foobar
+
+ # Commit the current transaction. This wipes out the log tree created by
+ # the previous fsync.
+ sync
+
+ # Now write to the unwritten range of the second extent we allocated,
+ # corresponding to the file range [4M, 5M[, and fsync the file, which
+ # triggers the fast fsync code path.
+ #
+ # The fast fsync code path sees that there is a new extent map covering
+ # the file range [4M, 5M[ and therefore it will log a checksum item
+ # covering the range [1M, 2M[ of the second extent we allocated.
+ #
+ # Also, after the fsync finishes we no longer have the 3 file extent
+ # items that pointed to 3 sections of the second extent we allocated.
+ # Instead we end up with a single file extent item pointing to the whole
+ # extent, with a type of BTRFS_FILE_EXTENT_REG and a generation of 7 (the
+ # current transaction ID). This is due to the file extent item merging we
+ # do when completing ordered extents into ranges that point to unwritten
+ # (preallocated) extents. This merging is done at
+ # btrfs_mark_extent_written().
+ #
+ $ xfs_io -c "pwrite -S 0xcd 4M 1M" \
+ -c "fsync" \
+ /mnt/foobar
+
+ # Now do some write to our file outside the range of the second extent
+ # that we allocated with fallocate() and truncate the file size from 6M
+ # down to 5M.
+ #
+ # The truncate operation sets the full sync runtime flag on the inode,
+ # forcing the next fsync to use the slow code path. It also changes the
+ # length of the second file extent item so that it represents the file
+ # range [3M, 5M[ and not the range [3M, 6M[ anymore.
+ #
+ # Finally fsync the file. Since this is a fsync that triggers the slow
+ # code path, it will remove all items associated to the inode from the
+ # log tree and then it will scan for file extent items in the
+ # fs/subvolume tree that have a generation matching the current
+ # transaction ID, which is 7. This means it will log 2 file extent
+ # items:
+ #
+ # 1) One for the first extent we allocated, covering the file range
+ # [0, 1M[
+ #
+ # 2) Another for the first 2M of the second extent we allocated,
+ # covering the file range [3M, 5M[
+ #
+ # When logging the first file extent item we log a single checksum item
+ # that has all the checksums for the entire extent.
+ #
+ # When logging the second file extent item, we also lookup for the
+ # checksums that are associated with the range [0, 2M[ of the second
+ # extent we allocated (file range [3M, 5M[), and then we log them with
+ # btrfs_csum_file_blocks(). However that results in ending up with a log
+ # that has two checksum items with ranges that overlap:
+ #
+ # 1) One for the range [1M, 2M[ of the second extent we allocated,
+ # corresponding to the file range [4M, 5M[, which we logged in the
+ # previous fsync that used the fast code path;
+ #
+ # 2) One for the ranges [0, 1M[ and [0, 2M[ of the first and second
+ #    extents, respectively, corresponding to the file ranges [0, 1M[
+ # and [3M, 5M[. This one was added during this last fsync that uses
+ # the slow code path and overlaps with the previous one logged by
+ # the previous fast fsync.
+ #
+ # This happens because when logging the checksums for the second
+ # extent, we notice they start at an offset that matches the end of the
+ # checksums item that we logged for the first extent, and because both
+ # extents are contiguous on disk, btrfs_csum_file_blocks() decides to
+ # extend that existing checksums item and append the checksums for the
+ # second extent to this item. The end result is we end up with two
+ # checksum items in the log tree that have overlapping ranges, as
+ # listed before, resulting in the fsync to fail with -EIO and aborting
+ # the transaction, turning the filesystem into RO mode.
+ #
+ $ xfs_io -c "pwrite -S 0xff 0 1M" \
+ -c "truncate 5M" \
+ -c "fsync" \
+ /mnt/foobar
+ fsync: Input/output error
+
+After running the example, dmesg/syslog shows the tree checker complained
+about the checksum items with overlapping ranges and we aborted the
+transaction:
+
+ $ dmesg
+ (...)
+ [756289.557487] BTRFS critical (device sdc): corrupt leaf: root=18446744073709551610 block=30720000 slot=5, csum end range (16777216) goes beyond the start range (15728640) of the next csum item
+ [756289.560583] BTRFS info (device sdc): leaf 30720000 gen 7 total ptrs 7 free space 11677 owner 18446744073709551610
+ [756289.562435] BTRFS info (device sdc): refs 2 lock_owner 0 current 2303929
+ [756289.563654] item 0 key (257 1 0) itemoff 16123 itemsize 160
+ [756289.564649] inode generation 6 size 5242880 mode 100600
+ [756289.565636] item 1 key (257 12 256) itemoff 16107 itemsize 16
+ [756289.566694] item 2 key (257 108 0) itemoff 16054 itemsize 53
+ [756289.567725] extent data disk bytenr 13631488 nr 1048576
+ [756289.568697] extent data offset 0 nr 1048576 ram 1048576
+ [756289.569689] item 3 key (257 108 1048576) itemoff 16001 itemsize 53
+ [756289.570682] extent data disk bytenr 0 nr 0
+ [756289.571363] extent data offset 0 nr 2097152 ram 2097152
+ [756289.572213] item 4 key (257 108 3145728) itemoff 15948 itemsize 53
+ [756289.573246] extent data disk bytenr 14680064 nr 3145728
+ [756289.574121] extent data offset 0 nr 2097152 ram 3145728
+ [756289.574993] item 5 key (18446744073709551606 128 13631488) itemoff 12876 itemsize 3072
+ [756289.576113] item 6 key (18446744073709551606 128 15728640) itemoff 11852 itemsize 1024
+ [756289.577286] BTRFS error (device sdc): block=30720000 write time tree block corruption detected
+ [756289.578644] ------------[ cut here ]------------
+ [756289.579376] WARNING: CPU: 0 PID: 2303929 at fs/btrfs/disk-io.c:465 csum_one_extent_buffer+0xed/0x100 [btrfs]
+ [756289.580857] Modules linked in: btrfs dm_zero dm_dust loop dm_snapshot (...)
+ [756289.591534] CPU: 0 PID: 2303929 Comm: xfs_io Tainted: G W 5.12.0-rc8-btrfs-next-87 #1
+ [756289.592580] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
+ [756289.594161] RIP: 0010:csum_one_extent_buffer+0xed/0x100 [btrfs]
+ [756289.595122] Code: 5d c3 e8 76 60 (...)
+ [756289.597509] RSP: 0018:ffffb51b416cb898 EFLAGS: 00010282
+ [756289.598142] RAX: 0000000000000000 RBX: fffff02b8a365bc0 RCX: 0000000000000000
+ [756289.598970] RDX: 0000000000000000 RSI: ffffffffa9112421 RDI: 00000000ffffffff
+ [756289.599798] RBP: ffffa06500880000 R08: 0000000000000000 R09: 0000000000000000
+ [756289.600619] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
+ [756289.601456] R13: ffffa0652b1d8980 R14: ffffa06500880000 R15: 0000000000000000
+ [756289.602278] FS: 00007f08b23c9800(0000) GS:ffffa0682be00000(0000) knlGS:0000000000000000
+ [756289.603217] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ [756289.603892] CR2: 00005652f32d0138 CR3: 000000025d616003 CR4: 0000000000370ef0
+ [756289.604725] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+ [756289.605563] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+ [756289.606400] Call Trace:
+ [756289.606704] btree_csum_one_bio+0x244/0x2b0 [btrfs]
+ [756289.607313] btrfs_submit_metadata_bio+0xb7/0x100 [btrfs]
+ [756289.608040] submit_one_bio+0x61/0x70 [btrfs]
+ [756289.608587] btree_write_cache_pages+0x587/0x610 [btrfs]
+ [756289.609258] ? free_debug_processing+0x1d5/0x240
+ [756289.609812] ? __module_address+0x28/0xf0
+ [756289.610298] ? lock_acquire+0x1a0/0x3e0
+ [756289.610754] ? lock_acquired+0x19f/0x430
+ [756289.611220] ? lock_acquire+0x1a0/0x3e0
+ [756289.611675] do_writepages+0x43/0xf0
+ [756289.612101] ? __filemap_fdatawrite_range+0xa4/0x100
+ [756289.612800] __filemap_fdatawrite_range+0xc5/0x100
+ [756289.613393] btrfs_write_marked_extents+0x68/0x160 [btrfs]
+ [756289.614085] btrfs_sync_log+0x21c/0xf20 [btrfs]
+ [756289.614661] ? finish_wait+0x90/0x90
+ [756289.615096] ? __mutex_unlock_slowpath+0x45/0x2a0
+ [756289.615661] ? btrfs_log_inode_parent+0x3c9/0xdc0 [btrfs]
+ [756289.616338] ? lock_acquire+0x1a0/0x3e0
+ [756289.616801] ? lock_acquired+0x19f/0x430
+ [756289.617284] ? lock_acquire+0x1a0/0x3e0
+ [756289.617750] ? lock_release+0x214/0x470
+ [756289.618221] ? lock_acquired+0x19f/0x430
+ [756289.618704] ? dput+0x20/0x4a0
+ [756289.619079] ? dput+0x20/0x4a0
+ [756289.619452] ? lockref_put_or_lock+0x9/0x30
+ [756289.619969] ? lock_release+0x214/0x470
+ [756289.620445] ? lock_release+0x214/0x470
+ [756289.620924] ? lock_release+0x214/0x470
+ [756289.621415] btrfs_sync_file+0x46a/0x5b0 [btrfs]
+ [756289.621982] do_fsync+0x38/0x70
+ [756289.622395] __x64_sys_fsync+0x10/0x20
+ [756289.622907] do_syscall_64+0x33/0x80
+ [756289.623438] entry_SYSCALL_64_after_hwframe+0x44/0xae
+ [756289.624063] RIP: 0033:0x7f08b27fbb7b
+ [756289.624588] Code: 0f 05 48 3d 00 (...)
+ [756289.626760] RSP: 002b:00007ffe2583f940 EFLAGS: 00000293 ORIG_RAX: 000000000000004a
+ [756289.627639] RAX: ffffffffffffffda RBX: 00005652f32cd0f0 RCX: 00007f08b27fbb7b
+ [756289.628464] RDX: 00005652f32cbca0 RSI: 00005652f32cd110 RDI: 0000000000000003
+ [756289.629323] RBP: 00005652f32cd110 R08: 0000000000000000 R09: 00007f08b28c4be0
+ [756289.630172] R10: fffffffffffff39a R11: 0000000000000293 R12: 0000000000000001
+ [756289.631007] R13: 00005652f32cd0f0 R14: 0000000000000001 R15: 00005652f32cc480
+ [756289.631819] irq event stamp: 0
+ [756289.632188] hardirqs last enabled at (0): [<0000000000000000>] 0x0
+ [756289.632911] hardirqs last disabled at (0): [<ffffffffa7e97c29>] copy_process+0x879/0x1cc0
+ [756289.633893] softirqs last enabled at (0): [<ffffffffa7e97c29>] copy_process+0x879/0x1cc0
+ [756289.634871] softirqs last disabled at (0): [<0000000000000000>] 0x0
+ [756289.635606] ---[ end trace 0a039fdc16ff3fef ]---
+ [756289.636179] BTRFS: error (device sdc) in btrfs_sync_log:3136: errno=-5 IO failure
+ [756289.637082] BTRFS info (device sdc): forced readonly
+
+Having checksum items covering ranges that overlap is dangerous as in some
+cases it can lead to having extent ranges for which we miss checksums
+after log replay or getting the wrong checksum item. There were some fixes
+in the past for bugs that resulted in this problem, and were explained and
+fixed by the following commits:
+
+ 27b9a8122ff71a ("Btrfs: fix csum tree corruption, duplicate and outdated checksums")
+ b84b8390d6009c ("Btrfs: fix file read corruption after extent cloning and fsync")
+ 40e046acbd2f36 ("Btrfs: fix missing data checksums after replaying a log tree")
+ e289f03ea79bbc ("btrfs: fix corrupt log due to concurrent fsync of inodes with shared extents")
+
+Fix the issue by making btrfs_csum_file_blocks() taking into account the
+start offset of the next checksum item when it decides to extend an
+existing checksum item, so that it never extends the checksum to end at a
+range that goes beyond the start range of the next checksum item.
+
+When we can not access the next checksum item without releasing the path,
+simply drop the optimization of extending the previous checksum item and
+fallback to inserting a new checksum item - this happens rarely and the
+optimization is not significant enough for a log tree in order to justify
+the extra complexity, as it would only save a few bytes (the size of a
+struct btrfs_item) of leaf space.
+
+This behaviour is only needed when inserting into a log tree because
+for the regular checksums tree we never have a case where we try to
+insert a range of checksums that overlap with a range that was previously
+inserted.
+
+A test case for fstests will follow soon.
+
+Reported-by: Philipp Fent <fent@in.tum.de>
+Link: https://lore.kernel.org/linux-btrfs/93c4600e-5263-5cba-adf0-6f47526e7561@in.tum.de/
+CC: stable@vger.kernel.org # 5.4+
+Tested-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/file-item.c | 98 +++++++++++++++++++++++++++++++++++++++------------
+ 1 file changed, 76 insertions(+), 22 deletions(-)
+
+--- a/fs/btrfs/file-item.c
++++ b/fs/btrfs/file-item.c
+@@ -922,6 +922,37 @@ int btrfs_del_csums(struct btrfs_trans_h
+ return ret;
+ }
+
++static int find_next_csum_offset(struct btrfs_root *root,
++ struct btrfs_path *path,
++ u64 *next_offset)
++{
++ const u32 nritems = btrfs_header_nritems(path->nodes[0]);
++ struct btrfs_key found_key;
++ int slot = path->slots[0] + 1;
++ int ret;
++
++ if (nritems == 0 || slot >= nritems) {
++ ret = btrfs_next_leaf(root, path);
++ if (ret < 0) {
++ return ret;
++ } else if (ret > 0) {
++ *next_offset = (u64)-1;
++ return 0;
++ }
++ slot = path->slots[0];
++ }
++
++ btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
++
++ if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
++ found_key.type != BTRFS_EXTENT_CSUM_KEY)
++ *next_offset = (u64)-1;
++ else
++ *next_offset = found_key.offset;
++
++ return 0;
++}
++
+ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_ordered_sum *sums)
+@@ -937,7 +968,6 @@ int btrfs_csum_file_blocks(struct btrfs_
+ u64 total_bytes = 0;
+ u64 csum_offset;
+ u64 bytenr;
+- u32 nritems;
+ u32 ins_size;
+ int index = 0;
+ int found_next;
+@@ -980,26 +1010,10 @@ again:
+ goto insert;
+ }
+ } else {
+- int slot = path->slots[0] + 1;
+- /* we didn't find a csum item, insert one */
+- nritems = btrfs_header_nritems(path->nodes[0]);
+- if (!nritems || (path->slots[0] >= nritems - 1)) {
+- ret = btrfs_next_leaf(root, path);
+- if (ret < 0) {
+- goto out;
+- } else if (ret > 0) {
+- found_next = 1;
+- goto insert;
+- }
+- slot = path->slots[0];
+- }
+- btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
+- if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+- found_key.type != BTRFS_EXTENT_CSUM_KEY) {
+- found_next = 1;
+- goto insert;
+- }
+- next_offset = found_key.offset;
++ /* We didn't find a csum item, insert one. */
++ ret = find_next_csum_offset(root, path, &next_offset);
++ if (ret < 0)
++ goto out;
+ found_next = 1;
+ goto insert;
+ }
+@@ -1055,8 +1069,48 @@ extend_csum:
+ tmp = sums->len - total_bytes;
+ tmp >>= fs_info->sectorsize_bits;
+ WARN_ON(tmp < 1);
++ extend_nr = max_t(int, 1, tmp);
++
++ /*
++ * A log tree can already have checksum items with a subset of
++ * the checksums we are trying to log. This can happen after
++ * doing a sequence of partial writes into prealloc extents and
++ * fsyncs in between, with a full fsync logging a larger subrange
++ * of an extent for which a previous fast fsync logged a smaller
++ * subrange. And this happens in particular due to merging file
++ * extent items when we complete an ordered extent for a range
++ * covered by a prealloc extent - this is done at
++ * btrfs_mark_extent_written().
++ *
++ * So if we try to extend the previous checksum item, which has
++ * a range that ends at the start of the range we want to insert,
++ * make sure we don't extend beyond the start offset of the next
++ * checksum item. If we are at the last item in the leaf, then
++ * forget the optimization of extending and add a new checksum
++ * item - it is not worth the complexity of releasing the path,
++ * getting the first key for the next leaf, repeat the btree
++ * search, etc, because log trees are temporary anyway and it
++ * would only save a few bytes of leaf space.
++ */
++ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
++ if (path->slots[0] + 1 >=
++ btrfs_header_nritems(path->nodes[0])) {
++ ret = find_next_csum_offset(root, path, &next_offset);
++ if (ret < 0)
++ goto out;
++ found_next = 1;
++ goto insert;
++ }
++
++ ret = find_next_csum_offset(root, path, &next_offset);
++ if (ret < 0)
++ goto out;
++
++ tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits;
++ if (tmp <= INT_MAX)
++ extend_nr = min_t(int, extend_nr, tmp);
++ }
+
+- extend_nr = max_t(int, 1, (int)tmp);
+ diff = (csum_offset + extend_nr) * csum_size;
+ diff = min(diff,
+ MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size);
--- /dev/null
+From 011b28acf940eb61c000059dd9e2cfcbf52ed96b Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Wed, 19 May 2021 13:13:15 -0400
+Subject: btrfs: fixup error handling in fixup_inode_link_counts
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit 011b28acf940eb61c000059dd9e2cfcbf52ed96b upstream.
+
+This function has the following pattern
+
+ while (1) {
+ ret = whatever();
+ if (ret)
+ goto out;
+ }
+ ret = 0
+out:
+ return ret;
+
+However several places in this while loop we simply break; when there's
+a problem, thus clearing the return value, and in one case we do a
+return -EIO, and leak the memory for the path.
+
+Fix this by re-arranging the loop to deal with ret == 1 coming from
+btrfs_search_slot, and then simply delete the
+
+ ret = 0;
+out:
+
+bit so everybody can break if there is an error, which will allow for
+proper error handling to occur.
+
+CC: stable@vger.kernel.org # 4.4+
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c | 13 +++++++------
+ 1 file changed, 7 insertions(+), 6 deletions(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -1791,6 +1791,7 @@ static noinline int fixup_inode_link_cou
+ break;
+
+ if (ret == 1) {
++ ret = 0;
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+@@ -1803,17 +1804,19 @@ static noinline int fixup_inode_link_cou
+
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+- goto out;
++ break;
+
+ btrfs_release_path(path);
+ inode = read_one_inode(root, key.offset);
+- if (!inode)
+- return -EIO;
++ if (!inode) {
++ ret = -EIO;
++ break;
++ }
+
+ ret = fixup_inode_link_count(trans, root, inode);
+ iput(inode);
+ if (ret)
+- goto out;
++ break;
+
+ /*
+ * fixup on a directory may create new entries,
+@@ -1822,8 +1825,6 @@ static noinline int fixup_inode_link_cou
+ */
+ key.offset = (u64)-1;
+ }
+- ret = 0;
+-out:
+ btrfs_release_path(path);
+ return ret;
+ }
--- /dev/null
+From d61bec08b904cf171835db98168f82bc338e92e4 Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Wed, 19 May 2021 09:38:27 -0400
+Subject: btrfs: mark ordered extent and inode with error if we fail to finish
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit d61bec08b904cf171835db98168f82bc338e92e4 upstream.
+
+While doing error injection testing I saw that sometimes we'd get an
+abort that wouldn't stop the current transaction commit from completing.
+This abort was coming from finish ordered IO, but at this point in the
+transaction commit we should have gotten an error and stopped.
+
+It turns out the abort came from finish ordered io while trying to write
+out the free space cache. It occurred to me that any failure inside of
+finish_ordered_io isn't actually raised to the person doing the writing,
+so we could have any number of failures in this path and think the
+ordered extent completed successfully and the inode was fine.
+
+Fix this by marking the ordered extent with BTRFS_ORDERED_IOERR, and
+marking the mapping of the inode with mapping_set_error, so any callers
+that simply call fdatawait will also get the error.
+
+With this we're seeing the IO error on the free space inode when we fail
+to do the finish_ordered_io.
+
+CC: stable@vger.kernel.org # 4.19+
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/inode.c | 12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -3011,6 +3011,18 @@ out:
+ if (ret || truncated) {
+ u64 unwritten_start = start;
+
++ /*
++ * If we failed to finish this ordered extent for any reason we
++ * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
++ * extent, and mark the inode with the error if it wasn't
++ * already set. Any error during writeback would have already
++ * set the mapping error, so we need to set it if we're the ones
++ * marking this ordered extent as failed.
++ */
++ if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
++ &ordered_extent->flags))
++ mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
++
+ if (truncated)
+ unwritten_start += logical_len;
+ clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
--- /dev/null
+From 856bd270dc4db209c779ce1e9555c7641ffbc88e Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Wed, 19 May 2021 10:52:46 -0400
+Subject: btrfs: return errors from btrfs_del_csums in cleanup_ref_head
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit 856bd270dc4db209c779ce1e9555c7641ffbc88e upstream.
+
+We are unconditionally returning 0 in cleanup_ref_head, despite the fact
+that btrfs_del_csums could fail. We need to return the error so the
+transaction gets aborted properly, fix this by returning ret from
+btrfs_del_csums in cleanup_ref_head.
+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+CC: stable@vger.kernel.org # 4.19+
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent-tree.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -1868,7 +1868,7 @@ static int cleanup_ref_head(struct btrfs
+ trace_run_delayed_ref_head(fs_info, head, 0);
+ btrfs_delayed_ref_unlock(head);
+ btrfs_put_delayed_ref_head(head);
+- return 0;
++ return ret;
+ }
+
+ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
--- /dev/null
+From 74b2fc882d380d8fafc2a26f01d401c2a7beeadb Mon Sep 17 00:00:00 2001
+From: Borislav Petkov <bp@suse.de>
+Date: Wed, 2 Jun 2021 12:07:52 +0200
+Subject: dmaengine: idxd: Use cpu_feature_enabled()
+
+From: Borislav Petkov <bp@suse.de>
+
+commit 74b2fc882d380d8fafc2a26f01d401c2a7beeadb upstream.
+
+When testing x86 feature bits, use cpu_feature_enabled() so that
+build-disabled features can remain off, regardless of what CPUID says.
+
+Fixes: 8e50d392652f ("dmaengine: idxd: Add shared workqueue support")
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-By: Vinod Koul <vkoul@kernel.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/dma/idxd/init.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/dma/idxd/init.c
++++ b/drivers/dma/idxd/init.c
+@@ -675,12 +675,12 @@ static int __init idxd_init_module(void)
+ * If the CPU does not support MOVDIR64B or ENQCMDS, there's no point in
+ * enumerating the device. We can not utilize it.
+ */
+- if (!boot_cpu_has(X86_FEATURE_MOVDIR64B)) {
++ if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
+ pr_warn("idxd driver failed to load without MOVDIR64B.\n");
+ return -ENODEV;
+ }
+
+- if (!boot_cpu_has(X86_FEATURE_ENQCMD))
++ if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
+ pr_warn("Platform does not have ENQCMD(S) support.\n");
+ else
+ support_enqcmd = true;
--- /dev/null
+From dce3d8e1d070900e0feeb06787a319ff9379212c Mon Sep 17 00:00:00 2001
+From: Luben Tuikov <luben.tuikov@amd.com>
+Date: Wed, 12 May 2021 12:33:23 -0400
+Subject: drm/amdgpu: Don't query CE and UE errors
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Luben Tuikov <luben.tuikov@amd.com>
+
+commit dce3d8e1d070900e0feeb06787a319ff9379212c upstream.
+
+On QUERY2 IOCTL don't query counts of correctable
+and uncorrectable errors, since when RAS is
+enabled and supported on Vega20 server boards,
+this takes insurmountably long time, in O(n^3),
+which slows the system down to the point of it
+being unusable when we have GUI up.
+
+Fixes: ae363a212b14 ("drm/amdgpu: Add a new flag to AMDGPU_CTX_OP_QUERY_STATE2")
+Cc: Alexander Deucher <Alexander.Deucher@amd.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
+Reviewed-by: Alexander Deucher <Alexander.Deucher@amd.com>
+Reviewed-by: Christian König <christian.koenig@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 16 ----------------
+ 1 file changed, 16 deletions(-)
+
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+@@ -337,7 +337,6 @@ static int amdgpu_ctx_query2(struct amdg
+ {
+ struct amdgpu_ctx *ctx;
+ struct amdgpu_ctx_mgr *mgr;
+- unsigned long ras_counter;
+
+ if (!fpriv)
+ return -EINVAL;
+@@ -362,21 +361,6 @@ static int amdgpu_ctx_query2(struct amdg
+ if (atomic_read(&ctx->guilty))
+ out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
+
+- /*query ue count*/
+- ras_counter = amdgpu_ras_query_error_count(adev, false);
+- /*ras counter is monotonic increasing*/
+- if (ras_counter != ctx->ras_counter_ue) {
+- out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
+- ctx->ras_counter_ue = ras_counter;
+- }
+-
+- /*query ce count*/
+- ras_counter = amdgpu_ras_query_error_count(adev, true);
+- if (ras_counter != ctx->ras_counter_ce) {
+- out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
+- ctx->ras_counter_ce = ras_counter;
+- }
+-
+ mutex_unlock(&mgr->lock);
+ return 0;
+ }
--- /dev/null
+From 07438603a07e52f1c6aa731842bd298d2725b7be Mon Sep 17 00:00:00 2001
+From: Nirmoy Das <nirmoy.das@amd.com>
+Date: Fri, 28 May 2021 16:54:16 +0200
+Subject: drm/amdgpu: make sure we unpin the UVD BO
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Nirmoy Das <nirmoy.das@amd.com>
+
+commit 07438603a07e52f1c6aa731842bd298d2725b7be upstream.
+
+Releasing pinned BOs is illegal now. UVD 6 was missing from:
+commit 2f40801dc553 ("drm/amdgpu: make sure we unpin the UVD BO")
+
+Fixes: 2f40801dc553 ("drm/amdgpu: make sure we unpin the UVD BO")
+Cc: stable@vger.kernel.org
+Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
+Reviewed-by: Christian König <christian.koenig@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
++++ b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
+@@ -357,6 +357,7 @@ static int uvd_v6_0_enc_ring_test_ib(str
+
+ error:
+ dma_fence_put(fence);
++ amdgpu_bo_unpin(bo);
+ amdgpu_bo_unreserve(bo);
+ amdgpu_bo_unref(&bo);
+ return r;
--- /dev/null
+From 1438709e6328925ef496dafd467dbd0353137434 Mon Sep 17 00:00:00 2001
+From: Nicholas Piggin <npiggin@gmail.com>
+Date: Wed, 26 May 2021 22:58:51 +1000
+Subject: KVM: PPC: Book3S HV: Save host FSCR in the P7/8 path
+
+From: Nicholas Piggin <npiggin@gmail.com>
+
+commit 1438709e6328925ef496dafd467dbd0353137434 upstream.
+
+Similar to commit 25edcc50d76c ("KVM: PPC: Book3S HV: Save and restore
+FSCR in the P9 path"), ensure the P7/8 path saves and restores the host
+FSCR. The logic explained in that patch actually applies there to the
+old path as well: a context switch can be made before kvmppc_vcpu_run_hv
+restores the host FSCR and returns.
+
+Now that both the p9 and the p7/8 paths save and restore their FSCR, it
+no longer needs to be restored at the end of kvmppc_vcpu_run_hv.
+
+Fixes: b005255e12a3 ("KVM: PPC: Book3S HV: Context-switch new POWER8 SPRs")
+Cc: stable@vger.kernel.org # v3.14+
+Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
+Reviewed-by: Fabiano Rosas <farosas@linux.ibm.com>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/20210526125851.3436735-1-npiggin@gmail.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/kvm/book3s_hv.c | 1 -
+ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 7 +++++++
+ 2 files changed, 7 insertions(+), 1 deletion(-)
+
+--- a/arch/powerpc/kvm/book3s_hv.c
++++ b/arch/powerpc/kvm/book3s_hv.c
+@@ -4418,7 +4418,6 @@ static int kvmppc_vcpu_run_hv(struct kvm
+ mtspr(SPRN_EBBRR, ebb_regs[1]);
+ mtspr(SPRN_BESCR, ebb_regs[2]);
+ mtspr(SPRN_TAR, user_tar);
+- mtspr(SPRN_FSCR, current->thread.fscr);
+ }
+ mtspr(SPRN_VRSAVE, user_vrsave);
+
+--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
++++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+@@ -59,6 +59,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+ #define STACK_SLOT_UAMOR (SFS-88)
+ #define STACK_SLOT_DAWR1 (SFS-96)
+ #define STACK_SLOT_DAWRX1 (SFS-104)
++#define STACK_SLOT_FSCR (SFS-112)
+ /* the following is used by the P9 short path */
+ #define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */
+
+@@ -686,6 +687,8 @@ BEGIN_FTR_SECTION
+ std r6, STACK_SLOT_DAWR0(r1)
+ std r7, STACK_SLOT_DAWRX0(r1)
+ std r8, STACK_SLOT_IAMR(r1)
++ mfspr r5, SPRN_FSCR
++ std r5, STACK_SLOT_FSCR(r1)
+ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+ BEGIN_FTR_SECTION
+ mfspr r6, SPRN_DAWR1
+@@ -1663,6 +1666,10 @@ FTR_SECTION_ELSE
+ ld r7, STACK_SLOT_HFSCR(r1)
+ mtspr SPRN_HFSCR, r7
+ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
++BEGIN_FTR_SECTION
++ ld r5, STACK_SLOT_FSCR(r1)
++ mtspr SPRN_FSCR, r5
++END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+ /*
+ * Restore various registers to 0, where non-zero values
+ * set by the guest could disrupt the host.
--- /dev/null
+From 04f7ce3f07ce39b1a3ca03a56b238a53acc52cfd Mon Sep 17 00:00:00 2001
+From: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
+Date: Fri, 4 Jun 2021 20:01:18 -0700
+Subject: mm/debug_vm_pgtable: fix alignment for pmd/pud_advanced_tests()
+
+From: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
+
+commit 04f7ce3f07ce39b1a3ca03a56b238a53acc52cfd upstream.
+
+In pmd/pud_advanced_tests(), the vaddr is aligned up to the next pmd/pud
+entry, and so it does not match the given pmdp/pudp and (aligned down)
+pfn any more.
+
+For s390, this results in memory corruption, because the IDTE
+instruction used e.g. in xxx_get_and_clear() will take the vaddr for
+some calculations, in combination with the given pmdp. It will then end
+up with a wrong table origin, ending on ...ff8, and some of those
+wrongly set low-order bits will also select a wrong pagetable level for
+the index addition. IDTE could therefore invalidate (or 0x20) something
+outside of the page tables, depending on the wrongly picked index, which
+in turn depends on the random vaddr.
+
+As result, we sometimes see "BUG task_struct (Not tainted): Padding
+overwritten" on s390, where one 0x5a padding value got overwritten with
+0x7a.
+
+Fix this by aligning down, similar to how the pmd/pud_aligned pfns are
+calculated.
+
+Link: https://lkml.kernel.org/r/20210525130043.186290-2-gerald.schaefer@linux.ibm.com
+Fixes: a5c3b9ffb0f40 ("mm/debug_vm_pgtable: add tests validating advanced arch page table helpers")
+Signed-off-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
+Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
+Cc: Vineet Gupta <vgupta@synopsys.com>
+Cc: Palmer Dabbelt <palmer@dabbelt.com>
+Cc: Paul Walmsley <paul.walmsley@sifive.com>
+Cc: <stable@vger.kernel.org> [5.9+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/debug_vm_pgtable.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/mm/debug_vm_pgtable.c
++++ b/mm/debug_vm_pgtable.c
+@@ -192,7 +192,7 @@ static void __init pmd_advanced_tests(st
+
+ pr_debug("Validating PMD advanced\n");
+ /* Align the address wrt HPAGE_PMD_SIZE */
+- vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
++ vaddr &= HPAGE_PMD_MASK;
+
+ pgtable_trans_huge_deposit(mm, pmdp, pgtable);
+
+@@ -330,7 +330,7 @@ static void __init pud_advanced_tests(st
+
+ pr_debug("Validating PUD advanced\n");
+ /* Align the address wrt HPAGE_PUD_SIZE */
+- vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE;
++ vaddr &= HPAGE_PUD_MASK;
+
+ set_pud_at(mm, vaddr, pudp, pud);
+ pudp_set_wrprotect(mm, vaddr, pudp);
--- /dev/null
+From bac9c6fa1f929213bbd0ac9cdf21e8e2f0916828 Mon Sep 17 00:00:00 2001
+From: Ding Hui <dinghui@sangfor.com.cn>
+Date: Fri, 4 Jun 2021 20:01:21 -0700
+Subject: mm/page_alloc: fix counting of free pages after take off from buddy
+
+From: Ding Hui <dinghui@sangfor.com.cn>
+
+commit bac9c6fa1f929213bbd0ac9cdf21e8e2f0916828 upstream.
+
+Recently we found that there is a lot MemFree left in /proc/meminfo
+after do a lot of pages soft offline, it's not quite correct.
+
+Before Oscar's rework of soft offline for free pages [1], if we soft
+offline free pages, these pages are left in buddy with HWPoison flag,
+and NR_FREE_PAGES is not updated immediately. So the difference between
+NR_FREE_PAGES and the real number of available free pages can also be big
+at the beginning.
+
+However, with the workload running, when we catch HWPoison page in any
+alloc functions subsequently, we will remove it from buddy, meanwhile
+update the NR_FREE_PAGES and try again, so the NR_FREE_PAGES will get
+more and more closer to the real number of available free pages.
+(regardless of unpoison_memory())
+
+Now, for offline free pages, after a successful call
+take_page_off_buddy(), the page is no longer belong to buddy allocator,
+and will not be used any more, but we missed accounting NR_FREE_PAGES in
+this situation, and there is no chance to be updated later.
+
+Do update in take_page_off_buddy() like rmqueue() does, but avoid double
+counting if some one already set_migratetype_isolate() on the page.
+
+[1]: commit 06be6ff3d2ec ("mm,hwpoison: rework soft offline for free pages")
+
+Link: https://lkml.kernel.org/r/20210526075247.11130-1-dinghui@sangfor.com.cn
+Fixes: 06be6ff3d2ec ("mm,hwpoison: rework soft offline for free pages")
+Signed-off-by: Ding Hui <dinghui@sangfor.com.cn>
+Suggested-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Reviewed-by: Oscar Salvador <osalvador@suse.de>
+Acked-by: David Hildenbrand <david@redhat.com>
+Acked-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/page_alloc.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -8951,6 +8951,8 @@ bool take_page_off_buddy(struct page *pa
+ del_page_from_free_list(page_head, zone, page_order);
+ break_down_buddy_pages(zone, page_head, page, 0,
+ page_order, migratetype);
++ if (!is_migrate_isolate(migratetype))
++ __mod_zone_freepage_state(zone, -1, migratetype);
+ ret = true;
+ break;
+ }
--- /dev/null
+From 4ac06a1e013cf5fdd963317ffd3b968560f33bba Mon Sep 17 00:00:00 2001
+From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
+Date: Mon, 31 May 2021 09:21:38 +0200
+Subject: nfc: fix NULL ptr dereference in llcp_sock_getname() after failed connect
+
+From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
+
+commit 4ac06a1e013cf5fdd963317ffd3b968560f33bba upstream.
+
+It's possible to trigger NULL pointer dereference by local unprivileged
+user, when calling getsockname() after failed bind() (e.g. the bind
+fails because LLCP_SAP_MAX used as SAP):
+
+ BUG: kernel NULL pointer dereference, address: 0000000000000000
+ CPU: 1 PID: 426 Comm: llcp_sock_getna Not tainted 5.13.0-rc2-next-20210521+ #9
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1 04/01/2014
+ Call Trace:
+ llcp_sock_getname+0xb1/0xe0
+ __sys_getpeername+0x95/0xc0
+ ? lockdep_hardirqs_on_prepare+0xd5/0x180
+ ? syscall_enter_from_user_mode+0x1c/0x40
+ __x64_sys_getpeername+0x11/0x20
+ do_syscall_64+0x36/0x70
+ entry_SYSCALL_64_after_hwframe+0x44/0xae
+
+This can be reproduced with Syzkaller C repro (bind followed by
+getpeername):
+https://syzkaller.appspot.com/x/repro.c?x=14def446e00000
+
+Cc: <stable@vger.kernel.org>
+Fixes: d646960f7986 ("NFC: Initial LLCP support")
+Reported-by: syzbot+80fb126e7f7d8b1a5914@syzkaller.appspotmail.com
+Reported-by: butt3rflyh4ck <butterflyhuangxx@gmail.com>
+Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
+Link: https://lore.kernel.org/r/20210531072138.5219-1-krzysztof.kozlowski@canonical.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/nfc/llcp_sock.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/net/nfc/llcp_sock.c
++++ b/net/nfc/llcp_sock.c
+@@ -110,6 +110,7 @@ static int llcp_sock_bind(struct socket
+ if (!llcp_sock->service_name) {
+ nfc_llcp_local_put(llcp_sock->local);
+ llcp_sock->local = NULL;
++ llcp_sock->dev = NULL;
+ ret = -ENOMEM;
+ goto put_dev;
+ }
+@@ -119,6 +120,7 @@ static int llcp_sock_bind(struct socket
+ llcp_sock->local = NULL;
+ kfree(llcp_sock->service_name);
+ llcp_sock->service_name = NULL;
++ llcp_sock->dev = NULL;
+ ret = -EADDRINUSE;
+ goto put_dev;
+ }
--- /dev/null
+From 6bba4471f0cc1296fe3c2089b9e52442d3074b2e Mon Sep 17 00:00:00 2001
+From: Junxiao Bi <junxiao.bi@oracle.com>
+Date: Fri, 4 Jun 2021 20:01:42 -0700
+Subject: ocfs2: fix data corruption by fallocate
+
+From: Junxiao Bi <junxiao.bi@oracle.com>
+
+commit 6bba4471f0cc1296fe3c2089b9e52442d3074b2e upstream.
+
+When fallocate punches holes out of inode size, if original isize is in
+the middle of last cluster, then the part from isize to the end of the
+cluster will be zeroed with buffer write, at that time isize is not yet
+updated to match the new size, if writeback is kicked in, it will invoke
+ocfs2_writepage()->block_write_full_page() where the pages out of inode
+size will be dropped. That will cause file corruption. Fix this by
+zeroing out eof blocks when extending the inode size.
+
+Running the following command with qemu-image 4.2.1 can get a corrupted
+converted image file easily.
+
+ qemu-img convert -p -t none -T none -f qcow2 $qcow_image \
+ -O qcow2 -o compat=1.1 $qcow_image.conv
+
+The usage of fallocate in qemu is like this, it first punches holes out
+of inode size, then extend the inode size.
+
+ fallocate(11, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 2276196352, 65536) = 0
+ fallocate(11, 0, 2276196352, 65536) = 0
+
+v1: https://www.spinics.net/lists/linux-fsdevel/msg193999.html
+v2: https://lore.kernel.org/linux-fsdevel/20210525093034.GB4112@quack2.suse.cz/T/
+
+Link: https://lkml.kernel.org/r/20210528210648.9124-1-junxiao.bi@oracle.com
+Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
+Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
+Cc: Jan Kara <jack@suse.cz>
+Cc: Mark Fasheh <mark@fasheh.com>
+Cc: Joel Becker <jlbec@evilplan.org>
+Cc: Changwei Ge <gechangwei@live.cn>
+Cc: Gang He <ghe@suse.com>
+Cc: Jun Piao <piaojun@huawei.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ocfs2/file.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
+ 1 file changed, 50 insertions(+), 5 deletions(-)
+
+--- a/fs/ocfs2/file.c
++++ b/fs/ocfs2/file.c
+@@ -1858,6 +1858,45 @@ out:
+ }
+
+ /*
++ * zero out partial blocks of one cluster.
++ *
++ * start: file offset where zero starts, will be made upper block aligned.
++ * len: it will be trimmed to the end of current cluster if "start + len"
++ * is bigger than it.
++ */
++static int ocfs2_zeroout_partial_cluster(struct inode *inode,
++ u64 start, u64 len)
++{
++ int ret;
++ u64 start_block, end_block, nr_blocks;
++ u64 p_block, offset;
++ u32 cluster, p_cluster, nr_clusters;
++ struct super_block *sb = inode->i_sb;
++ u64 end = ocfs2_align_bytes_to_clusters(sb, start);
++
++ if (start + len < end)
++ end = start + len;
++
++ start_block = ocfs2_blocks_for_bytes(sb, start);
++ end_block = ocfs2_blocks_for_bytes(sb, end);
++ nr_blocks = end_block - start_block;
++ if (!nr_blocks)
++ return 0;
++
++ cluster = ocfs2_bytes_to_clusters(sb, start);
++ ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
++ &nr_clusters, NULL);
++ if (ret)
++ return ret;
++ if (!p_cluster)
++ return 0;
++
++ offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
++ p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
++ return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
++}
++
++/*
+ * Parts of this function taken from xfs_change_file_space()
+ */
+ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
+@@ -1867,7 +1906,7 @@ static int __ocfs2_change_file_space(str
+ {
+ int ret;
+ s64 llen;
+- loff_t size;
++ loff_t size, orig_isize;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *di_bh = NULL;
+ handle_t *handle;
+@@ -1898,6 +1937,7 @@ static int __ocfs2_change_file_space(str
+ goto out_inode_unlock;
+ }
+
++ orig_isize = i_size_read(inode);
+ switch (sr->l_whence) {
+ case 0: /*SEEK_SET*/
+ break;
+@@ -1905,7 +1945,7 @@ static int __ocfs2_change_file_space(str
+ sr->l_start += f_pos;
+ break;
+ case 2: /*SEEK_END*/
+- sr->l_start += i_size_read(inode);
++ sr->l_start += orig_isize;
+ break;
+ default:
+ ret = -EINVAL;
+@@ -1959,6 +1999,14 @@ static int __ocfs2_change_file_space(str
+ default:
+ ret = -EINVAL;
+ }
++
++ /* zeroout eof blocks in the cluster. */
++ if (!ret && change_size && orig_isize < size) {
++ ret = ocfs2_zeroout_partial_cluster(inode, orig_isize,
++ size - orig_isize);
++ if (!ret)
++ i_size_write(inode, size);
++ }
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ if (ret) {
+ mlog_errno(ret);
+@@ -1975,9 +2023,6 @@ static int __ocfs2_change_file_space(str
+ goto out_inode_unlock;
+ }
+
+- if (change_size && i_size_read(inode) < size)
+- i_size_write(inode, size);
+-
+ inode->i_ctime = inode->i_mtime = current_time(inode);
+ ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
+ if (ret < 0)
--- /dev/null
+From 82123a3d1d5a306fdf50c968a474cc60fe43a80f Mon Sep 17 00:00:00 2001
+From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
+Date: Wed, 19 May 2021 16:17:17 +0530
+Subject: powerpc/kprobes: Fix validation of prefixed instructions across page boundary
+
+From: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
+
+commit 82123a3d1d5a306fdf50c968a474cc60fe43a80f upstream.
+
+When checking if the probed instruction is the suffix of a prefixed
+instruction, we access the instruction at the previous word. If the
+probed instruction is the very first word of a module, we can end up
+trying to access an invalid page.
+
+Fix this by skipping the check for all instructions at the beginning of
+a page. Prefixed instructions cannot cross a 64-byte boundary and as
+such, we don't expect to encounter a suffix as the very first word in a
+page for kernel text. Even if there are prefixed instructions crossing
+a page boundary (from a module, for instance), the instruction will be
+illegal, so preventing probing on the suffix of such prefix instructions
+isn't worthwhile.
+
+Fixes: b4657f7650ba ("powerpc/kprobes: Don't allow breakpoints on suffixes")
+Cc: stable@vger.kernel.org # v5.8+
+Reported-by: Christophe Leroy <christophe.leroy@csgroup.eu>
+Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/0df9a032a05576a2fa8e97d1b769af2ff0eafbd6.1621416666.git.naveen.n.rao@linux.vnet.ibm.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/kernel/kprobes.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/powerpc/kernel/kprobes.c
++++ b/arch/powerpc/kernel/kprobes.c
+@@ -108,7 +108,6 @@ int arch_prepare_kprobe(struct kprobe *p
+ int ret = 0;
+ struct kprobe *prev;
+ struct ppc_inst insn = ppc_inst_read((struct ppc_inst *)p->addr);
+- struct ppc_inst prefix = ppc_inst_read((struct ppc_inst *)(p->addr - 1));
+
+ if ((unsigned long)p->addr & 0x03) {
+ printk("Attempt to register kprobe at an unaligned address\n");
+@@ -116,7 +115,8 @@ int arch_prepare_kprobe(struct kprobe *p
+ } else if (IS_MTMSRD(insn) || IS_RFID(insn) || IS_RFI(insn)) {
+ printk("Cannot register a kprobe on rfi/rfid or mtmsr[d]\n");
+ ret = -EINVAL;
+- } else if (ppc_inst_prefixed(prefix)) {
++ } else if ((unsigned long)p->addr & ~PAGE_MASK &&
++ ppc_inst_prefixed(ppc_inst_read((struct ppc_inst *)(p->addr - 1)))) {
+ printk("Cannot register a kprobe on the second word of prefixed instruction\n");
+ ret = -EINVAL;
+ }
--- /dev/null
+From 696770e72f2b42b92ea0a4a98087fb2ba376417a Mon Sep 17 00:00:00 2001
+From: James Smart <jsmart2021@gmail.com>
+Date: Fri, 28 May 2021 14:22:40 -0700
+Subject: scsi: lpfc: Fix failure to transmit ABTS on FC link
+
+From: James Smart <jsmart2021@gmail.com>
+
+commit 696770e72f2b42b92ea0a4a98087fb2ba376417a upstream.
+
+The abort_cmd_ia flag in an abort wqe describes whether an ABTS basic link
+service should be transmitted on the FC link or not. Code added in
+lpfc_sli4_issue_abort_iotag() set the abort_cmd_ia flag incorrectly,
+suppressing ABTS transmission.
+
+A previous LPFC change to build an abort wqe inverted prior logic that
+determined whether an ABTS was to be issued on the FC link.
+
+Revert this logic to its proper state.
+
+Link: https://lore.kernel.org/r/20210528212240.11387-1-jsmart2021@gmail.com
+Fixes: db7531d2b377 ("scsi: lpfc: Convert abort handling to SLI-3 and SLI-4 handlers")
+Cc: <stable@vger.kernel.org> # v5.11+
+Signed-off-by: James Smart <jsmart2021@gmail.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/scsi/lpfc/lpfc_sli.c | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+--- a/drivers/scsi/lpfc/lpfc_sli.c
++++ b/drivers/scsi/lpfc/lpfc_sli.c
+@@ -20591,10 +20591,8 @@ lpfc_sli4_issue_abort_iotag(struct lpfc_
+ abtswqe = &abtsiocb->wqe;
+ memset(abtswqe, 0, sizeof(*abtswqe));
+
+- if (lpfc_is_link_up(phba))
++ if (!lpfc_is_link_up(phba))
+ bf_set(abort_cmd_ia, &abtswqe->abort_cmd, 1);
+- else
+- bf_set(abort_cmd_ia, &abtswqe->abort_cmd, 0);
+ bf_set(abort_cmd_criteria, &abtswqe->abort_cmd, T_XRI_TAG);
+ abtswqe->abort_cmd.rsrvd5 = 0;
+ abtswqe->abort_cmd.wqe_com.abort_tag = xritag;
kfence-maximize-allocation-wait-timeout-duration.patch
kfence-use-task_idle-when-awaiting-allocation.patch
pid-take-a-reference-when-initializing-cad_pid.patch
+ocfs2-fix-data-corruption-by-fallocate.patch
+mm-debug_vm_pgtable-fix-alignment-for-pmd-pud_advanced_tests.patch
+mm-page_alloc-fix-counting-of-free-pages-after-take-off-from-buddy.patch
+scsi-lpfc-fix-failure-to-transmit-abts-on-fc-link.patch
+x86-cpufeatures-force-disable-x86_feature_enqcmd-and-remove-update_pasid.patch
+dmaengine-idxd-use-cpu_feature_enabled.patch
+x86-sev-check-sme-sev-support-in-cpuid-first.patch
+kvm-ppc-book3s-hv-save-host-fscr-in-the-p7-8-path.patch
+nfc-fix-null-ptr-dereference-in-llcp_sock_getname-after-failed-connect.patch
+drm-amdgpu-don-t-query-ce-and-ue-errors.patch
+drm-amdgpu-make-sure-we-unpin-the-uvd-bo.patch
+x86-apic-mark-_all_-legacy-interrupts-when-io-apic-is-missing.patch
+x86-thermal-fix-lvt-thermal-setup-for-smi-delivery-mode.patch
+powerpc-kprobes-fix-validation-of-prefixed-instructions-across-page-boundary.patch
+btrfs-mark-ordered-extent-and-inode-with-error-if-we-fail-to-finish.patch
+btrfs-fix-error-handling-in-btrfs_del_csums.patch
+btrfs-return-errors-from-btrfs_del_csums-in-cleanup_ref_head.patch
+btrfs-fix-fsync-failure-and-transaction-abort-after-writes-to-prealloc-extents.patch
+btrfs-check-error-value-from-btrfs_update_inode-in-tree-log.patch
+btrfs-fixup-error-handling-in-fixup_inode_link_counts.patch
+btrfs-abort-in-rename_exchange-if-we-fail-to-insert-the-second-ref.patch
+btrfs-fix-deadlock-when-cloning-inline-extents-and-low-on-available-space.patch
--- /dev/null
+From 7d65f9e80646c595e8c853640a9d0768a33e204c Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Tue, 25 May 2021 13:08:41 +0200
+Subject: x86/apic: Mark _all_ legacy interrupts when IO/APIC is missing
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 7d65f9e80646c595e8c853640a9d0768a33e204c upstream.
+
+PIC interrupts do not support affinity setting and they can end up on
+any online CPU. Therefore, it's required to mark the associated vectors
+as system-wide reserved. Otherwise, the corresponding irq descriptors
+are copied to the secondary CPUs but the vectors are not marked as
+assigned or reserved. This works correctly for the IO/APIC case.
+
+When the IO/APIC is disabled via config, kernel command line or lack of
+enumeration then all legacy interrupts are routed through the PIC, but
+nothing marks them as system-wide reserved vectors.
+
+As a consequence, a subsequent allocation on a secondary CPU can result in
+allocating one of these vectors, which triggers the BUG() in
+apic_update_vector() because the interrupt descriptor slot is not empty.
+
+Imran tried to work around that by marking those interrupts as allocated
+when a CPU comes online. But that's wrong in case that the IO/APIC is
+available and one of the legacy interrupts, e.g. IRQ0, has been switched to
+PIC mode because then marking them as allocated will fail as they are
+already marked as system vectors.
+
+Stay consistent and update the legacy vectors after attempting IO/APIC
+initialization and mark them as system vectors in case that no IO/APIC is
+available.
+
+Fixes: 69cde0004a4b ("x86/vector: Use matrix allocator for vector assignment")
+Reported-by: Imran Khan <imran.f.khan@oracle.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Cc: stable@vger.kernel.org
+Link: https://lkml.kernel.org/r/20210519233928.2157496-1-imran.f.khan@oracle.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/apic.h | 1 +
+ arch/x86/kernel/apic/apic.c | 1 +
+ arch/x86/kernel/apic/vector.c | 20 ++++++++++++++++++++
+ 3 files changed, 22 insertions(+)
+
+--- a/arch/x86/include/asm/apic.h
++++ b/arch/x86/include/asm/apic.h
+@@ -174,6 +174,7 @@ static inline int apic_is_clustered_box(
+ extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask);
+ extern void lapic_assign_system_vectors(void);
+ extern void lapic_assign_legacy_vector(unsigned int isairq, bool replace);
++extern void lapic_update_legacy_vectors(void);
+ extern void lapic_online(void);
+ extern void lapic_offline(void);
+ extern bool apic_needs_pit(void);
+--- a/arch/x86/kernel/apic/apic.c
++++ b/arch/x86/kernel/apic/apic.c
+@@ -2604,6 +2604,7 @@ static void __init apic_bsp_setup(bool u
+ end_local_APIC_setup();
+ irq_remap_enable_fault_handling();
+ setup_IO_APIC();
++ lapic_update_legacy_vectors();
+ }
+
+ #ifdef CONFIG_UP_LATE_INIT
+--- a/arch/x86/kernel/apic/vector.c
++++ b/arch/x86/kernel/apic/vector.c
+@@ -730,6 +730,26 @@ void lapic_assign_legacy_vector(unsigned
+ irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace);
+ }
+
++void __init lapic_update_legacy_vectors(void)
++{
++ unsigned int i;
++
++ if (IS_ENABLED(CONFIG_X86_IO_APIC) && nr_ioapics > 0)
++ return;
++
++ /*
++ * If the IO/APIC is disabled via config, kernel command line or
++ * lack of enumeration then all legacy interrupts are routed
++ * through the PIC. Make sure that they are marked as legacy
++ * vectors. PIC_CASCADE_IR has already been marked in
++ * lapic_assign_system_vectors().
++ */
++ for (i = 0; i < nr_legacy_irqs(); i++) {
++ if (i != PIC_CASCADE_IR)
++ lapic_assign_legacy_vector(i, true);
++ }
++}
++
+ void __init lapic_assign_system_vectors(void)
+ {
+ unsigned int i, vector = 0;
--- /dev/null
+From 9bfecd05833918526cc7357d55e393393440c5fa Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Sat, 29 May 2021 11:17:30 +0200
+Subject: x86/cpufeatures: Force disable X86_FEATURE_ENQCMD and remove update_pasid()
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 9bfecd05833918526cc7357d55e393393440c5fa upstream.
+
+While digesting the XSAVE-related horrors which got introduced with
+the supervisor/user split, the recent addition of ENQCMD-related
+functionality got on the radar and turned out to be similarly broken.
+
+update_pasid(), which is only required when X86_FEATURE_ENQCMD is
+available, is invoked from two places:
+
+ 1) From switch_to() for the incoming task
+
+ 2) Via a SMP function call from the IOMMU/SVM code
+
+#1 is half-ways correct as it hacks around the brokenness of get_xsave_addr()
+ by enforcing the state to be 'present', but all the conditionals in that
+ code are completely pointless for that.
+
+ Also the invocation is just useless overhead because at that point
+ it's guaranteed that TIF_NEED_FPU_LOAD is set on the incoming task
+ and all of this can be handled at return to user space.
+
+#2 is broken beyond repair. The comment in the code claims that it is safe
+ to invoke this in an IPI, but that's just wishful thinking.
+
+ FPU state of a running task is protected by fregs_lock() which is
+ nothing else than a local_bh_disable(). As BH-disabled regions run
+ usually with interrupts enabled the IPI can hit a code section which
+ modifies FPU state and there is absolutely no guarantee that any of the
+ assumptions which are made for the IPI case is true.
+
+ Also the IPI is sent to all CPUs in mm_cpumask(mm), but the IPI is
+ invoked with a NULL pointer argument, so it can hit a completely
+ unrelated task and unconditionally force an update for nothing.
+ Worse, it can hit a kernel thread which operates on a user space
+ address space and set a random PASID for it.
+
+The offending commit does not cleanly revert, but it's sufficient to
+force disable X86_FEATURE_ENQCMD and to remove the broken update_pasid()
+code to make this dysfunctional all over the place. Anything more
+complex would require more surgery and none of the related functions
+outside of the x86 core code are blatantly wrong, so removing those
+would be overkill.
+
+As nothing enables the PASID bit in the IA32_XSS MSR yet, which is
+required to make this actually work, this cannot result in a regression
+except for related out of tree train-wrecks, but they are broken already
+today.
+
+Fixes: 20f0afd1fb3d ("x86/mmu: Allocate/free a PASID")
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Acked-by: Andy Lutomirski <luto@kernel.org>
+Cc: stable@vger.kernel.org
+Link: https://lkml.kernel.org/r/87mtsd6gr9.ffs@nanos.tec.linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/disabled-features.h | 7 +--
+ arch/x86/include/asm/fpu/api.h | 6 ---
+ arch/x86/include/asm/fpu/internal.h | 7 ---
+ arch/x86/kernel/fpu/xstate.c | 57 -------------------------------
+ 4 files changed, 3 insertions(+), 74 deletions(-)
+
+--- a/arch/x86/include/asm/disabled-features.h
++++ b/arch/x86/include/asm/disabled-features.h
+@@ -56,11 +56,8 @@
+ # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
+ #endif
+
+-#ifdef CONFIG_IOMMU_SUPPORT
+-# define DISABLE_ENQCMD 0
+-#else
+-# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31))
+-#endif
++/* Force disable because it's broken beyond repair */
++#define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31))
+
+ #ifdef CONFIG_X86_SGX
+ # define DISABLE_SGX 0
+--- a/arch/x86/include/asm/fpu/api.h
++++ b/arch/x86/include/asm/fpu/api.h
+@@ -106,10 +106,6 @@ extern int cpu_has_xfeatures(u64 xfeatur
+ */
+ #define PASID_DISABLED 0
+
+-#ifdef CONFIG_IOMMU_SUPPORT
+-/* Update current's PASID MSR/state by mm's PASID. */
+-void update_pasid(void);
+-#else
+ static inline void update_pasid(void) { }
+-#endif
++
+ #endif /* _ASM_X86_FPU_API_H */
+--- a/arch/x86/include/asm/fpu/internal.h
++++ b/arch/x86/include/asm/fpu/internal.h
+@@ -584,13 +584,6 @@ static inline void switch_fpu_finish(str
+ pkru_val = pk->pkru;
+ }
+ __write_pkru(pkru_val);
+-
+- /*
+- * Expensive PASID MSR write will be avoided in update_pasid() because
+- * TIF_NEED_FPU_LOAD was set. And the PASID state won't be updated
+- * unless it's different from mm->pasid to reduce overhead.
+- */
+- update_pasid();
+ }
+
+ #endif /* _ASM_X86_FPU_INTERNAL_H */
+--- a/arch/x86/kernel/fpu/xstate.c
++++ b/arch/x86/kernel/fpu/xstate.c
+@@ -1402,60 +1402,3 @@ int proc_pid_arch_status(struct seq_file
+ return 0;
+ }
+ #endif /* CONFIG_PROC_PID_ARCH_STATUS */
+-
+-#ifdef CONFIG_IOMMU_SUPPORT
+-void update_pasid(void)
+-{
+- u64 pasid_state;
+- u32 pasid;
+-
+- if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
+- return;
+-
+- if (!current->mm)
+- return;
+-
+- pasid = READ_ONCE(current->mm->pasid);
+- /* Set the valid bit in the PASID MSR/state only for valid pasid. */
+- pasid_state = pasid == PASID_DISABLED ?
+- pasid : pasid | MSR_IA32_PASID_VALID;
+-
+- /*
+- * No need to hold fregs_lock() since the task's fpstate won't
+- * be changed by others (e.g. ptrace) while the task is being
+- * switched to or is in IPI.
+- */
+- if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
+- /* The MSR is active and can be directly updated. */
+- wrmsrl(MSR_IA32_PASID, pasid_state);
+- } else {
+- struct fpu *fpu = &current->thread.fpu;
+- struct ia32_pasid_state *ppasid_state;
+- struct xregs_state *xsave;
+-
+- /*
+- * The CPU's xstate registers are not currently active. Just
+- * update the PASID state in the memory buffer here. The
+- * PASID MSR will be loaded when returning to user mode.
+- */
+- xsave = &fpu->state.xsave;
+- xsave->header.xfeatures |= XFEATURE_MASK_PASID;
+- ppasid_state = get_xsave_addr(xsave, XFEATURE_PASID);
+- /*
+- * Since XFEATURE_MASK_PASID is set in xfeatures, ppasid_state
+- * won't be NULL and no need to check its value.
+- *
+- * Only update the task's PASID state when it's different
+- * from the mm's pasid.
+- */
+- if (ppasid_state->pasid != pasid_state) {
+- /*
+- * Invalid fpregs so that state restoring will pick up
+- * the PASID state.
+- */
+- __fpu_invalidate_fpregs_state(fpu);
+- ppasid_state->pasid = pasid_state;
+- }
+- }
+-}
+-#endif /* CONFIG_IOMMU_SUPPORT */
--- /dev/null
+From 009767dbf42ac0dbe3cf48c1ee224f6b778aa85a Mon Sep 17 00:00:00 2001
+From: Pu Wen <puwen@hygon.cn>
+Date: Wed, 2 Jun 2021 15:02:07 +0800
+Subject: x86/sev: Check SME/SEV support in CPUID first
+
+From: Pu Wen <puwen@hygon.cn>
+
+commit 009767dbf42ac0dbe3cf48c1ee224f6b778aa85a upstream.
+
+The first two bits of the CPUID leaf 0x8000001F EAX indicate whether SEV
+or SME is supported, respectively. It's better to check whether SEV or
+SME is actually supported before accessing the MSR_AMD64_SEV to check
+whether SEV or SME is enabled.
+
+This is both a bare-metal issue and a guest/VM issue. Since the first
+generation Hygon Dhyana CPU doesn't support the MSR_AMD64_SEV, reading that
+MSR results in a #GP - either directly from hardware in the bare-metal
+case or via the hypervisor (because the RDMSR is actually intercepted)
+in the guest/VM case, resulting in a failed boot. And since this is very
+early in the boot phase, rdmsrl_safe()/native_read_msr_safe() can't be
+used.
+
+So check the CPUID bits first, before accessing the MSR.
+
+ [ tlendacky: Expand and improve commit message. ]
+ [ bp: Massage commit message. ]
+
+Fixes: eab696d8e8b9 ("x86/sev: Do not require Hypervisor CPUID bit for SEV guests")
+Signed-off-by: Pu Wen <puwen@hygon.cn>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Acked-by: Tom Lendacky <thomas.lendacky@amd.com>
+Cc: <stable@vger.kernel.org> # v5.10+
+Link: https://lkml.kernel.org/r/20210602070207.2480-1-puwen@hygon.cn
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/mm/mem_encrypt_identity.c | 11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/mm/mem_encrypt_identity.c
++++ b/arch/x86/mm/mem_encrypt_identity.c
+@@ -504,10 +504,6 @@ void __init sme_enable(struct boot_param
+ #define AMD_SME_BIT BIT(0)
+ #define AMD_SEV_BIT BIT(1)
+
+- /* Check the SEV MSR whether SEV or SME is enabled */
+- sev_status = __rdmsr(MSR_AMD64_SEV);
+- feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
+-
+ /*
+ * Check for the SME/SEV feature:
+ * CPUID Fn8000_001F[EAX]
+@@ -519,11 +515,16 @@ void __init sme_enable(struct boot_param
+ eax = 0x8000001f;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+- if (!(eax & feature_mask))
++ /* Check whether SEV or SME is supported */
++ if (!(eax & (AMD_SEV_BIT | AMD_SME_BIT)))
+ return;
+
+ me_mask = 1UL << (ebx & 0x3f);
+
++ /* Check the SEV MSR whether SEV or SME is enabled */
++ sev_status = __rdmsr(MSR_AMD64_SEV);
++ feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
++
+ /* Check if memory encryption is enabled */
+ if (feature_mask == AMD_SME_BIT) {
+ /*
--- /dev/null
+From 9a90ed065a155d13db0d0ffeaad5cc54e51c90c6 Mon Sep 17 00:00:00 2001
+From: Borislav Petkov <bp@suse.de>
+Date: Thu, 27 May 2021 11:02:26 +0200
+Subject: x86/thermal: Fix LVT thermal setup for SMI delivery mode
+
+From: Borislav Petkov <bp@suse.de>
+
+commit 9a90ed065a155d13db0d0ffeaad5cc54e51c90c6 upstream.
+
+There are machines out there with added value crap^WBIOS which provide an
+SMI handler for the local APIC thermal sensor interrupt. Out of reset,
+the BSP on those machines has something like 0x200 in that APIC register
+(timestamps left in because this whole issue is timing sensitive):
+
+ [ 0.033858] read lvtthmr: 0x330, val: 0x200
+
+which means:
+
+ - bit 16 - the interrupt mask bit is clear and thus that interrupt is enabled
+ - bits [10:8] have 010b which means SMI delivery mode.
+
+Now, later during boot, when the kernel programs the local APIC, it
+soft-disables it temporarily through the spurious vector register:
+
+ setup_local_APIC:
+
+ ...
+
+ /*
+ * If this comes from kexec/kcrash the APIC might be enabled in
+ * SPIV. Soft disable it before doing further initialization.
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_SPIV_APIC_ENABLED;
+ apic_write(APIC_SPIV, value);
+
+which means (from the SDM):
+
+"10.4.7.2 Local APIC State After It Has Been Software Disabled
+
+...
+
+* The mask bits for all the LVT entries are set. Attempts to reset these
+bits will be ignored."
+
+And this happens too:
+
+ [ 0.124111] APIC: Switch to symmetric I/O mode setup
+ [ 0.124117] lvtthmr 0x200 before write 0xf to APIC 0xf0
+ [ 0.124118] lvtthmr 0x10200 after write 0xf to APIC 0xf0
+
+This results in CPU 0 soft lockups depending on the placement in time
+when the APIC soft-disable happens. Those soft lockups are not 100%
+reproducible and the reason for that can only be speculated as no one
+tells you what SMM does. Likely, it confuses the SMM code that the APIC
+is disabled and the thermal interrupt doesn't fire at all,
+leading to CPU 0 stuck in SMM forever...
+
+Now, before
+
+ 4f432e8bb15b ("x86/mce: Get rid of mcheck_intel_therm_init()")
+
+due to how the APIC_LVTTHMR was read before APIC initialization in
+mcheck_intel_therm_init(), it would read the value with the mask bit 16
+clear and then intel_init_thermal() would replicate it onto the APs and
+all would be peachy - the thermal interrupt would remain enabled.
+
+But that commit moved that reading to a later moment in
+intel_init_thermal(), resulting in reading APIC_LVTTHMR on the BSP too
+late and with its interrupt mask bit set.
+
+Thus, revert back to the old behavior of reading the thermal LVT
+register before the APIC gets initialized.
+
+Fixes: 4f432e8bb15b ("x86/mce: Get rid of mcheck_intel_therm_init()")
+Reported-by: James Feeney <james@nurealm.net>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Cc: <stable@vger.kernel.org>
+Cc: Zhang Rui <rui.zhang@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Link: https://lkml.kernel.org/r/YKIqDdFNaXYd39wz@zn.tnic
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/thermal.h | 4 +++-
+ arch/x86/kernel/setup.c | 9 +++++++++
+ drivers/thermal/intel/therm_throt.c | 15 +++++++++++----
+ 3 files changed, 23 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/include/asm/thermal.h
++++ b/arch/x86/include/asm/thermal.h
+@@ -3,11 +3,13 @@
+ #define _ASM_X86_THERMAL_H
+
+ #ifdef CONFIG_X86_THERMAL_VECTOR
++void therm_lvt_init(void);
+ void intel_init_thermal(struct cpuinfo_x86 *c);
+ bool x86_thermal_enabled(void);
+ void intel_thermal_interrupt(void);
+ #else
+-static inline void intel_init_thermal(struct cpuinfo_x86 *c) { }
++static inline void therm_lvt_init(void) { }
++static inline void intel_init_thermal(struct cpuinfo_x86 *c) { }
+ #endif
+
+ #endif /* _ASM_X86_THERMAL_H */
+--- a/arch/x86/kernel/setup.c
++++ b/arch/x86/kernel/setup.c
+@@ -44,6 +44,7 @@
+ #include <asm/pci-direct.h>
+ #include <asm/prom.h>
+ #include <asm/proto.h>
++#include <asm/thermal.h>
+ #include <asm/unwind.h>
+ #include <asm/vsyscall.h>
+ #include <linux/vmalloc.h>
+@@ -1220,6 +1221,14 @@ void __init setup_arch(char **cmdline_p)
+
+ x86_init.timers.wallclock_init();
+
++ /*
++ * This needs to run before setup_local_APIC() which soft-disables the
++ * local APIC temporarily and that masks the thermal LVT interrupt,
++ * leading to softlockups on machines which have configured SMI
++ * interrupt delivery.
++ */
++ therm_lvt_init();
++
+ mcheck_init();
+
+ register_refined_jiffies(CLOCK_TICK_RATE);
+--- a/drivers/thermal/intel/therm_throt.c
++++ b/drivers/thermal/intel/therm_throt.c
+@@ -621,6 +621,17 @@ bool x86_thermal_enabled(void)
+ return atomic_read(&therm_throt_en);
+ }
+
++void __init therm_lvt_init(void)
++{
++ /*
++ * This function is only called on boot CPU. Save the init thermal
++ * LVT value on BSP and use that value to restore APs' thermal LVT
++ * entry BIOS programmed later
++ */
++ if (intel_thermal_supported(&boot_cpu_data))
++ lvtthmr_init = apic_read(APIC_LVTTHMR);
++}
++
+ void intel_init_thermal(struct cpuinfo_x86 *c)
+ {
+ unsigned int cpu = smp_processor_id();
+@@ -630,10 +641,6 @@ void intel_init_thermal(struct cpuinfo_x
+ if (!intel_thermal_supported(c))
+ return;
+
+- /* On the BSP? */
+- if (c == &boot_cpu_data)
+- lvtthmr_init = apic_read(APIC_LVTTHMR);
+-
+ /*
+ * First check if its enabled already, in which case there might
+ * be some SMM goo which handles it, so we can't even put a handler