git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.4-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 27 Feb 2020 09:23:41 +0000 (10:23 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 27 Feb 2020 09:23:41 +0000 (10:23 +0100)
added patches:
btrfs-destroy-qgroup-extent-records-on-transaction-abort.patch
btrfs-do-not-check-delayed-items-are-empty-for-single-transaction-cleanup.patch
btrfs-don-t-set-path-leave_spinning-for-truncate.patch
btrfs-fix-btrfs_wait_ordered_range-so-that-it-waits-for-all-ordered-extents.patch
btrfs-fix-bytes_may_use-underflow-in-prealloc-error-condtition.patch
btrfs-fix-deadlock-during-fast-fsync-when-logging-prealloc-extents-beyond-eof.patch
btrfs-fix-race-between-shrinking-truncate-and-fiemap.patch
btrfs-reset-fs_root-to-null-on-error-in-open_ctree.patch
kvm-apic-avoid-calculating-pending-eoi-from-an-uninitialized-val.patch
kvm-nvmx-clear-pin_based_posted_intr-from-nested-pinbased_ctls-only-when-apicv-is-globally-disabled.patch
kvm-nvmx-handle-nested-posted-interrupts-when-apicv-is-disabled-for-l1.patch

12 files changed:
queue-5.4/btrfs-destroy-qgroup-extent-records-on-transaction-abort.patch [new file with mode: 0644]
queue-5.4/btrfs-do-not-check-delayed-items-are-empty-for-single-transaction-cleanup.patch [new file with mode: 0644]
queue-5.4/btrfs-don-t-set-path-leave_spinning-for-truncate.patch [new file with mode: 0644]
queue-5.4/btrfs-fix-btrfs_wait_ordered_range-so-that-it-waits-for-all-ordered-extents.patch [new file with mode: 0644]
queue-5.4/btrfs-fix-bytes_may_use-underflow-in-prealloc-error-condtition.patch [new file with mode: 0644]
queue-5.4/btrfs-fix-deadlock-during-fast-fsync-when-logging-prealloc-extents-beyond-eof.patch [new file with mode: 0644]
queue-5.4/btrfs-fix-race-between-shrinking-truncate-and-fiemap.patch [new file with mode: 0644]
queue-5.4/btrfs-reset-fs_root-to-null-on-error-in-open_ctree.patch [new file with mode: 0644]
queue-5.4/kvm-apic-avoid-calculating-pending-eoi-from-an-uninitialized-val.patch [new file with mode: 0644]
queue-5.4/kvm-nvmx-clear-pin_based_posted_intr-from-nested-pinbased_ctls-only-when-apicv-is-globally-disabled.patch [new file with mode: 0644]
queue-5.4/kvm-nvmx-handle-nested-posted-interrupts-when-apicv-is-disabled-for-l1.patch [new file with mode: 0644]
queue-5.4/series

diff --git a/queue-5.4/btrfs-destroy-qgroup-extent-records-on-transaction-abort.patch b/queue-5.4/btrfs-destroy-qgroup-extent-records-on-transaction-abort.patch
new file mode 100644 (file)
index 0000000..d2ef8ff
--- /dev/null
@@ -0,0 +1,82 @@
+From 81f7eb00ff5bb8326e82503a32809421d14abb8a Mon Sep 17 00:00:00 2001
+From: Jeff Mahoney <jeffm@suse.com>
+Date: Tue, 11 Feb 2020 15:25:37 +0800
+Subject: btrfs: destroy qgroup extent records on transaction abort
+
+From: Jeff Mahoney <jeffm@suse.com>
+
+commit 81f7eb00ff5bb8326e82503a32809421d14abb8a upstream.
+
+We clean up the delayed references when we abort a transaction but we
+leave the pending qgroup extent records behind, leaking memory.
+
+This patch destroys the extent records when we destroy the delayed refs
+and makes sure they're gone before releasing the transaction.
+
+Fixes: 3368d001ba5d ("btrfs: qgroup: Record possible quota-related extent for qgroup.")
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Jeff Mahoney <jeffm@suse.com>
+[ Rebased to latest upstream, remove to_qgroup() helper, use
+  rbtree_postorder_for_each_entry_safe() wrapper ]
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/disk-io.c     |    1 +
+ fs/btrfs/qgroup.c      |   13 +++++++++++++
+ fs/btrfs/qgroup.h      |    1 +
+ fs/btrfs/transaction.c |    2 ++
+ 4 files changed, 17 insertions(+)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -4293,6 +4293,7 @@ static int btrfs_destroy_delayed_refs(st
+               cond_resched();
+               spin_lock(&delayed_refs->lock);
+       }
++      btrfs_qgroup_destroy_extent_records(trans);
+       spin_unlock(&delayed_refs->lock);
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -4018,3 +4018,16 @@ out:
+       }
+       return ret;
+ }
++
++void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
++{
++      struct btrfs_qgroup_extent_record *entry;
++      struct btrfs_qgroup_extent_record *next;
++      struct rb_root *root;
++
++      root = &trans->delayed_refs.dirty_extent_root;
++      rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
++              ulist_free(entry->old_roots);
++              kfree(entry);
++      }
++}
+--- a/fs/btrfs/qgroup.h
++++ b/fs/btrfs/qgroup.h
+@@ -414,5 +414,6 @@ int btrfs_qgroup_add_swapped_blocks(stru
+               u64 last_snapshot);
+ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+               struct btrfs_root *root, struct extent_buffer *eb);
++void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
+ #endif
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -51,6 +51,8 @@ void btrfs_put_transaction(struct btrfs_
+               BUG_ON(!list_empty(&transaction->list));
+               WARN_ON(!RB_EMPTY_ROOT(
+                               &transaction->delayed_refs.href_root.rb_root));
++              WARN_ON(!RB_EMPTY_ROOT(
++                              &transaction->delayed_refs.dirty_extent_root));
+               if (transaction->delayed_refs.pending_csums)
+                       btrfs_err(transaction->fs_info,
+                                 "pending csums is %llu",
diff --git a/queue-5.4/btrfs-do-not-check-delayed-items-are-empty-for-single-transaction-cleanup.patch b/queue-5.4/btrfs-do-not-check-delayed-items-are-empty-for-single-transaction-cleanup.patch
new file mode 100644 (file)
index 0000000..00ab344
--- /dev/null
@@ -0,0 +1,41 @@
+From 1e90315149f3fe148e114a5de86f0196d1c21fa5 Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Thu, 13 Feb 2020 10:47:29 -0500
+Subject: btrfs: do not check delayed items are empty for single transaction cleanup
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit 1e90315149f3fe148e114a5de86f0196d1c21fa5 upstream.
+
+btrfs_assert_delayed_root_empty() will check if the delayed root is
+completely empty, but this is a filesystem-wide check.  On cleanup we
+may have allowed other transactions to begin, for whatever reason, and
+thus the delayed root is not empty.
+
+So remove this check from cleanup_one_transaction().  This however can
+stay in btrfs_cleanup_transaction(), because it checks only after all of
+the transactions have been properly cleaned up, and thus is valid.
+
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Nikolay Borisov <nborisov@suse.com>
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/disk-io.c |    1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -4520,7 +4520,6 @@ void btrfs_cleanup_one_transaction(struc
+       wake_up(&fs_info->transaction_wait);
+       btrfs_destroy_delayed_inodes(fs_info);
+-      btrfs_assert_delayed_root_empty(fs_info);
+       btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
+                                    EXTENT_DIRTY);
diff --git a/queue-5.4/btrfs-don-t-set-path-leave_spinning-for-truncate.patch b/queue-5.4/btrfs-don-t-set-path-leave_spinning-for-truncate.patch
new file mode 100644 (file)
index 0000000..878c0e1
--- /dev/null
@@ -0,0 +1,79 @@
+From 52e29e331070cd7d52a64cbf1b0958212a340e28 Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Fri, 17 Jan 2020 09:02:20 -0500
+Subject: btrfs: don't set path->leave_spinning for truncate
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit 52e29e331070cd7d52a64cbf1b0958212a340e28 upstream.
+
+The only time we actually leave the path spinning is if we're truncating
+a small amount and don't actually free an extent, which is not a common
+occurrence.  We have to set the path blocking in order to add the
+delayed ref anyway, so the first extent we find we set the path to
+blocking and stay blocking for the duration of the operation.  With the
+upcoming file extent map stuff there will be another case that we have
+to have the path blocking, so just swap to blocking always.
+
+Note: this patch also fixes a warning after 28553fa992cb ("Btrfs: fix
+race between shrinking truncate and fiemap") got merged that inserts
+extent locks around truncation so the path must not leave spinning locks
+after btrfs_search_slot.
+
+  [70.794783] BUG: sleeping function called from invalid context at mm/slab.h:565
+  [70.794834] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1141, name: rsync
+  [70.794863] 5 locks held by rsync/1141:
+  [70.794876]  #0: ffff888417b9c408 (sb_writers#17){.+.+}, at: mnt_want_write+0x20/0x50
+  [70.795030]  #1: ffff888428de28e8 (&type->i_mutex_dir_key#13/1){+.+.}, at: lock_rename+0xf1/0x100
+  [70.795051]  #2: ffff888417b9c608 (sb_internal#2){.+.+}, at: start_transaction+0x394/0x560
+  [70.795124]  #3: ffff888403081768 (btrfs-fs-01){++++}, at: btrfs_try_tree_write_lock+0x2f/0x160
+  [70.795203]  #4: ffff888403086568 (btrfs-fs-00){++++}, at: btrfs_try_tree_write_lock+0x2f/0x160
+  [70.795222] CPU: 5 PID: 1141 Comm: rsync Not tainted 5.6.0-rc2-backup+ #2
+  [70.795362] Call Trace:
+  [70.795374]  dump_stack+0x71/0xa0
+  [70.795445]  ___might_sleep.part.96.cold.106+0xa6/0xb6
+  [70.795459]  kmem_cache_alloc+0x1d3/0x290
+  [70.795471]  alloc_extent_state+0x22/0x1c0
+  [70.795544]  __clear_extent_bit+0x3ba/0x580
+  [70.795557]  ? _raw_spin_unlock_irq+0x24/0x30
+  [70.795569]  btrfs_truncate_inode_items+0x339/0xe50
+  [70.795647]  btrfs_evict_inode+0x269/0x540
+  [70.795659]  ? dput.part.38+0x29/0x460
+  [70.795671]  evict+0xcd/0x190
+  [70.795682]  __dentry_kill+0xd6/0x180
+  [70.795754]  dput.part.38+0x2ad/0x460
+  [70.795765]  do_renameat2+0x3cb/0x540
+  [70.795777]  __x64_sys_rename+0x1c/0x20
+
+Reported-by: Dave Jones <davej@codemonkey.org.uk>
+Fixes: 28553fa992cb ("Btrfs: fix race between shrinking truncate and fiemap")
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+[ add note ]
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c |    2 --
+ 1 file changed, 2 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -4791,7 +4791,6 @@ search_again:
+               goto out;
+       }
+-      path->leave_spinning = 1;
+       ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+       if (ret < 0)
+               goto out;
+@@ -4943,7 +4942,6 @@ delete:
+                    root == fs_info->tree_root)) {
+                       struct btrfs_ref ref = { 0 };
+-                      btrfs_set_path_blocking(path);
+                       bytes_deleted += extent_num_bytes;
+                       btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
diff --git a/queue-5.4/btrfs-fix-btrfs_wait_ordered_range-so-that-it-waits-for-all-ordered-extents.patch b/queue-5.4/btrfs-fix-btrfs_wait_ordered_range-so-that-it-waits-for-all-ordered-extents.patch
new file mode 100644 (file)
index 0000000..ec8fd6c
--- /dev/null
@@ -0,0 +1,59 @@
+From e75fd33b3f744f644061a4f9662bd63f5434f806 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 13 Feb 2020 12:29:50 +0000
+Subject: Btrfs: fix btrfs_wait_ordered_range() so that it waits for all ordered extents
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit e75fd33b3f744f644061a4f9662bd63f5434f806 upstream.
+
+In btrfs_wait_ordered_range() once we find an ordered extent that has
+finished with an error we exit the loop and don't wait for any other
+ordered extents that might be still in progress.
+
+All the users of btrfs_wait_ordered_range() expect that there are no more
+ordered extents in progress after that function returns. So past fixes
+such as the ones from the two following commits:
+
+  ff612ba7849964 ("btrfs: fix panic during relocation after ENOSPC before
+                   writeback happens")
+
+  28aeeac1dd3080 ("Btrfs: fix panic when starting bg cache writeout after
+                   IO error")
+
+don't work when there are multiple ordered extents in the range.
+
+Fix that by making btrfs_wait_ordered_range() wait for all ordered extents
+even after it finds one that had an error.
+
+Link: https://github.com/kdave/btrfs-progs/issues/228#issuecomment-569777554
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ordered-data.c |    7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/ordered-data.c
++++ b/fs/btrfs/ordered-data.c
+@@ -690,10 +690,15 @@ int btrfs_wait_ordered_range(struct inod
+               }
+               btrfs_start_ordered_extent(inode, ordered, 1);
+               end = ordered->file_offset;
++              /*
++               * If the ordered extent had an error save the error but don't
++               * exit without waiting first for all other ordered extents in
++               * the range to complete.
++               */
+               if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+                       ret = -EIO;
+               btrfs_put_ordered_extent(ordered);
+-              if (ret || end == 0 || end == start)
++              if (end == 0 || end == start)
+                       break;
+               end--;
+       }
diff --git a/queue-5.4/btrfs-fix-bytes_may_use-underflow-in-prealloc-error-condtition.patch b/queue-5.4/btrfs-fix-bytes_may_use-underflow-in-prealloc-error-condtition.patch
new file mode 100644 (file)
index 0000000..a17e8d0
--- /dev/null
@@ -0,0 +1,96 @@
+From b778cf962d71a0e737923d55d0432f3bd287258e Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Thu, 13 Feb 2020 10:47:31 -0500
+Subject: btrfs: fix bytes_may_use underflow in prealloc error condtition
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit b778cf962d71a0e737923d55d0432f3bd287258e upstream.
+
+I hit the following warning while running my error injection stress
+testing:
+
+  WARNING: CPU: 3 PID: 1453 at fs/btrfs/space-info.h:108 btrfs_free_reserved_data_space_noquota+0xfd/0x160 [btrfs]
+  RIP: 0010:btrfs_free_reserved_data_space_noquota+0xfd/0x160 [btrfs]
+  Call Trace:
+  btrfs_free_reserved_data_space+0x4f/0x70 [btrfs]
+  __btrfs_prealloc_file_range+0x378/0x470 [btrfs]
+  elfcorehdr_read+0x40/0x40
+  ? elfcorehdr_read+0x40/0x40
+  ? btrfs_commit_transaction+0xca/0xa50 [btrfs]
+  ? dput+0xb4/0x2a0
+  ? btrfs_log_dentry_safe+0x55/0x70 [btrfs]
+  ? btrfs_sync_file+0x30e/0x420 [btrfs]
+  ? do_fsync+0x38/0x70
+  ? __x64_sys_fdatasync+0x13/0x20
+  ? do_syscall_64+0x5b/0x1b0
+  ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+This happens if we fail to insert our reserved file extent.  At this
+point we've already converted our reservation from ->bytes_may_use to
+->bytes_reserved.  However once we break we will attempt to free
+everything from [cur_offset, end] from ->bytes_may_use, but our extent
+reservation will overlap part of this.
+
+Fix this problem by adding ins.offset (our extent allocation size) to
+cur_offset so we remove the actual remaining part from ->bytes_may_use.
+
+I validated this fix using my inject-error.py script
+
+python inject-error.py -o should_fail_bio -t cache_save_setup -t \
+       __btrfs_prealloc_file_range \
+       -t insert_reserved_file_extent.constprop.0 \
+       -r "-5" ./run-fsstress.sh
+
+where run-fsstress.sh simply mounts and runs fsstress on a disk.
+
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c |   16 +++++++++++++---
+ 1 file changed, 13 insertions(+), 3 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -10464,6 +10464,7 @@ static int __btrfs_prealloc_file_range(s
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_key ins;
+       u64 cur_offset = start;
++      u64 clear_offset = start;
+       u64 i_size;
+       u64 cur_bytes;
+       u64 last_alloc = (u64)-1;
+@@ -10498,6 +10499,15 @@ static int __btrfs_prealloc_file_range(s
+                               btrfs_end_transaction(trans);
+                       break;
+               }
++
++              /*
++               * We've reserved this space, and thus converted it from
++               * ->bytes_may_use to ->bytes_reserved.  Any error that happens
++               * from here on out we will only need to clear our reservation
++               * for the remaining unreserved area, so advance our
++               * clear_offset by our extent size.
++               */
++              clear_offset += ins.offset;
+               btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+               last_alloc = ins.offset;
+@@ -10578,9 +10588,9 @@ next:
+               if (own_trans)
+                       btrfs_end_transaction(trans);
+       }
+-      if (cur_offset < end)
+-              btrfs_free_reserved_data_space(inode, NULL, cur_offset,
+-                      end - cur_offset + 1);
++      if (clear_offset < end)
++              btrfs_free_reserved_data_space(inode, NULL, clear_offset,
++                      end - clear_offset + 1);
+       return ret;
+ }
diff --git a/queue-5.4/btrfs-fix-deadlock-during-fast-fsync-when-logging-prealloc-extents-beyond-eof.patch b/queue-5.4/btrfs-fix-deadlock-during-fast-fsync-when-logging-prealloc-extents-beyond-eof.patch
new file mode 100644 (file)
index 0000000..42d2e56
--- /dev/null
@@ -0,0 +1,214 @@
+From a5ae50dea9111db63d30d700766dd5509602f7ad Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 20 Feb 2020 13:29:49 +0000
+Subject: Btrfs: fix deadlock during fast fsync when logging prealloc extents beyond eof
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit a5ae50dea9111db63d30d700766dd5509602f7ad upstream.
+
+While logging the prealloc extents of an inode during a fast fsync we call
+btrfs_truncate_inode_items(), through btrfs_log_prealloc_extents(), while
+holding a read lock on a leaf of the inode's root (not the log root, the
+fs/subvol root), and then that function locks the file range in the inode's
+iotree. This can lead to a deadlock when:
+
+* the fsync is ranged
+
+* the file has prealloc extents beyond eof
+
+* writeback for a range different from the fsync range starts
+  during the fsync
+
+* the size of the file is not sector size aligned
+
+Because when finishing an ordered extent we lock first a file range and
+then try to COW the fs/subvol tree to insert an extent item.
+
+The following diagram shows how the deadlock can happen.
+
+           CPU 1                                        CPU 2
+
+  btrfs_sync_file()
+    --> for range [0, 1MiB)
+
+    --> inode has a size of
+        1MiB and has 1 prealloc
+        extent beyond the
+        i_size, starting at offset
+        4MiB
+
+    flushes all delalloc for the
+    range [0MiB, 1MiB) and waits
+    for the respective ordered
+    extents to complete
+
+                                              --> before task at CPU 1 locks the
+                                                  inode, a write into file range
+                                                  [1MiB, 2MiB + 1KiB) is made
+
+                                              --> i_size is updated to 2MiB + 1KiB
+
+                                              --> writeback is started for that
+                                                  range, [1MiB, 2MiB + 4KiB)
+                                                  --> end offset rounded up to
+                                                      be sector size aligned
+
+    btrfs_log_dentry_safe()
+      btrfs_log_inode_parent()
+        btrfs_log_inode()
+
+          btrfs_log_changed_extents()
+            btrfs_log_prealloc_extents()
+              --> does a search on the
+                  inode's root
+              --> holds a read lock on
+                  leaf X
+
+                                              btrfs_finish_ordered_io()
+                                                --> locks range [1MiB, 2MiB + 4KiB)
+                                                    --> end offset rounded up
+                                                        to be sector size aligned
+
+                                                --> tries to cow leaf X, through
+                                                    insert_reserved_file_extent()
+                                                    --> already locked by the
+                                                        task at CPU 1
+
+              btrfs_truncate_inode_items()
+
+                --> gets an i_size of
+                    2MiB + 1KiB, which is
+                    not sector size
+                    aligned
+
+                --> tries to lock file
+                    range [2MiB, (u64)-1)
+                    --> the start range
+                        is rounded down
+                        from 2MiB + 1K
+                        to 2MiB to be sector
+                        size aligned
+
+                    --> but the subrange
+                        [2MiB, 2MiB + 4KiB) is
+                        already locked by
+                        task at CPU 2 which
+                        is waiting to get a
+                        write lock on leaf X
+                        for which we are
+                        holding a read lock
+
+                                *** deadlock ***
+
+This results in a stack trace like the following, triggered by test case
+generic/561 from fstests:
+
+  [ 2779.973608] INFO: task kworker/u8:6:247 blocked for more than 120 seconds.
+  [ 2779.979536]       Not tainted 5.6.0-rc2-btrfs-next-53 #1
+  [ 2779.984503] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+  [ 2779.990136] kworker/u8:6    D    0   247      2 0x80004000
+  [ 2779.990457] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs]
+  [ 2779.990466] Call Trace:
+  [ 2779.990491]  ? __schedule+0x384/0xa30
+  [ 2779.990521]  schedule+0x33/0xe0
+  [ 2779.990616]  btrfs_tree_read_lock+0x19e/0x2e0 [btrfs]
+  [ 2779.990632]  ? remove_wait_queue+0x60/0x60
+  [ 2779.990730]  btrfs_read_lock_root_node+0x2f/0x40 [btrfs]
+  [ 2779.990782]  btrfs_search_slot+0x510/0x1000 [btrfs]
+  [ 2779.990869]  btrfs_lookup_file_extent+0x4a/0x70 [btrfs]
+  [ 2779.990944]  __btrfs_drop_extents+0x161/0x1060 [btrfs]
+  [ 2779.990987]  ? mark_held_locks+0x6d/0xc0
+  [ 2779.990994]  ? __slab_alloc.isra.49+0x99/0x100
+  [ 2779.991060]  ? insert_reserved_file_extent.constprop.19+0x64/0x300 [btrfs]
+  [ 2779.991145]  insert_reserved_file_extent.constprop.19+0x97/0x300 [btrfs]
+  [ 2779.991222]  ? start_transaction+0xdd/0x5c0 [btrfs]
+  [ 2779.991291]  btrfs_finish_ordered_io+0x4f4/0x840 [btrfs]
+  [ 2779.991405]  btrfs_work_helper+0xaa/0x720 [btrfs]
+  [ 2779.991432]  process_one_work+0x26d/0x6a0
+  [ 2779.991460]  worker_thread+0x4f/0x3e0
+  [ 2779.991481]  ? process_one_work+0x6a0/0x6a0
+  [ 2779.991489]  kthread+0x103/0x140
+  [ 2779.991499]  ? kthread_create_worker_on_cpu+0x70/0x70
+  [ 2779.991515]  ret_from_fork+0x3a/0x50
+  (...)
+  [ 2780.026211] INFO: task fsstress:17375 blocked for more than 120 seconds.
+  [ 2780.027480]       Not tainted 5.6.0-rc2-btrfs-next-53 #1
+  [ 2780.028482] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+  [ 2780.030035] fsstress        D    0 17375  17373 0x00004000
+  [ 2780.030038] Call Trace:
+  [ 2780.030044]  ? __schedule+0x384/0xa30
+  [ 2780.030052]  schedule+0x33/0xe0
+  [ 2780.030075]  lock_extent_bits+0x20c/0x320 [btrfs]
+  [ 2780.030094]  ? btrfs_truncate_inode_items+0xf4/0x1150 [btrfs]
+  [ 2780.030098]  ? rcu_read_lock_sched_held+0x59/0xa0
+  [ 2780.030102]  ? remove_wait_queue+0x60/0x60
+  [ 2780.030122]  btrfs_truncate_inode_items+0x133/0x1150 [btrfs]
+  [ 2780.030151]  ? btrfs_set_path_blocking+0xb2/0x160 [btrfs]
+  [ 2780.030165]  ? btrfs_search_slot+0x379/0x1000 [btrfs]
+  [ 2780.030195]  btrfs_log_changed_extents.isra.8+0x841/0x93e [btrfs]
+  [ 2780.030202]  ? do_raw_spin_unlock+0x49/0xc0
+  [ 2780.030215]  ? btrfs_get_num_csums+0x10/0x10 [btrfs]
+  [ 2780.030239]  btrfs_log_inode+0xf83/0x1124 [btrfs]
+  [ 2780.030251]  ? __mutex_unlock_slowpath+0x45/0x2a0
+  [ 2780.030275]  btrfs_log_inode_parent+0x2a0/0xe40 [btrfs]
+  [ 2780.030282]  ? dget_parent+0xa1/0x370
+  [ 2780.030309]  btrfs_log_dentry_safe+0x4a/0x70 [btrfs]
+  [ 2780.030329]  btrfs_sync_file+0x3f3/0x490 [btrfs]
+  [ 2780.030339]  do_fsync+0x38/0x60
+  [ 2780.030343]  __x64_sys_fdatasync+0x13/0x20
+  [ 2780.030345]  do_syscall_64+0x5c/0x280
+  [ 2780.030348]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
+  [ 2780.030356] RIP: 0033:0x7f2d80f6d5f0
+  [ 2780.030361] Code: Bad RIP value.
+  [ 2780.030362] RSP: 002b:00007ffdba3c8548 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
+  [ 2780.030364] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f2d80f6d5f0
+  [ 2780.030365] RDX: 00007ffdba3c84b0 RSI: 00007ffdba3c84b0 RDI: 0000000000000003
+  [ 2780.030367] RBP: 000000000000004a R08: 0000000000000001 R09: 00007ffdba3c855c
+  [ 2780.030368] R10: 0000000000000078 R11: 0000000000000246 R12: 00000000000001f4
+  [ 2780.030369] R13: 0000000051eb851f R14: 00007ffdba3c85f0 R15: 0000557a49220d90
+
+So fix this by making btrfs_truncate_inode_items() not lock the range in
+the inode's iotree when the target root is a log root, since it's not
+needed to lock the range for log roots as the protection from the inode's
+lock and log_mutex are all that's needed.
+
+Fixes: 28553fa992cb28 ("Btrfs: fix race between shrinking truncate and fiemap")
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c |   10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -4752,8 +4752,9 @@ int btrfs_truncate_inode_items(struct bt
+               return -ENOMEM;
+       path->reada = READA_BACK;
+-      lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+-                       &cached_state);
++      if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
++              lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
++                               &cached_state);
+       /*
+        * We want to drop from the next block forward in case this new size is
+@@ -5017,11 +5018,10 @@ out:
+               if (!ret && last_size > new_size)
+                       last_size = new_size;
+               btrfs_ordered_update_i_size(inode, last_size, NULL);
++              unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
++                                   (u64)-1, &cached_state);
+       }
+-      unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+-                           &cached_state);
+-
+       btrfs_free_path(path);
+       return ret;
+ }
diff --git a/queue-5.4/btrfs-fix-race-between-shrinking-truncate-and-fiemap.patch b/queue-5.4/btrfs-fix-race-between-shrinking-truncate-and-fiemap.patch
new file mode 100644 (file)
index 0000000..0a5e4c7
--- /dev/null
@@ -0,0 +1,112 @@
+From 28553fa992cb28be6a65566681aac6cafabb4f2d Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Fri, 7 Feb 2020 12:23:09 +0000
+Subject: Btrfs: fix race between shrinking truncate and fiemap
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 28553fa992cb28be6a65566681aac6cafabb4f2d upstream.
+
+When there is a fiemap executing in parallel with a shrinking truncate
+we can end up in a situation where we have extent maps for which we no
+longer have corresponding file extent items. This is generally harmless
+and at the moment the only consequences are missing file extent items
+representing holes after we expand the file size again after the
+truncate operation removed the prealloc extent items, and stale
+information for future fiemap calls (reporting extents that no longer
+exist or may have been reallocated to other files for example).
+
+Consider the following example:
+
+1) Our inode has a size of 128KiB, one 128KiB extent at file offset 0
+   and a 1MiB prealloc extent at file offset 128KiB;
+
+2) Task A starts doing a shrinking truncate of our inode to reduce it to
+   a size of 64KiB. Before it searches the subvolume tree for file
+   extent items to delete, it drops all the extent maps in the range
+   from 64KiB to (u64)-1 by calling btrfs_drop_extent_cache();
+
+3) Task B starts doing a fiemap against our inode. When looking up for
+   the inode's extent maps in the range from 128KiB to (u64)-1, it
+   doesn't find any in the inode's extent map tree, since they were
+   removed by task A.  Because it didn't find any in the extent map
+   tree, it scans the inode's subvolume tree for file extent items, and
+   it finds the 1MiB prealloc extent at file offset 128KiB, then it
+   creates an extent map based on that file extent item and adds it to
+   inode's extent map tree (this ends up being done by
+   btrfs_get_extent() <- btrfs_get_extent_fiemap() <-
+   get_extent_skip_holes());
+
+4) Task A then drops the prealloc extent at file offset 128KiB and
+   shrinks the 128KiB extent at file offset 0 to a length of 64KiB. The
+   truncation operation finishes and we end up with an extent map
+   representing a 1MiB prealloc extent at file offset 128KiB, even though
+   that extent no longer exists;
+
+After this the two types of problems we have are:
+
+1) Future calls to fiemap always report that a 1MiB prealloc extent
+   exists at file offset 128KiB. This is stale information, no longer
+   correct;
+
+2) If the size of the file is increased, by a truncate operation that
+   increases the file size or by a write into a file offset > 64KiB for
+   example, we end up not inserting file extent items to represent holes
+   for any range between 128KiB and 128KiB + 1MiB, since the hole
+   expansion function, btrfs_cont_expand() will skip hole insertion for
+   any range for which an extent map exists that represents a prealloc
+   extent. This causes fsck to complain about missing file extent items
+   when not using the NO_HOLES feature.
+
+The second issue could be often triggered by test case generic/561 from
+fstests, which runs fsstress and duperemove in parallel, and duperemove
+does frequent fiemap calls.
+
+Essentially the problem happens because fiemap does not acquire the
+inode's lock while truncate does, and fiemap locks the file range in the
+inode's iotree while truncate does not. So fix the issue by making
+btrfs_truncate_inode_items() lock the file range from the new file size
+to (u64)-1, so that it serializes with fiemap.
+
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -4734,6 +4734,8 @@ int btrfs_truncate_inode_items(struct bt
+       u64 bytes_deleted = 0;
+       bool be_nice = false;
+       bool should_throttle = false;
++      const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
++      struct extent_state *cached_state = NULL;
+       BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
+@@ -4750,6 +4752,9 @@ int btrfs_truncate_inode_items(struct bt
+               return -ENOMEM;
+       path->reada = READA_BACK;
++      lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
++                       &cached_state);
++
+       /*
+        * We want to drop from the next block forward in case this new size is
+        * not block aligned since we will be keeping the last block of the
+@@ -5016,6 +5021,9 @@ out:
+               btrfs_ordered_update_i_size(inode, last_size, NULL);
+       }
++      unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
++                           &cached_state);
++
+       btrfs_free_path(path);
+       return ret;
+ }
diff --git a/queue-5.4/btrfs-reset-fs_root-to-null-on-error-in-open_ctree.patch b/queue-5.4/btrfs-reset-fs_root-to-null-on-error-in-open_ctree.patch
new file mode 100644 (file)
index 0000000..d75a3e9
--- /dev/null
@@ -0,0 +1,37 @@
+From 315bf8ef914f31d51d084af950703aa1e09a728c Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Thu, 13 Feb 2020 10:47:28 -0500
+Subject: btrfs: reset fs_root to NULL on error in open_ctree
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit 315bf8ef914f31d51d084af950703aa1e09a728c upstream.
+
+While running my error injection script I hit a panic when we tried to
+clean up the fs_root when freeing the fs_root.  This is because
+fs_info->fs_root == ERR_PTR(-EIO), which isn't great.  Fix this by
+setting fs_info->fs_root = NULL; if we fail to read the root.
+
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Nikolay Borisov <nborisov@suse.com>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/disk-io.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -3203,6 +3203,7 @@ retry_root_backup:
+       if (IS_ERR(fs_info->fs_root)) {
+               err = PTR_ERR(fs_info->fs_root);
+               btrfs_warn(fs_info, "failed to read fs tree: %d", err);
++              fs_info->fs_root = NULL;
+               goto fail_qgroup;
+       }
diff --git a/queue-5.4/kvm-apic-avoid-calculating-pending-eoi-from-an-uninitialized-val.patch b/queue-5.4/kvm-apic-avoid-calculating-pending-eoi-from-an-uninitialized-val.patch
new file mode 100644 (file)
index 0000000..bf91a9f
--- /dev/null
@@ -0,0 +1,38 @@
+From 23520b2def95205f132e167cf5b25c609975e959 Mon Sep 17 00:00:00 2001
+From: Miaohe Lin <linmiaohe@huawei.com>
+Date: Fri, 21 Feb 2020 22:04:46 +0800
+Subject: KVM: apic: avoid calculating pending eoi from an uninitialized val
+
+From: Miaohe Lin <linmiaohe@huawei.com>
+
+commit 23520b2def95205f132e167cf5b25c609975e959 upstream.
+
+When pv_eoi_get_user() fails, 'val' may remain uninitialized and the return
+value of pv_eoi_get_pending() becomes random. Fix the issue by returning
+false when the read fails instead of using the uninitialized value.
+
+Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/lapic.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -637,9 +637,11 @@ static inline bool pv_eoi_enabled(struct
+ static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
+ {
+       u8 val;
+-      if (pv_eoi_get_user(vcpu, &val) < 0)
++      if (pv_eoi_get_user(vcpu, &val) < 0) {
+               printk(KERN_WARNING "Can't read EOI MSR value: 0x%llx\n",
+                          (unsigned long long)vcpu->arch.pv_eoi.msr_val);
++              return false;
++      }
+       return val & 0x1;
+ }
diff --git a/queue-5.4/kvm-nvmx-clear-pin_based_posted_intr-from-nested-pinbased_ctls-only-when-apicv-is-globally-disabled.patch b/queue-5.4/kvm-nvmx-clear-pin_based_posted_intr-from-nested-pinbased_ctls-only-when-apicv-is-globally-disabled.patch
new file mode 100644 (file)
index 0000000..7f2101a
--- /dev/null
@@ -0,0 +1,122 @@
+From a4443267800af240072280c44521caab61924e55 Mon Sep 17 00:00:00 2001
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+Date: Thu, 20 Feb 2020 18:22:04 +0100
+Subject: KVM: nVMX: clear PIN_BASED_POSTED_INTR from nested pinbased_ctls only when apicv is globally disabled
+
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+
+commit a4443267800af240072280c44521caab61924e55 upstream.
+
+When apicv is disabled on a vCPU (e.g. by enabling KVM_CAP_HYPERV_SYNIC*),
+nothing happens to VMX MSRs on the already existing vCPUs, however, all new
+ones are created with PIN_BASED_POSTED_INTR filtered out. This is very
+confusing and results in the following picture inside the guest:
+
+$ rdmsr -ax 0x48d
+ff00000016
+7f00000016
+7f00000016
+7f00000016
+
+This is observed with QEMU and 4-vCPU guest: QEMU creates vCPU0, does
+KVM_CAP_HYPERV_SYNIC2 and then creates the remaining three.
+
+L1 hypervisor may only check CPU0's controls to find out what features
+are available and it will be very confused later. Switch to setting
+PIN_BASED_POSTED_INTR control based on global 'enable_apicv' setting.
+
+Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx/capabilities.h |    1 +
+ arch/x86/kvm/vmx/nested.c       |    5 ++---
+ arch/x86/kvm/vmx/nested.h       |    3 +--
+ arch/x86/kvm/vmx/vmx.c          |   10 ++++------
+ 4 files changed, 8 insertions(+), 11 deletions(-)
+
+--- a/arch/x86/kvm/vmx/capabilities.h
++++ b/arch/x86/kvm/vmx/capabilities.h
+@@ -12,6 +12,7 @@ extern bool __read_mostly enable_ept;
+ extern bool __read_mostly enable_unrestricted_guest;
+ extern bool __read_mostly enable_ept_ad_bits;
+ extern bool __read_mostly enable_pml;
++extern bool __read_mostly enable_apicv;
+ extern int __read_mostly pt_mode;
+ #define PT_MODE_SYSTEM                0
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -5807,8 +5807,7 @@ void nested_vmx_vcpu_setup(void)
+  * bit in the high half is on if the corresponding bit in the control field
+  * may be on. See also vmx_control_verify().
+  */
+-void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
+-                              bool apicv)
++void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
+ {
+       /*
+        * Note that as a general rule, the high half of the MSRs (bits in
+@@ -5835,7 +5834,7 @@ void nested_vmx_setup_ctls_msrs(struct n
+               PIN_BASED_EXT_INTR_MASK |
+               PIN_BASED_NMI_EXITING |
+               PIN_BASED_VIRTUAL_NMIS |
+-              (apicv ? PIN_BASED_POSTED_INTR : 0);
++              (enable_apicv ? PIN_BASED_POSTED_INTR : 0);
+       msrs->pinbased_ctls_high |=
+               PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+               PIN_BASED_VMX_PREEMPTION_TIMER;
+--- a/arch/x86/kvm/vmx/nested.h
++++ b/arch/x86/kvm/vmx/nested.h
+@@ -17,8 +17,7 @@ enum nvmx_vmentry_status {
+ };
+ void vmx_leave_nested(struct kvm_vcpu *vcpu);
+-void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
+-                              bool apicv);
++void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps);
+ void nested_vmx_hardware_unsetup(void);
+ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *));
+ void nested_vmx_vcpu_setup(void);
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -95,7 +95,7 @@ module_param(emulate_invalid_guest_state
+ static bool __read_mostly fasteoi = 1;
+ module_param(fasteoi, bool, S_IRUGO);
+-static bool __read_mostly enable_apicv = 1;
++bool __read_mostly enable_apicv = 1;
+ module_param(enable_apicv, bool, S_IRUGO);
+ /*
+@@ -6802,8 +6802,7 @@ static struct kvm_vcpu *vmx_create_vcpu(
+       if (nested)
+               nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
+-                                         vmx_capability.ept,
+-                                         kvm_vcpu_apicv_active(&vmx->vcpu));
++                                         vmx_capability.ept);
+       else
+               memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
+@@ -6885,8 +6884,7 @@ static int __init vmx_check_processor_co
+       if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
+               return -EIO;
+       if (nested)
+-              nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept,
+-                                         enable_apicv);
++              nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept);
+       if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
+               printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
+                               smp_processor_id());
+@@ -7781,7 +7779,7 @@ static __init int hardware_setup(void)
+       if (nested) {
+               nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
+-                                         vmx_capability.ept, enable_apicv);
++                                         vmx_capability.ept);
+               r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
+               if (r)
diff --git a/queue-5.4/kvm-nvmx-handle-nested-posted-interrupts-when-apicv-is-disabled-for-l1.patch b/queue-5.4/kvm-nvmx-handle-nested-posted-interrupts-when-apicv-is-disabled-for-l1.patch
new file mode 100644 (file)
index 0000000..7ca8041
--- /dev/null
@@ -0,0 +1,113 @@
+From 91a5f413af596ad01097e59bf487eb07cb3f1331 Mon Sep 17 00:00:00 2001
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+Date: Thu, 20 Feb 2020 18:22:05 +0100
+Subject: KVM: nVMX: handle nested posted interrupts when apicv is disabled for L1
+
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+
+commit 91a5f413af596ad01097e59bf487eb07cb3f1331 upstream.
+
+Even when APICv is disabled for L1, it can be (and actually is) still
+available for L2; this means we need to always call
+vmx_deliver_nested_posted_interrupt() when attempting an interrupt
+delivery.
+
+Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/kvm_host.h |    2 +-
+ arch/x86/kvm/lapic.c            |    5 +----
+ arch/x86/kvm/svm.c              |    7 ++++++-
+ arch/x86/kvm/vmx/vmx.c          |   13 +++++++++----
+ 4 files changed, 17 insertions(+), 10 deletions(-)
+
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1098,7 +1098,7 @@ struct kvm_x86_ops {
+       void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+       void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
+       void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
+-      void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
++      int (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
+       int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
+       int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
+       int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -1056,11 +1056,8 @@ static int __apic_accept_irq(struct kvm_
+                                                      apic->regs + APIC_TMR);
+               }
+-              if (vcpu->arch.apicv_active)
+-                      kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
+-              else {
++              if (kvm_x86_ops->deliver_posted_interrupt(vcpu, vector)) {
+                       kvm_lapic_set_irr(vector, apic);
+-
+                       kvm_make_request(KVM_REQ_EVENT, vcpu);
+                       kvm_vcpu_kick(vcpu);
+               }
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -5141,8 +5141,11 @@ static void svm_load_eoi_exitmap(struct
+       return;
+ }
+-static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
++static int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
+ {
++      if (!vcpu->arch.apicv_active)
++              return -1;
++
+       kvm_lapic_set_irr(vec, vcpu->arch.apic);
+       smp_mb__after_atomic();
+@@ -5154,6 +5157,8 @@ static void svm_deliver_avic_intr(struct
+               put_cpu();
+       } else
+               kvm_vcpu_wake_up(vcpu);
++
++      return 0;
+ }
+ static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -3853,24 +3853,29 @@ static int vmx_deliver_nested_posted_int
+  * 2. If target vcpu isn't running(root mode), kick it to pick up the
+  * interrupt from PIR in next vmentry.
+  */
+-static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
++static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       int r;
+       r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
+       if (!r)
+-              return;
++              return 0;
++
++      if (!vcpu->arch.apicv_active)
++              return -1;
+       if (pi_test_and_set_pir(vector, &vmx->pi_desc))
+-              return;
++              return 0;
+       /* If a previous notification has sent the IPI, nothing to do.  */
+       if (pi_test_and_set_on(&vmx->pi_desc))
+-              return;
++              return 0;
+       if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
+               kvm_vcpu_kick(vcpu);
++
++      return 0;
+ }
+ /*
index 1e24ad5929f4e2282593a01c2134defb4140d50a..6c9f24a275556ebde3b72fcb0352f5e5890f6693 100644 (file)
@@ -86,3 +86,14 @@ ext4-rename-s_journal_flag_rwsem-to-s_writepages_rwsem.patch
 ext4-fix-race-between-writepages-and-enabling-ext4_extents_fl.patch
 kvm-nvmx-refactor-io-bitmap-checks-into-helper-function.patch
 kvm-nvmx-check-io-instruction-vm-exit-conditions.patch
+kvm-nvmx-clear-pin_based_posted_intr-from-nested-pinbased_ctls-only-when-apicv-is-globally-disabled.patch
+kvm-nvmx-handle-nested-posted-interrupts-when-apicv-is-disabled-for-l1.patch
+kvm-apic-avoid-calculating-pending-eoi-from-an-uninitialized-val.patch
+btrfs-destroy-qgroup-extent-records-on-transaction-abort.patch
+btrfs-fix-bytes_may_use-underflow-in-prealloc-error-condtition.patch
+btrfs-reset-fs_root-to-null-on-error-in-open_ctree.patch
+btrfs-do-not-check-delayed-items-are-empty-for-single-transaction-cleanup.patch
+btrfs-fix-btrfs_wait_ordered_range-so-that-it-waits-for-all-ordered-extents.patch
+btrfs-fix-race-between-shrinking-truncate-and-fiemap.patch
+btrfs-don-t-set-path-leave_spinning-for-truncate.patch
+btrfs-fix-deadlock-during-fast-fsync-when-logging-prealloc-extents-beyond-eof.patch