6.0-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Tue, 8 Nov 2022 08:11:04 +0000 (09:11 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Tue, 8 Nov 2022 08:11:04 +0000 (09:11 +0100)
added patches:
ext4-fix-bug_on-when-directory-entry-has-invalid-rec_len.patch
ext4-fix-warning-in-ext4_da_release_space.patch
ext4-update-the-backup-superblock-s-at-the-end-of-the-online-resize.patch
kvm-arm64-fix-bad-dereference-on-mte-enabled-systems.patch
kvm-arm64-fix-smpri_el1-tpidr2_el0-trapping-on-vhe.patch
kvm-initialize-gfn_to_pfn_cache-locks-in-dedicated-helper.patch
kvm-reject-attempts-to-consume-or-refresh-inactive-gfn_to_pfn_cache.patch
kvm-vmx-advertise-pmu-lbrs-if-and-only-if-perf-supports-lbrs.patch
kvm-vmx-fold-vmx_supported_debugctl-into-vcpu_supported_debugctl.patch
kvm-vmx-fully-disable-sgx-if-secondary_exec_encls_exiting-unavailable.patch
kvm-vmx-ignore-guest-cpuid-for-host-userspace-writes-to-debugctl.patch
kvm-x86-emulator-em_sysexit-should-update-ctxt-mode.patch
kvm-x86-emulator-introduce-emulator_recalc_and_set_mode.patch
kvm-x86-emulator-update-the-emulation-mode-after-cr0-write.patch
kvm-x86-emulator-update-the-emulation-mode-after-rsm.patch
kvm-x86-mask-off-reserved-bits-in-cpuid.80000001h.patch
kvm-x86-mask-off-reserved-bits-in-cpuid.80000006h.patch
kvm-x86-mask-off-reserved-bits-in-cpuid.80000008h.patch
kvm-x86-mask-off-reserved-bits-in-cpuid.8000001ah.patch
kvm-x86-mask-off-reserved-bits-in-cpuid.8000001fh.patch
kvm-x86-smm-number-of-gprs-in-the-smram-image-depends-on-the-image-format.patch
parisc-avoid-printing-the-hardware-path-twice.patch
parisc-export-iosapic_serial_irq-symbol-for-serial-port-driver.patch
parisc-make-8250_gsc-driver-dependend-on-config_parisc.patch
x86-syscall-include-asm-ptrace.h-in-syscall_wrapper-header.patch
x86-tdx-panic-on-bad-configs-that-ve-on-private-memory-access.patch
x86-tdx-prepare-for-using-info-call-for-a-second-purpose.patch

28 files changed:
queue-6.0/ext4-fix-bug_on-when-directory-entry-has-invalid-rec_len.patch [new file with mode: 0644]
queue-6.0/ext4-fix-warning-in-ext4_da_release_space.patch [new file with mode: 0644]
queue-6.0/ext4-update-the-backup-superblock-s-at-the-end-of-the-online-resize.patch [new file with mode: 0644]
queue-6.0/kvm-arm64-fix-bad-dereference-on-mte-enabled-systems.patch [new file with mode: 0644]
queue-6.0/kvm-arm64-fix-smpri_el1-tpidr2_el0-trapping-on-vhe.patch [new file with mode: 0644]
queue-6.0/kvm-initialize-gfn_to_pfn_cache-locks-in-dedicated-helper.patch [new file with mode: 0644]
queue-6.0/kvm-reject-attempts-to-consume-or-refresh-inactive-gfn_to_pfn_cache.patch [new file with mode: 0644]
queue-6.0/kvm-vmx-advertise-pmu-lbrs-if-and-only-if-perf-supports-lbrs.patch [new file with mode: 0644]
queue-6.0/kvm-vmx-fold-vmx_supported_debugctl-into-vcpu_supported_debugctl.patch [new file with mode: 0644]
queue-6.0/kvm-vmx-fully-disable-sgx-if-secondary_exec_encls_exiting-unavailable.patch [new file with mode: 0644]
queue-6.0/kvm-vmx-ignore-guest-cpuid-for-host-userspace-writes-to-debugctl.patch [new file with mode: 0644]
queue-6.0/kvm-x86-emulator-em_sysexit-should-update-ctxt-mode.patch [new file with mode: 0644]
queue-6.0/kvm-x86-emulator-introduce-emulator_recalc_and_set_mode.patch [new file with mode: 0644]
queue-6.0/kvm-x86-emulator-update-the-emulation-mode-after-cr0-write.patch [new file with mode: 0644]
queue-6.0/kvm-x86-emulator-update-the-emulation-mode-after-rsm.patch [new file with mode: 0644]
queue-6.0/kvm-x86-mask-off-reserved-bits-in-cpuid.80000001h.patch [new file with mode: 0644]
queue-6.0/kvm-x86-mask-off-reserved-bits-in-cpuid.80000006h.patch [new file with mode: 0644]
queue-6.0/kvm-x86-mask-off-reserved-bits-in-cpuid.80000008h.patch [new file with mode: 0644]
queue-6.0/kvm-x86-mask-off-reserved-bits-in-cpuid.8000001ah.patch [new file with mode: 0644]
queue-6.0/kvm-x86-mask-off-reserved-bits-in-cpuid.8000001fh.patch [new file with mode: 0644]
queue-6.0/kvm-x86-smm-number-of-gprs-in-the-smram-image-depends-on-the-image-format.patch [new file with mode: 0644]
queue-6.0/parisc-avoid-printing-the-hardware-path-twice.patch [new file with mode: 0644]
queue-6.0/parisc-export-iosapic_serial_irq-symbol-for-serial-port-driver.patch [new file with mode: 0644]
queue-6.0/parisc-make-8250_gsc-driver-dependend-on-config_parisc.patch [new file with mode: 0644]
queue-6.0/series
queue-6.0/x86-syscall-include-asm-ptrace.h-in-syscall_wrapper-header.patch [new file with mode: 0644]
queue-6.0/x86-tdx-panic-on-bad-configs-that-ve-on-private-memory-access.patch [new file with mode: 0644]
queue-6.0/x86-tdx-prepare-for-using-info-call-for-a-second-purpose.patch [new file with mode: 0644]

diff --git a/queue-6.0/ext4-fix-bug_on-when-directory-entry-has-invalid-rec_len.patch b/queue-6.0/ext4-fix-bug_on-when-directory-entry-has-invalid-rec_len.patch
new file mode 100644 (file)
index 0000000..56a4ae0
--- /dev/null
@@ -0,0 +1,69 @@
+From 17a0bc9bd697f75cfdf9b378d5eb2d7409c91340 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Lu=C3=ADs=20Henriques?= <lhenriques@suse.de>
+Date: Wed, 12 Oct 2022 14:13:30 +0100
+Subject: ext4: fix BUG_ON() when directory entry has invalid rec_len
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Luís Henriques <lhenriques@suse.de>
+
+commit 17a0bc9bd697f75cfdf9b378d5eb2d7409c91340 upstream.
+
+The rec_len field in the directory entry has to be a multiple of 4.  A
+corrupted filesystem image can be used to hit a BUG() in
+ext4_rec_len_to_disk(), called from make_indexed_dir().
+
+ ------------[ cut here ]------------
+ kernel BUG at fs/ext4/ext4.h:2413!
+ ...
+ RIP: 0010:make_indexed_dir+0x53f/0x5f0
+ ...
+ Call Trace:
+  <TASK>
+  ? add_dirent_to_buf+0x1b2/0x200
+  ext4_add_entry+0x36e/0x480
+  ext4_add_nondir+0x2b/0xc0
+  ext4_create+0x163/0x200
+  path_openat+0x635/0xe90
+  do_filp_open+0xb4/0x160
+  ? __create_object.isra.0+0x1de/0x3b0
+  ? _raw_spin_unlock+0x12/0x30
+  do_sys_openat2+0x91/0x150
+  __x64_sys_open+0x6c/0xa0
+  do_syscall_64+0x3c/0x80
+  entry_SYSCALL_64_after_hwframe+0x46/0xb0
+
+The fix simply adds a call to ext4_check_dir_entry() to validate the
+directory entry, returning -EFSCORRUPTED if the entry is invalid.
+
+CC: stable@kernel.org
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=216540
+Signed-off-by: Luís Henriques <lhenriques@suse.de>
+Link: https://lore.kernel.org/r/20221012131330.32456-1-lhenriques@suse.de
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/namei.c |   10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -2259,8 +2259,16 @@ static int make_indexed_dir(handle_t *ha
+       memset(de, 0, len); /* wipe old data */
+       de = (struct ext4_dir_entry_2 *) data2;
+       top = data2 + len;
+-      while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
++      while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) {
++              if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len,
++                                       (data2 + (blocksize - csum_size) -
++                                        (char *) de))) {
++                      brelse(bh2);
++                      brelse(bh);
++                      return -EFSCORRUPTED;
++              }
+               de = de2;
++      }
+       de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
+                                          (char *) de, blocksize);
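
For context, the BUG() at fs/ext4/ext4.h:2413 comes from the sanity check at
the top of ext4_rec_len_to_disk(): a corrupted on-disk rec_len can land 'de'
on a misaligned offset, so the length computed from it fails the (len & 3)
test. A simplified sketch of the helper (the PAGE_SIZE >= 65536 encoding is
omitted; exact details may vary by kernel version):

    static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
    {
            /* len must fit in the block, the block size must be sane,
             * and len must be a multiple of 4. */
            BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3));
            return cpu_to_le16(len);
    }
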
diff --git a/queue-6.0/ext4-fix-warning-in-ext4_da_release_space.patch b/queue-6.0/ext4-fix-warning-in-ext4_da_release_space.patch
new file mode 100644 (file)
index 0000000..5ae9e23
--- /dev/null
@@ -0,0 +1,102 @@
+From 1b8f787ef547230a3249bcf897221ef0cc78481b Mon Sep 17 00:00:00 2001
+From: Ye Bin <yebin10@huawei.com>
+Date: Tue, 18 Oct 2022 10:27:01 +0800
+Subject: ext4: fix warning in 'ext4_da_release_space'
+
+From: Ye Bin <yebin10@huawei.com>
+
+commit 1b8f787ef547230a3249bcf897221ef0cc78481b upstream.
+
+Syzkaller reported the following issue:
+EXT4-fs (loop0): Free/Dirty block details
+EXT4-fs (loop0): free_blocks=0
+EXT4-fs (loop0): dirty_blocks=0
+EXT4-fs (loop0): Block reservation details
+EXT4-fs (loop0): i_reserved_data_blocks=0
+EXT4-fs warning (device loop0): ext4_da_release_space:1527: ext4_da_release_space: ino 18, to_free 1 with only 0 reserved data blocks
+------------[ cut here ]------------
+WARNING: CPU: 0 PID: 92 at fs/ext4/inode.c:1528 ext4_da_release_space+0x25e/0x370 fs/ext4/inode.c:1524
+Modules linked in:
+CPU: 0 PID: 92 Comm: kworker/u4:4 Not tainted 6.0.0-syzkaller-09423-g493ffd6605b2 #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/22/2022
+Workqueue: writeback wb_workfn (flush-7:0)
+RIP: 0010:ext4_da_release_space+0x25e/0x370 fs/ext4/inode.c:1528
+RSP: 0018:ffffc900015f6c90 EFLAGS: 00010296
+RAX: 42215896cd52ea00 RBX: 0000000000000000 RCX: 42215896cd52ea00
+RDX: 0000000000000000 RSI: 0000000080000001 RDI: 0000000000000000
+RBP: 1ffff1100e907d96 R08: ffffffff816aa79d R09: fffff520002bece5
+R10: fffff520002bece5 R11: 1ffff920002bece4 R12: ffff888021fd2000
+R13: ffff88807483ecb0 R14: 0000000000000001 R15: ffff88807483e740
+FS:  0000000000000000(0000) GS:ffff8880b9a00000(0000) knlGS:0000000000000000
+CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00005555569ba628 CR3: 000000000c88e000 CR4: 00000000003506f0
+DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+Call Trace:
+ <TASK>
+ ext4_es_remove_extent+0x1ab/0x260 fs/ext4/extents_status.c:1461
+ mpage_release_unused_pages+0x24d/0xef0 fs/ext4/inode.c:1589
+ ext4_writepages+0x12eb/0x3be0 fs/ext4/inode.c:2852
+ do_writepages+0x3c3/0x680 mm/page-writeback.c:2469
+ __writeback_single_inode+0xd1/0x670 fs/fs-writeback.c:1587
+ writeback_sb_inodes+0xb3b/0x18f0 fs/fs-writeback.c:1870
+ wb_writeback+0x41f/0x7b0 fs/fs-writeback.c:2044
+ wb_do_writeback fs/fs-writeback.c:2187 [inline]
+ wb_workfn+0x3cb/0xef0 fs/fs-writeback.c:2227
+ process_one_work+0x877/0xdb0 kernel/workqueue.c:2289
+ worker_thread+0xb14/0x1330 kernel/workqueue.c:2436
+ kthread+0x266/0x300 kernel/kthread.c:376
+ ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306
+ </TASK>
+
+The above issue may happen as follows:
+ext4_da_write_begin
+  ext4_create_inline_data
+    ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+    ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+__ext4_ioctl
+  ext4_ext_migrate -> will lead to eh->eh_entries not zero, and set extent flag
+ext4_da_write_begin
+  ext4_da_convert_inline_data_to_extent
+    ext4_da_write_inline_data_begin
+      ext4_da_map_blocks
+        ext4_insert_delayed_block
+         if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk))
+           if (!ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk))
+             ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); -> will return 1
+              allocated = true;
+          ext4_es_insert_delayed_block(inode, lblk, allocated);
+ext4_writepages
+  mpage_map_and_submit_extent(handle, &mpd, &give_up_on_write); -> return -ENOSPC
+  mpage_release_unused_pages(&mpd, give_up_on_write); -> give_up_on_write == 1
+    ext4_es_remove_extent
+      ext4_da_release_space(inode, reserved);
+        if (unlikely(to_free > ei->i_reserved_data_blocks))
+         -> to_free == 1  but ei->i_reserved_data_blocks == 0
+         -> then trigger warning as above
+
+To solve the above issue, forbid migration for inodes that contain inline data.
+
+Cc: stable@kernel.org
+Reported-by: syzbot+c740bb18df70ad00952e@syzkaller.appspotmail.com
+Signed-off-by: Ye Bin <yebin10@huawei.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Link: https://lore.kernel.org/r/20221018022701.683489-1-yebin10@huawei.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/migrate.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/migrate.c
++++ b/fs/ext4/migrate.c
+@@ -425,7 +425,8 @@ int ext4_ext_migrate(struct inode *inode
+        * already is extent-based, error out.
+        */
+       if (!ext4_has_feature_extents(inode->i_sb) ||
+-          (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
++          ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
++          ext4_has_inline_data(inode))
+               return -EINVAL;
+       if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
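
The new ext4_has_inline_data() guard tests the inode flag plus the cached
inline-data xattr offset; roughly (a sketch of the helper in fs/ext4/ext4.h,
not part of this patch):

    static inline int ext4_has_inline_data(struct inode *inode)
    {
            /* The inode keeps its data inline (in the inode body/xattr
             * space) rather than in extent-mapped blocks. */
            return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) &&
                   EXT4_I(inode)->i_inline_off;
    }
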
diff --git a/queue-6.0/ext4-update-the-backup-superblock-s-at-the-end-of-the-online-resize.patch b/queue-6.0/ext4-update-the-backup-superblock-s-at-the-end-of-the-online-resize.patch
new file mode 100644 (file)
index 0000000..40ea1eb
--- /dev/null
@@ -0,0 +1,80 @@
+From 9a8c5b0d061554fedd7dbe894e63aa34d0bac7c4 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Thu, 27 Oct 2022 16:04:36 -0400
+Subject: ext4: update the backup superblock's at the end of the online resize
+
+From: Theodore Ts'o <tytso@mit.edu>
+
+commit 9a8c5b0d061554fedd7dbe894e63aa34d0bac7c4 upstream.
+
+When expanding a file system using online resize, various fields in
+the superblock (e.g., s_blocks_count, s_inodes_count, etc.) change.
+To update the backup superblocks, the online resize uses the function
+update_backups() in fs/ext4/resize.c.  This function was not updating
+the checksum field in the backup superblocks.  This wasn't a big deal
+previously, because e2fsck didn't care about the checksum field in the
+backup superblock.  (And indeed, update_backups() goes all the way
+back to the ext3 days, well before we had support for metadata
+checksums.)
+
+However, there is an alternate, more general way of updating
+superblock fields, ext4_update_primary_sb() in fs/ext4/ioctl.c.  This
+function does check the checksum of the backup superblock, and if it
+doesn't match will mark the file system as corrupted.  That was
+clearly not the intent, so avoid aborting the resize when a bad
+superblock is found.
+
+In addition, teach update_backups() to properly update the checksum in
+the backup superblocks.  We will eventually want to unify
+updapte_backups() with the infrasture in ext4_update_primary_sb(), but
+that's for another day.
+
+Note: The problem has been around for a while; it just didn't really
+matter until ext4_update_primary_sb() was added by commit bbc605cdb1e1
+("ext4: implement support for get/set fs label").  And it became
+trivially easy to reproduce after commit 827891a38acc ("ext4: update
+the s_overhead_clusters in the backup sb's when resizing") in v6.0.
+
+Cc: stable@kernel.org # 5.17+
+Fixes: bbc605cdb1e1 ("ext4: implement support for get/set fs label")
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/ioctl.c  |    3 +--
+ fs/ext4/resize.c |    5 +++++
+ 2 files changed, 6 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -145,9 +145,8 @@ static int ext4_update_backup_sb(struct
+       if (ext4_has_metadata_csum(sb) &&
+           es->s_checksum != ext4_superblock_csum(sb, es)) {
+               ext4_msg(sb, KERN_ERR, "Invalid checksum for backup "
+-              "superblock %llu\n", sb_block);
++              "superblock %llu", sb_block);
+               unlock_buffer(bh);
+-              err = -EFSBADCRC;
+               goto out_bh;
+       }
+       func(es, arg);
+--- a/fs/ext4/resize.c
++++ b/fs/ext4/resize.c
+@@ -1158,6 +1158,7 @@ static void update_backups(struct super_
+       while (group < sbi->s_groups_count) {
+               struct buffer_head *bh;
+               ext4_fsblk_t backup_block;
++              struct ext4_super_block *es;
+               /* Out of journal space, and can't get more - abort - so sad */
+               err = ext4_resize_ensure_credits_batch(handle, 1);
+@@ -1186,6 +1187,10 @@ static void update_backups(struct super_
+               memcpy(bh->b_data, data, size);
+               if (rest)
+                       memset(bh->b_data + size, 0, rest);
++              es = (struct ext4_super_block *) bh->b_data;
++              es->s_block_group_nr = cpu_to_le16(group);
++              if (ext4_has_metadata_csum(sb))
++                      es->s_checksum = ext4_superblock_csum(sb, es);
+               set_buffer_uptodate(bh);
+               unlock_buffer(bh);
+               err = ext4_handle_dirty_metadata(handle, NULL, bh);
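
For reference, ext4_superblock_csum() checksums the superblock up to (but
not including) the s_checksum field, which is why the checksum must be
recomputed after s_block_group_nr is rewritten above. A simplified sketch of
the helper in fs/ext4/super.c (details may differ across versions):

    static __le32 ext4_superblock_csum(struct super_block *sb,
                                       struct ext4_super_block *es)
    {
            struct ext4_sb_info *sbi = EXT4_SB(sb);
            int offset = offsetof(struct ext4_super_block, s_checksum);
            __u32 csum;

            /* crc32c over everything that precedes s_checksum */
            csum = ext4_chksum(sbi, ~0, (char *)es, offset);

            return cpu_to_le32(csum);
    }
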
diff --git a/queue-6.0/kvm-arm64-fix-bad-dereference-on-mte-enabled-systems.patch b/queue-6.0/kvm-arm64-fix-bad-dereference-on-mte-enabled-systems.patch
new file mode 100644 (file)
index 0000000..077d834
--- /dev/null
@@ -0,0 +1,49 @@
+From b6bcdc9f6b8321e4471ff45413b6410e16762a8d Mon Sep 17 00:00:00 2001
+From: Ryan Roberts <ryan.roberts@arm.com>
+Date: Thu, 27 Oct 2022 13:09:45 +0100
+Subject: KVM: arm64: Fix bad dereference on MTE-enabled systems
+
+From: Ryan Roberts <ryan.roberts@arm.com>
+
+commit b6bcdc9f6b8321e4471ff45413b6410e16762a8d upstream.
+
+enter_exception64() performs an MTE check, which involves dereferencing
+vcpu->kvm. While vcpu has already been fixed up to be a HYP VA pointer,
+kvm is still a pointer in the kernel VA space.
+
+This only affects nVHE configurations with MTE enabled, as in other
+cases, the pointer is either valid (VHE) or not dereferenced (!MTE).
+
+Fix this by first converting kvm to a HYP VA pointer.
+
+Fixes: ea7fc1bb1cd1 ("KVM: arm64: Introduce MTE VM feature")
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: Steven Price <steven.price@arm.com>
+[maz: commit message tidy-up]
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20221027120945.29679-1-ryan.roberts@arm.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/hyp/exception.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/arch/arm64/kvm/hyp/exception.c
++++ b/arch/arm64/kvm/hyp/exception.c
+@@ -13,6 +13,7 @@
+ #include <hyp/adjust_pc.h>
+ #include <linux/kvm_host.h>
+ #include <asm/kvm_emulate.h>
++#include <asm/kvm_mmu.h>
+ #if !defined (__KVM_NVHE_HYPERVISOR__) && !defined (__KVM_VHE_HYPERVISOR__)
+ #error Hypervisor code only!
+@@ -115,7 +116,7 @@ static void enter_exception64(struct kvm
+       new |= (old & PSR_C_BIT);
+       new |= (old & PSR_V_BIT);
+-      if (kvm_has_mte(vcpu->kvm))
++      if (kvm_has_mte(kern_hyp_va(vcpu->kvm)))
+               new |= PSR_TCO_BIT;
+       new |= (old & PSR_DIT_BIT);
diff --git a/queue-6.0/kvm-arm64-fix-smpri_el1-tpidr2_el0-trapping-on-vhe.patch b/queue-6.0/kvm-arm64-fix-smpri_el1-tpidr2_el0-trapping-on-vhe.patch
new file mode 100644 (file)
index 0000000..2e13697
--- /dev/null
@@ -0,0 +1,143 @@
+From 4151bb636acf32bb2e6126cec8216b023117c0e9 Mon Sep 17 00:00:00 2001
+From: Marc Zyngier <maz@kernel.org>
+Date: Tue, 1 Nov 2022 12:19:51 +0000
+Subject: KVM: arm64: Fix SMPRI_EL1/TPIDR2_EL0 trapping on VHE
+
+From: Marc Zyngier <maz@kernel.org>
+
+commit 4151bb636acf32bb2e6126cec8216b023117c0e9 upstream.
+
+The trapping of SMPRI_EL1 and TPIDR2_EL0 currently only really
+works on nVHE, as only this mode uses the fine-grained trapping
+that controls these two registers.
+
+Move the trapping enable/disable code into
+__{de,}activate_traps_common(), allowing it to be called when it
+actually matters on VHE, and remove the flipping of EL2 control
+for TPIDR2_EL0, which only affects the host access of this
+register.
+
+Fixes: 861262ab8627 ("KVM: arm64: Handle SME host state when running guests")
+Reported-by: Mark Brown <broonie@kernel.org>
+Reviewed-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/86bkpqer4z.wl-maz@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/hyp/include/hyp/switch.h | 20 +++++++++++++++++++
+ arch/arm64/kvm/hyp/nvhe/switch.c        | 26 -------------------------
+ arch/arm64/kvm/hyp/vhe/switch.c         |  8 --------
+ 3 files changed, 20 insertions(+), 34 deletions(-)
+
+diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
+index 6cbbb6c02f66..3330d1b76bdd 100644
+--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
++++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
+@@ -87,6 +87,17 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
+       vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2);
+       write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
++
++      if (cpus_have_final_cap(ARM64_SME)) {
++              sysreg_clear_set_s(SYS_HFGRTR_EL2,
++                                 HFGxTR_EL2_nSMPRI_EL1_MASK |
++                                 HFGxTR_EL2_nTPIDR2_EL0_MASK,
++                                 0);
++              sysreg_clear_set_s(SYS_HFGWTR_EL2,
++                                 HFGxTR_EL2_nSMPRI_EL1_MASK |
++                                 HFGxTR_EL2_nTPIDR2_EL0_MASK,
++                                 0);
++      }
+ }
+ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
+@@ -96,6 +107,15 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
+       write_sysreg(0, hstr_el2);
+       if (kvm_arm_support_pmu_v3())
+               write_sysreg(0, pmuserenr_el0);
++
++      if (cpus_have_final_cap(ARM64_SME)) {
++              sysreg_clear_set_s(SYS_HFGRTR_EL2, 0,
++                                 HFGxTR_EL2_nSMPRI_EL1_MASK |
++                                 HFGxTR_EL2_nTPIDR2_EL0_MASK);
++              sysreg_clear_set_s(SYS_HFGWTR_EL2, 0,
++                                 HFGxTR_EL2_nSMPRI_EL1_MASK |
++                                 HFGxTR_EL2_nTPIDR2_EL0_MASK);
++      }
+ }
+ static inline void ___activate_traps(struct kvm_vcpu *vcpu)
+diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
+index 8e9d49a964be..c2cb46ca4fb6 100644
+--- a/arch/arm64/kvm/hyp/nvhe/switch.c
++++ b/arch/arm64/kvm/hyp/nvhe/switch.c
+@@ -55,18 +55,6 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
+       write_sysreg(val, cptr_el2);
+       write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2);
+-      if (cpus_have_final_cap(ARM64_SME)) {
+-              val = read_sysreg_s(SYS_HFGRTR_EL2);
+-              val &= ~(HFGxTR_EL2_nTPIDR2_EL0_MASK |
+-                       HFGxTR_EL2_nSMPRI_EL1_MASK);
+-              write_sysreg_s(val, SYS_HFGRTR_EL2);
+-
+-              val = read_sysreg_s(SYS_HFGWTR_EL2);
+-              val &= ~(HFGxTR_EL2_nTPIDR2_EL0_MASK |
+-                       HFGxTR_EL2_nSMPRI_EL1_MASK);
+-              write_sysreg_s(val, SYS_HFGWTR_EL2);
+-      }
+-
+       if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
+               struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
+@@ -110,20 +98,6 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
+       write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
+-      if (cpus_have_final_cap(ARM64_SME)) {
+-              u64 val;
+-
+-              val = read_sysreg_s(SYS_HFGRTR_EL2);
+-              val |= HFGxTR_EL2_nTPIDR2_EL0_MASK |
+-                      HFGxTR_EL2_nSMPRI_EL1_MASK;
+-              write_sysreg_s(val, SYS_HFGRTR_EL2);
+-
+-              val = read_sysreg_s(SYS_HFGWTR_EL2);
+-              val |= HFGxTR_EL2_nTPIDR2_EL0_MASK |
+-                      HFGxTR_EL2_nSMPRI_EL1_MASK;
+-              write_sysreg_s(val, SYS_HFGWTR_EL2);
+-      }
+-
+       cptr = CPTR_EL2_DEFAULT;
+       if (vcpu_has_sve(vcpu) && (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED))
+               cptr |= CPTR_EL2_TZ;
+diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
+index 7acb87eaa092..1a97391fedd2 100644
+--- a/arch/arm64/kvm/hyp/vhe/switch.c
++++ b/arch/arm64/kvm/hyp/vhe/switch.c
+@@ -63,10 +63,6 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
+               __activate_traps_fpsimd32(vcpu);
+       }
+-      if (cpus_have_final_cap(ARM64_SME))
+-              write_sysreg(read_sysreg(sctlr_el2) & ~SCTLR_ELx_ENTP2,
+-                           sctlr_el2);
+-
+       write_sysreg(val, cpacr_el1);
+       write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1);
+@@ -88,10 +84,6 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
+        */
+       asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
+-      if (cpus_have_final_cap(ARM64_SME))
+-              write_sysreg(read_sysreg(sctlr_el2) | SCTLR_ELx_ENTP2,
+-                           sctlr_el2);
+-
+       write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1);
+       if (!arm64_kernel_unmapped_at_el0())
+-- 
+2.38.1
+
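
The sysreg_clear_set_s() helper that replaces the open-coded sequences above
is a read-modify-write of a system register: clearing the HFGxTR_EL2 "n"
bits (with a zero set-mask) enables the fine-grained traps, and setting them
on the deactivate path disables the traps again. Conceptually (simplified
from arch/arm64/include/asm/sysreg.h):

    #define sysreg_clear_set_s(sysreg, clear, set) do {              \
            u64 __scs_val = read_sysreg_s(sysreg);                   \
            u64 __scs_new = (__scs_val & ~(u64)(clear)) | (set);     \
            if (__scs_new != __scs_val)   /* skip redundant writes */ \
                    write_sysreg_s(__scs_new, sysreg);               \
    } while (0)
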
diff --git a/queue-6.0/kvm-initialize-gfn_to_pfn_cache-locks-in-dedicated-helper.patch b/queue-6.0/kvm-initialize-gfn_to_pfn_cache-locks-in-dedicated-helper.patch
new file mode 100644 (file)
index 0000000..3f4135e
--- /dev/null
@@ -0,0 +1,315 @@
+From 52491a38b2c2411f3f0229dc6ad610349c704a41 Mon Sep 17 00:00:00 2001
+From: Michal Luczaj <mhal@rbox.co>
+Date: Thu, 13 Oct 2022 21:12:19 +0000
+Subject: KVM: Initialize gfn_to_pfn_cache locks in dedicated helper
+
+From: Michal Luczaj <mhal@rbox.co>
+
+commit 52491a38b2c2411f3f0229dc6ad610349c704a41 upstream.
+
+Move the gfn_to_pfn_cache lock initialization to another helper and
+call the new helper during VM/vCPU creation.  There are race
+conditions possible due to kvm_gfn_to_pfn_cache_init()'s
+ability to re-initialize the cache's locks.
+
+For example: a race between ioctl(KVM_XEN_HVM_EVTCHN_SEND) and
+kvm_gfn_to_pfn_cache_init() leads to a corrupted shinfo gpc lock.
+
+                (thread 1)                |           (thread 2)
+                                          |
+ kvm_xen_set_evtchn_fast                  |
+  read_lock_irqsave(&gpc->lock, ...)      |
+                                          | kvm_gfn_to_pfn_cache_init
+                                          |  rwlock_init(&gpc->lock)
+  read_unlock_irqrestore(&gpc->lock, ...) |
+
+Rename "cache_init" and "cache_destroy" to activate+deactivate to
+avoid implying that the cache really is destroyed/freed.
+
+Note, there are more races in the newly named kvm_gpc_activate() that will
+be addressed separately.
+
+Fixes: 982ed0de4753 ("KVM: Reinstate gfn_to_pfn_cache with invalidation support")
+Cc: stable@vger.kernel.org
+Suggested-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Michal Luczaj <mhal@rbox.co>
+[sean: call out that this is a bug fix]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20221013211234.1318131-2-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c       |   12 +++++----
+ arch/x86/kvm/xen.c       |   57 ++++++++++++++++++++++++-----------------------
+ include/linux/kvm_host.h |   24 ++++++++++++++-----
+ virt/kvm/pfncache.c      |   21 +++++++++--------
+ 4 files changed, 66 insertions(+), 48 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -2304,11 +2304,11 @@ static void kvm_write_system_time(struct
+       /* we verify if the enable bit is set... */
+       if (system_time & 1) {
+-              kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
+-                                        KVM_HOST_USES_PFN, system_time & ~1ULL,
+-                                        sizeof(struct pvclock_vcpu_time_info));
++              kvm_gpc_activate(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
++                               KVM_HOST_USES_PFN, system_time & ~1ULL,
++                               sizeof(struct pvclock_vcpu_time_info));
+       } else {
+-              kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
++              kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);
+       }
+       return;
+@@ -3377,7 +3377,7 @@ static int kvm_pv_enable_async_pf_int(st
+ static void kvmclock_reset(struct kvm_vcpu *vcpu)
+ {
+-      kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
++      kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);
+       vcpu->arch.time = 0;
+ }
+@@ -11629,6 +11629,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu
+       vcpu->arch.regs_avail = ~0;
+       vcpu->arch.regs_dirty = ~0;
++      kvm_gpc_init(&vcpu->arch.pv_time);
++
+       if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
+               vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+       else
+--- a/arch/x86/kvm/xen.c
++++ b/arch/x86/kvm/xen.c
+@@ -42,13 +42,13 @@ static int kvm_xen_shared_info_init(stru
+       int idx = srcu_read_lock(&kvm->srcu);
+       if (gfn == GPA_INVALID) {
+-              kvm_gfn_to_pfn_cache_destroy(kvm, gpc);
++              kvm_gpc_deactivate(kvm, gpc);
+               goto out;
+       }
+       do {
+-              ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, KVM_HOST_USES_PFN,
+-                                              gpa, PAGE_SIZE);
++              ret = kvm_gpc_activate(kvm, gpc, NULL, KVM_HOST_USES_PFN, gpa,
++                                     PAGE_SIZE);
+               if (ret)
+                       goto out;
+@@ -554,15 +554,15 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcp
+                            offsetof(struct compat_vcpu_info, time));
+               if (data->u.gpa == GPA_INVALID) {
+-                      kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
++                      kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
+                       r = 0;
+                       break;
+               }
+-              r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
+-                                            &vcpu->arch.xen.vcpu_info_cache,
+-                                            NULL, KVM_HOST_USES_PFN, data->u.gpa,
+-                                            sizeof(struct vcpu_info));
++              r = kvm_gpc_activate(vcpu->kvm,
++                                   &vcpu->arch.xen.vcpu_info_cache, NULL,
++                                   KVM_HOST_USES_PFN, data->u.gpa,
++                                   sizeof(struct vcpu_info));
+               if (!r)
+                       kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+@@ -570,16 +570,16 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcp
+       case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
+               if (data->u.gpa == GPA_INVALID) {
+-                      kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+-                                                   &vcpu->arch.xen.vcpu_time_info_cache);
++                      kvm_gpc_deactivate(vcpu->kvm,
++                                         &vcpu->arch.xen.vcpu_time_info_cache);
+                       r = 0;
+                       break;
+               }
+-              r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
+-                                            &vcpu->arch.xen.vcpu_time_info_cache,
+-                                            NULL, KVM_HOST_USES_PFN, data->u.gpa,
+-                                            sizeof(struct pvclock_vcpu_time_info));
++              r = kvm_gpc_activate(vcpu->kvm,
++                                   &vcpu->arch.xen.vcpu_time_info_cache,
++                                   NULL, KVM_HOST_USES_PFN, data->u.gpa,
++                                   sizeof(struct pvclock_vcpu_time_info));
+               if (!r)
+                       kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+               break;
+@@ -590,16 +590,15 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcp
+                       break;
+               }
+               if (data->u.gpa == GPA_INVALID) {
+-                      kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+-                                                   &vcpu->arch.xen.runstate_cache);
++                      kvm_gpc_deactivate(vcpu->kvm,
++                                         &vcpu->arch.xen.runstate_cache);
+                       r = 0;
+                       break;
+               }
+-              r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
+-                                            &vcpu->arch.xen.runstate_cache,
+-                                            NULL, KVM_HOST_USES_PFN, data->u.gpa,
+-                                            sizeof(struct vcpu_runstate_info));
++              r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
++                                   NULL, KVM_HOST_USES_PFN, data->u.gpa,
++                                   sizeof(struct vcpu_runstate_info));
+               break;
+       case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
+@@ -1817,7 +1816,12 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *
+ {
+       vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx;
+       vcpu->arch.xen.poll_evtchn = 0;
++
+       timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
++
++      kvm_gpc_init(&vcpu->arch.xen.runstate_cache);
++      kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache);
++      kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache);
+ }
+ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
+@@ -1825,18 +1829,17 @@ void kvm_xen_destroy_vcpu(struct kvm_vcp
+       if (kvm_xen_timer_enabled(vcpu))
+               kvm_xen_stop_timer(vcpu);
+-      kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+-                                   &vcpu->arch.xen.runstate_cache);
+-      kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+-                                   &vcpu->arch.xen.vcpu_info_cache);
+-      kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+-                                   &vcpu->arch.xen.vcpu_time_info_cache);
++      kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache);
++      kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
++      kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache);
++
+       del_timer_sync(&vcpu->arch.xen.poll_timer);
+ }
+ void kvm_xen_init_vm(struct kvm *kvm)
+ {
+       idr_init(&kvm->arch.xen.evtchn_ports);
++      kvm_gpc_init(&kvm->arch.xen.shinfo_cache);
+ }
+ void kvm_xen_destroy_vm(struct kvm *kvm)
+@@ -1844,7 +1847,7 @@ void kvm_xen_destroy_vm(struct kvm *kvm)
+       struct evtchnfd *evtchnfd;
+       int i;
+-      kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache);
++      kvm_gpc_deactivate(kvm, &kvm->arch.xen.shinfo_cache);
+       idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
+               if (!evtchnfd->deliver.port.port)
+--- a/include/linux/kvm_host.h
++++ b/include/linux/kvm_host.h
+@@ -1241,8 +1241,18 @@ int kvm_vcpu_write_guest(struct kvm_vcpu
+ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
+ /**
+- * kvm_gfn_to_pfn_cache_init - prepare a cached kernel mapping and HPA for a
+- *                             given guest physical address.
++ * kvm_gpc_init - initialize gfn_to_pfn_cache.
++ *
++ * @gpc:         struct gfn_to_pfn_cache object.
++ *
++ * This sets up a gfn_to_pfn_cache by initializing locks.  Note, the cache must
++ * be zero-allocated (or zeroed by the caller before init).
++ */
++void kvm_gpc_init(struct gfn_to_pfn_cache *gpc);
++
++/**
++ * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest
++ *                    physical address.
+  *
+  * @kvm:         pointer to kvm instance.
+  * @gpc:         struct gfn_to_pfn_cache object.
+@@ -1266,9 +1276,9 @@ void kvm_vcpu_mark_page_dirty(struct kvm
+  * kvm_gfn_to_pfn_cache_check() to ensure that the cache is valid before
+  * accessing the target page.
+  */
+-int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+-                            struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
+-                            gpa_t gpa, unsigned long len);
++int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
++                   struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
++                   gpa_t gpa, unsigned long len);
+ /**
+  * kvm_gfn_to_pfn_cache_check - check validity of a gfn_to_pfn_cache.
+@@ -1325,7 +1335,7 @@ int kvm_gfn_to_pfn_cache_refresh(struct
+ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
+ /**
+- * kvm_gfn_to_pfn_cache_destroy - destroy and unlink a gfn_to_pfn_cache.
++ * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache.
+  *
+  * @kvm:         pointer to kvm instance.
+  * @gpc:         struct gfn_to_pfn_cache object.
+@@ -1333,7 +1343,7 @@ void kvm_gfn_to_pfn_cache_unmap(struct k
+  * This removes a cache from the @kvm's list to be processed on MMU notifier
+  * invocation.
+  */
+-void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
++void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
+ void kvm_sigset_activate(struct kvm_vcpu *vcpu);
+ void kvm_sigset_deactivate(struct kvm_vcpu *vcpu);
+--- a/virt/kvm/pfncache.c
++++ b/virt/kvm/pfncache.c
+@@ -346,17 +346,20 @@ void kvm_gfn_to_pfn_cache_unmap(struct k
+ }
+ EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_unmap);
++void kvm_gpc_init(struct gfn_to_pfn_cache *gpc)
++{
++      rwlock_init(&gpc->lock);
++      mutex_init(&gpc->refresh_lock);
++}
++EXPORT_SYMBOL_GPL(kvm_gpc_init);
+-int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+-                            struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
+-                            gpa_t gpa, unsigned long len)
++int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
++                   struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
++                   gpa_t gpa, unsigned long len)
+ {
+       WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage);
+       if (!gpc->active) {
+-              rwlock_init(&gpc->lock);
+-              mutex_init(&gpc->refresh_lock);
+-
+               gpc->khva = NULL;
+               gpc->pfn = KVM_PFN_ERR_FAULT;
+               gpc->uhva = KVM_HVA_ERR_BAD;
+@@ -371,9 +374,9 @@ int kvm_gfn_to_pfn_cache_init(struct kvm
+       }
+       return kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpa, len);
+ }
+-EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_init);
++EXPORT_SYMBOL_GPL(kvm_gpc_activate);
+-void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
++void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
+ {
+       if (gpc->active) {
+               spin_lock(&kvm->gpc_lock);
+@@ -384,4 +387,4 @@ void kvm_gfn_to_pfn_cache_destroy(struct
+               gpc->active = false;
+       }
+ }
+-EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_destroy);
++EXPORT_SYMBOL_GPL(kvm_gpc_deactivate);
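
After this change, one-time lock setup is separated from (re)activation; a
minimal sketch of the intended lifecycle, using the pv_time cache from the
patch (error handling omitted):

    /* VM/vCPU creation: initialize locks once; the cache starts inactive. */
    kvm_gpc_init(&vcpu->arch.pv_time);

    /* Guest programs a GPA: map it (safe to call repeatedly). */
    r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
                         KVM_HOST_USES_PFN, system_time & ~1ULL,
                         sizeof(struct pvclock_vcpu_time_info));

    /* Guest clears the GPA, or teardown: unmap. The locks survive, so a
     * concurrent reader can never observe rwlock_init() racing with
     * read_lock_irqsave() as in the trace above. */
    kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);
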
diff --git a/queue-6.0/kvm-reject-attempts-to-consume-or-refresh-inactive-gfn_to_pfn_cache.patch b/queue-6.0/kvm-reject-attempts-to-consume-or-refresh-inactive-gfn_to_pfn_cache.patch
new file mode 100644 (file)
index 0000000..4959d46
--- /dev/null
@@ -0,0 +1,197 @@
+From ecbcf030b45666ad11bc98565e71dfbcb7be4393 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Thu, 13 Oct 2022 21:12:20 +0000
+Subject: KVM: Reject attempts to consume or refresh inactive gfn_to_pfn_cache
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit ecbcf030b45666ad11bc98565e71dfbcb7be4393 upstream.
+
+Reject kvm_gpc_check() and kvm_gpc_refresh() if the cache is inactive.
+Not checking the active flag during refresh is particularly egregious, as
+KVM can end up with a valid, inactive cache, which can lead to a variety
+of use-after-free bugs, e.g. consuming a NULL kernel pointer or missing
+an mmu_notifier invalidation due to the cache not being on the list of
+gfns to invalidate.
+
+Note, "active" needs to be set if and only if the cache is on the list
+of caches, i.e. is reachable via mmu_notifier events.  If a relevant
+mmu_notifier event occurs while the cache is "active" but not on the
+list, KVM will not acquire the cache's lock and so will not serailize
+the mmu_notifier event with active users and/or kvm_gpc_refresh().
+
+A race between KVM_XEN_ATTR_TYPE_SHARED_INFO and KVM_XEN_HVM_EVTCHN_SEND
+can be exploited to trigger the bug.
+
+1. Deactivate shinfo cache:
+
+kvm_xen_hvm_set_attr
+case KVM_XEN_ATTR_TYPE_SHARED_INFO
+ kvm_gpc_deactivate
+  kvm_gpc_unmap
+   gpc->valid = false
+   gpc->khva = NULL
+  gpc->active = false
+
+Result: active = false, valid = false
+
+2. Cause cache refresh:
+
+kvm_arch_vm_ioctl
+case KVM_XEN_HVM_EVTCHN_SEND
+ kvm_xen_hvm_evtchn_send
+  kvm_xen_set_evtchn
+   kvm_xen_set_evtchn_fast
+    kvm_gpc_check
+    return -EWOULDBLOCK because !gpc->valid
+   kvm_xen_set_evtchn_fast
+    return -EWOULDBLOCK
+   kvm_gpc_refresh
+    hva_to_pfn_retry
+     gpc->valid = true
+     gpc->khva = not NULL
+
+Result: active = false, valid = true
+
+3. Race ioctl KVM_XEN_HVM_EVTCHN_SEND against ioctl
+KVM_XEN_ATTR_TYPE_SHARED_INFO:
+
+kvm_arch_vm_ioctl
+case KVM_XEN_HVM_EVTCHN_SEND
+ kvm_xen_hvm_evtchn_send
+  kvm_xen_set_evtchn
+   kvm_xen_set_evtchn_fast
+    read_lock gpc->lock
+                                          kvm_xen_hvm_set_attr case
+                                          KVM_XEN_ATTR_TYPE_SHARED_INFO
+                                           mutex_lock kvm->lock
+                                           kvm_xen_shared_info_init
+                                            kvm_gpc_activate
+                                             gpc->khva = NULL
+    kvm_gpc_check
+     [ Check passes because gpc->valid is
+       still true, even though gpc->khva
+       is already NULL. ]
+    shinfo = gpc->khva
+    pending_bits = shinfo->evtchn_pending
+    CRASH: test_and_set_bit(..., pending_bits)
+
+Fixes: 982ed0de4753 ("KVM: Reinstate gfn_to_pfn_cache with invalidation support")
+Cc: stable@vger.kernel.org
+Reported-by: Michal Luczaj <mhal@rbox.co>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20221013211234.1318131-3-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ virt/kvm/pfncache.c |   41 ++++++++++++++++++++++++++++++++++-------
+ 1 file changed, 34 insertions(+), 7 deletions(-)
+
+--- a/virt/kvm/pfncache.c
++++ b/virt/kvm/pfncache.c
+@@ -81,6 +81,9 @@ bool kvm_gfn_to_pfn_cache_check(struct k
+ {
+       struct kvm_memslots *slots = kvm_memslots(kvm);
++      if (!gpc->active)
++              return false;
++
+       if ((gpa & ~PAGE_MASK) + len > PAGE_SIZE)
+               return false;
+@@ -240,10 +243,11 @@ int kvm_gfn_to_pfn_cache_refresh(struct
+ {
+       struct kvm_memslots *slots = kvm_memslots(kvm);
+       unsigned long page_offset = gpa & ~PAGE_MASK;
+-      kvm_pfn_t old_pfn, new_pfn;
++      bool unmap_old = false;
+       unsigned long old_uhva;
++      kvm_pfn_t old_pfn;
+       void *old_khva;
+-      int ret = 0;
++      int ret;
+       /*
+        * If must fit within a single page. The 'len' argument is
+@@ -261,6 +265,11 @@ int kvm_gfn_to_pfn_cache_refresh(struct
+       write_lock_irq(&gpc->lock);
++      if (!gpc->active) {
++              ret = -EINVAL;
++              goto out_unlock;
++      }
++
+       old_pfn = gpc->pfn;
+       old_khva = gpc->khva - offset_in_page(gpc->khva);
+       old_uhva = gpc->uhva;
+@@ -291,6 +300,7 @@ int kvm_gfn_to_pfn_cache_refresh(struct
+               /* If the HVA→PFN mapping was already valid, don't unmap it. */
+               old_pfn = KVM_PFN_ERR_FAULT;
+               old_khva = NULL;
++              ret = 0;
+       }
+  out:
+@@ -305,14 +315,15 @@ int kvm_gfn_to_pfn_cache_refresh(struct
+               gpc->khva = NULL;
+       }
+-      /* Snapshot the new pfn before dropping the lock! */
+-      new_pfn = gpc->pfn;
++      /* Detect a pfn change before dropping the lock! */
++      unmap_old = (old_pfn != gpc->pfn);
++out_unlock:
+       write_unlock_irq(&gpc->lock);
+       mutex_unlock(&gpc->refresh_lock);
+-      if (old_pfn != new_pfn)
++      if (unmap_old)
+               gpc_unmap_khva(kvm, old_pfn, old_khva);
+       return ret;
+@@ -366,11 +377,19 @@ int kvm_gpc_activate(struct kvm *kvm, st
+               gpc->vcpu = vcpu;
+               gpc->usage = usage;
+               gpc->valid = false;
+-              gpc->active = true;
+               spin_lock(&kvm->gpc_lock);
+               list_add(&gpc->list, &kvm->gpc_list);
+               spin_unlock(&kvm->gpc_lock);
++
++              /*
++               * Activate the cache after adding it to the list, a concurrent
++               * refresh must not establish a mapping until the cache is
++               * reachable by mmu_notifier events.
++               */
++              write_lock_irq(&gpc->lock);
++              gpc->active = true;
++              write_unlock_irq(&gpc->lock);
+       }
+       return kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpa, len);
+ }
+@@ -379,12 +398,20 @@ EXPORT_SYMBOL_GPL(kvm_gpc_activate);
+ void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
+ {
+       if (gpc->active) {
++              /*
++               * Deactivate the cache before removing it from the list, KVM
++               * must stall mmu_notifier events until all users go away, i.e.
++               * until gpc->lock is dropped and refresh is guaranteed to fail.
++               */
++              write_lock_irq(&gpc->lock);
++              gpc->active = false;
++              write_unlock_irq(&gpc->lock);
++
+               spin_lock(&kvm->gpc_lock);
+               list_del(&gpc->list);
+               spin_unlock(&kvm->gpc_lock);
+               kvm_gfn_to_pfn_cache_unmap(kvm, gpc);
+-              gpc->active = false;
+       }
+ }
+ EXPORT_SYMBOL_GPL(kvm_gpc_deactivate);
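
For context, consumers follow roughly this check/refresh loop (a sketch
modeled on kvm_xen_set_evtchn_fast(), not verbatim kernel code). The new
active checks guarantee that once deactivation clears gpc->active under
gpc->lock, both steps fail instead of resurrecting the mapping:

    read_lock_irqsave(&gpc->lock, flags);
    while (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) {
            read_unlock_irqrestore(&gpc->lock, flags);
            /* Refresh now returns an error for an inactive cache. */
            if (kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE))
                    return -EWOULDBLOCK;
            read_lock_irqsave(&gpc->lock, flags);
    }
    /* ... use gpc->khva while gpc->lock is held ... */
    read_unlock_irqrestore(&gpc->lock, flags);
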
diff --git a/queue-6.0/kvm-vmx-advertise-pmu-lbrs-if-and-only-if-perf-supports-lbrs.patch b/queue-6.0/kvm-vmx-advertise-pmu-lbrs-if-and-only-if-perf-supports-lbrs.patch
new file mode 100644 (file)
index 0000000..1b626d8
--- /dev/null
@@ -0,0 +1,45 @@
+From 145dfad998eac74abc59219d936e905766ba2d98 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Thu, 6 Oct 2022 00:03:08 +0000
+Subject: KVM: VMX: Advertise PMU LBRs if and only if perf supports LBRs
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 145dfad998eac74abc59219d936e905766ba2d98 upstream.
+
+Advertise LBR support to userspace via MSR_IA32_PERF_CAPABILITIES if and
+only if perf fully supports LBRs.  Perf may disable LBRs (by zeroing the
+number of LBRs) even on platforms that allegedly support LBRs, e.g. if
+probing any LBR MSRs during setup fails.
+
+Fixes: be635e34c284 ("KVM: vmx/pmu: Expose LBR_FMT in the MSR_IA32_PERF_CAPABILITIES")
+Reported-by: Like Xu <like.xu.linux@gmail.com>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20221006000314.73240-3-seanjc@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx/capabilities.h |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/vmx/capabilities.h
++++ b/arch/x86/kvm/vmx/capabilities.h
+@@ -404,6 +404,7 @@ static inline bool vmx_pebs_supported(vo
+ static inline u64 vmx_get_perf_capabilities(void)
+ {
+       u64 perf_cap = PMU_CAP_FW_WRITES;
++      struct x86_pmu_lbr lbr;
+       u64 host_perf_cap = 0;
+       if (!enable_pmu)
+@@ -412,7 +413,8 @@ static inline u64 vmx_get_perf_capabilit
+       if (boot_cpu_has(X86_FEATURE_PDCM))
+               rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
+-      perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
++      if (x86_perf_get_lbr(&lbr) >= 0 && lbr.nr)
++              perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
+       if (vmx_pebs_supported()) {
+               perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
diff --git a/queue-6.0/kvm-vmx-fold-vmx_supported_debugctl-into-vcpu_supported_debugctl.patch b/queue-6.0/kvm-vmx-fold-vmx_supported_debugctl-into-vcpu_supported_debugctl.patch
new file mode 100644 (file)
index 0000000..3ef0f55
--- /dev/null
@@ -0,0 +1,85 @@
+From 18e897d213cb152c786abab14919196bd9dc3a9f Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Thu, 6 Oct 2022 00:03:09 +0000
+Subject: KVM: VMX: Fold vmx_supported_debugctl() into vcpu_supported_debugctl()
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 18e897d213cb152c786abab14919196bd9dc3a9f upstream.
+
+Fold vmx_supported_debugctl() into vcpu_supported_debugctl(), its only
+caller.  Setting bits only to clear them a few instructions later is
+rather silly, and splitting the logic makes things seem more complicated
+than they actually are.
+
+Opportunistically drop DEBUGCTLMSR_LBR_MASK now that there's a single
+reference to the pair of bits.  The extra layer of indirection provides
+no meaningful value and makes it unnecessarily tedious to understand
+what KVM is doing.
+
+No functional change.
+
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20221006000314.73240-4-seanjc@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx/capabilities.h |   15 ---------------
+ arch/x86/kvm/vmx/vmx.c          |   12 +++++++-----
+ 2 files changed, 7 insertions(+), 20 deletions(-)
+
+--- a/arch/x86/kvm/vmx/capabilities.h
++++ b/arch/x86/kvm/vmx/capabilities.h
+@@ -24,8 +24,6 @@ extern int __read_mostly pt_mode;
+ #define PMU_CAP_FW_WRITES     (1ULL << 13)
+ #define PMU_CAP_LBR_FMT               0x3f
+-#define DEBUGCTLMSR_LBR_MASK          (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI)
+-
+ struct nested_vmx_msrs {
+       /*
+        * We only store the "true" versions of the VMX capability MSRs. We
+@@ -425,19 +423,6 @@ static inline u64 vmx_get_perf_capabilit
+       return perf_cap;
+ }
+-static inline u64 vmx_supported_debugctl(void)
+-{
+-      u64 debugctl = 0;
+-
+-      if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+-              debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+-
+-      if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)
+-              debugctl |= DEBUGCTLMSR_LBR_MASK;
+-
+-      return debugctl;
+-}
+-
+ static inline bool cpu_has_notify_vmexit(void)
+ {
+       return vmcs_config.cpu_based_2nd_exec_ctrl &
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2018,13 +2018,15 @@ static u64 nested_vmx_truncate_sysenter_
+ static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
+ {
+-      u64 debugctl = vmx_supported_debugctl();
++      u64 debugctl = 0;
+-      if (!intel_pmu_lbr_is_enabled(vcpu))
+-              debugctl &= ~DEBUGCTLMSR_LBR_MASK;
++      if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
++          guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
++              debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+-      if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
+-              debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
++      if ((vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) &&
++          intel_pmu_lbr_is_enabled(vcpu))
++              debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+       return debugctl;
+ }
diff --git a/queue-6.0/kvm-vmx-fully-disable-sgx-if-secondary_exec_encls_exiting-unavailable.patch b/queue-6.0/kvm-vmx-fully-disable-sgx-if-secondary_exec_encls_exiting-unavailable.patch
new file mode 100644 (file)
index 0000000..eb068a6
--- /dev/null
@@ -0,0 +1,58 @@
+From 1c1a41497ab879ac9608f3047f230af833eeef3d Mon Sep 17 00:00:00 2001
+From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
+Date: Tue, 25 Oct 2022 08:37:49 -0400
+Subject: KVM: VMX: fully disable SGX if SECONDARY_EXEC_ENCLS_EXITING unavailable
+
+From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
+
+commit 1c1a41497ab879ac9608f3047f230af833eeef3d upstream.
+
+Clear enable_sgx if ENCLS-exiting is not supported, i.e. if SGX cannot be
+virtualized.  When KVM is loaded, adjust_vmx_controls checks that the
+bit is available before enabling the feature; however, other parts of the
+code check enable_sgx and not clearing the variable caused two different
+bugs, mostly affecting nested virtualization scenarios.
+
+First, because enable_sgx remained true, SECONDARY_EXEC_ENCLS_EXITING
+would be marked available in the capability MSRs that are accessed by a
+nested hypervisor.  KVM would then propagate the control from vmcs12
+to vmcs02 even if it isn't supported by the processor, thus causing an
+unexpected VM-Fail (exit code 0x7) in L1.
+
+Second, vmx_set_cpu_caps() would not clear the SGX bits when hardware
+support is unavailable.  This is a much less problematic bug as it only
+happens if SGX is soft-disabled (available in the processor but hidden
+in CPUID) or if SGX is supported for bare metal but not in the VMCS
+(will never happen when running on bare metal, but can theoretically
+happen when running in a VM).
+
+Last but not least, this ensures that module params in sysfs reflect
+KVM's actual configuration.
+
+RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=2127128
+Fixes: 72add915fbd5 ("KVM: VMX: Enable SGX virtualization for SGX1, SGX2 and LC")
+Cc: stable@vger.kernel.org
+Suggested-by: Sean Christopherson <seanjc@google.com>
+Suggested-by: Bandan Das <bsd@redhat.com>
+Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
+Message-Id: <20221025123749.2201649-1-eesposit@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx/vmx.c |    5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -8281,6 +8281,11 @@ static __init int hardware_setup(void)
+       if (!cpu_has_virtual_nmis())
+               enable_vnmi = 0;
++#ifdef CONFIG_X86_SGX_KVM
++      if (!cpu_has_vmx_encls_vmexit())
++              enable_sgx = false;
++#endif
++
+       /*
+        * set_apic_access_page_addr() is used to reload apic access
+        * page upon invalidation.  No need to do anything if not
diff --git a/queue-6.0/kvm-vmx-ignore-guest-cpuid-for-host-userspace-writes-to-debugctl.patch b/queue-6.0/kvm-vmx-ignore-guest-cpuid-for-host-userspace-writes-to-debugctl.patch
new file mode 100644 (file)
index 0000000..bc80aa5
--- /dev/null
@@ -0,0 +1,68 @@
+From b333b8ebb85d62469f32b52fa03fd7d1522afc03 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Thu, 6 Oct 2022 00:03:10 +0000
+Subject: KVM: VMX: Ignore guest CPUID for host userspace writes to DEBUGCTL
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit b333b8ebb85d62469f32b52fa03fd7d1522afc03 upstream.
+
+Ignore guest CPUID for host userspace writes to the DEBUGCTL MSR; KVM's
+ABI is that setting CPUID vs. state can be done in any order, i.e. KVM
+allows userspace to stuff MSRs prior to setting the guest's CPUID that
+makes the new MSR "legal".
+
+Keep the vmx_get_perf_capabilities() check for guest writes, even though
+it's technically unnecessary since the vCPU's PERF_CAPABILITIES is
+consulted when refreshing LBR support.  A future patch will clean up
+vmx_get_perf_capabilities() to avoid the RDMSR on every call, at which
+point the paranoia will incur no meaningful overhead.
+
+Note, prior to vmx_get_perf_capabilities() checking that the host fully
+supports LBRs via x86_perf_get_lbr(), KVM effectively relied on
+intel_pmu_lbr_is_enabled() to guard against host userspace enabling LBRs
+on platforms without full support.
+
+Fixes: c646236344e9 ("KVM: vmx/pmu: Add PMU_CAP_LBR_FMT check when guest LBR is enabled")
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20221006000314.73240-5-seanjc@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx/vmx.c |   10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2016,16 +2016,16 @@ static u64 nested_vmx_truncate_sysenter_
+       return (unsigned long)data;
+ }
+-static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
++static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
+ {
+       u64 debugctl = 0;
+       if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
+-          guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
++          (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
+               debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+       if ((vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) &&
+-          intel_pmu_lbr_is_enabled(vcpu))
++          (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
+               debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+       return debugctl;
+@@ -2100,7 +2100,9 @@ static int vmx_set_msr(struct kvm_vcpu *
+               vmcs_writel(GUEST_SYSENTER_ESP, data);
+               break;
+       case MSR_IA32_DEBUGCTLMSR: {
+-              u64 invalid = data & ~vcpu_supported_debugctl(vcpu);
++              u64 invalid;
++
++              invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+               if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
+                       if (report_ignored_msrs)
+                               vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
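The host_initiated distinction exists because of the restore ordering KVM's
userspace API permits. A hedged sketch of that ordering from a VMM's point
of view (restore_vcpu and the saved state are illustrative; error handling
elided):

  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  #define MSR_IA32_DEBUGCTLMSR        0x1d9
  #define DEBUGCTLMSR_BUS_LOCK_DETECT (1ULL << 2)

  void restore_vcpu(int vcpu_fd, struct kvm_cpuid2 *saved_cpuid)
  {
          struct {
                  struct kvm_msrs hdr;
                  struct kvm_msr_entry entry;
          } msrs;

          memset(&msrs, 0, sizeof(msrs));
          msrs.hdr.nmsrs = 1;
          msrs.entry.index = MSR_IA32_DEBUGCTLMSR;
          msrs.entry.data = DEBUGCTLMSR_BUS_LOCK_DETECT;

          /* The MSR lands first and is judged against host capabilities... */
          ioctl(vcpu_fd, KVM_SET_MSRS, &msrs);

          /* ...the CPUID that makes the bit "legal" for the guest comes later. */
          ioctl(vcpu_fd, KVM_SET_CPUID2, saved_cpuid);
  }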
diff --git a/queue-6.0/kvm-x86-emulator-em_sysexit-should-update-ctxt-mode.patch b/queue-6.0/kvm-x86-emulator-em_sysexit-should-update-ctxt-mode.patch
new file mode 100644 (file)
index 0000000..a137cf2
--- /dev/null
@@ -0,0 +1,36 @@
+From 5015bb89b58225f97df6ac44383e7e8c8662c8c9 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Tue, 25 Oct 2022 15:47:28 +0300
+Subject: KVM: x86: emulator: em_sysexit should update ctxt->mode
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit 5015bb89b58225f97df6ac44383e7e8c8662c8c9 upstream.
+
+SYSEXIT is one of the instructions that can change the
+processor mode, thus ctxt->mode should be updated after it.
+
+Note that this is likely a benign bug, because the only problematic
+mode change is from 32-bit to 64-bit mode, which can lead to truncation
+of RIP; that transition is not possible with SYSEXIT, since SYSEXIT
+executed in 32-bit mode is limited to the 32-bit version.
+
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221025124741.228045-11-mlevitsk@redhat.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/emulate.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -2874,6 +2874,7 @@ static int em_sysexit(struct x86_emulate
+       ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
+       ctxt->_eip = rdx;
++      ctxt->mode = usermode;
+       *reg_write(ctxt, VCPU_REGS_RSP) = rcx;
+       return X86EMUL_CONTINUE;
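The truncation mentioned above happens when RIP is written back under a
stale address width; a toy illustration with made-up values:

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint64_t rip = 0x100001234ULL;        /* reachable only in 64-bit mode */
          uint64_t stale = rip & 0xffffffffULL; /* writeback under a stale 32-bit mode */

          printf("correct RIP %#llx, truncated RIP %#llx\n",
                 (unsigned long long)rip, (unsigned long long)stale);
          return 0;
  }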
diff --git a/queue-6.0/kvm-x86-emulator-introduce-emulator_recalc_and_set_mode.patch b/queue-6.0/kvm-x86-emulator-introduce-emulator_recalc_and_set_mode.patch
new file mode 100644 (file)
index 0000000..608b5a2
--- /dev/null
@@ -0,0 +1,163 @@
+From d087e0f79fa0dd336a9a6b2f79ec23120f5eff73 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Tue, 25 Oct 2022 15:47:29 +0300
+Subject: KVM: x86: emulator: introduce emulator_recalc_and_set_mode
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit d087e0f79fa0dd336a9a6b2f79ec23120f5eff73 upstream.
+
+Some instructions update the CPU execution mode, which in turn requires
+updating the emulation mode.
+
+Extract this code, and make assign_eip_far use it.
+
+assign_eip_far now reads CS, instead of getting it via a parameter,
+which is OK because callers always assign CS to the same value
+before calling this function.
+
+No functional change is intended.
+
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221025124741.228045-12-mlevitsk@redhat.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/emulate.c |   85 ++++++++++++++++++++++++++++++++-----------------
+ 1 file changed, 57 insertions(+), 28 deletions(-)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -791,8 +791,7 @@ static int linearize(struct x86_emulate_
+                          ctxt->mode, linear);
+ }
+-static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst,
+-                           enum x86emul_mode mode)
++static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst)
+ {
+       ulong linear;
+       int rc;
+@@ -802,41 +801,71 @@ static inline int assign_eip(struct x86_
+       if (ctxt->op_bytes != sizeof(unsigned long))
+               addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1);
+-      rc = __linearize(ctxt, addr, &max_size, 1, false, true, mode, &linear);
++      rc = __linearize(ctxt, addr, &max_size, 1, false, true, ctxt->mode, &linear);
+       if (rc == X86EMUL_CONTINUE)
+               ctxt->_eip = addr.ea;
+       return rc;
+ }
++static inline int emulator_recalc_and_set_mode(struct x86_emulate_ctxt *ctxt)
++{
++      u64 efer;
++      struct desc_struct cs;
++      u16 selector;
++      u32 base3;
++
++      ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
++
++      if (!(ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PE)) {
++              /* Real mode. cpu must not have long mode active */
++              if (efer & EFER_LMA)
++                      return X86EMUL_UNHANDLEABLE;
++              ctxt->mode = X86EMUL_MODE_REAL;
++              return X86EMUL_CONTINUE;
++      }
++
++      if (ctxt->eflags & X86_EFLAGS_VM) {
++              /* Protected/VM86 mode. cpu must not have long mode active */
++              if (efer & EFER_LMA)
++                      return X86EMUL_UNHANDLEABLE;
++              ctxt->mode = X86EMUL_MODE_VM86;
++              return X86EMUL_CONTINUE;
++      }
++
++      if (!ctxt->ops->get_segment(ctxt, &selector, &cs, &base3, VCPU_SREG_CS))
++              return X86EMUL_UNHANDLEABLE;
++
++      if (efer & EFER_LMA) {
++              if (cs.l) {
++                      /* Proper long mode */
++                      ctxt->mode = X86EMUL_MODE_PROT64;
++              } else if (cs.d) {
++                      /* 32 bit compatibility mode */
++                      ctxt->mode = X86EMUL_MODE_PROT32;
++              } else {
++                      ctxt->mode = X86EMUL_MODE_PROT16;
++              }
++      } else {
++              /* Legacy 32 bit / 16 bit mode */
++              ctxt->mode = cs.d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
++      }
++
++      return X86EMUL_CONTINUE;
++}
++
+ static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst)
+ {
+-      return assign_eip(ctxt, dst, ctxt->mode);
++      return assign_eip(ctxt, dst);
+ }
+-static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
+-                        const struct desc_struct *cs_desc)
++static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst)
+ {
+-      enum x86emul_mode mode = ctxt->mode;
+-      int rc;
++      int rc = emulator_recalc_and_set_mode(ctxt);
+-#ifdef CONFIG_X86_64
+-      if (ctxt->mode >= X86EMUL_MODE_PROT16) {
+-              if (cs_desc->l) {
+-                      u64 efer = 0;
++      if (rc != X86EMUL_CONTINUE)
++              return rc;
+-                      ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+-                      if (efer & EFER_LMA)
+-                              mode = X86EMUL_MODE_PROT64;
+-              } else
+-                      mode = X86EMUL_MODE_PROT32; /* temporary value */
+-      }
+-#endif
+-      if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32)
+-              mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+-      rc = assign_eip(ctxt, dst, mode);
+-      if (rc == X86EMUL_CONTINUE)
+-              ctxt->mode = mode;
+-      return rc;
++      return assign_eip(ctxt, dst);
+ }
+ static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
+@@ -2170,7 +2199,7 @@ static int em_jmp_far(struct x86_emulate
+       if (rc != X86EMUL_CONTINUE)
+               return rc;
+-      rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
++      rc = assign_eip_far(ctxt, ctxt->src.val);
+       /* Error handling is not implemented. */
+       if (rc != X86EMUL_CONTINUE)
+               return X86EMUL_UNHANDLEABLE;
+@@ -2248,7 +2277,7 @@ static int em_ret_far(struct x86_emulate
+                                      &new_desc);
+       if (rc != X86EMUL_CONTINUE)
+               return rc;
+-      rc = assign_eip_far(ctxt, eip, &new_desc);
++      rc = assign_eip_far(ctxt, eip);
+       /* Error handling is not implemented. */
+       if (rc != X86EMUL_CONTINUE)
+               return X86EMUL_UNHANDLEABLE;
+@@ -3468,7 +3497,7 @@ static int em_call_far(struct x86_emulat
+       if (rc != X86EMUL_CONTINUE)
+               return rc;
+-      rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
++      rc = assign_eip_far(ctxt, ctxt->src.val);
+       if (rc != X86EMUL_CONTINUE)
+               goto fail;
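For reference, the decision tree that emulator_recalc_and_set_mode()
implements can be restated compactly. A hedged sketch with illustrative
names, dropping the patch's EFER.LMA consistency checks:

  #include <stdio.h>

  enum x86_mode { REAL, VM86, PROT16, PROT32, PROT64 };

  static enum x86_mode derive_mode(int cr0_pe, int eflags_vm, int efer_lma,
                                   int cs_l, int cs_d)
  {
          if (!cr0_pe)
                  return REAL;                  /* CR0.PE=0: real mode */
          if (eflags_vm)
                  return VM86;                  /* EFLAGS.VM=1: virtual-8086 */
          if (efer_lma)                         /* long mode active */
                  return cs_l ? PROT64          /* 64-bit code segment */
                              : (cs_d ? PROT32 : PROT16);
          return cs_d ? PROT32 : PROT16;        /* legacy protected mode */
  }

  int main(void)
  {
          /* PE=1, VM=0, LMA=1, CS.L=1: proper long mode */
          printf("%s\n", derive_mode(1, 0, 1, 1, 0) == PROT64 ? "PROT64" : "?");
          return 0;
  }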
diff --git a/queue-6.0/kvm-x86-emulator-update-the-emulation-mode-after-cr0-write.patch b/queue-6.0/kvm-x86-emulator-update-the-emulation-mode-after-cr0-write.patch
new file mode 100644 (file)
index 0000000..abde3aa
--- /dev/null
@@ -0,0 +1,55 @@
+From ad8f9e69942c7db90758d9d774157e53bce94840 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Tue, 25 Oct 2022 15:47:31 +0300
+Subject: KVM: x86: emulator: update the emulation mode after CR0 write
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit ad8f9e69942c7db90758d9d774157e53bce94840 upstream.
+
+Update the emulation mode when handling writes to CR0, because
+toggling CR0.PE switches between Real and Protected Mode, and toggling
+CR0.PG when EFER.LME=1 switches between Long and Protected Mode.
+
+This is likely a benign bug because there is no writeback of state,
+other than the RIP increment, and when toggling CR0.PE, the CPU has
+to execute code from a very low memory address.
+
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221025124741.228045-14-mlevitsk@redhat.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/emulate.c |   16 +++++++++++++++-
+ 1 file changed, 15 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -3639,11 +3639,25 @@ static int em_movbe(struct x86_emulate_c
+ static int em_cr_write(struct x86_emulate_ctxt *ctxt)
+ {
+-      if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
++      int cr_num = ctxt->modrm_reg;
++      int r;
++
++      if (ctxt->ops->set_cr(ctxt, cr_num, ctxt->src.val))
+               return emulate_gp(ctxt, 0);
+       /* Disable writeback. */
+       ctxt->dst.type = OP_NONE;
++
++      if (cr_num == 0) {
++              /*
++               * CR0 write might have updated CR0.PE and/or CR0.PG
++               * which can affect the cpu's execution mode.
++               */
++              r = emulator_recalc_and_set_mode(ctxt);
++              if (r != X86EMUL_CONTINUE)
++                      return r;
++      }
++
+       return X86EMUL_CONTINUE;
+ }
diff --git a/queue-6.0/kvm-x86-emulator-update-the-emulation-mode-after-rsm.patch b/queue-6.0/kvm-x86-emulator-update-the-emulation-mode-after-rsm.patch
new file mode 100644 (file)
index 0000000..1f20993
--- /dev/null
@@ -0,0 +1,36 @@
+From 055f37f84e304e59c046d1accfd8f08462f52c4c Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Tue, 25 Oct 2022 15:47:30 +0300
+Subject: KVM: x86: emulator: update the emulation mode after rsm
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit 055f37f84e304e59c046d1accfd8f08462f52c4c upstream.
+
+Update the emulation mode after RSM so that RIP will be correctly
+written back, because the RSM instruction can switch the CPU mode from
+32 bit (or less) to 64 bit.
+
+This fixes a guest crash when an #SMI is received while the guest is
+running code from an address that does not fit in 32 bits.
+
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221025124741.228045-13-mlevitsk@redhat.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/emulate.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -2660,7 +2660,7 @@ static int em_rsm(struct x86_emulate_ctx
+        * those side effects need to be explicitly handled for both success
+        * and shutdown.
+        */
+-      return X86EMUL_CONTINUE;
++      return emulator_recalc_and_set_mode(ctxt);
+ emulate_shutdown:
+       ctxt->ops->triple_fault(ctxt);
diff --git a/queue-6.0/kvm-x86-mask-off-reserved-bits-in-cpuid.80000001h.patch b/queue-6.0/kvm-x86-mask-off-reserved-bits-in-cpuid.80000001h.patch
new file mode 100644 (file)
index 0000000..8ddf686
--- /dev/null
@@ -0,0 +1,32 @@
+From 0469e56a14bf8cfb80507e51b7aeec0332cdbc13 Mon Sep 17 00:00:00 2001
+From: Jim Mattson <jmattson@google.com>
+Date: Fri, 30 Sep 2022 00:51:58 +0200
+Subject: KVM: x86: Mask off reserved bits in CPUID.80000001H
+
+From: Jim Mattson <jmattson@google.com>
+
+commit 0469e56a14bf8cfb80507e51b7aeec0332cdbc13 upstream.
+
+KVM_GET_SUPPORTED_CPUID should only enumerate features that KVM
+actually supports. CPUID.80000001H:EBX[27:16] are reserved bits and
+should be masked off.
+
+Fixes: 0771671749b5 ("KVM: Enhance guest cpuid management")
+Signed-off-by: Jim Mattson <jmattson@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/cpuid.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -1117,6 +1117,7 @@ static inline int __do_cpuid_func(struct
+                       entry->eax = max(entry->eax, 0x80000021);
+               break;
+       case 0x80000001:
++              entry->ebx &= ~GENMASK(27, 16);
+               cpuid_entry_override(entry, CPUID_8000_0001_EDX);
+               cpuid_entry_override(entry, CPUID_8000_0001_ECX);
+               break;
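GENMASK(h, l) from <linux/bits.h> expands to a mask covering bits h down to
l inclusive, so the new line forces EBX[27:16] to zero. A small userspace
check with a simplified 32-bit stand-in for the macro (not the kernel's
definition):

  #include <assert.h>
  #include <stdint.h>

  /* Simplified 32-bit stand-in for the kernel's GENMASK(h, l). */
  #define GENMASK(h, l) ((~0u << (l)) & (~0u >> (31 - (h))))

  int main(void)
  {
          uint32_t ebx = 0xdeadbeef;

          assert(GENMASK(27, 16) == 0x0fff0000);
          ebx &= ~GENMASK(27, 16);      /* reserved bits 27:16 forced to zero */
          assert((ebx & 0x0fff0000) == 0);
          return 0;
  }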
diff --git a/queue-6.0/kvm-x86-mask-off-reserved-bits-in-cpuid.80000006h.patch b/queue-6.0/kvm-x86-mask-off-reserved-bits-in-cpuid.80000006h.patch
new file mode 100644 (file)
index 0000000..858eacd
--- /dev/null
@@ -0,0 +1,35 @@
+From eeb69eab57c6604ac90b3fd8e5ac43f24a5535b1 Mon Sep 17 00:00:00 2001
+From: Jim Mattson <jmattson@google.com>
+Date: Thu, 29 Sep 2022 15:51:59 -0700
+Subject: KVM: x86: Mask off reserved bits in CPUID.80000006H
+
+From: Jim Mattson <jmattson@google.com>
+
+commit eeb69eab57c6604ac90b3fd8e5ac43f24a5535b1 upstream.
+
+KVM_GET_SUPPORTED_CPUID should only enumerate features that KVM
+actually supports. CPUID.80000006H:EDX[17:16] are reserved bits and
+should be masked off.
+
+Fixes: 43d05de2bee7 ("KVM: pass through CPUID(0x80000006)")
+Signed-off-by: Jim Mattson <jmattson@google.com>
+Message-Id: <20220929225203.2234702-2-jmattson@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/cpuid.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -1121,7 +1121,8 @@ static inline int __do_cpuid_func(struct
+               cpuid_entry_override(entry, CPUID_8000_0001_ECX);
+               break;
+       case 0x80000006:
+-              /* L2 cache and TLB: pass through host info. */
++              /* Drop reserved bits, pass host L2 cache and TLB info. */
++              entry->edx &= ~GENMASK(17, 16);
+               break;
+       case 0x80000007: /* Advanced power management */
+               /* invariant TSC is CPUID.80000007H:EDX[8] */
diff --git a/queue-6.0/kvm-x86-mask-off-reserved-bits-in-cpuid.80000008h.patch b/queue-6.0/kvm-x86-mask-off-reserved-bits-in-cpuid.80000008h.patch
new file mode 100644 (file)
index 0000000..47e4a46
--- /dev/null
@@ -0,0 +1,38 @@
+From 7030d8530e533844e2f4b0e7476498afcd324634 Mon Sep 17 00:00:00 2001
+From: Jim Mattson <jmattson@google.com>
+Date: Thu, 29 Sep 2022 15:52:00 -0700
+Subject: KVM: x86: Mask off reserved bits in CPUID.80000008H
+
+From: Jim Mattson <jmattson@google.com>
+
+commit 7030d8530e533844e2f4b0e7476498afcd324634 upstream.
+
+KVM_GET_SUPPORTED_CPUID should only enumerate features that KVM
+actually supports. The following ranges of CPUID.80000008H are reserved
+and should be masked off:
+    ECX[31:18]
+    ECX[11:8]
+
+In addition, the PerfTscSize field at ECX[17:16] should also be zero
+because KVM does not set the PERFTSC bit at CPUID.80000001H.ECX[27].
+
+Fixes: 24c82e576b78 ("KVM: Sanitize cpuid")
+Signed-off-by: Jim Mattson <jmattson@google.com>
+Message-Id: <20220929225203.2234702-3-jmattson@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/cpuid.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -1152,6 +1152,7 @@ static inline int __do_cpuid_func(struct
+                       g_phys_as = phys_as;
+               entry->eax = g_phys_as | (virt_as << 8);
++              entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8));
+               entry->edx = 0;
+               cpuid_entry_override(entry, CPUID_8000_0008_EBX);
+               break;
diff --git a/queue-6.0/kvm-x86-mask-off-reserved-bits-in-cpuid.8000001ah.patch b/queue-6.0/kvm-x86-mask-off-reserved-bits-in-cpuid.8000001ah.patch
new file mode 100644 (file)
index 0000000..ea3b007
--- /dev/null
@@ -0,0 +1,35 @@
+From 079f6889818dd07903fb36c252532ab47ebb6d48 Mon Sep 17 00:00:00 2001
+From: Jim Mattson <jmattson@google.com>
+Date: Thu, 29 Sep 2022 15:52:01 -0700
+Subject: KVM: x86: Mask off reserved bits in CPUID.8000001AH
+
+From: Jim Mattson <jmattson@google.com>
+
+commit 079f6889818dd07903fb36c252532ab47ebb6d48 upstream.
+
+KVM_GET_SUPPORTED_CPUID should only enumerate features that KVM
+actually supports. In the case of CPUID.8000001AH, only three bits are
+currently defined. The 125 reserved bits should be masked off.
+
+Fixes: 24c82e576b78 ("KVM: Sanitize cpuid")
+Signed-off-by: Jim Mattson <jmattson@google.com>
+Message-Id: <20220929225203.2234702-4-jmattson@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/cpuid.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -1171,6 +1171,9 @@ static inline int __do_cpuid_func(struct
+               entry->ecx = entry->edx = 0;
+               break;
+       case 0x8000001a:
++              entry->eax &= GENMASK(2, 0);
++              entry->ebx = entry->ecx = entry->edx = 0;
++              break;
+       case 0x8000001e:
+               break;
+       case 0x8000001F:
diff --git a/queue-6.0/kvm-x86-mask-off-reserved-bits-in-cpuid.8000001fh.patch b/queue-6.0/kvm-x86-mask-off-reserved-bits-in-cpuid.8000001fh.patch
new file mode 100644 (file)
index 0000000..3227e3e
--- /dev/null
@@ -0,0 +1,36 @@
+From 86c4f0d547f6460d0426ebb3ba0614f1134b8cda Mon Sep 17 00:00:00 2001
+From: Jim Mattson <jmattson@google.com>
+Date: Thu, 29 Sep 2022 15:52:03 -0700
+Subject: KVM: x86: Mask off reserved bits in CPUID.8000001FH
+
+From: Jim Mattson <jmattson@google.com>
+
+commit 86c4f0d547f6460d0426ebb3ba0614f1134b8cda upstream.
+
+KVM_GET_SUPPORTED_CPUID should only enumerate features that KVM
+actually supports. CPUID.8000001FH:EBX[31:16] are reserved bits and
+should be masked off.
+
+Fixes: 8765d75329a3 ("KVM: X86: Extend CPUID range to include new leaf")
+Signed-off-by: Jim Mattson <jmattson@google.com>
+Message-Id: <20220929225203.2234702-6-jmattson@google.com>
+Cc: stable@vger.kernel.org
+[Clear NumVMPL too. - Paolo]
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/cpuid.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -1183,7 +1183,8 @@ static inline int __do_cpuid_func(struct
+                       entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+               } else {
+                       cpuid_entry_override(entry, CPUID_8000_001F_EAX);
+-
++                      /* Clear NumVMPL since KVM does not support VMPL.  */
++                      entry->ebx &= ~GENMASK(31, 12);
+                       /*
+                        * Enumerate '0' for "PA bits reduction", the adjusted
+                        * MAXPHYADDR is enumerated directly (see 0x80000008).
diff --git a/queue-6.0/kvm-x86-smm-number-of-gprs-in-the-smram-image-depends-on-the-image-format.patch b/queue-6.0/kvm-x86-smm-number-of-gprs-in-the-smram-image-depends-on-the-image-format.patch
new file mode 100644 (file)
index 0000000..23afdd7
--- /dev/null
@@ -0,0 +1,48 @@
+From 696db303e54f7352623d9f640e6c51d8fa9d5588 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Tue, 25 Oct 2022 15:47:32 +0300
+Subject: KVM: x86: smm: number of GPRs in the SMRAM image depends on the image format
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit 696db303e54f7352623d9f640e6c51d8fa9d5588 upstream.
+
+On a 64-bit host, if the guest doesn't have X86_FEATURE_LM, KVM will
+access 16 GPRs in the 32-bit SMRAM image, causing an out-of-bounds RAM
+access.
+
+On a 32-bit host, rsm_load_state_64/enter_smm_save_state_64 are
+compiled out, so the access overflow can't happen.
+
+Fixes: b443183a25ab61 ("KVM: x86: Reduce the number of emulator GPRs to '8' for 32-bit KVM")
+
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20221025124741.228045-15-mlevitsk@redhat.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/emulate.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -2430,7 +2430,7 @@ static int rsm_load_state_32(struct x86_
+       ctxt->eflags =             GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED;
+       ctxt->_eip =               GET_SMSTATE(u32, smstate, 0x7ff0);
+-      for (i = 0; i < NR_EMULATOR_GPRS; i++)
++      for (i = 0; i < 8; i++)
+               *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4);
+       val = GET_SMSTATE(u32, smstate, 0x7fcc);
+@@ -2487,7 +2487,7 @@ static int rsm_load_state_64(struct x86_
+       u16 selector;
+       int i, r;
+-      for (i = 0; i < NR_EMULATOR_GPRS; i++)
++      for (i = 0; i < 16; i++)
+               *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8);
+       ctxt->_eip   = GET_SMSTATE(u64, smstate, 0x7f78);
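The offsets in the hunks make the overflow concrete: with
NR_EMULATOR_GPRS == 16 on 64-bit KVM, the 32-bit loop would have read up to
offset 0x7fd0 + 15 * 4 = 0x800c, past the 0x7fff end of the state-save
area. A sketch printing both layouts (only the offsets are taken from the
patch):

  #include <stdio.h>

  int main(void)
  {
          /* 32-bit image: 8 x u32 GPRs ascending from 0x7fd0; iterating to
           * 16 would reach 0x7fd0 + 15 * 4 = 0x800c. */
          for (int i = 0; i < 8; i++)
                  printf("32-bit GPR%-2d at %#x\n", i, 0x7fd0 + i * 4);

          /* 64-bit image: 16 x u64 GPRs descending from 0x7ff8. */
          for (int i = 0; i < 16; i++)
                  printf("64-bit GPR%-2d at %#x\n", i, 0x7ff8 - i * 8);
          return 0;
  }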
diff --git a/queue-6.0/parisc-avoid-printing-the-hardware-path-twice.patch b/queue-6.0/parisc-avoid-printing-the-hardware-path-twice.patch
new file mode 100644 (file)
index 0000000..cc3eaea
--- /dev/null
@@ -0,0 +1,77 @@
+From 2b6ae0962b421103feb41a80406732944b0665b3 Mon Sep 17 00:00:00 2001
+From: Helge Deller <deller@gmx.de>
+Date: Fri, 28 Oct 2022 18:12:49 +0200
+Subject: parisc: Avoid printing the hardware path twice
+
+From: Helge Deller <deller@gmx.de>
+
+commit 2b6ae0962b421103feb41a80406732944b0665b3 upstream.
+
+Avoid showing the hardware path twice in the kernel log, and clean up
+the output of the version numbers so they appear in the same order as
+they are listed in the hardware database in the hardware.c file.
+Additionally, optimize the memory footprint of the hardware database
+and mark some code as init code.
+
+Fixes: cab56b51ec0e ("parisc: Fix device names in /proc/iomem")
+Signed-off-by: Helge Deller <deller@gmx.de>
+Cc: <stable@vger.kernel.org> # v4.9+
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/parisc/include/asm/hardware.h |   12 ++++++------
+ arch/parisc/kernel/drivers.c       |   14 ++++++--------
+ 2 files changed, 12 insertions(+), 14 deletions(-)
+
+--- a/arch/parisc/include/asm/hardware.h
++++ b/arch/parisc/include/asm/hardware.h
+@@ -10,12 +10,12 @@
+ #define SVERSION_ANY_ID               PA_SVERSION_ANY_ID
+ struct hp_hardware {
+-      unsigned short  hw_type:5;      /* HPHW_xxx */
+-      unsigned short  hversion;
+-      unsigned long   sversion:28;
+-      unsigned short  opt;
+-      const char      name[80];       /* The hardware description */
+-};
++      unsigned int    hw_type:8;      /* HPHW_xxx */
++      unsigned int    hversion:12;
++      unsigned int    sversion:12;
++      unsigned char   opt;
++      unsigned char   name[59];       /* The hardware description */
++} __packed;
+ struct parisc_device;
+--- a/arch/parisc/kernel/drivers.c
++++ b/arch/parisc/kernel/drivers.c
+@@ -882,15 +882,13 @@ void __init walk_central_bus(void)
+                       &root);
+ }
+-static void print_parisc_device(struct parisc_device *dev)
++static __init void print_parisc_device(struct parisc_device *dev)
+ {
+-      char hw_path[64];
+-      static int count;
++      static int count __initdata;
+-      print_pa_hwpath(dev, hw_path);
+-      pr_info("%d. %s at %pap [%s] { %d, 0x%x, 0x%.3x, 0x%.5x }",
+-              ++count, dev->name, &(dev->hpa.start), hw_path, dev->id.hw_type,
+-              dev->id.hversion_rev, dev->id.hversion, dev->id.sversion);
++      pr_info("%d. %s at %pap { type:%d, hv:%#x, sv:%#x, rev:%#x }",
++              ++count, dev->name, &(dev->hpa.start), dev->id.hw_type,
++              dev->id.hversion, dev->id.sversion, dev->id.hversion_rev);
+       if (dev->num_addrs) {
+               int k;
+@@ -1079,7 +1077,7 @@ static __init int qemu_print_iodc_data(s
+-static int print_one_device(struct device * dev, void * data)
++static __init int print_one_device(struct device * dev, void * data)
+ {
+       struct parisc_device * pdev = to_parisc_device(dev);
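The footprint saving comes from the repacked struct hp_hardware: 32 bits of
bitfields, a one-byte opt and a 59-byte name. A hedged sketch asserting the
resulting size under GCC-style packing (the old layout's size is
ABI-dependent, so only the new one is checked):

  #include <assert.h>

  struct hp_hardware_new {
          unsigned int    hw_type:8;
          unsigned int    hversion:12;
          unsigned int    sversion:12;
          unsigned char   opt;
          unsigned char   name[59];
  } __attribute__((__packed__));

  /* 4 bytes of bitfields + 1 byte opt + 59 bytes name. */
  static_assert(sizeof(struct hp_hardware_new) == 64, "unexpected packing");

  int main(void) { return 0; }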
diff --git a/queue-6.0/parisc-export-iosapic_serial_irq-symbol-for-serial-port-driver.patch b/queue-6.0/parisc-export-iosapic_serial_irq-symbol-for-serial-port-driver.patch
new file mode 100644 (file)
index 0000000..f0b6cc2
--- /dev/null
@@ -0,0 +1,30 @@
+From a0c9f1f2e53b8eb2ae43987a30e547ba56b4fa18 Mon Sep 17 00:00:00 2001
+From: Helge Deller <deller@gmx.de>
+Date: Thu, 27 Oct 2022 09:12:05 +0200
+Subject: parisc: Export iosapic_serial_irq() symbol for serial port driver
+
+From: Helge Deller <deller@gmx.de>
+
+commit a0c9f1f2e53b8eb2ae43987a30e547ba56b4fa18 upstream.
+
+The parisc serial port driver needs this symbol when it's compiled
+as a module.
+
+Signed-off-by: Helge Deller <deller@gmx.de>
+Reported-by: kernel test robot <lkp@intel.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/parisc/iosapic.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/parisc/iosapic.c
++++ b/drivers/parisc/iosapic.c
+@@ -866,6 +866,7 @@ int iosapic_serial_irq(struct parisc_dev
+       return vi->txn_irq;
+ }
++EXPORT_SYMBOL(iosapic_serial_irq);
+ #endif
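Modules can only link against symbols that are explicitly exported; without
the added EXPORT_SYMBOL, modpost rejects a modular referencer of
iosapic_serial_irq as undefined. A minimal illustrative module that creates
such a dependency (the prototype is abbreviated in the hunk above and
assumed here):

  #include <linux/module.h>

  struct parisc_device;                   /* opaque in this sketch */
  extern int iosapic_serial_irq(struct parisc_device *dev);

  static int __init export_demo_init(void)
  {
          /* Taking the address is enough to create the dependency. */
          pr_info("iosapic_serial_irq at %ps\n", iosapic_serial_irq);
          return 0;
  }

  static void __exit export_demo_exit(void)
  {
  }

  module_init(export_demo_init);
  module_exit(export_demo_exit);
  MODULE_LICENSE("GPL");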
diff --git a/queue-6.0/parisc-make-8250_gsc-driver-dependend-on-config_parisc.patch b/queue-6.0/parisc-make-8250_gsc-driver-dependend-on-config_parisc.patch
new file mode 100644 (file)
index 0000000..a849320
--- /dev/null
@@ -0,0 +1,36 @@
+From e8a18e3f00f3ee8d07c17ab1ea3ad4df4a3b6fe0 Mon Sep 17 00:00:00 2001
+From: Helge Deller <deller@gmx.de>
+Date: Fri, 21 Oct 2022 07:44:49 +0200
+Subject: parisc: Make 8250_gsc driver dependend on CONFIG_PARISC
+
+From: Helge Deller <deller@gmx.de>
+
+commit e8a18e3f00f3ee8d07c17ab1ea3ad4df4a3b6fe0 upstream.
+
+Although the name of the 8250_gsc.c driver suggests that it handles
+only serial ports on the GSC bus, it also handles serial ports listed
+in the parisc machine inventory, e.g. the serial ports in a C8000
+PCI-only workstation.
+
+Change the dependency to CONFIG_PARISC, so that the driver gets included
+in the kernel even if CONFIG_GSC isn't set.
+
+Reported-by: Mikulas Patocka <mpatocka@redhat.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Helge Deller <deller@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/tty/serial/8250/Kconfig |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/tty/serial/8250/Kconfig
++++ b/drivers/tty/serial/8250/Kconfig
+@@ -118,7 +118,7 @@ config SERIAL_8250_CONSOLE
+ config SERIAL_8250_GSC
+       tristate
+-      depends on SERIAL_8250 && GSC
++      depends on SERIAL_8250 && PARISC
+       default SERIAL_8250
+ config SERIAL_8250_DMA
index 5516c2be96431bb222b1a290576ec9e87ca68767..ef58b4d7a39374233993581a033e356633bbc000 100644 (file)
@@ -160,3 +160,30 @@ perf-x86-intel-add-cooper-lake-stepping-to-isolation_ucodes.patch
 perf-x86-intel-fix-pebs-event-constraints-for-spr.patch
 net-remove-sock_support_zc-from-sockmap.patch
 net-also-flag-accepted-sockets-supporting-msghdr-originated-zerocopy.patch
+parisc-make-8250_gsc-driver-dependend-on-config_parisc.patch
+parisc-export-iosapic_serial_irq-symbol-for-serial-port-driver.patch
+parisc-avoid-printing-the-hardware-path-twice.patch
+ext4-fix-warning-in-ext4_da_release_space.patch
+ext4-fix-bug_on-when-directory-entry-has-invalid-rec_len.patch
+ext4-update-the-backup-superblock-s-at-the-end-of-the-online-resize.patch
+x86-tdx-prepare-for-using-info-call-for-a-second-purpose.patch
+x86-tdx-panic-on-bad-configs-that-ve-on-private-memory-access.patch
+x86-syscall-include-asm-ptrace.h-in-syscall_wrapper-header.patch
+kvm-x86-mask-off-reserved-bits-in-cpuid.80000006h.patch
+kvm-x86-mask-off-reserved-bits-in-cpuid.8000001ah.patch
+kvm-x86-mask-off-reserved-bits-in-cpuid.80000008h.patch
+kvm-x86-mask-off-reserved-bits-in-cpuid.80000001h.patch
+kvm-x86-mask-off-reserved-bits-in-cpuid.8000001fh.patch
+kvm-vmx-advertise-pmu-lbrs-if-and-only-if-perf-supports-lbrs.patch
+kvm-vmx-fold-vmx_supported_debugctl-into-vcpu_supported_debugctl.patch
+kvm-vmx-ignore-guest-cpuid-for-host-userspace-writes-to-debugctl.patch
+kvm-vmx-fully-disable-sgx-if-secondary_exec_encls_exiting-unavailable.patch
+kvm-initialize-gfn_to_pfn_cache-locks-in-dedicated-helper.patch
+kvm-reject-attempts-to-consume-or-refresh-inactive-gfn_to_pfn_cache.patch
+kvm-arm64-fix-bad-dereference-on-mte-enabled-systems.patch
+kvm-arm64-fix-smpri_el1-tpidr2_el0-trapping-on-vhe.patch
+kvm-x86-smm-number-of-gprs-in-the-smram-image-depends-on-the-image-format.patch
+kvm-x86-emulator-em_sysexit-should-update-ctxt-mode.patch
+kvm-x86-emulator-introduce-emulator_recalc_and_set_mode.patch
+kvm-x86-emulator-update-the-emulation-mode-after-rsm.patch
+kvm-x86-emulator-update-the-emulation-mode-after-cr0-write.patch
diff --git a/queue-6.0/x86-syscall-include-asm-ptrace.h-in-syscall_wrapper-header.patch b/queue-6.0/x86-syscall-include-asm-ptrace.h-in-syscall_wrapper-header.patch
new file mode 100644 (file)
index 0000000..64d5000
--- /dev/null
@@ -0,0 +1,64 @@
+From 9440c42941606af4c379afa3cf8624f0dc43a629 Mon Sep 17 00:00:00 2001
+From: Jiri Olsa <olsajiri@gmail.com>
+Date: Tue, 18 Oct 2022 14:27:08 +0200
+Subject: x86/syscall: Include asm/ptrace.h in syscall_wrapper header
+
+From: Jiri Olsa <olsajiri@gmail.com>
+
+commit 9440c42941606af4c379afa3cf8624f0dc43a629 upstream.
+
+With just the forward declaration of the 'struct pt_regs' in
+syscall_wrapper.h, the syscall stub functions:
+
+  __[x64|ia32]_sys_*(struct pt_regs *regs)
+
+will have a different definition of the 'regs' argument in BTF data
+depending on which object file they are defined in.
+
+If the syscall's object includes the 'struct pt_regs' definition,
+the BTF argument data will point to a 'struct pt_regs' record,
+like:
+
+  [226] STRUCT 'pt_regs' size=168 vlen=21
+         'r15' type_id=1 bits_offset=0
+         'r14' type_id=1 bits_offset=64
+         'r13' type_id=1 bits_offset=128
+  ...
+
+If not, it will point to a fwd declaration record:
+
+  [15439] FWD 'pt_regs' fwd_kind=struct
+
+and make BPF tracing programs hooking those functions unable
+to access fields from 'struct pt_regs'.
+
+Include asm/ptrace.h directly in syscall_wrapper.h to make sure all
+syscalls see the 'struct pt_regs' definition. This then results in BTF for
+'__*_sys_*(struct pt_regs *regs)' functions to point to the actual
+struct, not just the forward declaration.
+
+  [ bp: No Fixes tag as this is not really a bug fix but "adjustment" so
+    that BTF is happy. ]
+
+Reported-by: Akihiro HARAI <jharai0815@gmail.com>
+Signed-off-by: Jiri Olsa <jolsa@kernel.org>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Acked-by: Andrii Nakryiko <andrii@kernel.org>
+Cc: <stable@vger.kernel.org> # this is needed only for BTF so kernels >= 5.15
+Link: https://lore.kernel.org/r/20221018122708.823792-1-jolsa@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/syscall_wrapper.h |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/include/asm/syscall_wrapper.h
++++ b/arch/x86/include/asm/syscall_wrapper.h
+@@ -6,7 +6,7 @@
+ #ifndef _ASM_X86_SYSCALL_WRAPPER_H
+ #define _ASM_X86_SYSCALL_WRAPPER_H
+-struct pt_regs;
++#include <asm/ptrace.h>
+ extern long __x64_sys_ni_syscall(const struct pt_regs *regs);
+ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
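The effect is reproducible outside the kernel: the debug/BTF description of
a pointer argument depends on whether the full struct definition was
visible in that translation unit. A hedged two-file illustration (contents
made up; compile each with -g and compare with pahole or bpftool btf dump):

  /* tu_fwd.c: only a forward declaration is in scope, so debug info can
   * only describe the argument as FWD 'pt_regs'. */
  struct pt_regs;
  long sys_foo(struct pt_regs *regs) { return regs != 0; }

  /* tu_full.c: the full definition is in scope, so the same argument is
   * described as STRUCT 'pt_regs' with all of its members. */
  struct pt_regs { unsigned long r15, r14, r13; /* ... */ };
  long sys_bar(struct pt_regs *regs) { return (long)regs->r15; }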
diff --git a/queue-6.0/x86-tdx-panic-on-bad-configs-that-ve-on-private-memory-access.patch b/queue-6.0/x86-tdx-panic-on-bad-configs-that-ve-on-private-memory-access.patch
new file mode 100644 (file)
index 0000000..ea39a48
--- /dev/null
@@ -0,0 +1,98 @@
+From 373e715e31bf4e0f129befe87613a278fac228d3 Mon Sep 17 00:00:00 2001
+From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
+Date: Fri, 28 Oct 2022 17:12:20 +0300
+Subject: x86/tdx: Panic on bad configs that #VE on "private" memory access
+
+From: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+
+commit 373e715e31bf4e0f129befe87613a278fac228d3 upstream.
+
+All normal kernel memory is "TDX private memory".  This includes
+everything from kernel stacks to kernel text.  Handling
+exceptions on arbitrary accesses to kernel memory is essentially
+impossible because they can happen in horribly nasty places like
+kernel entry/exit.  But, TDX hardware can theoretically _deliver_
+a virtualization exception (#VE) on any access to private memory.
+
+But, it's not as bad as it sounds.  TDX can be configured to never
+deliver these exceptions on private memory with a "TD attribute"
+called ATTR_SEPT_VE_DISABLE.  The guest has no way to *set* this
+attribute, but it can check it.
+
+Ensure ATTR_SEPT_VE_DISABLE is set in early boot.  panic() if it
+is unset.  There is no sane way for Linux to run with this
+attribute clear so a panic() is appropriate.
+
+There's a small window during boot, before the check, where the kernel
+has an early #VE handler. But that handler is only for port I/O
+and will also panic() as soon as it sees any other #VE, such as
+one generated by a private memory access.
+
+[ dhansen: Rewrite changelog and rebase on new tdx_parse_tdinfo().
+          Add Kirill's tested-by because I made changes since
+          he wrote this. ]
+
+Fixes: 9a22bf6debbf ("x86/traps: Add #VE support for TDX guest")
+Reported-by: ruogui.ygr@alibaba-inc.com
+Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Tested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/all/20221028141220.29217-3-kirill.shutemov%40linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/coco/tdx/tdx.c |   21 ++++++++++++++++-----
+ 1 file changed, 16 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/coco/tdx/tdx.c
++++ b/arch/x86/coco/tdx/tdx.c
+@@ -34,6 +34,8 @@
+ #define VE_GET_PORT_NUM(e)    ((e) >> 16)
+ #define VE_IS_IO_STRING(e)    ((e) & BIT(4))
++#define ATTR_SEPT_VE_DISABLE  BIT(28)
++
+ /*
+  * Wrapper for standard use of __tdx_hypercall with no output aside from
+  * return code.
+@@ -102,6 +104,7 @@ static void tdx_parse_tdinfo(u64 *cc_mas
+ {
+       struct tdx_module_output out;
+       unsigned int gpa_width;
++      u64 td_attr;
+       /*
+        * TDINFO TDX module call is used to get the TD execution environment
+@@ -109,19 +112,27 @@ static void tdx_parse_tdinfo(u64 *cc_mas
+        * information, etc. More details about the ABI can be found in TDX
+        * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
+        * [TDG.VP.INFO].
+-       *
+-       * The GPA width that comes out of this call is critical. TDX guests
+-       * can not meaningfully run without it.
+        */
+       tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out);
+-      gpa_width = out.rcx & GENMASK(5, 0);
+-
+       /*
+        * The highest bit of a guest physical address is the "sharing" bit.
+        * Set it for shared pages and clear it for private pages.
++       *
++       * The GPA width that comes out of this call is critical. TDX guests
++       * can not meaningfully run without it.
+        */
++      gpa_width = out.rcx & GENMASK(5, 0);
+       *cc_mask = BIT_ULL(gpa_width - 1);
++
++      /*
++       * The kernel cannot handle #VEs when accessing normal kernel
++       * memory.  Ensure that no #VE will be delivered for accesses to
++       * TD-private memory.  Only VMM-shared memory (MMIO) will #VE.
++       */
++      td_attr = out.rdx;
++      if (!(td_attr & ATTR_SEPT_VE_DISABLE))
++              panic("TD misconfiguration: SEPT_VE_DISABLE attribute must be set.\n");
+ }
+ /*
diff --git a/queue-6.0/x86-tdx-prepare-for-using-info-call-for-a-second-purpose.patch b/queue-6.0/x86-tdx-prepare-for-using-info-call-for-a-second-purpose.patch
new file mode 100644 (file)
index 0000000..f6567e4
--- /dev/null
@@ -0,0 +1,61 @@
+From a6dd6f39008bb3ef7c73ef0a2acc2a4209555bd8 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Fri, 28 Oct 2022 17:12:19 +0300
+Subject: x86/tdx: Prepare for using "INFO" call for a second purpose
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit a6dd6f39008bb3ef7c73ef0a2acc2a4209555bd8 upstream.
+
+The TDG.VP.INFO TDCALL provides the guest with various details about
+the TDX system that the guest needs to run.  Only one field is currently
+used: 'gpa_width' which tells the guest which PTE bits mark pages shared
+or private.
+
+A second field is now needed: the guest "TD attributes" to tell if
+virtualization exceptions are configured in a way that can harm the guest.
+
+Make the naming and calling convention more generic and distinct from the
+mask-centric one.
+
+Thanks to Sathya for the inspiration here, but there's no code, comments
+or changelogs left from where he started.
+
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Tested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/coco/tdx/tdx.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/coco/tdx/tdx.c
++++ b/arch/x86/coco/tdx/tdx.c
+@@ -98,7 +98,7 @@ static inline void tdx_module_call(u64 f
+               panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
+ }
+-static u64 get_cc_mask(void)
++static void tdx_parse_tdinfo(u64 *cc_mask)
+ {
+       struct tdx_module_output out;
+       unsigned int gpa_width;
+@@ -121,7 +121,7 @@ static u64 get_cc_mask(void)
+        * The highest bit of a guest physical address is the "sharing" bit.
+        * Set it for shared pages and clear it for private pages.
+        */
+-      return BIT_ULL(gpa_width - 1);
++      *cc_mask = BIT_ULL(gpa_width - 1);
+ }
+ /*
+@@ -758,7 +758,7 @@ void __init tdx_early_init(void)
+       setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
+       cc_set_vendor(CC_VENDOR_INTEL);
+-      cc_mask = get_cc_mask();
++      tdx_parse_tdinfo(&cc_mask);
+       cc_set_mask(cc_mask);
+       /*