--- /dev/null
+From 17a0bc9bd697f75cfdf9b378d5eb2d7409c91340 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Lu=C3=ADs=20Henriques?= <lhenriques@suse.de>
+Date: Wed, 12 Oct 2022 14:13:30 +0100
+Subject: ext4: fix BUG_ON() when directory entry has invalid rec_len
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Luís Henriques <lhenriques@suse.de>
+
+commit 17a0bc9bd697f75cfdf9b378d5eb2d7409c91340 upstream.
+
+The rec_len field in the directory entry has to be a multiple of 4. A
+corrupted filesystem image can be used to hit a BUG() in
+ext4_rec_len_to_disk(), called from make_indexed_dir().
+
+ ------------[ cut here ]------------
+ kernel BUG at fs/ext4/ext4.h:2413!
+ ...
+ RIP: 0010:make_indexed_dir+0x53f/0x5f0
+ ...
+ Call Trace:
+ <TASK>
+ ? add_dirent_to_buf+0x1b2/0x200
+ ext4_add_entry+0x36e/0x480
+ ext4_add_nondir+0x2b/0xc0
+ ext4_create+0x163/0x200
+ path_openat+0x635/0xe90
+ do_filp_open+0xb4/0x160
+ ? __create_object.isra.0+0x1de/0x3b0
+ ? _raw_spin_unlock+0x12/0x30
+ do_sys_openat2+0x91/0x150
+ __x64_sys_open+0x6c/0xa0
+ do_syscall_64+0x3c/0x80
+ entry_SYSCALL_64_after_hwframe+0x46/0xb0
+
+The fix simply adds a call to ext4_check_dir_entry() to validate the
+directory entry, returning -EFSCORRUPTED if the entry is invalid.
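+
+As an aside (not part of the change itself), the rule being enforced
+can be sketched in a few lines of C. The helper name and the minimum
+length used here are hypothetical simplifications of what
+ext4_check_dir_entry() actually verifies:
+
+	/* Hypothetical, simplified version of the rec_len sanity rule. */
+	static bool dirent_rec_len_ok(unsigned int rlen, unsigned int offset,
+				      unsigned int blocksize)
+	{
+		if (rlen % 4)				/* must be a multiple of 4 */
+			return false;
+		if (rlen < 12)				/* 8-byte header + shortest name, rounded up */
+			return false;
+		if (offset + rlen > blocksize)		/* must not cross the block boundary */
+			return false;
+		return true;
+	}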
+
+CC: stable@kernel.org
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=216540
+Signed-off-by: Luís Henriques <lhenriques@suse.de>
+Link: https://lore.kernel.org/r/20221012131330.32456-1-lhenriques@suse.de
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/namei.c | 10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -2259,8 +2259,16 @@ static int make_indexed_dir(handle_t *ha
+ memset(de, 0, len); /* wipe old data */
+ de = (struct ext4_dir_entry_2 *) data2;
+ top = data2 + len;
+- while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
++ while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) {
++ if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len,
++ (data2 + (blocksize - csum_size) -
++ (char *) de))) {
++ brelse(bh2);
++ brelse(bh);
++ return -EFSCORRUPTED;
++ }
+ de = de2;
++ }
+ de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
+ (char *) de, blocksize);
+
--- /dev/null
+From 1b8f787ef547230a3249bcf897221ef0cc78481b Mon Sep 17 00:00:00 2001
+From: Ye Bin <yebin10@huawei.com>
+Date: Tue, 18 Oct 2022 10:27:01 +0800
+Subject: ext4: fix warning in 'ext4_da_release_space'
+
+From: Ye Bin <yebin10@huawei.com>
+
+commit 1b8f787ef547230a3249bcf897221ef0cc78481b upstream.
+
+Syzkaller report issue as follows:
+EXT4-fs (loop0): Free/Dirty block details
+EXT4-fs (loop0): free_blocks=0
+EXT4-fs (loop0): dirty_blocks=0
+EXT4-fs (loop0): Block reservation details
+EXT4-fs (loop0): i_reserved_data_blocks=0
+EXT4-fs warning (device loop0): ext4_da_release_space:1527: ext4_da_release_space: ino 18, to_free 1 with only 0 reserved data blocks
+------------[ cut here ]------------
+WARNING: CPU: 0 PID: 92 at fs/ext4/inode.c:1528 ext4_da_release_space+0x25e/0x370 fs/ext4/inode.c:1524
+Modules linked in:
+CPU: 0 PID: 92 Comm: kworker/u4:4 Not tainted 6.0.0-syzkaller-09423-g493ffd6605b2 #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/22/2022
+Workqueue: writeback wb_workfn (flush-7:0)
+RIP: 0010:ext4_da_release_space+0x25e/0x370 fs/ext4/inode.c:1528
+RSP: 0018:ffffc900015f6c90 EFLAGS: 00010296
+RAX: 42215896cd52ea00 RBX: 0000000000000000 RCX: 42215896cd52ea00
+RDX: 0000000000000000 RSI: 0000000080000001 RDI: 0000000000000000
+RBP: 1ffff1100e907d96 R08: ffffffff816aa79d R09: fffff520002bece5
+R10: fffff520002bece5 R11: 1ffff920002bece4 R12: ffff888021fd2000
+R13: ffff88807483ecb0 R14: 0000000000000001 R15: ffff88807483e740
+FS: 0000000000000000(0000) GS:ffff8880b9a00000(0000) knlGS:0000000000000000
+CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00005555569ba628 CR3: 000000000c88e000 CR4: 00000000003506f0
+DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+Call Trace:
+ <TASK>
+ ext4_es_remove_extent+0x1ab/0x260 fs/ext4/extents_status.c:1461
+ mpage_release_unused_pages+0x24d/0xef0 fs/ext4/inode.c:1589
+ ext4_writepages+0x12eb/0x3be0 fs/ext4/inode.c:2852
+ do_writepages+0x3c3/0x680 mm/page-writeback.c:2469
+ __writeback_single_inode+0xd1/0x670 fs/fs-writeback.c:1587
+ writeback_sb_inodes+0xb3b/0x18f0 fs/fs-writeback.c:1870
+ wb_writeback+0x41f/0x7b0 fs/fs-writeback.c:2044
+ wb_do_writeback fs/fs-writeback.c:2187 [inline]
+ wb_workfn+0x3cb/0xef0 fs/fs-writeback.c:2227
+ process_one_work+0x877/0xdb0 kernel/workqueue.c:2289
+ worker_thread+0xb14/0x1330 kernel/workqueue.c:2436
+ kthread+0x266/0x300 kernel/kthread.c:376
+ ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306
+ </TASK>
+
+The above issue may happen as follows:
+ext4_da_write_begin
+ ext4_create_inline_data
+ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+ ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+__ext4_ioctl
+ ext4_ext_migrate -> will lead to eh->eh_entries not zero, and set extent flag
+ext4_da_write_begin
+ ext4_da_convert_inline_data_to_extent
+ ext4_da_write_inline_data_begin
+ ext4_da_map_blocks
+ ext4_insert_delayed_block
+ if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk))
+ if (!ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk))
+ ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); -> will return 1
+ allocated = true;
+ ext4_es_insert_delayed_block(inode, lblk, allocated);
+ext4_writepages
+ mpage_map_and_submit_extent(handle, &mpd, &give_up_on_write); -> return -ENOSPC
+ mpage_release_unused_pages(&mpd, give_up_on_write); -> give_up_on_write == 1
+ ext4_es_remove_extent
+ ext4_da_release_space(inode, reserved);
+ if (unlikely(to_free > ei->i_reserved_data_blocks))
+ -> to_free == 1 but ei->i_reserved_data_blocks == 0
+ -> then trigger warning as above
+
+To solve the above issue, forbid migration of inodes that have inline data.
+
+Cc: stable@kernel.org
+Reported-by: syzbot+c740bb18df70ad00952e@syzkaller.appspotmail.com
+Signed-off-by: Ye Bin <yebin10@huawei.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Link: https://lore.kernel.org/r/20221018022701.683489-1-yebin10@huawei.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/migrate.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/migrate.c
++++ b/fs/ext4/migrate.c
+@@ -425,7 +425,8 @@ int ext4_ext_migrate(struct inode *inode
+ * already is extent-based, error out.
+ */
+ if (!ext4_has_feature_extents(inode->i_sb) ||
+- (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
++ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
++ ext4_has_inline_data(inode))
+ return -EINVAL;
+
+ if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
--- /dev/null
+From 9a8c5b0d061554fedd7dbe894e63aa34d0bac7c4 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Thu, 27 Oct 2022 16:04:36 -0400
+Subject: ext4: update the backup superblock's at the end of the online resize
+
+From: Theodore Ts'o <tytso@mit.edu>
+
+commit 9a8c5b0d061554fedd7dbe894e63aa34d0bac7c4 upstream.
+
+When expanding a file system using online resize, various fields in
+the superblock (e.g., s_blocks_count, s_inodes_count, etc.) change.
+To update the backup superblocks, the online resize uses the function
+update_backups() in fs/ext4/resize.c. This function was not updating
+the checksum field in the backup superblocks. This wasn't a big deal
+previously, because e2fsck didn't care about the checksum field in the
+backup superblock. (And indeed, update_backups() goes all the way
+back to the ext3 days, well before we had support for metadata
+checksums.)
+
+However, there is an alternate, more general way of updating
+superblock fields, ext4_update_primary_sb() in fs/ext4/ioctl.c. This
+function does check the checksum of the backup superblock, and if it
+doesn't match, will mark the file system as corrupted. That was
+clearly not the intent, so avoid aborting the resize when a bad
+superblock is found.
+
+In addition, teach update_backups() to properly update the checksum in
+the backup superblocks. We will eventually want to unify
+update_backups() with the infrastructure in ext4_update_primary_sb(), but
+that's for another day.
+
+Note: The problem has been around for a while; it just didn't really
+matter until ext4_update_primary_sb() was added by commit bbc605cdb1e1
+("ext4: implement support for get/set fs label"). And it became
+trivially easy to reproduce after commit 827891a38acc ("ext4: update
+the s_overhead_clusters in the backup sb's when resizing") in v6.0.
+
+Cc: stable@kernel.org # 5.17+
+Fixes: bbc605cdb1e1 ("ext4: implement support for get/set fs label")
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/ioctl.c | 3 +--
+ fs/ext4/resize.c | 5 +++++
+ 2 files changed, 6 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -145,9 +145,8 @@ static int ext4_update_backup_sb(struct
+ if (ext4_has_metadata_csum(sb) &&
+ es->s_checksum != ext4_superblock_csum(sb, es)) {
+ ext4_msg(sb, KERN_ERR, "Invalid checksum for backup "
+- "superblock %llu\n", sb_block);
++ "superblock %llu", sb_block);
+ unlock_buffer(bh);
+- err = -EFSBADCRC;
+ goto out_bh;
+ }
+ func(es, arg);
+--- a/fs/ext4/resize.c
++++ b/fs/ext4/resize.c
+@@ -1158,6 +1158,7 @@ static void update_backups(struct super_
+ while (group < sbi->s_groups_count) {
+ struct buffer_head *bh;
+ ext4_fsblk_t backup_block;
++ struct ext4_super_block *es;
+
+ /* Out of journal space, and can't get more - abort - so sad */
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+@@ -1186,6 +1187,10 @@ static void update_backups(struct super_
+ memcpy(bh->b_data, data, size);
+ if (rest)
+ memset(bh->b_data + size, 0, rest);
++ es = (struct ext4_super_block *) bh->b_data;
++ es->s_block_group_nr = cpu_to_le16(group);
++ if (ext4_has_metadata_csum(sb))
++ es->s_checksum = ext4_superblock_csum(sb, es);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
--- /dev/null
+From b6bcdc9f6b8321e4471ff45413b6410e16762a8d Mon Sep 17 00:00:00 2001
+From: Ryan Roberts <ryan.roberts@arm.com>
+Date: Thu, 27 Oct 2022 13:09:45 +0100
+Subject: KVM: arm64: Fix bad dereference on MTE-enabled systems
+
+From: Ryan Roberts <ryan.roberts@arm.com>
+
+commit b6bcdc9f6b8321e4471ff45413b6410e16762a8d upstream.
+
+enter_exception64() performs an MTE check, which involves dereferencing
+vcpu->kvm. While vcpu has already been fixed up to be a HYP VA pointer,
+kvm is still a pointer in the kernel VA space.
+
+This only affects nVHE configurations with MTE enabled, as in other
+cases, the pointer is either valid (VHE) or not dereferenced (!MTE).
+
+Fix this by first converting kvm to a HYP VA pointer.
+
+Fixes: ea7fc1bb1cd1 ("KVM: arm64: Introduce MTE VM feature")
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: Steven Price <steven.price@arm.com>
+[maz: commit message tidy-up]
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20221027120945.29679-1-ryan.roberts@arm.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/hyp/exception.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/arch/arm64/kvm/hyp/exception.c
++++ b/arch/arm64/kvm/hyp/exception.c
+@@ -13,6 +13,7 @@
+ #include <hyp/adjust_pc.h>
+ #include <linux/kvm_host.h>
+ #include <asm/kvm_emulate.h>
++#include <asm/kvm_mmu.h>
+
+ #if !defined (__KVM_NVHE_HYPERVISOR__) && !defined (__KVM_VHE_HYPERVISOR__)
+ #error Hypervisor code only!
+@@ -115,7 +116,7 @@ static void enter_exception64(struct kvm
+ new |= (old & PSR_C_BIT);
+ new |= (old & PSR_V_BIT);
+
+- if (kvm_has_mte(vcpu->kvm))
++ if (kvm_has_mte(kern_hyp_va(vcpu->kvm)))
+ new |= PSR_TCO_BIT;
+
+ new |= (old & PSR_DIT_BIT);
--- /dev/null
+From 4151bb636acf32bb2e6126cec8216b023117c0e9 Mon Sep 17 00:00:00 2001
+From: Marc Zyngier <maz@kernel.org>
+Date: Tue, 1 Nov 2022 12:19:51 +0000
+Subject: KVM: arm64: Fix SMPRI_EL1/TPIDR2_EL0 trapping on VHE
+
+From: Marc Zyngier <maz@kernel.org>
+
+commit 4151bb636acf32bb2e6126cec8216b023117c0e9 upstream.
+
+The trapping of SMPRI_EL1 and TPIDR2_EL0 currently only really
+works on nVHE, as only this mode uses the fine-grained trapping
+that controls these two registers.
+
+Move the trapping enable/disable code into
+__{de,}activate_traps_common(), allowing it to be called when it
+actually matters on VHE, and remove the flipping of EL2 control
+for TPIDR2_EL0, which only affects the host access of this
+register.
+
+Fixes: 861262ab8627 ("KVM: arm64: Handle SME host state when running guests")
+Reported-by: Mark Brown <broonie@kernel.org>
+Reviewed-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/86bkpqer4z.wl-maz@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/hyp/include/hyp/switch.h | 20 +++++++++++++++++++
+ arch/arm64/kvm/hyp/nvhe/switch.c | 26 -------------------------
+ arch/arm64/kvm/hyp/vhe/switch.c | 8 --------
+ 3 files changed, 20 insertions(+), 34 deletions(-)
+
+diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
+index 6cbbb6c02f66..3330d1b76bdd 100644
+--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
++++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
+@@ -87,6 +87,17 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
+
+ vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2);
+ write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
++
++ if (cpus_have_final_cap(ARM64_SME)) {
++ sysreg_clear_set_s(SYS_HFGRTR_EL2,
++ HFGxTR_EL2_nSMPRI_EL1_MASK |
++ HFGxTR_EL2_nTPIDR2_EL0_MASK,
++ 0);
++ sysreg_clear_set_s(SYS_HFGWTR_EL2,
++ HFGxTR_EL2_nSMPRI_EL1_MASK |
++ HFGxTR_EL2_nTPIDR2_EL0_MASK,
++ 0);
++ }
+ }
+
+ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
+@@ -96,6 +107,15 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
+ write_sysreg(0, hstr_el2);
+ if (kvm_arm_support_pmu_v3())
+ write_sysreg(0, pmuserenr_el0);
++
++ if (cpus_have_final_cap(ARM64_SME)) {
++ sysreg_clear_set_s(SYS_HFGRTR_EL2, 0,
++ HFGxTR_EL2_nSMPRI_EL1_MASK |
++ HFGxTR_EL2_nTPIDR2_EL0_MASK);
++ sysreg_clear_set_s(SYS_HFGWTR_EL2, 0,
++ HFGxTR_EL2_nSMPRI_EL1_MASK |
++ HFGxTR_EL2_nTPIDR2_EL0_MASK);
++ }
+ }
+
+ static inline void ___activate_traps(struct kvm_vcpu *vcpu)
+diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
+index 8e9d49a964be..c2cb46ca4fb6 100644
+--- a/arch/arm64/kvm/hyp/nvhe/switch.c
++++ b/arch/arm64/kvm/hyp/nvhe/switch.c
+@@ -55,18 +55,6 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
+ write_sysreg(val, cptr_el2);
+ write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2);
+
+- if (cpus_have_final_cap(ARM64_SME)) {
+- val = read_sysreg_s(SYS_HFGRTR_EL2);
+- val &= ~(HFGxTR_EL2_nTPIDR2_EL0_MASK |
+- HFGxTR_EL2_nSMPRI_EL1_MASK);
+- write_sysreg_s(val, SYS_HFGRTR_EL2);
+-
+- val = read_sysreg_s(SYS_HFGWTR_EL2);
+- val &= ~(HFGxTR_EL2_nTPIDR2_EL0_MASK |
+- HFGxTR_EL2_nSMPRI_EL1_MASK);
+- write_sysreg_s(val, SYS_HFGWTR_EL2);
+- }
+-
+ if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
+ struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
+
+@@ -110,20 +98,6 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
+
+ write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
+
+- if (cpus_have_final_cap(ARM64_SME)) {
+- u64 val;
+-
+- val = read_sysreg_s(SYS_HFGRTR_EL2);
+- val |= HFGxTR_EL2_nTPIDR2_EL0_MASK |
+- HFGxTR_EL2_nSMPRI_EL1_MASK;
+- write_sysreg_s(val, SYS_HFGRTR_EL2);
+-
+- val = read_sysreg_s(SYS_HFGWTR_EL2);
+- val |= HFGxTR_EL2_nTPIDR2_EL0_MASK |
+- HFGxTR_EL2_nSMPRI_EL1_MASK;
+- write_sysreg_s(val, SYS_HFGWTR_EL2);
+- }
+-
+ cptr = CPTR_EL2_DEFAULT;
+ if (vcpu_has_sve(vcpu) && (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED))
+ cptr |= CPTR_EL2_TZ;
+diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
+index 7acb87eaa092..1a97391fedd2 100644
+--- a/arch/arm64/kvm/hyp/vhe/switch.c
++++ b/arch/arm64/kvm/hyp/vhe/switch.c
+@@ -63,10 +63,6 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
+ __activate_traps_fpsimd32(vcpu);
+ }
+
+- if (cpus_have_final_cap(ARM64_SME))
+- write_sysreg(read_sysreg(sctlr_el2) & ~SCTLR_ELx_ENTP2,
+- sctlr_el2);
+-
+ write_sysreg(val, cpacr_el1);
+
+ write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1);
+@@ -88,10 +84,6 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
+ */
+ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
+
+- if (cpus_have_final_cap(ARM64_SME))
+- write_sysreg(read_sysreg(sctlr_el2) | SCTLR_ELx_ENTP2,
+- sctlr_el2);
+-
+ write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1);
+
+ if (!arm64_kernel_unmapped_at_el0())
+--
+2.38.1
+
--- /dev/null
+From 52491a38b2c2411f3f0229dc6ad610349c704a41 Mon Sep 17 00:00:00 2001
+From: Michal Luczaj <mhal@rbox.co>
+Date: Thu, 13 Oct 2022 21:12:19 +0000
+Subject: KVM: Initialize gfn_to_pfn_cache locks in dedicated helper
+
+From: Michal Luczaj <mhal@rbox.co>
+
+commit 52491a38b2c2411f3f0229dc6ad610349c704a41 upstream.
+
+Move the gfn_to_pfn_cache lock initialization to another helper and
+call the new helper during VM/vCPU creation. There are race
+conditions possible due to kvm_gfn_to_pfn_cache_init()'s
+ability to re-initialize the cache's locks.
+
+For example: a race between ioctl(KVM_XEN_HVM_EVTCHN_SEND) and
+kvm_gfn_to_pfn_cache_init() leads to a corrupted shinfo gpc lock.
+
+ (thread 1) | (thread 2)
+ |
+ kvm_xen_set_evtchn_fast |
+ read_lock_irqsave(&gpc->lock, ...) |
+ | kvm_gfn_to_pfn_cache_init
+ | rwlock_init(&gpc->lock)
+ read_unlock_irqrestore(&gpc->lock, ...) |
+
+Rename "cache_init" and "cache_destroy" to activate+deactivate to
+avoid implying that the cache really is destroyed/freed.
+
+Note, there are more races in the newly named kvm_gpc_activate() that will
+be addressed separately.
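+
+To make the intended calling pattern explicit (a sketch distilled from
+the hunks below, not additional API), the split looks roughly like
+this, with gpa/len standing in for whichever guest address and length
+the caller tracks:
+
+	/* Once, at VM/vCPU creation: lock setup only, never re-run. */
+	kvm_gpc_init(&vcpu->arch.pv_time);
+
+	/* Any number of times afterwards, from the usual ioctl paths. */
+	kvm_gpc_activate(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
+			 KVM_HOST_USES_PFN, gpa, len);
+	...
+	kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);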
+
+Fixes: 982ed0de4753 ("KVM: Reinstate gfn_to_pfn_cache with invalidation support")
+Cc: stable@vger.kernel.org
+Suggested-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Michal Luczaj <mhal@rbox.co>
+[sean: call out that this is a bug fix]
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20221013211234.1318131-2-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c | 12 +++++----
+ arch/x86/kvm/xen.c | 57 ++++++++++++++++++++++++-----------------------
+ include/linux/kvm_host.h | 24 ++++++++++++++-----
+ virt/kvm/pfncache.c | 21 +++++++++--------
+ 4 files changed, 66 insertions(+), 48 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -2304,11 +2304,11 @@ static void kvm_write_system_time(struct
+
+ /* we verify if the enable bit is set... */
+ if (system_time & 1) {
+- kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
+- KVM_HOST_USES_PFN, system_time & ~1ULL,
+- sizeof(struct pvclock_vcpu_time_info));
++ kvm_gpc_activate(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
++ KVM_HOST_USES_PFN, system_time & ~1ULL,
++ sizeof(struct pvclock_vcpu_time_info));
+ } else {
+- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
++ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);
+ }
+
+ return;
+@@ -3377,7 +3377,7 @@ static int kvm_pv_enable_async_pf_int(st
+
+ static void kvmclock_reset(struct kvm_vcpu *vcpu)
+ {
+- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
++ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);
+ vcpu->arch.time = 0;
+ }
+
+@@ -11629,6 +11629,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu
+ vcpu->arch.regs_avail = ~0;
+ vcpu->arch.regs_dirty = ~0;
+
++ kvm_gpc_init(&vcpu->arch.pv_time);
++
+ if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ else
+--- a/arch/x86/kvm/xen.c
++++ b/arch/x86/kvm/xen.c
+@@ -42,13 +42,13 @@ static int kvm_xen_shared_info_init(stru
+ int idx = srcu_read_lock(&kvm->srcu);
+
+ if (gfn == GPA_INVALID) {
+- kvm_gfn_to_pfn_cache_destroy(kvm, gpc);
++ kvm_gpc_deactivate(kvm, gpc);
+ goto out;
+ }
+
+ do {
+- ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, KVM_HOST_USES_PFN,
+- gpa, PAGE_SIZE);
++ ret = kvm_gpc_activate(kvm, gpc, NULL, KVM_HOST_USES_PFN, gpa,
++ PAGE_SIZE);
+ if (ret)
+ goto out;
+
+@@ -554,15 +554,15 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcp
+ offsetof(struct compat_vcpu_info, time));
+
+ if (data->u.gpa == GPA_INVALID) {
+- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
++ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
+ r = 0;
+ break;
+ }
+
+- r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
+- &vcpu->arch.xen.vcpu_info_cache,
+- NULL, KVM_HOST_USES_PFN, data->u.gpa,
+- sizeof(struct vcpu_info));
++ r = kvm_gpc_activate(vcpu->kvm,
++ &vcpu->arch.xen.vcpu_info_cache, NULL,
++ KVM_HOST_USES_PFN, data->u.gpa,
++ sizeof(struct vcpu_info));
+ if (!r)
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
+@@ -570,16 +570,16 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcp
+
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
+ if (data->u.gpa == GPA_INVALID) {
+- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+- &vcpu->arch.xen.vcpu_time_info_cache);
++ kvm_gpc_deactivate(vcpu->kvm,
++ &vcpu->arch.xen.vcpu_time_info_cache);
+ r = 0;
+ break;
+ }
+
+- r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
+- &vcpu->arch.xen.vcpu_time_info_cache,
+- NULL, KVM_HOST_USES_PFN, data->u.gpa,
+- sizeof(struct pvclock_vcpu_time_info));
++ r = kvm_gpc_activate(vcpu->kvm,
++ &vcpu->arch.xen.vcpu_time_info_cache,
++ NULL, KVM_HOST_USES_PFN, data->u.gpa,
++ sizeof(struct pvclock_vcpu_time_info));
+ if (!r)
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ break;
+@@ -590,16 +590,15 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcp
+ break;
+ }
+ if (data->u.gpa == GPA_INVALID) {
+- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+- &vcpu->arch.xen.runstate_cache);
++ kvm_gpc_deactivate(vcpu->kvm,
++ &vcpu->arch.xen.runstate_cache);
+ r = 0;
+ break;
+ }
+
+- r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
+- &vcpu->arch.xen.runstate_cache,
+- NULL, KVM_HOST_USES_PFN, data->u.gpa,
+- sizeof(struct vcpu_runstate_info));
++ r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
++ NULL, KVM_HOST_USES_PFN, data->u.gpa,
++ sizeof(struct vcpu_runstate_info));
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
+@@ -1817,7 +1816,12 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *
+ {
+ vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx;
+ vcpu->arch.xen.poll_evtchn = 0;
++
+ timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
++
++ kvm_gpc_init(&vcpu->arch.xen.runstate_cache);
++ kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache);
++ kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache);
+ }
+
+ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
+@@ -1825,18 +1829,17 @@ void kvm_xen_destroy_vcpu(struct kvm_vcp
+ if (kvm_xen_timer_enabled(vcpu))
+ kvm_xen_stop_timer(vcpu);
+
+- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+- &vcpu->arch.xen.runstate_cache);
+- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+- &vcpu->arch.xen.vcpu_info_cache);
+- kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+- &vcpu->arch.xen.vcpu_time_info_cache);
++ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache);
++ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
++ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache);
++
+ del_timer_sync(&vcpu->arch.xen.poll_timer);
+ }
+
+ void kvm_xen_init_vm(struct kvm *kvm)
+ {
+ idr_init(&kvm->arch.xen.evtchn_ports);
++ kvm_gpc_init(&kvm->arch.xen.shinfo_cache);
+ }
+
+ void kvm_xen_destroy_vm(struct kvm *kvm)
+@@ -1844,7 +1847,7 @@ void kvm_xen_destroy_vm(struct kvm *kvm)
+ struct evtchnfd *evtchnfd;
+ int i;
+
+- kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache);
++ kvm_gpc_deactivate(kvm, &kvm->arch.xen.shinfo_cache);
+
+ idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
+ if (!evtchnfd->deliver.port.port)
+--- a/include/linux/kvm_host.h
++++ b/include/linux/kvm_host.h
+@@ -1241,8 +1241,18 @@ int kvm_vcpu_write_guest(struct kvm_vcpu
+ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
+
+ /**
+- * kvm_gfn_to_pfn_cache_init - prepare a cached kernel mapping and HPA for a
+- * given guest physical address.
++ * kvm_gpc_init - initialize gfn_to_pfn_cache.
++ *
++ * @gpc: struct gfn_to_pfn_cache object.
++ *
++ * This sets up a gfn_to_pfn_cache by initializing locks. Note, the cache must
++ * be zero-allocated (or zeroed by the caller before init).
++ */
++void kvm_gpc_init(struct gfn_to_pfn_cache *gpc);
++
++/**
++ * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest
++ * physical address.
+ *
+ * @kvm: pointer to kvm instance.
+ * @gpc: struct gfn_to_pfn_cache object.
+@@ -1266,9 +1276,9 @@ void kvm_vcpu_mark_page_dirty(struct kvm
+ * kvm_gfn_to_pfn_cache_check() to ensure that the cache is valid before
+ * accessing the target page.
+ */
+-int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+- struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
+- gpa_t gpa, unsigned long len);
++int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
++ struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
++ gpa_t gpa, unsigned long len);
+
+ /**
+ * kvm_gfn_to_pfn_cache_check - check validity of a gfn_to_pfn_cache.
+@@ -1325,7 +1335,7 @@ int kvm_gfn_to_pfn_cache_refresh(struct
+ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
+
+ /**
+- * kvm_gfn_to_pfn_cache_destroy - destroy and unlink a gfn_to_pfn_cache.
++ * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache.
+ *
+ * @kvm: pointer to kvm instance.
+ * @gpc: struct gfn_to_pfn_cache object.
+@@ -1333,7 +1343,7 @@ void kvm_gfn_to_pfn_cache_unmap(struct k
+ * This removes a cache from the @kvm's list to be processed on MMU notifier
+ * invocation.
+ */
+-void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
++void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
+
+ void kvm_sigset_activate(struct kvm_vcpu *vcpu);
+ void kvm_sigset_deactivate(struct kvm_vcpu *vcpu);
+--- a/virt/kvm/pfncache.c
++++ b/virt/kvm/pfncache.c
+@@ -346,17 +346,20 @@ void kvm_gfn_to_pfn_cache_unmap(struct k
+ }
+ EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_unmap);
+
++void kvm_gpc_init(struct gfn_to_pfn_cache *gpc)
++{
++ rwlock_init(&gpc->lock);
++ mutex_init(&gpc->refresh_lock);
++}
++EXPORT_SYMBOL_GPL(kvm_gpc_init);
+
+-int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+- struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
+- gpa_t gpa, unsigned long len)
++int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
++ struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
++ gpa_t gpa, unsigned long len)
+ {
+ WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage);
+
+ if (!gpc->active) {
+- rwlock_init(&gpc->lock);
+- mutex_init(&gpc->refresh_lock);
+-
+ gpc->khva = NULL;
+ gpc->pfn = KVM_PFN_ERR_FAULT;
+ gpc->uhva = KVM_HVA_ERR_BAD;
+@@ -371,9 +374,9 @@ int kvm_gfn_to_pfn_cache_init(struct kvm
+ }
+ return kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpa, len);
+ }
+-EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_init);
++EXPORT_SYMBOL_GPL(kvm_gpc_activate);
+
+-void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
++void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
+ {
+ if (gpc->active) {
+ spin_lock(&kvm->gpc_lock);
+@@ -384,4 +387,4 @@ void kvm_gfn_to_pfn_cache_destroy(struct
+ gpc->active = false;
+ }
+ }
+-EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_destroy);
++EXPORT_SYMBOL_GPL(kvm_gpc_deactivate);
--- /dev/null
+From ecbcf030b45666ad11bc98565e71dfbcb7be4393 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Thu, 13 Oct 2022 21:12:20 +0000
+Subject: KVM: Reject attempts to consume or refresh inactive gfn_to_pfn_cache
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit ecbcf030b45666ad11bc98565e71dfbcb7be4393 upstream.
+
+Reject kvm_gpc_check() and kvm_gpc_refresh() if the cache is inactive.
+Not checking the active flag during refresh is particularly egregious, as
+KVM can end up with a valid, inactive cache, which can lead to a variety
+of use-after-free bugs, e.g. consuming a NULL kernel pointer or missing
+an mmu_notifier invalidation due to the cache not being on the list of
+gfns to invalidate.
+
+Note, "active" needs to be set if and only if the cache is on the list
+of caches, i.e. is reachable via mmu_notifier events. If a relevant
+mmu_notifier event occurs while the cache is "active" but not on the
+list, KVM will not acquire the cache's lock and so will not serailize
+the mmu_notifier event with active users and/or kvm_gpc_refresh().
+
+A race between KVM_XEN_ATTR_TYPE_SHARED_INFO and KVM_XEN_HVM_EVTCHN_SEND
+can be exploited to trigger the bug.
+
+1. Deactivate shinfo cache:
+
+kvm_xen_hvm_set_attr
+case KVM_XEN_ATTR_TYPE_SHARED_INFO
+ kvm_gpc_deactivate
+ kvm_gpc_unmap
+ gpc->valid = false
+ gpc->khva = NULL
+ gpc->active = false
+
+Result: active = false, valid = false
+
+2. Cause cache refresh:
+
+kvm_arch_vm_ioctl
+case KVM_XEN_HVM_EVTCHN_SEND
+ kvm_xen_hvm_evtchn_send
+ kvm_xen_set_evtchn
+ kvm_xen_set_evtchn_fast
+ kvm_gpc_check
+ return -EWOULDBLOCK because !gpc->valid
+ kvm_xen_set_evtchn_fast
+ return -EWOULDBLOCK
+ kvm_gpc_refresh
+ hva_to_pfn_retry
+ gpc->valid = true
+ gpc->khva = not NULL
+
+Result: active = false, valid = true
+
+3. Race ioctl KVM_XEN_HVM_EVTCHN_SEND against ioctl
+KVM_XEN_ATTR_TYPE_SHARED_INFO:
+
+kvm_arch_vm_ioctl
+case KVM_XEN_HVM_EVTCHN_SEND
+ kvm_xen_hvm_evtchn_send
+ kvm_xen_set_evtchn
+ kvm_xen_set_evtchn_fast
+ read_lock gpc->lock
+ kvm_xen_hvm_set_attr case
+ KVM_XEN_ATTR_TYPE_SHARED_INFO
+ mutex_lock kvm->lock
+ kvm_xen_shared_info_init
+ kvm_gpc_activate
+ gpc->khva = NULL
+ kvm_gpc_check
+ [ Check passes because gpc->valid is
+ still true, even though gpc->khva
+ is already NULL. ]
+ shinfo = gpc->khva
+ pending_bits = shinfo->evtchn_pending
+ CRASH: test_and_set_bit(..., pending_bits)
+
+Fixes: 982ed0de4753 ("KVM: Reinstate gfn_to_pfn_cache with invalidation support")
+Cc: stable@vger.kernel.org
+Reported-by: Michal Luczaj <mhal@rbox.co>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20221013211234.1318131-3-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ virt/kvm/pfncache.c | 41 ++++++++++++++++++++++++++++++++++-------
+ 1 file changed, 34 insertions(+), 7 deletions(-)
+
+--- a/virt/kvm/pfncache.c
++++ b/virt/kvm/pfncache.c
+@@ -81,6 +81,9 @@ bool kvm_gfn_to_pfn_cache_check(struct k
+ {
+ struct kvm_memslots *slots = kvm_memslots(kvm);
+
++ if (!gpc->active)
++ return false;
++
+ if ((gpa & ~PAGE_MASK) + len > PAGE_SIZE)
+ return false;
+
+@@ -240,10 +243,11 @@ int kvm_gfn_to_pfn_cache_refresh(struct
+ {
+ struct kvm_memslots *slots = kvm_memslots(kvm);
+ unsigned long page_offset = gpa & ~PAGE_MASK;
+- kvm_pfn_t old_pfn, new_pfn;
++ bool unmap_old = false;
+ unsigned long old_uhva;
++ kvm_pfn_t old_pfn;
+ void *old_khva;
+- int ret = 0;
++ int ret;
+
+ /*
+ * If must fit within a single page. The 'len' argument is
+@@ -261,6 +265,11 @@ int kvm_gfn_to_pfn_cache_refresh(struct
+
+ write_lock_irq(&gpc->lock);
+
++ if (!gpc->active) {
++ ret = -EINVAL;
++ goto out_unlock;
++ }
++
+ old_pfn = gpc->pfn;
+ old_khva = gpc->khva - offset_in_page(gpc->khva);
+ old_uhva = gpc->uhva;
+@@ -291,6 +300,7 @@ int kvm_gfn_to_pfn_cache_refresh(struct
+ /* If the HVA→PFN mapping was already valid, don't unmap it. */
+ old_pfn = KVM_PFN_ERR_FAULT;
+ old_khva = NULL;
++ ret = 0;
+ }
+
+ out:
+@@ -305,14 +315,15 @@ int kvm_gfn_to_pfn_cache_refresh(struct
+ gpc->khva = NULL;
+ }
+
+- /* Snapshot the new pfn before dropping the lock! */
+- new_pfn = gpc->pfn;
++ /* Detect a pfn change before dropping the lock! */
++ unmap_old = (old_pfn != gpc->pfn);
+
++out_unlock:
+ write_unlock_irq(&gpc->lock);
+
+ mutex_unlock(&gpc->refresh_lock);
+
+- if (old_pfn != new_pfn)
++ if (unmap_old)
+ gpc_unmap_khva(kvm, old_pfn, old_khva);
+
+ return ret;
+@@ -366,11 +377,19 @@ int kvm_gpc_activate(struct kvm *kvm, st
+ gpc->vcpu = vcpu;
+ gpc->usage = usage;
+ gpc->valid = false;
+- gpc->active = true;
+
+ spin_lock(&kvm->gpc_lock);
+ list_add(&gpc->list, &kvm->gpc_list);
+ spin_unlock(&kvm->gpc_lock);
++
++ /*
++ * Activate the cache after adding it to the list, a concurrent
++ * refresh must not establish a mapping until the cache is
++ * reachable by mmu_notifier events.
++ */
++ write_lock_irq(&gpc->lock);
++ gpc->active = true;
++ write_unlock_irq(&gpc->lock);
+ }
+ return kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpa, len);
+ }
+@@ -379,12 +398,20 @@ EXPORT_SYMBOL_GPL(kvm_gpc_activate);
+ void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
+ {
+ if (gpc->active) {
++ /*
++ * Deactivate the cache before removing it from the list, KVM
++ * must stall mmu_notifier events until all users go away, i.e.
++ * until gpc->lock is dropped and refresh is guaranteed to fail.
++ */
++ write_lock_irq(&gpc->lock);
++ gpc->active = false;
++ write_unlock_irq(&gpc->lock);
++
+ spin_lock(&kvm->gpc_lock);
+ list_del(&gpc->list);
+ spin_unlock(&kvm->gpc_lock);
+
+ kvm_gfn_to_pfn_cache_unmap(kvm, gpc);
+- gpc->active = false;
+ }
+ }
+ EXPORT_SYMBOL_GPL(kvm_gpc_deactivate);
--- /dev/null
+From 145dfad998eac74abc59219d936e905766ba2d98 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Thu, 6 Oct 2022 00:03:08 +0000
+Subject: KVM: VMX: Advertise PMU LBRs if and only if perf supports LBRs
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 145dfad998eac74abc59219d936e905766ba2d98 upstream.
+
+Advertise LBR support to userspace via MSR_IA32_PERF_CAPABILITIES if and
+only if perf fully supports LBRs. Perf may disable LBRs (by zeroing the
+number of LBRs) even on platforms that allegedly support LBRs, e.g. if
+probing any LBR MSRs during setup fails.
+
+Fixes: be635e34c284 ("KVM: vmx/pmu: Expose LBR_FMT in the MSR_IA32_PERF_CAPABILITIES")
+Reported-by: Like Xu <like.xu.linux@gmail.com>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20221006000314.73240-3-seanjc@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx/capabilities.h | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/vmx/capabilities.h
++++ b/arch/x86/kvm/vmx/capabilities.h
+@@ -404,6 +404,7 @@ static inline bool vmx_pebs_supported(vo
+ static inline u64 vmx_get_perf_capabilities(void)
+ {
+ u64 perf_cap = PMU_CAP_FW_WRITES;
++ struct x86_pmu_lbr lbr;
+ u64 host_perf_cap = 0;
+
+ if (!enable_pmu)
+@@ -412,7 +413,8 @@ static inline u64 vmx_get_perf_capabilit
+ if (boot_cpu_has(X86_FEATURE_PDCM))
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
+
+- perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
++ if (x86_perf_get_lbr(&lbr) >= 0 && lbr.nr)
++ perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
+
+ if (vmx_pebs_supported()) {
+ perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
--- /dev/null
+From 18e897d213cb152c786abab14919196bd9dc3a9f Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Thu, 6 Oct 2022 00:03:09 +0000
+Subject: KVM: VMX: Fold vmx_supported_debugctl() into vcpu_supported_debugctl()
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 18e897d213cb152c786abab14919196bd9dc3a9f upstream.
+
+Fold vmx_supported_debugctl() into vcpu_supported_debugctl(), its only
+caller. Setting bits only to clear them a few instructions later is
+rather silly, and splitting the logic makes things seem more complicated
+than they actually are.
+
+Opportunistically drop DEBUGCTLMSR_LBR_MASK now that there's a single
+reference to the pair of bits. The extra layer of indirection provides
+no meaningful value and makes it unnecessarily tedious to understand
+what KVM is doing.
+
+No functional change.
+
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20221006000314.73240-4-seanjc@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx/capabilities.h | 15 ---------------
+ arch/x86/kvm/vmx/vmx.c | 12 +++++++-----
+ 2 files changed, 7 insertions(+), 20 deletions(-)
+
+--- a/arch/x86/kvm/vmx/capabilities.h
++++ b/arch/x86/kvm/vmx/capabilities.h
+@@ -24,8 +24,6 @@ extern int __read_mostly pt_mode;
+ #define PMU_CAP_FW_WRITES (1ULL << 13)
+ #define PMU_CAP_LBR_FMT 0x3f
+
+-#define DEBUGCTLMSR_LBR_MASK (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI)
+-
+ struct nested_vmx_msrs {
+ /*
+ * We only store the "true" versions of the VMX capability MSRs. We
+@@ -425,19 +423,6 @@ static inline u64 vmx_get_perf_capabilit
+ return perf_cap;
+ }
+
+-static inline u64 vmx_supported_debugctl(void)
+-{
+- u64 debugctl = 0;
+-
+- if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+- debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+-
+- if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)
+- debugctl |= DEBUGCTLMSR_LBR_MASK;
+-
+- return debugctl;
+-}
+-
+ static inline bool cpu_has_notify_vmexit(void)
+ {
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2018,13 +2018,15 @@ static u64 nested_vmx_truncate_sysenter_
+
+ static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
+ {
+- u64 debugctl = vmx_supported_debugctl();
++ u64 debugctl = 0;
+
+- if (!intel_pmu_lbr_is_enabled(vcpu))
+- debugctl &= ~DEBUGCTLMSR_LBR_MASK;
++ if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
++ guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
++ debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+
+- if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
+- debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
++ if ((vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) &&
++ intel_pmu_lbr_is_enabled(vcpu))
++ debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+
+ return debugctl;
+ }
--- /dev/null
+From 1c1a41497ab879ac9608f3047f230af833eeef3d Mon Sep 17 00:00:00 2001
+From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
+Date: Tue, 25 Oct 2022 08:37:49 -0400
+Subject: KVM: VMX: fully disable SGX if SECONDARY_EXEC_ENCLS_EXITING unavailable
+
+From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
+
+commit 1c1a41497ab879ac9608f3047f230af833eeef3d upstream.
+
+Clear enable_sgx if ENCLS-exiting is not supported, i.e. if SGX cannot be
+virtualized. When KVM is loaded, adjust_vmx_controls checks that the
+bit is available before enabling the feature; however, other parts of the
+code check enable_sgx, and not clearing the variable caused two different
+bugs, mostly affecting nested virtualization scenarios.
+
+First, because enable_sgx remained true, SECONDARY_EXEC_ENCLS_EXITING
+would be marked available in the capability MSR that are accessed by a
+nested hypervisor. KVM would then propagate the control from vmcs12
+to vmcs02 even if it isn't supported by the processor, thus causing an
+unexpected VM-Fail (exit code 0x7) in L1.
+
+Second, vmx_set_cpu_caps() would not clear the SGX bits when hardware
+support is unavailable. This is a much less problematic bug as it only
+happens if SGX is soft-disabled (available in the processor but hidden
+in CPUID) or if SGX is supported for bare metal but not in the VMCS
+(will never happen when running on bare metal, but can theoretically
+happen when running in a VM).
+
+Last but not least, this ensures that module params in sysfs reflect
+KVM's actual configuration.
+
+RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=2127128
+Fixes: 72add915fbd5 ("KVM: VMX: Enable SGX virtualization for SGX1, SGX2 and LC")
+Cc: stable@vger.kernel.org
+Suggested-by: Sean Christopherson <seanjc@google.com>
+Suggested-by: Bandan Das <bsd@redhat.com>
+Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
+Message-Id: <20221025123749.2201649-1-eesposit@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -8281,6 +8281,11 @@ static __init int hardware_setup(void)
+ if (!cpu_has_virtual_nmis())
+ enable_vnmi = 0;
+
++#ifdef CONFIG_X86_SGX_KVM
++ if (!cpu_has_vmx_encls_vmexit())
++ enable_sgx = false;
++#endif
++
+ /*
+ * set_apic_access_page_addr() is used to reload apic access
+ * page upon invalidation. No need to do anything if not
--- /dev/null
+From b333b8ebb85d62469f32b52fa03fd7d1522afc03 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Thu, 6 Oct 2022 00:03:10 +0000
+Subject: KVM: VMX: Ignore guest CPUID for host userspace writes to DEBUGCTL
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit b333b8ebb85d62469f32b52fa03fd7d1522afc03 upstream.
+
+Ignore guest CPUID for host userspace writes to the DEBUGCTL MSR; KVM's
+ABI is that setting CPUID vs. state can be done in any order, i.e. KVM
+allows userspace to stuff MSRs prior to setting the guest's CPUID that
+makes the new MSR "legal".
+
+Keep the vmx_get_perf_capabilities() check for guest writes, even though
+it's technically unnecessary since the vCPU's PERF_CAPABILITIES is
+consulted when refreshing LBR support. A future patch will clean up
+vmx_get_perf_capabilities() to avoid the RDMSR on every call, at which
+point the paranoia will incur no meaningful overhead.
+
+Note, prior to vmx_get_perf_capabilities() checking that the host fully
+supports LBRs via x86_perf_get_lbr(), KVM effectively relied on
+intel_pmu_lbr_is_enabled() to guard against host userspace enabling LBRs
+on platforms without full support.
+
+Fixes: c646236344e9 ("KVM: vmx/pmu: Add PMU_CAP_LBR_FMT check when guest LBR is enabled")
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20221006000314.73240-5-seanjc@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx/vmx.c | 10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2016,16 +2016,16 @@ static u64 nested_vmx_truncate_sysenter_
+ return (unsigned long)data;
+ }
+
+-static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
++static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
+ {
+ u64 debugctl = 0;
+
+ if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
+- guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
++ (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
+ debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+
+ if ((vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) &&
+- intel_pmu_lbr_is_enabled(vcpu))
++ (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
+ debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+
+ return debugctl;
+@@ -2100,7 +2100,9 @@ static int vmx_set_msr(struct kvm_vcpu *
+ vmcs_writel(GUEST_SYSENTER_ESP, data);
+ break;
+ case MSR_IA32_DEBUGCTLMSR: {
+- u64 invalid = data & ~vcpu_supported_debugctl(vcpu);
++ u64 invalid;
++
++ invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+ if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
--- /dev/null
+From 5015bb89b58225f97df6ac44383e7e8c8662c8c9 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Tue, 25 Oct 2022 15:47:28 +0300
+Subject: KVM: x86: emulator: em_sysexit should update ctxt->mode
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit 5015bb89b58225f97df6ac44383e7e8c8662c8c9 upstream.
+
+SYSEXIT is one of the instructions that can change the
+processor mode, thus ctxt->mode should be updated after it.
+
+Note that this is likely a benign bug, because the only problematic
+mode change is from 32 bit to 64 bit which can lead to truncation of RIP,
+and that is not possible with SYSEXIT, since a SYSEXIT executed in
+32-bit mode is limited to the 32-bit version of the instruction.
+
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221025124741.228045-11-mlevitsk@redhat.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/emulate.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -2874,6 +2874,7 @@ static int em_sysexit(struct x86_emulate
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
+
+ ctxt->_eip = rdx;
++ ctxt->mode = usermode;
+ *reg_write(ctxt, VCPU_REGS_RSP) = rcx;
+
+ return X86EMUL_CONTINUE;
--- /dev/null
+From d087e0f79fa0dd336a9a6b2f79ec23120f5eff73 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Tue, 25 Oct 2022 15:47:29 +0300
+Subject: KVM: x86: emulator: introduce emulator_recalc_and_set_mode
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit d087e0f79fa0dd336a9a6b2f79ec23120f5eff73 upstream.
+
+Some instructions update the CPU execution mode, which in turn
+requires the emulation mode to be updated.
+
+Extract this code, and make assign_eip_far use it.
+
+assign_eip_far now reads CS, instead of getting it via a parameter,
+which is ok, because callers always assign CS to the same value
+before calling this function.
+
+No functional change is intended.
+
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221025124741.228045-12-mlevitsk@redhat.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/emulate.c | 85 ++++++++++++++++++++++++++++++++-----------------
+ 1 file changed, 57 insertions(+), 28 deletions(-)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -791,8 +791,7 @@ static int linearize(struct x86_emulate_
+ ctxt->mode, linear);
+ }
+
+-static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst,
+- enum x86emul_mode mode)
++static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst)
+ {
+ ulong linear;
+ int rc;
+@@ -802,41 +801,71 @@ static inline int assign_eip(struct x86_
+
+ if (ctxt->op_bytes != sizeof(unsigned long))
+ addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1);
+- rc = __linearize(ctxt, addr, &max_size, 1, false, true, mode, &linear);
++ rc = __linearize(ctxt, addr, &max_size, 1, false, true, ctxt->mode, &linear);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->_eip = addr.ea;
+ return rc;
+ }
+
++static inline int emulator_recalc_and_set_mode(struct x86_emulate_ctxt *ctxt)
++{
++ u64 efer;
++ struct desc_struct cs;
++ u16 selector;
++ u32 base3;
++
++ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
++
++ if (!(ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PE)) {
++ /* Real mode. cpu must not have long mode active */
++ if (efer & EFER_LMA)
++ return X86EMUL_UNHANDLEABLE;
++ ctxt->mode = X86EMUL_MODE_REAL;
++ return X86EMUL_CONTINUE;
++ }
++
++ if (ctxt->eflags & X86_EFLAGS_VM) {
++ /* Protected/VM86 mode. cpu must not have long mode active */
++ if (efer & EFER_LMA)
++ return X86EMUL_UNHANDLEABLE;
++ ctxt->mode = X86EMUL_MODE_VM86;
++ return X86EMUL_CONTINUE;
++ }
++
++ if (!ctxt->ops->get_segment(ctxt, &selector, &cs, &base3, VCPU_SREG_CS))
++ return X86EMUL_UNHANDLEABLE;
++
++ if (efer & EFER_LMA) {
++ if (cs.l) {
++ /* Proper long mode */
++ ctxt->mode = X86EMUL_MODE_PROT64;
++ } else if (cs.d) {
++ /* 32 bit compatibility mode*/
++ ctxt->mode = X86EMUL_MODE_PROT32;
++ } else {
++ ctxt->mode = X86EMUL_MODE_PROT16;
++ }
++ } else {
++ /* Legacy 32 bit / 16 bit mode */
++ ctxt->mode = cs.d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
++ }
++
++ return X86EMUL_CONTINUE;
++}
++
+ static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst)
+ {
+- return assign_eip(ctxt, dst, ctxt->mode);
++ return assign_eip(ctxt, dst);
+ }
+
+-static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
+- const struct desc_struct *cs_desc)
++static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst)
+ {
+- enum x86emul_mode mode = ctxt->mode;
+- int rc;
++ int rc = emulator_recalc_and_set_mode(ctxt);
+
+-#ifdef CONFIG_X86_64
+- if (ctxt->mode >= X86EMUL_MODE_PROT16) {
+- if (cs_desc->l) {
+- u64 efer = 0;
++ if (rc != X86EMUL_CONTINUE)
++ return rc;
+
+- ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+- if (efer & EFER_LMA)
+- mode = X86EMUL_MODE_PROT64;
+- } else
+- mode = X86EMUL_MODE_PROT32; /* temporary value */
+- }
+-#endif
+- if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32)
+- mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+- rc = assign_eip(ctxt, dst, mode);
+- if (rc == X86EMUL_CONTINUE)
+- ctxt->mode = mode;
+- return rc;
++ return assign_eip(ctxt, dst);
+ }
+
+ static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
+@@ -2170,7 +2199,7 @@ static int em_jmp_far(struct x86_emulate
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+- rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
++ rc = assign_eip_far(ctxt, ctxt->src.val);
+ /* Error handling is not implemented. */
+ if (rc != X86EMUL_CONTINUE)
+ return X86EMUL_UNHANDLEABLE;
+@@ -2248,7 +2277,7 @@ static int em_ret_far(struct x86_emulate
+ &new_desc);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+- rc = assign_eip_far(ctxt, eip, &new_desc);
++ rc = assign_eip_far(ctxt, eip);
+ /* Error handling is not implemented. */
+ if (rc != X86EMUL_CONTINUE)
+ return X86EMUL_UNHANDLEABLE;
+@@ -3468,7 +3497,7 @@ static int em_call_far(struct x86_emulat
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+- rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
++ rc = assign_eip_far(ctxt, ctxt->src.val);
+ if (rc != X86EMUL_CONTINUE)
+ goto fail;
+
--- /dev/null
+From ad8f9e69942c7db90758d9d774157e53bce94840 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Tue, 25 Oct 2022 15:47:31 +0300
+Subject: KVM: x86: emulator: update the emulation mode after CR0 write
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit ad8f9e69942c7db90758d9d774157e53bce94840 upstream.
+
+Update the emulation mode when handling writes to CR0, because
+toggling CR0.PE switches between Real and Protected Mode, and toggling
+CR0.PG when EFER.LME=1 switches between Long and Protected Mode.
+
+This is likely a benign bug because there is no writeback of state,
+other than the RIP increment, and when toggling CR0.PE, the CPU has
+to execute code from a very low memory address.
+
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221025124741.228045-14-mlevitsk@redhat.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/emulate.c | 16 +++++++++++++++-
+ 1 file changed, 15 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -3639,11 +3639,25 @@ static int em_movbe(struct x86_emulate_c
+
+ static int em_cr_write(struct x86_emulate_ctxt *ctxt)
+ {
+- if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
++ int cr_num = ctxt->modrm_reg;
++ int r;
++
++ if (ctxt->ops->set_cr(ctxt, cr_num, ctxt->src.val))
+ return emulate_gp(ctxt, 0);
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
++
++ if (cr_num == 0) {
++ /*
++ * CR0 write might have updated CR0.PE and/or CR0.PG
++ * which can affect the cpu's execution mode.
++ */
++ r = emulator_recalc_and_set_mode(ctxt);
++ if (r != X86EMUL_CONTINUE)
++ return r;
++ }
++
+ return X86EMUL_CONTINUE;
+ }
+
--- /dev/null
+From 055f37f84e304e59c046d1accfd8f08462f52c4c Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Tue, 25 Oct 2022 15:47:30 +0300
+Subject: KVM: x86: emulator: update the emulation mode after rsm
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit 055f37f84e304e59c046d1accfd8f08462f52c4c upstream.
+
+Update the emulation mode after RSM so that RIP will be correctly
+written back, because the RSM instruction can switch the CPU mode from
+32 bit (or less) to 64 bit.
+
+This fixes a guest crash in case the #SMI is received while the guest
+runs code from an address that does not fit in 32 bits.
+
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20221025124741.228045-13-mlevitsk@redhat.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/emulate.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -2660,7 +2660,7 @@ static int em_rsm(struct x86_emulate_ctx
+ * those side effects need to be explicitly handled for both success
+ * and shutdown.
+ */
+- return X86EMUL_CONTINUE;
++ return emulator_recalc_and_set_mode(ctxt);
+
+ emulate_shutdown:
+ ctxt->ops->triple_fault(ctxt);
--- /dev/null
+From 0469e56a14bf8cfb80507e51b7aeec0332cdbc13 Mon Sep 17 00:00:00 2001
+From: Jim Mattson <jmattson@google.com>
+Date: Fri, 30 Sep 2022 00:51:58 +0200
+Subject: KVM: x86: Mask off reserved bits in CPUID.80000001H
+
+From: Jim Mattson <jmattson@google.com>
+
+commit 0469e56a14bf8cfb80507e51b7aeec0332cdbc13 upstream.
+
+KVM_GET_SUPPORTED_CPUID should only enumerate features that KVM
+actually supports. CPUID.80000001:EBX[27:16] are reserved bits and
+should be masked off.
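+
+For readers unfamiliar with the helper used below: GENMASK(h, l)
+simply yields a value with bits h..l set. A minimal user-space
+equivalent (MY_GENMASK is a made-up stand-in, shown only to illustrate
+what the one-line fix does) would be:
+
+	/* Hypothetical stand-in for the kernel's GENMASK() macro (32-bit). */
+	#define MY_GENMASK(h, l)  (((~0u) >> (31 - (h))) & ((~0u) << (l)))
+
+	/* Clearing the reserved bits: entry->ebx &= ~MY_GENMASK(27, 16); */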
+
+Fixes: 0771671749b5 ("KVM: Enhance guest cpuid management")
+Signed-off-by: Jim Mattson <jmattson@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/cpuid.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -1117,6 +1117,7 @@ static inline int __do_cpuid_func(struct
+ entry->eax = max(entry->eax, 0x80000021);
+ break;
+ case 0x80000001:
++ entry->ebx &= ~GENMASK(27, 16);
+ cpuid_entry_override(entry, CPUID_8000_0001_EDX);
+ cpuid_entry_override(entry, CPUID_8000_0001_ECX);
+ break;
--- /dev/null
+From eeb69eab57c6604ac90b3fd8e5ac43f24a5535b1 Mon Sep 17 00:00:00 2001
+From: Jim Mattson <jmattson@google.com>
+Date: Thu, 29 Sep 2022 15:51:59 -0700
+Subject: KVM: x86: Mask off reserved bits in CPUID.80000006H
+
+From: Jim Mattson <jmattson@google.com>
+
+commit eeb69eab57c6604ac90b3fd8e5ac43f24a5535b1 upstream.
+
+KVM_GET_SUPPORTED_CPUID should only enumerate features that KVM
+actually supports. CPUID.80000006H:EDX[17:16] are reserved bits and
+should be masked off.
+
+Fixes: 43d05de2bee7 ("KVM: pass through CPUID(0x80000006)")
+Signed-off-by: Jim Mattson <jmattson@google.com>
+Message-Id: <20220929225203.2234702-2-jmattson@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/cpuid.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -1121,7 +1121,8 @@ static inline int __do_cpuid_func(struct
+ cpuid_entry_override(entry, CPUID_8000_0001_ECX);
+ break;
+ case 0x80000006:
+- /* L2 cache and TLB: pass through host info. */
++ /* Drop reserved bits, pass host L2 cache and TLB info. */
++ entry->edx &= ~GENMASK(17, 16);
+ break;
+ case 0x80000007: /* Advanced power management */
+ /* invariant TSC is CPUID.80000007H:EDX[8] */
--- /dev/null
+From 7030d8530e533844e2f4b0e7476498afcd324634 Mon Sep 17 00:00:00 2001
+From: Jim Mattson <jmattson@google.com>
+Date: Thu, 29 Sep 2022 15:52:00 -0700
+Subject: KVM: x86: Mask off reserved bits in CPUID.80000008H
+
+From: Jim Mattson <jmattson@google.com>
+
+commit 7030d8530e533844e2f4b0e7476498afcd324634 upstream.
+
+KVM_GET_SUPPORTED_CPUID should only enumerate features that KVM
+actually supports. The following ranges of CPUID.80000008H are reserved
+and should be masked off:
+ ECX[31:18]
+ ECX[11:8]
+
+In addition, the PerfTscSize field at ECX[17:16] should also be zero
+because KVM does not set the PERFTSC bit at CPUID.80000001H.ECX[27].
+
+Fixes: 24c82e576b78 ("KVM: Sanitize cpuid")
+Signed-off-by: Jim Mattson <jmattson@google.com>
+Message-Id: <20220929225203.2234702-3-jmattson@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/cpuid.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -1152,6 +1152,7 @@ static inline int __do_cpuid_func(struct
+ g_phys_as = phys_as;
+
+ entry->eax = g_phys_as | (virt_as << 8);
++ entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8));
+ entry->edx = 0;
+ cpuid_entry_override(entry, CPUID_8000_0008_EBX);
+ break;
--- /dev/null
+From 079f6889818dd07903fb36c252532ab47ebb6d48 Mon Sep 17 00:00:00 2001
+From: Jim Mattson <jmattson@google.com>
+Date: Thu, 29 Sep 2022 15:52:01 -0700
+Subject: KVM: x86: Mask off reserved bits in CPUID.8000001AH
+
+From: Jim Mattson <jmattson@google.com>
+
+commit 079f6889818dd07903fb36c252532ab47ebb6d48 upstream.
+
+KVM_GET_SUPPORTED_CPUID should only enumerate features that KVM
+actually supports. In the case of CPUID.8000001AH, only three bits are
+currently defined. The 125 reserved bits should be masked off.
+
+Fixes: 24c82e576b78 ("KVM: Sanitize cpuid")
+Signed-off-by: Jim Mattson <jmattson@google.com>
+Message-Id: <20220929225203.2234702-4-jmattson@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/cpuid.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -1171,6 +1171,9 @@ static inline int __do_cpuid_func(struct
+ entry->ecx = entry->edx = 0;
+ break;
+ case 0x8000001a:
++ entry->eax &= GENMASK(2, 0);
++ entry->ebx = entry->ecx = entry->edx = 0;
++ break;
+ case 0x8000001e:
+ break;
+ case 0x8000001F:
--- /dev/null
+From 86c4f0d547f6460d0426ebb3ba0614f1134b8cda Mon Sep 17 00:00:00 2001
+From: Jim Mattson <jmattson@google.com>
+Date: Thu, 29 Sep 2022 15:52:03 -0700
+Subject: KVM: x86: Mask off reserved bits in CPUID.8000001FH
+
+From: Jim Mattson <jmattson@google.com>
+
+commit 86c4f0d547f6460d0426ebb3ba0614f1134b8cda upstream.
+
+KVM_GET_SUPPORTED_CPUID should only enumerate features that KVM
+actually supports. CPUID.8000001FH:EBX[31:16] are reserved bits and
+should be masked off.
+
+Fixes: 8765d75329a3 ("KVM: X86: Extend CPUID range to include new leaf")
+Signed-off-by: Jim Mattson <jmattson@google.com>
+Message-Id: <20220929225203.2234702-6-jmattson@google.com>
+Cc: stable@vger.kernel.org
+[Clear NumVMPL too. - Paolo]
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/cpuid.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -1183,7 +1183,8 @@ static inline int __do_cpuid_func(struct
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ } else {
+ cpuid_entry_override(entry, CPUID_8000_001F_EAX);
+-
++ /* Clear NumVMPL since KVM does not support VMPL. */
++ entry->ebx &= ~GENMASK(31, 12);
+ /*
+ * Enumerate '0' for "PA bits reduction", the adjusted
+ * MAXPHYADDR is enumerated directly (see 0x80000008).
--- /dev/null
+From 696db303e54f7352623d9f640e6c51d8fa9d5588 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Tue, 25 Oct 2022 15:47:32 +0300
+Subject: KVM: x86: smm: number of GPRs in the SMRAM image depends on the image format
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit 696db303e54f7352623d9f640e6c51d8fa9d5588 upstream.
+
+On a 64-bit host, if the guest doesn't have X86_FEATURE_LM, KVM will
+access 16 GPRs in the 32-bit SMRAM image, causing an out-of-bounds RAM
+access.
+
+On a 32-bit host, rsm_load_state_64()/enter_smm_save_state_64() are
+compiled out, so this access overflow can't happen there.
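+
+To make the overflow concrete, here is a small illustrative userspace
+sketch (it assumes, as the emulator's GET_SMSTATE() helper suggests,
+that the state-save buffer covers offsets 0x7e00-0x7fff, i.e. 512
+bytes).  The 32-bit image stores GPR i at offset 0x7fd0 + i * 4, so
+iterating over 16 registers first runs into unrelated save-state fields
+(EIP, EFLAGS, ...) and then past the buffer entirely:
+
+	#include <stdio.h>
+
+	#define SMRAM_BASE 0x7e00	/* assumed start of the save-state area */
+	#define SMRAM_SIZE 0x200	/* assumed 512-byte buffer */
+
+	int main(void)
+	{
+		for (int i = 0; i < 16; i++) {
+			unsigned int off = 0x7fd0 + i * 4;
+
+			printf("gpr %2d at 0x%04x%s\n", i, off,
+			       off + 4 > SMRAM_BASE + SMRAM_SIZE ?
+			       "  <-- past the 512-byte buffer" : "");
+		}
+		return 0;
+	}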
+
+Fixes: b443183a25ab61 ("KVM: x86: Reduce the number of emulator GPRs to '8' for 32-bit KVM")
+
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20221025124741.228045-15-mlevitsk@redhat.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/emulate.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -2430,7 +2430,7 @@ static int rsm_load_state_32(struct x86_
+ ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED;
+ ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0);
+
+- for (i = 0; i < NR_EMULATOR_GPRS; i++)
++ for (i = 0; i < 8; i++)
+ *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4);
+
+ val = GET_SMSTATE(u32, smstate, 0x7fcc);
+@@ -2487,7 +2487,7 @@ static int rsm_load_state_64(struct x86_
+ u16 selector;
+ int i, r;
+
+- for (i = 0; i < NR_EMULATOR_GPRS; i++)
++ for (i = 0; i < 16; i++)
+ *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8);
+
+ ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78);
--- /dev/null
+From 2b6ae0962b421103feb41a80406732944b0665b3 Mon Sep 17 00:00:00 2001
+From: Helge Deller <deller@gmx.de>
+Date: Fri, 28 Oct 2022 18:12:49 +0200
+Subject: parisc: Avoid printing the hardware path twice
+
+From: Helge Deller <deller@gmx.de>
+
+commit 2b6ae0962b421103feb41a80406732944b0665b3 upstream.
+
+Avoid showing the hardware path twice in the kernel log, and clean up
+the output so the version numbers appear in the same order as they are
+listed in the hardware database in hardware.c.  Additionally, optimize
+the memory footprint of the hardware database and mark some code as
+init code.
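+
+As a rough illustration of the footprint change (a userspace sketch,
+not kernel code; the exact sizes depend on the compiler and ABI, and
+the old layout additionally pays alignment padding around the
+'unsigned long' bitfield):
+
+	#include <stdio.h>
+
+	struct hp_hardware_old {		/* layout before this patch */
+		unsigned short hw_type:5;
+		unsigned short hversion;
+		unsigned long sversion:28;
+		unsigned short opt;
+		const char name[80];
+	};
+
+	struct hp_hardware_new {		/* layout after this patch */
+		unsigned int hw_type:8;
+		unsigned int hversion:12;
+		unsigned int sversion:12;
+		unsigned char opt;
+		unsigned char name[59];
+	} __attribute__((__packed__));		/* __packed in kernel terms */
+
+	int main(void)
+	{
+		printf("old: %zu bytes per entry\n", sizeof(struct hp_hardware_old));
+		printf("new: %zu bytes per entry\n", sizeof(struct hp_hardware_new));
+		return 0;
+	}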
+
+Fixes: cab56b51ec0e ("parisc: Fix device names in /proc/iomem")
+Signed-off-by: Helge Deller <deller@gmx.de>
+Cc: <stable@vger.kernel.org> # v4.9+
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/parisc/include/asm/hardware.h | 12 ++++++------
+ arch/parisc/kernel/drivers.c | 14 ++++++--------
+ 2 files changed, 12 insertions(+), 14 deletions(-)
+
+--- a/arch/parisc/include/asm/hardware.h
++++ b/arch/parisc/include/asm/hardware.h
+@@ -10,12 +10,12 @@
+ #define SVERSION_ANY_ID PA_SVERSION_ANY_ID
+
+ struct hp_hardware {
+- unsigned short hw_type:5; /* HPHW_xxx */
+- unsigned short hversion;
+- unsigned long sversion:28;
+- unsigned short opt;
+- const char name[80]; /* The hardware description */
+-};
++ unsigned int hw_type:8; /* HPHW_xxx */
++ unsigned int hversion:12;
++ unsigned int sversion:12;
++ unsigned char opt;
++ unsigned char name[59]; /* The hardware description */
++} __packed;
+
+ struct parisc_device;
+
+--- a/arch/parisc/kernel/drivers.c
++++ b/arch/parisc/kernel/drivers.c
+@@ -882,15 +882,13 @@ void __init walk_central_bus(void)
+ &root);
+ }
+
+-static void print_parisc_device(struct parisc_device *dev)
++static __init void print_parisc_device(struct parisc_device *dev)
+ {
+- char hw_path[64];
+- static int count;
++ static int count __initdata;
+
+- print_pa_hwpath(dev, hw_path);
+- pr_info("%d. %s at %pap [%s] { %d, 0x%x, 0x%.3x, 0x%.5x }",
+- ++count, dev->name, &(dev->hpa.start), hw_path, dev->id.hw_type,
+- dev->id.hversion_rev, dev->id.hversion, dev->id.sversion);
++ pr_info("%d. %s at %pap { type:%d, hv:%#x, sv:%#x, rev:%#x }",
++ ++count, dev->name, &(dev->hpa.start), dev->id.hw_type,
++ dev->id.hversion, dev->id.sversion, dev->id.hversion_rev);
+
+ if (dev->num_addrs) {
+ int k;
+@@ -1079,7 +1077,7 @@ static __init int qemu_print_iodc_data(s
+
+
+
+-static int print_one_device(struct device * dev, void * data)
++static __init int print_one_device(struct device * dev, void * data)
+ {
+ struct parisc_device * pdev = to_parisc_device(dev);
+
--- /dev/null
+From a0c9f1f2e53b8eb2ae43987a30e547ba56b4fa18 Mon Sep 17 00:00:00 2001
+From: Helge Deller <deller@gmx.de>
+Date: Thu, 27 Oct 2022 09:12:05 +0200
+Subject: parisc: Export iosapic_serial_irq() symbol for serial port driver
+
+From: Helge Deller <deller@gmx.de>
+
+commit a0c9f1f2e53b8eb2ae43987a30e547ba56b4fa18 upstream.
+
+The parisc serial port driver needs this symbol when it is compiled
+as a module.
+
+Signed-off-by: Helge Deller <deller@gmx.de>
+Reported-by: kernel test robot <lkp@intel.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/parisc/iosapic.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/parisc/iosapic.c
++++ b/drivers/parisc/iosapic.c
+@@ -866,6 +866,7 @@ int iosapic_serial_irq(struct parisc_dev
+
+ return vi->txn_irq;
+ }
++EXPORT_SYMBOL(iosapic_serial_irq);
+ #endif
+
+
--- /dev/null
+From e8a18e3f00f3ee8d07c17ab1ea3ad4df4a3b6fe0 Mon Sep 17 00:00:00 2001
+From: Helge Deller <deller@gmx.de>
+Date: Fri, 21 Oct 2022 07:44:49 +0200
+Subject: parisc: Make 8250_gsc driver dependend on CONFIG_PARISC
+
+From: Helge Deller <deller@gmx.de>
+
+commit e8a18e3f00f3ee8d07c17ab1ea3ad4df4a3b6fe0 upstream.
+
+Although the name of the driver 8250_gsc.c suggests that it handles
+only serial ports on the GSC bus, it does handle serial ports listed
+in the parisc machine inventory as well, e.g. the serial ports in a
+C8000 PCI-only workstation.
+
+Change the dependency to CONFIG_PARISC, so that the driver gets included
+in the kernel even if CONFIG_GSC isn't set.
+
+Reported-by: Mikulas Patocka <mpatocka@redhat.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Helge Deller <deller@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/tty/serial/8250/Kconfig | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/tty/serial/8250/Kconfig
++++ b/drivers/tty/serial/8250/Kconfig
+@@ -118,7 +118,7 @@ config SERIAL_8250_CONSOLE
+
+ config SERIAL_8250_GSC
+ tristate
+- depends on SERIAL_8250 && GSC
++ depends on SERIAL_8250 && PARISC
+ default SERIAL_8250
+
+ config SERIAL_8250_DMA
perf-x86-intel-fix-pebs-event-constraints-for-spr.patch
net-remove-sock_support_zc-from-sockmap.patch
net-also-flag-accepted-sockets-supporting-msghdr-originated-zerocopy.patch
+parisc-make-8250_gsc-driver-dependend-on-config_parisc.patch
+parisc-export-iosapic_serial_irq-symbol-for-serial-port-driver.patch
+parisc-avoid-printing-the-hardware-path-twice.patch
+ext4-fix-warning-in-ext4_da_release_space.patch
+ext4-fix-bug_on-when-directory-entry-has-invalid-rec_len.patch
+ext4-update-the-backup-superblock-s-at-the-end-of-the-online-resize.patch
+x86-tdx-prepare-for-using-info-call-for-a-second-purpose.patch
+x86-tdx-panic-on-bad-configs-that-ve-on-private-memory-access.patch
+x86-syscall-include-asm-ptrace.h-in-syscall_wrapper-header.patch
+kvm-x86-mask-off-reserved-bits-in-cpuid.80000006h.patch
+kvm-x86-mask-off-reserved-bits-in-cpuid.8000001ah.patch
+kvm-x86-mask-off-reserved-bits-in-cpuid.80000008h.patch
+kvm-x86-mask-off-reserved-bits-in-cpuid.80000001h.patch
+kvm-x86-mask-off-reserved-bits-in-cpuid.8000001fh.patch
+kvm-vmx-advertise-pmu-lbrs-if-and-only-if-perf-supports-lbrs.patch
+kvm-vmx-fold-vmx_supported_debugctl-into-vcpu_supported_debugctl.patch
+kvm-vmx-ignore-guest-cpuid-for-host-userspace-writes-to-debugctl.patch
+kvm-vmx-fully-disable-sgx-if-secondary_exec_encls_exiting-unavailable.patch
+kvm-initialize-gfn_to_pfn_cache-locks-in-dedicated-helper.patch
+kvm-reject-attempts-to-consume-or-refresh-inactive-gfn_to_pfn_cache.patch
+kvm-arm64-fix-bad-dereference-on-mte-enabled-systems.patch
+kvm-arm64-fix-smpri_el1-tpidr2_el0-trapping-on-vhe.patch
+kvm-x86-smm-number-of-gprs-in-the-smram-image-depends-on-the-image-format.patch
+kvm-x86-emulator-em_sysexit-should-update-ctxt-mode.patch
+kvm-x86-emulator-introduce-emulator_recalc_and_set_mode.patch
+kvm-x86-emulator-update-the-emulation-mode-after-rsm.patch
+kvm-x86-emulator-update-the-emulation-mode-after-cr0-write.patch
--- /dev/null
+From 9440c42941606af4c379afa3cf8624f0dc43a629 Mon Sep 17 00:00:00 2001
+From: Jiri Olsa <olsajiri@gmail.com>
+Date: Tue, 18 Oct 2022 14:27:08 +0200
+Subject: x86/syscall: Include asm/ptrace.h in syscall_wrapper header
+
+From: Jiri Olsa <olsajiri@gmail.com>
+
+commit 9440c42941606af4c379afa3cf8624f0dc43a629 upstream.
+
+With just the forward declaration of the 'struct pt_regs' in
+syscall_wrapper.h, the syscall stub functions:
+
+ __[x64|ia32]_sys_*(struct pt_regs *regs)
+
+will have different definitions of the 'regs' argument in the BTF
+data, depending on which object file they are defined in.
+
+If the syscall's object file includes the 'struct pt_regs' definition,
+the BTF argument data will point to a 'struct pt_regs' record,
+like:
+
+ [226] STRUCT 'pt_regs' size=168 vlen=21
+ 'r15' type_id=1 bits_offset=0
+ 'r14' type_id=1 bits_offset=64
+ 'r13' type_id=1 bits_offset=128
+ ...
+
+If not, it will point to a fwd declaration record:
+
+ [15439] FWD 'pt_regs' fwd_kind=struct
+
+and make BPF tracing programs that hook those functions unable
+to access fields from 'struct pt_regs'.
+
+Include asm/ptrace.h directly in syscall_wrapper.h to make sure all
+syscalls see the 'struct pt_regs' definition. The BTF for the
+'__*_sys_*(struct pt_regs *regs)' functions then points to the actual
+struct, not just the forward declaration.
+
+ [ bp: No Fixes tag as this is not really a bug fix but "adjustment" so
+ that BTF is happy. ]
+
+Reported-by: Akihiro HARAI <jharai0815@gmail.com>
+Signed-off-by: Jiri Olsa <jolsa@kernel.org>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Acked-by: Andrii Nakryiko <andrii@kernel.org>
+Cc: <stable@vger.kernel.org> # this is needed only for BTF so kernels >= 5.15
+Link: https://lore.kernel.org/r/20221018122708.823792-1-jolsa@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/syscall_wrapper.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/include/asm/syscall_wrapper.h
++++ b/arch/x86/include/asm/syscall_wrapper.h
+@@ -6,7 +6,7 @@
+ #ifndef _ASM_X86_SYSCALL_WRAPPER_H
+ #define _ASM_X86_SYSCALL_WRAPPER_H
+
+-struct pt_regs;
++#include <asm/ptrace.h>
+
+ extern long __x64_sys_ni_syscall(const struct pt_regs *regs);
+ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
--- /dev/null
+From 373e715e31bf4e0f129befe87613a278fac228d3 Mon Sep 17 00:00:00 2001
+From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
+Date: Fri, 28 Oct 2022 17:12:20 +0300
+Subject: x86/tdx: Panic on bad configs that #VE on "private" memory access
+
+From: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+
+commit 373e715e31bf4e0f129befe87613a278fac228d3 upstream.
+
+All normal kernel memory is "TDX private memory". This includes
+everything from kernel stacks to kernel text. Handling
+exceptions on arbitrary accesses to kernel memory is essentially
+impossible because they can happen in horribly nasty places like
+kernel entry/exit. But, TDX hardware can theoretically _deliver_
+a virtualization exception (#VE) on any access to private memory.
+
+But, it's not as bad as it sounds. TDX can be configured to never
+deliver these exceptions on private memory with a "TD attribute"
+called ATTR_SEPT_VE_DISABLE. The guest has no way to *set* this
+attribute, but it can check it.
+
+Ensure ATTR_SEPT_VE_DISABLE is set in early boot. panic() if it
+is unset. There is no sane way for Linux to run with this
+attribute clear so a panic() is appropriate.
+
+There's a small window during boot, before the check, where the kernel
+has an early #VE handler. But that handler only covers port I/O and
+will also panic() as soon as it sees any other #VE, such as one
+generated by a private memory access.
+
+[ dhansen: Rewrite changelog and rebase on new tdx_parse_tdinfo().
+ Add Kirill's tested-by because I made changes since
+ he wrote this. ]
+
+Fixes: 9a22bf6debbf ("x86/traps: Add #VE support for TDX guest")
+Reported-by: ruogui.ygr@alibaba-inc.com
+Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Tested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/all/20221028141220.29217-3-kirill.shutemov%40linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/coco/tdx/tdx.c | 21 ++++++++++++++++-----
+ 1 file changed, 16 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/coco/tdx/tdx.c
++++ b/arch/x86/coco/tdx/tdx.c
+@@ -34,6 +34,8 @@
+ #define VE_GET_PORT_NUM(e) ((e) >> 16)
+ #define VE_IS_IO_STRING(e) ((e) & BIT(4))
+
++#define ATTR_SEPT_VE_DISABLE BIT(28)
++
+ /*
+ * Wrapper for standard use of __tdx_hypercall with no output aside from
+ * return code.
+@@ -102,6 +104,7 @@ static void tdx_parse_tdinfo(u64 *cc_mas
+ {
+ struct tdx_module_output out;
+ unsigned int gpa_width;
++ u64 td_attr;
+
+ /*
+ * TDINFO TDX module call is used to get the TD execution environment
+@@ -109,19 +112,27 @@ static void tdx_parse_tdinfo(u64 *cc_mas
+ * information, etc. More details about the ABI can be found in TDX
+ * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
+ * [TDG.VP.INFO].
+- *
+- * The GPA width that comes out of this call is critical. TDX guests
+- * can not meaningfully run without it.
+ */
+ tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out);
+
+- gpa_width = out.rcx & GENMASK(5, 0);
+-
+ /*
+ * The highest bit of a guest physical address is the "sharing" bit.
+ * Set it for shared pages and clear it for private pages.
++ *
++ * The GPA width that comes out of this call is critical. TDX guests
++ * can not meaningfully run without it.
+ */
++ gpa_width = out.rcx & GENMASK(5, 0);
+ *cc_mask = BIT_ULL(gpa_width - 1);
++
++ /*
++ * The kernel can not handle #VE's when accessing normal kernel
++ * memory. Ensure that no #VE will be delivered for accesses to
++ * TD-private memory. Only VMM-shared memory (MMIO) will #VE.
++ */
++ td_attr = out.rdx;
++ if (!(td_attr & ATTR_SEPT_VE_DISABLE))
++ panic("TD misconfiguration: SEPT_VE_DISABLE attibute must be set.\n");
+ }
+
+ /*
--- /dev/null
+From a6dd6f39008bb3ef7c73ef0a2acc2a4209555bd8 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Fri, 28 Oct 2022 17:12:19 +0300
+Subject: x86/tdx: Prepare for using "INFO" call for a second purpose
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit a6dd6f39008bb3ef7c73ef0a2acc2a4209555bd8 upstream.
+
+The TDG.VP.INFO TDCALL provides the guest with various details about
+the TDX system that the guest needs to run. Only one field is currently
+used: 'gpa_width' which tells the guest which PTE bits mark pages shared
+or private.
+
+A second field is now needed: the guest "TD attributes", which tell
+whether virtualization exceptions are configured in a way that can harm
+the guest.
+
+Make the naming and calling convention more generic and distinct from
+the mask-centric one.
+
+Thanks to Sathya for the inspiration here, but there's no code, comments
+or changelogs left from where he started.
+
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Tested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/coco/tdx/tdx.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/coco/tdx/tdx.c
++++ b/arch/x86/coco/tdx/tdx.c
+@@ -98,7 +98,7 @@ static inline void tdx_module_call(u64 f
+ panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
+ }
+
+-static u64 get_cc_mask(void)
++static void tdx_parse_tdinfo(u64 *cc_mask)
+ {
+ struct tdx_module_output out;
+ unsigned int gpa_width;
+@@ -121,7 +121,7 @@ static u64 get_cc_mask(void)
+ * The highest bit of a guest physical address is the "sharing" bit.
+ * Set it for shared pages and clear it for private pages.
+ */
+- return BIT_ULL(gpa_width - 1);
++ *cc_mask = BIT_ULL(gpa_width - 1);
+ }
+
+ /*
+@@ -758,7 +758,7 @@ void __init tdx_early_init(void)
+ setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
+
+ cc_set_vendor(CC_VENDOR_INTEL);
+- cc_mask = get_cc_mask();
++ tdx_parse_tdinfo(&cc_mask);
+ cc_set_mask(cc_mask);
+
+ /*