--- /dev/null
+From de3a0021a60635de96aa92713c1a31a96747d72c Mon Sep 17 00:00:00 2001
+From: Jim Mattson <jmattson@google.com>
+Date: Mon, 27 Nov 2017 17:22:25 -0600
+Subject: KVM: nVMX: Eliminate vmcs02 pool
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Jim Mattson <jmattson@google.com>
+
+commit de3a0021a60635de96aa92713c1a31a96747d72c upstream.
+
+The potential performance advantages of a vmcs02 pool have never been
+realized. To simplify the code, eliminate the pool. Instead, a single
+vmcs02 is allocated per VCPU when the VCPU enters VMX operation.
+
+Cc: stable@vger.kernel.org # prereq for Spectre mitigation
+Signed-off-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Mark Kanda <mark.kanda@oracle.com>
+Reviewed-by: Ameya More <ameya.more@oracle.com>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx.c | 146 ++++++++---------------------------------------------
+ 1 file changed, 23 insertions(+), 123 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -173,7 +173,6 @@ module_param(ple_window_max, int, S_IRUG
+ extern const ulong vmx_return;
+
+ #define NR_AUTOLOAD_MSRS 8
+-#define VMCS02_POOL_SIZE 1
+
+ struct vmcs {
+ u32 revision_id;
+@@ -207,7 +206,7 @@ struct shared_msr_entry {
+ * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
+ * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
+ * More than one of these structures may exist, if L1 runs multiple L2 guests.
+- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
++ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
+ * underlying hardware which will be used to run L2.
+ * This structure is packed to ensure that its layout is identical across
+ * machines (necessary for live migration).
+@@ -386,13 +385,6 @@ struct __packed vmcs12 {
+ */
+ #define VMCS12_SIZE 0x1000
+
+-/* Used to remember the last vmcs02 used for some recently used vmcs12s */
+-struct vmcs02_list {
+- struct list_head list;
+- gpa_t vmptr;
+- struct loaded_vmcs vmcs02;
+-};
+-
+ /*
+ * The nested_vmx structure is part of vcpu_vmx, and holds information we need
+ * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
+@@ -419,15 +411,15 @@ struct nested_vmx {
+ */
+ bool sync_shadow_vmcs;
+
+- /* vmcs02_list cache of VMCSs recently used to run L2 guests */
+- struct list_head vmcs02_pool;
+- int vmcs02_num;
+ bool change_vmcs01_virtual_x2apic_mode;
+ /* L2 must run next, and mustn't decide to exit to L1. */
+ bool nested_run_pending;
++
++ struct loaded_vmcs vmcs02;
++
+ /*
+- * Guest pages referred to in vmcs02 with host-physical pointers, so
+- * we must keep them pinned while L2 runs.
++ * Guest pages referred to in the vmcs02 with host-physical
++ * pointers, so we must keep them pinned while L2 runs.
+ */
+ struct page *apic_access_page;
+ struct page *virtual_apic_page;
+@@ -6684,94 +6676,6 @@ static int handle_monitor(struct kvm_vcp
+ }
+
+ /*
+- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
+- * We could reuse a single VMCS for all the L2 guests, but we also want the
+- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
+- * allows keeping them loaded on the processor, and in the future will allow
+- * optimizations where prepare_vmcs02 doesn't need to set all the fields on
+- * every entry if they never change.
+- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
+- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
+- *
+- * The following functions allocate and free a vmcs02 in this pool.
+- */
+-
+-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
+-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
+-{
+- struct vmcs02_list *item;
+- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
+- if (item->vmptr == vmx->nested.current_vmptr) {
+- list_move(&item->list, &vmx->nested.vmcs02_pool);
+- return &item->vmcs02;
+- }
+-
+- if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
+- /* Recycle the least recently used VMCS. */
+- item = list_last_entry(&vmx->nested.vmcs02_pool,
+- struct vmcs02_list, list);
+- item->vmptr = vmx->nested.current_vmptr;
+- list_move(&item->list, &vmx->nested.vmcs02_pool);
+- return &item->vmcs02;
+- }
+-
+- /* Create a new VMCS */
+- item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
+- if (!item)
+- return NULL;
+- item->vmcs02.vmcs = alloc_vmcs();
+- item->vmcs02.shadow_vmcs = NULL;
+- if (!item->vmcs02.vmcs) {
+- kfree(item);
+- return NULL;
+- }
+- loaded_vmcs_init(&item->vmcs02);
+- item->vmptr = vmx->nested.current_vmptr;
+- list_add(&(item->list), &(vmx->nested.vmcs02_pool));
+- vmx->nested.vmcs02_num++;
+- return &item->vmcs02;
+-}
+-
+-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
+-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
+-{
+- struct vmcs02_list *item;
+- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
+- if (item->vmptr == vmptr) {
+- free_loaded_vmcs(&item->vmcs02);
+- list_del(&item->list);
+- kfree(item);
+- vmx->nested.vmcs02_num--;
+- return;
+- }
+-}
+-
+-/*
+- * Free all VMCSs saved for this vcpu, except the one pointed by
+- * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
+- * must be &vmx->vmcs01.
+- */
+-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
+-{
+- struct vmcs02_list *item, *n;
+-
+- WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
+- list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
+- /*
+- * Something will leak if the above WARN triggers. Better than
+- * a use-after-free.
+- */
+- if (vmx->loaded_vmcs == &item->vmcs02)
+- continue;
+-
+- free_loaded_vmcs(&item->vmcs02);
+- list_del(&item->list);
+- kfree(item);
+- vmx->nested.vmcs02_num--;
+- }
+-}
+-
+-/*
+ * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
+ * set the success or error code of an emulated VMX instruction, as specified
+ * by Vol 2B, VMX Instruction Reference, "Conventions".
+@@ -7084,6 +6988,12 @@ static int handle_vmon(struct kvm_vcpu *
+ return 1;
+ }
+
++ vmx->nested.vmcs02.vmcs = alloc_vmcs();
++ vmx->nested.vmcs02.shadow_vmcs = NULL;
++ if (!vmx->nested.vmcs02.vmcs)
++ goto out_vmcs02;
++ loaded_vmcs_init(&vmx->nested.vmcs02);
++
+ if (cpu_has_vmx_msr_bitmap()) {
+ vmx->nested.msr_bitmap =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+@@ -7106,9 +7016,6 @@ static int handle_vmon(struct kvm_vcpu *
+ vmx->vmcs01.shadow_vmcs = shadow_vmcs;
+ }
+
+- INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
+- vmx->nested.vmcs02_num = 0;
+-
+ hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_PINNED);
+ vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
+@@ -7126,6 +7033,9 @@ out_cached_vmcs12:
+ free_page((unsigned long)vmx->nested.msr_bitmap);
+
+ out_msr_bitmap:
++ free_loaded_vmcs(&vmx->nested.vmcs02);
++
++out_vmcs02:
+ return -ENOMEM;
+ }
+
+@@ -7211,7 +7121,7 @@ static void free_nested(struct vcpu_vmx
+ vmx->vmcs01.shadow_vmcs = NULL;
+ }
+ kfree(vmx->nested.cached_vmcs12);
+- /* Unpin physical memory we referred to in current vmcs02 */
++ /* Unpin physical memory we referred to in the vmcs02 */
+ if (vmx->nested.apic_access_page) {
+ nested_release_page(vmx->nested.apic_access_page);
+ vmx->nested.apic_access_page = NULL;
+@@ -7227,7 +7137,7 @@ static void free_nested(struct vcpu_vmx
+ vmx->nested.pi_desc = NULL;
+ }
+
+- nested_free_all_saved_vmcss(vmx);
++ free_loaded_vmcs(&vmx->nested.vmcs02);
+ }
+
+ /* Emulate the VMXOFF instruction */
+@@ -7261,8 +7171,6 @@ static int handle_vmclear(struct kvm_vcp
+ vmptr + offsetof(struct vmcs12, launch_state),
+ &zero, sizeof(zero));
+
+- nested_free_vmcs02(vmx, vmptr);
+-
+ skip_emulated_instruction(vcpu);
+ nested_vmx_succeed(vcpu);
+ return 1;
+@@ -8051,10 +7959,11 @@ static bool nested_vmx_exit_handled(stru
+
+ /*
+ * The host physical addresses of some pages of guest memory
+- * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
+- * may write to these pages via their host physical address while
+- * L2 is running, bypassing any address-translation-based dirty
+- * tracking (e.g. EPT write protection).
++ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
++ * Page). The CPU may write to these pages via their host
++ * physical address while L2 is running, bypassing any
++ * address-translation-based dirty tracking (e.g. EPT write
++ * protection).
+ *
+ * Mark them dirty on every exit from L2 to prevent them from
+ * getting out of sync with dirty tracking.
+@@ -10223,7 +10132,6 @@ static int nested_vmx_run(struct kvm_vcp
+ struct vmcs12 *vmcs12;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int cpu;
+- struct loaded_vmcs *vmcs02;
+ bool ia32e;
+ u32 msr_entry_idx;
+
+@@ -10363,17 +10271,13 @@ static int nested_vmx_run(struct kvm_vcp
+ * the nested entry.
+ */
+
+- vmcs02 = nested_get_current_vmcs02(vmx);
+- if (!vmcs02)
+- return -ENOMEM;
+-
+ enter_guest_mode(vcpu);
+
+ if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+ vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
+ cpu = get_cpu();
+- vmx->loaded_vmcs = vmcs02;
++ vmx->loaded_vmcs = &vmx->nested.vmcs02;
+ vmx_vcpu_put(vcpu);
+ vmx_vcpu_load(vcpu, cpu);
+ vcpu->cpu = cpu;
+@@ -10888,10 +10792,6 @@ static void nested_vmx_vmexit(struct kvm
+ vm_exit_controls_reset_shadow(vmx);
+ vmx_segment_cache_clear(vmx);
+
+- /* if no vmcs02 cache requested, remove the one we used */
+- if (VMCS02_POOL_SIZE == 0)
+- nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
+-
+ load_vmcs12_host_state(vcpu, vmcs12);
+
+ /* Update any VMCS fields that might have changed while L2 ran */
--- /dev/null
+From c9f04407f2e0b3fc9ff7913c65fcfcb0a4b61570 Mon Sep 17 00:00:00 2001
+From: David Matlack <dmatlack@google.com>
+Date: Tue, 1 Aug 2017 14:00:40 -0700
+Subject: KVM: nVMX: mark vmcs12 pages dirty on L2 exit
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: David Matlack <dmatlack@google.com>
+
+commit c9f04407f2e0b3fc9ff7913c65fcfcb0a4b61570 upstream.
+
+The host physical addresses of L1's Virtual APIC Page and Posted
+Interrupt descriptor are loaded into the VMCS02. The CPU may write
+to these pages via their host physical address while L2 is running,
+bypassing address-translation-based dirty tracking (e.g. EPT write
+protection). Mark them dirty on every exit from L2 to prevent them
+from getting out of sync with dirty tracking.
+
+Also mark the virtual APIC page and the posted interrupt descriptor
+dirty when KVM is virtualizing posted interrupt processing.
+
+Signed-off-by: David Matlack <dmatlack@google.com>
+Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx.c | 53 +++++++++++++++++++++++++++++++++++++++++++----------
+ 1 file changed, 43 insertions(+), 10 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -4738,6 +4738,28 @@ static bool vmx_get_enable_apicv(void)
+ return enable_apicv;
+ }
+
++static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
++{
++ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
++ gfn_t gfn;
++
++ /*
++ * Don't need to mark the APIC access page dirty; it is never
++ * written to by the CPU during APIC virtualization.
++ */
++
++ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
++ gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
++ kvm_vcpu_mark_page_dirty(vcpu, gfn);
++ }
++
++ if (nested_cpu_has_posted_intr(vmcs12)) {
++ gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
++ kvm_vcpu_mark_page_dirty(vcpu, gfn);
++ }
++}
++
++
+ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
+ {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+@@ -4745,18 +4767,15 @@ static void vmx_complete_nested_posted_i
+ void *vapic_page;
+ u16 status;
+
+- if (vmx->nested.pi_desc &&
+- vmx->nested.pi_pending) {
+- vmx->nested.pi_pending = false;
+- if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+- return;
+-
+- max_irr = find_last_bit(
+- (unsigned long *)vmx->nested.pi_desc->pir, 256);
++ if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
++ return;
+
+- if (max_irr == 256)
+- return;
++ vmx->nested.pi_pending = false;
++ if (!pi_test_and_clear_on(vmx->nested.pi_desc))
++ return;
+
++ max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
++ if (max_irr != 256) {
+ vapic_page = kmap(vmx->nested.virtual_apic_page);
+ if (!vapic_page) {
+ WARN_ON(1);
+@@ -4772,6 +4791,8 @@ static void vmx_complete_nested_posted_i
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+ }
++
++ nested_mark_vmcs12_pages_dirty(vcpu);
+ }
+
+ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
+@@ -8028,6 +8049,18 @@ static bool nested_vmx_exit_handled(stru
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+ KVM_ISA_VMX);
+
++ /*
++ * The host physical addresses of some pages of guest memory
++ * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
++ * may write to these pages via their host physical address while
++ * L2 is running, bypassing any address-translation-based dirty
++ * tracking (e.g. EPT write protection).
++ *
++ * Mark them dirty on every exit from L2 to prevent them from
++ * getting out of sync with dirty tracking.
++ */
++ nested_mark_vmcs12_pages_dirty(vcpu);
++
+ if (vmx->nested.nested_run_pending)
+ return false;
+
--- /dev/null
+From 6342c50ad12e8ce0736e722184a7dbdea4a3477f Mon Sep 17 00:00:00 2001
+From: David Hildenbrand <david@redhat.com>
+Date: Wed, 25 Jan 2017 11:58:58 +0100
+Subject: KVM: nVMX: vmx_complete_nested_posted_interrupt() can't fail
+
+From: David Hildenbrand <david@redhat.com>
+
+commit 6342c50ad12e8ce0736e722184a7dbdea4a3477f upstream.
+
+vmx_complete_nested_posted_interrupt() can't fail, let's turn it into
+a void function.
+
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -4738,7 +4738,7 @@ static bool vmx_get_enable_apicv(void)
+ return enable_apicv;
+ }
+
+-static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
++static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
+ {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int max_irr;
+@@ -4749,13 +4749,13 @@ static int vmx_complete_nested_posted_in
+ vmx->nested.pi_pending) {
+ vmx->nested.pi_pending = false;
+ if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+- return 0;
++ return;
+
+ max_irr = find_last_bit(
+ (unsigned long *)vmx->nested.pi_desc->pir, 256);
+
+ if (max_irr == 256)
+- return 0;
++ return;
+
+ vapic_page = kmap(vmx->nested.virtual_apic_page);
+ if (!vapic_page) {
+@@ -4772,7 +4772,6 @@ static int vmx_complete_nested_posted_in
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+ }
+- return 0;
+ }
+
+ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
+@@ -10493,7 +10492,8 @@ static int vmx_check_nested_events(struc
+ return 0;
+ }
+
+- return vmx_complete_nested_posted_interrupt(vcpu);
++ vmx_complete_nested_posted_interrupt(vcpu);
++ return 0;
+ }
+
+ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
--- /dev/null
+From b2ac58f90540e39324e7a29a7ad471407ae0bf48 Mon Sep 17 00:00:00 2001
+From: KarimAllah Ahmed <karahmed@amazon.de>
+Date: Sat, 3 Feb 2018 15:56:23 +0100
+Subject: KVM/SVM: Allow direct access to MSR_IA32_SPEC_CTRL
+
+From: KarimAllah Ahmed <karahmed@amazon.de>
+
+commit b2ac58f90540e39324e7a29a7ad471407ae0bf48 upstream.
+
+[ Based on a patch from Paolo Bonzini <pbonzini@redhat.com> ]
+
+... basically doing exactly what we do for VMX:
+
+- Passthrough SPEC_CTRL to guests (if enabled in guest CPUID)
+- Save and restore SPEC_CTRL around VMExit and VMEntry only if the guest
+ actually used it.
+
+Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
+Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Cc: Jun Nakajima <jun.nakajima@intel.com>
+Cc: kvm@vger.kernel.org
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Tim Chen <tim.c.chen@linux.intel.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Asit Mallick <asit.k.mallick@intel.com>
+Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: Paolo Bonzini <pbonzini@redhat.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Ashok Raj <ashok.raj@intel.com>
+Link: https://lkml.kernel.org/r/1517669783-20732-1-git-send-email-karahmed@amazon.de
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 88 insertions(+)
+
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -183,6 +183,8 @@ struct vcpu_svm {
+ u64 gs_base;
+ } host;
+
++ u64 spec_ctrl;
++
+ u32 *msrpm;
+
+ ulong nmi_iret_rip;
+@@ -248,6 +250,7 @@ static const struct svm_direct_access_ms
+ { .index = MSR_CSTAR, .always = true },
+ { .index = MSR_SYSCALL_MASK, .always = true },
+ #endif
++ { .index = MSR_IA32_SPEC_CTRL, .always = false },
+ { .index = MSR_IA32_PRED_CMD, .always = false },
+ { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
+ { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
+@@ -863,6 +866,25 @@ static bool valid_msr_intercept(u32 inde
+ return false;
+ }
+
++/* Return true if writes to @msr are intercepted in the MSR map in use. */
++static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
++{
++	u32 offset = svm_msrpm_offset(msr);
++	u8 bit_write = 2 * (msr & 0x0f) + 1;
++	unsigned long tmp;
++	u32 *msrpm;
++
++	/* Reject an invalid offset before it is used as an array index. */
++	BUG_ON(offset == MSR_INVALID);
++
++	/* In guest mode consult the nested permission map, else our own. */
++	msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm :
++				      to_svm(vcpu)->msrpm;
++	tmp = msrpm[offset];
++
++	return !!test_bit(bit_write, &tmp);
++}
++
+ static void set_msr_interception(u32 *msrpm, unsigned msr,
+ int read, int write)
+ {
+@@ -1537,6 +1559,8 @@ static void svm_vcpu_reset(struct kvm_vc
+ u32 dummy;
+ u32 eax = 1;
+
++ svm->spec_ctrl = 0;
++
+ if (!init_event) {
+ svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE;
+@@ -3520,6 +3544,13 @@ static int svm_get_msr(struct kvm_vcpu *
+ case MSR_VM_CR:
+ msr_info->data = svm->nested.vm_cr_msr;
+ break;
++ case MSR_IA32_SPEC_CTRL:
++ if (!msr_info->host_initiated &&
++ !guest_cpuid_has_ibrs(vcpu))
++ return 1;
++
++ msr_info->data = svm->spec_ctrl;
++ break;
+ case MSR_IA32_UCODE_REV:
+ msr_info->data = 0x01000065;
+ break;
+@@ -3611,6 +3642,33 @@ static int svm_set_msr(struct kvm_vcpu *
+ case MSR_IA32_TSC:
+ kvm_write_tsc(vcpu, msr);
+ break;
++ case MSR_IA32_SPEC_CTRL:
++ if (!msr->host_initiated &&
++ !guest_cpuid_has_ibrs(vcpu))
++ return 1;
++
++ /* The STIBP bit doesn't fault even if it's not advertised */
++ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
++ return 1;
++
++ svm->spec_ctrl = data;
++
++ if (!data)
++ break;
++
++ /*
++ * For non-nested:
++ * When it's written (to non-zero) for the first time, pass
++ * it through.
++ *
++ * For nested:
++ * The handling of the MSR bitmap for L2 guests is done in
++ * nested_svm_vmrun_msrpm.
++ * We update the L1 MSR bit as well since it will end up
++ * touching the MSR anyway now.
++ */
++ set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
++ break;
+ case MSR_IA32_PRED_CMD:
+ if (!msr->host_initiated &&
+ !guest_cpuid_has_ibpb(vcpu))
+@@ -4854,6 +4912,15 @@ static void svm_vcpu_run(struct kvm_vcpu
+
+ local_irq_enable();
+
++ /*
++ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
++ * it's non-zero. Since vmentry is serialising on affected CPUs, there
++ * is no need to worry about the conditional branch over the wrmsr
++ * being speculatively taken.
++ */
++ if (svm->spec_ctrl)
++ wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
++
+ asm volatile (
+ "push %%" _ASM_BP "; \n\t"
+ "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
+@@ -4946,6 +5013,27 @@ static void svm_vcpu_run(struct kvm_vcpu
+ #endif
+ );
+
++ /*
++ * We do not use IBRS in the kernel. If this vCPU has used the
++ * SPEC_CTRL MSR it may have left it on; save the value and
++ * turn it off. This is much more efficient than blindly adding
++ * it to the atomic save/restore list. Especially as the former
++ * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
++ *
++ * For non-nested case:
++ * If the L01 MSR bitmap does not intercept the MSR, then we need to
++ * save it.
++ *
++ * For nested case:
++ * If the L02 MSR bitmap does not intercept the MSR, then we need to
++ * save it.
++ */
++ if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
++ rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
++
++ if (svm->spec_ctrl)
++ wrmsrl(MSR_IA32_SPEC_CTRL, 0);
++
+ /* Eliminate branch target predictions from guest mode */
+ vmexit_fill_RSB();
+
--- /dev/null
+From d28b387fb74da95d69d2615732f50cceb38e9a4d Mon Sep 17 00:00:00 2001
+From: KarimAllah Ahmed <karahmed@amazon.de>
+Date: Thu, 1 Feb 2018 22:59:45 +0100
+Subject: KVM/VMX: Allow direct access to MSR_IA32_SPEC_CTRL
+
+From: KarimAllah Ahmed <karahmed@amazon.de>
+
+commit d28b387fb74da95d69d2615732f50cceb38e9a4d upstream.
+
+[ Based on a patch from Ashok Raj <ashok.raj@intel.com> ]
+
+Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for
+guests that will only mitigate Spectre V2 through IBRS+IBPB and will not
+be using a retpoline+IBPB based approach.
+
+To avoid the overhead of saving and restoring the MSR_IA32_SPEC_CTRL for
+guests that do not actually use the MSR, only start saving and restoring
+when a non-zero value is written to it.
+
+No attempt is made to handle STIBP here, intentionally. Filtering STIBP
+may be added in a future patch, which may require trapping all writes
+if we don't want to pass it through directly to the guest.
+
+[dwmw2: Clean up CPUID bits, save/restore manually, handle reset]
+
+Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
+Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Cc: Jun Nakajima <jun.nakajima@intel.com>
+Cc: kvm@vger.kernel.org
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Tim Chen <tim.c.chen@linux.intel.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Asit Mallick <asit.k.mallick@intel.com>
+Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: Paolo Bonzini <pbonzini@redhat.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Ashok Raj <ashok.raj@intel.com>
+Link: https://lkml.kernel.org/r/1517522386-18410-5-git-send-email-karahmed@amazon.de
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/cpuid.c | 8 ++-
+ arch/x86/kvm/cpuid.h | 11 +++++
+ arch/x86/kvm/vmx.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++-
+ arch/x86/kvm/x86.c | 2
+ 4 files changed, 118 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -357,7 +357,7 @@ static inline int __do_cpuid_ent(struct
+
+ /* cpuid 0x80000008.ebx */
+ const u32 kvm_cpuid_8000_0008_ebx_x86_features =
+- F(IBPB);
++ F(IBPB) | F(IBRS);
+
+ /* cpuid 0xC0000001.edx */
+ const u32 kvm_cpuid_C000_0001_edx_x86_features =
+@@ -382,7 +382,7 @@ static inline int __do_cpuid_ent(struct
+
+ /* cpuid 7.0.edx*/
+ const u32 kvm_cpuid_7_0_edx_x86_features =
+- F(ARCH_CAPABILITIES);
++ F(SPEC_CTRL) | F(ARCH_CAPABILITIES);
+
+ /* all calls to cpuid_count() should be made on the same cpu */
+ get_cpu();
+@@ -618,9 +618,11 @@ static inline int __do_cpuid_ent(struct
+ g_phys_as = phys_as;
+ entry->eax = g_phys_as | (virt_as << 8);
+ entry->edx = 0;
+- /* IBPB isn't necessarily present in hardware cpuid */
++ /* IBRS and IBPB aren't necessarily present in hardware cpuid */
+ if (boot_cpu_has(X86_FEATURE_IBPB))
+ entry->ebx |= F(IBPB);
++ if (boot_cpu_has(X86_FEATURE_IBRS))
++ entry->ebx |= F(IBRS);
+ entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
+ cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
+ break;
+--- a/arch/x86/kvm/cpuid.h
++++ b/arch/x86/kvm/cpuid.h
+@@ -171,6 +171,17 @@ static inline bool guest_cpuid_has_ibpb(
+ return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL));
+ }
+
++static inline bool guest_cpuid_has_ibrs(struct kvm_vcpu *vcpu)
++{
++ struct kvm_cpuid_entry2 *best;
++
++ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
++ if (best && (best->ebx & bit(X86_FEATURE_IBRS)))
++ return true;
++ best = kvm_find_cpuid_entry(vcpu, 7, 0);
++ return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL));
++}
++
+ static inline bool guest_cpuid_has_arch_capabilities(struct kvm_vcpu *vcpu)
+ {
+ struct kvm_cpuid_entry2 *best;
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -551,6 +551,7 @@ struct vcpu_vmx {
+ #endif
+
+ u64 arch_capabilities;
++ u64 spec_ctrl;
+
+ u32 vm_entry_controls_shadow;
+ u32 vm_exit_controls_shadow;
+@@ -1854,6 +1855,29 @@ static void update_exception_bitmap(stru
+ }
+
+ /*
++ * Check if MSR is intercepted for currently loaded MSR bitmap.
++ */
++static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
++{
++ unsigned long *msr_bitmap;
++ int f = sizeof(unsigned long);
++
++ if (!cpu_has_vmx_msr_bitmap())
++ return true;
++
++ msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
++
++ if (msr <= 0x1fff) {
++ return !!test_bit(msr, msr_bitmap + 0x800 / f);
++ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
++ msr &= 0x1fff;
++ return !!test_bit(msr, msr_bitmap + 0xc00 / f);
++ }
++
++ return true;
++}
++
++/*
+ * Check if MSR is intercepted for L01 MSR bitmap.
+ */
+ static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
+@@ -2983,6 +3007,13 @@ static int vmx_get_msr(struct kvm_vcpu *
+ case MSR_IA32_TSC:
+ msr_info->data = guest_read_tsc(vcpu);
+ break;
++ case MSR_IA32_SPEC_CTRL:
++ if (!msr_info->host_initiated &&
++ !guest_cpuid_has_ibrs(vcpu))
++ return 1;
++
++ msr_info->data = to_vmx(vcpu)->spec_ctrl;
++ break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has_arch_capabilities(vcpu))
+@@ -3093,6 +3124,36 @@ static int vmx_set_msr(struct kvm_vcpu *
+ case MSR_IA32_TSC:
+ kvm_write_tsc(vcpu, msr_info);
+ break;
++ case MSR_IA32_SPEC_CTRL:
++ if (!msr_info->host_initiated &&
++ !guest_cpuid_has_ibrs(vcpu))
++ return 1;
++
++ /* The STIBP bit doesn't fault even if it's not advertised */
++ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
++ return 1;
++
++ vmx->spec_ctrl = data;
++
++ if (!data)
++ break;
++
++ /*
++ * For non-nested:
++ * When it's written (to non-zero) for the first time, pass
++ * it through.
++ *
++ * For nested:
++ * The handling of the MSR bitmap for L2 guests is done in
++ * nested_vmx_merge_msr_bitmap. We should not touch the
++ * vmcs02.msr_bitmap here since it gets completely overwritten
++ * in the merging. We update the vmcs01 here for L1 as well
++ * since it will end up touching the MSR anyway now.
++ */
++ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
++ MSR_IA32_SPEC_CTRL,
++ MSR_TYPE_RW);
++ break;
+ case MSR_IA32_PRED_CMD:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has_ibpb(vcpu))
+@@ -5245,6 +5306,7 @@ static void vmx_vcpu_reset(struct kvm_vc
+ u64 cr0;
+
+ vmx->rmode.vm86_active = 0;
++ vmx->spec_ctrl = 0;
+
+ vmx->soft_vnmi_blocked = 0;
+
+@@ -8830,6 +8892,15 @@ static void __noclone vmx_vcpu_run(struc
+
+ vmx_arm_hv_timer(vcpu);
+
++ /*
++ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
++ * it's non-zero. Since vmentry is serialising on affected CPUs, there
++ * is no need to worry about the conditional branch over the wrmsr
++ * being speculatively taken.
++ */
++ if (vmx->spec_ctrl)
++ wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
++
+ vmx->__launched = vmx->loaded_vmcs->launched;
+ asm(
+ /* Store host registers */
+@@ -8948,6 +9019,27 @@ static void __noclone vmx_vcpu_run(struc
+ #endif
+ );
+
++ /*
++ * We do not use IBRS in the kernel. If this vCPU has used the
++ * SPEC_CTRL MSR it may have left it on; save the value and
++ * turn it off. This is much more efficient than blindly adding
++ * it to the atomic save/restore list. Especially as the former
++ * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
++ *
++ * For non-nested case:
++ * If the L01 MSR bitmap does not intercept the MSR, then we need to
++ * save it.
++ *
++ * For nested case:
++ * If the L02 MSR bitmap does not intercept the MSR, then we need to
++ * save it.
++ */
++ if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
++ rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
++
++ if (vmx->spec_ctrl)
++ wrmsrl(MSR_IA32_SPEC_CTRL, 0);
++
+ /* Eliminate branch target predictions from guest mode */
+ vmexit_fill_RSB();
+
+@@ -9507,7 +9599,7 @@ static inline bool nested_vmx_merge_msr_
+ unsigned long *msr_bitmap_l1;
+ unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
+ /*
+- * pred_cmd is trying to verify two things:
++ * pred_cmd & spec_ctrl are trying to verify two things:
+ *
+ * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
+ * ensures that we do not accidentally generate an L02 MSR bitmap
+@@ -9520,9 +9612,10 @@ static inline bool nested_vmx_merge_msr_
+ * the MSR.
+ */
+ bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
++ bool spec_ctrl = msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
+
+ if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+- !pred_cmd)
++ !pred_cmd && !spec_ctrl)
+ return false;
+
+ page = nested_get_page(vcpu, vmcs12->msr_bitmap);
+@@ -9561,6 +9654,12 @@ static inline bool nested_vmx_merge_msr_
+ }
+ }
+
++ if (spec_ctrl)
++ nested_vmx_disable_intercept_for_msr(
++ msr_bitmap_l1, msr_bitmap_l0,
++ MSR_IA32_SPEC_CTRL,
++ MSR_TYPE_R | MSR_TYPE_W);
++
+ if (pred_cmd)
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -975,7 +975,7 @@ static u32 msrs_to_save[] = {
+ #endif
+ MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+ MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+- MSR_IA32_ARCH_CAPABILITIES
++ MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
+ };
+
+ static unsigned num_msrs_to_save;
--- /dev/null
+From 28c1c9fabf48d6ad596273a11c46e0d0da3e14cd Mon Sep 17 00:00:00 2001
+From: KarimAllah Ahmed <karahmed@amazon.de>
+Date: Thu, 1 Feb 2018 22:59:44 +0100
+Subject: KVM/VMX: Emulate MSR_IA32_ARCH_CAPABILITIES
+
+From: KarimAllah Ahmed <karahmed@amazon.de>
+
+commit 28c1c9fabf48d6ad596273a11c46e0d0da3e14cd upstream.
+
+Intel processors use the MSR_IA32_ARCH_CAPABILITIES MSR to indicate RDCL_NO
+(bit 0) and IBRS_ALL (bit 1). This is a read-only MSR. By default the
+contents will come directly from the hardware, but user-space can still
+override it.
+
+[dwmw2: The bit in kvm_cpuid_7_0_edx_x86_features can be unconditional]
+
+Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Cc: Jun Nakajima <jun.nakajima@intel.com>
+Cc: kvm@vger.kernel.org
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Asit Mallick <asit.k.mallick@intel.com>
+Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Tim Chen <tim.c.chen@linux.intel.com>
+Cc: Ashok Raj <ashok.raj@intel.com>
+Link: https://lkml.kernel.org/r/1517522386-18410-4-git-send-email-karahmed@amazon.de
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/cpuid.c | 8 +++++++-
+ arch/x86/kvm/cpuid.h | 8 ++++++++
+ arch/x86/kvm/vmx.c | 15 +++++++++++++++
+ arch/x86/kvm/x86.c | 1 +
+ 4 files changed, 31 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -380,6 +380,10 @@ static inline int __do_cpuid_ent(struct
+ /* cpuid 7.0.ecx*/
+ const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/;
+
++ /* cpuid 7.0.edx*/
++ const u32 kvm_cpuid_7_0_edx_x86_features =
++ F(ARCH_CAPABILITIES);
++
+ /* all calls to cpuid_count() should be made on the same cpu */
+ get_cpu();
+
+@@ -462,12 +466,14 @@ static inline int __do_cpuid_ent(struct
+ /* PKU is not yet implemented for shadow paging. */
+ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
+ entry->ecx &= ~F(PKU);
++ entry->edx &= kvm_cpuid_7_0_edx_x86_features;
++ cpuid_mask(&entry->edx, CPUID_7_EDX);
+ } else {
+ entry->ebx = 0;
+ entry->ecx = 0;
++ entry->edx = 0;
+ }
+ entry->eax = 0;
+- entry->edx = 0;
+ break;
+ }
+ case 9:
+--- a/arch/x86/kvm/cpuid.h
++++ b/arch/x86/kvm/cpuid.h
+@@ -171,6 +171,14 @@ static inline bool guest_cpuid_has_ibpb(
+ return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL));
+ }
+
++static inline bool guest_cpuid_has_arch_capabilities(struct kvm_vcpu *vcpu)
++{
++ struct kvm_cpuid_entry2 *best;
++
++ best = kvm_find_cpuid_entry(vcpu, 7, 0);
++ return best && (best->edx & bit(X86_FEATURE_ARCH_CAPABILITIES));
++}
++
+
+ /*
+ * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -550,6 +550,8 @@ struct vcpu_vmx {
+ u64 msr_guest_kernel_gs_base;
+ #endif
+
++ u64 arch_capabilities;
++
+ u32 vm_entry_controls_shadow;
+ u32 vm_exit_controls_shadow;
+ /*
+@@ -2981,6 +2983,12 @@ static int vmx_get_msr(struct kvm_vcpu *
+ case MSR_IA32_TSC:
+ msr_info->data = guest_read_tsc(vcpu);
+ break;
++ case MSR_IA32_ARCH_CAPABILITIES:
++ if (!msr_info->host_initiated &&
++ !guest_cpuid_has_arch_capabilities(vcpu))
++ return 1;
++ msr_info->data = to_vmx(vcpu)->arch_capabilities;
++ break;
+ case MSR_IA32_SYSENTER_CS:
+ msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
+ break;
+@@ -3112,6 +3120,11 @@ static int vmx_set_msr(struct kvm_vcpu *
+ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
+ MSR_TYPE_W);
+ break;
++ case MSR_IA32_ARCH_CAPABILITIES:
++ if (!msr_info->host_initiated)
++ return 1;
++ vmx->arch_capabilities = data;
++ break;
+ case MSR_IA32_CR_PAT:
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
+@@ -5202,6 +5215,8 @@ static int vmx_vcpu_setup(struct vcpu_vm
+ ++vmx->nmsrs;
+ }
+
++ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
++ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
+
+ vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -975,6 +975,7 @@ static u32 msrs_to_save[] = {
+ #endif
+ MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+ MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
++ MSR_IA32_ARCH_CAPABILITIES
+ };
+
+ static unsigned num_msrs_to_save;
--- /dev/null
+From f21f165ef922c2146cc5bdc620f542953c41714b Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Thu, 11 Jan 2018 12:16:15 +0100
+Subject: KVM: VMX: introduce alloc_loaded_vmcs
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit f21f165ef922c2146cc5bdc620f542953c41714b upstream.
+
+Group together the calls to alloc_vmcs and loaded_vmcs_init. Soon we'll also
+allocate an MSR bitmap there.
+
+Cc: stable@vger.kernel.org # prereq for Spectre mitigation
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx.c | 38 +++++++++++++++++++++++---------------
+ 1 file changed, 23 insertions(+), 15 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -3524,11 +3524,6 @@ static struct vmcs *alloc_vmcs_cpu(int c
+ return vmcs;
+ }
+
+-static struct vmcs *alloc_vmcs(void)
+-{
+- return alloc_vmcs_cpu(raw_smp_processor_id());
+-}
+-
+ static void free_vmcs(struct vmcs *vmcs)
+ {
+ free_pages((unsigned long)vmcs, vmcs_config.order);
+@@ -3547,6 +3542,22 @@ static void free_loaded_vmcs(struct load
+ WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
+ }
+
++static struct vmcs *alloc_vmcs(void)
++{
++ return alloc_vmcs_cpu(raw_smp_processor_id());
++}
++
++static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
++{
++ loaded_vmcs->vmcs = alloc_vmcs();
++ if (!loaded_vmcs->vmcs)
++ return -ENOMEM;
++
++ loaded_vmcs->shadow_vmcs = NULL;
++ loaded_vmcs_init(loaded_vmcs);
++ return 0;
++}
++
+ static void free_kvm_area(void)
+ {
+ int cpu;
+@@ -6949,6 +6960,7 @@ static int handle_vmon(struct kvm_vcpu *
+ struct vmcs *shadow_vmcs;
+ const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
+ | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
++ int r;
+
+ /* The Intel VMX Instruction Reference lists a bunch of bits that
+ * are prerequisite to running VMXON, most notably cr4.VMXE must be
+@@ -6988,11 +7000,9 @@ static int handle_vmon(struct kvm_vcpu *
+ return 1;
+ }
+
+- vmx->nested.vmcs02.vmcs = alloc_vmcs();
+- vmx->nested.vmcs02.shadow_vmcs = NULL;
+- if (!vmx->nested.vmcs02.vmcs)
++ r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
++ if (r < 0)
+ goto out_vmcs02;
+- loaded_vmcs_init(&vmx->nested.vmcs02);
+
+ if (cpu_has_vmx_msr_bitmap()) {
+ vmx->nested.msr_bitmap =
+@@ -9113,17 +9123,15 @@ static struct kvm_vcpu *vmx_create_vcpu(
+ if (!vmx->guest_msrs)
+ goto free_pml;
+
+- vmx->loaded_vmcs = &vmx->vmcs01;
+- vmx->loaded_vmcs->vmcs = alloc_vmcs();
+- vmx->loaded_vmcs->shadow_vmcs = NULL;
+- if (!vmx->loaded_vmcs->vmcs)
+- goto free_msrs;
+ if (!vmm_exclusive)
+ kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
+- loaded_vmcs_init(vmx->loaded_vmcs);
++ err = alloc_loaded_vmcs(&vmx->vmcs01);
+ if (!vmm_exclusive)
+ kvm_cpu_vmxoff();
++ if (err < 0)
++ goto free_msrs;
+
++ vmx->loaded_vmcs = &vmx->vmcs01;
+ cpu = get_cpu();
+ vmx_vcpu_load(&vmx->vcpu, cpu);
+ vmx->vcpu.cpu = cpu;
--- /dev/null
+From 904e14fb7cb96401a7dc803ca2863fd5ba32ffe6 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Tue, 16 Jan 2018 16:51:18 +0100
+Subject: KVM: VMX: make MSR bitmaps per-VCPU
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 904e14fb7cb96401a7dc803ca2863fd5ba32ffe6 upstream.
+
+Place the MSR bitmap in struct loaded_vmcs, and update it in place
+every time the x2apic or APICv state can change. This is rare and
+the loop can handle 64 MSRs per iteration, in a similar fashion as
+nested_vmx_prepare_msr_bitmap.
+
+This prepares for choosing, on a per-VM basis, whether to intercept
+the SPEC_CTRL and PRED_CMD MSRs.
+
+Cc: stable@vger.kernel.org # prereq for Spectre mitigation
+Suggested-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx.c | 316 +++++++++++++++++++----------------------------------
+ 1 file changed, 115 insertions(+), 201 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -109,6 +109,14 @@ static u64 __read_mostly host_xss;
+ static bool __read_mostly enable_pml = 1;
+ module_param_named(pml, enable_pml, bool, S_IRUGO);
+
++#define MSR_TYPE_R 1
++#define MSR_TYPE_W 2
++#define MSR_TYPE_RW 3
++
++#define MSR_BITMAP_MODE_X2APIC 1
++#define MSR_BITMAP_MODE_X2APIC_APICV 2
++#define MSR_BITMAP_MODE_LM 4
++
+ #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
+
+ /* Guest_tsc -> host_tsc conversion requires 64-bit division. */
+@@ -190,6 +198,7 @@ struct loaded_vmcs {
+ struct vmcs *shadow_vmcs;
+ int cpu;
+ int launched;
++ unsigned long *msr_bitmap;
+ struct list_head loaded_vmcss_on_cpu_link;
+ };
+
+@@ -428,8 +437,6 @@ struct nested_vmx {
+ bool pi_pending;
+ u16 posted_intr_nv;
+
+- unsigned long *msr_bitmap;
+-
+ struct hrtimer preemption_timer;
+ bool preemption_timer_expired;
+
+@@ -530,6 +537,7 @@ struct vcpu_vmx {
+ unsigned long host_rsp;
+ u8 fail;
+ bool nmi_known_unmasked;
++ u8 msr_bitmap_mode;
+ u32 exit_intr_info;
+ u32 idt_vectoring_info;
+ ulong rflags;
+@@ -904,6 +912,7 @@ static u32 vmx_segment_access_rights(str
+ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
+ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
+ static int alloc_identity_pagetable(struct kvm *kvm);
++static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
+
+ static DEFINE_PER_CPU(struct vmcs *, vmxarea);
+ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
+@@ -923,12 +932,6 @@ static DEFINE_PER_CPU(spinlock_t, blocke
+
+ static unsigned long *vmx_io_bitmap_a;
+ static unsigned long *vmx_io_bitmap_b;
+-static unsigned long *vmx_msr_bitmap_legacy;
+-static unsigned long *vmx_msr_bitmap_longmode;
+-static unsigned long *vmx_msr_bitmap_legacy_x2apic;
+-static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+-static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+-static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
+ static unsigned long *vmx_vmread_bitmap;
+ static unsigned long *vmx_vmwrite_bitmap;
+
+@@ -2522,36 +2525,6 @@ static void move_msr_up(struct vcpu_vmx
+ vmx->guest_msrs[from] = tmp;
+ }
+
+-static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
+-{
+- unsigned long *msr_bitmap;
+-
+- if (is_guest_mode(vcpu))
+- msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
+- else if (cpu_has_secondary_exec_ctrls() &&
+- (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
+- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
+- if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
+- if (is_long_mode(vcpu))
+- msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
+- else
+- msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+- } else {
+- if (is_long_mode(vcpu))
+- msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
+- else
+- msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+- }
+- } else {
+- if (is_long_mode(vcpu))
+- msr_bitmap = vmx_msr_bitmap_longmode;
+- else
+- msr_bitmap = vmx_msr_bitmap_legacy;
+- }
+-
+- vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
+-}
+-
+ /*
+ * Set up the vmcs to automatically save and restore system
+ * msrs. Don't touch the 64-bit msrs if the guest is in legacy
+@@ -2592,7 +2565,7 @@ static void setup_msrs(struct vcpu_vmx *
+ vmx->save_nmsrs = save_nmsrs;
+
+ if (cpu_has_vmx_msr_bitmap())
+- vmx_set_msr_bitmap(&vmx->vcpu);
++ vmx_update_msr_bitmap(&vmx->vcpu);
+ }
+
+ /*
+@@ -3539,6 +3512,8 @@ static void free_loaded_vmcs(struct load
+ loaded_vmcs_clear(loaded_vmcs);
+ free_vmcs(loaded_vmcs->vmcs);
+ loaded_vmcs->vmcs = NULL;
++ if (loaded_vmcs->msr_bitmap)
++ free_page((unsigned long)loaded_vmcs->msr_bitmap);
+ WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
+ }
+
+@@ -3555,7 +3530,18 @@ static int alloc_loaded_vmcs(struct load
+
+ loaded_vmcs->shadow_vmcs = NULL;
+ loaded_vmcs_init(loaded_vmcs);
++
++ if (cpu_has_vmx_msr_bitmap()) {
++ loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
++ if (!loaded_vmcs->msr_bitmap)
++ goto out_vmcs;
++ memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
++ }
+ return 0;
++
++out_vmcs:
++ free_loaded_vmcs(loaded_vmcs);
++ return -ENOMEM;
+ }
+
+ static void free_kvm_area(void)
+@@ -4564,10 +4550,8 @@ static void free_vpid(int vpid)
+ spin_unlock(&vmx_vpid_lock);
+ }
+
+-#define MSR_TYPE_R 1
+-#define MSR_TYPE_W 2
+-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+- u32 msr, int type)
++static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
++ u32 msr, int type)
+ {
+ int f = sizeof(unsigned long);
+
+@@ -4601,8 +4585,8 @@ static void __vmx_disable_intercept_for_
+ }
+ }
+
+-static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+- u32 msr, int type)
++static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
++ u32 msr, int type)
+ {
+ int f = sizeof(unsigned long);
+
+@@ -4636,6 +4620,15 @@ static void __vmx_enable_intercept_for_m
+ }
+ }
+
++static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
++ u32 msr, int type, bool value)
++{
++ if (value)
++ vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
++ else
++ vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
++}
++
+ /*
+ * If a msr is allowed by L0, we should check whether it is allowed by L1.
+ * The corresponding bit will be cleared unless both of L0 and L1 allow it.
+@@ -4682,58 +4675,68 @@ static void nested_vmx_disable_intercept
+ }
+ }
+
+-static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
++static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
+ {
+- if (!longmode_only)
+- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
+- msr, MSR_TYPE_R | MSR_TYPE_W);
+- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
+- msr, MSR_TYPE_R | MSR_TYPE_W);
+-}
+-
+-static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
+-{
+- if (apicv_active) {
+- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+- msr, MSR_TYPE_R);
+- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+- msr, MSR_TYPE_R);
+- } else {
+- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+- msr, MSR_TYPE_R);
+- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+- msr, MSR_TYPE_R);
++ u8 mode = 0;
++
++ if (cpu_has_secondary_exec_ctrls() &&
++ (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
++ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
++ mode |= MSR_BITMAP_MODE_X2APIC;
++ if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
++ mode |= MSR_BITMAP_MODE_X2APIC_APICV;
+ }
++
++ if (is_long_mode(vcpu))
++ mode |= MSR_BITMAP_MODE_LM;
++
++ return mode;
+ }
+
+-static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
++#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
++
++static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
++ u8 mode)
+ {
+- if (apicv_active) {
+- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+- msr, MSR_TYPE_R);
+- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+- msr, MSR_TYPE_R);
+- } else {
+- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+- msr, MSR_TYPE_R);
+- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+- msr, MSR_TYPE_R);
++ int msr;
++
++ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
++ unsigned word = msr / BITS_PER_LONG;
++ msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
++ msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
++ }
++
++ if (mode & MSR_BITMAP_MODE_X2APIC) {
++ /*
++ * TPR reads and writes can be virtualized even if virtual interrupt
++ * delivery is not in use.
++ */
++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
++ if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
++ vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
++ }
+ }
+ }
+
+-static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active)
++static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
+ {
+- if (apicv_active) {
+- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+- msr, MSR_TYPE_W);
+- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+- msr, MSR_TYPE_W);
+- } else {
+- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+- msr, MSR_TYPE_W);
+- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+- msr, MSR_TYPE_W);
+- }
++ struct vcpu_vmx *vmx = to_vmx(vcpu);
++ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
++ u8 mode = vmx_msr_bitmap_mode(vcpu);
++ u8 changed = mode ^ vmx->msr_bitmap_mode;
++
++ if (!changed)
++ return;
++
++ vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
++ !(mode & MSR_BITMAP_MODE_LM));
++
++ if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
++ vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
++
++ vmx->msr_bitmap_mode = mode;
+ }
+
+ static bool vmx_get_enable_apicv(void)
+@@ -4982,7 +4985,7 @@ static void vmx_refresh_apicv_exec_ctrl(
+ }
+
+ if (cpu_has_vmx_msr_bitmap())
+- vmx_set_msr_bitmap(vcpu);
++ vmx_update_msr_bitmap(vcpu);
+ }
+
+ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
+@@ -5071,7 +5074,7 @@ static int vmx_vcpu_setup(struct vcpu_vm
+ vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
+ }
+ if (cpu_has_vmx_msr_bitmap())
+- vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
++ vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
+
+ vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
+
+@@ -6402,7 +6405,7 @@ static void wakeup_handler(void)
+
+ static __init int hardware_setup(void)
+ {
+- int r = -ENOMEM, i, msr;
++ int r = -ENOMEM, i;
+
+ rdmsrl_safe(MSR_EFER, &host_efer);
+
+@@ -6417,41 +6420,13 @@ static __init int hardware_setup(void)
+ if (!vmx_io_bitmap_b)
+ goto out;
+
+- vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
+- if (!vmx_msr_bitmap_legacy)
+- goto out1;
+-
+- vmx_msr_bitmap_legacy_x2apic =
+- (unsigned long *)__get_free_page(GFP_KERNEL);
+- if (!vmx_msr_bitmap_legacy_x2apic)
+- goto out2;
+-
+- vmx_msr_bitmap_legacy_x2apic_apicv_inactive =
+- (unsigned long *)__get_free_page(GFP_KERNEL);
+- if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive)
+- goto out3;
+-
+- vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
+- if (!vmx_msr_bitmap_longmode)
+- goto out4;
+-
+- vmx_msr_bitmap_longmode_x2apic =
+- (unsigned long *)__get_free_page(GFP_KERNEL);
+- if (!vmx_msr_bitmap_longmode_x2apic)
+- goto out5;
+-
+- vmx_msr_bitmap_longmode_x2apic_apicv_inactive =
+- (unsigned long *)__get_free_page(GFP_KERNEL);
+- if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive)
+- goto out6;
+-
+ vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_vmread_bitmap)
+- goto out7;
++ goto out1;
+
+ vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_vmwrite_bitmap)
+- goto out8;
++ goto out2;
+
+ memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
+ memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
+@@ -6460,12 +6435,9 @@ static __init int hardware_setup(void)
+
+ memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
+
+- memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
+- memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
+-
+ if (setup_vmcs_config(&vmcs_config) < 0) {
+ r = -EIO;
+- goto out9;
++ goto out3;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_NX))
+@@ -6522,47 +6494,8 @@ static __init int hardware_setup(void)
+ kvm_tsc_scaling_ratio_frac_bits = 48;
+ }
+
+- vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
+- vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
+- vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
+- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
+- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
+- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+-
+- memcpy(vmx_msr_bitmap_legacy_x2apic,
+- vmx_msr_bitmap_legacy, PAGE_SIZE);
+- memcpy(vmx_msr_bitmap_longmode_x2apic,
+- vmx_msr_bitmap_longmode, PAGE_SIZE);
+- memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+- vmx_msr_bitmap_legacy, PAGE_SIZE);
+- memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+- vmx_msr_bitmap_longmode, PAGE_SIZE);
+-
+ set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
+
+- /*
+- * enable_apicv && kvm_vcpu_apicv_active()
+- */
+- for (msr = 0x800; msr <= 0x8ff; msr++)
+- vmx_disable_intercept_msr_read_x2apic(msr, true);
+-
+- /* TMCCT */
+- vmx_enable_intercept_msr_read_x2apic(0x839, true);
+- /* TPR */
+- vmx_disable_intercept_msr_write_x2apic(0x808, true);
+- /* EOI */
+- vmx_disable_intercept_msr_write_x2apic(0x80b, true);
+- /* SELF-IPI */
+- vmx_disable_intercept_msr_write_x2apic(0x83f, true);
+-
+- /*
+- * (enable_apicv && !kvm_vcpu_apicv_active()) ||
+- * !enable_apicv
+- */
+- /* TPR */
+- vmx_disable_intercept_msr_read_x2apic(0x808, false);
+- vmx_disable_intercept_msr_write_x2apic(0x808, false);
+-
+ if (enable_ept) {
+ kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
+ (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
+@@ -6608,22 +6541,10 @@ static __init int hardware_setup(void)
+
+ return alloc_kvm_area();
+
+-out9:
+- free_page((unsigned long)vmx_vmwrite_bitmap);
+-out8:
+- free_page((unsigned long)vmx_vmread_bitmap);
+-out7:
+- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
+-out6:
+- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+-out5:
+- free_page((unsigned long)vmx_msr_bitmap_longmode);
+-out4:
+- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
+ out3:
+- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
++ free_page((unsigned long)vmx_vmwrite_bitmap);
+ out2:
+- free_page((unsigned long)vmx_msr_bitmap_legacy);
++ free_page((unsigned long)vmx_vmread_bitmap);
+ out1:
+ free_page((unsigned long)vmx_io_bitmap_b);
+ out:
+@@ -6634,12 +6555,6 @@ out:
+
+ static __exit void hardware_unsetup(void)
+ {
+- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
+- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
+- free_page((unsigned long)vmx_msr_bitmap_legacy);
+- free_page((unsigned long)vmx_msr_bitmap_longmode);
+ free_page((unsigned long)vmx_io_bitmap_b);
+ free_page((unsigned long)vmx_io_bitmap_a);
+ free_page((unsigned long)vmx_vmwrite_bitmap);
+@@ -7004,13 +6919,6 @@ static int handle_vmon(struct kvm_vcpu *
+ if (r < 0)
+ goto out_vmcs02;
+
+- if (cpu_has_vmx_msr_bitmap()) {
+- vmx->nested.msr_bitmap =
+- (unsigned long *)__get_free_page(GFP_KERNEL);
+- if (!vmx->nested.msr_bitmap)
+- goto out_msr_bitmap;
+- }
+-
+ vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+ if (!vmx->nested.cached_vmcs12)
+ goto out_cached_vmcs12;
+@@ -7040,9 +6948,6 @@ out_shadow_vmcs:
+ kfree(vmx->nested.cached_vmcs12);
+
+ out_cached_vmcs12:
+- free_page((unsigned long)vmx->nested.msr_bitmap);
+-
+-out_msr_bitmap:
+ free_loaded_vmcs(&vmx->nested.vmcs02);
+
+ out_vmcs02:
+@@ -7121,10 +7026,6 @@ static void free_nested(struct vcpu_vmx
+ vmx->nested.vmxon = false;
+ free_vpid(vmx->nested.vpid02);
+ nested_release_vmcs12(vmx);
+- if (vmx->nested.msr_bitmap) {
+- free_page((unsigned long)vmx->nested.msr_bitmap);
+- vmx->nested.msr_bitmap = NULL;
+- }
+ if (enable_shadow_vmcs) {
+ vmcs_clear(vmx->vmcs01.shadow_vmcs);
+ free_vmcs(vmx->vmcs01.shadow_vmcs);
+@@ -8471,7 +8372,7 @@ static void vmx_set_virtual_x2apic_mode(
+ }
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
+
+- vmx_set_msr_bitmap(vcpu);
++ vmx_update_msr_bitmap(vcpu);
+ }
+
+ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
+@@ -9091,6 +8992,7 @@ static struct kvm_vcpu *vmx_create_vcpu(
+ {
+ int err;
+ struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
++ unsigned long *msr_bitmap;
+ int cpu;
+
+ if (!vmx)
+@@ -9131,6 +9033,15 @@ static struct kvm_vcpu *vmx_create_vcpu(
+ if (err < 0)
+ goto free_msrs;
+
++ msr_bitmap = vmx->vmcs01.msr_bitmap;
++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
++ vmx->msr_bitmap_mode = 0;
++
+ vmx->loaded_vmcs = &vmx->vmcs01;
+ cpu = get_cpu();
+ vmx_vcpu_load(&vmx->vcpu, cpu);
+@@ -9525,7 +9436,7 @@ static inline bool nested_vmx_merge_msr_
+ int msr;
+ struct page *page;
+ unsigned long *msr_bitmap_l1;
+- unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
++ unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
+
+ /* This shortcut is ok because we support only x2APIC MSRs so far. */
+ if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
+@@ -10045,6 +9956,9 @@ static void prepare_vmcs02(struct kvm_vc
+ if (kvm_has_tsc_control)
+ decache_tsc_multiplier(vmx);
+
++ if (cpu_has_vmx_msr_bitmap())
++ vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
++
+ if (enable_vpid) {
+ /*
+ * There is no direct mapping between vpid02 and vpid12, the
+@@ -10749,7 +10663,7 @@ static void load_vmcs12_host_state(struc
+ vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+ if (cpu_has_vmx_msr_bitmap())
+- vmx_set_msr_bitmap(vcpu);
++ vmx_update_msr_bitmap(vcpu);
+
+ if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
+ vmcs12->vm_exit_msr_load_count))
--- /dev/null
+From 15d45071523d89b3fb7372e2135fbd72f6af9506 Mon Sep 17 00:00:00 2001
+From: Ashok Raj <ashok.raj@intel.com>
+Date: Thu, 1 Feb 2018 22:59:43 +0100
+Subject: KVM/x86: Add IBPB support
+
+From: Ashok Raj <ashok.raj@intel.com>
+
+commit 15d45071523d89b3fb7372e2135fbd72f6af9506 upstream.
+
+The Indirect Branch Predictor Barrier (IBPB) is an indirect branch
+control mechanism. It keeps earlier branches from influencing
+later ones.
+
+Unlike IBRS and STIBP, IBPB does not define a new mode of operation.
+It's a command that ensures predicted branch targets aren't used after
+the barrier. Although IBRS and IBPB are enumerated by the same CPUID
+enumeration, IBPB is very different.
+
+IBPB helps mitigate against three potential attacks:
+
+* Mitigate guests from being attacked by other guests.
+ - This is addressed by issuing IBPB when we do a guest switch.
+
+* Mitigate attacks from guest/ring3->host/ring3.
+ These would require an IBPB during context switch in host, or after
+ VMEXIT. The host process has two ways to mitigate
+ - Either it can be compiled with retpoline
+ - If it's going through context switch, and has set !dumpable then
+ there is an IBPB in that path.
+ (Tim's patch: https://patchwork.kernel.org/patch/10192871)
+ - The case where after a VMEXIT you return back to Qemu might make
+ Qemu attackable from guest when Qemu isn't compiled with retpoline.
+ There are issues reported when doing IBPB on every VMEXIT that resulted
+ in some tsc calibration woes in guest.
+
+* Mitigate guest/ring0->host/ring0 attacks.
+ When host kernel is using retpoline it is safe against these attacks.
+ If host kernel isn't using retpoline we might need to do an IBPB flush on
+ every VMEXIT.
+
+Even when using retpoline for indirect calls, in certain conditions 'ret'
+can use the BTB on Skylake-era CPUs. There are other mitigations
+available like RSB stuffing/clearing.
+
+* IBPB is issued only for SVM during svm_free_vcpu().
+ VMX has a vmclear and SVM doesn't. Follow discussion here:
+ https://lkml.org/lkml/2018/1/15/146
+
+Please refer to the following spec for more details on the enumeration
+and control.
+
+Refer here to get documentation about mitigations.
+
+https://software.intel.com/en-us/side-channel-security-support
+
+[peterz: rebase and changelog rewrite]
+[karahmed: - rebase
+ - vmx: expose PRED_CMD if guest has it in CPUID
+ - svm: only pass through IBPB if guest has it in CPUID
+ - vmx: support !cpu_has_vmx_msr_bitmap()
+ - vmx: support nested]
+[dwmw2: Expose CPUID bit too (AMD IBPB only for now as we lack IBRS)
+ PRED_CMD is a write-only MSR]
+
+Signed-off-by: Ashok Raj <ashok.raj@intel.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Cc: kvm@vger.kernel.org
+Cc: Asit Mallick <asit.k.mallick@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: Jun Nakajima <jun.nakajima@intel.com>
+Cc: Paolo Bonzini <pbonzini@redhat.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Tim Chen <tim.c.chen@linux.intel.com>
+Link: http://lkml.kernel.org/r/1515720739-43819-6-git-send-email-ashok.raj@intel.com
+Link: https://lkml.kernel.org/r/1517522386-18410-3-git-send-email-karahmed@amazon.de
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/cpuid.c | 11 ++++++-
+ arch/x86/kvm/cpuid.h | 12 +++++++
+ arch/x86/kvm/svm.c | 28 ++++++++++++++++++
+ arch/x86/kvm/vmx.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++--
+ 4 files changed, 127 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -355,6 +355,10 @@ static inline int __do_cpuid_ent(struct
+ F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
+ 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
+
++ /* cpuid 0x80000008.ebx */
++ const u32 kvm_cpuid_8000_0008_ebx_x86_features =
++ F(IBPB);
++
+ /* cpuid 0xC0000001.edx */
+ const u32 kvm_cpuid_C000_0001_edx_x86_features =
+ F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
+@@ -607,7 +611,12 @@ static inline int __do_cpuid_ent(struct
+ if (!g_phys_as)
+ g_phys_as = phys_as;
+ entry->eax = g_phys_as | (virt_as << 8);
+- entry->ebx = entry->edx = 0;
++ entry->edx = 0;
++ /* IBPB isn't necessarily present in hardware cpuid */
++ if (boot_cpu_has(X86_FEATURE_IBPB))
++ entry->ebx |= F(IBPB);
++ entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
++ cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
+ break;
+ }
+ case 0x80000019:
+--- a/arch/x86/kvm/cpuid.h
++++ b/arch/x86/kvm/cpuid.h
+@@ -160,6 +160,18 @@ static inline bool guest_cpuid_has_rdtsc
+ return best && (best->edx & bit(X86_FEATURE_RDTSCP));
+ }
+
++static inline bool guest_cpuid_has_ibpb(struct kvm_vcpu *vcpu)
++{
++ struct kvm_cpuid_entry2 *best;
++
++ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
++ if (best && (best->ebx & bit(X86_FEATURE_IBPB)))
++ return true;
++ best = kvm_find_cpuid_entry(vcpu, 7, 0);
++ return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL));
++}
++
++
+ /*
+ * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3
+ */
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -248,6 +248,7 @@ static const struct svm_direct_access_ms
+ { .index = MSR_CSTAR, .always = true },
+ { .index = MSR_SYSCALL_MASK, .always = true },
+ #endif
++ { .index = MSR_IA32_PRED_CMD, .always = false },
+ { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
+ { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
+ { .index = MSR_IA32_LASTINTFROMIP, .always = false },
+@@ -510,6 +511,7 @@ struct svm_cpu_data {
+ struct kvm_ldttss_desc *tss_desc;
+
+ struct page *save_area;
++ struct vmcb *current_vmcb;
+ };
+
+ static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
+@@ -1644,11 +1646,17 @@ static void svm_free_vcpu(struct kvm_vcp
+ __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
+ kvm_vcpu_uninit(vcpu);
+ kmem_cache_free(kvm_vcpu_cache, svm);
++ /*
++ * The vmcb page can be recycled, causing a false negative in
++ * svm_vcpu_load(). So do a full IBPB now.
++ */
++ indirect_branch_prediction_barrier();
+ }
+
+ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+ {
+ struct vcpu_svm *svm = to_svm(vcpu);
++ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ int i;
+
+ if (unlikely(cpu != vcpu->cpu)) {
+@@ -1677,6 +1685,10 @@ static void svm_vcpu_load(struct kvm_vcp
+ if (static_cpu_has(X86_FEATURE_RDTSCP))
+ wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+
++ if (sd->current_vmcb != svm->vmcb) {
++ sd->current_vmcb = svm->vmcb;
++ indirect_branch_prediction_barrier();
++ }
+ avic_vcpu_load(vcpu, cpu);
+ }
+
+@@ -3599,6 +3611,22 @@ static int svm_set_msr(struct kvm_vcpu *
+ case MSR_IA32_TSC:
+ kvm_write_tsc(vcpu, msr);
+ break;
++ case MSR_IA32_PRED_CMD:
++ if (!msr->host_initiated &&
++ !guest_cpuid_has_ibpb(vcpu))
++ return 1;
++
++ if (data & ~PRED_CMD_IBPB)
++ return 1;
++
++ if (!data)
++ break;
++
++ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
++ if (is_guest_mode(vcpu))
++ break;
++ set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
++ break;
+ case MSR_STAR:
+ svm->vmcb->save.star = data;
+ break;
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -549,6 +549,7 @@ struct vcpu_vmx {
+ u64 msr_host_kernel_gs_base;
+ u64 msr_guest_kernel_gs_base;
+ #endif
++
+ u32 vm_entry_controls_shadow;
+ u32 vm_exit_controls_shadow;
+ /*
+@@ -913,6 +914,8 @@ static void copy_vmcs12_to_shadow(struct
+ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
+ static int alloc_identity_pagetable(struct kvm *kvm);
+ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
++static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
++ u32 msr, int type);
+
+ static DEFINE_PER_CPU(struct vmcs *, vmxarea);
+ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
+@@ -1848,6 +1851,29 @@ static void update_exception_bitmap(stru
+ vmcs_write32(EXCEPTION_BITMAP, eb);
+ }
+
++/* Check if writes to @msr are intercepted in the L01 (vmcs01) MSR bitmap;
++ * also true when MSR bitmaps are unsupported or @msr is in neither range.
++ */
++static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
++{
++ unsigned long *msr_bitmap;
++ int f = sizeof(unsigned long);
+
++ if (!cpu_has_vmx_msr_bitmap())
++ return true;
+
++ msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
+
++ if (msr <= 0x1fff) {
++ return !!test_bit(msr, msr_bitmap + 0x800 / f);
++ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
++ msr &= 0x1fff;
++ return !!test_bit(msr, msr_bitmap + 0xc00 / f);
++ }
+
++ return true;
++}
++
+ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+ unsigned long entry, unsigned long exit)
+ {
+@@ -2257,6 +2283,7 @@ static void vmx_vcpu_load(struct kvm_vcp
+ if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+ per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+ vmcs_load(vmx->loaded_vmcs->vmcs);
++ indirect_branch_prediction_barrier();
+ }
+
+ if (!already_loaded) {
+@@ -3058,6 +3085,33 @@ static int vmx_set_msr(struct kvm_vcpu *
+ case MSR_IA32_TSC:
+ kvm_write_tsc(vcpu, msr_info);
+ break;
++ case MSR_IA32_PRED_CMD:
++ if (!msr_info->host_initiated &&
++ !guest_cpuid_has_ibpb(vcpu))
++ return 1;
++
++ if (data & ~PRED_CMD_IBPB)
++ return 1;
++
++ if (!data)
++ break;
++
++ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
++
++ /*
++ * For non-nested:
++ * When it's written (to non-zero) for the first time, pass
++ * it through.
++ *
++ * For nested:
++ * The handling of the MSR bitmap for L2 guests is done in
++ * nested_vmx_merge_msr_bitmap. We should not touch the
++ * vmcs02.msr_bitmap here since it gets completely overwritten
++ * in the merging.
++ */
++ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
++ MSR_TYPE_W);
++ break;
+ case MSR_IA32_CR_PAT:
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
+@@ -9437,9 +9491,23 @@ static inline bool nested_vmx_merge_msr_
+ struct page *page;
+ unsigned long *msr_bitmap_l1;
+ unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
++ /*
++ * pred_cmd is true when L0 passes PRED_CMD writes through to L1, i.e.
++ * the MSR is NOT write-intercepted in the L01 bitmap. It verifies:
++ * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
++ * ensures that we do not accidentally generate an L02 MSR bitmap
++ * from the L12 MSR bitmap that is too permissive.
++ * 2. That L1 or L2s have actually used the MSR. This avoids
++ * unnecessary merging of the bitmap if the MSR is unused. This
++ * works properly because we only update the L01 MSR bitmap lazily.
++ * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
++ * updated to reflect this when L1 (or its L2s) actually write to
++ * the MSR.
++ */
++ bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
+
+- /* This shortcut is ok because we support only x2APIC MSRs so far. */
+- if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
++ if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
++ !pred_cmd)
+ return false;
+
+ page = nested_get_page(vcpu, vmcs12->msr_bitmap);
+@@ -9477,6 +9545,13 @@ static inline bool nested_vmx_merge_msr_
+ MSR_TYPE_W);
+ }
+ }
++
++ if (pred_cmd)
++ nested_vmx_disable_intercept_for_msr(
++ msr_bitmap_l1, msr_bitmap_l0,
++ MSR_IA32_PRED_CMD,
++ MSR_TYPE_W);
++
+ kunmap(page);
+ nested_release_page_clean(page);
+
x86-pti-make-unpoison-of-pgd-for-trusted-boot-work-for-real.patch
kaiser-allocate-pgd-with-order-0-when-pti-off.patch
serial-core-mark-port-as-initialized-after-successful-irq-change.patch
+kvm-nvmx-vmx_complete_nested_posted_interrupt-can-t-fail.patch
+kvm-nvmx-mark-vmcs12-pages-dirty-on-l2-exit.patch
+kvm-nvmx-eliminate-vmcs02-pool.patch
+kvm-vmx-introduce-alloc_loaded_vmcs.patch
+kvm-vmx-make-msr-bitmaps-per-vcpu.patch
+kvm-x86-add-ibpb-support.patch
+kvm-vmx-emulate-msr_ia32_arch_capabilities.patch
+kvm-vmx-allow-direct-access-to-msr_ia32_spec_ctrl.patch
+kvm-svm-allow-direct-access-to-msr_ia32_spec_ctrl.patch