git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.9-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 6 Feb 2018 20:40:33 +0000 (12:40 -0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 6 Feb 2018 20:40:33 +0000 (12:40 -0800)
added patches:
kvm-nvmx-eliminate-vmcs02-pool.patch
kvm-nvmx-mark-vmcs12-pages-dirty-on-l2-exit.patch
kvm-nvmx-vmx_complete_nested_posted_interrupt-can-t-fail.patch
kvm-svm-allow-direct-access-to-msr_ia32_spec_ctrl.patch
kvm-vmx-allow-direct-access-to-msr_ia32_spec_ctrl.patch
kvm-vmx-emulate-msr_ia32_arch_capabilities.patch
kvm-vmx-introduce-alloc_loaded_vmcs.patch
kvm-vmx-make-msr-bitmaps-per-vcpu.patch
kvm-x86-add-ibpb-support.patch

queue-4.9/kvm-nvmx-eliminate-vmcs02-pool.patch [new file with mode: 0644]
queue-4.9/kvm-nvmx-mark-vmcs12-pages-dirty-on-l2-exit.patch [new file with mode: 0644]
queue-4.9/kvm-nvmx-vmx_complete_nested_posted_interrupt-can-t-fail.patch [new file with mode: 0644]
queue-4.9/kvm-svm-allow-direct-access-to-msr_ia32_spec_ctrl.patch [new file with mode: 0644]
queue-4.9/kvm-vmx-allow-direct-access-to-msr_ia32_spec_ctrl.patch [new file with mode: 0644]
queue-4.9/kvm-vmx-emulate-msr_ia32_arch_capabilities.patch [new file with mode: 0644]
queue-4.9/kvm-vmx-introduce-alloc_loaded_vmcs.patch [new file with mode: 0644]
queue-4.9/kvm-vmx-make-msr-bitmaps-per-vcpu.patch [new file with mode: 0644]
queue-4.9/kvm-x86-add-ibpb-support.patch [new file with mode: 0644]
queue-4.9/series

diff --git a/queue-4.9/kvm-nvmx-eliminate-vmcs02-pool.patch b/queue-4.9/kvm-nvmx-eliminate-vmcs02-pool.patch
new file mode 100644 (file)
index 0000000..8a99c2a
--- /dev/null
@@ -0,0 +1,292 @@
+From de3a0021a60635de96aa92713c1a31a96747d72c Mon Sep 17 00:00:00 2001
+From: Jim Mattson <jmattson@google.com>
+Date: Mon, 27 Nov 2017 17:22:25 -0600
+Subject: KVM: nVMX: Eliminate vmcs02 pool
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Jim Mattson <jmattson@google.com>
+
+commit de3a0021a60635de96aa92713c1a31a96747d72c upstream.
+
+The potential performance advantages of a vmcs02 pool have never been
+realized. To simplify the code, eliminate the pool. Instead, a single
+vmcs02 is allocated per VCPU when the VCPU enters VMX operation.
+
+Cc: stable@vger.kernel.org       # prereq for Spectre mitigation
+Signed-off-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Mark Kanda <mark.kanda@oracle.com>
+Reviewed-by: Ameya More <ameya.more@oracle.com>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx.c |  146 ++++++++---------------------------------------------
+ 1 file changed, 23 insertions(+), 123 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -173,7 +173,6 @@ module_param(ple_window_max, int, S_IRUG
+ extern const ulong vmx_return;
+ #define NR_AUTOLOAD_MSRS 8
+-#define VMCS02_POOL_SIZE 1
+ struct vmcs {
+       u32 revision_id;
+@@ -207,7 +206,7 @@ struct shared_msr_entry {
+  * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
+  * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
+  * More than one of these structures may exist, if L1 runs multiple L2 guests.
+- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
++ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
+  * underlying hardware which will be used to run L2.
+  * This structure is packed to ensure that its layout is identical across
+  * machines (necessary for live migration).
+@@ -386,13 +385,6 @@ struct __packed vmcs12 {
+  */
+ #define VMCS12_SIZE 0x1000
+-/* Used to remember the last vmcs02 used for some recently used vmcs12s */
+-struct vmcs02_list {
+-      struct list_head list;
+-      gpa_t vmptr;
+-      struct loaded_vmcs vmcs02;
+-};
+-
+ /*
+  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
+  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
+@@ -419,15 +411,15 @@ struct nested_vmx {
+        */
+       bool sync_shadow_vmcs;
+-      /* vmcs02_list cache of VMCSs recently used to run L2 guests */
+-      struct list_head vmcs02_pool;
+-      int vmcs02_num;
+       bool change_vmcs01_virtual_x2apic_mode;
+       /* L2 must run next, and mustn't decide to exit to L1. */
+       bool nested_run_pending;
++
++      struct loaded_vmcs vmcs02;
++
+       /*
+-       * Guest pages referred to in vmcs02 with host-physical pointers, so
+-       * we must keep them pinned while L2 runs.
++       * Guest pages referred to in the vmcs02 with host-physical
++       * pointers, so we must keep them pinned while L2 runs.
+        */
+       struct page *apic_access_page;
+       struct page *virtual_apic_page;
+@@ -6684,94 +6676,6 @@ static int handle_monitor(struct kvm_vcp
+ }
+ /*
+- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
+- * We could reuse a single VMCS for all the L2 guests, but we also want the
+- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
+- * allows keeping them loaded on the processor, and in the future will allow
+- * optimizations where prepare_vmcs02 doesn't need to set all the fields on
+- * every entry if they never change.
+- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
+- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
+- *
+- * The following functions allocate and free a vmcs02 in this pool.
+- */
+-
+-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
+-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
+-{
+-      struct vmcs02_list *item;
+-      list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
+-              if (item->vmptr == vmx->nested.current_vmptr) {
+-                      list_move(&item->list, &vmx->nested.vmcs02_pool);
+-                      return &item->vmcs02;
+-              }
+-
+-      if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
+-              /* Recycle the least recently used VMCS. */
+-              item = list_last_entry(&vmx->nested.vmcs02_pool,
+-                                     struct vmcs02_list, list);
+-              item->vmptr = vmx->nested.current_vmptr;
+-              list_move(&item->list, &vmx->nested.vmcs02_pool);
+-              return &item->vmcs02;
+-      }
+-
+-      /* Create a new VMCS */
+-      item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
+-      if (!item)
+-              return NULL;
+-      item->vmcs02.vmcs = alloc_vmcs();
+-      item->vmcs02.shadow_vmcs = NULL;
+-      if (!item->vmcs02.vmcs) {
+-              kfree(item);
+-              return NULL;
+-      }
+-      loaded_vmcs_init(&item->vmcs02);
+-      item->vmptr = vmx->nested.current_vmptr;
+-      list_add(&(item->list), &(vmx->nested.vmcs02_pool));
+-      vmx->nested.vmcs02_num++;
+-      return &item->vmcs02;
+-}
+-
+-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
+-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
+-{
+-      struct vmcs02_list *item;
+-      list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
+-              if (item->vmptr == vmptr) {
+-                      free_loaded_vmcs(&item->vmcs02);
+-                      list_del(&item->list);
+-                      kfree(item);
+-                      vmx->nested.vmcs02_num--;
+-                      return;
+-              }
+-}
+-
+-/*
+- * Free all VMCSs saved for this vcpu, except the one pointed by
+- * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
+- * must be &vmx->vmcs01.
+- */
+-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
+-{
+-      struct vmcs02_list *item, *n;
+-
+-      WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
+-      list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
+-              /*
+-               * Something will leak if the above WARN triggers.  Better than
+-               * a use-after-free.
+-               */
+-              if (vmx->loaded_vmcs == &item->vmcs02)
+-                      continue;
+-
+-              free_loaded_vmcs(&item->vmcs02);
+-              list_del(&item->list);
+-              kfree(item);
+-              vmx->nested.vmcs02_num--;
+-      }
+-}
+-
+-/*
+  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
+  * set the success or error code of an emulated VMX instruction, as specified
+  * by Vol 2B, VMX Instruction Reference, "Conventions".
+@@ -7084,6 +6988,12 @@ static int handle_vmon(struct kvm_vcpu *
+               return 1;
+       }
++      vmx->nested.vmcs02.vmcs = alloc_vmcs();
++      vmx->nested.vmcs02.shadow_vmcs = NULL;
++      if (!vmx->nested.vmcs02.vmcs)
++              goto out_vmcs02;
++      loaded_vmcs_init(&vmx->nested.vmcs02);
++
+       if (cpu_has_vmx_msr_bitmap()) {
+               vmx->nested.msr_bitmap =
+                               (unsigned long *)__get_free_page(GFP_KERNEL);
+@@ -7106,9 +7016,6 @@ static int handle_vmon(struct kvm_vcpu *
+               vmx->vmcs01.shadow_vmcs = shadow_vmcs;
+       }
+-      INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
+-      vmx->nested.vmcs02_num = 0;
+-
+       hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
+                    HRTIMER_MODE_REL_PINNED);
+       vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
+@@ -7126,6 +7033,9 @@ out_cached_vmcs12:
+       free_page((unsigned long)vmx->nested.msr_bitmap);
+ out_msr_bitmap:
++      free_loaded_vmcs(&vmx->nested.vmcs02);
++
++out_vmcs02:
+       return -ENOMEM;
+ }
+@@ -7211,7 +7121,7 @@ static void free_nested(struct vcpu_vmx
+               vmx->vmcs01.shadow_vmcs = NULL;
+       }
+       kfree(vmx->nested.cached_vmcs12);
+-      /* Unpin physical memory we referred to in current vmcs02 */
++      /* Unpin physical memory we referred to in the vmcs02 */
+       if (vmx->nested.apic_access_page) {
+               nested_release_page(vmx->nested.apic_access_page);
+               vmx->nested.apic_access_page = NULL;
+@@ -7227,7 +7137,7 @@ static void free_nested(struct vcpu_vmx
+               vmx->nested.pi_desc = NULL;
+       }
+-      nested_free_all_saved_vmcss(vmx);
++      free_loaded_vmcs(&vmx->nested.vmcs02);
+ }
+ /* Emulate the VMXOFF instruction */
+@@ -7261,8 +7171,6 @@ static int handle_vmclear(struct kvm_vcp
+                       vmptr + offsetof(struct vmcs12, launch_state),
+                       &zero, sizeof(zero));
+-      nested_free_vmcs02(vmx, vmptr);
+-
+       skip_emulated_instruction(vcpu);
+       nested_vmx_succeed(vcpu);
+       return 1;
+@@ -8051,10 +7959,11 @@ static bool nested_vmx_exit_handled(stru
+       /*
+        * The host physical addresses of some pages of guest memory
+-       * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
+-       * may write to these pages via their host physical address while
+-       * L2 is running, bypassing any address-translation-based dirty
+-       * tracking (e.g. EPT write protection).
++       * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
++       * Page). The CPU may write to these pages via their host
++       * physical address while L2 is running, bypassing any
++       * address-translation-based dirty tracking (e.g. EPT write
++       * protection).
+        *
+        * Mark them dirty on every exit from L2 to prevent them from
+        * getting out of sync with dirty tracking.
+@@ -10223,7 +10132,6 @@ static int nested_vmx_run(struct kvm_vcp
+       struct vmcs12 *vmcs12;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       int cpu;
+-      struct loaded_vmcs *vmcs02;
+       bool ia32e;
+       u32 msr_entry_idx;
+@@ -10363,17 +10271,13 @@ static int nested_vmx_run(struct kvm_vcp
+        * the nested entry.
+        */
+-      vmcs02 = nested_get_current_vmcs02(vmx);
+-      if (!vmcs02)
+-              return -ENOMEM;
+-
+       enter_guest_mode(vcpu);
+       if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+               vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+       cpu = get_cpu();
+-      vmx->loaded_vmcs = vmcs02;
++      vmx->loaded_vmcs = &vmx->nested.vmcs02;
+       vmx_vcpu_put(vcpu);
+       vmx_vcpu_load(vcpu, cpu);
+       vcpu->cpu = cpu;
+@@ -10888,10 +10792,6 @@ static void nested_vmx_vmexit(struct kvm
+       vm_exit_controls_reset_shadow(vmx);
+       vmx_segment_cache_clear(vmx);
+-      /* if no vmcs02 cache requested, remove the one we used */
+-      if (VMCS02_POOL_SIZE == 0)
+-              nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
+-
+       load_vmcs12_host_state(vcpu, vmcs12);
+       /* Update any VMCS fields that might have changed while L2 ran */
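
[Editorial note, not part of the queued patch: a minimal user-space C model of the simplification made above — instead of an LRU pool keyed by vmptr, each vCPU owns exactly one vmcs02, allocated when the vCPU enters VMX operation and freed when nested state is torn down. All toy_* names are hypothetical; this is a sketch of the lifecycle, not kernel code.]

/*
 * Toy model of the post-patch lifecycle: one vmcs02 per vCPU,
 * allocated at VMXON, reused for every nested entry, freed once.
 * Hypothetical names; not kernel code.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_vmcs { int launched; };

struct toy_nested {
	struct toy_vmcs *vmcs02;	/* single instance, no pool */
};

static int toy_handle_vmon(struct toy_nested *n)
{
	n->vmcs02 = calloc(1, sizeof(*n->vmcs02));
	return n->vmcs02 ? 0 : -1;	/* -ENOMEM analogue */
}

static struct toy_vmcs *toy_nested_run(struct toy_nested *n)
{
	return n->vmcs02;		/* no lookup, no allocation on entry */
}

static void toy_free_nested(struct toy_nested *n)
{
	free(n->vmcs02);
	n->vmcs02 = NULL;
}

int main(void)
{
	struct toy_nested n = { 0 };

	if (toy_handle_vmon(&n))
		return 1;
	for (int i = 0; i < 3; i++)
		printf("entry %d uses vmcs02 %p\n", i, (void *)toy_nested_run(&n));
	toy_free_nested(&n);
	return 0;
}
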
diff --git a/queue-4.9/kvm-nvmx-mark-vmcs12-pages-dirty-on-l2-exit.patch b/queue-4.9/kvm-nvmx-mark-vmcs12-pages-dirty-on-l2-exit.patch
new file mode 100644 (file)
index 0000000..9361ea0
--- /dev/null
@@ -0,0 +1,116 @@
+From c9f04407f2e0b3fc9ff7913c65fcfcb0a4b61570 Mon Sep 17 00:00:00 2001
+From: David Matlack <dmatlack@google.com>
+Date: Tue, 1 Aug 2017 14:00:40 -0700
+Subject: KVM: nVMX: mark vmcs12 pages dirty on L2 exit
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: David Matlack <dmatlack@google.com>
+
+commit c9f04407f2e0b3fc9ff7913c65fcfcb0a4b61570 upstream.
+
+The host physical addresses of L1's Virtual APIC Page and Posted
+Interrupt descriptor are loaded into the VMCS02. The CPU may write
+to these pages via their host physical address while L2 is running,
+bypassing address-translation-based dirty tracking (e.g. EPT write
+protection). Mark them dirty on every exit from L2 to prevent them
+from getting out of sync with dirty tracking.
+
+Also mark the virtual APIC page and the posted interrupt descriptor
+dirty when KVM is virtualizing posted interrupt processing.
+
+Signed-off-by: David Matlack <dmatlack@google.com>
+Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx.c |   53 +++++++++++++++++++++++++++++++++++++++++++----------
+ 1 file changed, 43 insertions(+), 10 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -4738,6 +4738,28 @@ static bool vmx_get_enable_apicv(void)
+       return enable_apicv;
+ }
++static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
++{
++      struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
++      gfn_t gfn;
++
++      /*
++       * Don't need to mark the APIC access page dirty; it is never
++       * written to by the CPU during APIC virtualization.
++       */
++
++      if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
++              gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
++              kvm_vcpu_mark_page_dirty(vcpu, gfn);
++      }
++
++      if (nested_cpu_has_posted_intr(vmcs12)) {
++              gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
++              kvm_vcpu_mark_page_dirty(vcpu, gfn);
++      }
++}
++
++
+ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+@@ -4745,18 +4767,15 @@ static void vmx_complete_nested_posted_i
+       void *vapic_page;
+       u16 status;
+-      if (vmx->nested.pi_desc &&
+-          vmx->nested.pi_pending) {
+-              vmx->nested.pi_pending = false;
+-              if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+-                      return;
+-
+-              max_irr = find_last_bit(
+-                      (unsigned long *)vmx->nested.pi_desc->pir, 256);
++      if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
++              return;
+-              if (max_irr == 256)
+-                      return;
++      vmx->nested.pi_pending = false;
++      if (!pi_test_and_clear_on(vmx->nested.pi_desc))
++              return;
++      max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
++      if (max_irr != 256) {
+               vapic_page = kmap(vmx->nested.virtual_apic_page);
+               if (!vapic_page) {
+                       WARN_ON(1);
+@@ -4772,6 +4791,8 @@ static void vmx_complete_nested_posted_i
+                       vmcs_write16(GUEST_INTR_STATUS, status);
+               }
+       }
++
++      nested_mark_vmcs12_pages_dirty(vcpu);
+ }
+ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
+@@ -8028,6 +8049,18 @@ static bool nested_vmx_exit_handled(stru
+                               vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+                               KVM_ISA_VMX);
++      /*
++       * The host physical addresses of some pages of guest memory
++       * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
++       * may write to these pages via their host physical address while
++       * L2 is running, bypassing any address-translation-based dirty
++       * tracking (e.g. EPT write protection).
++       *
++       * Mark them dirty on every exit from L2 to prevent them from
++       * getting out of sync with dirty tracking.
++       */
++      nested_mark_vmcs12_pages_dirty(vcpu);
++
+       if (vmx->nested.nested_run_pending)
+               return false;
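
[Editorial note, not part of the queued patch: a standalone sketch of the address arithmetic used by nested_mark_vmcs12_pages_dirty() above — a guest-physical address becomes a guest frame number by shifting out the page offset (4 KiB pages, PAGE_SHIFT == 12). kvm_vcpu_mark_page_dirty() is KVM-internal, so it is stubbed here; the example address is made up.]

/*
 * Sketch: guest-physical address -> guest frame number, as done for the
 * virtual APIC page and posted-interrupt descriptor in the patch above.
 * mark_page_dirty() stands in for KVM's kvm_vcpu_mark_page_dirty().
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
typedef uint64_t gpa_t;
typedef uint64_t gfn_t;

static void mark_page_dirty(gfn_t gfn)
{
	printf("mark gfn 0x%llx dirty\n", (unsigned long long)gfn);
}

int main(void)
{
	gpa_t virtual_apic_page_addr = 0x12345000;	/* hypothetical vmcs12 field value */

	gfn_t gfn = virtual_apic_page_addr >> PAGE_SHIFT;
	mark_page_dirty(gfn);				/* prints gfn 0x12345 */
	return 0;
}
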
diff --git a/queue-4.9/kvm-nvmx-vmx_complete_nested_posted_interrupt-can-t-fail.patch b/queue-4.9/kvm-nvmx-vmx_complete_nested_posted_interrupt-can-t-fail.patch
new file mode 100644 (file)
index 0000000..9c5498f
--- /dev/null
@@ -0,0 +1,65 @@
+From 6342c50ad12e8ce0736e722184a7dbdea4a3477f Mon Sep 17 00:00:00 2001
+From: David Hildenbrand <david@redhat.com>
+Date: Wed, 25 Jan 2017 11:58:58 +0100
+Subject: KVM: nVMX: vmx_complete_nested_posted_interrupt() can't fail
+
+From: David Hildenbrand <david@redhat.com>
+
+commit 6342c50ad12e8ce0736e722184a7dbdea4a3477f upstream.
+
+vmx_complete_nested_posted_interrupt() can't fail, let's turn it into
+a void function.
+
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx.c |   10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -4738,7 +4738,7 @@ static bool vmx_get_enable_apicv(void)
+       return enable_apicv;
+ }
+-static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
++static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       int max_irr;
+@@ -4749,13 +4749,13 @@ static int vmx_complete_nested_posted_in
+           vmx->nested.pi_pending) {
+               vmx->nested.pi_pending = false;
+               if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+-                      return 0;
++                      return;
+               max_irr = find_last_bit(
+                       (unsigned long *)vmx->nested.pi_desc->pir, 256);
+               if (max_irr == 256)
+-                      return 0;
++                      return;
+               vapic_page = kmap(vmx->nested.virtual_apic_page);
+               if (!vapic_page) {
+@@ -4772,7 +4772,6 @@ static int vmx_complete_nested_posted_in
+                       vmcs_write16(GUEST_INTR_STATUS, status);
+               }
+       }
+-      return 0;
+ }
+ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
+@@ -10493,7 +10492,8 @@ static int vmx_check_nested_events(struc
+               return 0;
+       }
+-      return vmx_complete_nested_posted_interrupt(vcpu);
++      vmx_complete_nested_posted_interrupt(vcpu);
++      return 0;
+ }
+ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
diff --git a/queue-4.9/kvm-svm-allow-direct-access-to-msr_ia32_spec_ctrl.patch b/queue-4.9/kvm-svm-allow-direct-access-to-msr_ia32_spec_ctrl.patch
new file mode 100644 (file)
index 0000000..b81a525
--- /dev/null
@@ -0,0 +1,189 @@
+From b2ac58f90540e39324e7a29a7ad471407ae0bf48 Mon Sep 17 00:00:00 2001
+From: KarimAllah Ahmed <karahmed@amazon.de>
+Date: Sat, 3 Feb 2018 15:56:23 +0100
+Subject: KVM/SVM: Allow direct access to MSR_IA32_SPEC_CTRL
+
+From: KarimAllah Ahmed <karahmed@amazon.de>
+
+commit b2ac58f90540e39324e7a29a7ad471407ae0bf48 upstream.
+
+[ Based on a patch from Paolo Bonzini <pbonzini@redhat.com> ]
+
+... basically doing exactly what we do for VMX:
+
+- Passthrough SPEC_CTRL to guests (if enabled in guest CPUID)
+- Save and restore SPEC_CTRL around VMExit and VMEntry only if the guest
+  actually used it.
+
+Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
+Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Cc: Jun Nakajima <jun.nakajima@intel.com>
+Cc: kvm@vger.kernel.org
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Tim Chen <tim.c.chen@linux.intel.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Asit Mallick <asit.k.mallick@intel.com>
+Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: Paolo Bonzini <pbonzini@redhat.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Ashok Raj <ashok.raj@intel.com>
+Link: https://lkml.kernel.org/r/1517669783-20732-1-git-send-email-karahmed@amazon.de
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm.c |   88 +++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 88 insertions(+)
+
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -183,6 +183,8 @@ struct vcpu_svm {
+               u64 gs_base;
+       } host;
++      u64 spec_ctrl;
++
+       u32 *msrpm;
+       ulong nmi_iret_rip;
+@@ -248,6 +250,7 @@ static const struct svm_direct_access_ms
+       { .index = MSR_CSTAR,                           .always = true  },
+       { .index = MSR_SYSCALL_MASK,                    .always = true  },
+ #endif
++      { .index = MSR_IA32_SPEC_CTRL,                  .always = false },
+       { .index = MSR_IA32_PRED_CMD,                   .always = false },
+       { .index = MSR_IA32_LASTBRANCHFROMIP,           .always = false },
+       { .index = MSR_IA32_LASTBRANCHTOIP,             .always = false },
+@@ -863,6 +866,25 @@ static bool valid_msr_intercept(u32 inde
+       return false;
+ }
++static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
++{
++      u8 bit_write;
++      unsigned long tmp;
++      u32 offset;
++      u32 *msrpm;
++
++      msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
++                                    to_svm(vcpu)->msrpm;
++
++      offset    = svm_msrpm_offset(msr);
++      bit_write = 2 * (msr & 0x0f) + 1;
++      tmp       = msrpm[offset];
++
++      BUG_ON(offset == MSR_INVALID);
++
++      return !!test_bit(bit_write,  &tmp);
++}
++
+ static void set_msr_interception(u32 *msrpm, unsigned msr,
+                                int read, int write)
+ {
+@@ -1537,6 +1559,8 @@ static void svm_vcpu_reset(struct kvm_vc
+       u32 dummy;
+       u32 eax = 1;
++      svm->spec_ctrl = 0;
++
+       if (!init_event) {
+               svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+                                          MSR_IA32_APICBASE_ENABLE;
+@@ -3520,6 +3544,13 @@ static int svm_get_msr(struct kvm_vcpu *
+       case MSR_VM_CR:
+               msr_info->data = svm->nested.vm_cr_msr;
+               break;
++      case MSR_IA32_SPEC_CTRL:
++              if (!msr_info->host_initiated &&
++                  !guest_cpuid_has_ibrs(vcpu))
++                      return 1;
++
++              msr_info->data = svm->spec_ctrl;
++              break;
+       case MSR_IA32_UCODE_REV:
+               msr_info->data = 0x01000065;
+               break;
+@@ -3611,6 +3642,33 @@ static int svm_set_msr(struct kvm_vcpu *
+       case MSR_IA32_TSC:
+               kvm_write_tsc(vcpu, msr);
+               break;
++      case MSR_IA32_SPEC_CTRL:
++              if (!msr->host_initiated &&
++                  !guest_cpuid_has_ibrs(vcpu))
++                      return 1;
++
++              /* The STIBP bit doesn't fault even if it's not advertised */
++              if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
++                      return 1;
++
++              svm->spec_ctrl = data;
++
++              if (!data)
++                      break;
++
++              /*
++               * For non-nested:
++               * When it's written (to non-zero) for the first time, pass
++               * it through.
++               *
++               * For nested:
++               * The handling of the MSR bitmap for L2 guests is done in
++               * nested_svm_vmrun_msrpm.
++               * We update the L1 MSR bit as well since it will end up
++               * touching the MSR anyway now.
++               */
++              set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
++              break;
+       case MSR_IA32_PRED_CMD:
+               if (!msr->host_initiated &&
+                   !guest_cpuid_has_ibpb(vcpu))
+@@ -4854,6 +4912,15 @@ static void svm_vcpu_run(struct kvm_vcpu
+       local_irq_enable();
++      /*
++       * If this vCPU has touched SPEC_CTRL, restore the guest's value if
++       * it's non-zero. Since vmentry is serialising on affected CPUs, there
++       * is no need to worry about the conditional branch over the wrmsr
++       * being speculatively taken.
++       */
++      if (svm->spec_ctrl)
++              wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
++
+       asm volatile (
+               "push %%" _ASM_BP "; \n\t"
+               "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
+@@ -4946,6 +5013,27 @@ static void svm_vcpu_run(struct kvm_vcpu
+ #endif
+               );
++      /*
++       * We do not use IBRS in the kernel. If this vCPU has used the
++       * SPEC_CTRL MSR it may have left it on; save the value and
++       * turn it off. This is much more efficient than blindly adding
++       * it to the atomic save/restore list. Especially as the former
++       * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
++       *
++       * For non-nested case:
++       * If the L01 MSR bitmap does not intercept the MSR, then we need to
++       * save it.
++       *
++       * For nested case:
++       * If the L02 MSR bitmap does not intercept the MSR, then we need to
++       * save it.
++       */
++      if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
++              rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
++
++      if (svm->spec_ctrl)
++              wrmsrl(MSR_IA32_SPEC_CTRL, 0);
++
+       /* Eliminate branch target predictions from guest mode */
+       vmexit_fill_RSB();
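
[Editorial note, not part of the queued patch: a standalone sketch of the SVM MSR-permission-map bit math used by the new msr_write_intercepted() above. Each MSR gets two bits (read, write), so 16 MSRs fit in one 32-bit word and the write bit of MSR number m sits at 2 * (m & 0x0f) + 1 within its word. The word offset (svm_msrpm_offset() in the kernel) is taken as a given, and the helper name below is made up.]

/*
 * Sketch of the 2-bits-per-MSR layout of the SVM MSR permission map.
 * A set write bit means writes to the MSR are intercepted (VMEXIT).
 */
#include <stdint.h>
#include <stdio.h>

static int write_intercepted(const uint32_t *msrpm, uint32_t word_offset, uint32_t msr)
{
	uint32_t bit_write = 2 * (msr & 0x0f) + 1;	/* write bit inside the word */

	return (msrpm[word_offset] >> bit_write) & 1;
}

int main(void)
{
	uint32_t msrpm[1] = { 0 };
	uint32_t msr = 0x48;	/* MSR_IA32_SPEC_CTRL */

	/* Intercept writes to this MSR: set its write bit. */
	msrpm[0] |= 1u << (2 * (msr & 0x0f) + 1);

	printf("write intercepted: %d\n", write_intercepted(msrpm, 0, msr));
	return 0;
}
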
diff --git a/queue-4.9/kvm-vmx-allow-direct-access-to-msr_ia32_spec_ctrl.patch b/queue-4.9/kvm-vmx-allow-direct-access-to-msr_ia32_spec_ctrl.patch
new file mode 100644 (file)
index 0000000..b94e36d
--- /dev/null
@@ -0,0 +1,296 @@
+From d28b387fb74da95d69d2615732f50cceb38e9a4d Mon Sep 17 00:00:00 2001
+From: KarimAllah Ahmed <karahmed@amazon.de>
+Date: Thu, 1 Feb 2018 22:59:45 +0100
+Subject: KVM/VMX: Allow direct access to MSR_IA32_SPEC_CTRL
+
+From: KarimAllah Ahmed <karahmed@amazon.de>
+
+commit d28b387fb74da95d69d2615732f50cceb38e9a4d upstream.
+
+[ Based on a patch from Ashok Raj <ashok.raj@intel.com> ]
+
+Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for
+guests that will only mitigate Spectre V2 through IBRS+IBPB and will not
+be using a retpoline+IBPB based approach.
+
+To avoid the overhead of saving and restoring the MSR_IA32_SPEC_CTRL for
+guests that do not actually use the MSR, only start saving and restoring
+when a non-zero is written to it.
+
+No attempt is made to handle STIBP here, intentionally. Filtering STIBP
+may be added in a future patch, which may require trapping all writes
+if we don't want to pass it through directly to the guest.
+
+[dwmw2: Clean up CPUID bits, save/restore manually, handle reset]
+
+Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
+Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Cc: Jun Nakajima <jun.nakajima@intel.com>
+Cc: kvm@vger.kernel.org
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Tim Chen <tim.c.chen@linux.intel.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Asit Mallick <asit.k.mallick@intel.com>
+Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: Paolo Bonzini <pbonzini@redhat.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Ashok Raj <ashok.raj@intel.com>
+Link: https://lkml.kernel.org/r/1517522386-18410-5-git-send-email-karahmed@amazon.de
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/cpuid.c |    8 ++-
+ arch/x86/kvm/cpuid.h |   11 +++++
+ arch/x86/kvm/vmx.c   |  103 ++++++++++++++++++++++++++++++++++++++++++++++++++-
+ arch/x86/kvm/x86.c   |    2 
+ 4 files changed, 118 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -357,7 +357,7 @@ static inline int __do_cpuid_ent(struct
+       /* cpuid 0x80000008.ebx */
+       const u32 kvm_cpuid_8000_0008_ebx_x86_features =
+-              F(IBPB);
++              F(IBPB) | F(IBRS);
+       /* cpuid 0xC0000001.edx */
+       const u32 kvm_cpuid_C000_0001_edx_x86_features =
+@@ -382,7 +382,7 @@ static inline int __do_cpuid_ent(struct
+       /* cpuid 7.0.edx*/
+       const u32 kvm_cpuid_7_0_edx_x86_features =
+-              F(ARCH_CAPABILITIES);
++              F(SPEC_CTRL) | F(ARCH_CAPABILITIES);
+       /* all calls to cpuid_count() should be made on the same cpu */
+       get_cpu();
+@@ -618,9 +618,11 @@ static inline int __do_cpuid_ent(struct
+                       g_phys_as = phys_as;
+               entry->eax = g_phys_as | (virt_as << 8);
+               entry->edx = 0;
+-              /* IBPB isn't necessarily present in hardware cpuid */
++              /* IBRS and IBPB aren't necessarily present in hardware cpuid */
+               if (boot_cpu_has(X86_FEATURE_IBPB))
+                       entry->ebx |= F(IBPB);
++              if (boot_cpu_has(X86_FEATURE_IBRS))
++                      entry->ebx |= F(IBRS);
+               entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
+               cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
+               break;
+--- a/arch/x86/kvm/cpuid.h
++++ b/arch/x86/kvm/cpuid.h
+@@ -171,6 +171,17 @@ static inline bool guest_cpuid_has_ibpb(
+       return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL));
+ }
++static inline bool guest_cpuid_has_ibrs(struct kvm_vcpu *vcpu)
++{
++      struct kvm_cpuid_entry2 *best;
++
++      best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
++      if (best && (best->ebx & bit(X86_FEATURE_IBRS)))
++              return true;
++      best = kvm_find_cpuid_entry(vcpu, 7, 0);
++      return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL));
++}
++
+ static inline bool guest_cpuid_has_arch_capabilities(struct kvm_vcpu *vcpu)
+ {
+       struct kvm_cpuid_entry2 *best;
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -551,6 +551,7 @@ struct vcpu_vmx {
+ #endif
+       u64                   arch_capabilities;
++      u64                   spec_ctrl;
+       u32 vm_entry_controls_shadow;
+       u32 vm_exit_controls_shadow;
+@@ -1854,6 +1855,29 @@ static void update_exception_bitmap(stru
+ }
+ /*
++ * Check if MSR is intercepted for currently loaded MSR bitmap.
++ */
++static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
++{
++      unsigned long *msr_bitmap;
++      int f = sizeof(unsigned long);
++
++      if (!cpu_has_vmx_msr_bitmap())
++              return true;
++
++      msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
++
++      if (msr <= 0x1fff) {
++              return !!test_bit(msr, msr_bitmap + 0x800 / f);
++      } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
++              msr &= 0x1fff;
++              return !!test_bit(msr, msr_bitmap + 0xc00 / f);
++      }
++
++      return true;
++}
++
++/*
+  * Check if MSR is intercepted for L01 MSR bitmap.
+  */
+ static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
+@@ -2983,6 +3007,13 @@ static int vmx_get_msr(struct kvm_vcpu *
+       case MSR_IA32_TSC:
+               msr_info->data = guest_read_tsc(vcpu);
+               break;
++      case MSR_IA32_SPEC_CTRL:
++              if (!msr_info->host_initiated &&
++                  !guest_cpuid_has_ibrs(vcpu))
++                      return 1;
++
++              msr_info->data = to_vmx(vcpu)->spec_ctrl;
++              break;
+       case MSR_IA32_ARCH_CAPABILITIES:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has_arch_capabilities(vcpu))
+@@ -3093,6 +3124,36 @@ static int vmx_set_msr(struct kvm_vcpu *
+       case MSR_IA32_TSC:
+               kvm_write_tsc(vcpu, msr_info);
+               break;
++      case MSR_IA32_SPEC_CTRL:
++              if (!msr_info->host_initiated &&
++                  !guest_cpuid_has_ibrs(vcpu))
++                      return 1;
++
++              /* The STIBP bit doesn't fault even if it's not advertised */
++              if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
++                      return 1;
++
++              vmx->spec_ctrl = data;
++
++              if (!data)
++                      break;
++
++              /*
++               * For non-nested:
++               * When it's written (to non-zero) for the first time, pass
++               * it through.
++               *
++               * For nested:
++               * The handling of the MSR bitmap for L2 guests is done in
++               * nested_vmx_merge_msr_bitmap. We should not touch the
++               * vmcs02.msr_bitmap here since it gets completely overwritten
++               * in the merging. We update the vmcs01 here for L1 as well
++               * since it will end up touching the MSR anyway now.
++               */
++              vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
++                                            MSR_IA32_SPEC_CTRL,
++                                            MSR_TYPE_RW);
++              break;
+       case MSR_IA32_PRED_CMD:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has_ibpb(vcpu))
+@@ -5245,6 +5306,7 @@ static void vmx_vcpu_reset(struct kvm_vc
+       u64 cr0;
+       vmx->rmode.vm86_active = 0;
++      vmx->spec_ctrl = 0;
+       vmx->soft_vnmi_blocked = 0;
+@@ -8830,6 +8892,15 @@ static void __noclone vmx_vcpu_run(struc
+       vmx_arm_hv_timer(vcpu);
++      /*
++       * If this vCPU has touched SPEC_CTRL, restore the guest's value if
++       * it's non-zero. Since vmentry is serialising on affected CPUs, there
++       * is no need to worry about the conditional branch over the wrmsr
++       * being speculatively taken.
++       */
++      if (vmx->spec_ctrl)
++              wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
++
+       vmx->__launched = vmx->loaded_vmcs->launched;
+       asm(
+               /* Store host registers */
+@@ -8948,6 +9019,27 @@ static void __noclone vmx_vcpu_run(struc
+ #endif
+             );
++      /*
++       * We do not use IBRS in the kernel. If this vCPU has used the
++       * SPEC_CTRL MSR it may have left it on; save the value and
++       * turn it off. This is much more efficient than blindly adding
++       * it to the atomic save/restore list. Especially as the former
++       * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
++       *
++       * For non-nested case:
++       * If the L01 MSR bitmap does not intercept the MSR, then we need to
++       * save it.
++       *
++       * For nested case:
++       * If the L02 MSR bitmap does not intercept the MSR, then we need to
++       * save it.
++       */
++      if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
++              rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
++
++      if (vmx->spec_ctrl)
++              wrmsrl(MSR_IA32_SPEC_CTRL, 0);
++
+       /* Eliminate branch target predictions from guest mode */
+       vmexit_fill_RSB();
+@@ -9507,7 +9599,7 @@ static inline bool nested_vmx_merge_msr_
+       unsigned long *msr_bitmap_l1;
+       unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
+       /*
+-       * pred_cmd is trying to verify two things:
++       * pred_cmd & spec_ctrl are trying to verify two things:
+        *
+        * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
+        *    ensures that we do not accidentally generate an L02 MSR bitmap
+@@ -9520,9 +9612,10 @@ static inline bool nested_vmx_merge_msr_
+        *    the MSR.
+        */
+       bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
++      bool spec_ctrl = msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
+       if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+-          !pred_cmd)
++          !pred_cmd && !spec_ctrl)
+               return false;
+       page = nested_get_page(vcpu, vmcs12->msr_bitmap);
+@@ -9561,6 +9654,12 @@ static inline bool nested_vmx_merge_msr_
+               }
+       }
++      if (spec_ctrl)
++              nested_vmx_disable_intercept_for_msr(
++                                      msr_bitmap_l1, msr_bitmap_l0,
++                                      MSR_IA32_SPEC_CTRL,
++                                      MSR_TYPE_R | MSR_TYPE_W);
++
+       if (pred_cmd)
+               nested_vmx_disable_intercept_for_msr(
+                                       msr_bitmap_l1, msr_bitmap_l0,
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -975,7 +975,7 @@ static u32 msrs_to_save[] = {
+ #endif
+       MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+       MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+-      MSR_IA32_ARCH_CAPABILITIES
++      MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
+ };
+ static unsigned num_msrs_to_save;
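
[Editorial note, not part of the queued patch: a standalone sketch mirroring the VMX MSR-bitmap lookup added above (msr_write_intercepted()). The 4 KiB bitmap holds one bit per MSR; write bits for MSRs 0x0000-0x1fff live at byte offset 0x800 and write bits for MSRs 0xc0000000-0xc0001fff at byte offset 0xc00. A set bit means the write is intercepted. The function name below is made up.]

/*
 * Sketch of the VMX MSR write-bitmap lookup. Mirrors the offsets checked
 * in the patch above; assumes a little-endian bit layout as on x86.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int vmx_write_intercepted(const uint8_t *msr_bitmap, uint32_t msr)
{
	if (msr <= 0x1fff)
		return (msr_bitmap[0x800 + msr / 8] >> (msr % 8)) & 1;
	if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
		msr &= 0x1fff;
		return (msr_bitmap[0xc00 + msr / 8] >> (msr % 8)) & 1;
	}
	return 1;	/* MSRs outside both ranges always exit */
}

int main(void)
{
	uint8_t bitmap[4096];
	uint32_t spec_ctrl = 0x48;	/* MSR_IA32_SPEC_CTRL */

	memset(bitmap, 0xff, sizeof(bitmap));				/* intercept everything */
	bitmap[0x800 + spec_ctrl / 8] &= ~(1u << (spec_ctrl % 8));	/* pass writes through */

	printf("SPEC_CTRL write intercepted: %d\n",
	       vmx_write_intercepted(bitmap, spec_ctrl));
	return 0;
}
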
diff --git a/queue-4.9/kvm-vmx-emulate-msr_ia32_arch_capabilities.patch b/queue-4.9/kvm-vmx-emulate-msr_ia32_arch_capabilities.patch
new file mode 100644 (file)
index 0000000..4b8d14c
--- /dev/null
@@ -0,0 +1,147 @@
+From 28c1c9fabf48d6ad596273a11c46e0d0da3e14cd Mon Sep 17 00:00:00 2001
+From: KarimAllah Ahmed <karahmed@amazon.de>
+Date: Thu, 1 Feb 2018 22:59:44 +0100
+Subject: KVM/VMX: Emulate MSR_IA32_ARCH_CAPABILITIES
+
+From: KarimAllah Ahmed <karahmed@amazon.de>
+
+commit 28c1c9fabf48d6ad596273a11c46e0d0da3e14cd upstream.
+
+Intel processors use MSR_IA32_ARCH_CAPABILITIES MSR to indicate RDCL_NO
+(bit 0) and IBRS_ALL (bit 1). This is a read-only MSR. By default the
+contents will come directly from the hardware, but user-space can still
+override it.
+
+[dwmw2: The bit in kvm_cpuid_7_0_edx_x86_features can be unconditional]
+
+Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Cc: Jun Nakajima <jun.nakajima@intel.com>
+Cc: kvm@vger.kernel.org
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Asit Mallick <asit.k.mallick@intel.com>
+Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Tim Chen <tim.c.chen@linux.intel.com>
+Cc: Ashok Raj <ashok.raj@intel.com>
+Link: https://lkml.kernel.org/r/1517522386-18410-4-git-send-email-karahmed@amazon.de
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/cpuid.c |    8 +++++++-
+ arch/x86/kvm/cpuid.h |    8 ++++++++
+ arch/x86/kvm/vmx.c   |   15 +++++++++++++++
+ arch/x86/kvm/x86.c   |    1 +
+ 4 files changed, 31 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -380,6 +380,10 @@ static inline int __do_cpuid_ent(struct
+       /* cpuid 7.0.ecx*/
+       const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/;
++      /* cpuid 7.0.edx*/
++      const u32 kvm_cpuid_7_0_edx_x86_features =
++              F(ARCH_CAPABILITIES);
++
+       /* all calls to cpuid_count() should be made on the same cpu */
+       get_cpu();
+@@ -462,12 +466,14 @@ static inline int __do_cpuid_ent(struct
+                       /* PKU is not yet implemented for shadow paging. */
+                       if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
+                               entry->ecx &= ~F(PKU);
++                      entry->edx &= kvm_cpuid_7_0_edx_x86_features;
++                      cpuid_mask(&entry->edx, CPUID_7_EDX);
+               } else {
+                       entry->ebx = 0;
+                       entry->ecx = 0;
++                      entry->edx = 0;
+               }
+               entry->eax = 0;
+-              entry->edx = 0;
+               break;
+       }
+       case 9:
+--- a/arch/x86/kvm/cpuid.h
++++ b/arch/x86/kvm/cpuid.h
+@@ -171,6 +171,14 @@ static inline bool guest_cpuid_has_ibpb(
+       return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL));
+ }
++static inline bool guest_cpuid_has_arch_capabilities(struct kvm_vcpu *vcpu)
++{
++      struct kvm_cpuid_entry2 *best;
++
++      best = kvm_find_cpuid_entry(vcpu, 7, 0);
++      return best && (best->edx & bit(X86_FEATURE_ARCH_CAPABILITIES));
++}
++
+ /*
+  * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -550,6 +550,8 @@ struct vcpu_vmx {
+       u64                   msr_guest_kernel_gs_base;
+ #endif
++      u64                   arch_capabilities;
++
+       u32 vm_entry_controls_shadow;
+       u32 vm_exit_controls_shadow;
+       /*
+@@ -2981,6 +2983,12 @@ static int vmx_get_msr(struct kvm_vcpu *
+       case MSR_IA32_TSC:
+               msr_info->data = guest_read_tsc(vcpu);
+               break;
++      case MSR_IA32_ARCH_CAPABILITIES:
++              if (!msr_info->host_initiated &&
++                  !guest_cpuid_has_arch_capabilities(vcpu))
++                      return 1;
++              msr_info->data = to_vmx(vcpu)->arch_capabilities;
++              break;
+       case MSR_IA32_SYSENTER_CS:
+               msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
+               break;
+@@ -3112,6 +3120,11 @@ static int vmx_set_msr(struct kvm_vcpu *
+               vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
+                                             MSR_TYPE_W);
+               break;
++      case MSR_IA32_ARCH_CAPABILITIES:
++              if (!msr_info->host_initiated)
++                      return 1;
++              vmx->arch_capabilities = data;
++              break;
+       case MSR_IA32_CR_PAT:
+               if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+                       if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
+@@ -5202,6 +5215,8 @@ static int vmx_vcpu_setup(struct vcpu_vm
+               ++vmx->nmsrs;
+       }
++      if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
++              rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
+       vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -975,6 +975,7 @@ static u32 msrs_to_save[] = {
+ #endif
+       MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+       MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
++      MSR_IA32_ARCH_CAPABILITIES
+ };
+ static unsigned num_msrs_to_save;
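
[Editorial note, not part of the queued patch: a tiny standalone sketch of the two MSR_IA32_ARCH_CAPABILITIES bits named in the commit message above, RDCL_NO (bit 0) and IBRS_ALL (bit 1). The macro names mirror the kernel's spelling but the example value is made up.]

/*
 * Sketch: decode the RDCL_NO and IBRS_ALL bits of an arch_capabilities
 * value, as reported to the guest by the patch above.
 */
#include <stdint.h>
#include <stdio.h>

#define ARCH_CAP_RDCL_NO	(1ULL << 0)	/* not vulnerable to rogue data cache load */
#define ARCH_CAP_IBRS_ALL	(1ULL << 1)	/* enhanced/always-on IBRS supported */

int main(void)
{
	uint64_t arch_capabilities = ARCH_CAP_RDCL_NO;	/* hypothetical guest-visible value */

	printf("RDCL_NO:  %d\n", !!(arch_capabilities & ARCH_CAP_RDCL_NO));
	printf("IBRS_ALL: %d\n", !!(arch_capabilities & ARCH_CAP_IBRS_ALL));
	return 0;
}
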
diff --git a/queue-4.9/kvm-vmx-introduce-alloc_loaded_vmcs.patch b/queue-4.9/kvm-vmx-introduce-alloc_loaded_vmcs.patch
new file mode 100644 (file)
index 0000000..32d35f8
--- /dev/null
@@ -0,0 +1,101 @@
+From f21f165ef922c2146cc5bdc620f542953c41714b Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Thu, 11 Jan 2018 12:16:15 +0100
+Subject: KVM: VMX: introduce alloc_loaded_vmcs
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit f21f165ef922c2146cc5bdc620f542953c41714b upstream.
+
+Group together the calls to alloc_vmcs and loaded_vmcs_init.  Soon we'll also
+allocate an MSR bitmap there.
+
+Cc: stable@vger.kernel.org       # prereq for Spectre mitigation
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx.c |   38 +++++++++++++++++++++++---------------
+ 1 file changed, 23 insertions(+), 15 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -3524,11 +3524,6 @@ static struct vmcs *alloc_vmcs_cpu(int c
+       return vmcs;
+ }
+-static struct vmcs *alloc_vmcs(void)
+-{
+-      return alloc_vmcs_cpu(raw_smp_processor_id());
+-}
+-
+ static void free_vmcs(struct vmcs *vmcs)
+ {
+       free_pages((unsigned long)vmcs, vmcs_config.order);
+@@ -3547,6 +3542,22 @@ static void free_loaded_vmcs(struct load
+       WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
+ }
++static struct vmcs *alloc_vmcs(void)
++{
++      return alloc_vmcs_cpu(raw_smp_processor_id());
++}
++
++static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
++{
++      loaded_vmcs->vmcs = alloc_vmcs();
++      if (!loaded_vmcs->vmcs)
++              return -ENOMEM;
++
++      loaded_vmcs->shadow_vmcs = NULL;
++      loaded_vmcs_init(loaded_vmcs);
++      return 0;
++}
++
+ static void free_kvm_area(void)
+ {
+       int cpu;
+@@ -6949,6 +6960,7 @@ static int handle_vmon(struct kvm_vcpu *
+       struct vmcs *shadow_vmcs;
+       const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
+               | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
++      int r;
+       /* The Intel VMX Instruction Reference lists a bunch of bits that
+        * are prerequisite to running VMXON, most notably cr4.VMXE must be
+@@ -6988,11 +7000,9 @@ static int handle_vmon(struct kvm_vcpu *
+               return 1;
+       }
+-      vmx->nested.vmcs02.vmcs = alloc_vmcs();
+-      vmx->nested.vmcs02.shadow_vmcs = NULL;
+-      if (!vmx->nested.vmcs02.vmcs)
++      r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
++      if (r < 0)
+               goto out_vmcs02;
+-      loaded_vmcs_init(&vmx->nested.vmcs02);
+       if (cpu_has_vmx_msr_bitmap()) {
+               vmx->nested.msr_bitmap =
+@@ -9113,17 +9123,15 @@ static struct kvm_vcpu *vmx_create_vcpu(
+       if (!vmx->guest_msrs)
+               goto free_pml;
+-      vmx->loaded_vmcs = &vmx->vmcs01;
+-      vmx->loaded_vmcs->vmcs = alloc_vmcs();
+-      vmx->loaded_vmcs->shadow_vmcs = NULL;
+-      if (!vmx->loaded_vmcs->vmcs)
+-              goto free_msrs;
+       if (!vmm_exclusive)
+               kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
+-      loaded_vmcs_init(vmx->loaded_vmcs);
++      err = alloc_loaded_vmcs(&vmx->vmcs01);
+       if (!vmm_exclusive)
+               kvm_cpu_vmxoff();
++      if (err < 0)
++              goto free_msrs;
++      vmx->loaded_vmcs = &vmx->vmcs01;
+       cpu = get_cpu();
+       vmx_vcpu_load(&vmx->vcpu, cpu);
+       vmx->vcpu.cpu = cpu;
diff --git a/queue-4.9/kvm-vmx-make-msr-bitmaps-per-vcpu.patch b/queue-4.9/kvm-vmx-make-msr-bitmaps-per-vcpu.patch
new file mode 100644 (file)
index 0000000..a831850
--- /dev/null
@@ -0,0 +1,582 @@
+From 904e14fb7cb96401a7dc803ca2863fd5ba32ffe6 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Tue, 16 Jan 2018 16:51:18 +0100
+Subject: KVM: VMX: make MSR bitmaps per-VCPU
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 904e14fb7cb96401a7dc803ca2863fd5ba32ffe6 upstream.
+
+Place the MSR bitmap in struct loaded_vmcs, and update it in place
+every time the x2apic or APICv state can change.  This is rare and
+the loop can handle 64 MSRs per iteration, in a similar fashion as
+nested_vmx_prepare_msr_bitmap.
+
+This prepares for choosing, on a per-VM basis, whether to intercept
+the SPEC_CTRL and PRED_CMD MSRs.
+
+Cc: stable@vger.kernel.org       # prereq for Spectre mitigation
+Suggested-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx.c |  316 +++++++++++++++++++----------------------------------
+ 1 file changed, 115 insertions(+), 201 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -109,6 +109,14 @@ static u64 __read_mostly host_xss;
+ static bool __read_mostly enable_pml = 1;
+ module_param_named(pml, enable_pml, bool, S_IRUGO);
++#define MSR_TYPE_R    1
++#define MSR_TYPE_W    2
++#define MSR_TYPE_RW   3
++
++#define MSR_BITMAP_MODE_X2APIC                1
++#define MSR_BITMAP_MODE_X2APIC_APICV  2
++#define MSR_BITMAP_MODE_LM            4
++
+ #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
+ /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
+@@ -190,6 +198,7 @@ struct loaded_vmcs {
+       struct vmcs *shadow_vmcs;
+       int cpu;
+       int launched;
++      unsigned long *msr_bitmap;
+       struct list_head loaded_vmcss_on_cpu_link;
+ };
+@@ -428,8 +437,6 @@ struct nested_vmx {
+       bool pi_pending;
+       u16 posted_intr_nv;
+-      unsigned long *msr_bitmap;
+-
+       struct hrtimer preemption_timer;
+       bool preemption_timer_expired;
+@@ -530,6 +537,7 @@ struct vcpu_vmx {
+       unsigned long         host_rsp;
+       u8                    fail;
+       bool                  nmi_known_unmasked;
++      u8                    msr_bitmap_mode;
+       u32                   exit_intr_info;
+       u32                   idt_vectoring_info;
+       ulong                 rflags;
+@@ -904,6 +912,7 @@ static u32 vmx_segment_access_rights(str
+ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
+ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
+ static int alloc_identity_pagetable(struct kvm *kvm);
++static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
+ static DEFINE_PER_CPU(struct vmcs *, vmxarea);
+ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
+@@ -923,12 +932,6 @@ static DEFINE_PER_CPU(spinlock_t, blocke
+ static unsigned long *vmx_io_bitmap_a;
+ static unsigned long *vmx_io_bitmap_b;
+-static unsigned long *vmx_msr_bitmap_legacy;
+-static unsigned long *vmx_msr_bitmap_longmode;
+-static unsigned long *vmx_msr_bitmap_legacy_x2apic;
+-static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+-static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+-static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
+ static unsigned long *vmx_vmread_bitmap;
+ static unsigned long *vmx_vmwrite_bitmap;
+@@ -2522,36 +2525,6 @@ static void move_msr_up(struct vcpu_vmx
+       vmx->guest_msrs[from] = tmp;
+ }
+-static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
+-{
+-      unsigned long *msr_bitmap;
+-
+-      if (is_guest_mode(vcpu))
+-              msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
+-      else if (cpu_has_secondary_exec_ctrls() &&
+-               (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
+-                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
+-              if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
+-                      if (is_long_mode(vcpu))
+-                              msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
+-                      else
+-                              msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+-              } else {
+-                      if (is_long_mode(vcpu))
+-                              msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
+-                      else
+-                              msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+-              }
+-      } else {
+-              if (is_long_mode(vcpu))
+-                      msr_bitmap = vmx_msr_bitmap_longmode;
+-              else
+-                      msr_bitmap = vmx_msr_bitmap_legacy;
+-      }
+-
+-      vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
+-}
+-
+ /*
+  * Set up the vmcs to automatically save and restore system
+  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
+@@ -2592,7 +2565,7 @@ static void setup_msrs(struct vcpu_vmx *
+       vmx->save_nmsrs = save_nmsrs;
+       if (cpu_has_vmx_msr_bitmap())
+-              vmx_set_msr_bitmap(&vmx->vcpu);
++              vmx_update_msr_bitmap(&vmx->vcpu);
+ }
+ /*
+@@ -3539,6 +3512,8 @@ static void free_loaded_vmcs(struct load
+       loaded_vmcs_clear(loaded_vmcs);
+       free_vmcs(loaded_vmcs->vmcs);
+       loaded_vmcs->vmcs = NULL;
++      if (loaded_vmcs->msr_bitmap)
++              free_page((unsigned long)loaded_vmcs->msr_bitmap);
+       WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
+ }
+@@ -3555,7 +3530,18 @@ static int alloc_loaded_vmcs(struct load
+       loaded_vmcs->shadow_vmcs = NULL;
+       loaded_vmcs_init(loaded_vmcs);
++
++      if (cpu_has_vmx_msr_bitmap()) {
++              loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
++              if (!loaded_vmcs->msr_bitmap)
++                      goto out_vmcs;
++              memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
++      }
+       return 0;
++
++out_vmcs:
++      free_loaded_vmcs(loaded_vmcs);
++      return -ENOMEM;
+ }
+ static void free_kvm_area(void)
+@@ -4564,10 +4550,8 @@ static void free_vpid(int vpid)
+       spin_unlock(&vmx_vpid_lock);
+ }
+-#define MSR_TYPE_R    1
+-#define MSR_TYPE_W    2
+-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+-                                              u32 msr, int type)
++static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
++                                                        u32 msr, int type)
+ {
+       int f = sizeof(unsigned long);
+@@ -4601,8 +4585,8 @@ static void __vmx_disable_intercept_for_
+       }
+ }
+-static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+-                                              u32 msr, int type)
++static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
++                                                       u32 msr, int type)
+ {
+       int f = sizeof(unsigned long);
+@@ -4636,6 +4620,15 @@ static void __vmx_enable_intercept_for_m
+       }
+ }
++static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
++                                                    u32 msr, int type, bool value)
++{
++      if (value)
++              vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
++      else
++              vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
++}
++
+ /*
+  * If a msr is allowed by L0, we should check whether it is allowed by L1.
+  * The corresponding bit will be cleared unless both of L0 and L1 allow it.
+@@ -4682,58 +4675,68 @@ static void nested_vmx_disable_intercept
+       }
+ }
+-static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
++static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
+ {
+-      if (!longmode_only)
+-              __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
+-                                              msr, MSR_TYPE_R | MSR_TYPE_W);
+-      __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
+-                                              msr, MSR_TYPE_R | MSR_TYPE_W);
+-}
+-
+-static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
+-{
+-      if (apicv_active) {
+-              __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+-                              msr, MSR_TYPE_R);
+-              __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+-                              msr, MSR_TYPE_R);
+-      } else {
+-              __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+-                              msr, MSR_TYPE_R);
+-              __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+-                              msr, MSR_TYPE_R);
++      u8 mode = 0;
++
++      if (cpu_has_secondary_exec_ctrls() &&
++          (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
++           SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
++              mode |= MSR_BITMAP_MODE_X2APIC;
++              if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
++                      mode |= MSR_BITMAP_MODE_X2APIC_APICV;
+       }
++
++      if (is_long_mode(vcpu))
++              mode |= MSR_BITMAP_MODE_LM;
++
++      return mode;
+ }
+-static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
++#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
++
++static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
++                                       u8 mode)
+ {
+-      if (apicv_active) {
+-              __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+-                              msr, MSR_TYPE_R);
+-              __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+-                              msr, MSR_TYPE_R);
+-      } else {
+-              __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+-                              msr, MSR_TYPE_R);
+-              __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+-                              msr, MSR_TYPE_R);
++      int msr;
++
++      for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
++              unsigned word = msr / BITS_PER_LONG;
++              msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
++              msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
++      }
++
++      if (mode & MSR_BITMAP_MODE_X2APIC) {
++              /*
++               * TPR reads and writes can be virtualized even if virtual interrupt
++               * delivery is not in use.
++               */
++              vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
++              if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
++                      vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
++                      vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
++                      vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
++              }
+       }
+ }
+-static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active)
++static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
+ {
+-      if (apicv_active) {
+-              __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+-                              msr, MSR_TYPE_W);
+-              __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+-                              msr, MSR_TYPE_W);
+-      } else {
+-              __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+-                              msr, MSR_TYPE_W);
+-              __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+-                              msr, MSR_TYPE_W);
+-      }
++      struct vcpu_vmx *vmx = to_vmx(vcpu);
++      unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
++      u8 mode = vmx_msr_bitmap_mode(vcpu);
++      u8 changed = mode ^ vmx->msr_bitmap_mode;
++
++      if (!changed)
++              return;
++
++      vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
++                                !(mode & MSR_BITMAP_MODE_LM));
++
++      if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
++              vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
++
++      vmx->msr_bitmap_mode = mode;
+ }
+ static bool vmx_get_enable_apicv(void)
+@@ -4982,7 +4985,7 @@ static void vmx_refresh_apicv_exec_ctrl(
+       }
+       if (cpu_has_vmx_msr_bitmap())
+-              vmx_set_msr_bitmap(vcpu);
++              vmx_update_msr_bitmap(vcpu);
+ }
+ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
+@@ -5071,7 +5074,7 @@ static int vmx_vcpu_setup(struct vcpu_vm
+               vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
+       }
+       if (cpu_has_vmx_msr_bitmap())
+-              vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
++              vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
+       vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
+@@ -6402,7 +6405,7 @@ static void wakeup_handler(void)
+ static __init int hardware_setup(void)
+ {
+-      int r = -ENOMEM, i, msr;
++      int r = -ENOMEM, i;
+       rdmsrl_safe(MSR_EFER, &host_efer);
+@@ -6417,41 +6420,13 @@ static __init int hardware_setup(void)
+       if (!vmx_io_bitmap_b)
+               goto out;
+-      vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
+-      if (!vmx_msr_bitmap_legacy)
+-              goto out1;
+-
+-      vmx_msr_bitmap_legacy_x2apic =
+-                              (unsigned long *)__get_free_page(GFP_KERNEL);
+-      if (!vmx_msr_bitmap_legacy_x2apic)
+-              goto out2;
+-
+-      vmx_msr_bitmap_legacy_x2apic_apicv_inactive =
+-                              (unsigned long *)__get_free_page(GFP_KERNEL);
+-      if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive)
+-              goto out3;
+-
+-      vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
+-      if (!vmx_msr_bitmap_longmode)
+-              goto out4;
+-
+-      vmx_msr_bitmap_longmode_x2apic =
+-                              (unsigned long *)__get_free_page(GFP_KERNEL);
+-      if (!vmx_msr_bitmap_longmode_x2apic)
+-              goto out5;
+-
+-      vmx_msr_bitmap_longmode_x2apic_apicv_inactive =
+-                              (unsigned long *)__get_free_page(GFP_KERNEL);
+-      if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive)
+-              goto out6;
+-
+       vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_vmread_bitmap)
+-              goto out7;
++              goto out1;
+       vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_vmwrite_bitmap)
+-              goto out8;
++              goto out2;
+       memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
+       memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
+@@ -6460,12 +6435,9 @@ static __init int hardware_setup(void)
+       memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
+-      memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
+-      memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
+-
+       if (setup_vmcs_config(&vmcs_config) < 0) {
+               r = -EIO;
+-              goto out9;
++              goto out3;
+       }
+       if (boot_cpu_has(X86_FEATURE_NX))
+@@ -6522,47 +6494,8 @@ static __init int hardware_setup(void)
+               kvm_tsc_scaling_ratio_frac_bits = 48;
+       }
+-      vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
+-      vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
+-      vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
+-      vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
+-      vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
+-      vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+-
+-      memcpy(vmx_msr_bitmap_legacy_x2apic,
+-                      vmx_msr_bitmap_legacy, PAGE_SIZE);
+-      memcpy(vmx_msr_bitmap_longmode_x2apic,
+-                      vmx_msr_bitmap_longmode, PAGE_SIZE);
+-      memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+-                      vmx_msr_bitmap_legacy, PAGE_SIZE);
+-      memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+-                      vmx_msr_bitmap_longmode, PAGE_SIZE);
+-
+       set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
+-      /*
+-       * enable_apicv && kvm_vcpu_apicv_active()
+-       */
+-      for (msr = 0x800; msr <= 0x8ff; msr++)
+-              vmx_disable_intercept_msr_read_x2apic(msr, true);
+-
+-      /* TMCCT */
+-      vmx_enable_intercept_msr_read_x2apic(0x839, true);
+-      /* TPR */
+-      vmx_disable_intercept_msr_write_x2apic(0x808, true);
+-      /* EOI */
+-      vmx_disable_intercept_msr_write_x2apic(0x80b, true);
+-      /* SELF-IPI */
+-      vmx_disable_intercept_msr_write_x2apic(0x83f, true);
+-
+-      /*
+-       * (enable_apicv && !kvm_vcpu_apicv_active()) ||
+-       *      !enable_apicv
+-       */
+-      /* TPR */
+-      vmx_disable_intercept_msr_read_x2apic(0x808, false);
+-      vmx_disable_intercept_msr_write_x2apic(0x808, false);
+-
+       if (enable_ept) {
+               kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
+                       (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
+@@ -6608,22 +6541,10 @@ static __init int hardware_setup(void)
+       return alloc_kvm_area();
+-out9:
+-      free_page((unsigned long)vmx_vmwrite_bitmap);
+-out8:
+-      free_page((unsigned long)vmx_vmread_bitmap);
+-out7:
+-      free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
+-out6:
+-      free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+-out5:
+-      free_page((unsigned long)vmx_msr_bitmap_longmode);
+-out4:
+-      free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
+ out3:
+-      free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
++      free_page((unsigned long)vmx_vmwrite_bitmap);
+ out2:
+-      free_page((unsigned long)vmx_msr_bitmap_legacy);
++      free_page((unsigned long)vmx_vmread_bitmap);
+ out1:
+       free_page((unsigned long)vmx_io_bitmap_b);
+ out:
+@@ -6634,12 +6555,6 @@ out:
+ static __exit void hardware_unsetup(void)
+ {
+-      free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+-      free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
+-      free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+-      free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
+-      free_page((unsigned long)vmx_msr_bitmap_legacy);
+-      free_page((unsigned long)vmx_msr_bitmap_longmode);
+       free_page((unsigned long)vmx_io_bitmap_b);
+       free_page((unsigned long)vmx_io_bitmap_a);
+       free_page((unsigned long)vmx_vmwrite_bitmap);
+@@ -7004,13 +6919,6 @@ static int handle_vmon(struct kvm_vcpu *
+       if (r < 0)
+               goto out_vmcs02;
+-      if (cpu_has_vmx_msr_bitmap()) {
+-              vmx->nested.msr_bitmap =
+-                              (unsigned long *)__get_free_page(GFP_KERNEL);
+-              if (!vmx->nested.msr_bitmap)
+-                      goto out_msr_bitmap;
+-      }
+-
+       vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+       if (!vmx->nested.cached_vmcs12)
+               goto out_cached_vmcs12;
+@@ -7040,9 +6948,6 @@ out_shadow_vmcs:
+       kfree(vmx->nested.cached_vmcs12);
+ out_cached_vmcs12:
+-      free_page((unsigned long)vmx->nested.msr_bitmap);
+-
+-out_msr_bitmap:
+       free_loaded_vmcs(&vmx->nested.vmcs02);
+ out_vmcs02:
+@@ -7121,10 +7026,6 @@ static void free_nested(struct vcpu_vmx
+       vmx->nested.vmxon = false;
+       free_vpid(vmx->nested.vpid02);
+       nested_release_vmcs12(vmx);
+-      if (vmx->nested.msr_bitmap) {
+-              free_page((unsigned long)vmx->nested.msr_bitmap);
+-              vmx->nested.msr_bitmap = NULL;
+-      }
+       if (enable_shadow_vmcs) {
+               vmcs_clear(vmx->vmcs01.shadow_vmcs);
+               free_vmcs(vmx->vmcs01.shadow_vmcs);
+@@ -8471,7 +8372,7 @@ static void vmx_set_virtual_x2apic_mode(
+       }
+       vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
+-      vmx_set_msr_bitmap(vcpu);
++      vmx_update_msr_bitmap(vcpu);
+ }
+ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
+@@ -9091,6 +8992,7 @@ static struct kvm_vcpu *vmx_create_vcpu(
+ {
+       int err;
+       struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
++      unsigned long *msr_bitmap;
+       int cpu;
+       if (!vmx)
+@@ -9131,6 +9033,15 @@ static struct kvm_vcpu *vmx_create_vcpu(
+       if (err < 0)
+               goto free_msrs;
++      msr_bitmap = vmx->vmcs01.msr_bitmap;
++      vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
++      vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
++      vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
++      vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
++      vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
++      vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
++      vmx->msr_bitmap_mode = 0;
++
+       vmx->loaded_vmcs = &vmx->vmcs01;
+       cpu = get_cpu();
+       vmx_vcpu_load(&vmx->vcpu, cpu);
+@@ -9525,7 +9436,7 @@ static inline bool nested_vmx_merge_msr_
+       int msr;
+       struct page *page;
+       unsigned long *msr_bitmap_l1;
+-      unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
++      unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
+       /* This shortcut is ok because we support only x2APIC MSRs so far. */
+       if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
+@@ -10045,6 +9956,9 @@ static void prepare_vmcs02(struct kvm_vc
+       if (kvm_has_tsc_control)
+               decache_tsc_multiplier(vmx);
++      if (cpu_has_vmx_msr_bitmap())
++              vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
++
+       if (enable_vpid) {
+               /*
+                * There is no direct mapping between vpid02 and vpid12, the
+@@ -10749,7 +10663,7 @@ static void load_vmcs12_host_state(struc
+       vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+       if (cpu_has_vmx_msr_bitmap())
+-              vmx_set_msr_bitmap(vcpu);
++              vmx_update_msr_bitmap(vcpu);
+       if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
+                               vmcs12->vm_exit_msr_load_count))
diff --git a/queue-4.9/kvm-x86-add-ibpb-support.patch b/queue-4.9/kvm-x86-add-ibpb-support.patch
new file mode 100644 (file)
index 0000000..c893472
--- /dev/null
@@ -0,0 +1,343 @@
+From 15d45071523d89b3fb7372e2135fbd72f6af9506 Mon Sep 17 00:00:00 2001
+From: Ashok Raj <ashok.raj@intel.com>
+Date: Thu, 1 Feb 2018 22:59:43 +0100
+Subject: KVM/x86: Add IBPB support
+
+From: Ashok Raj <ashok.raj@intel.com>
+
+commit 15d45071523d89b3fb7372e2135fbd72f6af9506 upstream.
+
+The Indirect Branch Predictor Barrier (IBPB) is an indirect branch
+control mechanism. It keeps earlier branches from influencing
+later ones.
+
+Unlike IBRS and STIBP, IBPB does not define a new mode of operation.
+It's a command that ensures predicted branch targets aren't used after
+the barrier. Although IBRS and IBPB are enumerated together in CPUID,
+IBPB is very different.
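
As a rough illustration (not part of this patch), issuing the barrier boils
down to a single write of the IBPB command bit to the write-only
IA32_PRED_CMD MSR; a minimal kernel-side sketch, assuming the <asm/msr.h>
helpers and the MSR constants added by the Spectre series, and using the
illustrative name issue_ibpb():

    #include <asm/msr.h>            /* wrmsrl() */
    #include <asm/msr-index.h>      /* MSR_IA32_PRED_CMD, PRED_CMD_IBPB */

    /* Issue an Indirect Branch Prediction Barrier: the write itself is the
     * command; the MSR is write-only and there is no state to read back. */
    static inline void issue_ibpb(void)
    {
            wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
    }

In the hunks below, host-initiated barriers go through the kernel's
indirect_branch_prediction_barrier() helper, while guest-initiated ones
arrive via the PRED_CMD write handlers in svm_set_msr()/vmx_set_msr().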
+
+IBPB helps mitigate three potential attacks:
+
+* Protect guests from being attacked by other guests.
+  - This is addressed by issuing IBPB when we do a guest switch.
+
+* Mitigate attacks from guest/ring3->host/ring3.
+  These would require an IBPB during context switch in the host, or after
+  VMEXIT. The host process has two ways to mitigate:
+  - Either it can be compiled with retpoline.
+  - If it is going through a context switch and has set !dumpable, then
+    there is an IBPB in that path.
+    (Tim's patch: https://patchwork.kernel.org/patch/10192871)
+  - The case where you return to Qemu after a VMEXIT might make Qemu
+    attackable from the guest when Qemu isn't compiled with retpoline.
+  There have been reports that doing IBPB on every VMEXIT resulted in
+  some TSC calibration woes in the guest.
+
+* Mitigate guest/ring0->host/ring0 attacks.
+  When the host kernel is using retpoline it is safe against these attacks.
+  If the host kernel isn't using retpoline we might need to do an IBPB flush
+  on every VMEXIT.
+
+Even when using retpoline for indirect calls, in certain conditions 'ret'
+can use the BTB on Skylake-era CPUs. There are other mitigations
+available like RSB stuffing/clearing.
+
+* IBPB is issued only for SVM during svm_free_vcpu().
+  VMX has a vmclear and SVM doesn't. Follow the discussion here:
+  https://lkml.org/lkml/2018/1/15/146
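
To make the KVM-side hunks below easier to follow, here is a condensed
sketch of the lazy pass-through pattern that both the VMX and SVM write
handlers implement. It is illustrative only: handle_pred_cmd_write() is
not a function in this patch, error paths are trimmed, and the helpers it
calls (guest_cpuid_has_ibpb(), vmx_disable_intercept_for_msr()) are the
ones introduced by this series within arch/x86/kvm/vmx.c.

    /* Condensed sketch of the PRED_CMD write handling (VMX flavour). */
    static int handle_pred_cmd_write(struct kvm_vcpu *vcpu, u64 data,
                                     bool host_initiated)
    {
            /* Only expose the MSR if the guest's CPUID advertises IBPB. */
            if (!host_initiated && !guest_cpuid_has_ibpb(vcpu))
                    return 1;               /* caller injects #GP */

            /* Bit 0 (IBPB) is the only defined command bit. */
            if (data & ~PRED_CMD_IBPB)
                    return 1;

            if (!data)
                    return 0;               /* writing 0 is a no-op */

            wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);

            /*
             * First real use: stop intercepting further writes so the guest
             * can issue IBPB directly from then on (SVM does the same via
             * set_msr_interception() on its msrpm).
             */
            vmx_disable_intercept_for_msr(to_vmx(vcpu)->vmcs01.msr_bitmap,
                                          MSR_IA32_PRED_CMD, MSR_TYPE_W);
            return 0;
    }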
+
+Please refer to the following spec for more details on the enumeration
+and control.
+
+Refer to the following for documentation about the mitigations:
+
+https://software.intel.com/en-us/side-channel-security-support
+
+[peterz: rebase and changelog rewrite]
+[karahmed: - rebase
+           - vmx: expose PRED_CMD if guest has it in CPUID
+           - svm: only pass through IBPB if guest has it in CPUID
+           - vmx: support !cpu_has_vmx_msr_bitmap()
+           - vmx: support nested]
+[dwmw2: Expose CPUID bit too (AMD IBPB only for now as we lack IBRS)
+        PRED_CMD is a write-only MSR]
+
+Signed-off-by: Ashok Raj <ashok.raj@intel.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Cc: kvm@vger.kernel.org
+Cc: Asit Mallick <asit.k.mallick@intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: Jun Nakajima <jun.nakajima@intel.com>
+Cc: Paolo Bonzini <pbonzini@redhat.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Tim Chen <tim.c.chen@linux.intel.com>
+Link: http://lkml.kernel.org/r/1515720739-43819-6-git-send-email-ashok.raj@intel.com
+Link: https://lkml.kernel.org/r/1517522386-18410-3-git-send-email-karahmed@amazon.de
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/cpuid.c |   11 ++++++-
+ arch/x86/kvm/cpuid.h |   12 +++++++
+ arch/x86/kvm/svm.c   |   28 ++++++++++++++++++
+ arch/x86/kvm/vmx.c   |   79 +++++++++++++++++++++++++++++++++++++++++++++++++--
+ 4 files changed, 127 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -355,6 +355,10 @@ static inline int __do_cpuid_ent(struct
+               F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
+               0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
++      /* cpuid 0x80000008.ebx */
++      const u32 kvm_cpuid_8000_0008_ebx_x86_features =
++              F(IBPB);
++
+       /* cpuid 0xC0000001.edx */
+       const u32 kvm_cpuid_C000_0001_edx_x86_features =
+               F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
+@@ -607,7 +611,12 @@ static inline int __do_cpuid_ent(struct
+               if (!g_phys_as)
+                       g_phys_as = phys_as;
+               entry->eax = g_phys_as | (virt_as << 8);
+-              entry->ebx = entry->edx = 0;
++              entry->edx = 0;
++              /* IBPB isn't necessarily present in hardware cpuid */
++              if (boot_cpu_has(X86_FEATURE_IBPB))
++                      entry->ebx |= F(IBPB);
++              entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
++              cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
+               break;
+       }
+       case 0x80000019:
+--- a/arch/x86/kvm/cpuid.h
++++ b/arch/x86/kvm/cpuid.h
+@@ -160,6 +160,18 @@ static inline bool guest_cpuid_has_rdtsc
+       return best && (best->edx & bit(X86_FEATURE_RDTSCP));
+ }
++static inline bool guest_cpuid_has_ibpb(struct kvm_vcpu *vcpu)
++{
++      struct kvm_cpuid_entry2 *best;
++
++      best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
++      if (best && (best->ebx & bit(X86_FEATURE_IBPB)))
++              return true;
++      best = kvm_find_cpuid_entry(vcpu, 7, 0);
++      return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL));
++}
++
++
+ /*
+  * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3
+  */
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -248,6 +248,7 @@ static const struct svm_direct_access_ms
+       { .index = MSR_CSTAR,                           .always = true  },
+       { .index = MSR_SYSCALL_MASK,                    .always = true  },
+ #endif
++      { .index = MSR_IA32_PRED_CMD,                   .always = false },
+       { .index = MSR_IA32_LASTBRANCHFROMIP,           .always = false },
+       { .index = MSR_IA32_LASTBRANCHTOIP,             .always = false },
+       { .index = MSR_IA32_LASTINTFROMIP,              .always = false },
+@@ -510,6 +511,7 @@ struct svm_cpu_data {
+       struct kvm_ldttss_desc *tss_desc;
+       struct page *save_area;
++      struct vmcb *current_vmcb;
+ };
+ static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
+@@ -1644,11 +1646,17 @@ static void svm_free_vcpu(struct kvm_vcp
+       __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
+       kvm_vcpu_uninit(vcpu);
+       kmem_cache_free(kvm_vcpu_cache, svm);
++      /*
++       * The vmcb page can be recycled, causing a false negative in
++       * svm_vcpu_load(). So do a full IBPB now.
++       */
++      indirect_branch_prediction_barrier();
+ }
+ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
++      struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+       int i;
+       if (unlikely(cpu != vcpu->cpu)) {
+@@ -1677,6 +1685,10 @@ static void svm_vcpu_load(struct kvm_vcp
+       if (static_cpu_has(X86_FEATURE_RDTSCP))
+               wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
++      if (sd->current_vmcb != svm->vmcb) {
++              sd->current_vmcb = svm->vmcb;
++              indirect_branch_prediction_barrier();
++      }
+       avic_vcpu_load(vcpu, cpu);
+ }
+@@ -3599,6 +3611,22 @@ static int svm_set_msr(struct kvm_vcpu *
+       case MSR_IA32_TSC:
+               kvm_write_tsc(vcpu, msr);
+               break;
++      case MSR_IA32_PRED_CMD:
++              if (!msr->host_initiated &&
++                  !guest_cpuid_has_ibpb(vcpu))
++                      return 1;
++
++              if (data & ~PRED_CMD_IBPB)
++                      return 1;
++
++              if (!data)
++                      break;
++
++              wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
++              if (is_guest_mode(vcpu))
++                      break;
++              set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
++              break;
+       case MSR_STAR:
+               svm->vmcb->save.star = data;
+               break;
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -549,6 +549,7 @@ struct vcpu_vmx {
+       u64                   msr_host_kernel_gs_base;
+       u64                   msr_guest_kernel_gs_base;
+ #endif
++
+       u32 vm_entry_controls_shadow;
+       u32 vm_exit_controls_shadow;
+       /*
+@@ -913,6 +914,8 @@ static void copy_vmcs12_to_shadow(struct
+ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
+ static int alloc_identity_pagetable(struct kvm *kvm);
+ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
++static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
++                                                        u32 msr, int type);
+ static DEFINE_PER_CPU(struct vmcs *, vmxarea);
+ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
+@@ -1848,6 +1851,29 @@ static void update_exception_bitmap(stru
+       vmcs_write32(EXCEPTION_BITMAP, eb);
+ }
++/*
++ * Check if MSR is intercepted for L01 MSR bitmap.
++ */
++static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
++{
++      unsigned long *msr_bitmap;
++      int f = sizeof(unsigned long);
++
++      if (!cpu_has_vmx_msr_bitmap())
++              return true;
++
++      msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
++
++      if (msr <= 0x1fff) {
++              return !!test_bit(msr, msr_bitmap + 0x800 / f);
++      } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
++              msr &= 0x1fff;
++              return !!test_bit(msr, msr_bitmap + 0xc00 / f);
++      }
++
++      return true;
++}
++
+ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+               unsigned long entry, unsigned long exit)
+ {
+@@ -2257,6 +2283,7 @@ static void vmx_vcpu_load(struct kvm_vcp
+       if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+               per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+               vmcs_load(vmx->loaded_vmcs->vmcs);
++              indirect_branch_prediction_barrier();
+       }
+       if (!already_loaded) {
+@@ -3058,6 +3085,33 @@ static int vmx_set_msr(struct kvm_vcpu *
+       case MSR_IA32_TSC:
+               kvm_write_tsc(vcpu, msr_info);
+               break;
++      case MSR_IA32_PRED_CMD:
++              if (!msr_info->host_initiated &&
++                  !guest_cpuid_has_ibpb(vcpu))
++                      return 1;
++
++              if (data & ~PRED_CMD_IBPB)
++                      return 1;
++
++              if (!data)
++                      break;
++
++              wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
++
++              /*
++               * For non-nested:
++               * When it's written (to non-zero) for the first time, pass
++               * it through.
++               *
++               * For nested:
++               * The handling of the MSR bitmap for L2 guests is done in
++               * nested_vmx_merge_msr_bitmap. We should not touch the
++               * vmcs02.msr_bitmap here since it gets completely overwritten
++               * in the merging.
++               */
++              vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
++                                            MSR_TYPE_W);
++              break;
+       case MSR_IA32_CR_PAT:
+               if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+                       if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
+@@ -9437,9 +9491,23 @@ static inline bool nested_vmx_merge_msr_
+       struct page *page;
+       unsigned long *msr_bitmap_l1;
+       unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
++      /*
++       * pred_cmd is trying to verify two things:
++       *
++       * 1. L0 gave L1 permission to actually pass the MSR through. This
++       *    ensures that we do not accidentally generate an L02 MSR bitmap
++       *    from the L12 MSR bitmap that is too permissive.
++       * 2. That L1 or L2s have actually used the MSR. This avoids
++       *    unnecessary merging of the bitmap if the MSR is unused. This
++       *    works properly because we only update the L01 MSR bitmap lazily.
++       *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
++       *    updated to reflect this when L1 (or its L2s) actually write to
++       *    the MSR.
++       */
++      bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
+-      /* This shortcut is ok because we support only x2APIC MSRs so far. */
+-      if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
++      if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
++          !pred_cmd)
+               return false;
+       page = nested_get_page(vcpu, vmcs12->msr_bitmap);
+@@ -9477,6 +9545,13 @@ static inline bool nested_vmx_merge_msr_
+                               MSR_TYPE_W);
+               }
+       }
++
++      if (pred_cmd)
++              nested_vmx_disable_intercept_for_msr(
++                                      msr_bitmap_l1, msr_bitmap_l0,
++                                      MSR_IA32_PRED_CMD,
++                                      MSR_TYPE_W);
++
+       kunmap(page);
+       nested_release_page_clean(page);
index f6d225945cb37b31f996bca67e64addcc415f1e5..af23d56d520ea131b0a851d14f4a9ae9682b92e9 100644 (file)
@@ -16,3 +16,12 @@ kaiser-fix-intel_bts-perf-crashes.patch
 x86-pti-make-unpoison-of-pgd-for-trusted-boot-work-for-real.patch
 kaiser-allocate-pgd-with-order-0-when-pti-off.patch
 serial-core-mark-port-as-initialized-after-successful-irq-change.patch
+kvm-nvmx-vmx_complete_nested_posted_interrupt-can-t-fail.patch
+kvm-nvmx-mark-vmcs12-pages-dirty-on-l2-exit.patch
+kvm-nvmx-eliminate-vmcs02-pool.patch
+kvm-vmx-introduce-alloc_loaded_vmcs.patch
+kvm-vmx-make-msr-bitmaps-per-vcpu.patch
+kvm-x86-add-ibpb-support.patch
+kvm-vmx-emulate-msr_ia32_arch_capabilities.patch
+kvm-vmx-allow-direct-access-to-msr_ia32_spec_ctrl.patch
+kvm-svm-allow-direct-access-to-msr_ia32_spec_ctrl.patch