From: Greg Kroah-Hartman Date: Tue, 6 Feb 2018 20:40:33 +0000 (-0800) Subject: 4.9-stable patches X-Git-Tag: v3.18.94~5 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=2d6d1a82eaf5c2981bf90128425c667a0ff280ad;p=thirdparty%2Fkernel%2Fstable-queue.git 4.9-stable patches added patches: kvm-nvmx-eliminate-vmcs02-pool.patch kvm-nvmx-mark-vmcs12-pages-dirty-on-l2-exit.patch kvm-nvmx-vmx_complete_nested_posted_interrupt-can-t-fail.patch kvm-svm-allow-direct-access-to-msr_ia32_spec_ctrl.patch kvm-vmx-allow-direct-access-to-msr_ia32_spec_ctrl.patch kvm-vmx-emulate-msr_ia32_arch_capabilities.patch kvm-vmx-introduce-alloc_loaded_vmcs.patch kvm-vmx-make-msr-bitmaps-per-vcpu.patch kvm-x86-add-ibpb-support.patch --- diff --git a/queue-4.9/kvm-nvmx-eliminate-vmcs02-pool.patch b/queue-4.9/kvm-nvmx-eliminate-vmcs02-pool.patch new file mode 100644 index 00000000000..8a99c2ae01c --- /dev/null +++ b/queue-4.9/kvm-nvmx-eliminate-vmcs02-pool.patch @@ -0,0 +1,292 @@ +From de3a0021a60635de96aa92713c1a31a96747d72c Mon Sep 17 00:00:00 2001 +From: Jim Mattson +Date: Mon, 27 Nov 2017 17:22:25 -0600 +Subject: KVM: nVMX: Eliminate vmcs02 pool +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Jim Mattson + +commit de3a0021a60635de96aa92713c1a31a96747d72c upstream. + +The potential performance advantages of a vmcs02 pool have never been +realized. To simplify the code, eliminate the pool. Instead, a single +vmcs02 is allocated per VCPU when the VCPU enters VMX operation. + +Cc: stable@vger.kernel.org # prereq for Spectre mitigation +Signed-off-by: Jim Mattson +Signed-off-by: Mark Kanda +Reviewed-by: Ameya More +Reviewed-by: David Hildenbrand +Reviewed-by: Paolo Bonzini +Signed-off-by: Radim Krčmář +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx.c | 146 ++++++++--------------------------------------------- + 1 file changed, 23 insertions(+), 123 deletions(-) + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -173,7 +173,6 @@ module_param(ple_window_max, int, S_IRUG + extern const ulong vmx_return; + + #define NR_AUTOLOAD_MSRS 8 +-#define VMCS02_POOL_SIZE 1 + + struct vmcs { + u32 revision_id; +@@ -207,7 +206,7 @@ struct shared_msr_entry { + * stored in guest memory specified by VMPTRLD, but is opaque to the guest, + * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. + * More than one of these structures may exist, if L1 runs multiple L2 guests. +- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the ++ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the + * underlying hardware which will be used to run L2. + * This structure is packed to ensure that its layout is identical across + * machines (necessary for live migration). +@@ -386,13 +385,6 @@ struct __packed vmcs12 { + */ + #define VMCS12_SIZE 0x1000 + +-/* Used to remember the last vmcs02 used for some recently used vmcs12s */ +-struct vmcs02_list { +- struct list_head list; +- gpa_t vmptr; +- struct loaded_vmcs vmcs02; +-}; +- + /* + * The nested_vmx structure is part of vcpu_vmx, and holds information we need + * for correct emulation of VMX (i.e., nested VMX) on this vcpu. +@@ -419,15 +411,15 @@ struct nested_vmx { + */ + bool sync_shadow_vmcs; + +- /* vmcs02_list cache of VMCSs recently used to run L2 guests */ +- struct list_head vmcs02_pool; +- int vmcs02_num; + bool change_vmcs01_virtual_x2apic_mode; + /* L2 must run next, and mustn't decide to exit to L1. 
*/ + bool nested_run_pending; ++ ++ struct loaded_vmcs vmcs02; ++ + /* +- * Guest pages referred to in vmcs02 with host-physical pointers, so +- * we must keep them pinned while L2 runs. ++ * Guest pages referred to in the vmcs02 with host-physical ++ * pointers, so we must keep them pinned while L2 runs. + */ + struct page *apic_access_page; + struct page *virtual_apic_page; +@@ -6684,94 +6676,6 @@ static int handle_monitor(struct kvm_vcp + } + + /* +- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. +- * We could reuse a single VMCS for all the L2 guests, but we also want the +- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this +- * allows keeping them loaded on the processor, and in the future will allow +- * optimizations where prepare_vmcs02 doesn't need to set all the fields on +- * every entry if they never change. +- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE +- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first. +- * +- * The following functions allocate and free a vmcs02 in this pool. +- */ +- +-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */ +-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) +-{ +- struct vmcs02_list *item; +- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) +- if (item->vmptr == vmx->nested.current_vmptr) { +- list_move(&item->list, &vmx->nested.vmcs02_pool); +- return &item->vmcs02; +- } +- +- if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { +- /* Recycle the least recently used VMCS. */ +- item = list_last_entry(&vmx->nested.vmcs02_pool, +- struct vmcs02_list, list); +- item->vmptr = vmx->nested.current_vmptr; +- list_move(&item->list, &vmx->nested.vmcs02_pool); +- return &item->vmcs02; +- } +- +- /* Create a new VMCS */ +- item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); +- if (!item) +- return NULL; +- item->vmcs02.vmcs = alloc_vmcs(); +- item->vmcs02.shadow_vmcs = NULL; +- if (!item->vmcs02.vmcs) { +- kfree(item); +- return NULL; +- } +- loaded_vmcs_init(&item->vmcs02); +- item->vmptr = vmx->nested.current_vmptr; +- list_add(&(item->list), &(vmx->nested.vmcs02_pool)); +- vmx->nested.vmcs02_num++; +- return &item->vmcs02; +-} +- +-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */ +-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) +-{ +- struct vmcs02_list *item; +- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) +- if (item->vmptr == vmptr) { +- free_loaded_vmcs(&item->vmcs02); +- list_del(&item->list); +- kfree(item); +- vmx->nested.vmcs02_num--; +- return; +- } +-} +- +-/* +- * Free all VMCSs saved for this vcpu, except the one pointed by +- * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs +- * must be &vmx->vmcs01. +- */ +-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) +-{ +- struct vmcs02_list *item, *n; +- +- WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01); +- list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { +- /* +- * Something will leak if the above WARN triggers. Better than +- * a use-after-free. 
+- */ +- if (vmx->loaded_vmcs == &item->vmcs02) +- continue; +- +- free_loaded_vmcs(&item->vmcs02); +- list_del(&item->list); +- kfree(item); +- vmx->nested.vmcs02_num--; +- } +-} +- +-/* + * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), + * set the success or error code of an emulated VMX instruction, as specified + * by Vol 2B, VMX Instruction Reference, "Conventions". +@@ -7084,6 +6988,12 @@ static int handle_vmon(struct kvm_vcpu * + return 1; + } + ++ vmx->nested.vmcs02.vmcs = alloc_vmcs(); ++ vmx->nested.vmcs02.shadow_vmcs = NULL; ++ if (!vmx->nested.vmcs02.vmcs) ++ goto out_vmcs02; ++ loaded_vmcs_init(&vmx->nested.vmcs02); ++ + if (cpu_has_vmx_msr_bitmap()) { + vmx->nested.msr_bitmap = + (unsigned long *)__get_free_page(GFP_KERNEL); +@@ -7106,9 +7016,6 @@ static int handle_vmon(struct kvm_vcpu * + vmx->vmcs01.shadow_vmcs = shadow_vmcs; + } + +- INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); +- vmx->nested.vmcs02_num = 0; +- + hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); + vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; +@@ -7126,6 +7033,9 @@ out_cached_vmcs12: + free_page((unsigned long)vmx->nested.msr_bitmap); + + out_msr_bitmap: ++ free_loaded_vmcs(&vmx->nested.vmcs02); ++ ++out_vmcs02: + return -ENOMEM; + } + +@@ -7211,7 +7121,7 @@ static void free_nested(struct vcpu_vmx + vmx->vmcs01.shadow_vmcs = NULL; + } + kfree(vmx->nested.cached_vmcs12); +- /* Unpin physical memory we referred to in current vmcs02 */ ++ /* Unpin physical memory we referred to in the vmcs02 */ + if (vmx->nested.apic_access_page) { + nested_release_page(vmx->nested.apic_access_page); + vmx->nested.apic_access_page = NULL; +@@ -7227,7 +7137,7 @@ static void free_nested(struct vcpu_vmx + vmx->nested.pi_desc = NULL; + } + +- nested_free_all_saved_vmcss(vmx); ++ free_loaded_vmcs(&vmx->nested.vmcs02); + } + + /* Emulate the VMXOFF instruction */ +@@ -7261,8 +7171,6 @@ static int handle_vmclear(struct kvm_vcp + vmptr + offsetof(struct vmcs12, launch_state), + &zero, sizeof(zero)); + +- nested_free_vmcs02(vmx, vmptr); +- + skip_emulated_instruction(vcpu); + nested_vmx_succeed(vcpu); + return 1; +@@ -8051,10 +7959,11 @@ static bool nested_vmx_exit_handled(stru + + /* + * The host physical addresses of some pages of guest memory +- * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU +- * may write to these pages via their host physical address while +- * L2 is running, bypassing any address-translation-based dirty +- * tracking (e.g. EPT write protection). ++ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC ++ * Page). The CPU may write to these pages via their host ++ * physical address while L2 is running, bypassing any ++ * address-translation-based dirty tracking (e.g. EPT write ++ * protection). + * + * Mark them dirty on every exit from L2 to prevent them from + * getting out of sync with dirty tracking. +@@ -10223,7 +10132,6 @@ static int nested_vmx_run(struct kvm_vcp + struct vmcs12 *vmcs12; + struct vcpu_vmx *vmx = to_vmx(vcpu); + int cpu; +- struct loaded_vmcs *vmcs02; + bool ia32e; + u32 msr_entry_idx; + +@@ -10363,17 +10271,13 @@ static int nested_vmx_run(struct kvm_vcp + * the nested entry. 
+ */ + +- vmcs02 = nested_get_current_vmcs02(vmx); +- if (!vmcs02) +- return -ENOMEM; +- + enter_guest_mode(vcpu); + + if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) + vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + + cpu = get_cpu(); +- vmx->loaded_vmcs = vmcs02; ++ vmx->loaded_vmcs = &vmx->nested.vmcs02; + vmx_vcpu_put(vcpu); + vmx_vcpu_load(vcpu, cpu); + vcpu->cpu = cpu; +@@ -10888,10 +10792,6 @@ static void nested_vmx_vmexit(struct kvm + vm_exit_controls_reset_shadow(vmx); + vmx_segment_cache_clear(vmx); + +- /* if no vmcs02 cache requested, remove the one we used */ +- if (VMCS02_POOL_SIZE == 0) +- nested_free_vmcs02(vmx, vmx->nested.current_vmptr); +- + load_vmcs12_host_state(vcpu, vmcs12); + + /* Update any VMCS fields that might have changed while L2 ran */ diff --git a/queue-4.9/kvm-nvmx-mark-vmcs12-pages-dirty-on-l2-exit.patch b/queue-4.9/kvm-nvmx-mark-vmcs12-pages-dirty-on-l2-exit.patch new file mode 100644 index 00000000000..9361ea07fba --- /dev/null +++ b/queue-4.9/kvm-nvmx-mark-vmcs12-pages-dirty-on-l2-exit.patch @@ -0,0 +1,116 @@ +From c9f04407f2e0b3fc9ff7913c65fcfcb0a4b61570 Mon Sep 17 00:00:00 2001 +From: David Matlack +Date: Tue, 1 Aug 2017 14:00:40 -0700 +Subject: KVM: nVMX: mark vmcs12 pages dirty on L2 exit +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: David Matlack + +commit c9f04407f2e0b3fc9ff7913c65fcfcb0a4b61570 upstream. + +The host physical addresses of L1's Virtual APIC Page and Posted +Interrupt descriptor are loaded into the VMCS02. The CPU may write +to these pages via their host physical address while L2 is running, +bypassing address-translation-based dirty tracking (e.g. EPT write +protection). Mark them dirty on every exit from L2 to prevent them +from getting out of sync with dirty tracking. + +Also mark the virtual APIC page and the posted interrupt descriptor +dirty when KVM is virtualizing posted interrupt processing. + +Signed-off-by: David Matlack +Reviewed-by: Paolo Bonzini +Signed-off-by: Radim Krčmář +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx.c | 53 +++++++++++++++++++++++++++++++++++++++++++---------- + 1 file changed, 43 insertions(+), 10 deletions(-) + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -4738,6 +4738,28 @@ static bool vmx_get_enable_apicv(void) + return enable_apicv; + } + ++static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) ++{ ++ struct vmcs12 *vmcs12 = get_vmcs12(vcpu); ++ gfn_t gfn; ++ ++ /* ++ * Don't need to mark the APIC access page dirty; it is never ++ * written to by the CPU during APIC virtualization. 
++ */ ++ ++ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { ++ gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; ++ kvm_vcpu_mark_page_dirty(vcpu, gfn); ++ } ++ ++ if (nested_cpu_has_posted_intr(vmcs12)) { ++ gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; ++ kvm_vcpu_mark_page_dirty(vcpu, gfn); ++ } ++} ++ ++ + static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) + { + struct vcpu_vmx *vmx = to_vmx(vcpu); +@@ -4745,18 +4767,15 @@ static void vmx_complete_nested_posted_i + void *vapic_page; + u16 status; + +- if (vmx->nested.pi_desc && +- vmx->nested.pi_pending) { +- vmx->nested.pi_pending = false; +- if (!pi_test_and_clear_on(vmx->nested.pi_desc)) +- return; +- +- max_irr = find_last_bit( +- (unsigned long *)vmx->nested.pi_desc->pir, 256); ++ if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) ++ return; + +- if (max_irr == 256) +- return; ++ vmx->nested.pi_pending = false; ++ if (!pi_test_and_clear_on(vmx->nested.pi_desc)) ++ return; + ++ max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); ++ if (max_irr != 256) { + vapic_page = kmap(vmx->nested.virtual_apic_page); + if (!vapic_page) { + WARN_ON(1); +@@ -4772,6 +4791,8 @@ static void vmx_complete_nested_posted_i + vmcs_write16(GUEST_INTR_STATUS, status); + } + } ++ ++ nested_mark_vmcs12_pages_dirty(vcpu); + } + + static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) +@@ -8028,6 +8049,18 @@ static bool nested_vmx_exit_handled(stru + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + KVM_ISA_VMX); + ++ /* ++ * The host physical addresses of some pages of guest memory ++ * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU ++ * may write to these pages via their host physical address while ++ * L2 is running, bypassing any address-translation-based dirty ++ * tracking (e.g. EPT write protection). ++ * ++ * Mark them dirty on every exit from L2 to prevent them from ++ * getting out of sync with dirty tracking. ++ */ ++ nested_mark_vmcs12_pages_dirty(vcpu); ++ + if (vmx->nested.nested_run_pending) + return false; + diff --git a/queue-4.9/kvm-nvmx-vmx_complete_nested_posted_interrupt-can-t-fail.patch b/queue-4.9/kvm-nvmx-vmx_complete_nested_posted_interrupt-can-t-fail.patch new file mode 100644 index 00000000000..9c5498f1bc9 --- /dev/null +++ b/queue-4.9/kvm-nvmx-vmx_complete_nested_posted_interrupt-can-t-fail.patch @@ -0,0 +1,65 @@ +From 6342c50ad12e8ce0736e722184a7dbdea4a3477f Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Wed, 25 Jan 2017 11:58:58 +0100 +Subject: KVM: nVMX: vmx_complete_nested_posted_interrupt() can't fail + +From: David Hildenbrand + +commit 6342c50ad12e8ce0736e722184a7dbdea4a3477f upstream. + +vmx_complete_nested_posted_interrupt() can't fail, let's turn it into +a void function. 
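Every return path already produced 0, so the int return type carried no information; the only caller, vmx_check_nested_events(), now calls it and returns 0 itself. A condensed before/after of that call site, matching the hunks below:

	/* before */
	return vmx_complete_nested_posted_interrupt(vcpu);

	/* after */
	vmx_complete_nested_posted_interrupt(vcpu);
	return 0;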
+ +Signed-off-by: David Hildenbrand +Signed-off-by: Paolo Bonzini +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -4738,7 +4738,7 @@ static bool vmx_get_enable_apicv(void) + return enable_apicv; + } + +-static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) ++static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) + { + struct vcpu_vmx *vmx = to_vmx(vcpu); + int max_irr; +@@ -4749,13 +4749,13 @@ static int vmx_complete_nested_posted_in + vmx->nested.pi_pending) { + vmx->nested.pi_pending = false; + if (!pi_test_and_clear_on(vmx->nested.pi_desc)) +- return 0; ++ return; + + max_irr = find_last_bit( + (unsigned long *)vmx->nested.pi_desc->pir, 256); + + if (max_irr == 256) +- return 0; ++ return; + + vapic_page = kmap(vmx->nested.virtual_apic_page); + if (!vapic_page) { +@@ -4772,7 +4772,6 @@ static int vmx_complete_nested_posted_in + vmcs_write16(GUEST_INTR_STATUS, status); + } + } +- return 0; + } + + static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) +@@ -10493,7 +10492,8 @@ static int vmx_check_nested_events(struc + return 0; + } + +- return vmx_complete_nested_posted_interrupt(vcpu); ++ vmx_complete_nested_posted_interrupt(vcpu); ++ return 0; + } + + static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) diff --git a/queue-4.9/kvm-svm-allow-direct-access-to-msr_ia32_spec_ctrl.patch b/queue-4.9/kvm-svm-allow-direct-access-to-msr_ia32_spec_ctrl.patch new file mode 100644 index 00000000000..b81a525cb8b --- /dev/null +++ b/queue-4.9/kvm-svm-allow-direct-access-to-msr_ia32_spec_ctrl.patch @@ -0,0 +1,189 @@ +From b2ac58f90540e39324e7a29a7ad471407ae0bf48 Mon Sep 17 00:00:00 2001 +From: KarimAllah Ahmed +Date: Sat, 3 Feb 2018 15:56:23 +0100 +Subject: KVM/SVM: Allow direct access to MSR_IA32_SPEC_CTRL + +From: KarimAllah Ahmed + +commit b2ac58f90540e39324e7a29a7ad471407ae0bf48 upstream. + +[ Based on a patch from Paolo Bonzini ] + +... basically doing exactly what we do for VMX: + +- Passthrough SPEC_CTRL to guests (if enabled in guest CPUID) +- Save and restore SPEC_CTRL around VMExit and VMEntry only if the guest + actually used it. 
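The save/restore half of this is a pair of small additions bracketing the VMRUN sequence in svm_vcpu_run(); condensed from the hunks below (the comments here are added for illustration):

	if (svm->spec_ctrl)
		wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);

	/* ... inline asm issuing VMRUN ... */

	/* Save the guest value only if the MSR is really passed through,
	 * then make sure the host runs with SPEC_CTRL == 0 again. */
	if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
		rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
	if (svm->spec_ctrl)
		wrmsrl(MSR_IA32_SPEC_CTRL, 0);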
+ +Signed-off-by: KarimAllah Ahmed +Signed-off-by: David Woodhouse +Signed-off-by: Thomas Gleixner +Reviewed-by: Darren Kenny +Reviewed-by: Konrad Rzeszutek Wilk +Cc: Andrea Arcangeli +Cc: Andi Kleen +Cc: Jun Nakajima +Cc: kvm@vger.kernel.org +Cc: Dave Hansen +Cc: Tim Chen +Cc: Andy Lutomirski +Cc: Asit Mallick +Cc: Arjan Van De Ven +Cc: Greg KH +Cc: Paolo Bonzini +Cc: Dan Williams +Cc: Linus Torvalds +Cc: Ashok Raj +Link: https://lkml.kernel.org/r/1517669783-20732-1-git-send-email-karahmed@amazon.de +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 88 insertions(+) + +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -183,6 +183,8 @@ struct vcpu_svm { + u64 gs_base; + } host; + ++ u64 spec_ctrl; ++ + u32 *msrpm; + + ulong nmi_iret_rip; +@@ -248,6 +250,7 @@ static const struct svm_direct_access_ms + { .index = MSR_CSTAR, .always = true }, + { .index = MSR_SYSCALL_MASK, .always = true }, + #endif ++ { .index = MSR_IA32_SPEC_CTRL, .always = false }, + { .index = MSR_IA32_PRED_CMD, .always = false }, + { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, + { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, +@@ -863,6 +866,25 @@ static bool valid_msr_intercept(u32 inde + return false; + } + ++static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr) ++{ ++ u8 bit_write; ++ unsigned long tmp; ++ u32 offset; ++ u32 *msrpm; ++ ++ msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm: ++ to_svm(vcpu)->msrpm; ++ ++ offset = svm_msrpm_offset(msr); ++ bit_write = 2 * (msr & 0x0f) + 1; ++ tmp = msrpm[offset]; ++ ++ BUG_ON(offset == MSR_INVALID); ++ ++ return !!test_bit(bit_write, &tmp); ++} ++ + static void set_msr_interception(u32 *msrpm, unsigned msr, + int read, int write) + { +@@ -1537,6 +1559,8 @@ static void svm_vcpu_reset(struct kvm_vc + u32 dummy; + u32 eax = 1; + ++ svm->spec_ctrl = 0; ++ + if (!init_event) { + svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; +@@ -3520,6 +3544,13 @@ static int svm_get_msr(struct kvm_vcpu * + case MSR_VM_CR: + msr_info->data = svm->nested.vm_cr_msr; + break; ++ case MSR_IA32_SPEC_CTRL: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has_ibrs(vcpu)) ++ return 1; ++ ++ msr_info->data = svm->spec_ctrl; ++ break; + case MSR_IA32_UCODE_REV: + msr_info->data = 0x01000065; + break; +@@ -3611,6 +3642,33 @@ static int svm_set_msr(struct kvm_vcpu * + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, msr); + break; ++ case MSR_IA32_SPEC_CTRL: ++ if (!msr->host_initiated && ++ !guest_cpuid_has_ibrs(vcpu)) ++ return 1; ++ ++ /* The STIBP bit doesn't fault even if it's not advertised */ ++ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) ++ return 1; ++ ++ svm->spec_ctrl = data; ++ ++ if (!data) ++ break; ++ ++ /* ++ * For non-nested: ++ * When it's written (to non-zero) for the first time, pass ++ * it through. ++ * ++ * For nested: ++ * The handling of the MSR bitmap for L2 guests is done in ++ * nested_svm_vmrun_msrpm. ++ * We update the L1 MSR bit as well since it will end up ++ * touching the MSR anyway now. ++ */ ++ set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); ++ break; + case MSR_IA32_PRED_CMD: + if (!msr->host_initiated && + !guest_cpuid_has_ibpb(vcpu)) +@@ -4854,6 +4912,15 @@ static void svm_vcpu_run(struct kvm_vcpu + + local_irq_enable(); + ++ /* ++ * If this vCPU has touched SPEC_CTRL, restore the guest's value if ++ * it's non-zero. 
Since vmentry is serialising on affected CPUs, there ++ * is no need to worry about the conditional branch over the wrmsr ++ * being speculatively taken. ++ */ ++ if (svm->spec_ctrl) ++ wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); ++ + asm volatile ( + "push %%" _ASM_BP "; \n\t" + "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" +@@ -4946,6 +5013,27 @@ static void svm_vcpu_run(struct kvm_vcpu + #endif + ); + ++ /* ++ * We do not use IBRS in the kernel. If this vCPU has used the ++ * SPEC_CTRL MSR it may have left it on; save the value and ++ * turn it off. This is much more efficient than blindly adding ++ * it to the atomic save/restore list. Especially as the former ++ * (Saving guest MSRs on vmexit) doesn't even exist in KVM. ++ * ++ * For non-nested case: ++ * If the L01 MSR bitmap does not intercept the MSR, then we need to ++ * save it. ++ * ++ * For nested case: ++ * If the L02 MSR bitmap does not intercept the MSR, then we need to ++ * save it. ++ */ ++ if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) ++ rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); ++ ++ if (svm->spec_ctrl) ++ wrmsrl(MSR_IA32_SPEC_CTRL, 0); ++ + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + diff --git a/queue-4.9/kvm-vmx-allow-direct-access-to-msr_ia32_spec_ctrl.patch b/queue-4.9/kvm-vmx-allow-direct-access-to-msr_ia32_spec_ctrl.patch new file mode 100644 index 00000000000..b94e36d76a1 --- /dev/null +++ b/queue-4.9/kvm-vmx-allow-direct-access-to-msr_ia32_spec_ctrl.patch @@ -0,0 +1,296 @@ +From d28b387fb74da95d69d2615732f50cceb38e9a4d Mon Sep 17 00:00:00 2001 +From: KarimAllah Ahmed +Date: Thu, 1 Feb 2018 22:59:45 +0100 +Subject: KVM/VMX: Allow direct access to MSR_IA32_SPEC_CTRL + +From: KarimAllah Ahmed + +commit d28b387fb74da95d69d2615732f50cceb38e9a4d upstream. + +[ Based on a patch from Ashok Raj ] + +Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for +guests that will only mitigate Spectre V2 through IBRS+IBPB and will not +be using a retpoline+IBPB based approach. + +To avoid the overhead of saving and restoring the MSR_IA32_SPEC_CTRL for +guests that do not actually use the MSR, only start saving and restoring +when a non-zero is written to it. + +No attempt is made to handle STIBP here, intentionally. Filtering STIBP +may be added in a future patch, which may require trapping all writes +if we don't want to pass it through directly to the guest. 
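The lazy passthrough lives in the WRMSR handler: MSR_IA32_SPEC_CTRL stays intercepted until the guest writes a non-zero value, at which point the vmcs01 bitmap is opened for it. Condensed from the vmx_set_msr() hunk below (comments added here for illustration):

	case MSR_IA32_SPEC_CTRL:
		if (!msr_info->host_initiated && !guest_cpuid_has_ibrs(vcpu))
			return 1;	/* not advertised to this guest */

		/* The STIBP bit doesn't fault even if it's not advertised */
		if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
			return 1;

		vmx->spec_ctrl = data;
		if (!data)
			break;	/* keep intercepting until the first non-zero write */

		vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
					      MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
		break;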
+ +[dwmw2: Clean up CPUID bits, save/restore manually, handle reset] + +Signed-off-by: KarimAllah Ahmed +Signed-off-by: David Woodhouse +Signed-off-by: Thomas Gleixner +Reviewed-by: Darren Kenny +Reviewed-by: Konrad Rzeszutek Wilk +Reviewed-by: Jim Mattson +Cc: Andrea Arcangeli +Cc: Andi Kleen +Cc: Jun Nakajima +Cc: kvm@vger.kernel.org +Cc: Dave Hansen +Cc: Tim Chen +Cc: Andy Lutomirski +Cc: Asit Mallick +Cc: Arjan Van De Ven +Cc: Greg KH +Cc: Paolo Bonzini +Cc: Dan Williams +Cc: Linus Torvalds +Cc: Ashok Raj +Link: https://lkml.kernel.org/r/1517522386-18410-5-git-send-email-karahmed@amazon.de +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/cpuid.c | 8 ++- + arch/x86/kvm/cpuid.h | 11 +++++ + arch/x86/kvm/vmx.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++- + arch/x86/kvm/x86.c | 2 + 4 files changed, 118 insertions(+), 6 deletions(-) + +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -357,7 +357,7 @@ static inline int __do_cpuid_ent(struct + + /* cpuid 0x80000008.ebx */ + const u32 kvm_cpuid_8000_0008_ebx_x86_features = +- F(IBPB); ++ F(IBPB) | F(IBRS); + + /* cpuid 0xC0000001.edx */ + const u32 kvm_cpuid_C000_0001_edx_x86_features = +@@ -382,7 +382,7 @@ static inline int __do_cpuid_ent(struct + + /* cpuid 7.0.edx*/ + const u32 kvm_cpuid_7_0_edx_x86_features = +- F(ARCH_CAPABILITIES); ++ F(SPEC_CTRL) | F(ARCH_CAPABILITIES); + + /* all calls to cpuid_count() should be made on the same cpu */ + get_cpu(); +@@ -618,9 +618,11 @@ static inline int __do_cpuid_ent(struct + g_phys_as = phys_as; + entry->eax = g_phys_as | (virt_as << 8); + entry->edx = 0; +- /* IBPB isn't necessarily present in hardware cpuid */ ++ /* IBRS and IBPB aren't necessarily present in hardware cpuid */ + if (boot_cpu_has(X86_FEATURE_IBPB)) + entry->ebx |= F(IBPB); ++ if (boot_cpu_has(X86_FEATURE_IBRS)) ++ entry->ebx |= F(IBRS); + entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; + cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); + break; +--- a/arch/x86/kvm/cpuid.h ++++ b/arch/x86/kvm/cpuid.h +@@ -171,6 +171,17 @@ static inline bool guest_cpuid_has_ibpb( + return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); + } + ++static inline bool guest_cpuid_has_ibrs(struct kvm_vcpu *vcpu) ++{ ++ struct kvm_cpuid_entry2 *best; ++ ++ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); ++ if (best && (best->ebx & bit(X86_FEATURE_IBRS))) ++ return true; ++ best = kvm_find_cpuid_entry(vcpu, 7, 0); ++ return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); ++} ++ + static inline bool guest_cpuid_has_arch_capabilities(struct kvm_vcpu *vcpu) + { + struct kvm_cpuid_entry2 *best; +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -551,6 +551,7 @@ struct vcpu_vmx { + #endif + + u64 arch_capabilities; ++ u64 spec_ctrl; + + u32 vm_entry_controls_shadow; + u32 vm_exit_controls_shadow; +@@ -1854,6 +1855,29 @@ static void update_exception_bitmap(stru + } + + /* ++ * Check if MSR is intercepted for currently loaded MSR bitmap. ++ */ ++static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) ++{ ++ unsigned long *msr_bitmap; ++ int f = sizeof(unsigned long); ++ ++ if (!cpu_has_vmx_msr_bitmap()) ++ return true; ++ ++ msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap; ++ ++ if (msr <= 0x1fff) { ++ return !!test_bit(msr, msr_bitmap + 0x800 / f); ++ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { ++ msr &= 0x1fff; ++ return !!test_bit(msr, msr_bitmap + 0xc00 / f); ++ } ++ ++ return true; ++} ++ ++/* + * Check if MSR is intercepted for L01 MSR bitmap. 
+ */ + static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) +@@ -2983,6 +3007,13 @@ static int vmx_get_msr(struct kvm_vcpu * + case MSR_IA32_TSC: + msr_info->data = guest_read_tsc(vcpu); + break; ++ case MSR_IA32_SPEC_CTRL: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has_ibrs(vcpu)) ++ return 1; ++ ++ msr_info->data = to_vmx(vcpu)->spec_ctrl; ++ break; + case MSR_IA32_ARCH_CAPABILITIES: + if (!msr_info->host_initiated && + !guest_cpuid_has_arch_capabilities(vcpu)) +@@ -3093,6 +3124,36 @@ static int vmx_set_msr(struct kvm_vcpu * + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, msr_info); + break; ++ case MSR_IA32_SPEC_CTRL: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has_ibrs(vcpu)) ++ return 1; ++ ++ /* The STIBP bit doesn't fault even if it's not advertised */ ++ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) ++ return 1; ++ ++ vmx->spec_ctrl = data; ++ ++ if (!data) ++ break; ++ ++ /* ++ * For non-nested: ++ * When it's written (to non-zero) for the first time, pass ++ * it through. ++ * ++ * For nested: ++ * The handling of the MSR bitmap for L2 guests is done in ++ * nested_vmx_merge_msr_bitmap. We should not touch the ++ * vmcs02.msr_bitmap here since it gets completely overwritten ++ * in the merging. We update the vmcs01 here for L1 as well ++ * since it will end up touching the MSR anyway now. ++ */ ++ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, ++ MSR_IA32_SPEC_CTRL, ++ MSR_TYPE_RW); ++ break; + case MSR_IA32_PRED_CMD: + if (!msr_info->host_initiated && + !guest_cpuid_has_ibpb(vcpu)) +@@ -5245,6 +5306,7 @@ static void vmx_vcpu_reset(struct kvm_vc + u64 cr0; + + vmx->rmode.vm86_active = 0; ++ vmx->spec_ctrl = 0; + + vmx->soft_vnmi_blocked = 0; + +@@ -8830,6 +8892,15 @@ static void __noclone vmx_vcpu_run(struc + + vmx_arm_hv_timer(vcpu); + ++ /* ++ * If this vCPU has touched SPEC_CTRL, restore the guest's value if ++ * it's non-zero. Since vmentry is serialising on affected CPUs, there ++ * is no need to worry about the conditional branch over the wrmsr ++ * being speculatively taken. ++ */ ++ if (vmx->spec_ctrl) ++ wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); ++ + vmx->__launched = vmx->loaded_vmcs->launched; + asm( + /* Store host registers */ +@@ -8948,6 +9019,27 @@ static void __noclone vmx_vcpu_run(struc + #endif + ); + ++ /* ++ * We do not use IBRS in the kernel. If this vCPU has used the ++ * SPEC_CTRL MSR it may have left it on; save the value and ++ * turn it off. This is much more efficient than blindly adding ++ * it to the atomic save/restore list. Especially as the former ++ * (Saving guest MSRs on vmexit) doesn't even exist in KVM. ++ * ++ * For non-nested case: ++ * If the L01 MSR bitmap does not intercept the MSR, then we need to ++ * save it. ++ * ++ * For nested case: ++ * If the L02 MSR bitmap does not intercept the MSR, then we need to ++ * save it. ++ */ ++ if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) ++ rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); ++ ++ if (vmx->spec_ctrl) ++ wrmsrl(MSR_IA32_SPEC_CTRL, 0); ++ + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + +@@ -9507,7 +9599,7 @@ static inline bool nested_vmx_merge_msr_ + unsigned long *msr_bitmap_l1; + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; + /* +- * pred_cmd is trying to verify two things: ++ * pred_cmd & spec_ctrl are trying to verify two things: + * + * 1. L0 gave a permission to L1 to actually passthrough the MSR. 
This + * ensures that we do not accidentally generate an L02 MSR bitmap +@@ -9520,9 +9612,10 @@ static inline bool nested_vmx_merge_msr_ + * the MSR. + */ + bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); ++ bool spec_ctrl = msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL); + + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && +- !pred_cmd) ++ !pred_cmd && !spec_ctrl) + return false; + + page = nested_get_page(vcpu, vmcs12->msr_bitmap); +@@ -9561,6 +9654,12 @@ static inline bool nested_vmx_merge_msr_ + } + } + ++ if (spec_ctrl) ++ nested_vmx_disable_intercept_for_msr( ++ msr_bitmap_l1, msr_bitmap_l0, ++ MSR_IA32_SPEC_CTRL, ++ MSR_TYPE_R | MSR_TYPE_W); ++ + if (pred_cmd) + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -975,7 +975,7 @@ static u32 msrs_to_save[] = { + #endif + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, +- MSR_IA32_ARCH_CAPABILITIES ++ MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES + }; + + static unsigned num_msrs_to_save; diff --git a/queue-4.9/kvm-vmx-emulate-msr_ia32_arch_capabilities.patch b/queue-4.9/kvm-vmx-emulate-msr_ia32_arch_capabilities.patch new file mode 100644 index 00000000000..4b8d14c941e --- /dev/null +++ b/queue-4.9/kvm-vmx-emulate-msr_ia32_arch_capabilities.patch @@ -0,0 +1,147 @@ +From 28c1c9fabf48d6ad596273a11c46e0d0da3e14cd Mon Sep 17 00:00:00 2001 +From: KarimAllah Ahmed +Date: Thu, 1 Feb 2018 22:59:44 +0100 +Subject: KVM/VMX: Emulate MSR_IA32_ARCH_CAPABILITIES + +From: KarimAllah Ahmed + +commit 28c1c9fabf48d6ad596273a11c46e0d0da3e14cd upstream. + +Intel processors use MSR_IA32_ARCH_CAPABILITIES MSR to indicate RDCL_NO +(bit 0) and IBRS_ALL (bit 1). This is a read-only MSR. By default the +contents will come directly from the hardware, but user-space can still +override it. + +[dwmw2: The bit in kvm_cpuid_7_0_edx_x86_features can be unconditional] + +Signed-off-by: KarimAllah Ahmed +Signed-off-by: David Woodhouse +Signed-off-by: Thomas Gleixner +Reviewed-by: Paolo Bonzini +Reviewed-by: Darren Kenny +Reviewed-by: Jim Mattson +Reviewed-by: Konrad Rzeszutek Wilk +Cc: Andrea Arcangeli +Cc: Andi Kleen +Cc: Jun Nakajima +Cc: kvm@vger.kernel.org +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Andy Lutomirski +Cc: Asit Mallick +Cc: Arjan Van De Ven +Cc: Greg KH +Cc: Dan Williams +Cc: Tim Chen +Cc: Ashok Raj +Link: https://lkml.kernel.org/r/1517522386-18410-4-git-send-email-karahmed@amazon.de +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/cpuid.c | 8 +++++++- + arch/x86/kvm/cpuid.h | 8 ++++++++ + arch/x86/kvm/vmx.c | 15 +++++++++++++++ + arch/x86/kvm/x86.c | 1 + + 4 files changed, 31 insertions(+), 1 deletion(-) + +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -380,6 +380,10 @@ static inline int __do_cpuid_ent(struct + /* cpuid 7.0.ecx*/ + const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/; + ++ /* cpuid 7.0.edx*/ ++ const u32 kvm_cpuid_7_0_edx_x86_features = ++ F(ARCH_CAPABILITIES); ++ + /* all calls to cpuid_count() should be made on the same cpu */ + get_cpu(); + +@@ -462,12 +466,14 @@ static inline int __do_cpuid_ent(struct + /* PKU is not yet implemented for shadow paging. 
*/ + if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) + entry->ecx &= ~F(PKU); ++ entry->edx &= kvm_cpuid_7_0_edx_x86_features; ++ cpuid_mask(&entry->edx, CPUID_7_EDX); + } else { + entry->ebx = 0; + entry->ecx = 0; ++ entry->edx = 0; + } + entry->eax = 0; +- entry->edx = 0; + break; + } + case 9: +--- a/arch/x86/kvm/cpuid.h ++++ b/arch/x86/kvm/cpuid.h +@@ -171,6 +171,14 @@ static inline bool guest_cpuid_has_ibpb( + return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); + } + ++static inline bool guest_cpuid_has_arch_capabilities(struct kvm_vcpu *vcpu) ++{ ++ struct kvm_cpuid_entry2 *best; ++ ++ best = kvm_find_cpuid_entry(vcpu, 7, 0); ++ return best && (best->edx & bit(X86_FEATURE_ARCH_CAPABILITIES)); ++} ++ + + /* + * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -550,6 +550,8 @@ struct vcpu_vmx { + u64 msr_guest_kernel_gs_base; + #endif + ++ u64 arch_capabilities; ++ + u32 vm_entry_controls_shadow; + u32 vm_exit_controls_shadow; + /* +@@ -2981,6 +2983,12 @@ static int vmx_get_msr(struct kvm_vcpu * + case MSR_IA32_TSC: + msr_info->data = guest_read_tsc(vcpu); + break; ++ case MSR_IA32_ARCH_CAPABILITIES: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has_arch_capabilities(vcpu)) ++ return 1; ++ msr_info->data = to_vmx(vcpu)->arch_capabilities; ++ break; + case MSR_IA32_SYSENTER_CS: + msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); + break; +@@ -3112,6 +3120,11 @@ static int vmx_set_msr(struct kvm_vcpu * + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, + MSR_TYPE_W); + break; ++ case MSR_IA32_ARCH_CAPABILITIES: ++ if (!msr_info->host_initiated) ++ return 1; ++ vmx->arch_capabilities = data; ++ break; + case MSR_IA32_CR_PAT: + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) +@@ -5202,6 +5215,8 @@ static int vmx_vcpu_setup(struct vcpu_vm + ++vmx->nmsrs; + } + ++ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) ++ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities); + + vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -975,6 +975,7 @@ static u32 msrs_to_save[] = { + #endif + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, ++ MSR_IA32_ARCH_CAPABILITIES + }; + + static unsigned num_msrs_to_save; diff --git a/queue-4.9/kvm-vmx-introduce-alloc_loaded_vmcs.patch b/queue-4.9/kvm-vmx-introduce-alloc_loaded_vmcs.patch new file mode 100644 index 00000000000..32d35f88e11 --- /dev/null +++ b/queue-4.9/kvm-vmx-introduce-alloc_loaded_vmcs.patch @@ -0,0 +1,101 @@ +From f21f165ef922c2146cc5bdc620f542953c41714b Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Thu, 11 Jan 2018 12:16:15 +0100 +Subject: KVM: VMX: introduce alloc_loaded_vmcs + +From: Paolo Bonzini + +commit f21f165ef922c2146cc5bdc620f542953c41714b upstream. + +Group together the calls to alloc_vmcs and loaded_vmcs_init. Soon we'll also +allocate an MSR bitmap there. 
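The helper is simply the existing three-step sequence factored out, so vmcs01 and the nested vmcs02 share one allocation path (as added in the hunk below):

	static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
	{
		loaded_vmcs->vmcs = alloc_vmcs();
		if (!loaded_vmcs->vmcs)
			return -ENOMEM;

		loaded_vmcs->shadow_vmcs = NULL;
		loaded_vmcs_init(loaded_vmcs);
		return 0;
	}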
+ +Cc: stable@vger.kernel.org # prereq for Spectre mitigation +Signed-off-by: Paolo Bonzini +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx.c | 38 +++++++++++++++++++++++--------------- + 1 file changed, 23 insertions(+), 15 deletions(-) + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -3524,11 +3524,6 @@ static struct vmcs *alloc_vmcs_cpu(int c + return vmcs; + } + +-static struct vmcs *alloc_vmcs(void) +-{ +- return alloc_vmcs_cpu(raw_smp_processor_id()); +-} +- + static void free_vmcs(struct vmcs *vmcs) + { + free_pages((unsigned long)vmcs, vmcs_config.order); +@@ -3547,6 +3542,22 @@ static void free_loaded_vmcs(struct load + WARN_ON(loaded_vmcs->shadow_vmcs != NULL); + } + ++static struct vmcs *alloc_vmcs(void) ++{ ++ return alloc_vmcs_cpu(raw_smp_processor_id()); ++} ++ ++static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) ++{ ++ loaded_vmcs->vmcs = alloc_vmcs(); ++ if (!loaded_vmcs->vmcs) ++ return -ENOMEM; ++ ++ loaded_vmcs->shadow_vmcs = NULL; ++ loaded_vmcs_init(loaded_vmcs); ++ return 0; ++} ++ + static void free_kvm_area(void) + { + int cpu; +@@ -6949,6 +6960,7 @@ static int handle_vmon(struct kvm_vcpu * + struct vmcs *shadow_vmcs; + const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED + | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; ++ int r; + + /* The Intel VMX Instruction Reference lists a bunch of bits that + * are prerequisite to running VMXON, most notably cr4.VMXE must be +@@ -6988,11 +7000,9 @@ static int handle_vmon(struct kvm_vcpu * + return 1; + } + +- vmx->nested.vmcs02.vmcs = alloc_vmcs(); +- vmx->nested.vmcs02.shadow_vmcs = NULL; +- if (!vmx->nested.vmcs02.vmcs) ++ r = alloc_loaded_vmcs(&vmx->nested.vmcs02); ++ if (r < 0) + goto out_vmcs02; +- loaded_vmcs_init(&vmx->nested.vmcs02); + + if (cpu_has_vmx_msr_bitmap()) { + vmx->nested.msr_bitmap = +@@ -9113,17 +9123,15 @@ static struct kvm_vcpu *vmx_create_vcpu( + if (!vmx->guest_msrs) + goto free_pml; + +- vmx->loaded_vmcs = &vmx->vmcs01; +- vmx->loaded_vmcs->vmcs = alloc_vmcs(); +- vmx->loaded_vmcs->shadow_vmcs = NULL; +- if (!vmx->loaded_vmcs->vmcs) +- goto free_msrs; + if (!vmm_exclusive) + kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); +- loaded_vmcs_init(vmx->loaded_vmcs); ++ err = alloc_loaded_vmcs(&vmx->vmcs01); + if (!vmm_exclusive) + kvm_cpu_vmxoff(); ++ if (err < 0) ++ goto free_msrs; + ++ vmx->loaded_vmcs = &vmx->vmcs01; + cpu = get_cpu(); + vmx_vcpu_load(&vmx->vcpu, cpu); + vmx->vcpu.cpu = cpu; diff --git a/queue-4.9/kvm-vmx-make-msr-bitmaps-per-vcpu.patch b/queue-4.9/kvm-vmx-make-msr-bitmaps-per-vcpu.patch new file mode 100644 index 00000000000..a831850727e --- /dev/null +++ b/queue-4.9/kvm-vmx-make-msr-bitmaps-per-vcpu.patch @@ -0,0 +1,582 @@ +From 904e14fb7cb96401a7dc803ca2863fd5ba32ffe6 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Tue, 16 Jan 2018 16:51:18 +0100 +Subject: KVM: VMX: make MSR bitmaps per-VCPU + +From: Paolo Bonzini + +commit 904e14fb7cb96401a7dc803ca2863fd5ba32ffe6 upstream. + +Place the MSR bitmap in struct loaded_vmcs, and update it in place +every time the x2apic or APICv state can change. This is rare and +the loop can handle 64 MSRs per iteration, in a similar fashion as +nested_vmx_prepare_msr_bitmap. + +This prepares for choosing, on a per-VM basis, whether to intercept +the SPEC_CTRL and PRED_CMD MSRs. 
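The "64 MSRs per iteration" refers to writing one bitmap word per BITS_PER_LONG (64 on x86-64) x2APIC MSRs whenever the bitmap mode changes; condensed from the vmx_update_msr_bitmap_x2apic() hunk below (comments added here for illustration):

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		/* reads: pass through only when APICv is active */
		msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
		/* writes: intercept by default; TPR/EOI/SELF_IPI opened up below */
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}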
+ +Cc: stable@vger.kernel.org # prereq for Spectre mitigation +Suggested-by: Jim Mattson +Signed-off-by: Paolo Bonzini +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx.c | 316 +++++++++++++++++++---------------------------------- + 1 file changed, 115 insertions(+), 201 deletions(-) + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -109,6 +109,14 @@ static u64 __read_mostly host_xss; + static bool __read_mostly enable_pml = 1; + module_param_named(pml, enable_pml, bool, S_IRUGO); + ++#define MSR_TYPE_R 1 ++#define MSR_TYPE_W 2 ++#define MSR_TYPE_RW 3 ++ ++#define MSR_BITMAP_MODE_X2APIC 1 ++#define MSR_BITMAP_MODE_X2APIC_APICV 2 ++#define MSR_BITMAP_MODE_LM 4 ++ + #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL + + /* Guest_tsc -> host_tsc conversion requires 64-bit division. */ +@@ -190,6 +198,7 @@ struct loaded_vmcs { + struct vmcs *shadow_vmcs; + int cpu; + int launched; ++ unsigned long *msr_bitmap; + struct list_head loaded_vmcss_on_cpu_link; + }; + +@@ -428,8 +437,6 @@ struct nested_vmx { + bool pi_pending; + u16 posted_intr_nv; + +- unsigned long *msr_bitmap; +- + struct hrtimer preemption_timer; + bool preemption_timer_expired; + +@@ -530,6 +537,7 @@ struct vcpu_vmx { + unsigned long host_rsp; + u8 fail; + bool nmi_known_unmasked; ++ u8 msr_bitmap_mode; + u32 exit_intr_info; + u32 idt_vectoring_info; + ulong rflags; +@@ -904,6 +912,7 @@ static u32 vmx_segment_access_rights(str + static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); + static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); + static int alloc_identity_pagetable(struct kvm *kvm); ++static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); + + static DEFINE_PER_CPU(struct vmcs *, vmxarea); + static DEFINE_PER_CPU(struct vmcs *, current_vmcs); +@@ -923,12 +932,6 @@ static DEFINE_PER_CPU(spinlock_t, blocke + + static unsigned long *vmx_io_bitmap_a; + static unsigned long *vmx_io_bitmap_b; +-static unsigned long *vmx_msr_bitmap_legacy; +-static unsigned long *vmx_msr_bitmap_longmode; +-static unsigned long *vmx_msr_bitmap_legacy_x2apic; +-static unsigned long *vmx_msr_bitmap_longmode_x2apic; +-static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive; +-static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive; + static unsigned long *vmx_vmread_bitmap; + static unsigned long *vmx_vmwrite_bitmap; + +@@ -2522,36 +2525,6 @@ static void move_msr_up(struct vcpu_vmx + vmx->guest_msrs[from] = tmp; + } + +-static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) +-{ +- unsigned long *msr_bitmap; +- +- if (is_guest_mode(vcpu)) +- msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap; +- else if (cpu_has_secondary_exec_ctrls() && +- (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & +- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { +- if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) { +- if (is_long_mode(vcpu)) +- msr_bitmap = vmx_msr_bitmap_longmode_x2apic; +- else +- msr_bitmap = vmx_msr_bitmap_legacy_x2apic; +- } else { +- if (is_long_mode(vcpu)) +- msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive; +- else +- msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive; +- } +- } else { +- if (is_long_mode(vcpu)) +- msr_bitmap = vmx_msr_bitmap_longmode; +- else +- msr_bitmap = vmx_msr_bitmap_legacy; +- } +- +- vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); +-} +- + /* + * Set up the vmcs to automatically save and restore system + * msrs. 
Don't touch the 64-bit msrs if the guest is in legacy +@@ -2592,7 +2565,7 @@ static void setup_msrs(struct vcpu_vmx * + vmx->save_nmsrs = save_nmsrs; + + if (cpu_has_vmx_msr_bitmap()) +- vmx_set_msr_bitmap(&vmx->vcpu); ++ vmx_update_msr_bitmap(&vmx->vcpu); + } + + /* +@@ -3539,6 +3512,8 @@ static void free_loaded_vmcs(struct load + loaded_vmcs_clear(loaded_vmcs); + free_vmcs(loaded_vmcs->vmcs); + loaded_vmcs->vmcs = NULL; ++ if (loaded_vmcs->msr_bitmap) ++ free_page((unsigned long)loaded_vmcs->msr_bitmap); + WARN_ON(loaded_vmcs->shadow_vmcs != NULL); + } + +@@ -3555,7 +3530,18 @@ static int alloc_loaded_vmcs(struct load + + loaded_vmcs->shadow_vmcs = NULL; + loaded_vmcs_init(loaded_vmcs); ++ ++ if (cpu_has_vmx_msr_bitmap()) { ++ loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); ++ if (!loaded_vmcs->msr_bitmap) ++ goto out_vmcs; ++ memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); ++ } + return 0; ++ ++out_vmcs: ++ free_loaded_vmcs(loaded_vmcs); ++ return -ENOMEM; + } + + static void free_kvm_area(void) +@@ -4564,10 +4550,8 @@ static void free_vpid(int vpid) + spin_unlock(&vmx_vpid_lock); + } + +-#define MSR_TYPE_R 1 +-#define MSR_TYPE_W 2 +-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +- u32 msr, int type) ++static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, ++ u32 msr, int type) + { + int f = sizeof(unsigned long); + +@@ -4601,8 +4585,8 @@ static void __vmx_disable_intercept_for_ + } + } + +-static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, +- u32 msr, int type) ++static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, ++ u32 msr, int type) + { + int f = sizeof(unsigned long); + +@@ -4636,6 +4620,15 @@ static void __vmx_enable_intercept_for_m + } + } + ++static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap, ++ u32 msr, int type, bool value) ++{ ++ if (value) ++ vmx_enable_intercept_for_msr(msr_bitmap, msr, type); ++ else ++ vmx_disable_intercept_for_msr(msr_bitmap, msr, type); ++} ++ + /* + * If a msr is allowed by L0, we should check whether it is allowed by L1. + * The corresponding bit will be cleared unless both of L0 and L1 allow it. 
+@@ -4682,58 +4675,68 @@ static void nested_vmx_disable_intercept + } + } + +-static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) ++static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) + { +- if (!longmode_only) +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +- msr, MSR_TYPE_R | MSR_TYPE_W); +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, +- msr, MSR_TYPE_R | MSR_TYPE_W); +-} +- +-static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) +-{ +- if (apicv_active) { +- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, +- msr, MSR_TYPE_R); +- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, +- msr, MSR_TYPE_R); +- } else { +- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, +- msr, MSR_TYPE_R); +- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, +- msr, MSR_TYPE_R); ++ u8 mode = 0; ++ ++ if (cpu_has_secondary_exec_ctrls() && ++ (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & ++ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { ++ mode |= MSR_BITMAP_MODE_X2APIC; ++ if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) ++ mode |= MSR_BITMAP_MODE_X2APIC_APICV; + } ++ ++ if (is_long_mode(vcpu)) ++ mode |= MSR_BITMAP_MODE_LM; ++ ++ return mode; + } + +-static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) ++#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) ++ ++static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, ++ u8 mode) + { +- if (apicv_active) { +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, +- msr, MSR_TYPE_R); +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, +- msr, MSR_TYPE_R); +- } else { +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, +- msr, MSR_TYPE_R); +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, +- msr, MSR_TYPE_R); ++ int msr; ++ ++ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { ++ unsigned word = msr / BITS_PER_LONG; ++ msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; ++ msr_bitmap[word + (0x800 / sizeof(long))] = ~0; ++ } ++ ++ if (mode & MSR_BITMAP_MODE_X2APIC) { ++ /* ++ * TPR reads and writes can be virtualized even if virtual interrupt ++ * delivery is not in use. 
++ */ ++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); ++ if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { ++ vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R); ++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); ++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); ++ } + } + } + +-static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active) ++static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) + { +- if (apicv_active) { +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, +- msr, MSR_TYPE_W); +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, +- msr, MSR_TYPE_W); +- } else { +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, +- msr, MSR_TYPE_W); +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, +- msr, MSR_TYPE_W); +- } ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; ++ u8 mode = vmx_msr_bitmap_mode(vcpu); ++ u8 changed = mode ^ vmx->msr_bitmap_mode; ++ ++ if (!changed) ++ return; ++ ++ vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW, ++ !(mode & MSR_BITMAP_MODE_LM)); ++ ++ if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) ++ vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); ++ ++ vmx->msr_bitmap_mode = mode; + } + + static bool vmx_get_enable_apicv(void) +@@ -4982,7 +4985,7 @@ static void vmx_refresh_apicv_exec_ctrl( + } + + if (cpu_has_vmx_msr_bitmap()) +- vmx_set_msr_bitmap(vcpu); ++ vmx_update_msr_bitmap(vcpu); + } + + static u32 vmx_exec_control(struct vcpu_vmx *vmx) +@@ -5071,7 +5074,7 @@ static int vmx_vcpu_setup(struct vcpu_vm + vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); + } + if (cpu_has_vmx_msr_bitmap()) +- vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); ++ vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); + + vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ + +@@ -6402,7 +6405,7 @@ static void wakeup_handler(void) + + static __init int hardware_setup(void) + { +- int r = -ENOMEM, i, msr; ++ int r = -ENOMEM, i; + + rdmsrl_safe(MSR_EFER, &host_efer); + +@@ -6417,41 +6420,13 @@ static __init int hardware_setup(void) + if (!vmx_io_bitmap_b) + goto out; + +- vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx_msr_bitmap_legacy) +- goto out1; +- +- vmx_msr_bitmap_legacy_x2apic = +- (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx_msr_bitmap_legacy_x2apic) +- goto out2; +- +- vmx_msr_bitmap_legacy_x2apic_apicv_inactive = +- (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive) +- goto out3; +- +- vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx_msr_bitmap_longmode) +- goto out4; +- +- vmx_msr_bitmap_longmode_x2apic = +- (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx_msr_bitmap_longmode_x2apic) +- goto out5; +- +- vmx_msr_bitmap_longmode_x2apic_apicv_inactive = +- (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive) +- goto out6; +- + vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_vmread_bitmap) +- goto out7; ++ goto out1; + + vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_vmwrite_bitmap) +- goto out8; ++ goto out2; + + memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); + memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 
+@@ -6460,12 +6435,9 @@ static __init int hardware_setup(void) + + memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); + +- memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); +- memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); +- + if (setup_vmcs_config(&vmcs_config) < 0) { + r = -EIO; +- goto out9; ++ goto out3; + } + + if (boot_cpu_has(X86_FEATURE_NX)) +@@ -6522,47 +6494,8 @@ static __init int hardware_setup(void) + kvm_tsc_scaling_ratio_frac_bits = 48; + } + +- vmx_disable_intercept_for_msr(MSR_FS_BASE, false); +- vmx_disable_intercept_for_msr(MSR_GS_BASE, false); +- vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); +- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); +- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); +- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); +- +- memcpy(vmx_msr_bitmap_legacy_x2apic, +- vmx_msr_bitmap_legacy, PAGE_SIZE); +- memcpy(vmx_msr_bitmap_longmode_x2apic, +- vmx_msr_bitmap_longmode, PAGE_SIZE); +- memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, +- vmx_msr_bitmap_legacy, PAGE_SIZE); +- memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, +- vmx_msr_bitmap_longmode, PAGE_SIZE); +- + set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + +- /* +- * enable_apicv && kvm_vcpu_apicv_active() +- */ +- for (msr = 0x800; msr <= 0x8ff; msr++) +- vmx_disable_intercept_msr_read_x2apic(msr, true); +- +- /* TMCCT */ +- vmx_enable_intercept_msr_read_x2apic(0x839, true); +- /* TPR */ +- vmx_disable_intercept_msr_write_x2apic(0x808, true); +- /* EOI */ +- vmx_disable_intercept_msr_write_x2apic(0x80b, true); +- /* SELF-IPI */ +- vmx_disable_intercept_msr_write_x2apic(0x83f, true); +- +- /* +- * (enable_apicv && !kvm_vcpu_apicv_active()) || +- * !enable_apicv +- */ +- /* TPR */ +- vmx_disable_intercept_msr_read_x2apic(0x808, false); +- vmx_disable_intercept_msr_write_x2apic(0x808, false); +- + if (enable_ept) { + kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, + (enable_ept_ad_bits) ? 
VMX_EPT_ACCESS_BIT : 0ull, +@@ -6608,22 +6541,10 @@ static __init int hardware_setup(void) + + return alloc_kvm_area(); + +-out9: +- free_page((unsigned long)vmx_vmwrite_bitmap); +-out8: +- free_page((unsigned long)vmx_vmread_bitmap); +-out7: +- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); +-out6: +- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); +-out5: +- free_page((unsigned long)vmx_msr_bitmap_longmode); +-out4: +- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); + out3: +- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); ++ free_page((unsigned long)vmx_vmwrite_bitmap); + out2: +- free_page((unsigned long)vmx_msr_bitmap_legacy); ++ free_page((unsigned long)vmx_vmread_bitmap); + out1: + free_page((unsigned long)vmx_io_bitmap_b); + out: +@@ -6634,12 +6555,6 @@ out: + + static __exit void hardware_unsetup(void) + { +- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); +- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); +- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); +- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); +- free_page((unsigned long)vmx_msr_bitmap_legacy); +- free_page((unsigned long)vmx_msr_bitmap_longmode); + free_page((unsigned long)vmx_io_bitmap_b); + free_page((unsigned long)vmx_io_bitmap_a); + free_page((unsigned long)vmx_vmwrite_bitmap); +@@ -7004,13 +6919,6 @@ static int handle_vmon(struct kvm_vcpu * + if (r < 0) + goto out_vmcs02; + +- if (cpu_has_vmx_msr_bitmap()) { +- vmx->nested.msr_bitmap = +- (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx->nested.msr_bitmap) +- goto out_msr_bitmap; +- } +- + vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_vmcs12) + goto out_cached_vmcs12; +@@ -7040,9 +6948,6 @@ out_shadow_vmcs: + kfree(vmx->nested.cached_vmcs12); + + out_cached_vmcs12: +- free_page((unsigned long)vmx->nested.msr_bitmap); +- +-out_msr_bitmap: + free_loaded_vmcs(&vmx->nested.vmcs02); + + out_vmcs02: +@@ -7121,10 +7026,6 @@ static void free_nested(struct vcpu_vmx + vmx->nested.vmxon = false; + free_vpid(vmx->nested.vpid02); + nested_release_vmcs12(vmx); +- if (vmx->nested.msr_bitmap) { +- free_page((unsigned long)vmx->nested.msr_bitmap); +- vmx->nested.msr_bitmap = NULL; +- } + if (enable_shadow_vmcs) { + vmcs_clear(vmx->vmcs01.shadow_vmcs); + free_vmcs(vmx->vmcs01.shadow_vmcs); +@@ -8471,7 +8372,7 @@ static void vmx_set_virtual_x2apic_mode( + } + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); + +- vmx_set_msr_bitmap(vcpu); ++ vmx_update_msr_bitmap(vcpu); + } + + static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) +@@ -9091,6 +8992,7 @@ static struct kvm_vcpu *vmx_create_vcpu( + { + int err; + struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); ++ unsigned long *msr_bitmap; + int cpu; + + if (!vmx) +@@ -9131,6 +9033,15 @@ static struct kvm_vcpu *vmx_create_vcpu( + if (err < 0) + goto free_msrs; + ++ msr_bitmap = vmx->vmcs01.msr_bitmap; ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); ++ vmx->msr_bitmap_mode = 0; ++ + 
vmx->loaded_vmcs = &vmx->vmcs01; + cpu = get_cpu(); + vmx_vcpu_load(&vmx->vcpu, cpu); +@@ -9525,7 +9436,7 @@ static inline bool nested_vmx_merge_msr_ + int msr; + struct page *page; + unsigned long *msr_bitmap_l1; +- unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; ++ unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; + + /* This shortcut is ok because we support only x2APIC MSRs so far. */ + if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) +@@ -10045,6 +9956,9 @@ static void prepare_vmcs02(struct kvm_vc + if (kvm_has_tsc_control) + decache_tsc_multiplier(vmx); + ++ if (cpu_has_vmx_msr_bitmap()) ++ vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); ++ + if (enable_vpid) { + /* + * There is no direct mapping between vpid02 and vpid12, the +@@ -10749,7 +10663,7 @@ static void load_vmcs12_host_state(struc + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + if (cpu_has_vmx_msr_bitmap()) +- vmx_set_msr_bitmap(vcpu); ++ vmx_update_msr_bitmap(vcpu); + + if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, + vmcs12->vm_exit_msr_load_count)) diff --git a/queue-4.9/kvm-x86-add-ibpb-support.patch b/queue-4.9/kvm-x86-add-ibpb-support.patch new file mode 100644 index 00000000000..c893472bd1a --- /dev/null +++ b/queue-4.9/kvm-x86-add-ibpb-support.patch @@ -0,0 +1,343 @@ +From 15d45071523d89b3fb7372e2135fbd72f6af9506 Mon Sep 17 00:00:00 2001 +From: Ashok Raj +Date: Thu, 1 Feb 2018 22:59:43 +0100 +Subject: KVM/x86: Add IBPB support + +From: Ashok Raj + +commit 15d45071523d89b3fb7372e2135fbd72f6af9506 upstream. + +The Indirect Branch Predictor Barrier (IBPB) is an indirect branch +control mechanism. It keeps earlier branches from influencing +later ones. + +Unlike IBRS and STIBP, IBPB does not define a new mode of operation. +It's a command that ensures predicted branch targets aren't used after +the barrier. Although IBRS and IBPB are enumerated by the same CPUID +enumeration, IBPB is very different. + +IBPB helps mitigate against three potential attacks: + +* Mitigate guests from being attacked by other guests. + - This is addressed by issing IBPB when we do a guest switch. + +* Mitigate attacks from guest/ring3->host/ring3. + These would require a IBPB during context switch in host, or after + VMEXIT. The host process has two ways to mitigate + - Either it can be compiled with retpoline + - If its going through context switch, and has set !dumpable then + there is a IBPB in that path. + (Tim's patch: https://patchwork.kernel.org/patch/10192871) + - The case where after a VMEXIT you return back to Qemu might make + Qemu attackable from guest when Qemu isn't compiled with retpoline. + There are issues reported when doing IBPB on every VMEXIT that resulted + in some tsc calibration woes in guest. + +* Mitigate guest/ring0->host/ring0 attacks. + When host kernel is using retpoline it is safe against these attacks. + If host kernel isn't using retpoline we might need to do a IBPB flush on + every VMEXIT. + +Even when using retpoline for indirect calls, in certain conditions 'ret' +can use the BTB on Skylake-era CPUs. There are other mitigations +available like RSB stuffing/clearing. + +* IBPB is issued only for SVM during svm_free_vcpu(). + VMX has a vmclear and SVM doesn't. Follow discussion here: + https://lkml.org/lkml/2018/1/15/146 + +Please refer to the following spec for more details on the enumeration +and control. + +Refer here to get documentation about mitigations. 
+ +https://software.intel.com/en-us/side-channel-security-support + +[peterz: rebase and changelog rewrite] +[karahmed: - rebase + - vmx: expose PRED_CMD if guest has it in CPUID + - svm: only pass through IBPB if guest has it in CPUID + - vmx: support !cpu_has_vmx_msr_bitmap()] + - vmx: support nested] +[dwmw2: Expose CPUID bit too (AMD IBPB only for now as we lack IBRS) + PRED_CMD is a write-only MSR] + +Signed-off-by: Ashok Raj +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: David Woodhouse +Signed-off-by: KarimAllah Ahmed +Signed-off-by: Thomas Gleixner +Reviewed-by: Konrad Rzeszutek Wilk +Cc: Andrea Arcangeli +Cc: Andi Kleen +Cc: kvm@vger.kernel.org +Cc: Asit Mallick +Cc: Linus Torvalds +Cc: Andy Lutomirski +Cc: Dave Hansen +Cc: Arjan Van De Ven +Cc: Greg KH +Cc: Jun Nakajima +Cc: Paolo Bonzini +Cc: Dan Williams +Cc: Tim Chen +Link: http://lkml.kernel.org/r/1515720739-43819-6-git-send-email-ashok.raj@intel.com +Link: https://lkml.kernel.org/r/1517522386-18410-3-git-send-email-karahmed@amazon.de +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/cpuid.c | 11 ++++++- + arch/x86/kvm/cpuid.h | 12 +++++++ + arch/x86/kvm/svm.c | 28 ++++++++++++++++++ + arch/x86/kvm/vmx.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++-- + 4 files changed, 127 insertions(+), 3 deletions(-) + +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -355,6 +355,10 @@ static inline int __do_cpuid_ent(struct + F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | + 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); + ++ /* cpuid 0x80000008.ebx */ ++ const u32 kvm_cpuid_8000_0008_ebx_x86_features = ++ F(IBPB); ++ + /* cpuid 0xC0000001.edx */ + const u32 kvm_cpuid_C000_0001_edx_x86_features = + F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | +@@ -607,7 +611,12 @@ static inline int __do_cpuid_ent(struct + if (!g_phys_as) + g_phys_as = phys_as; + entry->eax = g_phys_as | (virt_as << 8); +- entry->ebx = entry->edx = 0; ++ entry->edx = 0; ++ /* IBPB isn't necessarily present in hardware cpuid */ ++ if (boot_cpu_has(X86_FEATURE_IBPB)) ++ entry->ebx |= F(IBPB); ++ entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; ++ cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); + break; + } + case 0x80000019: +--- a/arch/x86/kvm/cpuid.h ++++ b/arch/x86/kvm/cpuid.h +@@ -160,6 +160,18 @@ static inline bool guest_cpuid_has_rdtsc + return best && (best->edx & bit(X86_FEATURE_RDTSCP)); + } + ++static inline bool guest_cpuid_has_ibpb(struct kvm_vcpu *vcpu) ++{ ++ struct kvm_cpuid_entry2 *best; ++ ++ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); ++ if (best && (best->ebx & bit(X86_FEATURE_IBPB))) ++ return true; ++ best = kvm_find_cpuid_entry(vcpu, 7, 0); ++ return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); ++} ++ ++ + /* + * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3 + */ +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -248,6 +248,7 @@ static const struct svm_direct_access_ms + { .index = MSR_CSTAR, .always = true }, + { .index = MSR_SYSCALL_MASK, .always = true }, + #endif ++ { .index = MSR_IA32_PRED_CMD, .always = false }, + { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, + { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, + { .index = MSR_IA32_LASTINTFROMIP, .always = false }, +@@ -510,6 +511,7 @@ struct svm_cpu_data { + struct kvm_ldttss_desc *tss_desc; + + struct page *save_area; ++ struct vmcb *current_vmcb; + }; + + static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); +@@ -1644,11 +1646,17 @@ static void svm_free_vcpu(struct kvm_vcp + 
__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, svm); ++ /* ++ * The vmcb page can be recycled, causing a false negative in ++ * svm_vcpu_load(). So do a full IBPB now. ++ */ ++ indirect_branch_prediction_barrier(); + } + + static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + { + struct vcpu_svm *svm = to_svm(vcpu); ++ struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + int i; + + if (unlikely(cpu != vcpu->cpu)) { +@@ -1677,6 +1685,10 @@ static void svm_vcpu_load(struct kvm_vcp + if (static_cpu_has(X86_FEATURE_RDTSCP)) + wrmsrl(MSR_TSC_AUX, svm->tsc_aux); + ++ if (sd->current_vmcb != svm->vmcb) { ++ sd->current_vmcb = svm->vmcb; ++ indirect_branch_prediction_barrier(); ++ } + avic_vcpu_load(vcpu, cpu); + } + +@@ -3599,6 +3611,22 @@ static int svm_set_msr(struct kvm_vcpu * + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, msr); + break; ++ case MSR_IA32_PRED_CMD: ++ if (!msr->host_initiated && ++ !guest_cpuid_has_ibpb(vcpu)) ++ return 1; ++ ++ if (data & ~PRED_CMD_IBPB) ++ return 1; ++ ++ if (!data) ++ break; ++ ++ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); ++ if (is_guest_mode(vcpu)) ++ break; ++ set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); ++ break; + case MSR_STAR: + svm->vmcb->save.star = data; + break; +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -549,6 +549,7 @@ struct vcpu_vmx { + u64 msr_host_kernel_gs_base; + u64 msr_guest_kernel_gs_base; + #endif ++ + u32 vm_entry_controls_shadow; + u32 vm_exit_controls_shadow; + /* +@@ -913,6 +914,8 @@ static void copy_vmcs12_to_shadow(struct + static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); + static int alloc_identity_pagetable(struct kvm *kvm); + static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); ++static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, ++ u32 msr, int type); + + static DEFINE_PER_CPU(struct vmcs *, vmxarea); + static DEFINE_PER_CPU(struct vmcs *, current_vmcs); +@@ -1848,6 +1851,29 @@ static void update_exception_bitmap(stru + vmcs_write32(EXCEPTION_BITMAP, eb); + } + ++/* ++ * Check if MSR is intercepted for L01 MSR bitmap. ++ */ ++static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) ++{ ++ unsigned long *msr_bitmap; ++ int f = sizeof(unsigned long); ++ ++ if (!cpu_has_vmx_msr_bitmap()) ++ return true; ++ ++ msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; ++ ++ if (msr <= 0x1fff) { ++ return !!test_bit(msr, msr_bitmap + 0x800 / f); ++ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { ++ msr &= 0x1fff; ++ return !!test_bit(msr, msr_bitmap + 0xc00 / f); ++ } ++ ++ return true; ++} ++ + static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, + unsigned long entry, unsigned long exit) + { +@@ -2257,6 +2283,7 @@ static void vmx_vcpu_load(struct kvm_vcp + if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { + per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; + vmcs_load(vmx->loaded_vmcs->vmcs); ++ indirect_branch_prediction_barrier(); + } + + if (!already_loaded) { +@@ -3058,6 +3085,33 @@ static int vmx_set_msr(struct kvm_vcpu * + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, msr_info); + break; ++ case MSR_IA32_PRED_CMD: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has_ibpb(vcpu)) ++ return 1; ++ ++ if (data & ~PRED_CMD_IBPB) ++ return 1; ++ ++ if (!data) ++ break; ++ ++ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); ++ ++ /* ++ * For non-nested: ++ * When it's written (to non-zero) for the first time, pass ++ * it through. 
++ * ++ * For nested: ++ * The handling of the MSR bitmap for L2 guests is done in ++ * nested_vmx_merge_msr_bitmap. We should not touch the ++ * vmcs02.msr_bitmap here since it gets completely overwritten ++ * in the merging. ++ */ ++ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, ++ MSR_TYPE_W); ++ break; + case MSR_IA32_CR_PAT: + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) +@@ -9437,9 +9491,23 @@ static inline bool nested_vmx_merge_msr_ + struct page *page; + unsigned long *msr_bitmap_l1; + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; ++ /* ++ * pred_cmd is trying to verify two things: ++ * ++ * 1. L0 gave a permission to L1 to actually passthrough the MSR. This ++ * ensures that we do not accidentally generate an L02 MSR bitmap ++ * from the L12 MSR bitmap that is too permissive. ++ * 2. That L1 or L2s have actually used the MSR. This avoids ++ * unnecessarily merging of the bitmap if the MSR is unused. This ++ * works properly because we only update the L01 MSR bitmap lazily. ++ * So even if L0 should pass L1 these MSRs, the L01 bitmap is only ++ * updated to reflect this when L1 (or its L2s) actually write to ++ * the MSR. ++ */ ++ bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); + +- /* This shortcut is ok because we support only x2APIC MSRs so far. */ +- if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) ++ if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && ++ !pred_cmd) + return false; + + page = nested_get_page(vcpu, vmcs12->msr_bitmap); +@@ -9477,6 +9545,13 @@ static inline bool nested_vmx_merge_msr_ + MSR_TYPE_W); + } + } ++ ++ if (pred_cmd) ++ nested_vmx_disable_intercept_for_msr( ++ msr_bitmap_l1, msr_bitmap_l0, ++ MSR_IA32_PRED_CMD, ++ MSR_TYPE_W); ++ + kunmap(page); + nested_release_page_clean(page); + diff --git a/queue-4.9/series b/queue-4.9/series index f6d225945cb..af23d56d520 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -16,3 +16,12 @@ kaiser-fix-intel_bts-perf-crashes.patch x86-pti-make-unpoison-of-pgd-for-trusted-boot-work-for-real.patch kaiser-allocate-pgd-with-order-0-when-pti-off.patch serial-core-mark-port-as-initialized-after-successful-irq-change.patch +kvm-nvmx-vmx_complete_nested_posted_interrupt-can-t-fail.patch +kvm-nvmx-mark-vmcs12-pages-dirty-on-l2-exit.patch +kvm-nvmx-eliminate-vmcs02-pool.patch +kvm-vmx-introduce-alloc_loaded_vmcs.patch +kvm-vmx-make-msr-bitmaps-per-vcpu.patch +kvm-x86-add-ibpb-support.patch +kvm-vmx-emulate-msr_ia32_arch_capabilities.patch +kvm-vmx-allow-direct-access-to-msr_ia32_spec_ctrl.patch +kvm-svm-allow-direct-access-to-msr_ia32_spec_ctrl.patch
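
Illustrative note (not part of any patch above): the IBPB and per-vCPU bitmap
changes rely on the hardware MSR-bitmap layout that msr_write_intercepted_l01()
and vmx_disable_intercept_for_msr() index into: read-low at offset 0x0,
read-high at 0x400, write-low at 0x800, write-high at 0xc00, one bit per MSR,
with a set bit meaning "intercept".  The standalone userspace sketch below
mimics that layout and the lazy PRED_CMD pass-through from vmx_set_msr(): the
per-vCPU bitmap starts out intercepting everything, and the write intercept is
only cleared after the guest's first non-zero write.  The helper names here
(test_bit_u8, disable_write_intercept, and so on) are invented for the sketch
and do not exist in the kernel.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MSR_IA32_PRED_CMD 0x00000049

static uint8_t msr_bitmap[4096];	/* stands in for vmcs01.msr_bitmap */

static bool test_bit_u8(unsigned int nr, const uint8_t *addr)
{
	return addr[nr / 8] & (1u << (nr % 8));
}

static void clear_bit_u8(unsigned int nr, uint8_t *addr)
{
	addr[nr / 8] &= (uint8_t)~(1u << (nr % 8));
}

/* Same range checks as msr_write_intercepted_l01() in the IBPB patch. */
static bool msr_write_intercepted(uint32_t msr)
{
	if (msr <= 0x1fff)
		return test_bit_u8(msr, msr_bitmap + 0x800);
	if (msr >= 0xc0000000 && msr <= 0xc0001fff)
		return test_bit_u8(msr & 0x1fff, msr_bitmap + 0xc00);
	return true;	/* MSRs outside both ranges always exit */
}

/* Rough equivalent of vmx_disable_intercept_for_msr(..., MSR_TYPE_W). */
static void disable_write_intercept(uint32_t msr)
{
	if (msr <= 0x1fff)
		clear_bit_u8(msr, msr_bitmap + 0x800);
	else if (msr >= 0xc0000000 && msr <= 0xc0001fff)
		clear_bit_u8(msr & 0x1fff, msr_bitmap + 0xc00);
}

int main(void)
{
	/* Fresh per-vCPU bitmap: intercept every MSR to start with. */
	memset(msr_bitmap, 0xff, sizeof(msr_bitmap));

	printf("PRED_CMD write intercepted before first use: %d\n",
	       msr_write_intercepted(MSR_IA32_PRED_CMD));

	/* First non-zero guest write: pass the MSR through from now on. */
	disable_write_intercept(MSR_IA32_PRED_CMD);

	printf("PRED_CMD write intercepted after first use:  %d\n",
	       msr_write_intercepted(MSR_IA32_PRED_CMD));
	return 0;
}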