1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | #include <linux/frame.h> | |
4 | #include <linux/percpu.h> | |
5 | ||
6 | #include <asm/debugreg.h> | |
7 | #include <asm/mmu_context.h> | |
8 | ||
9 | #include "cpuid.h" | |
10 | #include "hyperv.h" | |
11 | #include "mmu.h" | |
12 | #include "nested.h" | |
13 | #include "trace.h" | |
14 | #include "x86.h" | |
15 | ||
16 | static bool __read_mostly enable_shadow_vmcs = 1; | |
17 | module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); | |
18 | ||
19 | static bool __read_mostly nested_early_check = 0; | |
20 | module_param(nested_early_check, bool, S_IRUGO); | |
21 | ||
22 | /* |
23 | * Hyper-V requires all of these, so mark them as supported even though | |
24 | * they are just treated the same as all-context. | |
25 | */ | |
26 | #define VMX_VPID_EXTENT_SUPPORTED_MASK \ | |
27 | (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ | |
28 | VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ | |
29 | VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ | |
30 | VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) | |
31 | ||
32 | #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 | |
33 | ||
34 | enum { | |
35 | VMX_VMREAD_BITMAP, | |
36 | VMX_VMWRITE_BITMAP, | |
37 | VMX_BITMAP_NR | |
38 | }; | |
39 | static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; | |
40 | ||
41 | #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) | |
42 | #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) | |
43 | ||
44 | static u16 shadow_read_only_fields[] = { | |
45 | #define SHADOW_FIELD_RO(x) x, | |
46 | #include "vmcs_shadow_fields.h" | |
47 | }; | |
48 | static int max_shadow_read_only_fields = | |
49 | ARRAY_SIZE(shadow_read_only_fields); | |
50 | ||
51 | static u16 shadow_read_write_fields[] = { | |
52 | #define SHADOW_FIELD_RW(x) x, | |
53 | #include "vmcs_shadow_fields.h" | |
54 | }; | |
55 | static int max_shadow_read_write_fields = | |
56 | ARRAY_SIZE(shadow_read_write_fields); | |
57 | ||
 58 | static void init_vmcs_shadow_fields(void) | |
59 | { |
60 | int i, j; | |
61 | ||
62 | memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); | |
63 | memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); | |
64 | ||
65 | for (i = j = 0; i < max_shadow_read_only_fields; i++) { | |
66 | u16 field = shadow_read_only_fields[i]; | |
67 | ||
68 | if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && | |
69 | (i + 1 == max_shadow_read_only_fields || | |
70 | shadow_read_only_fields[i + 1] != field + 1)) | |
71 | pr_err("Missing field from shadow_read_only_field %x\n", | |
72 | field + 1); | |
73 | ||
74 | clear_bit(field, vmx_vmread_bitmap); | |
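 | /* On 64-bit hosts, odd encodings are the high halves of 64-bit fields and | |
 | * are accessed through the full field, so drop them while compacting the | |
 | * array in place (j trails i). */ | |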
75 | #ifdef CONFIG_X86_64 | |
76 | if (field & 1) | |
77 | continue; | |
78 | #endif | |
79 | if (j < i) | |
80 | shadow_read_only_fields[j] = field; | |
81 | j++; | |
82 | } | |
83 | max_shadow_read_only_fields = j; | |
84 | ||
85 | for (i = j = 0; i < max_shadow_read_write_fields; i++) { | |
86 | u16 field = shadow_read_write_fields[i]; | |
87 | ||
88 | if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && | |
89 | (i + 1 == max_shadow_read_write_fields || | |
90 | shadow_read_write_fields[i + 1] != field + 1)) | |
91 | pr_err("Missing field from shadow_read_write_field %x\n", | |
92 | field + 1); | |
93 | ||
94 | /* | |
95 | * PML and the preemption timer can be emulated, but the | |
96 | * processor cannot vmwrite to fields that don't exist | |
97 | * on bare metal. | |
98 | */ | |
99 | switch (field) { | |
100 | case GUEST_PML_INDEX: | |
101 | if (!cpu_has_vmx_pml()) | |
102 | continue; | |
103 | break; | |
104 | case VMX_PREEMPTION_TIMER_VALUE: | |
105 | if (!cpu_has_vmx_preemption_timer()) | |
106 | continue; | |
107 | break; | |
108 | case GUEST_INTR_STATUS: | |
109 | if (!cpu_has_vmx_apicv()) | |
110 | continue; | |
111 | break; | |
112 | default: | |
113 | break; | |
114 | } | |
115 | ||
116 | clear_bit(field, vmx_vmwrite_bitmap); | |
117 | clear_bit(field, vmx_vmread_bitmap); | |
118 | #ifdef CONFIG_X86_64 | |
119 | if (field & 1) | |
120 | continue; | |
121 | #endif | |
122 | if (j < i) | |
123 | shadow_read_write_fields[j] = field; | |
124 | j++; | |
125 | } | |
126 | max_shadow_read_write_fields = j; | |
127 | } | |
128 | ||
129 | /* | |
130 | * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), | |
131 | * set the success or error code of an emulated VMX instruction (as specified | |
132 | * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated | |
133 | * instruction. | |
134 | */ | |
135 | static int nested_vmx_succeed(struct kvm_vcpu *vcpu) | |
136 | { | |
137 | vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) | |
138 | & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | | |
139 | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); | |
140 | return kvm_skip_emulated_instruction(vcpu); | |
141 | } | |
142 | ||
143 | static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) | |
144 | { | |
145 | vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) | |
146 | & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | | |
147 | X86_EFLAGS_SF | X86_EFLAGS_OF)) | |
148 | | X86_EFLAGS_CF); | |
149 | return kvm_skip_emulated_instruction(vcpu); | |
150 | } | |
151 | ||
152 | static int nested_vmx_failValid(struct kvm_vcpu *vcpu, | |
153 | u32 vm_instruction_error) | |
154 | { | |
155 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
156 | ||
157 | /* | |
158 | * failValid writes the error number to the current VMCS, which | |
159 | * can't be done if there isn't a current VMCS. | |
160 | */ | |
161 | if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) | |
162 | return nested_vmx_failInvalid(vcpu); | |
163 | ||
164 | vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) | |
165 | & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | | |
166 | X86_EFLAGS_SF | X86_EFLAGS_OF)) | |
167 | | X86_EFLAGS_ZF); | |
168 | get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; | |
169 | /* | |
170 | * We don't need to force a shadow sync because | |
171 | * VM_INSTRUCTION_ERROR is not shadowed | |
172 | */ | |
173 | return kvm_skip_emulated_instruction(vcpu); | |
174 | } | |
175 | ||
176 | static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) | |
177 | { | |
 178 | /* TODO: do not simply reset the guest here. */ | |
179 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); | |
180 | pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); | |
181 | } | |
182 | ||
183 | static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) | |
184 | { | |
185 | vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); | |
186 | vmcs_write64(VMCS_LINK_POINTER, -1ull); | |
187 | } | |
188 | ||
189 | static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) | |
190 | { | |
191 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
192 | ||
193 | if (!vmx->nested.hv_evmcs) | |
194 | return; | |
195 | ||
196 | kunmap(vmx->nested.hv_evmcs_page); | |
197 | kvm_release_page_dirty(vmx->nested.hv_evmcs_page); | |
198 | vmx->nested.hv_evmcs_vmptr = -1ull; | |
199 | vmx->nested.hv_evmcs_page = NULL; | |
200 | vmx->nested.hv_evmcs = NULL; | |
201 | } | |
202 | ||
203 | /* | |
204 | * Free whatever needs to be freed from vmx->nested when L1 goes down, or | |
205 | * just stops using VMX. | |
206 | */ | |
207 | static void free_nested(struct kvm_vcpu *vcpu) | |
208 | { | |
209 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
210 | ||
211 | if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) | |
212 | return; | |
213 | ||
214 | vmx->nested.vmxon = false; | |
215 | vmx->nested.smm.vmxon = false; | |
216 | free_vpid(vmx->nested.vpid02); | |
217 | vmx->nested.posted_intr_nv = -1; | |
218 | vmx->nested.current_vmptr = -1ull; | |
219 | if (enable_shadow_vmcs) { | |
220 | vmx_disable_shadow_vmcs(vmx); | |
221 | vmcs_clear(vmx->vmcs01.shadow_vmcs); | |
222 | free_vmcs(vmx->vmcs01.shadow_vmcs); | |
223 | vmx->vmcs01.shadow_vmcs = NULL; | |
224 | } | |
225 | kfree(vmx->nested.cached_vmcs12); | |
226 | kfree(vmx->nested.cached_shadow_vmcs12); | |
227 | /* Unpin physical memory we referred to in the vmcs02 */ | |
228 | if (vmx->nested.apic_access_page) { | |
229 | kvm_release_page_dirty(vmx->nested.apic_access_page); | |
230 | vmx->nested.apic_access_page = NULL; | |
231 | } | |
232 | if (vmx->nested.virtual_apic_page) { | |
233 | kvm_release_page_dirty(vmx->nested.virtual_apic_page); | |
234 | vmx->nested.virtual_apic_page = NULL; | |
235 | } | |
236 | if (vmx->nested.pi_desc_page) { | |
237 | kunmap(vmx->nested.pi_desc_page); | |
238 | kvm_release_page_dirty(vmx->nested.pi_desc_page); | |
239 | vmx->nested.pi_desc_page = NULL; | |
240 | vmx->nested.pi_desc = NULL; | |
241 | } | |
242 | ||
243 | kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); | |
244 | ||
245 | nested_release_evmcs(vcpu); | |
246 | ||
247 | free_loaded_vmcs(&vmx->nested.vmcs02); | |
248 | } | |
249 | ||
250 | static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) | |
251 | { | |
252 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
253 | int cpu; | |
254 | ||
255 | if (vmx->loaded_vmcs == vmcs) | |
256 | return; | |
257 | ||
258 | cpu = get_cpu(); | |
259 | vmx_vcpu_put(vcpu); | |
260 | vmx->loaded_vmcs = vmcs; | |
261 | vmx_vcpu_load(vcpu, cpu); | |
262 | put_cpu(); | |
263 | ||
264 | vm_entry_controls_reset_shadow(vmx); | |
265 | vm_exit_controls_reset_shadow(vmx); | |
266 | vmx_segment_cache_clear(vmx); | |
267 | } | |
268 | ||
269 | /* | |
270 | * Ensure that the current vmcs of the logical processor is the | |
271 | * vmcs01 of the vcpu before calling free_nested(). | |
272 | */ | |
273 | void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) | |
274 | { | |
275 | vcpu_load(vcpu); | |
 276 | vmx_leave_nested(vcpu); | |
277 | vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); |
278 | free_nested(vcpu); | |
279 | vcpu_put(vcpu); | |
280 | } | |
281 | ||
282 | static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, | |
283 | struct x86_exception *fault) | |
284 | { | |
285 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
286 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
287 | u32 exit_reason; | |
288 | unsigned long exit_qualification = vcpu->arch.exit_qualification; | |
289 | ||
290 | if (vmx->nested.pml_full) { | |
291 | exit_reason = EXIT_REASON_PML_FULL; | |
292 | vmx->nested.pml_full = false; | |
293 | exit_qualification &= INTR_INFO_UNBLOCK_NMI; | |
294 | } else if (fault->error_code & PFERR_RSVD_MASK) | |
295 | exit_reason = EXIT_REASON_EPT_MISCONFIG; | |
296 | else | |
297 | exit_reason = EXIT_REASON_EPT_VIOLATION; | |
298 | ||
299 | nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification); | |
300 | vmcs12->guest_physical_address = fault->address; | |
301 | } | |
302 | ||
303 | static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) | |
304 | { | |
305 | WARN_ON(mmu_is_nested(vcpu)); | |
306 | ||
307 | vcpu->arch.mmu = &vcpu->arch.guest_mmu; | |
308 | kvm_init_shadow_ept_mmu(vcpu, | |
309 | to_vmx(vcpu)->nested.msrs.ept_caps & | |
310 | VMX_EPT_EXECUTE_ONLY_BIT, | |
311 | nested_ept_ad_enabled(vcpu), | |
312 | nested_ept_get_cr3(vcpu)); | |
313 | vcpu->arch.mmu->set_cr3 = vmx_set_cr3; | |
314 | vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3; | |
315 | vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; | |
316 | vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; | |
317 | ||
318 | vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; | |
319 | } | |
320 | ||
321 | static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) | |
322 | { | |
323 | vcpu->arch.mmu = &vcpu->arch.root_mmu; | |
324 | vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; | |
325 | } | |
326 | ||
327 | static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, | |
328 | u16 error_code) | |
329 | { | |
330 | bool inequality, bit; | |
331 | ||
332 | bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; | |
333 | inequality = | |
334 | (error_code & vmcs12->page_fault_error_code_mask) != | |
335 | vmcs12->page_fault_error_code_match; | |
336 | return inequality ^ bit; | |
337 | } | |
338 | ||
339 | ||
340 | /* | |
341 | * KVM wants to inject page-faults which it got to the guest. This function | |
342 | * checks whether in a nested guest, we need to inject them to L1 or L2. | |
343 | */ | |
344 | static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual) | |
345 | { | |
346 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
347 | unsigned int nr = vcpu->arch.exception.nr; | |
348 | bool has_payload = vcpu->arch.exception.has_payload; | |
349 | unsigned long payload = vcpu->arch.exception.payload; | |
350 | ||
351 | if (nr == PF_VECTOR) { | |
352 | if (vcpu->arch.exception.nested_apf) { | |
353 | *exit_qual = vcpu->arch.apf.nested_apf_token; | |
354 | return 1; | |
355 | } | |
356 | if (nested_vmx_is_page_fault_vmexit(vmcs12, | |
357 | vcpu->arch.exception.error_code)) { | |
358 | *exit_qual = has_payload ? payload : vcpu->arch.cr2; | |
359 | return 1; | |
360 | } | |
361 | } else if (vmcs12->exception_bitmap & (1u << nr)) { | |
362 | if (nr == DB_VECTOR) { | |
363 | if (!has_payload) { | |
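 | /* Build the #DB exit qualification from DR6: drop the always-set and BT | |
 | * bits and flip RTM, whose polarity in DR6 is inverted relative to the | |
 | * exit qualification. */ | |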
364 | payload = vcpu->arch.dr6; | |
365 | payload &= ~(DR6_FIXED_1 | DR6_BT); | |
366 | payload ^= DR6_RTM; | |
367 | } | |
368 | *exit_qual = payload; | |
369 | } else | |
370 | *exit_qual = 0; | |
371 | return 1; | |
372 | } | |
373 | ||
374 | return 0; | |
375 | } | |
376 | ||
377 | ||
378 | static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, | |
379 | struct x86_exception *fault) | |
380 | { | |
381 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
382 | ||
383 | WARN_ON(!is_guest_mode(vcpu)); | |
384 | ||
385 | if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && | |
386 | !to_vmx(vcpu)->nested.nested_run_pending) { | |
387 | vmcs12->vm_exit_intr_error_code = fault->error_code; | |
388 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, | |
389 | PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | | |
390 | INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, | |
391 | fault->address); | |
392 | } else { | |
393 | kvm_inject_page_fault(vcpu, fault); | |
394 | } | |
395 | } | |
396 | ||
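 | /* A valid address is page-aligned and within the guest's physical-address width. */ | |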
397 | static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) | |
398 | { | |
399 | return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); | |
400 | } | |
401 | ||
402 | static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, | |
403 | struct vmcs12 *vmcs12) | |
404 | { | |
405 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) | |
406 | return 0; | |
407 | ||
408 | if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) || | |
409 | !page_address_valid(vcpu, vmcs12->io_bitmap_b)) | |
410 | return -EINVAL; | |
411 | ||
412 | return 0; | |
413 | } | |
414 | ||
415 | static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, | |
416 | struct vmcs12 *vmcs12) | |
417 | { | |
418 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) | |
419 | return 0; | |
420 | ||
421 | if (!page_address_valid(vcpu, vmcs12->msr_bitmap)) | |
422 | return -EINVAL; | |
423 | ||
424 | return 0; | |
425 | } | |
426 | ||
427 | static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, | |
428 | struct vmcs12 *vmcs12) | |
429 | { | |
430 | if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) | |
431 | return 0; | |
432 | ||
433 | if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)) | |
434 | return -EINVAL; | |
435 | ||
436 | return 0; | |
437 | } | |
438 | ||
439 | /* | |
440 | * Check if MSR is intercepted for L01 MSR bitmap. | |
441 | */ | |
442 | static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) | |
443 | { | |
444 | unsigned long *msr_bitmap; | |
445 | int f = sizeof(unsigned long); | |
446 | ||
447 | if (!cpu_has_vmx_msr_bitmap()) | |
448 | return true; | |
449 | ||
450 | msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; | |
451 | ||
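 | /* Write-low bits (MSRs 0x0-0x1fff) live at offset 0x800, write-high bits | |
 | * (MSRs 0xc0000000-0xc0001fff) at offset 0xc00, one bit per MSR. */ | |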
452 | if (msr <= 0x1fff) { | |
453 | return !!test_bit(msr, msr_bitmap + 0x800 / f); | |
454 | } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { | |
455 | msr &= 0x1fff; | |
456 | return !!test_bit(msr, msr_bitmap + 0xc00 / f); | |
457 | } | |
458 | ||
459 | return true; | |
460 | } | |
461 | ||
462 | /* | |
 463 | * If an MSR is allowed by L0, we should check whether it is allowed by L1. | |
464 | * The corresponding bit will be cleared unless both of L0 and L1 allow it. | |
465 | */ | |
466 | static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, | |
467 | unsigned long *msr_bitmap_nested, | |
468 | u32 msr, int type) | |
469 | { | |
470 | int f = sizeof(unsigned long); | |
471 | ||
472 | /* | |
473 | * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals | |
474 | * have the write-low and read-high bitmap offsets the wrong way round. | |
475 | * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. | |
476 | */ | |
477 | if (msr <= 0x1fff) { | |
478 | if (type & MSR_TYPE_R && | |
479 | !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) | |
480 | /* read-low */ | |
481 | __clear_bit(msr, msr_bitmap_nested + 0x000 / f); | |
482 | ||
483 | if (type & MSR_TYPE_W && | |
484 | !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) | |
485 | /* write-low */ | |
486 | __clear_bit(msr, msr_bitmap_nested + 0x800 / f); | |
487 | ||
488 | } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { | |
489 | msr &= 0x1fff; | |
490 | if (type & MSR_TYPE_R && | |
491 | !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) | |
492 | /* read-high */ | |
493 | __clear_bit(msr, msr_bitmap_nested + 0x400 / f); | |
494 | ||
495 | if (type & MSR_TYPE_W && | |
496 | !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) | |
497 | /* write-high */ | |
498 | __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); | |
499 | ||
500 | } | |
501 | } | |
502 | ||
503 | static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) { |
504 | int msr; | |
505 | ||
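 | /* Set every read (offset 0x0) and write (offset 0x800) intercept bit for | |
 | * the x2APIC MSR range 0x800-0x8ff, one word at a time. */ | |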
506 | for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { | |
507 | unsigned word = msr / BITS_PER_LONG; | |
508 | ||
509 | msr_bitmap[word] = ~0; | |
510 | msr_bitmap[word + (0x800 / sizeof(long))] = ~0; | |
511 | } | |
512 | } | |
513 | ||
514 | /* |
515 | * Merge L0's and L1's MSR bitmap, return false to indicate that | |
516 | * we do not use the hardware. | |
517 | */ | |
518 | static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, | |
519 | struct vmcs12 *vmcs12) | |
520 | { | |
521 | int msr; | |
522 | struct page *page; | |
523 | unsigned long *msr_bitmap_l1; | |
524 | unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; | |
525 | /* | |
526 | * pred_cmd & spec_ctrl are trying to verify two things: | |
527 | * | |
528 | * 1. L0 gave a permission to L1 to actually passthrough the MSR. This | |
529 | * ensures that we do not accidentally generate an L02 MSR bitmap | |
530 | * from the L12 MSR bitmap that is too permissive. | |
531 | * 2. That L1 or L2s have actually used the MSR. This avoids | |
 532 | * unnecessary merging of the bitmap if the MSR is unused. This | |
533 | * works properly because we only update the L01 MSR bitmap lazily. | |
534 | * So even if L0 should pass L1 these MSRs, the L01 bitmap is only | |
535 | * updated to reflect this when L1 (or its L2s) actually write to | |
536 | * the MSR. | |
537 | */ | |
538 | bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); | |
539 | bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL); | |
540 | ||
541 | /* Nothing to do if the MSR bitmap is not in use. */ | |
542 | if (!cpu_has_vmx_msr_bitmap() || | |
543 | !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) | |
544 | return false; | |
545 | ||
546 | if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && | |
547 | !pred_cmd && !spec_ctrl) | |
548 | return false; | |
549 | ||
550 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap); | |
551 | if (is_error_page(page)) | |
552 | return false; | |
553 | ||
554 | msr_bitmap_l1 = (unsigned long *)kmap(page); | |
 555 | ||
556 | /* |
557 | * To keep the control flow simple, pay eight 8-byte writes (sixteen | |
558 | * 4-byte writes on 32-bit systems) up front to enable intercepts for | |
559 | * the x2APIC MSR range and selectively disable them below. | |
560 | */ | |
561 | enable_x2apic_msr_intercepts(msr_bitmap_l0); | |
562 | ||
563 | if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { | |
564 | if (nested_cpu_has_apic_reg_virt(vmcs12)) { | |
565 | /* | |
566 | * L0 need not intercept reads for MSRs between 0x800 | |
567 | * and 0x8ff, it just lets the processor take the value | |
568 | * from the virtual-APIC page; take those 256 bits | |
569 | * directly from the L1 bitmap. | |
570 | */ | |
571 | for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { | |
572 | unsigned word = msr / BITS_PER_LONG; | |
573 | ||
574 | msr_bitmap_l0[word] = msr_bitmap_l1[word]; | |
575 | } | |
576 | } | |
 577 | ||
578 | nested_vmx_disable_intercept_for_msr( |
579 | msr_bitmap_l1, msr_bitmap_l0, | |
 580 | X2APIC_MSR(APIC_TASKPRI), | |
 581 | MSR_TYPE_R | MSR_TYPE_W); | |
582 | |
583 | if (nested_cpu_has_vid(vmcs12)) { | |
584 | nested_vmx_disable_intercept_for_msr( | |
585 | msr_bitmap_l1, msr_bitmap_l0, | |
586 | X2APIC_MSR(APIC_EOI), | |
587 | MSR_TYPE_W); | |
588 | nested_vmx_disable_intercept_for_msr( | |
589 | msr_bitmap_l1, msr_bitmap_l0, | |
590 | X2APIC_MSR(APIC_SELF_IPI), | |
591 | MSR_TYPE_W); | |
592 | } | |
593 | } |
594 | ||
595 | if (spec_ctrl) | |
596 | nested_vmx_disable_intercept_for_msr( | |
597 | msr_bitmap_l1, msr_bitmap_l0, | |
598 | MSR_IA32_SPEC_CTRL, | |
599 | MSR_TYPE_R | MSR_TYPE_W); | |
600 | ||
601 | if (pred_cmd) | |
602 | nested_vmx_disable_intercept_for_msr( | |
603 | msr_bitmap_l1, msr_bitmap_l0, | |
604 | MSR_IA32_PRED_CMD, | |
605 | MSR_TYPE_W); | |
606 | ||
607 | kunmap(page); | |
608 | kvm_release_page_clean(page); | |
609 | ||
610 | return true; | |
611 | } | |
612 | ||
613 | static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, | |
614 | struct vmcs12 *vmcs12) | |
615 | { | |
616 | struct vmcs12 *shadow; | |
617 | struct page *page; | |
618 | ||
619 | if (!nested_cpu_has_shadow_vmcs(vmcs12) || | |
620 | vmcs12->vmcs_link_pointer == -1ull) | |
621 | return; | |
622 | ||
623 | shadow = get_shadow_vmcs12(vcpu); | |
624 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); | |
625 | ||
626 | memcpy(shadow, kmap(page), VMCS12_SIZE); | |
627 | ||
628 | kunmap(page); | |
629 | kvm_release_page_clean(page); | |
630 | } | |
631 | ||
632 | static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, | |
633 | struct vmcs12 *vmcs12) | |
634 | { | |
635 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
636 | ||
637 | if (!nested_cpu_has_shadow_vmcs(vmcs12) || | |
638 | vmcs12->vmcs_link_pointer == -1ull) | |
639 | return; | |
640 | ||
641 | kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, | |
642 | get_shadow_vmcs12(vcpu), VMCS12_SIZE); | |
643 | } | |
644 | ||
645 | /* | |
646 | * In nested virtualization, check if L1 has set | |
647 | * VM_EXIT_ACK_INTR_ON_EXIT | |
648 | */ | |
649 | static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) | |
650 | { | |
651 | return get_vmcs12(vcpu)->vm_exit_controls & | |
652 | VM_EXIT_ACK_INTR_ON_EXIT; | |
653 | } | |
654 | ||
655 | static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) | |
656 | { | |
657 | return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu)); | |
658 | } | |
659 | ||
660 | static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, | |
661 | struct vmcs12 *vmcs12) | |
662 | { | |
663 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && | |
664 | !page_address_valid(vcpu, vmcs12->apic_access_addr)) | |
665 | return -EINVAL; | |
666 | else | |
667 | return 0; | |
668 | } | |
669 | ||
670 | static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, | |
671 | struct vmcs12 *vmcs12) | |
672 | { | |
673 | if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && | |
674 | !nested_cpu_has_apic_reg_virt(vmcs12) && | |
675 | !nested_cpu_has_vid(vmcs12) && | |
676 | !nested_cpu_has_posted_intr(vmcs12)) | |
677 | return 0; | |
678 | ||
679 | /* | |
680 | * If virtualize x2apic mode is enabled, | |
681 | * virtualize apic access must be disabled. | |
682 | */ | |
683 | if (nested_cpu_has_virt_x2apic_mode(vmcs12) && | |
684 | nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) | |
685 | return -EINVAL; | |
686 | ||
687 | /* | |
688 | * If virtual interrupt delivery is enabled, | |
689 | * we must exit on external interrupts. | |
690 | */ | |
691 | if (nested_cpu_has_vid(vmcs12) && | |
692 | !nested_exit_on_intr(vcpu)) | |
693 | return -EINVAL; | |
694 | ||
695 | /* | |
696 | * bits 15:8 should be zero in posted_intr_nv, | |
697 | * the descriptor address has been already checked | |
698 | * in nested_get_vmcs12_pages. | |
699 | * | |
700 | * bits 5:0 of posted_intr_desc_addr should be zero. | |
701 | */ | |
702 | if (nested_cpu_has_posted_intr(vmcs12) && | |
703 | (!nested_cpu_has_vid(vmcs12) || | |
704 | !nested_exit_intr_ack_set(vcpu) || | |
705 | (vmcs12->posted_intr_nv & 0xff00) || | |
706 | (vmcs12->posted_intr_desc_addr & 0x3f) || | |
707 | (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))) | |
708 | return -EINVAL; | |
709 | ||
 710 | /* The TPR shadow is required by all APICv features. */ | |
711 | if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) | |
712 | return -EINVAL; | |
713 | ||
714 | return 0; | |
715 | } | |
716 | ||
717 | static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, | |
 718 | u32 count, u64 addr) | |
 719 | { | |
 720 | int maxphyaddr; | |
 721 | ||
722 | if (count == 0) |
723 | return 0; | |
724 | maxphyaddr = cpuid_maxphyaddr(vcpu); | |
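 | /* The load/store area must be 16-byte aligned and fit below maxphyaddr. */ | |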
725 | if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || | |
 726 | (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) | |
 727 | return -EINVAL; | |
 728 | ||
729 | return 0; |
730 | } | |
731 | ||
732 | static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, |
733 | struct vmcs12 *vmcs12) | |
 734 | { | |
735 | if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_load_count, |
736 | vmcs12->vm_exit_msr_load_addr) || | |
737 | nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_store_count, | |
 738 | vmcs12->vm_exit_msr_store_addr)) | |
 739 | return -EINVAL; | |
 740 | ||
741 | return 0; |
742 | } | |
743 | ||
744 | static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, |
745 | struct vmcs12 *vmcs12) | |
746 | { |
747 | if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_entry_msr_load_count, | |
748 | vmcs12->vm_entry_msr_load_addr)) | |
749 | return -EINVAL; | |
750 | ||
751 | return 0; | |
752 | } | |
753 | ||
754 | static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, |
755 | struct vmcs12 *vmcs12) | |
756 | { | |
757 | if (!nested_cpu_has_pml(vmcs12)) | |
758 | return 0; | |
759 | ||
760 | if (!nested_cpu_has_ept(vmcs12) || | |
761 | !page_address_valid(vcpu, vmcs12->pml_address)) | |
762 | return -EINVAL; | |
763 | ||
764 | return 0; | |
765 | } | |
766 | ||
767 | static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, | |
768 | struct vmcs12 *vmcs12) | |
769 | { | |
770 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && | |
771 | !nested_cpu_has_ept(vmcs12)) | |
772 | return -EINVAL; | |
773 | return 0; | |
774 | } | |
775 | ||
776 | static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, | |
777 | struct vmcs12 *vmcs12) | |
778 | { | |
779 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && | |
780 | !nested_cpu_has_ept(vmcs12)) | |
781 | return -EINVAL; | |
782 | return 0; | |
783 | } | |
784 | ||
785 | static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, | |
786 | struct vmcs12 *vmcs12) | |
787 | { | |
788 | if (!nested_cpu_has_shadow_vmcs(vmcs12)) | |
789 | return 0; | |
790 | ||
791 | if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) || | |
792 | !page_address_valid(vcpu, vmcs12->vmwrite_bitmap)) | |
793 | return -EINVAL; | |
794 | ||
795 | return 0; | |
796 | } | |
797 | ||
798 | static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, | |
799 | struct vmx_msr_entry *e) | |
800 | { | |
801 | /* x2APIC MSR accesses are not allowed */ | |
802 | if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8) | |
803 | return -EINVAL; | |
804 | if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ | |
805 | e->index == MSR_IA32_UCODE_REV) | |
806 | return -EINVAL; | |
807 | if (e->reserved != 0) | |
808 | return -EINVAL; | |
809 | return 0; | |
810 | } | |
811 | ||
812 | static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, | |
813 | struct vmx_msr_entry *e) | |
814 | { | |
815 | if (e->index == MSR_FS_BASE || | |
816 | e->index == MSR_GS_BASE || | |
817 | e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */ | |
818 | nested_vmx_msr_check_common(vcpu, e)) | |
819 | return -EINVAL; | |
820 | return 0; | |
821 | } | |
822 | ||
823 | static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, | |
824 | struct vmx_msr_entry *e) | |
825 | { | |
826 | if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ | |
827 | nested_vmx_msr_check_common(vcpu, e)) | |
828 | return -EINVAL; | |
829 | return 0; | |
830 | } | |
831 | ||
832 | /* | |
833 | * Load guest's/host's msr at nested entry/exit. | |
 834 | * Returns 0 on success, or the 1-based index of the failing entry on failure. | |
835 | */ | |
836 | static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) | |
837 | { | |
838 | u32 i; | |
839 | struct vmx_msr_entry e; | |
840 | struct msr_data msr; | |
841 | ||
842 | msr.host_initiated = false; | |
843 | for (i = 0; i < count; i++) { | |
844 | if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), | |
845 | &e, sizeof(e))) { | |
846 | pr_debug_ratelimited( | |
847 | "%s cannot read MSR entry (%u, 0x%08llx)\n", | |
848 | __func__, i, gpa + i * sizeof(e)); | |
849 | goto fail; | |
850 | } | |
851 | if (nested_vmx_load_msr_check(vcpu, &e)) { | |
852 | pr_debug_ratelimited( | |
853 | "%s check failed (%u, 0x%x, 0x%x)\n", | |
854 | __func__, i, e.index, e.reserved); | |
855 | goto fail; | |
856 | } | |
857 | msr.index = e.index; | |
858 | msr.data = e.value; | |
859 | if (kvm_set_msr(vcpu, &msr)) { | |
860 | pr_debug_ratelimited( | |
861 | "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", | |
862 | __func__, i, e.index, e.value); | |
863 | goto fail; | |
864 | } | |
865 | } | |
866 | return 0; | |
867 | fail: | |
868 | return i + 1; | |
869 | } | |
870 | ||
871 | static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) | |
872 | { | |
873 | u32 i; | |
874 | struct vmx_msr_entry e; | |
875 | ||
876 | for (i = 0; i < count; i++) { | |
877 | struct msr_data msr_info; | |
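 | /* Only the index and reserved words are needed; the value field is | |
 | * written back below rather than read. */ | |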
878 | if (kvm_vcpu_read_guest(vcpu, | |
879 | gpa + i * sizeof(e), | |
880 | &e, 2 * sizeof(u32))) { | |
881 | pr_debug_ratelimited( | |
882 | "%s cannot read MSR entry (%u, 0x%08llx)\n", | |
883 | __func__, i, gpa + i * sizeof(e)); | |
884 | return -EINVAL; | |
885 | } | |
886 | if (nested_vmx_store_msr_check(vcpu, &e)) { | |
887 | pr_debug_ratelimited( | |
888 | "%s check failed (%u, 0x%x, 0x%x)\n", | |
889 | __func__, i, e.index, e.reserved); | |
890 | return -EINVAL; | |
891 | } | |
892 | msr_info.host_initiated = false; | |
893 | msr_info.index = e.index; | |
894 | if (kvm_get_msr(vcpu, &msr_info)) { | |
895 | pr_debug_ratelimited( | |
896 | "%s cannot read MSR (%u, 0x%x)\n", | |
897 | __func__, i, e.index); | |
898 | return -EINVAL; | |
899 | } | |
900 | if (kvm_vcpu_write_guest(vcpu, | |
901 | gpa + i * sizeof(e) + | |
902 | offsetof(struct vmx_msr_entry, value), | |
903 | &msr_info.data, sizeof(msr_info.data))) { | |
904 | pr_debug_ratelimited( | |
905 | "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", | |
906 | __func__, i, e.index, msr_info.data); | |
907 | return -EINVAL; | |
908 | } | |
909 | } | |
910 | return 0; | |
911 | } | |
912 | ||
913 | static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) | |
914 | { | |
915 | unsigned long invalid_mask; | |
916 | ||
917 | invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu); | |
918 | return (val & invalid_mask) == 0; | |
919 | } | |
920 | ||
921 | /* | |
922 | * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are | |
923 | * emulating VM entry into a guest with EPT enabled. | |
924 | * Returns 0 on success, 1 on failure. Invalid state exit qualification code | |
925 | * is assigned to entry_failure_code on failure. | |
926 | */ | |
927 | static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, | |
928 | u32 *entry_failure_code) | |
929 | { | |
930 | if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { | |
931 | if (!nested_cr3_valid(vcpu, cr3)) { | |
932 | *entry_failure_code = ENTRY_FAIL_DEFAULT; | |
933 | return 1; | |
934 | } | |
935 | ||
936 | /* | |
937 | * If PAE paging and EPT are both on, CR3 is not used by the CPU and | |
938 | * must not be dereferenced. | |
939 | */ | |
940 | if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) && | |
941 | !nested_ept) { | |
942 | if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { | |
943 | *entry_failure_code = ENTRY_FAIL_PDPTE; | |
944 | return 1; | |
945 | } | |
946 | } | |
947 | } | |
948 | ||
949 | if (!nested_ept) | |
950 | kvm_mmu_new_cr3(vcpu, cr3, false); | |
951 | ||
952 | vcpu->arch.cr3 = cr3; | |
953 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | |
954 | ||
955 | kvm_init_mmu(vcpu, false); | |
956 | ||
957 | return 0; | |
958 | } | |
959 | ||
960 | /* | |
 961 | * Returns true if KVM is able to configure the CPU to tag TLB entries | |
962 | * populated by L2 differently than TLB entries populated | |
963 | * by L1. | |
964 | * | |
965 | * If L1 uses EPT, then TLB entries are tagged with different EPTP. | |
966 | * | |
967 | * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged | |
968 | * with different VPID (L1 entries are tagged with vmx->vpid | |
969 | * while L2 entries are tagged with vmx->nested.vpid02). | |
970 | */ | |
971 | static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) | |
972 | { | |
973 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
974 | ||
975 | return nested_cpu_has_ept(vmcs12) || | |
976 | (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); | |
977 | } | |
978 | ||
979 | static u16 nested_get_vpid02(struct kvm_vcpu *vcpu) | |
980 | { | |
981 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
982 | ||
983 | return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid; | |
984 | } | |
985 | ||
986 | ||
987 | static inline bool vmx_control_verify(u32 control, u32 low, u32 high) | |
988 | { | |
989 | return fixed_bits_valid(control, low, high); | |
990 | } | |
991 | ||
992 | static inline u64 vmx_control_msr(u32 low, u32 high) | |
993 | { | |
994 | return low | ((u64)high << 32); | |
995 | } | |
996 | ||
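 | /* True if, within @mask, every bit set in @subset is also set in @superset. */ | |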
997 | static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) | |
998 | { | |
999 | superset &= mask; | |
1000 | subset &= mask; | |
1001 | ||
1002 | return (superset | subset) == superset; | |
1003 | } | |
1004 | ||
1005 | static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) | |
1006 | { | |
1007 | const u64 feature_and_reserved = | |
1008 | /* feature (except bit 48; see below) */ | |
1009 | BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | | |
1010 | /* reserved */ | |
1011 | BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); | |
1012 | u64 vmx_basic = vmx->nested.msrs.basic; | |
1013 | ||
1014 | if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) | |
1015 | return -EINVAL; | |
1016 | ||
1017 | /* | |
1018 | * KVM does not emulate a version of VMX that constrains physical | |
1019 | * addresses of VMX structures (e.g. VMCS) to 32-bits. | |
1020 | */ | |
1021 | if (data & BIT_ULL(48)) | |
1022 | return -EINVAL; | |
1023 | ||
1024 | if (vmx_basic_vmcs_revision_id(vmx_basic) != | |
1025 | vmx_basic_vmcs_revision_id(data)) | |
1026 | return -EINVAL; | |
1027 | ||
1028 | if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) | |
1029 | return -EINVAL; | |
1030 | ||
1031 | vmx->nested.msrs.basic = data; | |
1032 | return 0; | |
1033 | } | |
1034 | ||
1035 | static int | |
1036 | vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) | |
1037 | { | |
1038 | u64 supported; | |
1039 | u32 *lowp, *highp; | |
1040 | ||
1041 | switch (msr_index) { | |
1042 | case MSR_IA32_VMX_TRUE_PINBASED_CTLS: | |
1043 | lowp = &vmx->nested.msrs.pinbased_ctls_low; | |
1044 | highp = &vmx->nested.msrs.pinbased_ctls_high; | |
1045 | break; | |
1046 | case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: | |
1047 | lowp = &vmx->nested.msrs.procbased_ctls_low; | |
1048 | highp = &vmx->nested.msrs.procbased_ctls_high; | |
1049 | break; | |
1050 | case MSR_IA32_VMX_TRUE_EXIT_CTLS: | |
1051 | lowp = &vmx->nested.msrs.exit_ctls_low; | |
1052 | highp = &vmx->nested.msrs.exit_ctls_high; | |
1053 | break; | |
1054 | case MSR_IA32_VMX_TRUE_ENTRY_CTLS: | |
1055 | lowp = &vmx->nested.msrs.entry_ctls_low; | |
1056 | highp = &vmx->nested.msrs.entry_ctls_high; | |
1057 | break; | |
1058 | case MSR_IA32_VMX_PROCBASED_CTLS2: | |
1059 | lowp = &vmx->nested.msrs.secondary_ctls_low; | |
1060 | highp = &vmx->nested.msrs.secondary_ctls_high; | |
1061 | break; | |
1062 | default: | |
1063 | BUG(); | |
1064 | } | |
1065 | ||
1066 | supported = vmx_control_msr(*lowp, *highp); | |
1067 | ||
1068 | /* Check must-be-1 bits are still 1. */ | |
1069 | if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) | |
1070 | return -EINVAL; | |
1071 | ||
1072 | /* Check must-be-0 bits are still 0. */ | |
1073 | if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) | |
1074 | return -EINVAL; | |
1075 | ||
1076 | *lowp = data; | |
1077 | *highp = data >> 32; | |
1078 | return 0; | |
1079 | } | |
1080 | ||
1081 | static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) | |
1082 | { | |
1083 | const u64 feature_and_reserved_bits = | |
1084 | /* feature */ | |
1085 | BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | | |
1086 | BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | | |
1087 | /* reserved */ | |
1088 | GENMASK_ULL(13, 9) | BIT_ULL(31); | |
1089 | u64 vmx_misc; | |
1090 | ||
1091 | vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, | |
1092 | vmx->nested.msrs.misc_high); | |
1093 | ||
1094 | if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) | |
1095 | return -EINVAL; | |
1096 | ||
1097 | if ((vmx->nested.msrs.pinbased_ctls_high & | |
1098 | PIN_BASED_VMX_PREEMPTION_TIMER) && | |
1099 | vmx_misc_preemption_timer_rate(data) != | |
1100 | vmx_misc_preemption_timer_rate(vmx_misc)) | |
1101 | return -EINVAL; | |
1102 | ||
1103 | if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) | |
1104 | return -EINVAL; | |
1105 | ||
1106 | if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) | |
1107 | return -EINVAL; | |
1108 | ||
1109 | if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) | |
1110 | return -EINVAL; | |
1111 | ||
1112 | vmx->nested.msrs.misc_low = data; | |
1113 | vmx->nested.msrs.misc_high = data >> 32; | |
1114 | ||
1115 | /* | |
1116 | * If L1 has read-only VM-exit information fields, use the | |
1117 | * less permissive vmx_vmwrite_bitmap to specify write | |
1118 | * permissions for the shadow VMCS. | |
1119 | */ | |
1120 | if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) | |
1121 | vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); | |
1122 | ||
1123 | return 0; | |
1124 | } | |
1125 | ||
1126 | static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) | |
1127 | { | |
1128 | u64 vmx_ept_vpid_cap; | |
1129 | ||
1130 | vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps, | |
1131 | vmx->nested.msrs.vpid_caps); | |
1132 | ||
1133 | /* Every bit is either reserved or a feature bit. */ | |
1134 | if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) | |
1135 | return -EINVAL; | |
1136 | ||
1137 | vmx->nested.msrs.ept_caps = data; | |
1138 | vmx->nested.msrs.vpid_caps = data >> 32; | |
1139 | return 0; | |
1140 | } | |
1141 | ||
1142 | static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) | |
1143 | { | |
1144 | u64 *msr; | |
1145 | ||
1146 | switch (msr_index) { | |
1147 | case MSR_IA32_VMX_CR0_FIXED0: | |
1148 | msr = &vmx->nested.msrs.cr0_fixed0; | |
1149 | break; | |
1150 | case MSR_IA32_VMX_CR4_FIXED0: | |
1151 | msr = &vmx->nested.msrs.cr4_fixed0; | |
1152 | break; | |
1153 | default: | |
1154 | BUG(); | |
1155 | } | |
1156 | ||
1157 | /* | |
 1158 | * 1 bits (which indicate bits which "must-be-1" during VMX operation) | |
1159 | * must be 1 in the restored value. | |
1160 | */ | |
1161 | if (!is_bitwise_subset(data, *msr, -1ULL)) | |
1162 | return -EINVAL; | |
1163 | ||
1164 | *msr = data; | |
1165 | return 0; | |
1166 | } | |
1167 | ||
1168 | /* | |
1169 | * Called when userspace is restoring VMX MSRs. | |
1170 | * | |
1171 | * Returns 0 on success, non-0 otherwise. | |
1172 | */ | |
1173 | int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |
1174 | { | |
1175 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
1176 | ||
1177 | /* | |
1178 | * Don't allow changes to the VMX capability MSRs while the vCPU | |
1179 | * is in VMX operation. | |
1180 | */ | |
1181 | if (vmx->nested.vmxon) | |
1182 | return -EBUSY; | |
1183 | ||
1184 | switch (msr_index) { | |
1185 | case MSR_IA32_VMX_BASIC: | |
1186 | return vmx_restore_vmx_basic(vmx, data); | |
1187 | case MSR_IA32_VMX_PINBASED_CTLS: | |
1188 | case MSR_IA32_VMX_PROCBASED_CTLS: | |
1189 | case MSR_IA32_VMX_EXIT_CTLS: | |
1190 | case MSR_IA32_VMX_ENTRY_CTLS: | |
1191 | /* | |
1192 | * The "non-true" VMX capability MSRs are generated from the | |
1193 | * "true" MSRs, so we do not support restoring them directly. | |
1194 | * | |
1195 | * If userspace wants to emulate VMX_BASIC[55]=0, userspace | |
1196 | * should restore the "true" MSRs with the must-be-1 bits | |
1197 | * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND | |
1198 | * DEFAULT SETTINGS". | |
1199 | */ | |
1200 | return -EINVAL; | |
1201 | case MSR_IA32_VMX_TRUE_PINBASED_CTLS: | |
1202 | case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: | |
1203 | case MSR_IA32_VMX_TRUE_EXIT_CTLS: | |
1204 | case MSR_IA32_VMX_TRUE_ENTRY_CTLS: | |
1205 | case MSR_IA32_VMX_PROCBASED_CTLS2: | |
1206 | return vmx_restore_control_msr(vmx, msr_index, data); | |
1207 | case MSR_IA32_VMX_MISC: | |
1208 | return vmx_restore_vmx_misc(vmx, data); | |
1209 | case MSR_IA32_VMX_CR0_FIXED0: | |
1210 | case MSR_IA32_VMX_CR4_FIXED0: | |
1211 | return vmx_restore_fixed0_msr(vmx, msr_index, data); | |
1212 | case MSR_IA32_VMX_CR0_FIXED1: | |
1213 | case MSR_IA32_VMX_CR4_FIXED1: | |
1214 | /* | |
1215 | * These MSRs are generated based on the vCPU's CPUID, so we | |
1216 | * do not support restoring them directly. | |
1217 | */ | |
1218 | return -EINVAL; | |
1219 | case MSR_IA32_VMX_EPT_VPID_CAP: | |
1220 | return vmx_restore_vmx_ept_vpid_cap(vmx, data); | |
1221 | case MSR_IA32_VMX_VMCS_ENUM: | |
1222 | vmx->nested.msrs.vmcs_enum = data; | |
1223 | return 0; | |
1224 | default: | |
1225 | /* | |
1226 | * The rest of the VMX capability MSRs do not support restore. | |
1227 | */ | |
1228 | return -EINVAL; | |
1229 | } | |
1230 | } | |
1231 | ||
1232 | /* Returns 0 on success, non-0 otherwise. */ | |
1233 | int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) | |
1234 | { | |
1235 | switch (msr_index) { | |
1236 | case MSR_IA32_VMX_BASIC: | |
1237 | *pdata = msrs->basic; | |
1238 | break; | |
1239 | case MSR_IA32_VMX_TRUE_PINBASED_CTLS: | |
1240 | case MSR_IA32_VMX_PINBASED_CTLS: | |
1241 | *pdata = vmx_control_msr( | |
1242 | msrs->pinbased_ctls_low, | |
1243 | msrs->pinbased_ctls_high); | |
1244 | if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) | |
1245 | *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; | |
1246 | break; | |
1247 | case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: | |
1248 | case MSR_IA32_VMX_PROCBASED_CTLS: | |
1249 | *pdata = vmx_control_msr( | |
1250 | msrs->procbased_ctls_low, | |
1251 | msrs->procbased_ctls_high); | |
1252 | if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) | |
1253 | *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; | |
1254 | break; | |
1255 | case MSR_IA32_VMX_TRUE_EXIT_CTLS: | |
1256 | case MSR_IA32_VMX_EXIT_CTLS: | |
1257 | *pdata = vmx_control_msr( | |
1258 | msrs->exit_ctls_low, | |
1259 | msrs->exit_ctls_high); | |
1260 | if (msr_index == MSR_IA32_VMX_EXIT_CTLS) | |
1261 | *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; | |
1262 | break; | |
1263 | case MSR_IA32_VMX_TRUE_ENTRY_CTLS: | |
1264 | case MSR_IA32_VMX_ENTRY_CTLS: | |
1265 | *pdata = vmx_control_msr( | |
1266 | msrs->entry_ctls_low, | |
1267 | msrs->entry_ctls_high); | |
1268 | if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) | |
1269 | *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; | |
1270 | break; | |
1271 | case MSR_IA32_VMX_MISC: | |
1272 | *pdata = vmx_control_msr( | |
1273 | msrs->misc_low, | |
1274 | msrs->misc_high); | |
1275 | break; | |
1276 | case MSR_IA32_VMX_CR0_FIXED0: | |
1277 | *pdata = msrs->cr0_fixed0; | |
1278 | break; | |
1279 | case MSR_IA32_VMX_CR0_FIXED1: | |
1280 | *pdata = msrs->cr0_fixed1; | |
1281 | break; | |
1282 | case MSR_IA32_VMX_CR4_FIXED0: | |
1283 | *pdata = msrs->cr4_fixed0; | |
1284 | break; | |
1285 | case MSR_IA32_VMX_CR4_FIXED1: | |
1286 | *pdata = msrs->cr4_fixed1; | |
1287 | break; | |
1288 | case MSR_IA32_VMX_VMCS_ENUM: | |
1289 | *pdata = msrs->vmcs_enum; | |
1290 | break; | |
1291 | case MSR_IA32_VMX_PROCBASED_CTLS2: | |
1292 | *pdata = vmx_control_msr( | |
1293 | msrs->secondary_ctls_low, | |
1294 | msrs->secondary_ctls_high); | |
1295 | break; | |
1296 | case MSR_IA32_VMX_EPT_VPID_CAP: | |
1297 | *pdata = msrs->ept_caps | | |
1298 | ((u64)msrs->vpid_caps << 32); | |
1299 | break; | |
1300 | case MSR_IA32_VMX_VMFUNC: | |
1301 | *pdata = msrs->vmfunc_controls; | |
1302 | break; | |
1303 | default: | |
1304 | return 1; | |
1305 | } | |
1306 | ||
1307 | return 0; | |
1308 | } | |
1309 | ||
1310 | /* | |
1311 | * Copy the writable VMCS shadow fields back to the VMCS12, in case | |
1312 | * they have been modified by the L1 guest. Note that the "read-only" | |
1313 | * VM-exit information fields are actually writable if the vCPU is | |
1314 | * configured to support "VMWRITE to any supported field in the VMCS." | |
1315 | */ | |
1316 | static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) | |
1317 | { | |
1318 | const u16 *fields[] = { | |
1319 | shadow_read_write_fields, | |
1320 | shadow_read_only_fields | |
1321 | }; | |
1322 | const int max_fields[] = { | |
1323 | max_shadow_read_write_fields, | |
1324 | max_shadow_read_only_fields | |
1325 | }; | |
1326 | int i, q; | |
1327 | unsigned long field; | |
1328 | u64 field_value; | |
1329 | struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; | |
1330 | ||
1331 | preempt_disable(); | |
1332 | ||
1333 | vmcs_load(shadow_vmcs); | |
1334 | ||
1335 | for (q = 0; q < ARRAY_SIZE(fields); q++) { | |
1336 | for (i = 0; i < max_fields[q]; i++) { | |
1337 | field = fields[q][i]; | |
1338 | field_value = __vmcs_readl(field); | |
1339 | vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value); | |
1340 | } | |
1341 | /* | |
1342 | * Skip the VM-exit information fields if they are read-only. | |
1343 | */ | |
1344 | if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) | |
1345 | break; | |
1346 | } | |
1347 | ||
1348 | vmcs_clear(shadow_vmcs); | |
1349 | vmcs_load(vmx->loaded_vmcs->vmcs); | |
1350 | ||
1351 | preempt_enable(); | |
1352 | } | |
1353 | ||
1354 | static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) | |
1355 | { | |
1356 | const u16 *fields[] = { | |
1357 | shadow_read_write_fields, | |
1358 | shadow_read_only_fields | |
1359 | }; | |
1360 | const int max_fields[] = { | |
1361 | max_shadow_read_write_fields, | |
1362 | max_shadow_read_only_fields | |
1363 | }; | |
1364 | int i, q; | |
1365 | unsigned long field; | |
1366 | u64 field_value = 0; | |
1367 | struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; | |
1368 | ||
1369 | vmcs_load(shadow_vmcs); | |
1370 | ||
1371 | for (q = 0; q < ARRAY_SIZE(fields); q++) { | |
1372 | for (i = 0; i < max_fields[q]; i++) { | |
1373 | field = fields[q][i]; | |
1374 | vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value); | |
1375 | __vmcs_writel(field, field_value); | |
1376 | } | |
1377 | } | |
1378 | ||
1379 | vmcs_clear(shadow_vmcs); | |
1380 | vmcs_load(vmx->loaded_vmcs->vmcs); | |
1381 | } | |
1382 | ||
1383 | static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) | |
1384 | { | |
1385 | struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; | |
1386 | struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; | |
1387 | ||
1388 | /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ | |
1389 | vmcs12->tpr_threshold = evmcs->tpr_threshold; | |
1390 | vmcs12->guest_rip = evmcs->guest_rip; | |
1391 | ||
1392 | if (unlikely(!(evmcs->hv_clean_fields & | |
1393 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { | |
1394 | vmcs12->guest_rsp = evmcs->guest_rsp; | |
1395 | vmcs12->guest_rflags = evmcs->guest_rflags; | |
1396 | vmcs12->guest_interruptibility_info = | |
1397 | evmcs->guest_interruptibility_info; | |
1398 | } | |
1399 | ||
1400 | if (unlikely(!(evmcs->hv_clean_fields & | |
1401 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { | |
1402 | vmcs12->cpu_based_vm_exec_control = | |
1403 | evmcs->cpu_based_vm_exec_control; | |
1404 | } | |
1405 | ||
1406 | if (unlikely(!(evmcs->hv_clean_fields & | |
 1407 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { | |
1408 | vmcs12->exception_bitmap = evmcs->exception_bitmap; |
1409 | } | |
1410 | ||
1411 | if (unlikely(!(evmcs->hv_clean_fields & | |
1412 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { | |
1413 | vmcs12->vm_entry_controls = evmcs->vm_entry_controls; | |
1414 | } | |
1415 | ||
1416 | if (unlikely(!(evmcs->hv_clean_fields & | |
1417 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { | |
1418 | vmcs12->vm_entry_intr_info_field = | |
1419 | evmcs->vm_entry_intr_info_field; | |
1420 | vmcs12->vm_entry_exception_error_code = | |
1421 | evmcs->vm_entry_exception_error_code; | |
1422 | vmcs12->vm_entry_instruction_len = | |
1423 | evmcs->vm_entry_instruction_len; | |
1424 | } | |
1425 | ||
1426 | if (unlikely(!(evmcs->hv_clean_fields & | |
1427 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { | |
1428 | vmcs12->host_ia32_pat = evmcs->host_ia32_pat; | |
1429 | vmcs12->host_ia32_efer = evmcs->host_ia32_efer; | |
1430 | vmcs12->host_cr0 = evmcs->host_cr0; | |
1431 | vmcs12->host_cr3 = evmcs->host_cr3; | |
1432 | vmcs12->host_cr4 = evmcs->host_cr4; | |
1433 | vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; | |
1434 | vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; | |
1435 | vmcs12->host_rip = evmcs->host_rip; | |
1436 | vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; | |
1437 | vmcs12->host_es_selector = evmcs->host_es_selector; | |
1438 | vmcs12->host_cs_selector = evmcs->host_cs_selector; | |
1439 | vmcs12->host_ss_selector = evmcs->host_ss_selector; | |
1440 | vmcs12->host_ds_selector = evmcs->host_ds_selector; | |
1441 | vmcs12->host_fs_selector = evmcs->host_fs_selector; | |
1442 | vmcs12->host_gs_selector = evmcs->host_gs_selector; | |
1443 | vmcs12->host_tr_selector = evmcs->host_tr_selector; | |
1444 | } | |
1445 | ||
1446 | if (unlikely(!(evmcs->hv_clean_fields & | |
 1447 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { | |
1448 | vmcs12->pin_based_vm_exec_control = |
1449 | evmcs->pin_based_vm_exec_control; | |
1450 | vmcs12->vm_exit_controls = evmcs->vm_exit_controls; | |
1451 | vmcs12->secondary_vm_exec_control = | |
1452 | evmcs->secondary_vm_exec_control; | |
1453 | } | |
1454 | ||
1455 | if (unlikely(!(evmcs->hv_clean_fields & | |
1456 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { | |
1457 | vmcs12->io_bitmap_a = evmcs->io_bitmap_a; | |
1458 | vmcs12->io_bitmap_b = evmcs->io_bitmap_b; | |
1459 | } | |
1460 | ||
1461 | if (unlikely(!(evmcs->hv_clean_fields & | |
1462 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { | |
1463 | vmcs12->msr_bitmap = evmcs->msr_bitmap; | |
1464 | } | |
1465 | ||
1466 | if (unlikely(!(evmcs->hv_clean_fields & | |
1467 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { | |
1468 | vmcs12->guest_es_base = evmcs->guest_es_base; | |
1469 | vmcs12->guest_cs_base = evmcs->guest_cs_base; | |
1470 | vmcs12->guest_ss_base = evmcs->guest_ss_base; | |
1471 | vmcs12->guest_ds_base = evmcs->guest_ds_base; | |
1472 | vmcs12->guest_fs_base = evmcs->guest_fs_base; | |
1473 | vmcs12->guest_gs_base = evmcs->guest_gs_base; | |
1474 | vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; | |
1475 | vmcs12->guest_tr_base = evmcs->guest_tr_base; | |
1476 | vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; | |
1477 | vmcs12->guest_idtr_base = evmcs->guest_idtr_base; | |
1478 | vmcs12->guest_es_limit = evmcs->guest_es_limit; | |
1479 | vmcs12->guest_cs_limit = evmcs->guest_cs_limit; | |
1480 | vmcs12->guest_ss_limit = evmcs->guest_ss_limit; | |
1481 | vmcs12->guest_ds_limit = evmcs->guest_ds_limit; | |
1482 | vmcs12->guest_fs_limit = evmcs->guest_fs_limit; | |
1483 | vmcs12->guest_gs_limit = evmcs->guest_gs_limit; | |
1484 | vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; | |
1485 | vmcs12->guest_tr_limit = evmcs->guest_tr_limit; | |
1486 | vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; | |
1487 | vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; | |
1488 | vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; | |
1489 | vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; | |
1490 | vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; | |
1491 | vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; | |
1492 | vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; | |
1493 | vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; | |
1494 | vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; | |
1495 | vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; | |
1496 | vmcs12->guest_es_selector = evmcs->guest_es_selector; | |
1497 | vmcs12->guest_cs_selector = evmcs->guest_cs_selector; | |
1498 | vmcs12->guest_ss_selector = evmcs->guest_ss_selector; | |
1499 | vmcs12->guest_ds_selector = evmcs->guest_ds_selector; | |
1500 | vmcs12->guest_fs_selector = evmcs->guest_fs_selector; | |
1501 | vmcs12->guest_gs_selector = evmcs->guest_gs_selector; | |
1502 | vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; | |
1503 | vmcs12->guest_tr_selector = evmcs->guest_tr_selector; | |
1504 | } | |
1505 | ||
1506 | if (unlikely(!(evmcs->hv_clean_fields & | |
1507 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { | |
1508 | vmcs12->tsc_offset = evmcs->tsc_offset; | |
1509 | vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; | |
1510 | vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; | |
1511 | } | |
1512 | ||
1513 | if (unlikely(!(evmcs->hv_clean_fields & | |
1514 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { | |
1515 | vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; | |
1516 | vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; | |
1517 | vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; | |
1518 | vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; | |
1519 | vmcs12->guest_cr0 = evmcs->guest_cr0; | |
1520 | vmcs12->guest_cr3 = evmcs->guest_cr3; | |
1521 | vmcs12->guest_cr4 = evmcs->guest_cr4; | |
1522 | vmcs12->guest_dr7 = evmcs->guest_dr7; | |
1523 | } | |
1524 | ||
1525 | if (unlikely(!(evmcs->hv_clean_fields & | |
1526 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { | |
1527 | vmcs12->host_fs_base = evmcs->host_fs_base; | |
1528 | vmcs12->host_gs_base = evmcs->host_gs_base; | |
1529 | vmcs12->host_tr_base = evmcs->host_tr_base; | |
1530 | vmcs12->host_gdtr_base = evmcs->host_gdtr_base; | |
1531 | vmcs12->host_idtr_base = evmcs->host_idtr_base; | |
1532 | vmcs12->host_rsp = evmcs->host_rsp; | |
1533 | } | |
1534 | ||
1535 | if (unlikely(!(evmcs->hv_clean_fields & | |
1536 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { | |
1537 | vmcs12->ept_pointer = evmcs->ept_pointer; | |
1538 | vmcs12->virtual_processor_id = evmcs->virtual_processor_id; | |
1539 | } | |
1540 | ||
1541 | if (unlikely(!(evmcs->hv_clean_fields & | |
1542 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { | |
1543 | vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; | |
1544 | vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; | |
1545 | vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; | |
1546 | vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; | |
1547 | vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; | |
1548 | vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; | |
1549 | vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; | |
1550 | vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; | |
1551 | vmcs12->guest_pending_dbg_exceptions = | |
1552 | evmcs->guest_pending_dbg_exceptions; | |
1553 | vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; | |
1554 | vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; | |
1555 | vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; | |
1556 | vmcs12->guest_activity_state = evmcs->guest_activity_state; | |
1557 | vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; | |
1558 | } | |
1559 | ||
1560 | /* | |
1561 | * Not used? | |
1562 | * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; | |
1563 | * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; | |
1564 | * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; | |
1565 | * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0; | |
1566 | * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1; | |
1567 | * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2; | |
1568 | * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3; | |
1569 | * vmcs12->page_fault_error_code_mask = | |
1570 | * evmcs->page_fault_error_code_mask; | |
1571 | * vmcs12->page_fault_error_code_match = | |
1572 | * evmcs->page_fault_error_code_match; | |
1573 | * vmcs12->cr3_target_count = evmcs->cr3_target_count; | |
1574 | * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; | |
1575 | * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; | |
1576 | * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; | |
1577 | */ | |
1578 | ||
1579 | /* | |
1580 | * Read only fields: | |
1581 | * vmcs12->guest_physical_address = evmcs->guest_physical_address; | |
1582 | * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; | |
1583 | * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; | |
1584 | * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; | |
1585 | * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; | |
1586 | * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; | |
1587 | * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; | |
1588 | * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; | |
1589 | * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; | |
1590 | * vmcs12->exit_qualification = evmcs->exit_qualification; | |
1591 | * vmcs12->guest_linear_address = evmcs->guest_linear_address; | |
1592 | * | |
1593 | * Not present in struct vmcs12: | |
1594 | * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; | |
1595 | * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; | |
1596 | * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; | |
1597 | * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; | |
1598 | */ | |
1599 | ||
1600 | return 0; | |
1601 | } | |
1602 | ||
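/*
 * For illustration, the copy above is gated on Hyper-V "clean field"
 * bits: a group of vmcs12 fields is refreshed from the enlightened VMCS
 * only when its clean bit is 0, i.e. the group is dirty.  A minimal
 * sketch of that pattern, using the CRDR group as an example:
 *
 *	if (unlikely(!(evmcs->hv_clean_fields &
 *		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR)))
 *		vmcs12->guest_cr0 = evmcs->guest_cr0;
 *
 * Clean groups are skipped entirely, which is what makes the enlightened
 * VMCS cheaper than copying the full vmcs12 on every nested entry.
 */
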
1603 | static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) | |
1604 | { | |
1605 | struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; | |
1606 | struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; | |
1607 | ||
1608 | /* | |
1609 | * Should not be changed by KVM: | |
1610 | * | |
1611 | * evmcs->host_es_selector = vmcs12->host_es_selector; | |
1612 | * evmcs->host_cs_selector = vmcs12->host_cs_selector; | |
1613 | * evmcs->host_ss_selector = vmcs12->host_ss_selector; | |
1614 | * evmcs->host_ds_selector = vmcs12->host_ds_selector; | |
1615 | * evmcs->host_fs_selector = vmcs12->host_fs_selector; | |
1616 | * evmcs->host_gs_selector = vmcs12->host_gs_selector; | |
1617 | * evmcs->host_tr_selector = vmcs12->host_tr_selector; | |
1618 | * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; | |
1619 | * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; | |
1620 | * evmcs->host_cr0 = vmcs12->host_cr0; | |
1621 | * evmcs->host_cr3 = vmcs12->host_cr3; | |
1622 | * evmcs->host_cr4 = vmcs12->host_cr4; | |
1623 | * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; | |
1624 | * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; | |
1625 | * evmcs->host_rip = vmcs12->host_rip; | |
1626 | * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; | |
1627 | * evmcs->host_fs_base = vmcs12->host_fs_base; | |
1628 | * evmcs->host_gs_base = vmcs12->host_gs_base; | |
1629 | * evmcs->host_tr_base = vmcs12->host_tr_base; | |
1630 | * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; | |
1631 | * evmcs->host_idtr_base = vmcs12->host_idtr_base; | |
1632 | * evmcs->host_rsp = vmcs12->host_rsp; | |
1633 | * sync_vmcs12() doesn't read these: | |
1634 | * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; | |
1635 | * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; | |
1636 | * evmcs->msr_bitmap = vmcs12->msr_bitmap; | |
1637 | * evmcs->ept_pointer = vmcs12->ept_pointer; | |
1638 | * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; | |
1639 | * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; | |
1640 | * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; | |
1641 | * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; | |
1642 | * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0; | |
1643 | * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1; | |
1644 | * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2; | |
1645 | * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3; | |
1646 | * evmcs->tpr_threshold = vmcs12->tpr_threshold; | |
1647 | * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; | |
1648 | * evmcs->exception_bitmap = vmcs12->exception_bitmap; | |
1649 | * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; | |
1650 | * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; | |
1651 | * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; | |
1652 | * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; | |
1653 | * evmcs->page_fault_error_code_mask = | |
1654 | * vmcs12->page_fault_error_code_mask; | |
1655 | * evmcs->page_fault_error_code_match = | |
1656 | * vmcs12->page_fault_error_code_match; | |
1657 | * evmcs->cr3_target_count = vmcs12->cr3_target_count; | |
1658 | * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; | |
1659 | * evmcs->tsc_offset = vmcs12->tsc_offset; | |
1660 | * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; | |
1661 | * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; | |
1662 | * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; | |
1663 | * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; | |
1664 | * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; | |
1665 | * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; | |
1666 | * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; | |
1667 | * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; | |
1668 | * | |
1669 | * Not present in struct vmcs12: | |
1670 | * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; | |
1671 | * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; | |
1672 | * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; | |
1673 | * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; | |
1674 | */ | |
1675 | ||
1676 | evmcs->guest_es_selector = vmcs12->guest_es_selector; | |
1677 | evmcs->guest_cs_selector = vmcs12->guest_cs_selector; | |
1678 | evmcs->guest_ss_selector = vmcs12->guest_ss_selector; | |
1679 | evmcs->guest_ds_selector = vmcs12->guest_ds_selector; | |
1680 | evmcs->guest_fs_selector = vmcs12->guest_fs_selector; | |
1681 | evmcs->guest_gs_selector = vmcs12->guest_gs_selector; | |
1682 | evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; | |
1683 | evmcs->guest_tr_selector = vmcs12->guest_tr_selector; | |
1684 | ||
1685 | evmcs->guest_es_limit = vmcs12->guest_es_limit; | |
1686 | evmcs->guest_cs_limit = vmcs12->guest_cs_limit; | |
1687 | evmcs->guest_ss_limit = vmcs12->guest_ss_limit; | |
1688 | evmcs->guest_ds_limit = vmcs12->guest_ds_limit; | |
1689 | evmcs->guest_fs_limit = vmcs12->guest_fs_limit; | |
1690 | evmcs->guest_gs_limit = vmcs12->guest_gs_limit; | |
1691 | evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; | |
1692 | evmcs->guest_tr_limit = vmcs12->guest_tr_limit; | |
1693 | evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; | |
1694 | evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; | |
1695 | ||
1696 | evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; | |
1697 | evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; | |
1698 | evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; | |
1699 | evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; | |
1700 | evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; | |
1701 | evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; | |
1702 | evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; | |
1703 | evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; | |
1704 | ||
1705 | evmcs->guest_es_base = vmcs12->guest_es_base; | |
1706 | evmcs->guest_cs_base = vmcs12->guest_cs_base; | |
1707 | evmcs->guest_ss_base = vmcs12->guest_ss_base; | |
1708 | evmcs->guest_ds_base = vmcs12->guest_ds_base; | |
1709 | evmcs->guest_fs_base = vmcs12->guest_fs_base; | |
1710 | evmcs->guest_gs_base = vmcs12->guest_gs_base; | |
1711 | evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; | |
1712 | evmcs->guest_tr_base = vmcs12->guest_tr_base; | |
1713 | evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; | |
1714 | evmcs->guest_idtr_base = vmcs12->guest_idtr_base; | |
1715 | ||
1716 | evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; | |
1717 | evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; | |
1718 | ||
1719 | evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; | |
1720 | evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; | |
1721 | evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; | |
1722 | evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; | |
1723 | ||
1724 | evmcs->guest_pending_dbg_exceptions = | |
1725 | vmcs12->guest_pending_dbg_exceptions; | |
1726 | evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; | |
1727 | evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; | |
1728 | ||
1729 | evmcs->guest_activity_state = vmcs12->guest_activity_state; | |
1730 | evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; | |
1731 | ||
1732 | evmcs->guest_cr0 = vmcs12->guest_cr0; | |
1733 | evmcs->guest_cr3 = vmcs12->guest_cr3; | |
1734 | evmcs->guest_cr4 = vmcs12->guest_cr4; | |
1735 | evmcs->guest_dr7 = vmcs12->guest_dr7; | |
1736 | ||
1737 | evmcs->guest_physical_address = vmcs12->guest_physical_address; | |
1738 | ||
1739 | evmcs->vm_instruction_error = vmcs12->vm_instruction_error; | |
1740 | evmcs->vm_exit_reason = vmcs12->vm_exit_reason; | |
1741 | evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; | |
1742 | evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; | |
1743 | evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; | |
1744 | evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; | |
1745 | evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; | |
1746 | evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; | |
1747 | ||
1748 | evmcs->exit_qualification = vmcs12->exit_qualification; | |
1749 | ||
1750 | evmcs->guest_linear_address = vmcs12->guest_linear_address; | |
1751 | evmcs->guest_rsp = vmcs12->guest_rsp; | |
1752 | evmcs->guest_rflags = vmcs12->guest_rflags; | |
1753 | ||
1754 | evmcs->guest_interruptibility_info = | |
1755 | vmcs12->guest_interruptibility_info; | |
1756 | evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; | |
1757 | evmcs->vm_entry_controls = vmcs12->vm_entry_controls; | |
1758 | evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; | |
1759 | evmcs->vm_entry_exception_error_code = | |
1760 | vmcs12->vm_entry_exception_error_code; | |
1761 | evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; | |
1762 | ||
1763 | evmcs->guest_rip = vmcs12->guest_rip; | |
1764 | ||
1765 | evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; | |
1766 | ||
1767 | return 0; | |
1768 | } | |
1769 | ||
1770 | /* | |
1771 | * This is an equivalent of the nested hypervisor executing the vmptrld | |
1772 | * instruction. | |
1773 | */ | |
1774 | static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, | |
1775 | bool from_launch) | |
1776 | { | |
1777 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
1778 | struct hv_vp_assist_page assist_page; | |
1779 | ||
1780 | if (likely(!vmx->nested.enlightened_vmcs_enabled)) | |
1781 | return 1; | |
1782 | ||
1783 | if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page))) | |
1784 | return 1; | |
1785 | ||
1786 | if (unlikely(!assist_page.enlighten_vmentry)) | |
1787 | return 1; | |
1788 | ||
1789 | if (unlikely(assist_page.current_nested_vmcs != | |
1790 | vmx->nested.hv_evmcs_vmptr)) { | |
1791 | ||
1792 | if (!vmx->nested.hv_evmcs) | |
1793 | vmx->nested.current_vmptr = -1ull; | |
1794 | ||
1795 | nested_release_evmcs(vcpu); | |
1796 | ||
1797 | vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page( | |
1798 | vcpu, assist_page.current_nested_vmcs); | |
1799 | ||
1800 | if (unlikely(is_error_page(vmx->nested.hv_evmcs_page))) | |
1801 | return 0; | |
1802 | ||
1803 | vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page); | |
1804 | ||
1805 | /* | |
1806 | * Currently, KVM only supports eVMCS version 1 | |
1807 | * (== KVM_EVMCS_VERSION) and thus we expect the guest to set this | |
1808 | * value in the first u32 field of the eVMCS, which specifies the | |
1809 | * eVMCS VersionNumber. | |
1810 | * | |
1811 | * The guest learns the eVMCS versions supported by the host by | |
1812 | * examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM is | |
1813 | * expected to set this CPUID leaf according to the value | |
1814 | * returned in vmcs_version from nested_enable_evmcs(). | |
1815 | * | |
1816 | * However, it turns out that Microsoft Hyper-V fails to comply | |
1817 | * with its own invented interface: when Hyper-V uses eVMCS, it | |
1818 | * just sets the first u32 field of the eVMCS to the revision_id | |
1819 | * specified in MSR_IA32_VMX_BASIC, instead of an eVMCS version | |
1820 | * number that is one of the supported versions specified in | |
1821 | * CPUID.0x4000000A.EAX[0:15]. | |
1822 | * | |
1823 | * To work around this Hyper-V bug, we accept here either a supported | |
1824 | * eVMCS version or the VMCS12 revision_id as valid values for the | |
1825 | * first u32 field of the eVMCS. | |
1826 | */ | |
1827 | if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && | |
1828 | (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { | |
1829 | nested_release_evmcs(vcpu); | |
1830 | return 0; | |
1831 | } | |
1832 | ||
1833 | vmx->nested.dirty_vmcs12 = true; | |
1834 | /* | |
1835 | * As we keep L2 state for only one guest, the 'hv_clean_fields' mask | |
1836 | * can't be used when we switch between guests. Reset it here for | |
1837 | * simplicity. | |
1838 | */ | |
1839 | vmx->nested.hv_evmcs->hv_clean_fields &= | |
1840 | ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; | |
1841 | vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs; | |
1842 | ||
1843 | /* | |
1844 | * Unlike normal vmcs12, enlightened vmcs12 is not fully | |
1845 | * reloaded from guest's memory (read only fields, fields not | |
1846 | * present in struct hv_enlightened_vmcs, ...). Make sure there | |
1847 | * are no leftovers. | |
1848 | */ | |
1849 | if (from_launch) { | |
1850 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
1851 | memset(vmcs12, 0, sizeof(*vmcs12)); | |
1852 | vmcs12->hdr.revision_id = VMCS12_REVISION; | |
1853 | } | |
1854 | ||
1855 | } | |
1856 | return 1; | |
1857 | } | |
1858 | ||
1859 | void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu) | |
1860 | { | |
1861 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
1862 | ||
1863 | /* | |
1864 | * hv_evmcs may end up being not mapped after migration (when | |
1865 | * L2 was running), map it here to make sure vmcs12 changes are | |
1866 | * properly reflected. | |
1867 | */ | |
1868 | if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) | |
1869 | nested_vmx_handle_enlightened_vmptrld(vcpu, false); | |
1870 | ||
1871 | if (vmx->nested.hv_evmcs) { | |
1872 | copy_vmcs12_to_enlightened(vmx); | |
1873 | /* All fields are clean */ | |
1874 | vmx->nested.hv_evmcs->hv_clean_fields |= | |
1875 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; | |
1876 | } else { | |
1877 | copy_vmcs12_to_shadow(vmx); | |
1878 | } | |
1879 | ||
1880 | vmx->nested.need_vmcs12_sync = false; | |
1881 | } | |
1882 | ||
1883 | static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) | |
1884 | { | |
1885 | struct vcpu_vmx *vmx = | |
1886 | container_of(timer, struct vcpu_vmx, nested.preemption_timer); | |
1887 | ||
1888 | vmx->nested.preemption_timer_expired = true; | |
1889 | kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); | |
1890 | kvm_vcpu_kick(&vmx->vcpu); | |
1891 | ||
1892 | return HRTIMER_NORESTART; | |
1893 | } | |
1894 | ||
1895 | static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) | |
1896 | { | |
1897 | u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; | |
1898 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
1899 | ||
1900 | /* | |
1901 | * A timer value of zero is architecturally guaranteed to cause | |
1902 | * a VMExit prior to executing any instructions in the guest. | |
1903 | */ | |
1904 | if (preemption_timeout == 0) { | |
1905 | vmx_preemption_timer_fn(&vmx->nested.preemption_timer); | |
1906 | return; | |
1907 | } | |
1908 | ||
1909 | if (vcpu->arch.virtual_tsc_khz == 0) | |
1910 | return; | |
1911 | ||
1912 | preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; | |
1913 | preemption_timeout *= 1000000; | |
1914 | do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); | |
1915 | hrtimer_start(&vmx->nested.preemption_timer, | |
1916 | ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); | |
1917 | } | |
1918 | ||
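/*
 * For reference, the conversion above turns the vmcs12 timer value,
 * which is in units of 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE TSC
 * cycles, into nanoseconds:
 *
 *	ns = (value << 5) * 1000000 / virtual_tsc_khz
 *
 * e.g. a value of 1000 with a 2 GHz guest TSC (virtual_tsc_khz ==
 * 2000000) arms the hrtimer for (1000 << 5) * 1000000 / 2000000 =
 * 16000 ns, i.e. 16 us.
 */
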
1919 | static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) | |
1920 | { | |
1921 | if (vmx->nested.nested_run_pending && | |
1922 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) | |
1923 | return vmcs12->guest_ia32_efer; | |
1924 | else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) | |
1925 | return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); | |
1926 | else | |
1927 | return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); | |
1928 | } | |
1929 | ||
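/*
 * nested_vmx_calc_efer() above picks L2's EFER from one of three
 * sources: the vmcs12 field when the upcoming entry loads IA32_EFER,
 * otherwise the current EFER with LMA/LME forced to match the vmcs12
 * "IA-32e mode guest" entry control.  For example, with
 * VM_ENTRY_IA32E_MODE set but VM_ENTRY_LOAD_IA32_EFER clear, an EFER of
 * 0xd01 (NXE|LMA|LME|SCE) is kept as is; with both controls clear it
 * becomes 0x801 (NXE|SCE).
 */
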
1930 | static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) | |
1931 | { | |
1932 | /* | |
1933 | * If vmcs02 hasn't been initialized, set the constant vmcs02 state | |
1934 | * according to L0's settings (vmcs12 is irrelevant here). Host | |
1935 | * fields that come from L0 and are not constant, e.g. HOST_CR3, | |
1936 | * will be set as needed prior to VMLAUNCH/VMRESUME. | |
1937 | */ | |
1938 | if (vmx->nested.vmcs02_initialized) | |
1939 | return; | |
1940 | vmx->nested.vmcs02_initialized = true; | |
1941 | ||
1942 | /* | |
1943 | * We don't care what the EPTP value is; we just need to guarantee | |
1944 | * it's valid so we don't get a false positive when doing early | |
1945 | * consistency checks. | |
1946 | */ | |
1947 | if (enable_ept && nested_early_check) | |
1948 | vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0)); | |
1949 | ||
1950 | /* All VMFUNCs are currently emulated through L0 vmexits. */ | |
1951 | if (cpu_has_vmx_vmfunc()) | |
1952 | vmcs_write64(VM_FUNCTION_CONTROL, 0); | |
1953 | ||
1954 | if (cpu_has_vmx_posted_intr()) | |
1955 | vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); | |
1956 | ||
1957 | if (cpu_has_vmx_msr_bitmap()) | |
1958 | vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); | |
1959 | ||
1960 | if (enable_pml) | |
1961 | vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); | |
1962 | ||
1963 | /* | |
1964 | * Set the MSR load/store lists to match L0's settings. Only the | |
1965 | * addresses are constant (for vmcs02), the counts can change based | |
1966 | * on L2's behavior, e.g. switching to/from long mode. | |
1967 | */ | |
1968 | vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); | |
1969 | vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); | |
1970 | vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); | |
1971 | ||
1972 | vmx_set_constant_host_state(vmx); | |
1973 | } | |
1974 | ||
1975 | static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx, | |
1976 | struct vmcs12 *vmcs12) | |
1977 | { | |
1978 | prepare_vmcs02_constant_state(vmx); | |
1979 | ||
1980 | vmcs_write64(VMCS_LINK_POINTER, -1ull); | |
1981 | ||
1982 | if (enable_vpid) { | |
1983 | if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) | |
1984 | vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); | |
1985 | else | |
1986 | vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); | |
1987 | } | |
1988 | } | |
1989 | ||
1990 | static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) | |
1991 | { | |
1992 | u32 exec_control, vmcs12_exec_ctrl; | |
1993 | u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); | |
1994 | ||
1995 | if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) | |
1996 | prepare_vmcs02_early_full(vmx, vmcs12); | |
1997 | ||
55d2375e SC |
1998 | /* |
1999 | * PIN CONTROLS | |
2000 | */ | |
2001 | exec_control = vmcs12->pin_based_vm_exec_control; | |
2002 | ||
2003 | /* Preemption timer setting is computed directly in vmx_vcpu_run. */ | |
2004 | exec_control |= vmcs_config.pin_based_exec_ctrl; | |
2005 | exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; | |
2006 | vmx->loaded_vmcs->hv_timer_armed = false; | |
2007 | ||
2008 | /* Posted interrupts setting is only taken from vmcs12. */ | |
2009 | if (nested_cpu_has_posted_intr(vmcs12)) { | |
2010 | vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; | |
2011 | vmx->nested.pi_pending = false; | |
2012 | } else { | |
2013 | exec_control &= ~PIN_BASED_POSTED_INTR; | |
2014 | } | |
2015 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); | |
2016 | ||
2017 | /* | |
2018 | * EXEC CONTROLS | |
2019 | */ | |
2020 | exec_control = vmx_exec_control(vmx); /* L0's desires */ | |
2021 | exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; | |
2022 | exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; | |
2023 | exec_control &= ~CPU_BASED_TPR_SHADOW; | |
2024 | exec_control |= vmcs12->cpu_based_vm_exec_control; | |
2025 | ||
2026 | /* | |
2027 | * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if | |
2028 | * nested_get_vmcs12_pages can't fix it up, the illegal value | |
2029 | * will result in a VM entry failure. | |
2030 | */ | |
2031 | if (exec_control & CPU_BASED_TPR_SHADOW) { | |
2032 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); | |
2033 | vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); | |
2034 | } else { | |
2035 | #ifdef CONFIG_X86_64 | |
2036 | exec_control |= CPU_BASED_CR8_LOAD_EXITING | | |
2037 | CPU_BASED_CR8_STORE_EXITING; | |
2038 | #endif | |
2039 | } | |
2040 | ||
2041 | /* | |
2042 | * A vmexit (to either the L1 hypervisor or L0 userspace) is always needed | |
2043 | * for I/O port accesses. | |
2044 | */ | |
2045 | exec_control &= ~CPU_BASED_USE_IO_BITMAPS; | |
2046 | exec_control |= CPU_BASED_UNCOND_IO_EXITING; | |
2047 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); | |
2048 | ||
2049 | /* | |
2050 | * SECONDARY EXEC CONTROLS | |
2051 | */ | |
2052 | if (cpu_has_secondary_exec_ctrls()) { | |
2053 | exec_control = vmx->secondary_exec_control; | |
2054 | ||
2055 | /* Take the following fields only from vmcs12 */ | |
2056 | exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | |
2057 | SECONDARY_EXEC_ENABLE_INVPCID | | |
2058 | SECONDARY_EXEC_RDTSCP | | |
2059 | SECONDARY_EXEC_XSAVES | | |
2060 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | | |
2061 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | |
2062 | SECONDARY_EXEC_ENABLE_VMFUNC); | |
2063 | if (nested_cpu_has(vmcs12, | |
2064 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { | |
2065 | vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & | |
2066 | ~SECONDARY_EXEC_ENABLE_PML; | |
2067 | exec_control |= vmcs12_exec_ctrl; | |
2068 | } | |
2069 | ||
2070 | /* VMCS shadowing for L2 is emulated for now */ | |
2071 | exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; | |
2072 | ||
2073 | if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) | |
2074 | vmcs_write16(GUEST_INTR_STATUS, | |
2075 | vmcs12->guest_intr_status); | |
2076 | ||
2077 | /* | |
2078 | * Write an illegal value to APIC_ACCESS_ADDR. Later, | |
2079 | * nested_get_vmcs12_pages will either fix it up or | |
2080 | * remove the VM execution control. | |
2081 | */ | |
2082 | if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) | |
2083 | vmcs_write64(APIC_ACCESS_ADDR, -1ull); | |
2084 | ||
2085 | if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) | |
2086 | vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); | |
2087 | ||
2088 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | |
2089 | } | |
2090 | ||
2091 | /* | |
2092 | * ENTRY CONTROLS | |
2093 | * | |
2094 | * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE | |
2095 | * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate | |
2096 | * on the related bits (if supported by the CPU) in the hope that | |
2097 | * we can avoid VMWrites during vmx_set_efer(). | |
2098 | */ | |
2099 | exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & | |
2100 | ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; | |
2101 | if (cpu_has_load_ia32_efer()) { | |
2102 | if (guest_efer & EFER_LMA) | |
2103 | exec_control |= VM_ENTRY_IA32E_MODE; | |
2104 | if (guest_efer != host_efer) | |
2105 | exec_control |= VM_ENTRY_LOAD_IA32_EFER; | |
2106 | } | |
2107 | vm_entry_controls_init(vmx, exec_control); | |
2108 | ||
2109 | /* | |
2110 | * EXIT CONTROLS | |
2111 | * | |
2112 | * L2->L1 exit controls are emulated - the hardware exit is to L0 so | |
2113 | * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER | |
2114 | * bits may be modified by vmx_set_efer() in prepare_vmcs02(). | |
2115 | */ | |
2116 | exec_control = vmx_vmexit_ctrl(); | |
2117 | if (cpu_has_load_ia32_efer() && guest_efer != host_efer) | |
2118 | exec_control |= VM_EXIT_LOAD_IA32_EFER; | |
2119 | vm_exit_controls_init(vmx, exec_control); | |
2120 | ||
2121 | /* | |
2122 | * Conceptually we want to copy the PML address and index from | |
2123 | * vmcs01 here, and then back to vmcs01 on nested vmexit. But, | |
2124 | * since we always flush the log on each vmexit and never change | |
2125 | * the PML address (once set), this happens to be equivalent to | |
2126 | * simply resetting the index in vmcs02. | |
2127 | */ | |
2128 | if (enable_pml) | |
2129 | vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); | |
2130 | ||
2131 | /* | |
2132 | * Interrupt/Exception Fields | |
2133 | */ | |
2134 | if (vmx->nested.nested_run_pending) { | |
2135 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | |
2136 | vmcs12->vm_entry_intr_info_field); | |
2137 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, | |
2138 | vmcs12->vm_entry_exception_error_code); | |
2139 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | |
2140 | vmcs12->vm_entry_instruction_len); | |
2141 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | |
2142 | vmcs12->guest_interruptibility_info); | |
2143 | vmx->loaded_vmcs->nmi_known_unmasked = | |
2144 | !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); | |
2145 | } else { | |
2146 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); | |
2147 | } | |
2148 | } | |
2149 | ||
2150 | static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) | |
2151 | { | |
2152 | struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; | |
2153 | ||
2154 | if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & | |
2155 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { | |
2156 | vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); | |
2157 | vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); | |
2158 | vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); | |
2159 | vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); | |
2160 | vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); | |
2161 | vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); | |
2162 | vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); | |
2163 | vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); | |
2164 | vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); | |
2165 | vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); | |
2166 | vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); | |
2167 | vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); | |
2168 | vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); | |
2169 | vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); | |
2170 | vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); | |
2171 | vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); | |
2172 | vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); | |
2173 | vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); | |
2174 | vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); | |
2175 | vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); | |
2176 | vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); | |
2177 | vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); | |
2178 | vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); | |
2179 | vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); | |
2180 | vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); | |
2181 | vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); | |
2182 | vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); | |
2183 | vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); | |
2184 | vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); | |
2185 | vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); | |
2186 | vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); | |
2187 | vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); | |
2188 | vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); | |
2189 | vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); | |
2190 | } | |
2191 | ||
2192 | if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & | |
2193 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { | |
2194 | vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); | |
2195 | vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, | |
2196 | vmcs12->guest_pending_dbg_exceptions); | |
2197 | vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); | |
2198 | vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); | |
2199 | ||
2200 | /* | |
2201 | * L1 may access L2's PDPTRs, so save them to construct | |
2202 | * vmcs12. | |
2203 | */ | |
2204 | if (enable_ept) { | |
2205 | vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); | |
2206 | vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); | |
2207 | vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); | |
2208 | vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); | |
2209 | } | |
2210 | } | |
2211 | ||
2212 | if (nested_cpu_has_xsaves(vmcs12)) | |
2213 | vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); | |
2214 | ||
2215 | /* | |
2216 | * Whether page-faults are trapped is determined by a combination of | |
2217 | * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. | |
2218 | * If enable_ept, L0 doesn't care about page faults and we should | |
2219 | * set all of these to L1's desires. However, if !enable_ept, L0 does | |
2220 | * care about (at least some) page faults, and because it is not easy | |
2221 | * (if at all possible?) to merge L0 and L1's desires, we simply ask | |
2222 | * to exit on each and every L2 page fault. This is done by setting | |
2223 | * MASK=MATCH=0 and (see below) EB.PF=1. | |
2224 | * Note that below we don't need special code to set EB.PF beyond the | |
2225 | * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, | |
2226 | * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when | |
2227 | * !enable_ept, EB.PF is 1, so the "or" will always be 1. | |
2228 | */ | |
2229 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, | |
2230 | enable_ept ? vmcs12->page_fault_error_code_mask : 0); | |
2231 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, | |
2232 | enable_ept ? vmcs12->page_fault_error_code_match : 0); | |
2233 | ||
2234 | if (cpu_has_vmx_apicv()) { | |
2235 | vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); | |
2236 | vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); | |
2237 | vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); | |
2238 | vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); | |
2239 | } | |
2240 | ||
2241 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); | |
2242 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); | |
2243 | ||
2244 | set_cr4_guest_host_mask(vmx); | |
2245 | ||
3ea511be SC |
2246 | if (kvm_mpx_supported() && vmx->nested.nested_run_pending && |
2247 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) | |
2248 | vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); | |
55d2375e SC |
2249 | } |
2250 | ||
2251 | /* | |
2252 | * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested | |
2253 | * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it | |
2254 | * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 | |
2255 | * guest in a way that is appropriate both to L1's requests and to our | |
2256 | * needs. In addition to modifying the active vmcs (which is vmcs02), this | |
2257 | * function has other necessary side-effects, like setting various | |
2258 | * vcpu->arch fields. | |
2259 | * Returns 0 on success, 1 on failure. On failure, the invalid-state exit | |
2260 | * qualification code is assigned to entry_failure_code. | |
2261 | */ | |
2262 | static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, | |
2263 | u32 *entry_failure_code) | |
2264 | { | |
2265 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
2266 | struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; | |
2267 | ||
2268 | if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) { | |
2269 | prepare_vmcs02_full(vmx, vmcs12); | |
2270 | vmx->nested.dirty_vmcs12 = false; | |
2271 | } | |
2272 | ||
2273 | /* | |
2274 | * First, the fields that are shadowed. This must be kept in sync | |
2275 | * with vmcs_shadow_fields.h. | |
2276 | */ | |
2277 | if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & | |
2278 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { | |
2279 | vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); | |
2280 | vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); | |
2281 | } | |
2282 | ||
2283 | if (vmx->nested.nested_run_pending && | |
2284 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { | |
2285 | kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); | |
2286 | vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); | |
2287 | } else { | |
2288 | kvm_set_dr(vcpu, 7, vcpu->arch.dr7); | |
2289 | vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); | |
2290 | } | |
3ea511be SC |
2291 | if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || |
2292 | !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) | |
2293 | vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); | |
55d2375e SC |
2294 | vmx_set_rflags(vcpu, vmcs12->guest_rflags); |
2295 | ||
55d2375e SC |
2296 | /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the |
2297 | * bitwise-or of what L1 wants to trap for L2, and what we want to | |
2298 | * trap. Note that CR0.TS also needs updating - we do this later. | |
2299 | */ | |
2300 | update_exception_bitmap(vcpu); | |
2301 | vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; | |
2302 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); | |
2303 | ||
2304 | if (vmx->nested.nested_run_pending && | |
2305 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { | |
2306 | vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); | |
2307 | vcpu->arch.pat = vmcs12->guest_ia32_pat; | |
2308 | } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { | |
2309 | vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); | |
2310 | } | |
2311 | ||
2312 | vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); | |
2313 | ||
2314 | if (kvm_has_tsc_control) | |
2315 | decache_tsc_multiplier(vmx); | |
2316 | ||
2317 | if (enable_vpid) { | |
2318 | /* | |
2319 | * There is no direct mapping between vpid02 and vpid12: | |
2320 | * vpid02 is per-vCPU for L0 and reused, while a change of | |
2321 | * vpid12 is handled with one invvpid during nested vmentry. | |
2322 | * vpid12 is allocated by L1 for L2, so it will not | |
2323 | * influence the global bitmap (for vpid01 and vpid02 allocation) | |
2324 | * even if L1 spawns a lot of nested vCPUs. | |
2325 | */ | |
2326 | if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) { | |
2327 | if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { | |
2328 | vmx->nested.last_vpid = vmcs12->virtual_processor_id; | |
2329 | __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false); | |
2330 | } | |
2331 | } else { | |
2332 | /* | |
2333 | * If L1 uses EPT, then L0 needs to execute INVEPT on | |
2334 | * EPTP02 instead of EPTP01. Therefore, delay TLB | |
2335 | * flush until vmcs02->eptp is fully updated by | |
2336 | * KVM_REQ_LOAD_CR3. Note that this assumes | |
2337 | * KVM_REQ_TLB_FLUSH is evaluated after | |
2338 | * KVM_REQ_LOAD_CR3 in vcpu_enter_guest(). | |
2339 | */ | |
2340 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | |
2341 | } | |
2342 | } | |
2343 | ||
2344 | if (nested_cpu_has_ept(vmcs12)) | |
2345 | nested_ept_init_mmu_context(vcpu); | |
2346 | else if (nested_cpu_has2(vmcs12, | |
2347 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) | |
2348 | vmx_flush_tlb(vcpu, true); | |
2349 | ||
2350 | /* | |
2351 | * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those | |
2352 | * bits which we consider mandatory enabled. | |
2353 | * The CR0_READ_SHADOW is what L2 should have expected to read given | |
2354 | * the specifications by L1; it's not enough to take | |
2355 | * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may | |
2356 | * have more bits than L1 expected. | |
2357 | */ | |
2358 | vmx_set_cr0(vcpu, vmcs12->guest_cr0); | |
2359 | vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); | |
2360 | ||
2361 | vmx_set_cr4(vcpu, vmcs12->guest_cr4); | |
2362 | vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); | |
2363 | ||
2364 | vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); | |
2365 | /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ | |
2366 | vmx_set_efer(vcpu, vcpu->arch.efer); | |
2367 | ||
2368 | /* | |
2369 | * Guest state is invalid and unrestricted guest is disabled, | |
2370 | * which means L1 attempted VMEntry to L2 with invalid state. | |
2371 | * Fail the VMEntry. | |
2372 | */ | |
2373 | if (vmx->emulation_required) { | |
2374 | *entry_failure_code = ENTRY_FAIL_DEFAULT; | |
2375 | return 1; | |
2376 | } | |
2377 | ||
2378 | /* Load the L2 CR3, backed by either EPT or shadow page tables. */ | |
2379 | if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), | |
2380 | entry_failure_code)) | |
2381 | return 1; | |
2382 | ||
2383 | if (!enable_ept) | |
2384 | vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; | |
2385 | ||
2386 | kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); | |
2387 | kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); | |
2388 | return 0; | |
2389 | } | |
2390 | ||
2391 | static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) | |
2392 | { | |
2393 | if (!nested_cpu_has_nmi_exiting(vmcs12) && | |
2394 | nested_cpu_has_virtual_nmis(vmcs12)) | |
2395 | return -EINVAL; | |
2396 | ||
2397 | if (!nested_cpu_has_virtual_nmis(vmcs12) && | |
2398 | nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)) | |
2399 | return -EINVAL; | |
2400 | ||
2401 | return 0; | |
2402 | } | |
2403 | ||
2404 | static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) | |
2405 | { | |
2406 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
2407 | int maxphyaddr = cpuid_maxphyaddr(vcpu); | |
2408 | ||
2409 | /* Check for memory type validity */ | |
2410 | switch (address & VMX_EPTP_MT_MASK) { | |
2411 | case VMX_EPTP_MT_UC: | |
2412 | if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)) | |
2413 | return false; | |
2414 | break; | |
2415 | case VMX_EPTP_MT_WB: | |
2416 | if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)) | |
2417 | return false; | |
2418 | break; | |
2419 | default: | |
2420 | return false; | |
2421 | } | |
2422 | ||
2423 | /* Only a 4-level page-walk length is valid. */ | |
2424 | if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4) | |
2425 | return false; | |
2426 | ||
2427 | /* Reserved bits should not be set */ | |
2428 | if (address >> maxphyaddr || ((address >> 7) & 0x1f)) | |
2429 | return false; | |
2430 | ||
2431 | /* AD, if set, should be supported */ | |
2432 | if (address & VMX_EPTP_AD_ENABLE_BIT) { | |
2433 | if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)) | |
2434 | return false; | |
2435 | } | |
2436 | ||
2437 | return true; | |
2438 | } | |
2439 | ||
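/*
 * For reference, the EPTP layout validated above: bits 2:0 memory type,
 * bits 5:3 page-walk length minus one, bit 6 enable accessed/dirty
 * flags, bits 11:7 reserved, and the 4KB-aligned root table address in
 * the upper bits.  A typical valid EPTP therefore looks like
 *
 *	eptp = root_hpa | VMX_EPTP_MT_WB | VMX_EPTP_PWL_4
 *			| VMX_EPTP_AD_ENABLE_BIT;	/* low byte 0x5e */
 *
 * i.e. write-back memory type (6), a 4-level walk (3 << 3) and A/D bits
 * enabled (1 << 6), which is what construct_eptp() builds for L0 when
 * EPT A/D support is present.
 */
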
461b4ba4 KS |
2440 | /* |
2441 | * Checks related to VM-Execution Control Fields | |
2442 | */ | |
2443 | static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, | |
2444 | struct vmcs12 *vmcs12) | |
55d2375e SC |
2445 | { |
2446 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
55d2375e | 2447 | |
461b4ba4 | 2448 | if (!vmx_control_verify(vmcs12->pin_based_vm_exec_control, |
55d2375e SC |
2449 | vmx->nested.msrs.pinbased_ctls_low, |
2450 | vmx->nested.msrs.pinbased_ctls_high) || | |
461b4ba4 KS |
2451 | !vmx_control_verify(vmcs12->cpu_based_vm_exec_control, |
2452 | vmx->nested.msrs.procbased_ctls_low, | |
2453 | vmx->nested.msrs.procbased_ctls_high)) | |
2454 | return -EINVAL; | |
55d2375e | 2455 | |
461b4ba4 KS |
2456 | if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && |
2457 | !vmx_control_verify(vmcs12->secondary_vm_exec_control, | |
2458 | vmx->nested.msrs.secondary_ctls_low, | |
2459 | vmx->nested.msrs.secondary_ctls_high)) | |
2460 | return -EINVAL; | |
2461 | ||
2462 | if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu) || | |
2463 | nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || | |
2464 | nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || | |
2465 | nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || | |
2466 | nested_vmx_check_apic_access_controls(vcpu, vmcs12) || | |
2467 | nested_vmx_check_apicv_controls(vcpu, vmcs12) || | |
2468 | nested_vmx_check_nmi_controls(vmcs12) || | |
2469 | nested_vmx_check_pml_controls(vcpu, vmcs12) || | |
2470 | nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || | |
2471 | nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || | |
2472 | nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || | |
2473 | (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) | |
2474 | return -EINVAL; | |
2475 | ||
bc441211 SC |
2476 | if (!nested_cpu_has_preemption_timer(vmcs12) && |
2477 | nested_cpu_has_save_preemption_timer(vmcs12)) | |
2478 | return -EINVAL; | |
2479 | ||
461b4ba4 KS |
2480 | if (nested_cpu_has_ept(vmcs12) && |
2481 | !valid_ept_address(vcpu, vmcs12->ept_pointer)) | |
2482 | return -EINVAL; | |
55d2375e SC |
2483 | |
2484 | if (nested_cpu_has_vmfunc(vmcs12)) { | |
2485 | if (vmcs12->vm_function_control & | |
2486 | ~vmx->nested.msrs.vmfunc_controls) | |
461b4ba4 | 2487 | return -EINVAL; |
55d2375e SC |
2488 | |
2489 | if (nested_cpu_has_eptp_switching(vmcs12)) { | |
2490 | if (!nested_cpu_has_ept(vmcs12) || | |
2491 | !page_address_valid(vcpu, vmcs12->eptp_list_address)) | |
461b4ba4 | 2492 | return -EINVAL; |
55d2375e SC |
2493 | } |
2494 | } | |
2495 | ||
461b4ba4 KS |
2496 | return 0; |
2497 | } | |
2498 | ||
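/*
 * The vmx_control_verify() checks above follow the allowed-0/allowed-1
 * convention of the nested VMX capability MSRs: the "low" word holds
 * bits that must be 1, the "high" word holds bits that may be 1.  A
 * rough sketch of the acceptance test, assuming that convention:
 *
 *	static bool ctl_valid(u32 ctl, u32 low, u32 high)
 *	{
 *		return (ctl & low) == low && !(ctl & ~high);
 *	}
 *
 * so L1 can only request features that L0 advertised via the nested
 * VMX MSRs (vmx->nested.msrs).
 */
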
61446ba7 KS |
2499 | /* |
2500 | * Checks related to VM-Exit Control Fields | |
2501 | */ | |
2502 | static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, | |
2503 | struct vmcs12 *vmcs12) | |
2504 | { | |
2505 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
2506 | ||
2507 | if (!vmx_control_verify(vmcs12->vm_exit_controls, | |
2508 | vmx->nested.msrs.exit_ctls_low, | |
2509 | vmx->nested.msrs.exit_ctls_high) || | |
2510 | nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)) | |
2511 | return -EINVAL; | |
2512 | ||
2513 | return 0; | |
2514 | } | |
2515 | ||
5fbf9634 KS |
2516 | /* |
2517 | * Checks related to VM-Entry Control Fields | |
2518 | */ | |
2519 | static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, | |
2520 | struct vmcs12 *vmcs12) | |
461b4ba4 KS |
2521 | { |
2522 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
55d2375e | 2523 | |
61446ba7 | 2524 | if (!vmx_control_verify(vmcs12->vm_entry_controls, |
461b4ba4 KS |
2525 | vmx->nested.msrs.entry_ctls_low, |
2526 | vmx->nested.msrs.entry_ctls_high)) | |
5fbf9634 | 2527 | return -EINVAL; |
55d2375e SC |
2528 | |
2529 | /* | |
2530 | * From the Intel SDM, volume 3: | |
2531 | * Fields relevant to VM-entry event injection must be set properly. | |
2532 | * These fields are the VM-entry interruption-information field, the | |
2533 | * VM-entry exception error code, and the VM-entry instruction length. | |
2534 | */ | |
2535 | if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { | |
2536 | u32 intr_info = vmcs12->vm_entry_intr_info_field; | |
2537 | u8 vector = intr_info & INTR_INFO_VECTOR_MASK; | |
2538 | u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; | |
2539 | bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; | |
2540 | bool should_have_error_code; | |
2541 | bool urg = nested_cpu_has2(vmcs12, | |
2542 | SECONDARY_EXEC_UNRESTRICTED_GUEST); | |
2543 | bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; | |
2544 | ||
2545 | /* VM-entry interruption-info field: interruption type */ | |
2546 | if (intr_type == INTR_TYPE_RESERVED || | |
2547 | (intr_type == INTR_TYPE_OTHER_EVENT && | |
2548 | !nested_cpu_supports_monitor_trap_flag(vcpu))) | |
5fbf9634 | 2549 | return -EINVAL; |
55d2375e SC |
2550 | |
2551 | /* VM-entry interruption-info field: vector */ | |
2552 | if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || | |
2553 | (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || | |
2554 | (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) | |
5fbf9634 | 2555 | return -EINVAL; |
55d2375e SC |
2556 | |
2557 | /* VM-entry interruption-info field: deliver error code */ | |
2558 | should_have_error_code = | |
2559 | intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && | |
2560 | x86_exception_has_error_code(vector); | |
2561 | if (has_error_code != should_have_error_code) | |
5fbf9634 | 2562 | return -EINVAL; |
55d2375e SC |
2563 | |
2564 | /* VM-entry exception error code */ | |
2565 | if (has_error_code && | |
2566 | vmcs12->vm_entry_exception_error_code & GENMASK(31, 15)) | |
5fbf9634 | 2567 | return -EINVAL; |
55d2375e SC |
2568 | |
2569 | /* VM-entry interruption-info field: reserved bits */ | |
2570 | if (intr_info & INTR_INFO_RESVD_BITS_MASK) | |
5fbf9634 | 2571 | return -EINVAL; |
55d2375e SC |
2572 | |
2573 | /* VM-entry instruction length */ | |
2574 | switch (intr_type) { | |
2575 | case INTR_TYPE_SOFT_EXCEPTION: | |
2576 | case INTR_TYPE_SOFT_INTR: | |
2577 | case INTR_TYPE_PRIV_SW_EXCEPTION: | |
2578 | if ((vmcs12->vm_entry_instruction_len > 15) || | |
2579 | (vmcs12->vm_entry_instruction_len == 0 && | |
2580 | !nested_cpu_has_zero_length_injection(vcpu))) | |
5fbf9634 | 2581 | return -EINVAL; |
55d2375e SC |
2582 | } |
2583 | } | |
2584 | ||
5fbf9634 KS |
2585 | if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) |
2586 | return -EINVAL; | |
2587 | ||
2588 | return 0; | |
2589 | } | |
2590 | ||
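/*
 * For illustration, the VM-entry interruption-information layout checked
 * above: bits 7:0 vector, bits 10:8 type, bit 11 deliver-error-code,
 * bit 31 valid.  Injecting #GP(0) from L1 would therefore use
 *
 *	vmcs12->vm_entry_intr_info_field = 0x80000b0d;
 *		/* valid | deliver error code | hw exception (3) | vector 13 */
 *	vmcs12->vm_entry_exception_error_code = 0;
 *
 * which satisfies the vector, error-code and reserved-bit checks above.
 */
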
254b2f3b KS |
2591 | /* |
2592 | * Checks related to Host Control Registers and MSRs | |
2593 | */ | |
2594 | static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, | |
2595 | struct vmcs12 *vmcs12) | |
5fbf9634 KS |
2596 | { |
2597 | bool ia32e; | |
2598 | ||
5fbf9634 KS |
2599 | if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || |
2600 | !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || | |
2601 | !nested_cr3_valid(vcpu, vmcs12->host_cr3)) | |
254b2f3b | 2602 | return -EINVAL; |
711eff3a KS |
2603 | |
2604 | if (is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu) || | |
2605 | is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)) | |
2606 | return -EINVAL; | |
2607 | ||
5fbf9634 KS |
2608 | /* |
2609 | * If the load IA32_EFER VM-exit control is 1, bits reserved in the | |
2610 | * IA32_EFER MSR must be 0 in the field for that register. In addition, | |
2611 | * the values of the LMA and LME bits in the field must each be that of | |
2612 | * the host address-space size VM-exit control. | |
2613 | */ | |
2614 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { | |
2615 | ia32e = (vmcs12->vm_exit_controls & | |
2616 | VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; | |
2617 | if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || | |
2618 | ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || | |
2619 | ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) | |
254b2f3b | 2620 | return -EINVAL; |
5fbf9634 KS |
2621 | } |
2622 | ||
55d2375e SC |
2623 | return 0; |
2624 | } | |
2625 | ||
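/*
 * Example of the EFER consistency rule above: if vmcs12 sets both
 * VM_EXIT_LOAD_IA32_EFER and VM_EXIT_HOST_ADDR_SPACE_SIZE, then
 * vmcs12->host_ia32_efer must have LME (bit 8) and LMA (bit 10) set,
 * e.g. 0x500 or 0xd01; a value such as 0x101 (LME without LMA) would
 * fail the check and thus the VM-entry.
 */
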
4e445aee KS |
2626 | /* |
2627 | * Checks related to Guest Non-register State | |
2628 | */ | |
2629 | static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) | |
254b2f3b KS |
2630 | { |
2631 | if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && | |
2632 | vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) | |
4e445aee KS |
2633 | return -EINVAL; |
2634 | ||
2635 | return 0; | |
2636 | } | |
254b2f3b | 2637 | |
4e445aee KS |
2638 | static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, |
2639 | struct vmcs12 *vmcs12) | |
2640 | { | |
254b2f3b KS |
2641 | if (nested_check_vm_execution_controls(vcpu, vmcs12) || |
2642 | nested_check_vm_exit_controls(vcpu, vmcs12) || | |
2643 | nested_check_vm_entry_controls(vcpu, vmcs12)) | |
2644 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | |
2645 | ||
2646 | if (nested_check_host_control_regs(vcpu, vmcs12)) | |
2647 | return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; | |
2648 | ||
4e445aee KS |
2649 | if (nested_check_guest_non_reg_state(vmcs12)) |
2650 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | |
2651 | ||
254b2f3b KS |
2652 | return 0; |
2653 | } | |
2654 | ||
55d2375e SC |
2655 | static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, |
2656 | struct vmcs12 *vmcs12) | |
2657 | { | |
2658 | int r; | |
2659 | struct page *page; | |
2660 | struct vmcs12 *shadow; | |
2661 | ||
2662 | if (vmcs12->vmcs_link_pointer == -1ull) | |
2663 | return 0; | |
2664 | ||
2665 | if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) | |
2666 | return -EINVAL; | |
2667 | ||
2668 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); | |
2669 | if (is_error_page(page)) | |
2670 | return -EINVAL; | |
2671 | ||
2672 | r = 0; | |
2673 | shadow = kmap(page); | |
2674 | if (shadow->hdr.revision_id != VMCS12_REVISION || | |
2675 | shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) | |
2676 | r = -EINVAL; | |
2677 | kunmap(page); | |
2678 | kvm_release_page_clean(page); | |
2679 | return r; | |
2680 | } | |
2681 | ||
16322a3b | 2682 | static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu, |
461b4ba4 KS |
2683 | struct vmcs12 *vmcs12, |
2684 | u32 *exit_qual) | |
55d2375e SC |
2685 | { |
2686 | bool ia32e; | |
2687 | ||
2688 | *exit_qual = ENTRY_FAIL_DEFAULT; | |
2689 | ||
2690 | if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || | |
2691 | !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) | |
2692 | return 1; | |
2693 | ||
2694 | if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { | |
2695 | *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; | |
2696 | return 1; | |
2697 | } | |
2698 | ||
2699 | /* | |
2700 | * If the load IA32_EFER VM-entry control is 1, the following checks | |
2701 | * are performed on the field for the IA32_EFER MSR: | |
2702 | * - Bits reserved in the IA32_EFER MSR must be 0. | |
2703 | * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of | |
2704 | * the IA-32e mode guest VM-exit control. It must also be identical | |
2705 | * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to | |
2706 | * CR0.PG) is 1. | |
2707 | */ | |
2708 | if (to_vmx(vcpu)->nested.nested_run_pending && | |
2709 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { | |
2710 | ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; | |
2711 | if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || | |
2712 | ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || | |
2713 | ((vmcs12->guest_cr0 & X86_CR0_PG) && | |
2714 | ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) | |
2715 | return 1; | |
2716 | } | |
2717 | ||
2718 | if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && | |
2719 | (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || | |
2720 | (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) | |
2721 | return 1; | |
2722 | ||
2723 | return 0; | |
2724 | } | |
2725 | ||
453eafbe | 2726 | static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) |
55d2375e SC |
2727 | { |
2728 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
2729 | unsigned long cr3, cr4; | |
f1727b49 | 2730 | bool vm_fail; |
55d2375e SC |
2731 | |
2732 | if (!nested_early_check) | |
2733 | return 0; | |
2734 | ||
2735 | if (vmx->msr_autoload.host.nr) | |
2736 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); | |
2737 | if (vmx->msr_autoload.guest.nr) | |
2738 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); | |
2739 | ||
2740 | preempt_disable(); | |
2741 | ||
2742 | vmx_prepare_switch_to_guest(vcpu); | |
2743 | ||
2744 | /* | |
2745 | * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, | |
2746 | * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to | |
2747 | * be written (by preparve_vmcs02()) before the "real" VMEnter, i.e. | |
2748 | * there is no need to preserve other bits or save/restore the field. | |
2749 | */ | |
2750 | vmcs_writel(GUEST_RFLAGS, 0); | |
2751 | ||
55d2375e SC |
2752 | cr3 = __get_current_cr3_fast(); |
2753 | if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { | |
2754 | vmcs_writel(HOST_CR3, cr3); | |
2755 | vmx->loaded_vmcs->host_state.cr3 = cr3; | |
2756 | } | |
2757 | ||
2758 | cr4 = cr4_read_shadow(); | |
2759 | if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { | |
2760 | vmcs_writel(HOST_CR4, cr4); | |
2761 | vmx->loaded_vmcs->host_state.cr4 = cr4; | |
2762 | } | |
2763 | ||
55d2375e | 2764 | asm( |
453eafbe | 2765 | "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ |
5a878160 SC |
2766 | "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" |
2767 | "je 1f \n\t" | |
fbda0fd3 | 2768 | __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t" |
5a878160 SC |
2769 | "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" |
2770 | "1: \n\t" | |
453eafbe | 2771 | "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ |
55d2375e SC |
2772 | |
2773 | /* Check if vmlaunch or vmresume is needed */ | |
74dfa278 | 2774 | "cmpb $0, %c[launched](%[loaded_vmcs])\n\t" |
453eafbe | 2775 | |
f1727b49 SC |
2776 | /* |
2777 | * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set | |
2778 | * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail | |
2779 | * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the | |
bbc0b823 | 2780 | * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail. |
f1727b49 | 2781 | */ |
453eafbe SC |
2782 | "call vmx_vmenter\n\t" |
2783 | ||
bbc0b823 SC |
2784 | CC_SET(be) |
2785 | : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) | |
5a878160 | 2786 | : [HOST_RSP]"r"((unsigned long)HOST_RSP), |
74dfa278 SC |
2787 | [loaded_vmcs]"r"(vmx->loaded_vmcs), |
2788 | [launched]"i"(offsetof(struct loaded_vmcs, launched)), | |
5a878160 | 2789 | [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)), |
453eafbe | 2790 | [wordsize]"i"(sizeof(ulong)) |
9ce0a07a | 2791 | : "cc", "memory" |
55d2375e SC |
2792 | ); |
2793 | ||
55d2375e SC |
2794 | if (vmx->msr_autoload.host.nr) |
2795 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); | |
2796 | if (vmx->msr_autoload.guest.nr) | |
2797 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); | |
2798 | ||
f1727b49 | 2799 | if (vm_fail) { |
e3feb4af | 2800 | preempt_enable(); |
55d2375e SC |
2801 | WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != |
2802 | VMXERR_ENTRY_INVALID_CONTROL_FIELD); | |
55d2375e SC |
2803 | return 1; |
2804 | } | |
2805 | ||
2806 | /* | |
2807 | * VMExit clears RFLAGS.IF and DR7, even on a consistency check. | |
2808 | */ | |
2809 | local_irq_enable(); | |
2810 | if (hw_breakpoint_active()) | |
2811 | set_debugreg(__this_cpu_read(cpu_dr7), 7); | |
e3feb4af | 2812 | preempt_enable(); |
55d2375e SC |
2813 | |
2814 | /* | |
2815 | * A non-failing VMEntry means we somehow entered guest mode with | |
2816 | * an illegal RIP, and that's just the tip of the iceberg. There | |
2817 | * is no telling what memory has been modified or what state has | |
2818 | * been exposed to unknown code. Hitting this all but guarantees | |
2819 | * a (very critical) hardware issue. | |
2820 | */ | |
2821 | WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & | |
2822 | VMX_EXIT_REASONS_FAILED_VMENTRY)); | |
2823 | ||
2824 | return 0; | |
2825 | } | |
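/*
 * Editor's sketch (illustrative, not part of the original source): what the
 * CC_SET(be)/CC_OUT(be) pair above captures.  Per the SDM, VM-Fail Invalid
 * sets RFLAGS.CF and VM-Fail Valid sets RFLAGS.ZF; the "below or equal"
 * condition is CF == 1 || ZF == 1, so a single condition code folds both
 * failure modes into vm_fail.
 */
static inline bool vm_fail_from_rflags(unsigned long rflags)
{
	return rflags & (X86_EFLAGS_CF | X86_EFLAGS_ZF);
}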
55d2375e SC |
2826 | |
2827 | static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, | |
2828 | struct vmcs12 *vmcs12); | |
2829 | ||
2830 | static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) | |
2831 | { | |
2832 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
2833 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
2834 | struct page *page; | |
2835 | u64 hpa; | |
2836 | ||
2837 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { | |
2838 | /* | |
2839 | * Translate L1 physical address to host physical | |
2840 | * address for vmcs02. Keep the page pinned, so this | |
2841 | * physical address remains valid. We keep a reference | |
2842 | * to it so we can release it later. | |
2843 | */ | |
2844 | if (vmx->nested.apic_access_page) { /* shouldn't happen */ | |
2845 | kvm_release_page_dirty(vmx->nested.apic_access_page); | |
2846 | vmx->nested.apic_access_page = NULL; | |
2847 | } | |
2848 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); | |
2849 | /* | |
2850 | * If translation failed, no matter: This feature asks | |
2851 | * to exit when accessing the given address, and if it | |
2852 | * can never be accessed, this feature won't do | |
2853 | * anything anyway. | |
2854 | */ | |
2855 | if (!is_error_page(page)) { | |
2856 | vmx->nested.apic_access_page = page; | |
2857 | hpa = page_to_phys(vmx->nested.apic_access_page); | |
2858 | vmcs_write64(APIC_ACCESS_ADDR, hpa); | |
2859 | } else { | |
2860 | vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, | |
2861 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); | |
2862 | } | |
2863 | } | |
2864 | ||
2865 | if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { | |
2866 | if (vmx->nested.virtual_apic_page) { /* shouldn't happen */ | |
2867 | kvm_release_page_dirty(vmx->nested.virtual_apic_page); | |
2868 | vmx->nested.virtual_apic_page = NULL; | |
2869 | } | |
2870 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr); | |
2871 | ||
2872 | /* | |
2873 | * If translation failed, VM entry will fail because | |
2874 | * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. | |
55d2375e SC |
2875 | */ |
2876 | if (!is_error_page(page)) { | |
2877 | vmx->nested.virtual_apic_page = page; | |
2878 | hpa = page_to_phys(vmx->nested.virtual_apic_page); | |
2879 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); | |
69090810 PB |
2880 | } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && |
2881 | nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && | |
2882 | !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { | |
2883 | /* | |
2884 | * The processor will never use the TPR shadow, simply | |
2885 | * clear the bit from the execution control. Such a | |
2886 | * configuration is useless, but it happens in tests. | |
2887 | * For any other configuration, failing the vm entry is | |
2888 | * _not_ what the processor does but it's basically the | |
2889 | * only possibility we have. | |
2890 | */ | |
2891 | vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, | |
2892 | CPU_BASED_TPR_SHADOW); | |
55d2375e SC |
2893 | } |
2894 | } | |
2895 | ||
2896 | if (nested_cpu_has_posted_intr(vmcs12)) { | |
2897 | if (vmx->nested.pi_desc_page) { /* shouldn't happen */ | |
2898 | kunmap(vmx->nested.pi_desc_page); | |
2899 | kvm_release_page_dirty(vmx->nested.pi_desc_page); | |
2900 | vmx->nested.pi_desc_page = NULL; | |
42b00f12 LT |
2901 | vmx->nested.pi_desc = NULL; |
2902 | vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull); | |
55d2375e SC |
2903 | } |
2904 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr); | |
2905 | if (is_error_page(page)) | |
2906 | return; | |
2907 | vmx->nested.pi_desc_page = page; | |
2908 | vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page); | |
2909 | vmx->nested.pi_desc = | |
2910 | (struct pi_desc *)((void *)vmx->nested.pi_desc + | |
2911 | (unsigned long)(vmcs12->posted_intr_desc_addr & | |
2912 | (PAGE_SIZE - 1))); | |
2913 | vmcs_write64(POSTED_INTR_DESC_ADDR, | |
2914 | page_to_phys(vmx->nested.pi_desc_page) + | |
2915 | (unsigned long)(vmcs12->posted_intr_desc_addr & | |
2916 | (PAGE_SIZE - 1))); | |
2917 | } | |
2918 | if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) | |
2919 | vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, | |
2920 | CPU_BASED_USE_MSR_BITMAPS); | |
2921 | else | |
2922 | vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, | |
2923 | CPU_BASED_USE_MSR_BITMAPS); | |
2924 | } | |
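/*
 * Editor's sketch of the translate-and-pin pattern used above (hypothetical
 * helper, not in the original source): convert an L1-provided GPA into a
 * pinned host page, then program the vmcs02 field with the host physical
 * address.  On failure the callers above fall back to disabling the feature
 * rather than failing the VM-entry.
 */
static int nested_map_guest_page(struct kvm_vcpu *vcpu, gpa_t gpa,
				 struct page **cache, unsigned long field)
{
	struct page *page = kvm_vcpu_gpa_to_page(vcpu, gpa);

	if (is_error_page(page))
		return -EFAULT;

	*cache = page;				 /* released on nested VM-exit */
	vmcs_write64(field, page_to_phys(page)); /* vmcs02 sees the host PA */
	return 0;
}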
2925 | ||
2926 | /* | |
2927 | * Intel's VMX Instruction Reference specifies a common set of prerequisites | |
2928 | * for running VMX instructions (except VMXON, whose prerequisites are | |
2929 | * slightly different). It also specifies what exception to inject when they are not met. | 
2930 | * Note that many of these exceptions have priority over VM exits, so they | |
2931 | * don't have to be checked again here. | |
2932 | */ | |
2933 | static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) | |
2934 | { | |
2935 | if (!to_vmx(vcpu)->nested.vmxon) { | |
2936 | kvm_queue_exception(vcpu, UD_VECTOR); | |
2937 | return 0; | |
2938 | } | |
2939 | ||
2940 | if (vmx_get_cpl(vcpu)) { | |
2941 | kvm_inject_gp(vcpu, 0); | |
2942 | return 0; | |
2943 | } | |
2944 | ||
2945 | return 1; | |
2946 | } | |
2947 | ||
2948 | static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) | |
2949 | { | |
2950 | u8 rvi = vmx_get_rvi(); | |
2951 | u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); | |
2952 | ||
2953 | return ((rvi & 0xf0) > (vppr & 0xf0)); | |
2954 | } | |
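/*
 * Editor's worked example (illustrative values): the upper nibble of a vector
 * is its priority class, so the 0xf0 masks above compare classes.  With
 * RVI = 0x61 (class 6) and VPPR = 0x40 (class 4), 0x60 > 0x40 and a pending
 * virtual interrupt is deliverable; with VPPR = 0x65 (also class 6) it is not.
 */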
2955 | ||
2956 | static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, | |
2957 | struct vmcs12 *vmcs12); | |
2958 | ||
2959 | /* | |
2960 | * If from_vmentry is false, this is being called from state restore (either RSM | |
2961 | * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. | |
2962 | * | 
2963 | * Returns: | 
2964 | *   0 - success, i.e. proceed with actual VMEnter | 
2965 | *   1 - consistency check VMExit | 
2966 | *  -1 - consistency check VMFail | 
2967 | */ | |
2968 | int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) | |
2969 | { | |
2970 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
2971 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
2972 | bool evaluate_pending_interrupts; | |
2973 | u32 exit_reason = EXIT_REASON_INVALID_STATE; | |
2974 | u32 exit_qual; | |
2975 | ||
2976 | evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & | |
2977 | (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); | |
2978 | if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) | |
2979 | evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); | |
2980 | ||
2981 | if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) | |
2982 | vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); | |
2983 | if (kvm_mpx_supported() && | |
2984 | !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) | |
2985 | vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); | |
2986 | ||
2987 | vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); | |
2988 | ||
2989 | prepare_vmcs02_early(vmx, vmcs12); | |
2990 | ||
2991 | if (from_vmentry) { | |
2992 | nested_get_vmcs12_pages(vcpu); | |
2993 | ||
2994 | if (nested_vmx_check_vmentry_hw(vcpu)) { | |
2995 | vmx_switch_vmcs(vcpu, &vmx->vmcs01); | |
2996 | return -1; | |
2997 | } | |
2998 | ||
16322a3b | 2999 | if (nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) |
55d2375e SC |
3000 | goto vmentry_fail_vmexit; |
3001 | } | |
3002 | ||
3003 | enter_guest_mode(vcpu); | |
3004 | if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) | |
3005 | vcpu->arch.tsc_offset += vmcs12->tsc_offset; | |
3006 | ||
3007 | if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) | |
3008 | goto vmentry_fail_vmexit_guest_mode; | |
3009 | ||
3010 | if (from_vmentry) { | |
3011 | exit_reason = EXIT_REASON_MSR_LOAD_FAIL; | |
3012 | exit_qual = nested_vmx_load_msr(vcpu, | |
3013 | vmcs12->vm_entry_msr_load_addr, | |
3014 | vmcs12->vm_entry_msr_load_count); | |
3015 | if (exit_qual) | |
3016 | goto vmentry_fail_vmexit_guest_mode; | |
3017 | } else { | |
3018 | /* | |
3019 | * The MMU is not initialized to point at the right entities yet and | |
3020 | * "get pages" would need to read data from the guest (i.e. we will | |
3021 | * need to perform gpa to hpa translation). Request a call | |
3022 | * to nested_get_vmcs12_pages before the next VM-entry. The MSRs | |
3023 | * have already been set at vmentry time and should not be reset. | |
3024 | */ | |
3025 | kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); | |
3026 | } | |
3027 | ||
3028 | /* | |
3029 | * If L1 had a pending IRQ/NMI until it executed | |
3030 | * VMLAUNCH/VMRESUME which wasn't delivered because it was | |
3031 | * disallowed (e.g. interrupts disabled), L0 needs to | |
3032 | * evaluate if this pending event should cause an exit from L2 | |
3033 | * to L1 or be delivered directly to L2 (e.g. in case L1 doesn't | 
3034 | * intercept EXTERNAL_INTERRUPT). | |
3035 | * | |
3036 | * Usually this would be handled by the processor noticing an | |
3037 | * IRQ/NMI window request, or checking RVI during evaluation of | |
3038 | * pending virtual interrupts. However, this setting was done | |
3039 | * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 | |
3040 | * to perform pending event evaluation by requesting a KVM_REQ_EVENT. | |
3041 | */ | |
3042 | if (unlikely(evaluate_pending_interrupts)) | |
3043 | kvm_make_request(KVM_REQ_EVENT, vcpu); | |
3044 | ||
359a6c3d PB |
3045 | /* |
3046 | * Do not start the preemption timer hrtimer until after we know | |
3047 | * we are successful, so that only nested_vmx_vmexit needs to cancel | |
3048 | * the timer. | |
3049 | */ | |
3050 | vmx->nested.preemption_timer_expired = false; | |
3051 | if (nested_cpu_has_preemption_timer(vmcs12)) | |
3052 | vmx_start_preemption_timer(vcpu); | |
3053 | ||
55d2375e SC |
3054 | /* |
3055 | * Note no nested_vmx_succeed or nested_vmx_fail here. At this point | |
3056 | * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet | |
3057 | * returned as far as L1 is concerned. It will only return (and set | |
3058 | * the success flag) when L2 exits (see nested_vmx_vmexit()). | |
3059 | */ | |
3060 | return 0; | |
3061 | ||
3062 | /* | |
3063 | * A failed consistency check that leads to a VMExit during L1's | |
3064 | * VMEnter to L2 is a variation of a normal VMexit, as explained in | |
3065 | * 26.7 "VM-entry failures during or after loading guest state". | |
3066 | */ | |
3067 | vmentry_fail_vmexit_guest_mode: | |
3068 | if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) | |
3069 | vcpu->arch.tsc_offset -= vmcs12->tsc_offset; | |
3070 | leave_guest_mode(vcpu); | |
3071 | ||
3072 | vmentry_fail_vmexit: | |
3073 | vmx_switch_vmcs(vcpu, &vmx->vmcs01); | |
3074 | ||
3075 | if (!from_vmentry) | |
3076 | return 1; | |
3077 | ||
3078 | load_vmcs12_host_state(vcpu, vmcs12); | |
3079 | vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; | |
3080 | vmcs12->exit_qualification = exit_qual; | |
3081 | if (enable_shadow_vmcs || vmx->nested.hv_evmcs) | |
3082 | vmx->nested.need_vmcs12_sync = true; | |
3083 | return 1; | |
3084 | } | |
3085 | ||
3086 | /* | |
3087 | * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 | |
3088 | * for running an L2 nested guest. | |
3089 | */ | |
3090 | static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) | |
3091 | { | |
3092 | struct vmcs12 *vmcs12; | |
3093 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
3094 | u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); | |
3095 | int ret; | |
3096 | ||
3097 | if (!nested_vmx_check_permission(vcpu)) | |
3098 | return 1; | |
3099 | ||
3100 | if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true)) | |
3101 | return 1; | |
3102 | ||
3103 | if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) | |
3104 | return nested_vmx_failInvalid(vcpu); | |
3105 | ||
3106 | vmcs12 = get_vmcs12(vcpu); | |
3107 | ||
3108 | /* | |
3109 | * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact | |
3110 | * that there *is* a valid VMCS pointer, RFLAGS.CF is set | |
3111 | * rather than RFLAGS.ZF, and no error number is stored to the | |
3112 | * VM-instruction error field. | |
3113 | */ | |
3114 | if (vmcs12->hdr.shadow_vmcs) | |
3115 | return nested_vmx_failInvalid(vcpu); | |
3116 | ||
3117 | if (vmx->nested.hv_evmcs) { | |
3118 | copy_enlightened_to_vmcs12(vmx); | |
3119 | /* Enlightened VMCS doesn't have launch state */ | |
3120 | vmcs12->launch_state = !launch; | |
3121 | } else if (enable_shadow_vmcs) { | |
3122 | copy_shadow_to_vmcs12(vmx); | |
3123 | } | |
3124 | ||
3125 | /* | |
3126 | * The nested entry process starts with enforcing various prerequisites | |
3127 | * on vmcs12 as required by the Intel SDM, and acting appropriately when | 
3128 | * they fail: As the SDM explains, some conditions should cause the | |
3129 | * instruction to fail, while others will cause the instruction to seem | |
3130 | * to succeed, but return an EXIT_REASON_INVALID_STATE. | |
3131 | * To speed up the normal (success) code path, we should avoid checking | |
3132 | * for misconfigurations which will anyway be caught by the processor | |
3133 | * when using the merged vmcs02. | |
3134 | */ | |
3135 | if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) | |
3136 | return nested_vmx_failValid(vcpu, | |
3137 | VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); | |
3138 | ||
3139 | if (vmcs12->launch_state == launch) | |
3140 | return nested_vmx_failValid(vcpu, | |
3141 | launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS | |
3142 | : VMXERR_VMRESUME_NONLAUNCHED_VMCS); | |
3143 | ||
16322a3b | 3144 | ret = nested_vmx_check_vmentry_prereqs(vcpu, vmcs12); |
55d2375e SC |
3145 | if (ret) |
3146 | return nested_vmx_failValid(vcpu, ret); | |
3147 | ||
3148 | /* | |
3149 | * We're finally done with prerequisite checking, and can start with | |
3150 | * the nested entry. | |
3151 | */ | |
3152 | vmx->nested.nested_run_pending = 1; | |
3153 | ret = nested_vmx_enter_non_root_mode(vcpu, true); | |
3154 | vmx->nested.nested_run_pending = !ret; | |
3155 | if (ret > 0) | |
3156 | return 1; | |
3157 | else if (ret) | |
3158 | return nested_vmx_failValid(vcpu, | |
3159 | VMXERR_ENTRY_INVALID_CONTROL_FIELD); | |
3160 | ||
3161 | /* Hide L1D cache contents from the nested guest. */ | |
3162 | vmx->vcpu.arch.l1tf_flush_l1d = true; | |
3163 | ||
3164 | /* | |
3165 | * Must happen outside of nested_vmx_enter_non_root_mode() as it will | |
3166 | * also be used as part of restoring nVMX state for | |
3167 | * snapshot restore (migration). | |
3168 | * | |
3169 | * In this flow, it is assumed that vmcs12 cache was | |
3170 | * transferred as part of captured nVMX state and should | 
3171 | * therefore not be read from guest memory (which may not | |
3172 | * exist on destination host yet). | |
3173 | */ | |
3174 | nested_cache_shadow_vmcs12(vcpu, vmcs12); | |
3175 | ||
3176 | /* | |
9ebdfe52 JM |
3177 | * If we're entering a halted L2 vcpu and the L2 vcpu won't be |
3178 | * awakened by event injection or by an NMI-window VM-exit or | |
3179 | * by an interrupt-window VM-exit, halt the vcpu. | |
55d2375e SC |
3180 | */ |
3181 | if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && | |
9ebdfe52 JM |
3182 | !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && |
3183 | !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) && | |
3184 | !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) && | |
3185 | (vmcs12->guest_rflags & X86_EFLAGS_IF))) { | |
55d2375e SC |
3186 | vmx->nested.nested_run_pending = 0; |
3187 | return kvm_vcpu_halt(vcpu); | |
3188 | } | |
3189 | return 1; | |
3190 | } | |
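/*
 * Editor's note on the launch_state check above: VMLAUNCH is only valid on a
 * "clear" VMCS and VMRESUME only on a "launched" one, so launch_state == launch
 * is precisely the error case:
 *   launch_state == 1 && VMLAUNCH -> VMXERR_VMLAUNCH_NONCLEAR_VMCS
 *   launch_state == 0 && VMRESUME -> VMXERR_VMRESUME_NONLAUNCHED_VMCS
 */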
3191 | ||
3192 | /* | |
3193 | * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date | |
3194 | * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). | 
3195 | * This function returns the new value we should put in vmcs12.guest_cr0. | |
3196 | * It's not enough to just return the vmcs02 GUEST_CR0. Rather, | |
3197 | * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now | |
3198 | * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 | |
3199 | * didn't trap the bit, because if L1 did, so would L0). | |
3200 | * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have | |
3201 | * been modified by L2, and L1 knows it. So just leave the old value of | |
3202 | * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 | |
3203 | * isn't relevant, because if L0 traps this bit it can set it to anything. | |
3204 | * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have | |
3205 | * changed these bits, and therefore they need to be updated, but L0 | |
3206 | * didn't necessarily allow them to be changed in GUEST_CR0 - and rather | |
3207 | * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. | |
3208 | */ | |
3209 | static inline unsigned long | |
3210 | vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |
3211 | { | |
3212 | return | |
3213 | /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | | |
3214 | /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | | |
3215 | /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | | |
3216 | vcpu->arch.cr0_guest_owned_bits)); | |
3217 | } | |
3218 | ||
3219 | static inline unsigned long | |
3220 | vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |
3221 | { | |
3222 | return | |
3223 | /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | | |
3224 | /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | | |
3225 | /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | | |
3226 | vcpu->arch.cr4_guest_owned_bits)); | |
3227 | } | |
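/*
 * Editor's sketch (hypothetical helper, not in the original source) of the
 * three-way merge performed by the two helpers above:
 *   case 1: bits neither L0 nor L1 trap -> take the hardware (vmcs02) value
 *   case 2: bits L1 traps               -> keep L1's vmcs12 value
 *   case 3: bits only L0 traps          -> take the vmcs02 read shadow
 */
static unsigned long merge_guest_cr(unsigned long hw_cr, unsigned long l1_cr,
				    unsigned long read_shadow,
				    unsigned long l1_mask,
				    unsigned long guest_owned)
{
	return (hw_cr & guest_owned) |
	       (l1_cr & l1_mask) |
	       (read_shadow & ~(l1_mask | guest_owned));
}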
3228 | ||
3229 | static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, | |
3230 | struct vmcs12 *vmcs12) | |
3231 | { | |
3232 | u32 idt_vectoring; | |
3233 | unsigned int nr; | |
3234 | ||
3235 | if (vcpu->arch.exception.injected) { | |
3236 | nr = vcpu->arch.exception.nr; | |
3237 | idt_vectoring = nr | VECTORING_INFO_VALID_MASK; | |
3238 | ||
3239 | if (kvm_exception_is_soft(nr)) { | |
3240 | vmcs12->vm_exit_instruction_len = | |
3241 | vcpu->arch.event_exit_inst_len; | |
3242 | idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; | |
3243 | } else | |
3244 | idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; | |
3245 | ||
3246 | if (vcpu->arch.exception.has_error_code) { | |
3247 | idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; | |
3248 | vmcs12->idt_vectoring_error_code = | |
3249 | vcpu->arch.exception.error_code; | |
3250 | } | |
3251 | ||
3252 | vmcs12->idt_vectoring_info_field = idt_vectoring; | |
3253 | } else if (vcpu->arch.nmi_injected) { | |
3254 | vmcs12->idt_vectoring_info_field = | |
3255 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; | |
3256 | } else if (vcpu->arch.interrupt.injected) { | |
3257 | nr = vcpu->arch.interrupt.nr; | |
3258 | idt_vectoring = nr | VECTORING_INFO_VALID_MASK; | |
3259 | ||
3260 | if (vcpu->arch.interrupt.soft) { | |
3261 | idt_vectoring |= INTR_TYPE_SOFT_INTR; | |
3262 | vmcs12->vm_entry_instruction_len = | |
3263 | vcpu->arch.event_exit_inst_len; | |
3264 | } else | |
3265 | idt_vectoring |= INTR_TYPE_EXT_INTR; | |
3266 | ||
3267 | vmcs12->idt_vectoring_info_field = idt_vectoring; | |
3268 | } | |
3269 | } | |
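/*
 * Editor's sketch (hypothetical helper) of the IDT-vectoring encoding built
 * above, per the SDM layout: bits 7:0 vector, bits 10:8 type (0 = external
 * interrupt, 2 = NMI, 3 = hardware exception, 4 = software interrupt,
 * 6 = software exception), bit 11 deliver-error-code, bit 31 valid.  Note
 * that "type" here is the raw 3-bit value; the kernel's INTR_TYPE_* macros
 * are already pre-shifted.
 */
static u32 make_idt_vectoring_info(u8 vector, u32 type, bool has_error_code)
{
	u32 info = vector | (type << 8) | VECTORING_INFO_VALID_MASK;

	if (has_error_code)
		info |= VECTORING_INFO_DELIVER_CODE_MASK;
	return info;
}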
3270 | ||
3271 | ||
3272 | static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) | |
3273 | { | |
3274 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
3275 | gfn_t gfn; | |
3276 | ||
3277 | /* | |
3278 | * Don't need to mark the APIC access page dirty; it is never | |
3279 | * written to by the CPU during APIC virtualization. | |
3280 | */ | |
3281 | ||
3282 | if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { | |
3283 | gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; | |
3284 | kvm_vcpu_mark_page_dirty(vcpu, gfn); | |
3285 | } | |
3286 | ||
3287 | if (nested_cpu_has_posted_intr(vmcs12)) { | |
3288 | gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; | |
3289 | kvm_vcpu_mark_page_dirty(vcpu, gfn); | |
3290 | } | |
3291 | } | |
3292 | ||
3293 | static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) | |
3294 | { | |
3295 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
3296 | int max_irr; | |
3297 | void *vapic_page; | |
3298 | u16 status; | |
3299 | ||
3300 | if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) | |
3301 | return; | |
3302 | ||
3303 | vmx->nested.pi_pending = false; | |
3304 | if (!pi_test_and_clear_on(vmx->nested.pi_desc)) | |
3305 | return; | |
3306 | ||
3307 | max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); | |
3308 | if (max_irr != 256) { | |
3309 | vapic_page = kmap(vmx->nested.virtual_apic_page); | |
3310 | __kvm_apic_update_irr(vmx->nested.pi_desc->pir, | |
3311 | vapic_page, &max_irr); | |
3312 | kunmap(vmx->nested.virtual_apic_page); | |
3313 | ||
3314 | status = vmcs_read16(GUEST_INTR_STATUS); | |
3315 | if ((u8)max_irr > ((u8)status & 0xff)) { | |
3316 | status &= ~0xff; | |
3317 | status |= (u8)max_irr; | |
3318 | vmcs_write16(GUEST_INTR_STATUS, status); | |
3319 | } | |
3320 | } | |
3321 | ||
3322 | nested_mark_vmcs12_pages_dirty(vcpu); | |
3323 | } | |
3324 | ||
3325 | static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, | |
3326 | unsigned long exit_qual) | |
3327 | { | |
3328 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
3329 | unsigned int nr = vcpu->arch.exception.nr; | |
3330 | u32 intr_info = nr | INTR_INFO_VALID_MASK; | |
3331 | ||
3332 | if (vcpu->arch.exception.has_error_code) { | |
3333 | vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; | |
3334 | intr_info |= INTR_INFO_DELIVER_CODE_MASK; | |
3335 | } | |
3336 | ||
3337 | if (kvm_exception_is_soft(nr)) | |
3338 | intr_info |= INTR_TYPE_SOFT_EXCEPTION; | |
3339 | else | |
3340 | intr_info |= INTR_TYPE_HARD_EXCEPTION; | |
3341 | ||
3342 | if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && | |
3343 | vmx_get_nmi_mask(vcpu)) | |
3344 | intr_info |= INTR_INFO_UNBLOCK_NMI; | |
3345 | ||
3346 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); | |
3347 | } | |
3348 | ||
3349 | static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) | |
3350 | { | |
3351 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
3352 | unsigned long exit_qual; | |
3353 | bool block_nested_events = | |
3354 | vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); | |
3355 | ||
3356 | if (vcpu->arch.exception.pending && | |
3357 | nested_vmx_check_exception(vcpu, &exit_qual)) { | |
3358 | if (block_nested_events) | |
3359 | return -EBUSY; | |
3360 | nested_vmx_inject_exception_vmexit(vcpu, exit_qual); | |
3361 | return 0; | |
3362 | } | |
3363 | ||
3364 | if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && | |
3365 | vmx->nested.preemption_timer_expired) { | |
3366 | if (block_nested_events) | |
3367 | return -EBUSY; | |
3368 | nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); | |
3369 | return 0; | |
3370 | } | |
3371 | ||
3372 | if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { | |
3373 | if (block_nested_events) | |
3374 | return -EBUSY; | |
3375 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, | |
3376 | NMI_VECTOR | INTR_TYPE_NMI_INTR | | |
3377 | INTR_INFO_VALID_MASK, 0); | |
3378 | /* | |
3379 | * The NMI-triggered VM exit counts as injection: | |
3380 | * clear this one and block further NMIs. | |
3381 | */ | |
3382 | vcpu->arch.nmi_pending = 0; | |
3383 | vmx_set_nmi_mask(vcpu, true); | |
3384 | return 0; | |
3385 | } | |
3386 | ||
3387 | if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && | |
3388 | nested_exit_on_intr(vcpu)) { | |
3389 | if (block_nested_events) | |
3390 | return -EBUSY; | |
3391 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); | |
3392 | return 0; | |
3393 | } | |
3394 | ||
3395 | vmx_complete_nested_posted_interrupt(vcpu); | |
3396 | return 0; | |
3397 | } | |
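/*
 * Editor's note summarizing the function above: nested events are considered
 * in priority order -- pending exception, expired preemption timer, NMI,
 * external interrupt -- and -EBUSY defers the synthesized VM-exit whenever an
 * injection is already in flight (nested_run_pending or a reinjection).
 */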
3398 | ||
3399 | static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) | |
3400 | { | |
3401 | ktime_t remaining = | |
3402 | hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); | |
3403 | u64 value; | |
3404 | ||
3405 | if (ktime_to_ns(remaining) <= 0) | |
3406 | return 0; | |
3407 | ||
3408 | value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; | |
3409 | do_div(value, 1000000); | |
3410 | return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; | |
3411 | } | |
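/*
 * Editor's worked example (illustrative numbers) for the conversion above:
 * with remaining = 1,000,000 ns and virtual_tsc_khz = 2,000,000 (a 2 GHz
 * guest TSC),
 *   ticks = 1,000,000 * 2,000,000 / 1,000,000 = 2,000,000 TSC cycles
 *   value = ticks >> 5 = 62,500
 * i.e. the emulated preemption timer counts once per
 * 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE (= 32) TSC cycles, hence the shift.
 */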
3412 | ||
3413 | /* | |
3414 | * Update the guest state fields of vmcs12 to reflect changes that | |
3415 | * occurred while L2 was running. (The "IA-32e mode guest" bit of the | |
3416 | * VM-entry controls is also updated, since this is really a guest | |
3417 | * state bit.) | |
3418 | */ | |
3419 | static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |
3420 | { | |
3421 | vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); | |
3422 | vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); | |
3423 | ||
3424 | vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); | |
3425 | vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); | |
3426 | vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); | |
3427 | ||
3428 | vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); | |
3429 | vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); | |
3430 | vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); | |
3431 | vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); | |
3432 | vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); | |
3433 | vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); | |
3434 | vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); | |
3435 | vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); | |
3436 | vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); | |
3437 | vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); | |
3438 | vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); | |
3439 | vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); | |
3440 | vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); | |
3441 | vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); | |
3442 | vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); | |
3443 | vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); | |
3444 | vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); | |
3445 | vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); | |
3446 | vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); | |
3447 | vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); | |
3448 | vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); | |
3449 | vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); | |
3450 | vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); | |
3451 | vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); | |
3452 | vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); | |
3453 | vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); | |
3454 | vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); | |
3455 | vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); | |
3456 | vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); | |
3457 | vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); | |
3458 | vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); | |
3459 | vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); | |
3460 | vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); | |
3461 | vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); | |
3462 | vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); | |
3463 | vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); | |
3464 | ||
3465 | vmcs12->guest_interruptibility_info = | |
3466 | vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | |
3467 | vmcs12->guest_pending_dbg_exceptions = | |
3468 | vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); | |
3469 | if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) | |
3470 | vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; | |
3471 | else | |
3472 | vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; | |
3473 | ||
b4b65b56 PB |
3474 | if (nested_cpu_has_preemption_timer(vmcs12) && |
3475 | vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) | |
55d2375e SC |
3476 | vmcs12->vmx_preemption_timer_value = |
3477 | vmx_get_preemption_timer_value(vcpu); | |
55d2375e SC |
3478 | |
3479 | /* | |
3480 | * In some cases (usually, nested EPT), L2 is allowed to change its | |
3481 | * own CR3 without exiting. If it has changed it, we must keep it. | |
3482 | * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined | |
3483 | * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. | |
3484 | * | |
3485 | * Additionally, restore L2's PDPTR to vmcs12. | |
3486 | */ | |
3487 | if (enable_ept) { | |
3488 | vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); | |
3489 | vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); | |
3490 | vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); | |
3491 | vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); | |
3492 | vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); | |
3493 | } | |
3494 | ||
3495 | vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); | |
3496 | ||
3497 | if (nested_cpu_has_vid(vmcs12)) | |
3498 | vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); | |
3499 | ||
3500 | vmcs12->vm_entry_controls = | |
3501 | (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | | |
3502 | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); | |
3503 | ||
3504 | if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) { | |
3505 | kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); | |
3506 | vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); | |
3507 | } | |
3508 | ||
3509 | /* TODO: These cannot have changed unless we have MSR bitmaps and | |
3510 | * the relevant bit asks not to trap the change */ | |
3511 | if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) | |
3512 | vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); | |
3513 | if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) | |
3514 | vmcs12->guest_ia32_efer = vcpu->arch.efer; | |
3515 | vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); | |
3516 | vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); | |
3517 | vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); | |
3518 | if (kvm_mpx_supported()) | |
3519 | vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); | |
3520 | } | |
3521 | ||
3522 | /* | |
3523 | * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits | |
3524 | * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), | |
3525 | * and this function updates it to reflect the changes to the guest state while | |
3526 | * L2 was running (and perhaps made some exits which were handled directly by L0 | |
3527 | * without going back to L1), and to reflect the exit reason. | |
3528 | * Note that we do not have to copy here all VMCS fields, just those that | |
3529 | * could have changed by the L2 guest or the exit - i.e., the guest-state and | |
3530 | * exit-information fields only. Other fields are modified by L1 with VMWRITE, | |
3531 | * which already writes to vmcs12 directly. | |
3532 | */ | |
3533 | static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, | |
3534 | u32 exit_reason, u32 exit_intr_info, | |
3535 | unsigned long exit_qualification) | |
3536 | { | |
3537 | /* update guest state fields: */ | |
3538 | sync_vmcs12(vcpu, vmcs12); | |
3539 | ||
3540 | /* update exit information fields: */ | |
3541 | ||
3542 | vmcs12->vm_exit_reason = exit_reason; | |
3543 | vmcs12->exit_qualification = exit_qualification; | |
3544 | vmcs12->vm_exit_intr_info = exit_intr_info; | |
3545 | ||
3546 | vmcs12->idt_vectoring_info_field = 0; | |
3547 | vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | |
3548 | vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | |
3549 | ||
3550 | if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { | |
3551 | vmcs12->launch_state = 1; | |
3552 | ||
3553 | /* vm_entry_intr_info_field is cleared on exit. Emulate this | |
3554 | * instead of reading the real value. */ | |
3555 | vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; | |
3556 | ||
3557 | /* | |
3558 | * Transfer the event that L0 or L1 may have wanted to inject into | 
3559 | * L2 to IDT_VECTORING_INFO_FIELD. | |
3560 | */ | |
3561 | vmcs12_save_pending_event(vcpu, vmcs12); | |
a0d4f803 KS |
3562 | |
3563 | /* | |
3564 | * According to spec, there's no need to store the guest's | |
3565 | * MSRs if the exit is due to a VM-entry failure that occurs | |
3566 | * during or after loading the guest state. Since this exit | |
3567 | * does not fall in that category, we need to save the MSRs. | |
3568 | */ | |
3569 | if (nested_vmx_store_msr(vcpu, | |
3570 | vmcs12->vm_exit_msr_store_addr, | |
3571 | vmcs12->vm_exit_msr_store_count)) | |
3572 | nested_vmx_abort(vcpu, | |
3573 | VMX_ABORT_SAVE_GUEST_MSR_FAIL); | |
55d2375e SC |
3574 | } |
3575 | ||
3576 | /* | |
3577 | * Drop what we picked up for L2 via vmx_complete_interrupts. It is | |
3578 | * preserved above and would only end up incorrectly in L1. | |
3579 | */ | |
3580 | vcpu->arch.nmi_injected = false; | |
3581 | kvm_clear_exception_queue(vcpu); | |
3582 | kvm_clear_interrupt_queue(vcpu); | |
3583 | } | |
3584 | ||
3585 | /* | |
3586 | * A part of what we need to do when the nested L2 guest exits and we want to | 
3587 | * run its L1 parent, is to reset L1's guest state to the host state specified | |
3588 | * in vmcs12. | |
3589 | * This function is to be called not only on normal nested exit, but also on | |
3590 | * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry | |
3591 | * Failures During or After Loading Guest State"). | |
3592 | * This function should be called when the active VMCS is L1's (vmcs01). | |
3593 | */ | |
3594 | static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, | |
3595 | struct vmcs12 *vmcs12) | |
3596 | { | |
3597 | struct kvm_segment seg; | |
3598 | u32 entry_failure_code; | |
3599 | ||
3600 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) | |
3601 | vcpu->arch.efer = vmcs12->host_ia32_efer; | |
3602 | else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) | |
3603 | vcpu->arch.efer |= (EFER_LMA | EFER_LME); | |
3604 | else | |
3605 | vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); | |
3606 | vmx_set_efer(vcpu, vcpu->arch.efer); | |
3607 | ||
3608 | kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); | |
3609 | kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); | |
3610 | vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); | |
3611 | vmx_set_interrupt_shadow(vcpu, 0); | |
3612 | ||
3613 | /* | |
3614 | * Note that calling vmx_set_cr0 is important, even if cr0 hasn't | |
3615 | * actually changed, because vmx_set_cr0 refers to efer set above. | |
3616 | * | |
3617 | * CR0_GUEST_HOST_MASK is already set in the original vmcs01 | |
3618 | * (KVM doesn't change it); | |
3619 | */ | |
3620 | vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; | |
3621 | vmx_set_cr0(vcpu, vmcs12->host_cr0); | |
3622 | ||
3623 | /* Same as above - no reason to call set_cr4_guest_host_mask(). */ | |
3624 | vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); | |
3625 | vmx_set_cr4(vcpu, vmcs12->host_cr4); | |
3626 | ||
3627 | nested_ept_uninit_mmu_context(vcpu); | |
3628 | ||
3629 | /* | |
3630 | * Only PDPTE load can fail as the value of cr3 was checked on entry and | |
3631 | * couldn't have changed. | |
3632 | */ | |
3633 | if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) | |
3634 | nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); | |
3635 | ||
3636 | if (!enable_ept) | |
3637 | vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; | |
3638 | ||
3639 | /* | |
3640 | * If vmcs01 doesn't use VPID, CPU flushes TLB on every | |
3641 | * VMEntry/VMExit. Thus, no need to flush TLB. | |
3642 | * | |
3643 | * If vmcs12 doesn't use VPID, L1 expects TLB to be | |
3644 | * flushed on every VMEntry/VMExit. | |
3645 | * | |
3646 | * Otherwise, we can preserve TLB entries as long as we are | |
3647 | * able to tag L1 TLB entries differently than L2 TLB entries. | |
3648 | * | |
3649 | * If vmcs12 uses EPT, we need to execute this flush on EPTP01 | |
3650 | * and therefore we request the TLB flush to happen only after VMCS EPTP | |
3651 | * has been set by KVM_REQ_LOAD_CR3. | |
3652 | */ | |
3653 | if (enable_vpid && | |
3654 | (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) { | |
3655 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | |
3656 | } | |
3657 | ||
3658 | vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); | |
3659 | vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); | |
3660 | vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); | |
3661 | vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); | |
3662 | vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); | |
3663 | vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); | |
3664 | vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); | |
3665 | ||
3666 | /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ | |
3667 | if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) | |
3668 | vmcs_write64(GUEST_BNDCFGS, 0); | |
3669 | ||
3670 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { | |
3671 | vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); | |
3672 | vcpu->arch.pat = vmcs12->host_ia32_pat; | |
3673 | } | |
3674 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) | |
3675 | vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, | |
3676 | vmcs12->host_ia32_perf_global_ctrl); | |
3677 | ||
3678 | /* Set L1 segment info according to Intel SDM | |
3679 | 27.5.2 Loading Host Segment and Descriptor-Table Registers */ | |
3680 | seg = (struct kvm_segment) { | |
3681 | .base = 0, | |
3682 | .limit = 0xFFFFFFFF, | |
3683 | .selector = vmcs12->host_cs_selector, | |
3684 | .type = 11, | |
3685 | .present = 1, | |
3686 | .s = 1, | |
3687 | .g = 1 | |
3688 | }; | |
3689 | if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) | |
3690 | seg.l = 1; | |
3691 | else | |
3692 | seg.db = 1; | |
3693 | vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); | |
3694 | seg = (struct kvm_segment) { | |
3695 | .base = 0, | |
3696 | .limit = 0xFFFFFFFF, | |
3697 | .type = 3, | |
3698 | .present = 1, | |
3699 | .s = 1, | |
3700 | .db = 1, | |
3701 | .g = 1 | |
3702 | }; | |
3703 | seg.selector = vmcs12->host_ds_selector; | |
3704 | vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); | |
3705 | seg.selector = vmcs12->host_es_selector; | |
3706 | vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); | |
3707 | seg.selector = vmcs12->host_ss_selector; | |
3708 | vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); | |
3709 | seg.selector = vmcs12->host_fs_selector; | |
3710 | seg.base = vmcs12->host_fs_base; | |
3711 | vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); | |
3712 | seg.selector = vmcs12->host_gs_selector; | |
3713 | seg.base = vmcs12->host_gs_base; | |
3714 | vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); | |
3715 | seg = (struct kvm_segment) { | |
3716 | .base = vmcs12->host_tr_base, | |
3717 | .limit = 0x67, | |
3718 | .selector = vmcs12->host_tr_selector, | |
3719 | .type = 11, | |
3720 | .present = 1 | |
3721 | }; | |
3722 | vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); | |
3723 | ||
3724 | kvm_set_dr(vcpu, 7, 0x400); | |
3725 | vmcs_write64(GUEST_IA32_DEBUGCTL, 0); | |
3726 | ||
3727 | if (cpu_has_vmx_msr_bitmap()) | |
3728 | vmx_update_msr_bitmap(vcpu); | |
3729 | ||
3730 | if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, | |
3731 | vmcs12->vm_exit_msr_load_count)) | |
3732 | nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); | |
3733 | } | |
3734 | ||
3735 | static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) | |
3736 | { | |
3737 | struct shared_msr_entry *efer_msr; | |
3738 | unsigned int i; | |
3739 | ||
3740 | if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) | |
3741 | return vmcs_read64(GUEST_IA32_EFER); | |
3742 | ||
3743 | if (cpu_has_load_ia32_efer()) | |
3744 | return host_efer; | |
3745 | ||
3746 | for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { | |
3747 | if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) | |
3748 | return vmx->msr_autoload.guest.val[i].value; | |
3749 | } | |
3750 | ||
3751 | efer_msr = find_msr_entry(vmx, MSR_EFER); | |
3752 | if (efer_msr) | |
3753 | return efer_msr->data; | |
3754 | ||
3755 | return host_efer; | |
3756 | } | |
3757 | ||
3758 | static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) | |
3759 | { | |
3760 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
3761 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
3762 | struct vmx_msr_entry g, h; | |
3763 | struct msr_data msr; | |
3764 | gpa_t gpa; | |
3765 | u32 i, j; | |
3766 | ||
3767 | vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); | |
3768 | ||
3769 | if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { | |
3770 | /* | |
3771 | * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set | |
3772 | * as vmcs01.GUEST_DR7 contains a userspace defined value | |
3773 | * and vcpu->arch.dr7 is not squirreled away before the | |
3774 | * nested VMENTER (not worth adding a variable in nested_vmx). | |
3775 | */ | |
3776 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) | |
3777 | kvm_set_dr(vcpu, 7, DR7_FIXED_1); | |
3778 | else | |
3779 | WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); | |
3780 | } | |
3781 | ||
3782 | /* | |
3783 | * Note that calling vmx_set_{efer,cr0,cr4} is important as they | |
3784 | * handle a variety of side effects to KVM's software model. | |
3785 | */ | |
3786 | vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); | |
3787 | ||
3788 | vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; | |
3789 | vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); | |
3790 | ||
3791 | vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); | |
3792 | vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); | |
3793 | ||
3794 | nested_ept_uninit_mmu_context(vcpu); | |
2b27924b PB |
3795 | |
3796 | /* | |
3797 | * This is only valid if EPT is in use, otherwise the vmcs01 GUEST_CR3 | |
3798 | * points to shadow pages! Fortunately we only get here after a WARN_ON | |
3799 | * if EPT is disabled, so a VMabort is perfectly fine. | |
3800 | */ | |
3801 | if (enable_ept) { | |
3802 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); | |
3803 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | |
3804 | } else { | |
3805 | nested_vmx_abort(vcpu, VMX_ABORT_VMCS_CORRUPTED); | |
3806 | } | |
55d2375e SC |
3807 | |
3808 | /* | |
3809 | * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs | |
3810 | * from vmcs01 (if necessary). The PDPTRs are not loaded on | |
3811 | * VMFail, like everything else we just need to ensure our | |
3812 | * software model is up-to-date. | |
3813 | */ | |
3814 | ept_save_pdptrs(vcpu); | |
3815 | ||
3816 | kvm_mmu_reset_context(vcpu); | |
3817 | ||
3818 | if (cpu_has_vmx_msr_bitmap()) | |
3819 | vmx_update_msr_bitmap(vcpu); | |
3820 | ||
3821 | /* | |
3822 | * This nasty bit of open coding is a compromise between blindly | |
3823 | * loading L1's MSRs using the exit load lists (incorrect emulation | |
3824 | * of VMFail), leaving the nested VM's MSRs in the software model | |
3825 | * (incorrect behavior) and snapshotting the modified MSRs (too | |
3826 | * expensive since the lists are unbounded by hardware). For each | 
3827 | * MSR that was (prematurely) loaded from the nested VMEntry load | |
3828 | * list, reload it from the exit load list if it exists and differs | |
3829 | * from the guest value. The intent is to stuff host state as | |
3830 | * silently as possible, not to fully process the exit load list. | |
3831 | */ | |
3832 | msr.host_initiated = false; | |
3833 | for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { | |
3834 | gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); | |
3835 | if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { | |
3836 | pr_debug_ratelimited( | |
3837 | "%s read MSR index failed (%u, 0x%08llx)\n", | |
3838 | __func__, i, gpa); | |
3839 | goto vmabort; | |
3840 | } | |
3841 | ||
3842 | for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { | |
3843 | gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); | |
3844 | if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { | |
3845 | pr_debug_ratelimited( | |
3846 | "%s read MSR failed (%u, 0x%08llx)\n", | |
3847 | __func__, j, gpa); | |
3848 | goto vmabort; | |
3849 | } | |
3850 | if (h.index != g.index) | |
3851 | continue; | |
3852 | if (h.value == g.value) | |
3853 | break; | |
3854 | ||
3855 | if (nested_vmx_load_msr_check(vcpu, &h)) { | |
3856 | pr_debug_ratelimited( | |
3857 | "%s check failed (%u, 0x%x, 0x%x)\n", | |
3858 | __func__, j, h.index, h.reserved); | |
3859 | goto vmabort; | |
3860 | } | |
3861 | ||
3862 | msr.index = h.index; | |
3863 | msr.data = h.value; | |
3864 | if (kvm_set_msr(vcpu, &msr)) { | |
3865 | pr_debug_ratelimited( | |
3866 | "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", | |
3867 | __func__, j, h.index, h.value); | |
3868 | goto vmabort; | |
3869 | } | |
3870 | } | |
3871 | } | |
3872 | ||
3873 | return; | |
3874 | ||
3875 | vmabort: | |
3876 | nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); | |
3877 | } | |
3878 | ||
3879 | /* | |
3880 | * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 | |
3881 | * and modify vmcs12 to make it see what it would expect to see there if | |
3882 | * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) | |
3883 | */ | |
3884 | void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, | |
3885 | u32 exit_intr_info, unsigned long exit_qualification) | |
3886 | { | |
3887 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
3888 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
3889 | ||
3890 | /* trying to cancel vmlaunch/vmresume is a bug */ | |
3891 | WARN_ON_ONCE(vmx->nested.nested_run_pending); | |
3892 | ||
3893 | leave_guest_mode(vcpu); | |
3894 | ||
b4b65b56 PB |
3895 | if (nested_cpu_has_preemption_timer(vmcs12)) |
3896 | hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); | |
3897 | ||
55d2375e SC |
3898 | if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) |
3899 | vcpu->arch.tsc_offset -= vmcs12->tsc_offset; | |
3900 | ||
3901 | if (likely(!vmx->fail)) { | |
3902 | if (exit_reason == -1) | |
3903 | sync_vmcs12(vcpu, vmcs12); | |
3904 | else | |
3905 | prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, | |
3906 | exit_qualification); | |
3907 | ||
3908 | /* | |
3909 | * Must happen outside of sync_vmcs12() as it will | |
3910 | * also be used to capture vmcs12 cache as part of | |
3911 | * capturing nVMX state for snapshot (migration). | |
3912 | * | |
3913 | * Otherwise, this flush will dirty guest memory at a | |
3914 | * point it is already assumed by user-space to be | |
3915 | * immutable. | |
3916 | */ | |
3917 | nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); | |
55d2375e SC |
3918 | } else { |
3919 | /* | |
3920 | * The only expected VM-instruction error is "VM entry with | |
3921 | * invalid control field(s)." Anything else indicates a | |
3922 | * problem with L0. And we should never get here with a | |
3923 | * VMFail of any type if early consistency checks are enabled. | |
3924 | */ | |
3925 | WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != | |
3926 | VMXERR_ENTRY_INVALID_CONTROL_FIELD); | |
3927 | WARN_ON_ONCE(nested_early_check); | |
3928 | } | |
3929 | ||
3930 | vmx_switch_vmcs(vcpu, &vmx->vmcs01); | |
3931 | ||
3932 | /* Update any VMCS fields that might have changed while L2 ran */ | |
3933 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); | |
3934 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); | |
3935 | vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); | |
3936 | ||
3937 | if (kvm_has_tsc_control) | |
3938 | decache_tsc_multiplier(vmx); | |
3939 | ||
3940 | if (vmx->nested.change_vmcs01_virtual_apic_mode) { | |
3941 | vmx->nested.change_vmcs01_virtual_apic_mode = false; | |
3942 | vmx_set_virtual_apic_mode(vcpu); | |
3943 | } else if (!nested_cpu_has_ept(vmcs12) && | |
3944 | nested_cpu_has2(vmcs12, | |
3945 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { | |
3946 | vmx_flush_tlb(vcpu, true); | |
3947 | } | |
3948 | ||
55d2375e SC |
3949 | /* Unpin physical memory we referred to in vmcs02 */ |
3950 | if (vmx->nested.apic_access_page) { | |
3951 | kvm_release_page_dirty(vmx->nested.apic_access_page); | |
3952 | vmx->nested.apic_access_page = NULL; | |
3953 | } | |
3954 | if (vmx->nested.virtual_apic_page) { | |
3955 | kvm_release_page_dirty(vmx->nested.virtual_apic_page); | |
3956 | vmx->nested.virtual_apic_page = NULL; | |
3957 | } | |
3958 | if (vmx->nested.pi_desc_page) { | |
3959 | kunmap(vmx->nested.pi_desc_page); | |
3960 | kvm_release_page_dirty(vmx->nested.pi_desc_page); | |
3961 | vmx->nested.pi_desc_page = NULL; | |
3962 | vmx->nested.pi_desc = NULL; | |
3963 | } | |
3964 | ||
3965 | /* | |
3966 | * We are now running in L2; the mmu_notifier will force a reload of the | 
3967 | * page's hpa for the L2 vmcs. We need to reload it for L1 before entering L1. | 
3968 | */ | |
3969 | kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); | |
3970 | ||
3971 | if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs)) | |
3972 | vmx->nested.need_vmcs12_sync = true; | |
3973 | ||
3974 | /* in case we halted in L2 */ | |
3975 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | |
3976 | ||
3977 | if (likely(!vmx->fail)) { | |
3978 | /* | |
3979 | * TODO: SDM says that with acknowledge interrupt on | |
3980 | * exit, bit 31 of the VM-exit interrupt information | |
3981 | * (valid interrupt) is always set to 1 on | |
3982 | * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't | |
3983 | * need kvm_cpu_has_interrupt(). See the commit | |
3984 | * message for details. | |
3985 | */ | |
3986 | if (nested_exit_intr_ack_set(vcpu) && | |
3987 | exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && | |
3988 | kvm_cpu_has_interrupt(vcpu)) { | |
3989 | int irq = kvm_cpu_get_interrupt(vcpu); | |
3990 | WARN_ON(irq < 0); | |
3991 | vmcs12->vm_exit_intr_info = irq | | |
3992 | INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; | |
3993 | } | |
3994 | ||
3995 | if (exit_reason != -1) | |
3996 | trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, | |
3997 | vmcs12->exit_qualification, | |
3998 | vmcs12->idt_vectoring_info_field, | |
3999 | vmcs12->vm_exit_intr_info, | |
4000 | vmcs12->vm_exit_intr_error_code, | |
4001 | KVM_ISA_VMX); | |
4002 | ||
4003 | load_vmcs12_host_state(vcpu, vmcs12); | |
4004 | ||
4005 | return; | |
4006 | } | |
4007 | ||
4008 | /* | |
4009 | * After an early L2 VM-entry failure, we're now back | |
4010 | * in L1 which thinks it just finished a VMLAUNCH or | |
4011 | * VMRESUME instruction, so we need to set the failure | |
4012 | * flag and the VM-instruction error field of the VMCS | |
4013 | * accordingly, and skip the emulated instruction. | |
4014 | */ | |
4015 | (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | |
4016 | ||
4017 | /* | |
4018 | * Restore L1's host state to KVM's software model. We're here | |
4019 | * because a consistency check was caught by hardware, which | |
4020 | * means some amount of guest state has been propagated to KVM's | |
4021 | * model and needs to be unwound to the host's state. | |
4022 | */ | |
4023 | nested_vmx_restore_host_state(vcpu); | |
4024 | ||
4025 | vmx->fail = 0; | |
4026 | } | |
4027 | ||
4028 | /* | |
4029 | * Decode the memory-address operand of a vmx instruction, as recorded on an | |
4030 | * exit caused by such an instruction (run by a guest hypervisor). | |
4031 | * On success, returns 0. When the operand is invalid, returns 1 and throws | |
4032 | * #UD or #GP. | |
4033 | */ | |
4034 | int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, | |
4035 | u32 vmx_instruction_info, bool wr, gva_t *ret) | |
4036 | { | |
4037 | gva_t off; | |
4038 | bool exn; | |
4039 | struct kvm_segment s; | |
4040 | ||
4041 | /* | |
4042 | * According to Vol. 3B, "Information for VM Exits Due to Instruction | |
4043 | * Execution", on an exit, vmx_instruction_info holds most of the | |
4044 | * addressing components of the operand. Only the displacement part | |
4045 | * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). | |
4046 | * For how an actual address is calculated from all these components, | |
4047 | * refer to Vol. 1, "Operand Addressing". | |
4048 | */ | |
4049 | int scaling = vmx_instruction_info & 3; | |
4050 | int addr_size = (vmx_instruction_info >> 7) & 7; | |
4051 | bool is_reg = vmx_instruction_info & (1u << 10); | |
4052 | int seg_reg = (vmx_instruction_info >> 15) & 7; | |
4053 | int index_reg = (vmx_instruction_info >> 18) & 0xf; | |
4054 | bool index_is_valid = !(vmx_instruction_info & (1u << 22)); | |
4055 | int base_reg = (vmx_instruction_info >> 23) & 0xf; | |
4056 | bool base_is_valid = !(vmx_instruction_info & (1u << 27)); | |
4057 | ||
4058 | if (is_reg) { | |
4059 | kvm_queue_exception(vcpu, UD_VECTOR); | |
4060 | return 1; | |
4061 | } | |
4062 | ||
4063 | /* Addr = segment_base + offset */ | |
4064 | /* offset = base + [index * scale] + displacement */ | |
4065 | off = exit_qualification; /* holds the displacement */ | |
946c522b SC |
4066 | if (addr_size == 1) |
4067 | off = (gva_t)sign_extend64(off, 31); | |
4068 | else if (addr_size == 0) | |
4069 | off = (gva_t)sign_extend64(off, 15); | |
55d2375e SC |
4070 | if (base_is_valid) |
4071 | off += kvm_register_read(vcpu, base_reg); | |
4072 | if (index_is_valid) | |
4073 | off += kvm_register_read(vcpu, index_reg)<<scaling; | |
4074 | vmx_get_segment(vcpu, &s, seg_reg); | |
55d2375e | 4075 | |
8570f9e8 SC |
4076 | /* |
4077 | * The effective address, i.e. @off, of a memory operand is truncated | |
4078 | * based on the address size of the instruction. Note that this is | |
4079 | * the *effective address*, i.e. the address prior to accounting for | |
4080 | * the segment's base. | |
4081 | */ | |
55d2375e | 4082 | if (addr_size == 1) /* 32 bit */ |
8570f9e8 SC |
4083 | off &= 0xffffffff; |
4084 | else if (addr_size == 0) /* 16 bit */ | |
4085 | off &= 0xffff; | |
55d2375e SC |
4086 | |
4087 | /* Checks for #GP/#SS exceptions. */ | |
4088 | exn = false; | |
4089 | if (is_long_mode(vcpu)) { | |
8570f9e8 SC |
4090 | /* |
4091 | * The virtual/linear address is never truncated in 64-bit | |
4092 | * mode, e.g. a 32-bit address size can yield a 64-bit virtual | |
4093 | * address when using FS/GS with a non-zero base. | |
4094 | */ | |
4095 | *ret = s.base + off; | |
4096 | ||
55d2375e SC |
4097 | /* Long mode: #GP(0)/#SS(0) if the memory address is in a |
4098 | * non-canonical form. This is the only check on the memory | |
4099 | * destination for long mode! | |
4100 | */ | |
4101 | exn = is_noncanonical_address(*ret, vcpu); | |
e0dfacbf | 4102 | } else { |
8570f9e8 SC |
4103 | /* |
4104 | * When not in long mode, the virtual/linear address is | |
4105 | * unconditionally truncated to 32 bits regardless of the | |
4106 | * address size. | |
4107 | */ | |
4108 | *ret = (s.base + off) & 0xffffffff; | |
4109 | ||
55d2375e SC |
4110 | /* Protected mode: apply checks for segment validity in the |
4111 | * following order: | |
4112 | * - segment type check (#GP(0) may be thrown) | |
4113 | * - usability check (#GP(0)/#SS(0)) | |
4114 | * - limit check (#GP(0)/#SS(0)) | |
4115 | */ | |
4116 | if (wr) | |
4117 | /* #GP(0) if the destination operand is located in a | |
4118 | * read-only data segment or any code segment. | |
4119 | */ | |
4120 | exn = ((s.type & 0xa) == 0 || (s.type & 8)); | |
4121 | else | |
4122 | /* #GP(0) if the source operand is located in an | |
4123 | * execute-only code segment | |
4124 | */ | |
4125 | exn = ((s.type & 0xa) == 8); | |
4126 | if (exn) { | |
4127 | kvm_queue_exception_e(vcpu, GP_VECTOR, 0); | |
4128 | return 1; | |
4129 | } | |
4130 | /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. | |
4131 | */ | |
4132 | exn = (s.unusable != 0); | |
34333cc6 SC |
4133 | |
4134 | /* | |
4135 | * Protected mode: #GP(0)/#SS(0) if the memory operand is | |
4136 | * outside the segment limit. All CPUs that support VMX ignore | |
4137 | * limit checks for flat segments, i.e. segments with base==0, | |
4138 | * limit==0xffffffff and of type expand-up data or code. | |
55d2375e | 4139 | */ |
34333cc6 SC |
4140 | if (!(s.base == 0 && s.limit == 0xffffffff && |
4141 | ((s.type & 8) || !(s.type & 4)))) | |
4142 | exn = exn || (off + sizeof(u64) > s.limit); | |
55d2375e SC |
4143 | } |
4144 | if (exn) { | |
4145 | kvm_queue_exception_e(vcpu, | |
4146 | seg_reg == VCPU_SREG_SS ? | |
4147 | SS_VECTOR : GP_VECTOR, | |
4148 | 0); | |
4149 | return 1; | |
4150 | } | |
4151 | ||
4152 | return 0; | |
4153 | } | |
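To make the decode above concrete, here is a small standalone sketch (not kernel code; every name is local to the example) of the effective-address arithmetic that get_vmx_mem_address() performs once the operand is decoded, assuming both the base and index registers are valid:

	#include <stdint.h>

	/*
	 * Sketch only: mirrors the arithmetic above. addr_size_32 selects the
	 * 32-bit effective-address truncation; long_mode selects whether the
	 * final linear address may exceed 32 bits.
	 */
	static uint64_t vmx_operand_gva(uint64_t seg_base, uint64_t base,
					uint64_t index, int scaling,
					uint64_t displacement,
					int addr_size_32, int long_mode)
	{
		uint64_t off = displacement + base + (index << scaling);

		if (addr_size_32)
			off &= 0xffffffff;	/* truncate the effective address */

		if (long_mode)
			return seg_base + off;	/* never truncated in 64-bit mode */

		return (seg_base + off) & 0xffffffff; /* outside long mode: 32 bits */
	}

For instance, an operand written as 0x10(%rbx,%rcx,8) with a flat DS in 64-bit mode decodes to scaling = 3, base = RBX, index = RCX and displacement = 0x10, i.e. vmx_operand_gva(0, rbx, rcx, 3, 0x10, 0, 1).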
4154 | ||
4155 | static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) | |
4156 | { | |
4157 | gva_t gva; | |
4158 | struct x86_exception e; | |
4159 | ||
4160 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | |
4161 | vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) | |
4162 | return 1; | |
4163 | ||
4164 | if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) { | |
4165 | kvm_inject_page_fault(vcpu, &e); | |
4166 | return 1; | |
4167 | } | |
4168 | ||
4169 | return 0; | |
4170 | } | |
4171 | ||
4172 | /* | |
4173 | * Allocate a shadow VMCS and associate it with the currently loaded | |
4174 | * VMCS, unless such a shadow VMCS already exists. The newly allocated | |
4175 | * VMCS is also VMCLEARed, so that it is ready for use. | |
4176 | */ | |
4177 | static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) | |
4178 | { | |
4179 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
4180 | struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; | |
4181 | ||
4182 | /* | |
4183 | * We should allocate a shadow vmcs for vmcs01 only when L1 | |
4184 | * executes VMXON and free it when L1 executes VMXOFF. | |
4185 | * As it is invalid to execute VMXON twice, we shouldn't reach | |
4186 | * here when vmcs01 already has an allocated shadow vmcs. |
4187 | */ | |
4188 | WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); | |
4189 | ||
4190 | if (!loaded_vmcs->shadow_vmcs) { | |
4191 | loaded_vmcs->shadow_vmcs = alloc_vmcs(true); | |
4192 | if (loaded_vmcs->shadow_vmcs) | |
4193 | vmcs_clear(loaded_vmcs->shadow_vmcs); | |
4194 | } | |
4195 | return loaded_vmcs->shadow_vmcs; | |
4196 | } | |
4197 | ||
4198 | static int enter_vmx_operation(struct kvm_vcpu *vcpu) | |
4199 | { | |
4200 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
4201 | int r; | |
4202 | ||
4203 | r = alloc_loaded_vmcs(&vmx->nested.vmcs02); | |
4204 | if (r < 0) | |
4205 | goto out_vmcs02; | |
4206 | ||
41836839 | 4207 | vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); |
55d2375e SC |
4208 | if (!vmx->nested.cached_vmcs12) |
4209 | goto out_cached_vmcs12; | |
4210 | ||
41836839 | 4211 | vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); |
55d2375e SC |
4212 | if (!vmx->nested.cached_shadow_vmcs12) |
4213 | goto out_cached_shadow_vmcs12; | |
4214 | ||
4215 | if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) | |
4216 | goto out_shadow_vmcs; | |
4217 | ||
4218 | hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, | |
4219 | HRTIMER_MODE_REL_PINNED); | |
4220 | vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; | |
4221 | ||
4222 | vmx->nested.vpid02 = allocate_vpid(); | |
4223 | ||
4224 | vmx->nested.vmcs02_initialized = false; | |
4225 | vmx->nested.vmxon = true; | |
ee85dec2 LK |
4226 | |
4227 | if (pt_mode == PT_MODE_HOST_GUEST) { | |
4228 | vmx->pt_desc.guest.ctl = 0; | |
4229 | pt_update_intercept_for_msr(vmx); | |
4230 | } | |
4231 | ||
55d2375e SC |
4232 | return 0; |
4233 | ||
4234 | out_shadow_vmcs: | |
4235 | kfree(vmx->nested.cached_shadow_vmcs12); | |
4236 | ||
4237 | out_cached_shadow_vmcs12: | |
4238 | kfree(vmx->nested.cached_vmcs12); | |
4239 | ||
4240 | out_cached_vmcs12: | |
4241 | free_loaded_vmcs(&vmx->nested.vmcs02); | |
4242 | ||
4243 | out_vmcs02: | |
4244 | return -ENOMEM; | |
4245 | } | |
4246 | ||
4247 | /* | |
4248 | * Emulate the VMXON instruction. | |
4249 | * Currently, we just remember that VMX is active, and do not save or even | |
4250 | * inspect the argument to VMXON (the so-called "VMXON pointer") because we | |
4251 | * do not currently need to store anything in that guest-allocated memory | |
4252 | * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their |
4253 | * argument is different from the VMXON pointer (which the spec says they do). | |
4254 | */ | |
4255 | static int handle_vmon(struct kvm_vcpu *vcpu) | |
4256 | { | |
4257 | int ret; | |
4258 | gpa_t vmptr; | |
4259 | struct page *page; | |
4260 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
4261 | const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED | |
4262 | | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; | |
4263 | ||
4264 | /* | |
4265 | * The Intel VMX Instruction Reference lists a bunch of bits that are | |
4266 | * prerequisite to running VMXON, most notably cr4.VMXE must be set to | |
4267 | * 1 (see vmx_set_cr4() for when we allow the guest to set this). | |
4268 | * Otherwise, we should fail with #UD. But most faulting conditions | |
4269 | * have already been checked by hardware, prior to the VM-exit for | |
4270 | * VMXON. We do test guest cr4.VMXE because processor CR4 always has | |
4271 | * that bit set to 1 in non-root mode. | |
4272 | */ | |
4273 | if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { | |
4274 | kvm_queue_exception(vcpu, UD_VECTOR); | |
4275 | return 1; | |
4276 | } | |
4277 | ||
4278 | /* CPL=0 must be checked manually. */ | |
4279 | if (vmx_get_cpl(vcpu)) { | |
4280 | kvm_inject_gp(vcpu, 0); | |
4281 | return 1; | |
4282 | } | |
4283 | ||
4284 | if (vmx->nested.vmxon) | |
4285 | return nested_vmx_failValid(vcpu, | |
4286 | VMXERR_VMXON_IN_VMX_ROOT_OPERATION); | |
4287 | ||
4288 | if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) | |
4289 | != VMXON_NEEDED_FEATURES) { | |
4290 | kvm_inject_gp(vcpu, 0); | |
4291 | return 1; | |
4292 | } | |
4293 | ||
4294 | if (nested_vmx_get_vmptr(vcpu, &vmptr)) | |
4295 | return 1; | |
4296 | ||
4297 | /* | |
4298 | * SDM 3: 24.11.5 | |
4299 | * The first 4 bytes of VMXON region contain the supported | |
4300 | * VMCS revision identifier | |
4301 | * | |
4302 | * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case, |
4303 | * which replaces physical address width with 32 | |
4304 | */ | |
4305 | if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) | |
4306 | return nested_vmx_failInvalid(vcpu); | |
4307 | ||
4308 | page = kvm_vcpu_gpa_to_page(vcpu, vmptr); | |
4309 | if (is_error_page(page)) | |
4310 | return nested_vmx_failInvalid(vcpu); | |
4311 | ||
4312 | if (*(u32 *)kmap(page) != VMCS12_REVISION) { | |
4313 | kunmap(page); | |
4314 | kvm_release_page_clean(page); | |
4315 | return nested_vmx_failInvalid(vcpu); | |
4316 | } | |
4317 | kunmap(page); | |
4318 | kvm_release_page_clean(page); | |
4319 | ||
4320 | vmx->nested.vmxon_ptr = vmptr; | |
4321 | ret = enter_vmx_operation(vcpu); | |
4322 | if (ret) | |
4323 | return ret; | |
4324 | ||
4325 | return nested_vmx_succeed(vcpu); | |
4326 | } | |
4327 | ||
4328 | static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) | |
4329 | { | |
4330 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
4331 | ||
4332 | if (vmx->nested.current_vmptr == -1ull) | |
4333 | return; | |
4334 | ||
4335 | if (enable_shadow_vmcs) { | |
4336 | /* copy to memory all shadowed fields in case | |
4337 | they were modified */ | |
4338 | copy_shadow_to_vmcs12(vmx); | |
4339 | vmx->nested.need_vmcs12_sync = false; | |
4340 | vmx_disable_shadow_vmcs(vmx); | |
4341 | } | |
4342 | vmx->nested.posted_intr_nv = -1; | |
4343 | ||
4344 | /* Flush VMCS12 to guest memory */ | |
4345 | kvm_vcpu_write_guest_page(vcpu, | |
4346 | vmx->nested.current_vmptr >> PAGE_SHIFT, | |
4347 | vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); | |
4348 | ||
4349 | kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); | |
4350 | ||
4351 | vmx->nested.current_vmptr = -1ull; | |
4352 | } | |
4353 | ||
4354 | /* Emulate the VMXOFF instruction */ | |
4355 | static int handle_vmoff(struct kvm_vcpu *vcpu) | |
4356 | { | |
4357 | if (!nested_vmx_check_permission(vcpu)) | |
4358 | return 1; | |
4359 | free_nested(vcpu); | |
4360 | return nested_vmx_succeed(vcpu); | |
4361 | } | |
4362 | ||
4363 | /* Emulate the VMCLEAR instruction */ | |
4364 | static int handle_vmclear(struct kvm_vcpu *vcpu) | |
4365 | { | |
4366 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
4367 | u32 zero = 0; | |
4368 | gpa_t vmptr; | |
4369 | ||
4370 | if (!nested_vmx_check_permission(vcpu)) | |
4371 | return 1; | |
4372 | ||
4373 | if (nested_vmx_get_vmptr(vcpu, &vmptr)) | |
4374 | return 1; | |
4375 | ||
4376 | if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) | |
4377 | return nested_vmx_failValid(vcpu, | |
4378 | VMXERR_VMCLEAR_INVALID_ADDRESS); | |
4379 | ||
4380 | if (vmptr == vmx->nested.vmxon_ptr) | |
4381 | return nested_vmx_failValid(vcpu, | |
4382 | VMXERR_VMCLEAR_VMXON_POINTER); | |
4383 | ||
4384 | if (vmx->nested.hv_evmcs_page) { | |
4385 | if (vmptr == vmx->nested.hv_evmcs_vmptr) | |
4386 | nested_release_evmcs(vcpu); | |
4387 | } else { | |
4388 | if (vmptr == vmx->nested.current_vmptr) | |
4389 | nested_release_vmcs12(vcpu); | |
4390 | ||
4391 | kvm_vcpu_write_guest(vcpu, | |
4392 | vmptr + offsetof(struct vmcs12, | |
4393 | launch_state), | |
4394 | &zero, sizeof(zero)); | |
4395 | } | |
4396 | ||
4397 | return nested_vmx_succeed(vcpu); | |
4398 | } | |
4399 | ||
4400 | static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); | |
4401 | ||
4402 | /* Emulate the VMLAUNCH instruction */ | |
4403 | static int handle_vmlaunch(struct kvm_vcpu *vcpu) | |
4404 | { | |
4405 | return nested_vmx_run(vcpu, true); | |
4406 | } | |
4407 | ||
4408 | /* Emulate the VMRESUME instruction */ | |
4409 | static int handle_vmresume(struct kvm_vcpu *vcpu) | |
4410 | { | |
4411 | ||
4412 | return nested_vmx_run(vcpu, false); | |
4413 | } | |
4414 | ||
4415 | static int handle_vmread(struct kvm_vcpu *vcpu) | |
4416 | { | |
4417 | unsigned long field; | |
4418 | u64 field_value; | |
4419 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | |
4420 | u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | |
4421 | gva_t gva = 0; | |
4422 | struct vmcs12 *vmcs12; | |
4423 | ||
4424 | if (!nested_vmx_check_permission(vcpu)) | |
4425 | return 1; | |
4426 | ||
4427 | if (to_vmx(vcpu)->nested.current_vmptr == -1ull) | |
4428 | return nested_vmx_failInvalid(vcpu); | |
4429 | ||
4430 | if (!is_guest_mode(vcpu)) | |
4431 | vmcs12 = get_vmcs12(vcpu); | |
4432 | else { | |
4433 | /* | |
4434 | * When vmcs->vmcs_link_pointer is -1ull, any VMREAD | |
4435 | * to a shadowed field sets the ALU flags for VMfailInvalid. |
4436 | */ | |
4437 | if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) | |
4438 | return nested_vmx_failInvalid(vcpu); | |
4439 | vmcs12 = get_shadow_vmcs12(vcpu); | |
4440 | } | |
4441 | ||
4442 | /* Decode instruction info and find the field to read */ | |
4443 | field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); | |
4444 | /* Read the field, zero-extended to a u64 field_value */ | |
4445 | if (vmcs12_read_any(vmcs12, field, &field_value) < 0) | |
4446 | return nested_vmx_failValid(vcpu, | |
4447 | VMXERR_UNSUPPORTED_VMCS_COMPONENT); | |
4448 | ||
4449 | /* | |
4450 | * Now copy part of this value to register or memory, as requested. | |
4451 | * Note that the number of bits actually copied is 32 or 64 depending | |
4452 | * on the guest's mode (32 or 64 bit), not on the given field's length. | |
4453 | */ | |
4454 | if (vmx_instruction_info & (1u << 10)) { | |
4455 | kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), | |
4456 | field_value); | |
4457 | } else { | |
4458 | if (get_vmx_mem_address(vcpu, exit_qualification, | |
4459 | vmx_instruction_info, true, &gva)) | |
4460 | return 1; | |
4461 | /* _system ok, nested_vmx_check_permission has verified cpl=0 */ | |
4462 | kvm_write_guest_virt_system(vcpu, gva, &field_value, | |
4463 | (is_long_mode(vcpu) ? 8 : 4), NULL); | |
4464 | } | |
4465 | ||
4466 | return nested_vmx_succeed(vcpu); | |
4467 | } | |
4468 | ||
4469 | ||
4470 | static int handle_vmwrite(struct kvm_vcpu *vcpu) | |
4471 | { | |
4472 | unsigned long field; | |
4473 | gva_t gva; | |
4474 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
4475 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | |
4476 | u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | |
4477 | ||
4478 | /* The value to write might be 32 or 64 bits, depending on L1's long | |
4479 | * mode, and eventually we need to write that into a field of several | |
4480 | * possible lengths. The code below first zero-extends the value to 64 | |
4481 | * bit (field_value), and then copies only the appropriate number of | |
4482 | * bits into the vmcs12 field. | |
4483 | */ | |
4484 | u64 field_value = 0; | |
4485 | struct x86_exception e; | |
4486 | struct vmcs12 *vmcs12; | |
4487 | ||
4488 | if (!nested_vmx_check_permission(vcpu)) | |
4489 | return 1; | |
4490 | ||
4491 | if (vmx->nested.current_vmptr == -1ull) | |
4492 | return nested_vmx_failInvalid(vcpu); | |
4493 | ||
4494 | if (vmx_instruction_info & (1u << 10)) | |
4495 | field_value = kvm_register_readl(vcpu, | |
4496 | (((vmx_instruction_info) >> 3) & 0xf)); | |
4497 | else { | |
4498 | if (get_vmx_mem_address(vcpu, exit_qualification, | |
4499 | vmx_instruction_info, false, &gva)) | |
4500 | return 1; | |
4501 | if (kvm_read_guest_virt(vcpu, gva, &field_value, | |
4502 | (is_64_bit_mode(vcpu) ? 8 : 4), &e)) { | |
4503 | kvm_inject_page_fault(vcpu, &e); | |
4504 | return 1; | |
4505 | } | |
4506 | } | |
4507 | ||
4508 | ||
4509 | field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); | |
4510 | /* | |
4511 | * If the vCPU supports "VMWRITE to any supported field in the | |
4512 | * VMCS," then the "read-only" fields are actually read/write. | |
4513 | */ | |
4514 | if (vmcs_field_readonly(field) && | |
4515 | !nested_cpu_has_vmwrite_any_field(vcpu)) | |
4516 | return nested_vmx_failValid(vcpu, | |
4517 | VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); | |
4518 | ||
4519 | if (!is_guest_mode(vcpu)) | |
4520 | vmcs12 = get_vmcs12(vcpu); | |
4521 | else { | |
4522 | /* | |
4523 | * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE | |
4524 | * to a shadowed field sets the ALU flags for VMfailInvalid. |
4525 | */ | |
4526 | if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) | |
4527 | return nested_vmx_failInvalid(vcpu); | |
4528 | vmcs12 = get_shadow_vmcs12(vcpu); | |
4529 | } | |
4530 | ||
4531 | if (vmcs12_write_any(vmcs12, field, field_value) < 0) | |
4532 | return nested_vmx_failValid(vcpu, | |
4533 | VMXERR_UNSUPPORTED_VMCS_COMPONENT); | |
4534 | ||
4535 | /* | |
4536 | * Do not track vmcs12 dirty-state if in guest-mode | |
4537 | * as we actually dirty shadow vmcs12 instead of vmcs12. | |
4538 | */ | |
4539 | if (!is_guest_mode(vcpu)) { | |
4540 | switch (field) { | |
4541 | #define SHADOW_FIELD_RW(x) case x: | |
4542 | #include "vmcs_shadow_fields.h" | |
4543 | /* | |
4544 | * The fields that can be updated by L1 without a vmexit are | |
4545 | * always updated in the vmcs02; the others go down the slow |
4546 | * path of prepare_vmcs02. | |
4547 | */ | |
4548 | break; | |
4549 | default: | |
4550 | vmx->nested.dirty_vmcs12 = true; | |
4551 | break; | |
4552 | } | |
4553 | } | |
4554 | ||
4555 | return nested_vmx_succeed(vcpu); | |
4556 | } | |
4557 | ||
4558 | static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) | |
4559 | { | |
4560 | vmx->nested.current_vmptr = vmptr; | |
4561 | if (enable_shadow_vmcs) { | |
4562 | vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, | |
4563 | SECONDARY_EXEC_SHADOW_VMCS); | |
4564 | vmcs_write64(VMCS_LINK_POINTER, | |
4565 | __pa(vmx->vmcs01.shadow_vmcs)); | |
4566 | vmx->nested.need_vmcs12_sync = true; | |
4567 | } | |
4568 | vmx->nested.dirty_vmcs12 = true; | |
4569 | } | |
4570 | ||
4571 | /* Emulate the VMPTRLD instruction */ | |
4572 | static int handle_vmptrld(struct kvm_vcpu *vcpu) | |
4573 | { | |
4574 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
4575 | gpa_t vmptr; | |
4576 | ||
4577 | if (!nested_vmx_check_permission(vcpu)) | |
4578 | return 1; | |
4579 | ||
4580 | if (nested_vmx_get_vmptr(vcpu, &vmptr)) | |
4581 | return 1; | |
4582 | ||
4583 | if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) | |
4584 | return nested_vmx_failValid(vcpu, | |
4585 | VMXERR_VMPTRLD_INVALID_ADDRESS); | |
4586 | ||
4587 | if (vmptr == vmx->nested.vmxon_ptr) | |
4588 | return nested_vmx_failValid(vcpu, | |
4589 | VMXERR_VMPTRLD_VMXON_POINTER); | |
4590 | ||
4591 | /* Forbid normal VMPTRLD if Enlightened version was used */ | |
4592 | if (vmx->nested.hv_evmcs) | |
4593 | return 1; | |
4594 | ||
4595 | if (vmx->nested.current_vmptr != vmptr) { | |
4596 | struct vmcs12 *new_vmcs12; | |
4597 | struct page *page; | |
4598 | ||
4599 | page = kvm_vcpu_gpa_to_page(vcpu, vmptr); | |
4600 | if (is_error_page(page)) { | |
4601 | /* | |
4602 | * Reads from an unbacked page return all 1s, | |
4603 | * which means that the 32 bits located at the | |
4604 | * given physical address won't match the required | |
4605 | * VMCS12_REVISION identifier. | |
4606 | */ | |
826c1362 | 4607 | return nested_vmx_failValid(vcpu, |
55d2375e | 4608 | VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); |
55d2375e SC |
4609 | } |
4610 | new_vmcs12 = kmap(page); | |
4611 | if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || | |
4612 | (new_vmcs12->hdr.shadow_vmcs && | |
4613 | !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { | |
4614 | kunmap(page); | |
4615 | kvm_release_page_clean(page); | |
4616 | return nested_vmx_failValid(vcpu, | |
4617 | VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); | |
4618 | } | |
4619 | ||
4620 | nested_release_vmcs12(vcpu); | |
4621 | ||
4622 | /* | |
4623 | * Load VMCS12 from guest memory since it is not already | |
4624 | * cached. | |
4625 | */ | |
4626 | memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); | |
4627 | kunmap(page); | |
4628 | kvm_release_page_clean(page); | |
4629 | ||
4630 | set_current_vmptr(vmx, vmptr); | |
4631 | } | |
4632 | ||
4633 | return nested_vmx_succeed(vcpu); | |
4634 | } | |
4635 | ||
4636 | /* Emulate the VMPTRST instruction */ | |
4637 | static int handle_vmptrst(struct kvm_vcpu *vcpu) | |
4638 | { | |
4639 | unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION); | |
4640 | u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); | |
4641 | gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; | |
4642 | struct x86_exception e; | |
4643 | gva_t gva; | |
4644 | ||
4645 | if (!nested_vmx_check_permission(vcpu)) | |
4646 | return 1; | |
4647 | ||
4648 | if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) | |
4649 | return 1; | |
4650 | ||
4651 | if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva)) | |
4652 | return 1; | |
4653 | /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ | |
4654 | if (kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, | |
4655 | sizeof(gpa_t), &e)) { | |
4656 | kvm_inject_page_fault(vcpu, &e); | |
4657 | return 1; | |
4658 | } | |
4659 | return nested_vmx_succeed(vcpu); | |
4660 | } | |
4661 | ||
4662 | /* Emulate the INVEPT instruction */ | |
4663 | static int handle_invept(struct kvm_vcpu *vcpu) | |
4664 | { | |
4665 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
4666 | u32 vmx_instruction_info, types; | |
4667 | unsigned long type; | |
4668 | gva_t gva; | |
4669 | struct x86_exception e; | |
4670 | struct { | |
4671 | u64 eptp, gpa; | |
4672 | } operand; | |
4673 | ||
4674 | if (!(vmx->nested.msrs.secondary_ctls_high & | |
4675 | SECONDARY_EXEC_ENABLE_EPT) || | |
4676 | !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { | |
4677 | kvm_queue_exception(vcpu, UD_VECTOR); | |
4678 | return 1; | |
4679 | } | |
4680 | ||
4681 | if (!nested_vmx_check_permission(vcpu)) | |
4682 | return 1; | |
4683 | ||
4684 | vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | |
4685 | type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); | |
4686 | ||
4687 | types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; | |
4688 | ||
4689 | if (type >= 32 || !(types & (1 << type))) | |
4690 | return nested_vmx_failValid(vcpu, | |
4691 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); | |
4692 | ||
4693 | /* According to the Intel VMX instruction reference, the memory | |
4694 | * operand is read even if it isn't needed (e.g., for type==global) | |
4695 | */ | |
4696 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | |
4697 | vmx_instruction_info, false, &gva)) | |
4698 | return 1; | |
4699 | if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { | |
4700 | kvm_inject_page_fault(vcpu, &e); | |
4701 | return 1; | |
4702 | } | |
4703 | ||
4704 | switch (type) { | |
4705 | case VMX_EPT_EXTENT_GLOBAL: | |
4706 | /* | |
4707 | * TODO: track mappings and invalidate | |
4708 | * single context requests appropriately | |
4709 | */ | |
4710 | case VMX_EPT_EXTENT_CONTEXT: | |
4711 | kvm_mmu_sync_roots(vcpu); | |
4712 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | |
4713 | break; | |
4714 | default: | |
4715 | BUG_ON(1); | |
4716 | break; | |
4717 | } | |
4718 | ||
4719 | return nested_vmx_succeed(vcpu); | |
4720 | } | |
4721 | ||
4722 | static int handle_invvpid(struct kvm_vcpu *vcpu) | |
4723 | { | |
4724 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
4725 | u32 vmx_instruction_info; | |
4726 | unsigned long type, types; | |
4727 | gva_t gva; | |
4728 | struct x86_exception e; | |
4729 | struct { | |
4730 | u64 vpid; | |
4731 | u64 gla; | |
4732 | } operand; | |
4733 | u16 vpid02; | |
4734 | ||
4735 | if (!(vmx->nested.msrs.secondary_ctls_high & | |
4736 | SECONDARY_EXEC_ENABLE_VPID) || | |
4737 | !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { | |
4738 | kvm_queue_exception(vcpu, UD_VECTOR); | |
4739 | return 1; | |
4740 | } | |
4741 | ||
4742 | if (!nested_vmx_check_permission(vcpu)) | |
4743 | return 1; | |
4744 | ||
4745 | vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | |
4746 | type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); | |
4747 | ||
4748 | types = (vmx->nested.msrs.vpid_caps & | |
4749 | VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; | |
4750 | ||
4751 | if (type >= 32 || !(types & (1 << type))) | |
4752 | return nested_vmx_failValid(vcpu, | |
4753 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); | |
4754 | ||
4755 | /* According to the Intel VMX instruction reference, the memory |
4756 | * operand is read even if it isn't needed (e.g., for type==global) | |
4757 | */ | |
4758 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | |
4759 | vmx_instruction_info, false, &gva)) | |
4760 | return 1; | |
4761 | if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { | |
4762 | kvm_inject_page_fault(vcpu, &e); | |
4763 | return 1; | |
4764 | } | |
4765 | if (operand.vpid >> 16) | |
4766 | return nested_vmx_failValid(vcpu, | |
4767 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); | |
4768 | ||
4769 | vpid02 = nested_get_vpid02(vcpu); | |
4770 | switch (type) { | |
4771 | case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: | |
4772 | if (!operand.vpid || | |
4773 | is_noncanonical_address(operand.gla, vcpu)) | |
4774 | return nested_vmx_failValid(vcpu, | |
4775 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); | |
4776 | if (cpu_has_vmx_invvpid_individual_addr()) { | |
4777 | __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, | |
4778 | vpid02, operand.gla); | |
4779 | } else | |
4780 | __vmx_flush_tlb(vcpu, vpid02, false); | |
4781 | break; | |
4782 | case VMX_VPID_EXTENT_SINGLE_CONTEXT: | |
4783 | case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: | |
4784 | if (!operand.vpid) | |
4785 | return nested_vmx_failValid(vcpu, | |
4786 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); | |
4787 | __vmx_flush_tlb(vcpu, vpid02, false); | |
4788 | break; | |
4789 | case VMX_VPID_EXTENT_ALL_CONTEXT: | |
4790 | __vmx_flush_tlb(vcpu, vpid02, false); | |
4791 | break; | |
4792 | default: | |
4793 | WARN_ON_ONCE(1); | |
4794 | return kvm_skip_emulated_instruction(vcpu); | |
4795 | } | |
4796 | ||
4797 | return nested_vmx_succeed(vcpu); | |
4798 | } | |
4799 | ||
4800 | static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, | |
4801 | struct vmcs12 *vmcs12) | |
4802 | { | |
4803 | u32 index = vcpu->arch.regs[VCPU_REGS_RCX]; | |
4804 | u64 address; | |
4805 | bool accessed_dirty; | |
4806 | struct kvm_mmu *mmu = vcpu->arch.walk_mmu; | |
4807 | ||
4808 | if (!nested_cpu_has_eptp_switching(vmcs12) || | |
4809 | !nested_cpu_has_ept(vmcs12)) | |
4810 | return 1; | |
4811 | ||
4812 | if (index >= VMFUNC_EPTP_ENTRIES) | |
4813 | return 1; | |
4814 | ||
4815 | ||
4816 | if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, | |
4817 | &address, index * 8, 8)) | |
4818 | return 1; | |
4819 | ||
4820 | accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT); | |
4821 | ||
4822 | /* | |
4823 | * If the (L2) guest does a vmfunc to the currently | |
4824 | * active ept pointer, we don't have to do anything else | |
4825 | */ | |
4826 | if (vmcs12->ept_pointer != address) { | |
4827 | if (!valid_ept_address(vcpu, address)) | |
4828 | return 1; | |
4829 | ||
4830 | kvm_mmu_unload(vcpu); | |
4831 | mmu->ept_ad = accessed_dirty; | |
4832 | mmu->mmu_role.base.ad_disabled = !accessed_dirty; | |
4833 | vmcs12->ept_pointer = address; | |
4834 | /* | |
4835 | * TODO: Check what the correct approach is in case |
4836 | * mmu reload fails. Currently, we just let the next | |
4837 | * reload potentially fail | |
4838 | */ | |
4839 | kvm_mmu_reload(vcpu); | |
4840 | } | |
4841 | ||
4842 | return 0; | |
4843 | } | |
4844 | ||
4845 | static int handle_vmfunc(struct kvm_vcpu *vcpu) | |
4846 | { | |
4847 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
4848 | struct vmcs12 *vmcs12; | |
4849 | u32 function = vcpu->arch.regs[VCPU_REGS_RAX]; | |
4850 | ||
4851 | /* | |
4852 | * VMFUNC is only supported for nested guests, but we always enable the | |
4853 | * secondary control for simplicity; for non-nested mode, fake that we | |
4854 | * didn't by injecting #UD. | |
4855 | */ | |
4856 | if (!is_guest_mode(vcpu)) { | |
4857 | kvm_queue_exception(vcpu, UD_VECTOR); | |
4858 | return 1; | |
4859 | } | |
4860 | ||
4861 | vmcs12 = get_vmcs12(vcpu); | |
4862 | if ((vmcs12->vm_function_control & (1 << function)) == 0) | |
4863 | goto fail; | |
4864 | ||
4865 | switch (function) { | |
4866 | case 0: | |
4867 | if (nested_vmx_eptp_switching(vcpu, vmcs12)) | |
4868 | goto fail; | |
4869 | break; | |
4870 | default: | |
4871 | goto fail; | |
4872 | } | |
4873 | return kvm_skip_emulated_instruction(vcpu); | |
4874 | ||
4875 | fail: | |
4876 | nested_vmx_vmexit(vcpu, vmx->exit_reason, | |
4877 | vmcs_read32(VM_EXIT_INTR_INFO), | |
4878 | vmcs_readl(EXIT_QUALIFICATION)); | |
4879 | return 1; | |
4880 | } | |
4881 | ||
4882 | ||
4883 | static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, | |
4884 | struct vmcs12 *vmcs12) | |
4885 | { | |
4886 | unsigned long exit_qualification; | |
4887 | gpa_t bitmap, last_bitmap; | |
4888 | unsigned int port; | |
4889 | int size; | |
4890 | u8 b; | |
4891 | ||
4892 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) | |
4893 | return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); | |
4894 | ||
4895 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | |
4896 | ||
4897 | port = exit_qualification >> 16; | |
4898 | size = (exit_qualification & 7) + 1; | |
4899 | ||
4900 | last_bitmap = (gpa_t)-1; | |
4901 | b = -1; | |
4902 | ||
4903 | while (size > 0) { | |
4904 | if (port < 0x8000) | |
4905 | bitmap = vmcs12->io_bitmap_a; | |
4906 | else if (port < 0x10000) | |
4907 | bitmap = vmcs12->io_bitmap_b; | |
4908 | else | |
4909 | return true; | |
4910 | bitmap += (port & 0x7fff) / 8; | |
4911 | ||
4912 | if (last_bitmap != bitmap) | |
4913 | if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) | |
4914 | return true; | |
4915 | if (b & (1 << (port & 7))) | |
4916 | return true; | |
4917 | ||
4918 | port++; | |
4919 | size--; | |
4920 | last_bitmap = bitmap; | |
4921 | } | |
4922 | ||
4923 | return false; | |
4924 | } | |
4925 | ||
4926 | /* | |
4927 | * Return 1 if we should exit from L2 to L1 to handle an MSR access, |
4928 | * rather than handle it ourselves in L0. I.e., check whether L1 expressed | |
4929 | * disinterest in the current event (read or write a specific MSR) by using an | |
4930 | * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. | |
4931 | */ | |
4932 | static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, | |
4933 | struct vmcs12 *vmcs12, u32 exit_reason) | |
4934 | { | |
4935 | u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; | |
4936 | gpa_t bitmap; | |
4937 | ||
4938 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) | |
4939 | return true; | |
4940 | ||
4941 | /* | |
4942 | * The MSR_BITMAP page is divided into four 1024-byte bitmaps, | |
4943 | * for the four combinations of read/write and low/high MSR numbers. | |
4944 | * First we need to figure out which of the four to use: | |
4945 | */ | |
4946 | bitmap = vmcs12->msr_bitmap; | |
4947 | if (exit_reason == EXIT_REASON_MSR_WRITE) | |
4948 | bitmap += 2048; | |
4949 | if (msr_index >= 0xc0000000) { | |
4950 | msr_index -= 0xc0000000; | |
4951 | bitmap += 1024; | |
4952 | } | |
4953 | ||
4954 | /* Then read the msr_index'th bit from this bitmap: */ | |
4955 | if (msr_index < 1024*8) { | |
4956 | unsigned char b; | |
4957 | if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) | |
4958 | return true; | |
4959 | return 1 & (b >> (msr_index & 7)); | |
4960 | } else | |
4961 | return true; /* let L1 handle the wrong parameter */ | |
4962 | } | |
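The quadrant layout described in the comment above can also be stated as a single offset computation; the helper below is an illustrative sketch, not kernel code, and simply restates the lookup this function performs:

	#include <stdbool.h>
	#include <stdint.h>

	/*
	 * Sketch: locate the intercept bit for an MSR inside the 4 KiB bitmap,
	 * laid out as read-low, read-high, write-low, write-high (1024 bytes
	 * each). Returns false for MSRs outside the two covered ranges.
	 */
	static bool msr_bitmap_slot(uint32_t msr, bool is_write,
				    uint32_t *byte, uint32_t *bit)
	{
		uint32_t base = is_write ? 2048 : 0;

		if (msr <= 0x1fff)			/* 0x00000000 - 0x00001fff */
			*byte = base + msr / 8;
		else if (msr - 0xc0000000u <= 0x1fff)	/* 0xc0000000 - 0xc0001fff */
			*byte = base + 1024 + (msr - 0xc0000000u) / 8;
		else
			return false;			/* let L1 handle it */

		*bit = msr & 7;
		return true;
	}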
4963 | ||
4964 | /* | |
4965 | * Return 1 if we should exit from L2 to L1 to handle a CR access exit, | |
4966 | * rather than handle it ourselves in L0. I.e., check if L1 wanted to | |
4967 | * intercept (via guest_host_mask etc.) the current event. | |
4968 | */ | |
4969 | static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, | |
4970 | struct vmcs12 *vmcs12) | |
4971 | { | |
4972 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | |
4973 | int cr = exit_qualification & 15; | |
4974 | int reg; | |
4975 | unsigned long val; | |
4976 | ||
4977 | switch ((exit_qualification >> 4) & 3) { | |
4978 | case 0: /* mov to cr */ | |
4979 | reg = (exit_qualification >> 8) & 15; | |
4980 | val = kvm_register_readl(vcpu, reg); | |
4981 | switch (cr) { | |
4982 | case 0: | |
4983 | if (vmcs12->cr0_guest_host_mask & | |
4984 | (val ^ vmcs12->cr0_read_shadow)) | |
4985 | return true; | |
4986 | break; | |
4987 | case 3: | |
4988 | if ((vmcs12->cr3_target_count >= 1 && | |
4989 | vmcs12->cr3_target_value0 == val) || | |
4990 | (vmcs12->cr3_target_count >= 2 && | |
4991 | vmcs12->cr3_target_value1 == val) || | |
4992 | (vmcs12->cr3_target_count >= 3 && | |
4993 | vmcs12->cr3_target_value2 == val) || | |
4994 | (vmcs12->cr3_target_count >= 4 && | |
4995 | vmcs12->cr3_target_value3 == val)) | |
4996 | return false; | |
4997 | if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) | |
4998 | return true; | |
4999 | break; | |
5000 | case 4: | |
5001 | if (vmcs12->cr4_guest_host_mask & | |
5002 | (vmcs12->cr4_read_shadow ^ val)) | |
5003 | return true; | |
5004 | break; | |
5005 | case 8: | |
5006 | if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) | |
5007 | return true; | |
5008 | break; | |
5009 | } | |
5010 | break; | |
5011 | case 2: /* clts */ | |
5012 | if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && | |
5013 | (vmcs12->cr0_read_shadow & X86_CR0_TS)) | |
5014 | return true; | |
5015 | break; | |
5016 | case 1: /* mov from cr */ | |
5017 | switch (cr) { | |
5018 | case 3: | |
5019 | if (vmcs12->cpu_based_vm_exec_control & | |
5020 | CPU_BASED_CR3_STORE_EXITING) | |
5021 | return true; | |
5022 | break; | |
5023 | case 8: | |
5024 | if (vmcs12->cpu_based_vm_exec_control & | |
5025 | CPU_BASED_CR8_STORE_EXITING) | |
5026 | return true; | |
5027 | break; | |
5028 | } | |
5029 | break; | |
5030 | case 3: /* lmsw */ | |
5031 | /* | |
5032 | * lmsw can change bits 1..3 of cr0, and only set bit 0 of | |
5033 | * cr0. Other attempted changes are ignored, with no exit. | |
5034 | */ | |
5035 | val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; | |
5036 | if (vmcs12->cr0_guest_host_mask & 0xe & | |
5037 | (val ^ vmcs12->cr0_read_shadow)) | |
5038 | return true; | |
5039 | if ((vmcs12->cr0_guest_host_mask & 0x1) && | |
5040 | !(vmcs12->cr0_read_shadow & 0x1) && | |
5041 | (val & 0x1)) | |
5042 | return true; | |
5043 | break; | |
5044 | } | |
5045 | return false; | |
5046 | } | |
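As a concrete illustration of the guest/host mask test used in the "mov to cr0" and lmsw cases above, here is a hypothetical worked example (the helper name and values are invented for illustration, and kernel-style types such as bool are assumed):

	/*
	 * Mirrors the "mov to cr0" check above: reflect to L1 iff the new
	 * value differs from the read shadow in a bit that L1 owns.
	 */
	static bool cr0_write_reflects_to_l1(unsigned long mask,
					     unsigned long shadow,
					     unsigned long val)
	{
		return mask & (val ^ shadow);
	}

	/*
	 * L1 owns CR0.TS, the shadow has TS clear, and L2 tries to set TS:
	 * cr0_write_reflects_to_l1(X86_CR0_TS, 0, X86_CR0_TS) == true.
	 */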
5047 | ||
5048 | static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, | |
5049 | struct vmcs12 *vmcs12, gpa_t bitmap) | |
5050 | { | |
5051 | u32 vmx_instruction_info; | |
5052 | unsigned long field; | |
5053 | u8 b; | |
5054 | ||
5055 | if (!nested_cpu_has_shadow_vmcs(vmcs12)) | |
5056 | return true; | |
5057 | ||
5058 | /* Decode instruction info and find the field to access */ | |
5059 | vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | |
5060 | field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); | |
5061 | ||
5062 | /* Out-of-range fields always cause a VM exit from L2 to L1 */ | |
5063 | if (field >> 15) | |
5064 | return true; | |
5065 | ||
5066 | if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) | |
5067 | return true; | |
5068 | ||
5069 | return 1 & (b >> (field & 7)); | |
5070 | } | |
5071 | ||
5072 | /* | |
5073 | * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we | |
5074 | * should handle it ourselves in L0 (and then continue L2). Only call this | |
5075 | * when in is_guest_mode (L2). | |
5076 | */ | |
5077 | bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) | |
5078 | { | |
5079 | u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | |
5080 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
5081 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | |
5082 | ||
5083 | if (vmx->nested.nested_run_pending) | |
5084 | return false; | |
5085 | ||
5086 | if (unlikely(vmx->fail)) { | |
5087 | pr_info_ratelimited("%s failed vm entry %x\n", __func__, | |
5088 | vmcs_read32(VM_INSTRUCTION_ERROR)); | |
5089 | return true; | |
5090 | } | |
5091 | ||
5092 | /* | |
5093 | * The host physical addresses of some pages of guest memory | |
5094 | * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC | |
5095 | * Page). The CPU may write to these pages via their host | |
5096 | * physical address while L2 is running, bypassing any | |
5097 | * address-translation-based dirty tracking (e.g. EPT write | |
5098 | * protection). | |
5099 | * | |
5100 | * Mark them dirty on every exit from L2 to prevent them from | |
5101 | * getting out of sync with dirty tracking. | |
5102 | */ | |
5103 | nested_mark_vmcs12_pages_dirty(vcpu); | |
5104 | ||
5105 | trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, | |
5106 | vmcs_readl(EXIT_QUALIFICATION), | |
5107 | vmx->idt_vectoring_info, | |
5108 | intr_info, | |
5109 | vmcs_read32(VM_EXIT_INTR_ERROR_CODE), | |
5110 | KVM_ISA_VMX); | |
5111 | ||
5112 | switch (exit_reason) { | |
5113 | case EXIT_REASON_EXCEPTION_NMI: | |
5114 | if (is_nmi(intr_info)) | |
5115 | return false; | |
5116 | else if (is_page_fault(intr_info)) | |
5117 | return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; | |
5118 | else if (is_debug(intr_info) && | |
5119 | vcpu->guest_debug & | |
5120 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) | |
5121 | return false; | |
5122 | else if (is_breakpoint(intr_info) && | |
5123 | vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) | |
5124 | return false; | |
5125 | return vmcs12->exception_bitmap & | |
5126 | (1u << (intr_info & INTR_INFO_VECTOR_MASK)); | |
5127 | case EXIT_REASON_EXTERNAL_INTERRUPT: | |
5128 | return false; | |
5129 | case EXIT_REASON_TRIPLE_FAULT: | |
5130 | return true; | |
5131 | case EXIT_REASON_PENDING_INTERRUPT: | |
5132 | return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); | |
5133 | case EXIT_REASON_NMI_WINDOW: | |
5134 | return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); | |
5135 | case EXIT_REASON_TASK_SWITCH: | |
5136 | return true; | |
5137 | case EXIT_REASON_CPUID: | |
5138 | return true; | |
5139 | case EXIT_REASON_HLT: | |
5140 | return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); | |
5141 | case EXIT_REASON_INVD: | |
5142 | return true; | |
5143 | case EXIT_REASON_INVLPG: | |
5144 | return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); | |
5145 | case EXIT_REASON_RDPMC: | |
5146 | return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); | |
5147 | case EXIT_REASON_RDRAND: | |
5148 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); | |
5149 | case EXIT_REASON_RDSEED: | |
5150 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); | |
5151 | case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: | |
5152 | return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); | |
5153 | case EXIT_REASON_VMREAD: | |
5154 | return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, | |
5155 | vmcs12->vmread_bitmap); | |
5156 | case EXIT_REASON_VMWRITE: | |
5157 | return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, | |
5158 | vmcs12->vmwrite_bitmap); | |
5159 | case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: | |
5160 | case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: | |
5161 | case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: | |
5162 | case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: | |
5163 | case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: | |
5164 | /* | |
5165 | * VMX instructions trap unconditionally. This allows L1 to | |
5166 | * emulate them for its L2 guest, i.e., allows 3-level nesting! | |
5167 | */ | |
5168 | return true; | |
5169 | case EXIT_REASON_CR_ACCESS: | |
5170 | return nested_vmx_exit_handled_cr(vcpu, vmcs12); | |
5171 | case EXIT_REASON_DR_ACCESS: | |
5172 | return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); | |
5173 | case EXIT_REASON_IO_INSTRUCTION: | |
5174 | return nested_vmx_exit_handled_io(vcpu, vmcs12); | |
5175 | case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: | |
5176 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); | |
5177 | case EXIT_REASON_MSR_READ: | |
5178 | case EXIT_REASON_MSR_WRITE: | |
5179 | return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); | |
5180 | case EXIT_REASON_INVALID_STATE: | |
5181 | return true; | |
5182 | case EXIT_REASON_MWAIT_INSTRUCTION: | |
5183 | return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); | |
5184 | case EXIT_REASON_MONITOR_TRAP_FLAG: | |
5185 | return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); | |
5186 | case EXIT_REASON_MONITOR_INSTRUCTION: | |
5187 | return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); | |
5188 | case EXIT_REASON_PAUSE_INSTRUCTION: | |
5189 | return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || | |
5190 | nested_cpu_has2(vmcs12, | |
5191 | SECONDARY_EXEC_PAUSE_LOOP_EXITING); | |
5192 | case EXIT_REASON_MCE_DURING_VMENTRY: | |
5193 | return false; | |
5194 | case EXIT_REASON_TPR_BELOW_THRESHOLD: | |
5195 | return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); | |
5196 | case EXIT_REASON_APIC_ACCESS: | |
5197 | case EXIT_REASON_APIC_WRITE: | |
5198 | case EXIT_REASON_EOI_INDUCED: | |
5199 | /* | |
5200 | * The controls for "virtualize APIC accesses," "APIC- | |
5201 | * register virtualization," and "virtual-interrupt | |
5202 | * delivery" only come from vmcs12. | |
5203 | */ | |
5204 | return true; | |
5205 | case EXIT_REASON_EPT_VIOLATION: | |
5206 | /* | |
5207 | * L0 always deals with the EPT violation. If nested EPT is | |
5208 | * used, and the nested mmu code discovers that the address is | |
5209 | * missing in the guest EPT table (EPT12), the EPT violation | |
5210 | * will be injected with nested_ept_inject_page_fault() | |
5211 | */ | |
5212 | return false; | |
5213 | case EXIT_REASON_EPT_MISCONFIG: | |
5214 | /* | |
5215 | * L2 never directly uses L1's EPT, but rather L0's own EPT |
5216 | * table (shadow on EPT) or a merged EPT table that L0 built | |
5217 | * (EPT on EPT). So any problems with the structure of the | |
5218 | * table are L0's fault. |
5219 | */ | |
5220 | return false; | |
5221 | case EXIT_REASON_INVPCID: | |
5222 | return | |
5223 | nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && | |
5224 | nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); | |
5225 | case EXIT_REASON_WBINVD: | |
5226 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); | |
5227 | case EXIT_REASON_XSETBV: | |
5228 | return true; | |
5229 | case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: | |
5230 | /* | |
5231 | * This should never happen, since it is not possible to | |
5232 | * set XSS to a non-zero value---neither in L1 nor in L2. | |
5233 | * If it were, XSS would have to be checked against |
5234 | * the XSS exit bitmap in vmcs12. | |
5235 | */ | |
5236 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); | |
5237 | case EXIT_REASON_PREEMPTION_TIMER: | |
5238 | return false; | |
5239 | case EXIT_REASON_PML_FULL: | |
5240 | /* We emulate PML support to L1. */ | |
5241 | return false; | |
5242 | case EXIT_REASON_VMFUNC: | |
5243 | /* VM functions are emulated through L2->L0 vmexits. */ | |
5244 | return false; | |
5245 | case EXIT_REASON_ENCLS: | |
5246 | /* SGX is never exposed to L1 */ | |
5247 | return false; | |
5248 | default: | |
5249 | return true; | |
5250 | } | |
5251 | } | |
5252 | ||
5253 | ||
5254 | static int vmx_get_nested_state(struct kvm_vcpu *vcpu, | |
5255 | struct kvm_nested_state __user *user_kvm_nested_state, | |
5256 | u32 user_data_size) | |
5257 | { | |
5258 | struct vcpu_vmx *vmx; | |
5259 | struct vmcs12 *vmcs12; | |
5260 | struct kvm_nested_state kvm_state = { | |
5261 | .flags = 0, | |
5262 | .format = 0, | |
5263 | .size = sizeof(kvm_state), | |
5264 | .vmx.vmxon_pa = -1ull, | |
5265 | .vmx.vmcs_pa = -1ull, | |
5266 | }; | |
5267 | ||
5268 | if (!vcpu) | |
5269 | return kvm_state.size + 2 * VMCS12_SIZE; | |
5270 | ||
5271 | vmx = to_vmx(vcpu); | |
5272 | vmcs12 = get_vmcs12(vcpu); | |
5273 | ||
5274 | if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled) | |
5275 | kvm_state.flags |= KVM_STATE_NESTED_EVMCS; | |
5276 | ||
5277 | if (nested_vmx_allowed(vcpu) && | |
5278 | (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { | |
5279 | kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr; | |
5280 | kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr; | |
5281 | ||
5282 | if (vmx_has_valid_vmcs12(vcpu)) { | |
5283 | kvm_state.size += VMCS12_SIZE; | |
5284 | ||
5285 | if (is_guest_mode(vcpu) && | |
5286 | nested_cpu_has_shadow_vmcs(vmcs12) && | |
5287 | vmcs12->vmcs_link_pointer != -1ull) | |
5288 | kvm_state.size += VMCS12_SIZE; | |
5289 | } | |
5290 | ||
5291 | if (vmx->nested.smm.vmxon) | |
5292 | kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; | |
5293 | ||
5294 | if (vmx->nested.smm.guest_mode) | |
5295 | kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; | |
5296 | ||
5297 | if (is_guest_mode(vcpu)) { | |
5298 | kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; | |
5299 | ||
5300 | if (vmx->nested.nested_run_pending) | |
5301 | kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; | |
5302 | } | |
5303 | } | |
5304 | ||
5305 | if (user_data_size < kvm_state.size) | |
5306 | goto out; | |
5307 | ||
5308 | if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) | |
5309 | return -EFAULT; | |
5310 | ||
5311 | if (!vmx_has_valid_vmcs12(vcpu)) | |
5312 | goto out; | |
5313 | ||
5314 | /* | |
5315 | * When running L2, the authoritative vmcs12 state is in the | |
5316 | * vmcs02. When running L1, the authoritative vmcs12 state is | |
5317 | * in the shadow or enlightened vmcs linked to vmcs01, unless | |
5318 | * need_vmcs12_sync is set, in which case, the authoritative | |
5319 | * vmcs12 state is in the vmcs12 already. | |
5320 | */ | |
5321 | if (is_guest_mode(vcpu)) { | |
5322 | sync_vmcs12(vcpu, vmcs12); | |
5323 | } else if (!vmx->nested.need_vmcs12_sync) { | |
5324 | if (vmx->nested.hv_evmcs) | |
5325 | copy_enlightened_to_vmcs12(vmx); | |
5326 | else if (enable_shadow_vmcs) | |
5327 | copy_shadow_to_vmcs12(vmx); | |
5328 | } | |
5329 | ||
3a33d030 TR |
5330 | /* |
5331 | * Copy over the full allocated size of vmcs12 rather than just the size | |
5332 | * of the struct. | |
5333 | */ | |
5334 | if (copy_to_user(user_kvm_nested_state->data, vmcs12, VMCS12_SIZE)) | |
55d2375e SC |
5335 | return -EFAULT; |
5336 | ||
5337 | if (nested_cpu_has_shadow_vmcs(vmcs12) && | |
5338 | vmcs12->vmcs_link_pointer != -1ull) { | |
5339 | if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE, | |
3a33d030 | 5340 | get_shadow_vmcs12(vcpu), VMCS12_SIZE)) |
55d2375e SC |
5341 | return -EFAULT; |
5342 | } | |
5343 | ||
5344 | out: | |
5345 | return kvm_state.size; | |
5346 | } | |
5347 | ||
5348 | /* | |
5349 | * Forcibly leave nested mode in order to be able to reset the VCPU later on. | |
5350 | */ | |
5351 | void vmx_leave_nested(struct kvm_vcpu *vcpu) | |
5352 | { | |
5353 | if (is_guest_mode(vcpu)) { | |
5354 | to_vmx(vcpu)->nested.nested_run_pending = 0; | |
5355 | nested_vmx_vmexit(vcpu, -1, 0, 0); | |
5356 | } | |
5357 | free_nested(vcpu); | |
5358 | } | |
5359 | ||
5360 | static int vmx_set_nested_state(struct kvm_vcpu *vcpu, | |
5361 | struct kvm_nested_state __user *user_kvm_nested_state, | |
5362 | struct kvm_nested_state *kvm_state) | |
5363 | { | |
5364 | struct vcpu_vmx *vmx = to_vmx(vcpu); | |
5365 | struct vmcs12 *vmcs12; | |
5366 | u32 exit_qual; | |
5367 | int ret; | |
5368 | ||
5369 | if (kvm_state->format != 0) | |
5370 | return -EINVAL; | |
5371 | ||
5372 | if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) | |
5373 | nested_enable_evmcs(vcpu, NULL); | |
5374 | ||
5375 | if (!nested_vmx_allowed(vcpu)) | |
5376 | return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL; | |
5377 | ||
5378 | if (kvm_state->vmx.vmxon_pa == -1ull) { | |
5379 | if (kvm_state->vmx.smm.flags) | |
5380 | return -EINVAL; | |
5381 | ||
5382 | if (kvm_state->vmx.vmcs_pa != -1ull) | |
5383 | return -EINVAL; | |
5384 | ||
5385 | vmx_leave_nested(vcpu); | |
5386 | return 0; | |
5387 | } | |
5388 | ||
5389 | if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa)) | |
5390 | return -EINVAL; | |
5391 | ||
5392 | if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && | |
5393 | (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) | |
5394 | return -EINVAL; | |
5395 | ||
5396 | if (kvm_state->vmx.smm.flags & | |
5397 | ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) | |
5398 | return -EINVAL; | |
5399 | ||
5400 | /* | |
5401 | * SMM temporarily disables VMX, so we cannot be in guest mode, | |
5402 | * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags | |
5403 | * must be zero. | |
5404 | */ | |
5405 | if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags) | |
5406 | return -EINVAL; | |
5407 | ||
5408 | if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && | |
5409 | !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) | |
5410 | return -EINVAL; | |
5411 | ||
5412 | vmx_leave_nested(vcpu); | |
5413 | if (kvm_state->vmx.vmxon_pa == -1ull) | |
5414 | return 0; | |
5415 | ||
5416 | vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa; | |
5417 | ret = enter_vmx_operation(vcpu); | |
5418 | if (ret) | |
5419 | return ret; | |
5420 | ||
5421 | /* Empty 'VMXON' state is permitted */ | |
e8ab8d24 | 5422 | if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) |
55d2375e SC |
5423 | return 0; |
5424 | ||
5425 | if (kvm_state->vmx.vmcs_pa != -1ull) { | |
5426 | if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa || | |
5427 | !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa)) | |
5428 | return -EINVAL; | |
5429 | ||
5430 | set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa); | |
5431 | } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { | |
5432 | /* | |
5433 | * Sync eVMCS upon entry as we may not have | |
5434 | * HV_X64_MSR_VP_ASSIST_PAGE set up yet. | |
5435 | */ | |
5436 | vmx->nested.need_vmcs12_sync = true; | |
5437 | } else { | |
5438 | return -EINVAL; | |
5439 | } | |
5440 | ||
5441 | if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { | |
5442 | vmx->nested.smm.vmxon = true; | |
5443 | vmx->nested.vmxon = false; | |
5444 | ||
5445 | if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) | |
5446 | vmx->nested.smm.guest_mode = true; | |
5447 | } | |
5448 | ||
5449 | vmcs12 = get_vmcs12(vcpu); | |
5450 | if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12))) | |
5451 | return -EFAULT; | |
5452 | ||
5453 | if (vmcs12->hdr.revision_id != VMCS12_REVISION) | |
5454 | return -EINVAL; | |
5455 | ||
5456 | if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) | |
5457 | return 0; | |
5458 | ||
5459 | vmx->nested.nested_run_pending = | |
5460 | !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); | |
5461 | ||
5462 | if (nested_cpu_has_shadow_vmcs(vmcs12) && | |
5463 | vmcs12->vmcs_link_pointer != -1ull) { | |
5464 | struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); | |
5465 | ||
936a2fe9 | 5466 | if (kvm_state->size < sizeof(*kvm_state) + VMCS12_SIZE + sizeof(*vmcs12)) |
55d2375e SC |
5467 | return -EINVAL; |
5468 | ||
5469 | if (copy_from_user(shadow_vmcs12, | |
5470 | user_kvm_nested_state->data + VMCS12_SIZE, | |
5471 | sizeof(*vmcs12))) | |
5472 | return -EFAULT; | |
5473 | ||
5474 | if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || | |
5475 | !shadow_vmcs12->hdr.shadow_vmcs) | |
5476 | return -EINVAL; | |
5477 | } | |
5478 | ||
16322a3b KS |
5479 | if (nested_vmx_check_vmentry_prereqs(vcpu, vmcs12) || |
5480 | nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) | |
55d2375e SC |
5481 | return -EINVAL; |
5482 | ||
5483 | vmx->nested.dirty_vmcs12 = true; | |
5484 | ret = nested_vmx_enter_non_root_mode(vcpu, false); | |
5485 | if (ret) | |
5486 | return -EINVAL; | |
5487 | ||
5488 | return 0; | |
5489 | } | |
5490 | ||
5491 | void nested_vmx_vcpu_setup(void) | |
5492 | { | |
5493 | if (enable_shadow_vmcs) { | |
5494 | /* | |
5495 | * At vCPU creation, "VMWRITE to any supported field | |
5496 | * in the VMCS" is supported, so use the more | |
5497 | * permissive vmx_vmread_bitmap to specify both read | |
5498 | * and write permissions for the shadow VMCS. | |
5499 | */ | |
5500 | vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); | |
5501 | vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap)); | |
5502 | } | |
5503 | } | |
5504 | ||
5505 | /* | |
5506 | * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be | |
5507 | * returned for the various VMX controls MSRs when nested VMX is enabled. | |
5508 | * The same values should also be used to verify that vmcs12 control fields are | |
5509 | * valid during nested entry from L1 to L2. | |
5510 | * Each of these control msrs has a low and high 32-bit half: A low bit is on | |
5511 | * if the corresponding bit in the (32-bit) control field *must* be on, and a | |
5512 | * bit in the high half is on if the corresponding bit in the control field | |
5513 | * may be on. See also vmx_control_verify(). | |
5514 | */ | |
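A minimal standalone sketch of the check that vmx_control_verify() (referenced above but not shown in this excerpt) applies against these low/high halves; the helper name and exact form below are illustrative, and kernel-style types (u32, bool) are assumed:

	/*
	 * Illustrative only: a vmcs12 control value is acceptable iff every
	 * must-be-1 bit (low half) is set and no bit outside the may-be-1
	 * mask (high half) is set.
	 */
	static inline bool control_value_ok(u32 control, u32 low, u32 high)
	{
		return (control & low) == low && (control & ~high) == 0;
	}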
5515 | void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, | |
5516 | bool apicv) | |
5517 | { | |
5518 | /* | |
5519 | * Note that as a general rule, the high half of the MSRs (bits in | |
5520 | * the control fields which may be 1) should be initialized by the | |
5521 | * intersection of the underlying hardware's MSR (i.e., features which | |
5522 | * can be supported) and the list of features we want to expose - | |
5523 | * because they are known to be properly supported in our code. | |
5524 | * Also, usually, the low half of the MSRs (bits which must be 1) can | |
5525 | * be set to 0, meaning that L1 may turn off any of these bits. The | |
5526 | * reason is that if one of these bits is necessary, it will appear | |
5527 | * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control | |
5528 | * fields of vmcs01 and vmcs02, will turn these bits off - and | |
5529 | * nested_vmx_exit_reflected() will not pass related exits to L1. | |
5530 | * These rules have exceptions below. | |
5531 | */ | |
5532 | ||
5533 | /* pin-based controls */ | |
5534 | rdmsr(MSR_IA32_VMX_PINBASED_CTLS, | |
5535 | msrs->pinbased_ctls_low, | |
5536 | msrs->pinbased_ctls_high); | |
5537 | msrs->pinbased_ctls_low |= | |
5538 | PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; | |
5539 | msrs->pinbased_ctls_high &= | |
5540 | PIN_BASED_EXT_INTR_MASK | | |
5541 | PIN_BASED_NMI_EXITING | | |
5542 | PIN_BASED_VIRTUAL_NMIS | | |
5543 | (apicv ? PIN_BASED_POSTED_INTR : 0); | |
5544 | msrs->pinbased_ctls_high |= | |
5545 | PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | | |
5546 | PIN_BASED_VMX_PREEMPTION_TIMER; | |
5547 | ||
5548 | /* exit controls */ | |
5549 | rdmsr(MSR_IA32_VMX_EXIT_CTLS, | |
5550 | msrs->exit_ctls_low, | |
5551 | msrs->exit_ctls_high); | |
5552 | msrs->exit_ctls_low = | |
5553 | VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; | |
5554 | ||
5555 | msrs->exit_ctls_high &= | |
5556 | #ifdef CONFIG_X86_64 | |
5557 | VM_EXIT_HOST_ADDR_SPACE_SIZE | | |
5558 | #endif | |
5559 | VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; | |
5560 | msrs->exit_ctls_high |= | |
5561 | VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | | |
5562 | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | | |
5563 | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; | |
5564 | ||
5565 | /* We support free control of debug control saving. */ | |
5566 | msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; | |
5567 | ||
5568 | /* entry controls */ | |
5569 | rdmsr(MSR_IA32_VMX_ENTRY_CTLS, | |
5570 | msrs->entry_ctls_low, | |
5571 | msrs->entry_ctls_high); | |
5572 | msrs->entry_ctls_low = | |
5573 | VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; | |
5574 | msrs->entry_ctls_high &= | |
5575 | #ifdef CONFIG_X86_64 | |
5576 | VM_ENTRY_IA32E_MODE | | |
5577 | #endif | |
5578 | VM_ENTRY_LOAD_IA32_PAT; | |
5579 | msrs->entry_ctls_high |= | |
5580 | (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); | |
5581 | ||
5582 | /* We support free control of debug control loading. */ | |
5583 | msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; | |
5584 | ||
5585 | /* cpu-based controls */ | |
5586 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, | |
5587 | msrs->procbased_ctls_low, | |
5588 | msrs->procbased_ctls_high); | |
5589 | msrs->procbased_ctls_low = | |
5590 | CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; | |
5591 | msrs->procbased_ctls_high &= | |
5592 | CPU_BASED_VIRTUAL_INTR_PENDING | | |
5593 | CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | | |
5594 | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | | |
5595 | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | | |
5596 | CPU_BASED_CR3_STORE_EXITING | | |
5597 | #ifdef CONFIG_X86_64 | |
5598 | CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | | |
5599 | #endif | |
5600 | CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | | |
5601 | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | | |
5602 | CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | | |
5603 | CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | | |
5604 | CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | |
5605 | /* | |
5606 | * We can allow some features even when not supported by the | |
5607 | * hardware. For example, L1 can specify an MSR bitmap - and we | |
5608 | * can use it to avoid exits to L1 - even when L0 runs L2 | |
5609 | * without MSR bitmaps. | |
5610 | */ | |
5611 | msrs->procbased_ctls_high |= | |
5612 | CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | | |
5613 | CPU_BASED_USE_MSR_BITMAPS; | |
5614 | ||
5615 | /* We support free control of CR3 access interception. */ | |
5616 | msrs->procbased_ctls_low &= | |
5617 | ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); | |
5618 | ||
5619 | /* | |
5620 | * secondary cpu-based controls. Do not include those that | |
5621 | * depend on CPUID bits; they are added later by vmx_cpuid_update. |
5622 | */ | |
6b1971c6 VK |
5623 | if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) |
5624 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, | |
5625 | msrs->secondary_ctls_low, | |
5626 | msrs->secondary_ctls_high); | |
5627 | ||
55d2375e SC |
5628 | msrs->secondary_ctls_low = 0; |
5629 | msrs->secondary_ctls_high &= | |
5630 | SECONDARY_EXEC_DESC | | |
5631 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | | |
5632 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | |
5633 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | | |
5634 | SECONDARY_EXEC_WBINVD_EXITING; | |
5635 | ||
5636 | /* | |
5637 | * We can emulate "VMCS shadowing," even if the hardware | |
5638 | * doesn't support it. | |
5639 | */ | |
5640 | msrs->secondary_ctls_high |= | |
5641 | SECONDARY_EXEC_SHADOW_VMCS; | |
5642 | ||
5643 | if (enable_ept) { | |
5644 | /* nested EPT: emulate EPT also to L1 */ | |
5645 | msrs->secondary_ctls_high |= | |
5646 | SECONDARY_EXEC_ENABLE_EPT; | |
5647 | msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT | | |
5648 | VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; | |
5649 | if (cpu_has_vmx_ept_execute_only()) | |
5650 | msrs->ept_caps |= | |
5651 | VMX_EPT_EXECUTE_ONLY_BIT; | |
5652 | msrs->ept_caps &= ept_caps; | |
5653 | msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | | |
5654 | VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | | |
5655 | VMX_EPT_1GB_PAGE_BIT; | |
5656 | if (enable_ept_ad_bits) { | |
5657 | msrs->secondary_ctls_high |= | |
5658 | SECONDARY_EXEC_ENABLE_PML; | |
5659 | msrs->ept_caps |= VMX_EPT_AD_BIT; | |
5660 | } | |
5661 | } | |
5662 | ||
5663 | if (cpu_has_vmx_vmfunc()) { | |
5664 | msrs->secondary_ctls_high |= | |
5665 | SECONDARY_EXEC_ENABLE_VMFUNC; | |
5666 | /* | |
5667 | * Advertise EPTP switching unconditionally | |
5668 | * since we emulate it. | |
5669 | */ | |
5670 | if (enable_ept) | |
5671 | msrs->vmfunc_controls = | |
5672 | VMX_VMFUNC_EPTP_SWITCHING; | |
5673 | } | |
5674 | ||
5675 | /* | |
5676 | * Old versions of KVM use the single-context version without | |
5677 | * checking for support, so declare that it is supported even | |
5678 | * though it is treated as global context. The alternative - accepting | |
5679 | * the single-context invvpid without advertising it - would be worse. | |
5680 | */ | |
5681 | if (enable_vpid) { | |
5682 | msrs->secondary_ctls_high |= | |
5683 | SECONDARY_EXEC_ENABLE_VPID; | |
5684 | msrs->vpid_caps = VMX_VPID_INVVPID_BIT | | |
5685 | VMX_VPID_EXTENT_SUPPORTED_MASK; | |
5686 | } | |
5687 | ||
5688 | if (enable_unrestricted_guest) | |
5689 | msrs->secondary_ctls_high |= | |
5690 | SECONDARY_EXEC_UNRESTRICTED_GUEST; | |
5691 | ||
5692 | if (flexpriority_enabled) | |
5693 | msrs->secondary_ctls_high |= | |
5694 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | |
5695 | ||
5696 | /* miscellaneous data */ | |
5697 | rdmsr(MSR_IA32_VMX_MISC, | |
5698 | msrs->misc_low, | |
5699 | msrs->misc_high); | |
5700 | msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; | |
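| /* A rate of 5 means the emulated preemption timer counts down once per 2^5 = 32 TSC cycles. */ | |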
5701 | msrs->misc_low |= | |
5702 | MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | | |
5703 | VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | | |
5704 | VMX_MISC_ACTIVITY_HLT; | |
5705 | msrs->misc_high = 0; | |
5706 | ||
5707 | /* | |
5708 | * This MSR reports some information about VMX support. We | |
5709 | * should return information about the VMX we emulate for the | |
5710 | * guest, and the VMCS structure we give it - not about the | |
5711 | * VMX support of the underlying hardware. | |
5712 | */ | |
5713 | msrs->basic = | |
5714 | VMCS12_REVISION | | |
5715 | VMX_BASIC_TRUE_CTLS | | |
5716 | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | | |
5717 | (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); | |
5718 | ||
5719 | if (cpu_has_vmx_basic_inout()) | |
5720 | msrs->basic |= VMX_BASIC_INOUT; | |
5721 | ||
5722 | /* | |
5723 | * These MSRs specify bits which the guest must keep fixed on | |
5724 | * while L1 is in VMXON mode (in L1's root mode, or running an L2). | |
5725 | * We picked the standard core2 setting. | |
5726 | */ | |
5727 | #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) | |
5728 | #define VMXON_CR4_ALWAYSON X86_CR4_VMXE | |
5729 | msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; | |
5730 | msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; | |
5731 | ||
5732 | /* These MSRs specify bits which the guest must keep fixed off. */ | |
5733 | rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); | |
5734 | rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); | |
5735 | ||
5736 | /* highest index: VMX_PREEMPTION_TIMER_VALUE */ | |
5737 | msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; | |
5738 | } | |
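| /* | |
|  * The (low, high) control pairs filled in above follow the VMX | |
|  * capability MSR convention: bits set in "low" must be 1 in the | |
|  * corresponding vmcs12 control word, and bits clear in "high" must be | |
|  * 0. A minimal sketch of the consistency check, using an illustrative | |
|  * helper name (the in-tree helper is vmx_control_verify() in this file): | |
|  * | |
|  *   static inline bool nested_ctl_valid(u32 control, u32 low, u32 high) | |
|  *   { | |
|  *           return ((control & high) | low) == control; | |
|  *   } | |
|  */ | |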
5739 | ||
5740 | void nested_vmx_hardware_unsetup(void) | |
5741 | { | |
5742 | int i; | |
5743 | ||
5744 | if (enable_shadow_vmcs) { | |
5745 | for (i = 0; i < VMX_BITMAP_NR; i++) | |
5746 | free_page((unsigned long)vmx_bitmap[i]); | |
5747 | } | |
5748 | } | |
5749 | ||
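| /* | |
|  * One-time setup for nested VMX: allocate the VMREAD/VMWRITE shadow | |
|  * bitmaps (when shadow VMCS is usable) and install the exit handlers | |
|  * and kvm_x86_ops callbacks used for nested operation. | |
|  */ | |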
5750 | __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) | |
5751 | { | |
5752 | int i; | |
5753 | ||
2b27924b PB |
5754 | /* |
5755 | * Without EPT it is not possible to restore L1's CR3 and PDPTR on | |
5756 | * VMfail, because they are not available in vmcs01. Just always | |
5757 | * use hardware checks. | |
5758 | */ | |
5759 | if (!enable_ept) | |
5760 | nested_early_check = 1; | |
5761 | ||
55d2375e SC |
5762 | if (!cpu_has_vmx_shadow_vmcs()) |
5763 | enable_shadow_vmcs = 0; | |
5764 | if (enable_shadow_vmcs) { | |
5765 | for (i = 0; i < VMX_BITMAP_NR; i++) { | |
41836839 BG |
5766 | /* |
5767 | * The vmx_bitmap is not tied to a VM and so should | |
5768 | * not be charged to a memcg. | |
5769 | */ | |
55d2375e SC |
5770 | vmx_bitmap[i] = (unsigned long *) |
5771 | __get_free_page(GFP_KERNEL); | |
5772 | if (!vmx_bitmap[i]) { | |
5773 | nested_vmx_hardware_unsetup(); | |
5774 | return -ENOMEM; | |
5775 | } | |
5776 | } | |
5777 | ||
5778 | init_vmcs_shadow_fields(); | |
5779 | } | |
5780 | ||
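| /* Exit handlers for the VMX instructions emulated on behalf of L1. */ | |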
5781 | exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; | |
5782 | exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; | |
5783 | exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; | |
5784 | exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; | |
5785 | exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; | |
5786 | exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; | |
5787 | exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; | |
5788 | exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff; | |
5789 | exit_handlers[EXIT_REASON_VMON] = handle_vmon; | |
5790 | exit_handlers[EXIT_REASON_INVEPT] = handle_invept; | |
5791 | exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; | |
5792 | exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; | |
5793 | ||
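| /* Nested-specific callbacks exposed to the rest of KVM via kvm_x86_ops. */ | |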
5794 | kvm_x86_ops->check_nested_events = vmx_check_nested_events; | |
5795 | kvm_x86_ops->get_nested_state = vmx_get_nested_state; | |
5796 | kvm_x86_ops->set_nested_state = vmx_set_nested_state; | |
5797 | kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages; | |
5798 | kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs; | |
e2e871ab | 5799 | kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version; |
55d2375e SC |
5800 | |
5801 | return 0; | |
5802 | } |