]>
Commit | Line | Data |
---|---|---|
43aa3132 | 1 | // SPDX-License-Identifier: GPL-2.0-only |
8730046c S |
2 | /* |
3 | * X86 specific Hyper-V initialization code. | |
4 | * | |
5 | * Copyright (C) 2016, Microsoft, Inc. | |
6 | * | |
7 | * Author : K. Y. Srinivasan <kys@microsoft.com> | |
8730046c S |
8 | */ |
9 | ||
203a521b SS |
10 | #define pr_fmt(fmt) "Hyper-V: " fmt |
11 | ||
2f285f46 | 12 | #include <linux/efi.h> |
8730046c | 13 | #include <linux/types.h> |
a6c76bb0 | 14 | #include <linux/bitfield.h> |
6dc77fa5 | 15 | #include <linux/io.h> |
93286261 VK |
16 | #include <asm/apic.h> |
17 | #include <asm/desc.h> | |
7e8037b0 | 18 | #include <asm/e820/api.h> |
49d6a3c0 | 19 | #include <asm/sev.h> |
d5ace2a7 | 20 | #include <asm/ibt.h> |
8730046c | 21 | #include <asm/hypervisor.h> |
5a485803 | 22 | #include <asm/hyperv-tlfs.h> |
8730046c | 23 | #include <asm/mshyperv.h> |
a16be368 | 24 | #include <asm/idtentry.h> |
b1310355 | 25 | #include <asm/set_memory.h> |
dfe94d40 | 26 | #include <linux/kexec.h> |
8730046c S |
27 | #include <linux/version.h> |
28 | #include <linux/vmalloc.h> | |
29 | #include <linux/mm.h> | |
67071816 | 30 | #include <linux/hyperv.h> |
7415aea6 | 31 | #include <linux/slab.h> |
f3a99e76 | 32 | #include <linux/kernel.h> |
7415aea6 | 33 | #include <linux/cpuhotplug.h> |
05bd330a | 34 | #include <linux/syscore_ops.h> |
dd2cb348 | 35 | #include <clocksource/hyperv_timer.h> |
80f73c9f | 36 | #include <linux/highmem.h> |
8730046c | 37 | |
dfe94d40 | 38 | int hyperv_init_cpuhp; |
99a0f46a WL |
39 | u64 hv_current_partition_id = ~0ull; |
40 | EXPORT_SYMBOL_GPL(hv_current_partition_id); | |
dfe94d40 | 41 | |
fc53662f VK |
42 | void *hv_hypercall_pg; |
43 | EXPORT_SYMBOL_GPL(hv_hypercall_pg); | |
dee863b5 | 44 | |
e1878402 | 45 | union hv_ghcb * __percpu *hv_ghcb_pg; |
0cc4f6d9 | 46 | |
05bd330a DC |
47 | /* Storage to save the hypercall page temporarily for hibernation */ |
48 | static void *hv_hypercall_pg_saved; | |
49 | ||
a46d15cc VK |
50 | struct hv_vp_assist_page **hv_vp_assist_page; |
51 | EXPORT_SYMBOL_GPL(hv_vp_assist_page); | |
52 | ||
0cc4f6d9 TL |
53 | static int hyperv_init_ghcb(void) |
54 | { | |
55 | u64 ghcb_gpa; | |
56 | void *ghcb_va; | |
57 | void **ghcb_base; | |
58 | ||
e3131f1c | 59 | if (!ms_hyperv.paravisor_present || !hv_isolation_type_snp()) |
0cc4f6d9 TL |
60 | return 0; |
61 | ||
62 | if (!hv_ghcb_pg) | |
63 | return -EINVAL; | |
64 | ||
65 | /* | |
66 | * GHCB page is allocated by paravisor. The address | |
67 | * returned by MSR_AMD64_SEV_ES_GHCB is above shared | |
68 | * memory boundary and map it here. | |
69 | */ | |
70 | rdmsrl(MSR_AMD64_SEV_ES_GHCB, ghcb_gpa); | |
6afd9dc1 MK |
71 | |
72 | /* Mask out vTOM bit. ioremap_cache() maps decrypted */ | |
73 | ghcb_gpa &= ~ms_hyperv.shared_gpa_boundary; | |
74 | ghcb_va = (void *)ioremap_cache(ghcb_gpa, HV_HYP_PAGE_SIZE); | |
0cc4f6d9 TL |
75 | if (!ghcb_va) |
76 | return -ENOMEM; | |
77 | ||
78 | ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg); | |
79 | *ghcb_base = ghcb_va; | |
80 | ||
81 | return 0; | |
82 | } | |
83 | ||
7415aea6 VK |
84 | static int hv_cpu_init(unsigned int cpu) |
85 | { | |
e5d9b714 | 86 | union hv_vp_assist_msr_contents msr = { 0 }; |
68f2f2bc | 87 | struct hv_vp_assist_page **hvp; |
afca4d95 | 88 | int ret; |
7415aea6 | 89 | |
afca4d95 MK |
90 | ret = hv_common_cpu_init(cpu); |
91 | if (ret) | |
92 | return ret; | |
a3b74243 | 93 | |
a46d15cc VK |
94 | if (!hv_vp_assist_page) |
95 | return 0; | |
96 | ||
68f2f2bc | 97 | hvp = &hv_vp_assist_page[cpu]; |
ee681541 VK |
98 | if (hv_root_partition) { |
99 | /* | |
100 | * For root partition we get the hypervisor provided VP assist | |
101 | * page, instead of allocating a new page. | |
102 | */ | |
103 | rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); | |
104 | *hvp = memremap(msr.pfn << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT, | |
105 | PAGE_SIZE, MEMREMAP_WB); | |
106 | } else { | |
107 | /* | |
108 | * The VP assist page is an "overlay" page (see Hyper-V TLFS's | |
109 | * Section 5.2.1 "GPA Overlay Pages"). Here it must be zeroed | |
110 | * out to make sure we always write the EOI MSR in | |
111 | * hv_apic_eoi_write() *after* the EOI optimization is disabled | |
112 | * in hv_cpu_die(), otherwise a CPU may not be stopped in the | |
113 | * case of CPU offlining and the VM will hang. | |
114 | */ | |
b1310355 | 115 | if (!*hvp) { |
e5d9b714 | 116 | *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); |
b1310355 TL |
117 | |
118 | /* | |
119 | * Hyper-V should never specify a VM that is a Confidential | |
120 | * VM and also running in the root partition. Root partition | |
121 | * is blocked to run in Confidential VM. So only decrypt assist | |
122 | * page in non-root partition here. | |
123 | */ | |
e3131f1c | 124 | if (*hvp && !ms_hyperv.paravisor_present && hv_isolation_type_snp()) { |
b1310355 TL |
125 | WARN_ON_ONCE(set_memory_decrypted((unsigned long)(*hvp), 1)); |
126 | memset(*hvp, 0, PAGE_SIZE); | |
127 | } | |
128 | } | |
129 | ||
ee681541 VK |
130 | if (*hvp) |
131 | msr.pfn = vmalloc_to_pfn(*hvp); | |
132 | ||
133 | } | |
134 | if (!WARN_ON(!(*hvp))) { | |
135 | msr.enable = 1; | |
136 | wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); | |
a46d15cc VK |
137 | } |
138 | ||
0cc4f6d9 | 139 | return hyperv_init_ghcb(); |
7415aea6 VK |
140 | } |
141 | ||
93286261 VK |
142 | static void (*hv_reenlightenment_cb)(void); |
143 | ||
144 | static void hv_reenlightenment_notify(struct work_struct *dummy) | |
145 | { | |
146 | struct hv_tsc_emulation_status emu_status; | |
147 | ||
148 | rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); | |
149 | ||
150 | /* Don't issue the callback if TSC accesses are not emulated */ | |
151 | if (hv_reenlightenment_cb && emu_status.inprogress) | |
152 | hv_reenlightenment_cb(); | |
153 | } | |
154 | static DECLARE_DELAYED_WORK(hv_reenlightenment_work, hv_reenlightenment_notify); | |
155 | ||
156 | void hyperv_stop_tsc_emulation(void) | |
157 | { | |
158 | u64 freq; | |
159 | struct hv_tsc_emulation_status emu_status; | |
160 | ||
161 | rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); | |
162 | emu_status.inprogress = 0; | |
163 | wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); | |
164 | ||
165 | rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); | |
166 | tsc_khz = div64_u64(freq, 1000); | |
167 | } | |
168 | EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation); | |
169 | ||
170 | static inline bool hv_reenlightenment_available(void) | |
171 | { | |
172 | /* | |
d9f6e12f | 173 | * Check for required features and privileges to make TSC frequency |
93286261 VK |
174 | * change notifications work. |
175 | */ | |
dfc53baa | 176 | return ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && |
93286261 | 177 | ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE && |
dfc53baa | 178 | ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT; |
93286261 VK |
179 | } |
180 | ||
a16be368 | 181 | DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_reenlightenment) |
93286261 | 182 | { |
670c04ad | 183 | apic_eoi(); |
51d4e5da | 184 | inc_irq_stat(irq_hv_reenlightenment_count); |
93286261 | 185 | schedule_delayed_work(&hv_reenlightenment_work, HZ/10); |
93286261 VK |
186 | } |
187 | ||
188 | void set_hv_tscchange_cb(void (*cb)(void)) | |
189 | { | |
190 | struct hv_reenlightenment_control re_ctrl = { | |
191 | .vector = HYPERV_REENLIGHTENMENT_VECTOR, | |
192 | .enabled = 1, | |
93286261 VK |
193 | }; |
194 | struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1}; | |
195 | ||
196 | if (!hv_reenlightenment_available()) { | |
203a521b | 197 | pr_warn("reenlightenment support is unavailable\n"); |
93286261 VK |
198 | return; |
199 | } | |
200 | ||
daf97211 SC |
201 | if (!hv_vp_index) |
202 | return; | |
203 | ||
93286261 VK |
204 | hv_reenlightenment_cb = cb; |
205 | ||
206 | /* Make sure callback is registered before we write to MSRs */ | |
207 | wmb(); | |
208 | ||
285f68af VK |
209 | re_ctrl.target_vp = hv_vp_index[get_cpu()]; |
210 | ||
93286261 VK |
211 | wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); |
212 | wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl)); | |
285f68af VK |
213 | |
214 | put_cpu(); | |
93286261 VK |
215 | } |
216 | EXPORT_SYMBOL_GPL(set_hv_tscchange_cb); | |
217 | ||
218 | void clear_hv_tscchange_cb(void) | |
219 | { | |
220 | struct hv_reenlightenment_control re_ctrl; | |
221 | ||
222 | if (!hv_reenlightenment_available()) | |
223 | return; | |
224 | ||
225 | rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); | |
226 | re_ctrl.enabled = 0; | |
227 | wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); | |
228 | ||
229 | hv_reenlightenment_cb = NULL; | |
230 | } | |
231 | EXPORT_SYMBOL_GPL(clear_hv_tscchange_cb); | |
232 | ||
e7c4e36c VK |
233 | static int hv_cpu_die(unsigned int cpu) |
234 | { | |
235 | struct hv_reenlightenment_control re_ctrl; | |
236 | unsigned int new_cpu; | |
0cc4f6d9 TL |
237 | void **ghcb_va; |
238 | ||
239 | if (hv_ghcb_pg) { | |
240 | ghcb_va = (void **)this_cpu_ptr(hv_ghcb_pg); | |
241 | if (*ghcb_va) | |
6afd9dc1 | 242 | iounmap(*ghcb_va); |
0cc4f6d9 TL |
243 | *ghcb_va = NULL; |
244 | } | |
68bb7bfb | 245 | |
afca4d95 | 246 | hv_common_cpu_die(cpu); |
e7c4e36c | 247 | |
e5d9b714 PK |
248 | if (hv_vp_assist_page && hv_vp_assist_page[cpu]) { |
249 | union hv_vp_assist_msr_contents msr = { 0 }; | |
250 | if (hv_root_partition) { | |
251 | /* | |
252 | * For root partition the VP assist page is mapped to | |
253 | * hypervisor provided page, and thus we unmap the | |
254 | * page here and nullify it, so that in future we have | |
255 | * correct page address mapped in hv_cpu_init. | |
256 | */ | |
257 | memunmap(hv_vp_assist_page[cpu]); | |
258 | hv_vp_assist_page[cpu] = NULL; | |
259 | rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); | |
260 | msr.enable = 0; | |
261 | } | |
262 | wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); | |
263 | } | |
a46d15cc | 264 | |
e7c4e36c VK |
265 | if (hv_reenlightenment_cb == NULL) |
266 | return 0; | |
267 | ||
268 | rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); | |
269 | if (re_ctrl.target_vp == hv_vp_index[cpu]) { | |
38dce419 VK |
270 | /* |
271 | * Reassign reenlightenment notifications to some other online | |
272 | * CPU or just disable the feature if there are no online CPUs | |
273 | * left (happens on hibernation). | |
274 | */ | |
e7c4e36c VK |
275 | new_cpu = cpumask_any_but(cpu_online_mask, cpu); |
276 | ||
38dce419 VK |
277 | if (new_cpu < nr_cpu_ids) |
278 | re_ctrl.target_vp = hv_vp_index[new_cpu]; | |
279 | else | |
280 | re_ctrl.enabled = 0; | |
281 | ||
e7c4e36c VK |
282 | wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); |
283 | } | |
284 | ||
285 | return 0; | |
286 | } | |
287 | ||
2f285f46 DC |
288 | static int __init hv_pci_init(void) |
289 | { | |
7e8037b0 | 290 | bool gen2vm = efi_enabled(EFI_BOOT); |
2f285f46 DC |
291 | |
292 | /* | |
7e8037b0 SS |
293 | * A Generation-2 VM doesn't support legacy PCI/PCIe, so both |
294 | * raw_pci_ops and raw_pci_ext_ops are NULL, and pci_subsys_init() -> | |
295 | * pcibios_init() doesn't call pcibios_resource_survey() -> | |
296 | * e820__reserve_resources_late(); as a result, any emulated persistent | |
297 | * memory of E820_TYPE_PRAM (12) via the kernel parameter | |
298 | * memmap=nn[KMG]!ss is not added into iomem_resource and hence can't be | |
299 | * detected by register_e820_pmem(). Fix this by directly calling | |
300 | * e820__reserve_resources_late() here: e820__reserve_resources_late() | |
301 | * depends on e820__reserve_resources(), which has been called earlier | |
302 | * from setup_arch(). Note: e820__reserve_resources_late() also adds | |
303 | * any memory of E820_TYPE_PMEM (7) into iomem_resource, and | |
304 | * acpi_nfit_register_region() -> acpi_nfit_insert_resource() -> | |
305 | * region_intersects() returns REGION_INTERSECTS, so the memory of | |
306 | * E820_TYPE_PMEM won't get added twice. | |
307 | * | |
308 | * We return 0 here so that pci_arch_init() won't print the warning: | |
2f285f46 DC |
309 | * "PCI: Fatal: No config space access function found" |
310 | */ | |
7e8037b0 SS |
311 | if (gen2vm) { |
312 | e820__reserve_resources_late(); | |
2f285f46 | 313 | return 0; |
7e8037b0 | 314 | } |
2f285f46 DC |
315 | |
316 | /* For Generation-1 VM, we'll proceed in pci_arch_init(). */ | |
317 | return 1; | |
318 | } | |
319 | ||
05bd330a DC |
320 | static int hv_suspend(void) |
321 | { | |
322 | union hv_x64_msr_hypercall_contents hypercall_msr; | |
421f090c | 323 | int ret; |
05bd330a | 324 | |
80f73c9f WL |
325 | if (hv_root_partition) |
326 | return -EPERM; | |
327 | ||
05bd330a DC |
328 | /* |
329 | * Reset the hypercall page as it is going to be invalidated | |
d9f6e12f | 330 | * across hibernation. Setting hv_hypercall_pg to NULL ensures |
05bd330a DC |
331 | * that any subsequent hypercall operation fails safely instead of |
332 | * crashing due to an access of an invalid page. The hypercall page | |
333 | * pointer is restored on resume. | |
334 | */ | |
335 | hv_hypercall_pg_saved = hv_hypercall_pg; | |
336 | hv_hypercall_pg = NULL; | |
337 | ||
338 | /* Disable the hypercall page in the hypervisor */ | |
339 | rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); | |
340 | hypercall_msr.enable = 0; | |
341 | wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); | |
342 | ||
421f090c DC |
343 | ret = hv_cpu_die(0); |
344 | return ret; | |
05bd330a DC |
345 | } |
346 | ||
347 | static void hv_resume(void) | |
348 | { | |
349 | union hv_x64_msr_hypercall_contents hypercall_msr; | |
421f090c DC |
350 | int ret; |
351 | ||
352 | ret = hv_cpu_init(0); | |
353 | WARN_ON(ret); | |
05bd330a DC |
354 | |
355 | /* Re-enable the hypercall page */ | |
356 | rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); | |
357 | hypercall_msr.enable = 1; | |
358 | hypercall_msr.guest_physical_address = | |
359 | vmalloc_to_pfn(hv_hypercall_pg_saved); | |
360 | wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); | |
361 | ||
362 | hv_hypercall_pg = hv_hypercall_pg_saved; | |
363 | hv_hypercall_pg_saved = NULL; | |
38dce419 VK |
364 | |
365 | /* | |
366 | * Reenlightenment notifications are disabled by hv_cpu_die(0), | |
367 | * reenable them here if hv_reenlightenment_cb was previously set. | |
368 | */ | |
369 | if (hv_reenlightenment_cb) | |
370 | set_hv_tscchange_cb(hv_reenlightenment_cb); | |
05bd330a DC |
371 | } |
372 | ||
421f090c | 373 | /* Note: when the ops are called, only CPU0 is online and IRQs are disabled. */ |
05bd330a DC |
374 | static struct syscore_ops hv_syscore_ops = { |
375 | .suspend = hv_suspend, | |
376 | .resume = hv_resume, | |
377 | }; | |
378 | ||
fff7b5e6 DC |
379 | static void (* __initdata old_setup_percpu_clockev)(void); |
380 | ||
381 | static void __init hv_stimer_setup_percpu_clockev(void) | |
382 | { | |
383 | /* | |
384 | * Ignore any errors in setting up stimer clockevents | |
385 | * as we can run with the LAPIC timer as a fallback. | |
386 | */ | |
ec866be6 | 387 | (void)hv_stimer_alloc(false); |
fff7b5e6 DC |
388 | |
389 | /* | |
390 | * Still register the LAPIC timer, because the direct-mode STIMER is | |
391 | * not supported by old versions of Hyper-V. This also allows users | |
392 | * to switch to LAPIC timer via /sys, if they want to. | |
393 | */ | |
394 | if (old_setup_percpu_clockev) | |
395 | old_setup_percpu_clockev(); | |
396 | } | |
397 | ||
99a0f46a WL |
398 | static void __init hv_get_partition_id(void) |
399 | { | |
400 | struct hv_get_partition_id *output_page; | |
401 | u64 status; | |
402 | unsigned long flags; | |
403 | ||
404 | local_irq_save(flags); | |
405 | output_page = *this_cpu_ptr(hyperv_pcpu_output_arg); | |
406 | status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output_page); | |
753ed9c9 | 407 | if (!hv_result_success(status)) { |
99a0f46a WL |
408 | /* No point in proceeding if this failed */ |
409 | pr_err("Failed to get partition ID: %lld\n", status); | |
410 | BUG(); | |
411 | } | |
412 | hv_current_partition_id = output_page->partition_id; | |
413 | local_irq_restore(flags); | |
414 | } | |
415 | ||
f2a55d08 | 416 | #if IS_ENABLED(CONFIG_HYPERV_VTL_MODE) |
8387ce06 TL |
417 | static u8 __init get_vtl(void) |
418 | { | |
419 | u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS; | |
420 | struct hv_get_vp_registers_input *input; | |
421 | struct hv_get_vp_registers_output *output; | |
422 | unsigned long flags; | |
423 | u64 ret; | |
424 | ||
425 | local_irq_save(flags); | |
426 | input = *this_cpu_ptr(hyperv_pcpu_input_arg); | |
427 | output = (struct hv_get_vp_registers_output *)input; | |
428 | ||
429 | memset(input, 0, struct_size(input, element, 1)); | |
430 | input->header.partitionid = HV_PARTITION_ID_SELF; | |
431 | input->header.vpindex = HV_VP_INDEX_SELF; | |
432 | input->header.inputvtl = 0; | |
433 | input->element[0].name0 = HV_X64_REGISTER_VSM_VP_STATUS; | |
434 | ||
435 | ret = hv_do_hypercall(control, input, output); | |
436 | if (hv_result_success(ret)) { | |
437 | ret = output->as64.low & HV_X64_VTL_MASK; | |
438 | } else { | |
f2a55d08 SS |
439 | pr_err("Failed to get VTL(error: %lld) exiting...\n", ret); |
440 | BUG(); | |
8387ce06 TL |
441 | } |
442 | ||
443 | local_irq_restore(flags); | |
444 | return ret; | |
445 | } | |
f2a55d08 SS |
446 | #else |
447 | static inline u8 get_vtl(void) { return 0; } | |
448 | #endif | |
8387ce06 | 449 | |
8730046c S |
450 | /* |
451 | * This function is to be invoked early in the boot sequence after the | |
452 | * hypervisor has been detected. | |
453 | * | |
454 | * 1. Setup the hypercall page. | |
63ed4e0c | 455 | * 2. Register Hyper-V specific clocksource. |
6b48cb5f | 456 | * 3. Setup Hyper-V specific APIC entry points. |
8730046c | 457 | */ |
6b48cb5f | 458 | void __init hyperv_init(void) |
8730046c | 459 | { |
f3e613e7 | 460 | u64 guest_id; |
8730046c | 461 | union hv_x64_msr_hypercall_contents hypercall_msr; |
afca4d95 | 462 | int cpuhp; |
8730046c | 463 | |
03b2a320 | 464 | if (x86_hyper_type != X86_HYPER_MS_HYPERV) |
8730046c S |
465 | return; |
466 | ||
afca4d95 | 467 | if (hv_common_init()) |
7415aea6 VK |
468 | return; |
469 | ||
68f2f2bc DC |
470 | /* |
471 | * The VP assist page is useless to a TDX guest: the only use we | |
472 | * would have for it is lazy EOI, which can not be used with TDX. | |
473 | */ | |
474 | if (hv_isolation_type_tdx()) | |
475 | hv_vp_assist_page = NULL; | |
476 | else | |
477 | hv_vp_assist_page = kcalloc(num_possible_cpus(), | |
478 | sizeof(*hv_vp_assist_page), | |
479 | GFP_KERNEL); | |
a46d15cc VK |
480 | if (!hv_vp_assist_page) { |
481 | ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; | |
68f2f2bc DC |
482 | |
483 | if (!hv_isolation_type_tdx()) | |
484 | goto common_free; | |
a46d15cc VK |
485 | } |
486 | ||
e3131f1c | 487 | if (ms_hyperv.paravisor_present && hv_isolation_type_snp()) { |
49d6a3c0 TL |
488 | /* Negotiate GHCB Version. */ |
489 | if (!hv_ghcb_negotiate_protocol()) | |
490 | hv_ghcb_terminate(SEV_TERM_SET_GEN, | |
491 | GHCB_SEV_ES_PROT_UNSUPPORTED); | |
492 | ||
faff4406 | 493 | hv_ghcb_pg = alloc_percpu(union hv_ghcb *); |
0cc4f6d9 TL |
494 | if (!hv_ghcb_pg) |
495 | goto free_vp_assist_page; | |
496 | } | |
497 | ||
9636be85 | 498 | cpuhp = cpuhp_setup_state(CPUHP_AP_HYPERV_ONLINE, "x86/hyperv_init:online", |
a46d15cc VK |
499 | hv_cpu_init, hv_cpu_die); |
500 | if (cpuhp < 0) | |
0cc4f6d9 | 501 | goto free_ghcb_page; |
7415aea6 | 502 | |
8730046c S |
503 | /* |
504 | * Setup the hypercall page and enable hypercalls. | |
505 | * 1. Register the guest ID | |
506 | * 2. Enable the hypercall and register the hypercall page | |
23378295 DC |
507 | * |
508 | * A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg: | |
509 | * when the hypercall input is a page, such a VM must pass a decrypted | |
510 | * page to Hyper-V, e.g. hv_post_message() uses the per-CPU page | |
511 | * hyperv_pcpu_input_arg, which is decrypted if no paravisor is present. | |
512 | * | |
513 | * A TDX VM with the paravisor uses hv_hypercall_pg for most hypercalls, | |
514 | * which are handled by the paravisor and the VM must use an encrypted | |
515 | * input page: in such a VM, the hyperv_pcpu_input_arg is encrypted and | |
516 | * used in the hypercalls, e.g. see hv_mark_gpa_visibility() and | |
517 | * hv_arch_irq_unmask(). Such a VM uses TDX GHCI for two hypercalls: | |
518 | * 1. HVCALL_SIGNAL_EVENT: see vmbus_set_event() and _hv_do_fast_hypercall8(). | |
519 | * 2. HVCALL_POST_MESSAGE: the input page must be a decrypted page, i.e. | |
520 | * hv_post_message() in such a VM can't use the encrypted hyperv_pcpu_input_arg; | |
521 | * instead, hv_post_message() uses the post_msg_page, which is decrypted | |
522 | * in such a VM and is only used in such a VM. | |
8730046c | 523 | */ |
d5ebde1e | 524 | guest_id = hv_generate_guest_id(LINUX_VERSION_CODE); |
8730046c S |
525 | wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); |
526 | ||
b9b4fe3a DC |
527 | /* With the paravisor, the VM must also write the ID via GHCB/GHCI */ |
528 | hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id); | |
faff4406 | 529 | |
23378295 DC |
530 | /* A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg */ |
531 | if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present) | |
d6e0228d | 532 | goto skip_hypercall_pg_init; |
faff4406 | 533 | |
800e26b8 CH |
534 | hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, |
535 | VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX, | |
a3a66c38 CH |
536 | VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, |
537 | __builtin_return_address(0)); | |
0cc4f6d9 TL |
538 | if (hv_hypercall_pg == NULL) |
539 | goto clean_guest_os_id; | |
8730046c S |
540 | |
541 | rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); | |
542 | hypercall_msr.enable = 1; | |
80f73c9f WL |
543 | |
544 | if (hv_root_partition) { | |
545 | struct page *pg; | |
03b9a6e1 | 546 | void *src; |
80f73c9f WL |
547 | |
548 | /* | |
549 | * For the root partition, the hypervisor will set up its | |
550 | * hypercall page. The hypervisor guarantees it will not show | |
551 | * up in the root's address space. The root can't change the | |
552 | * location of the hypercall page. | |
553 | * | |
554 | * Order is important here. We must enable the hypercall page | |
555 | * so it is populated with code, then copy the code to an | |
556 | * executable page. | |
557 | */ | |
558 | wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); | |
559 | ||
560 | pg = vmalloc_to_page(hv_hypercall_pg); | |
80f73c9f WL |
561 | src = memremap(hypercall_msr.guest_physical_address << PAGE_SHIFT, PAGE_SIZE, |
562 | MEMREMAP_WB); | |
03b9a6e1 ZL |
563 | BUG_ON(!src); |
564 | memcpy_to_page(pg, 0, src, HV_HYP_PAGE_SIZE); | |
80f73c9f | 565 | memunmap(src); |
0408f16b SK |
566 | |
567 | hv_remap_tsc_clocksource(); | |
80f73c9f WL |
568 | } else { |
569 | hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); | |
570 | wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); | |
571 | } | |
63ed4e0c | 572 | |
d6e0228d | 573 | skip_hypercall_pg_init: |
d5ace2a7 MK |
574 | /* |
575 | * Some versions of Hyper-V that provide IBT in guest VMs have a bug | |
576 | * in that there's no ENDBR64 instruction at the entry to the | |
577 | * hypercall page. Because hypercalls are invoked via an indirect call | |
578 | * to the hypercall page, all hypercall attempts fail when IBT is | |
579 | * enabled, and Linux panics. For such buggy versions, disable IBT. | |
580 | * | |
581 | * Fixed versions of Hyper-V always provide ENDBR64 on the hypercall | |
582 | * page, so if future Linux kernel versions enable IBT for 32-bit | |
583 | * builds, additional hypercall page hackery will be required here | |
584 | * to provide an ENDBR32. | |
585 | */ | |
586 | #ifdef CONFIG_X86_KERNEL_IBT | |
587 | if (cpu_feature_enabled(X86_FEATURE_IBT) && | |
588 | *(u32 *)hv_hypercall_pg != gen_endbr()) { | |
589 | setup_clear_cpu_cap(X86_FEATURE_IBT); | |
203a521b | 590 | pr_warn("Disabling IBT because of Hyper-V bug\n"); |
d5ace2a7 MK |
591 | } |
592 | #endif | |
593 | ||
4df4cb9e | 594 | /* |
fff7b5e6 DC |
595 | * hyperv_init() is called before LAPIC is initialized: see |
596 | * apic_intr_mode_init() -> x86_platform.apic_post_init() and | |
597 | * apic_bsp_setup() -> setup_local_APIC(). The direct-mode STIMER | |
598 | * depends on LAPIC, so hv_stimer_alloc() should be called from | |
599 | * x86_init.timers.setup_percpu_clockev. | |
4df4cb9e | 600 | */ |
fff7b5e6 DC |
601 | old_setup_percpu_clockev = x86_init.timers.setup_percpu_clockev; |
602 | x86_init.timers.setup_percpu_clockev = hv_stimer_setup_percpu_clockev; | |
4df4cb9e | 603 | |
6b48cb5f S |
604 | hv_apic_init(); |
605 | ||
2f285f46 DC |
606 | x86_init.pci.arch_init = hv_pci_init; |
607 | ||
05bd330a DC |
608 | register_syscore_ops(&hv_syscore_ops); |
609 | ||
dfe94d40 | 610 | hyperv_init_cpuhp = cpuhp; |
99a0f46a WL |
611 | |
612 | if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID) | |
613 | hv_get_partition_id(); | |
614 | ||
615 | BUG_ON(hv_root_partition && hv_current_partition_id == ~0ull); | |
616 | ||
e39397d1 WL |
617 | #ifdef CONFIG_PCI_MSI |
618 | /* | |
619 | * If we're running as root, we want to create our own PCI MSI domain. | |
620 | * We can't set this in hv_pci_init because that would be too late. | |
621 | */ | |
622 | if (hv_root_partition) | |
623 | x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain; | |
624 | #endif | |
625 | ||
6dc2a774 SM |
626 | /* Query the VMs extended capability once, so that it can be cached. */ |
627 | hv_query_ext_cap(0); | |
062a5c42 | 628 | |
8387ce06 | 629 | /* Find the VTL */ |
f2a55d08 | 630 | ms_hyperv.vtl = get_vtl(); |
8387ce06 | 631 | |
14058f72 SS |
632 | if (ms_hyperv.vtl > 0) /* non default VTL */ |
633 | hv_vtl_early_init(); | |
634 | ||
7415aea6 VK |
635 | return; |
636 | ||
0cc4f6d9 TL |
637 | clean_guest_os_id: |
638 | wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); | |
b9b4fe3a | 639 | hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); |
a46d15cc | 640 | cpuhp_remove_state(cpuhp); |
0cc4f6d9 TL |
641 | free_ghcb_page: |
642 | free_percpu(hv_ghcb_pg); | |
a46d15cc VK |
643 | free_vp_assist_page: |
644 | kfree(hv_vp_assist_page); | |
645 | hv_vp_assist_page = NULL; | |
afca4d95 MK |
646 | common_free: |
647 | hv_common_free(); | |
8730046c | 648 | } |
6ab42a66 | 649 | |
d6f3609d VK |
650 | /* |
651 | * This routine is called before kexec/kdump, it does the required cleanup. | |
652 | */ | |
653 | void hyperv_cleanup(void) | |
654 | { | |
655 | union hv_x64_msr_hypercall_contents hypercall_msr; | |
2982635a | 656 | union hv_reference_tsc_msr tsc_msr; |
d6f3609d VK |
657 | |
658 | /* Reset our OS id */ | |
659 | wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); | |
b9b4fe3a | 660 | hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); |
d6f3609d | 661 | |
179fb36a KS |
662 | /* |
663 | * Reset hypercall page reference before reset the page, | |
664 | * let hypercall operations fail safely rather than | |
665 | * panic the kernel for using invalid hypercall page | |
666 | */ | |
667 | hv_hypercall_pg = NULL; | |
668 | ||
d6f3609d | 669 | /* Reset the hypercall page */ |
2982635a AR |
670 | hypercall_msr.as_uint64 = hv_get_register(HV_X64_MSR_HYPERCALL); |
671 | hypercall_msr.enable = 0; | |
672 | hv_set_register(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); | |
5647dbf8 VK |
673 | |
674 | /* Reset the TSC page */ | |
2982635a AR |
675 | tsc_msr.as_uint64 = hv_get_register(HV_X64_MSR_REFERENCE_TSC); |
676 | tsc_msr.enable = 0; | |
677 | hv_set_register(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); | |
d6f3609d | 678 | } |
d6f3609d | 679 | |
f3a99e76 | 680 | void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die) |
d058fa7e S |
681 | { |
682 | static bool panic_reported; | |
7ed4325a | 683 | u64 guest_id; |
d058fa7e | 684 | |
f3a99e76 TL |
685 | if (in_die && !panic_on_oops) |
686 | return; | |
687 | ||
d058fa7e S |
688 | /* |
689 | * We prefer to report panic on 'die' chain as we have proper | |
690 | * registers to report, but if we miss it (e.g. on BUG()) we need | |
691 | * to report it on 'panic'. | |
692 | */ | |
693 | if (panic_reported) | |
694 | return; | |
695 | panic_reported = true; | |
696 | ||
7ed4325a S |
697 | rdmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); |
698 | ||
699 | wrmsrl(HV_X64_MSR_CRASH_P0, err); | |
700 | wrmsrl(HV_X64_MSR_CRASH_P1, guest_id); | |
701 | wrmsrl(HV_X64_MSR_CRASH_P2, regs->ip); | |
702 | wrmsrl(HV_X64_MSR_CRASH_P3, regs->ax); | |
703 | wrmsrl(HV_X64_MSR_CRASH_P4, regs->sp); | |
d058fa7e S |
704 | |
705 | /* | |
706 | * Let Hyper-V know there is crash data available | |
707 | */ | |
708 | wrmsrl(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY); | |
709 | } | |
710 | EXPORT_SYMBOL_GPL(hyperv_report_panic); | |
73638cdd | 711 | |
4a5f3cde | 712 | bool hv_is_hyperv_initialized(void) |
73638cdd S |
713 | { |
714 | union hv_x64_msr_hypercall_contents hypercall_msr; | |
715 | ||
4a5f3cde MK |
716 | /* |
717 | * Ensure that we're really on Hyper-V, and not a KVM or Xen | |
718 | * emulation of Hyper-V | |
719 | */ | |
720 | if (x86_hyper_type != X86_HYPER_MS_HYPERV) | |
721 | return false; | |
722 | ||
d3a9d7e4 DC |
723 | /* A TDX VM with no paravisor uses TDX GHCI call rather than hv_hypercall_pg */ |
724 | if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present) | |
d6e0228d | 725 | return true; |
4a5f3cde MK |
726 | /* |
727 | * Verify that earlier initialization succeeded by checking | |
728 | * that the hypercall page is setup | |
729 | */ | |
73638cdd S |
730 | hypercall_msr.as_uint64 = 0; |
731 | rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); | |
732 | ||
4a5f3cde | 733 | return hypercall_msr.enable; |
73638cdd | 734 | } |
4a5f3cde | 735 | EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized); |