arch/x86/coco/tdx/tdx.c
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (C) 2021-2022 Intel Corporation */
3
4 #undef pr_fmt
5 #define pr_fmt(fmt) "tdx: " fmt
6
7 #include <linux/cpufeature.h>
8 #include <linux/export.h>
9 #include <linux/io.h>
10 #include <asm/coco.h>
11 #include <asm/tdx.h>
12 #include <asm/vmx.h>
13 #include <asm/insn.h>
14 #include <asm/insn-eval.h>
15 #include <asm/pgtable.h>
16
17 /* MMIO direction */
18 #define EPT_READ 0
19 #define EPT_WRITE 1
20
21 /* Port I/O direction */
22 #define PORT_READ 0
23 #define PORT_WRITE 1
24
25 /* See Exit Qualification for I/O Instructions in VMX documentation */
26 #define VE_IS_IO_IN(e) ((e) & BIT(3))
27 #define VE_GET_IO_SIZE(e) (((e) & GENMASK(2, 0)) + 1)
28 #define VE_GET_PORT_NUM(e) ((e) >> 16)
29 #define VE_IS_IO_STRING(e) ((e) & BIT(4))
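
/*
 * Editor's note: a worked decode example for the helpers above (illustrative
 * only; 0x03f80008 is a hypothetical exit qualification for a 1-byte IN from
 * port 0x3f8):
 *
 *	VE_GET_IO_SIZE(0x03f80008)  == 1      (bits 2:0 are 0, so 1 byte)
 *	VE_IS_IO_IN(0x03f80008)     != 0      (bit 3 set, IN direction)
 *	VE_IS_IO_STRING(0x03f80008) == 0      (not INS/OUTS)
 *	VE_GET_PORT_NUM(0x03f80008) == 0x3f8  (bits 31:16)
 */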
30
31 #define ATTR_DEBUG BIT(0)
32 #define ATTR_SEPT_VE_DISABLE BIT(28)
33
34 /* TDX Module call error codes */
35 #define TDCALL_RETURN_CODE(a) ((a) >> 32)
36 #define TDCALL_INVALID_OPERAND 0xc0000100
37
38 #define TDREPORT_SUBTYPE_0 0
39
40 /* Called from __tdx_hypercall() for unrecoverable failure */
41 noinstr void __noreturn __tdx_hypercall_failed(void)
42 {
43 instrumentation_begin();
44 panic("TDVMCALL failed. TDX module bug?");
45 }
46
47 #ifdef CONFIG_KVM_GUEST
48 long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
49 unsigned long p3, unsigned long p4)
50 {
51 struct tdx_module_args args = {
52 .r10 = nr,
53 .r11 = p1,
54 .r12 = p2,
55 .r13 = p3,
56 .r14 = p4,
57 };
58
59 return __tdx_hypercall(&args);
60 }
61 EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
62 #endif
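
/*
 * Editor's illustration (not part of the file): a minimal sketch of how a
 * guest-side caller reaches tdx_kvm_hypercall(). The kvm_hypercall*()
 * helpers in arch/x86/include/asm/kvm_para.h dispatch here for TDX guests,
 * roughly like:
 *
 *	static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
 *					  unsigned long p2, unsigned long p3,
 *					  unsigned long p4)
 *	{
 *		if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
 *			return tdx_kvm_hypercall(nr, p1, p2, p3, p4);
 *		// ... otherwise fall back to VMCALL/VMMCALL ...
 *	}
 */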
63
64 /*
65 * Used for TDX guests to make calls directly to the TD module. This
66 * should only be used for calls that have no legitimate reason to fail
67 * or where the kernel can not survive the call failing.
68 */
69 static inline void tdcall(u64 fn, struct tdx_module_args *args)
70 {
71 if (__tdcall_ret(fn, args))
72 panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
73 }
74
75 /**
76 * tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT
77 * subtype 0) using TDG.MR.REPORT TDCALL.
78 * @reportdata: Address of the input buffer which contains user-defined
79 * REPORTDATA to be included into TDREPORT.
80 * @tdreport: Address of the output buffer to store TDREPORT.
81 *
82 * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module
83 * v1.0 specification for more information on TDG.MR.REPORT TDCALL.
84 * It is used in the TDX guest driver module to get the TDREPORT0.
85 *
86 * Return 0 on success, -EINVAL for invalid operands, or -EIO on
87 * other TDCALL failures.
88 */
89 int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
90 {
91 struct tdx_module_args args = {
92 .rcx = virt_to_phys(tdreport),
93 .rdx = virt_to_phys(reportdata),
94 .r8 = TDREPORT_SUBTYPE_0,
95 };
96 u64 ret;
97
98 ret = __tdcall(TDG_MR_REPORT, &args);
99 if (ret) {
100 if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
101 return -EINVAL;
102 return -EIO;
103 }
104
105 return 0;
106 }
107 EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);
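
/*
 * Editor's illustration (not part of the file): a minimal caller sketch,
 * assuming the 64-byte REPORTDATA and 1024-byte TDREPORT buffer sizes used
 * by the TDX guest driver (TDX_REPORTDATA_LEN/TDX_REPORT_LEN); "nonce" is a
 * hypothetical caller-supplied buffer:
 *
 *	u8 *reportdata = kzalloc(64, GFP_KERNEL);
 *	u8 *tdreport = kzalloc(1024, GFP_KERNEL);
 *	int ret;
 *
 *	memcpy(reportdata, nonce, 64);
 *	ret = tdx_mcall_get_report0(reportdata, tdreport);
 *	if (ret)
 *		return ret;	// -EINVAL or -EIO per the kernel-doc above
 */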
108
109 /**
110 * tdx_hcall_get_quote() - Wrapper to request TD Quote using GetQuote
111 * hypercall.
112 * @buf: Address of the directly mapped shared kernel buffer which
113 * contains TDREPORT. The same buffer will be used by VMM to
114 * store the generated TD Quote output.
115 * @size: size of the tdquote buffer (4KB-aligned).
116 *
117 * Refer to section titled "TDG.VP.VMCALL<GetQuote>" in the TDX GHCI
118 * v1.0 specification for more information on GetQuote hypercall.
119 * It is used in the TDX guest driver module to get the TD Quote.
120 *
121 * Return 0 on success or error code on failure.
122 */
123 u64 tdx_hcall_get_quote(u8 *buf, size_t size)
124 {
125         /* Since buf is shared memory, set the shared (decrypted) bits */
126 return _tdx_hypercall(TDVMCALL_GET_QUOTE, cc_mkdec(virt_to_phys(buf)), size, 0, 0);
127 }
128 EXPORT_SYMBOL_GPL(tdx_hcall_get_quote);
129
130 static void __noreturn tdx_panic(const char *msg)
131 {
132 struct tdx_module_args args = {
133 .r10 = TDX_HYPERCALL_STANDARD,
134 .r11 = TDVMCALL_REPORT_FATAL_ERROR,
135 .r12 = 0, /* Error code: 0 is Panic */
136 };
137 union {
138 /* Define register order according to the GHCI */
139 struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; };
140
141 char str[64];
142 } message;
143
144 /* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
145 strtomem_pad(message.str, msg, '\0');
146
147 args.r8 = message.r8;
148 args.r9 = message.r9;
149 args.r14 = message.r14;
150 args.r15 = message.r15;
151 args.rdi = message.rdi;
152 args.rsi = message.rsi;
153 args.rbx = message.rbx;
154 args.rdx = message.rdx;
155
156 /*
157 * This hypercall should never return and it is not safe
158 * to keep the guest running. Call it forever if it
159 * happens to return.
160 */
161 while (1)
162 __tdx_hypercall(&args);
163 }
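
/*
 * Editor's note (illustrative, derived from the union layout above): the
 * 64-byte message is handed to the VMM in eight GPRs in the GHCI-defined
 * order, i.e. message bytes map to registers as:
 *
 *	str[0..7]   -> R14      str[32..39] -> RSI
 *	str[8..15]  -> R15      str[40..47] -> R8
 *	str[16..23] -> RBX      str[48..55] -> R9
 *	str[24..31] -> RDI      str[56..63] -> RDX
 */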
164
165 static void tdx_parse_tdinfo(u64 *cc_mask)
166 {
167 struct tdx_module_args args = {};
168 unsigned int gpa_width;
169 u64 td_attr;
170
171 /*
172 * TDINFO TDX module call is used to get the TD execution environment
173 * information like GPA width, number of available vcpus, debug mode
174 * information, etc. More details about the ABI can be found in TDX
175 * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
176 * [TDG.VP.INFO].
177 */
178 tdcall(TDG_VP_INFO, &args);
179
180 /*
181 * The highest bit of a guest physical address is the "sharing" bit.
182 * Set it for shared pages and clear it for private pages.
183 *
184 * The GPA width that comes out of this call is critical. TDX guests
185 * can not meaningfully run without it.
186 */
187 gpa_width = args.rcx & GENMASK(5, 0);
188 *cc_mask = BIT_ULL(gpa_width - 1);
189
190 /*
191 * The kernel can not handle #VE's when accessing normal kernel
192 * memory. Ensure that no #VE will be delivered for accesses to
193 * TD-private memory. Only VMM-shared memory (MMIO) will #VE.
194 */
195 td_attr = args.rdx;
196 if (!(td_attr & ATTR_SEPT_VE_DISABLE)) {
197 const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set.";
198
199 /* Relax SEPT_VE_DISABLE check for debug TD. */
200 if (td_attr & ATTR_DEBUG)
201 pr_warn("%s\n", msg);
202 else
203 tdx_panic(msg);
204 }
205 }
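
/*
 * Editor's note: a worked example of the mask math above (illustrative).
 * With a hypothetical GPA width of 52 bits:
 *
 *	gpa_width = 52;
 *	cc_mask   = BIT_ULL(52 - 1);	// 0x0008000000000000, i.e. bit 51
 *
 * Setting bit 51 in a GPA marks the page shared; clearing it marks the page
 * private, as described in the comment above.
 */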
206
207 /*
208 * The TDX module spec states that #VE may be injected for a limited set of
209 * reasons:
210 *
211 * - Emulation of the architectural #VE injection on EPT violation;
212 *
213 * - As a result of guest TD execution of a disallowed instruction,
214 * a disallowed MSR access, or CPUID virtualization;
215 *
216 * - A notification to the guest TD about anomalous behavior;
217 *
218 * The last one is opt-in and is not used by the kernel.
219 *
220 * The Intel Software Developer's Manual describes cases when instruction
221 * length field can be used in section "Information for VM Exits Due to
222 * Instruction Execution".
223 *
224 * For TDX, it ultimately means GET_VEINFO provides reliable instruction length
225 * information if #VE occurred due to instruction execution, but not for EPT
226 * violations.
227 */
228 static int ve_instr_len(struct ve_info *ve)
229 {
230 switch (ve->exit_reason) {
231 case EXIT_REASON_HLT:
232 case EXIT_REASON_MSR_READ:
233 case EXIT_REASON_MSR_WRITE:
234 case EXIT_REASON_CPUID:
235 case EXIT_REASON_IO_INSTRUCTION:
236                 /* It is safe to use ve->instr_len for #VE due to instructions */
237 return ve->instr_len;
238 case EXIT_REASON_EPT_VIOLATION:
239 /*
240                  * For EPT violations, ve->instr_len is not defined. For those,
241 * the kernel must decode instructions manually and should not
242 * be using this function.
243 */
244 WARN_ONCE(1, "ve->instr_len is not defined for EPT violations");
245 return 0;
246 default:
247 WARN_ONCE(1, "Unexpected #VE-type: %lld\n", ve->exit_reason);
248 return ve->instr_len;
249 }
250 }
251
252 static u64 __cpuidle __halt(const bool irq_disabled)
253 {
254 struct tdx_module_args args = {
255 .r10 = TDX_HYPERCALL_STANDARD,
256 .r11 = hcall_func(EXIT_REASON_HLT),
257 .r12 = irq_disabled,
258 };
259
260 /*
261 * Emulate HLT operation via hypercall. More info about ABI
262 * can be found in TDX Guest-Host-Communication Interface
263 * (GHCI), section 3.8 TDG.VP.VMCALL<Instruction.HLT>.
264 *
265 * The VMM uses the "IRQ disabled" param to understand IRQ
266 * enabled status (RFLAGS.IF) of the TD guest and to determine
267 * whether or not it should schedule the halted vCPU if an
268 * IRQ becomes pending. E.g. if IRQs are disabled, the VMM
269 * can keep the vCPU in virtual HLT, even if an IRQ is
270 * pending, without hanging/breaking the guest.
271 */
272 return __tdx_hypercall(&args);
273 }
274
275 static int handle_halt(struct ve_info *ve)
276 {
277 const bool irq_disabled = irqs_disabled();
278
279 if (__halt(irq_disabled))
280 return -EIO;
281
282 return ve_instr_len(ve);
283 }
284
285 void __cpuidle tdx_safe_halt(void)
286 {
287 const bool irq_disabled = false;
288
289 /*
290 * Use WARN_ONCE() to report the failure.
291 */
292 if (__halt(irq_disabled))
293 WARN_ONCE(1, "HLT instruction emulation failed\n");
294 }
295
296 static int read_msr(struct pt_regs *regs, struct ve_info *ve)
297 {
298 struct tdx_module_args args = {
299 .r10 = TDX_HYPERCALL_STANDARD,
300 .r11 = hcall_func(EXIT_REASON_MSR_READ),
301 .r12 = regs->cx,
302 };
303
304 /*
305 * Emulate the MSR read via hypercall. More info about ABI
306 * can be found in TDX Guest-Host-Communication Interface
307 * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
308 */
309 if (__tdx_hypercall(&args))
310 return -EIO;
311
312 regs->ax = lower_32_bits(args.r11);
313 regs->dx = upper_32_bits(args.r11);
314 return ve_instr_len(ve);
315 }
316
317 static int write_msr(struct pt_regs *regs, struct ve_info *ve)
318 {
319 struct tdx_module_args args = {
320 .r10 = TDX_HYPERCALL_STANDARD,
321 .r11 = hcall_func(EXIT_REASON_MSR_WRITE),
322 .r12 = regs->cx,
323 .r13 = (u64)regs->dx << 32 | regs->ax,
324 };
325
326 /*
327 * Emulate the MSR write via hypercall. More info about ABI
328 * can be found in TDX Guest-Host-Communication Interface
329 * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
330 */
331 if (__tdx_hypercall(&args))
332 return -EIO;
333
334 return ve_instr_len(ve);
335 }
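
/*
 * Editor's note: an illustrative example of the r13 packing above. A WRMSR
 * executed with EDX=0x00000001 and EAX=0x89abcdef is forwarded with
 * r13 = 0x0000000189abcdef, i.e. (u64)regs->dx << 32 | regs->ax.
 */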
336
337 static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
338 {
339 struct tdx_module_args args = {
340 .r10 = TDX_HYPERCALL_STANDARD,
341 .r11 = hcall_func(EXIT_REASON_CPUID),
342 .r12 = regs->ax,
343 .r13 = regs->cx,
344 };
345
346 /*
347 * Only allow VMM to control range reserved for hypervisor
348 * communication.
349 *
350 * Return all-zeros for any CPUID outside the range. It matches CPU
351          * behaviour for an unsupported leaf.
352 */
353 if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) {
354 regs->ax = regs->bx = regs->cx = regs->dx = 0;
355 return ve_instr_len(ve);
356 }
357
358 /*
359 * Emulate the CPUID instruction via a hypercall. More info about
360 * ABI can be found in TDX Guest-Host-Communication Interface
361 * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
362 */
363 if (__tdx_hypercall(&args))
364 return -EIO;
365
366 /*
367 * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
368 * EAX, EBX, ECX, EDX registers after the CPUID instruction execution.
369 * So copy the register contents back to pt_regs.
370 */
371 regs->ax = args.r12;
372 regs->bx = args.r13;
373 regs->cx = args.r14;
374 regs->dx = args.r15;
375
376 return ve_instr_len(ve);
377 }
378
379 static bool mmio_read(int size, unsigned long addr, unsigned long *val)
380 {
381 struct tdx_module_args args = {
382 .r10 = TDX_HYPERCALL_STANDARD,
383 .r11 = hcall_func(EXIT_REASON_EPT_VIOLATION),
384 .r12 = size,
385 .r13 = EPT_READ,
386 .r14 = addr,
387 .r15 = *val,
388 };
389
390 if (__tdx_hypercall(&args))
391 return false;
392
393 *val = args.r11;
394 return true;
395 }
396
397 static bool mmio_write(int size, unsigned long addr, unsigned long val)
398 {
399 return !_tdx_hypercall(hcall_func(EXIT_REASON_EPT_VIOLATION), size,
400 EPT_WRITE, addr, val);
401 }
402
403 static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
404 {
405 unsigned long *reg, val, vaddr;
406 char buffer[MAX_INSN_SIZE];
407 enum insn_mmio_type mmio;
408 struct insn insn = {};
409 int size, extend_size;
410 u8 extend_val = 0;
411
412 /* Only in-kernel MMIO is supported */
413 if (WARN_ON_ONCE(user_mode(regs)))
414 return -EFAULT;
415
416 if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE))
417 return -EFAULT;
418
419 if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64))
420 return -EINVAL;
421
422 mmio = insn_decode_mmio(&insn, &size);
423 if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED))
424 return -EINVAL;
425
426 if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
427 reg = insn_get_modrm_reg_ptr(&insn, regs);
428 if (!reg)
429 return -EINVAL;
430 }
431
432 /*
433 * Reject EPT violation #VEs that split pages.
434 *
435 * MMIO accesses are supposed to be naturally aligned and therefore
436 * never cross page boundaries. Seeing split page accesses indicates
437 * a bug or a load_unaligned_zeropad() that stepped into an MMIO page.
438 *
439 * load_unaligned_zeropad() will recover using exception fixups.
440 */
441 vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
442 if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
443 return -EFAULT;
444
445 /* Handle writes first */
446 switch (mmio) {
447 case INSN_MMIO_WRITE:
448 memcpy(&val, reg, size);
449 if (!mmio_write(size, ve->gpa, val))
450 return -EIO;
451 return insn.length;
452 case INSN_MMIO_WRITE_IMM:
453 val = insn.immediate.value;
454 if (!mmio_write(size, ve->gpa, val))
455 return -EIO;
456 return insn.length;
457 case INSN_MMIO_READ:
458 case INSN_MMIO_READ_ZERO_EXTEND:
459 case INSN_MMIO_READ_SIGN_EXTEND:
460 /* Reads are handled below */
461 break;
462 case INSN_MMIO_MOVS:
463 case INSN_MMIO_DECODE_FAILED:
464 /*
465 * MMIO was accessed with an instruction that could not be
466 * decoded or handled properly. It was likely not using io.h
467 * helpers or accessed MMIO accidentally.
468 */
469 return -EINVAL;
470 default:
471 WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
472 return -EINVAL;
473 }
474
475 /* Handle reads */
476 if (!mmio_read(size, ve->gpa, &val))
477 return -EIO;
478
479 switch (mmio) {
480 case INSN_MMIO_READ:
481 /* Zero-extend for 32-bit operation */
482 extend_size = size == 4 ? sizeof(*reg) : 0;
483 break;
484 case INSN_MMIO_READ_ZERO_EXTEND:
485 /* Zero extend based on operand size */
486 extend_size = insn.opnd_bytes;
487 break;
488 case INSN_MMIO_READ_SIGN_EXTEND:
489 /* Sign extend based on operand size */
490 extend_size = insn.opnd_bytes;
491 if (size == 1 && val & BIT(7))
492 extend_val = 0xFF;
493 else if (size > 1 && val & BIT(15))
494 extend_val = 0xFF;
495 break;
496 default:
497                 /* All other cases have to be covered by the first switch() */
498 WARN_ON_ONCE(1);
499 return -EINVAL;
500 }
501
502 if (extend_size)
503 memset(reg, extend_val, extend_size);
504 memcpy(reg, &val, size);
505 return insn.length;
506 }
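
/*
 * Editor's note: a worked example of the read path above (illustrative).
 * For a 32-bit MMIO load such as "mov (%rbx), %eax" (INSN_MMIO_READ, size 4):
 *
 *	extend_size = sizeof(*reg);	// 8: zero the whole register
 *	memset(reg, 0, 8);
 *	memcpy(reg, &val, 4);		// low 32 bits come from the VMM
 *
 * which matches the architectural behaviour of a 32-bit write zero-extending
 * into the full 64-bit register.
 */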
507
508 static bool handle_in(struct pt_regs *regs, int size, int port)
509 {
510 struct tdx_module_args args = {
511 .r10 = TDX_HYPERCALL_STANDARD,
512 .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
513 .r12 = size,
514 .r13 = PORT_READ,
515 .r14 = port,
516 };
517 u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
518 bool success;
519
520 /*
521 * Emulate the I/O read via hypercall. More info about ABI can be found
522 * in TDX Guest-Host-Communication Interface (GHCI) section titled
523 * "TDG.VP.VMCALL<Instruction.IO>".
524 */
525 success = !__tdx_hypercall(&args);
526
527 /* Update part of the register affected by the emulated instruction */
528 regs->ax &= ~mask;
529 if (success)
530 regs->ax |= args.r11 & mask;
531
532 return success;
533 }
534
535 static bool handle_out(struct pt_regs *regs, int size, int port)
536 {
537 u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
538
539 /*
540 * Emulate the I/O write via hypercall. More info about ABI can be found
541 * in TDX Guest-Host-Communication Interface (GHCI) section titled
542 * "TDG.VP.VMCALL<Instruction.IO>".
543 */
544 return !_tdx_hypercall(hcall_func(EXIT_REASON_IO_INSTRUCTION), size,
545 PORT_WRITE, port, regs->ax & mask);
546 }
547
548 /*
549 * Emulate I/O using hypercall.
550 *
551 * Assumes the IO instruction was using ax, which is enforced
552 * by the standard io.h macros.
553 *
554  * Return the instruction length on success or -errno on failure.
555 */
556 static int handle_io(struct pt_regs *regs, struct ve_info *ve)
557 {
558 u32 exit_qual = ve->exit_qual;
559 int size, port;
560 bool in, ret;
561
562 if (VE_IS_IO_STRING(exit_qual))
563 return -EIO;
564
565 in = VE_IS_IO_IN(exit_qual);
566 size = VE_GET_IO_SIZE(exit_qual);
567 port = VE_GET_PORT_NUM(exit_qual);
568
570 if (in)
571 ret = handle_in(regs, size, port);
572 else
573 ret = handle_out(regs, size, port);
574 if (!ret)
575 return -EIO;
576
577 return ve_instr_len(ve);
578 }
579
580 /*
581 * Early #VE exception handler. Only handles a subset of port I/O.
582  * Intended only for earlyprintk. Returns false on failure.
583 */
584 __init bool tdx_early_handle_ve(struct pt_regs *regs)
585 {
586 struct ve_info ve;
587 int insn_len;
588
589 tdx_get_ve_info(&ve);
590
591 if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION)
592 return false;
593
594 insn_len = handle_io(regs, &ve);
595 if (insn_len < 0)
596 return false;
597
598 regs->ip += insn_len;
599 return true;
600 }
601
602 void tdx_get_ve_info(struct ve_info *ve)
603 {
604 struct tdx_module_args args = {};
605
606 /*
607 * Called during #VE handling to retrieve the #VE info from the
608 * TDX module.
609 *
610 * This has to be called early in #VE handling. A "nested" #VE which
611 * occurs before this will raise a #DF and is not recoverable.
612 *
613 * The call retrieves the #VE info from the TDX module, which also
614 * clears the "#VE valid" flag. This must be done before anything else
615 * because any #VE that occurs while the valid flag is set will lead to
616 * #DF.
617 *
618 * Note, the TDX module treats virtual NMIs as inhibited if the #VE
619 * valid flag is set. It means that NMI=>#VE will not result in a #DF.
620 */
621 tdcall(TDG_VP_VEINFO_GET, &args);
622
623 /* Transfer the output parameters */
624 ve->exit_reason = args.rcx;
625 ve->exit_qual = args.rdx;
626 ve->gla = args.r8;
627 ve->gpa = args.r9;
628 ve->instr_len = lower_32_bits(args.r10);
629 ve->instr_info = upper_32_bits(args.r10);
630 }
631
632 /*
633 * Handle the user initiated #VE.
634 *
635 * On success, returns the number of bytes RIP should be incremented (>=0)
636 * or -errno on error.
637 */
638 static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
639 {
640 switch (ve->exit_reason) {
641 case EXIT_REASON_CPUID:
642 return handle_cpuid(regs, ve);
643 default:
644 pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
645 return -EIO;
646 }
647 }
648
649 static inline bool is_private_gpa(u64 gpa)
650 {
651 return gpa == cc_mkenc(gpa);
652 }
653
654 /*
655 * Handle the kernel #VE.
656 *
657 * On success, returns the number of bytes RIP should be incremented (>=0)
658 * or -errno on error.
659 */
660 static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
661 {
662 switch (ve->exit_reason) {
663 case EXIT_REASON_HLT:
664 return handle_halt(ve);
665 case EXIT_REASON_MSR_READ:
666 return read_msr(regs, ve);
667 case EXIT_REASON_MSR_WRITE:
668 return write_msr(regs, ve);
669 case EXIT_REASON_CPUID:
670 return handle_cpuid(regs, ve);
671 case EXIT_REASON_EPT_VIOLATION:
672 if (is_private_gpa(ve->gpa))
673 panic("Unexpected EPT-violation on private memory.");
674 return handle_mmio(regs, ve);
675 case EXIT_REASON_IO_INSTRUCTION:
676 return handle_io(regs, ve);
677 default:
678 pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
679 return -EIO;
680 }
681 }
682
683 bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
684 {
685 int insn_len;
686
687 if (user_mode(regs))
688 insn_len = virt_exception_user(regs, ve);
689 else
690 insn_len = virt_exception_kernel(regs, ve);
691 if (insn_len < 0)
692 return false;
693
694 /* After successful #VE handling, move the IP */
695 regs->ip += insn_len;
696
697 return true;
698 }
699
700 static bool tdx_tlb_flush_required(bool private)
701 {
702 /*
703 * TDX guest is responsible for flushing TLB on private->shared
704 * transition. VMM is responsible for flushing on shared->private.
705 *
706 * The VMM _can't_ flush private addresses as it can't generate PAs
707 * with the guest's HKID. Shared memory isn't subject to integrity
708 * checking, i.e. the VMM doesn't need to flush for its own protection.
709 *
710 * There's no need to flush when converting from shared to private,
711 * as flushing is the VMM's responsibility in this case, e.g. it must
712 * flush to avoid integrity failures in the face of a buggy or
713 * malicious guest.
714 */
715 return !private;
716 }
717
718 static bool tdx_cache_flush_required(void)
719 {
720 /*
721 * AMD SME/SEV can avoid cache flushing if HW enforces cache coherence.
722 * TDX doesn't have such capability.
723 *
724 * Flush cache unconditionally.
725 */
726 return true;
727 }
728
729 /*
730 * Notify the VMM about page mapping conversion. More info about ABI
731 * can be found in TDX Guest-Host-Communication Interface (GHCI),
732 * section "TDG.VP.VMCALL<MapGPA>".
733 */
734 static bool tdx_map_gpa(phys_addr_t start, phys_addr_t end, bool enc)
735 {
736 /* Retrying the hypercall a second time should succeed; use 3 just in case */
737 const int max_retries_per_page = 3;
738 int retry_count = 0;
739
740 if (!enc) {
741 /* Set the shared (decrypted) bits: */
742 start |= cc_mkdec(0);
743 end |= cc_mkdec(0);
744 }
745
746 while (retry_count < max_retries_per_page) {
747 struct tdx_module_args args = {
748 .r10 = TDX_HYPERCALL_STANDARD,
749 .r11 = TDVMCALL_MAP_GPA,
750 .r12 = start,
751 .r13 = end - start };
752
753 u64 map_fail_paddr;
754 u64 ret = __tdx_hypercall(&args);
755
756 if (ret != TDVMCALL_STATUS_RETRY)
757 return !ret;
758 /*
759 * The guest must retry the operation for the pages in the
760 * region starting at the GPA specified in R11. R11 comes
761 * from the untrusted VMM. Sanity check it.
762 */
763 map_fail_paddr = args.r11;
764 if (map_fail_paddr < start || map_fail_paddr >= end)
765 return false;
766
767 /* "Consume" a retry without forward progress */
768 if (map_fail_paddr == start) {
769 retry_count++;
770 continue;
771 }
772
773 start = map_fail_paddr;
774 retry_count = 0;
775 }
776
777 return false;
778 }
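
/*
 * Editor's note: an illustrative walk-through of the retry loop above, with
 * hypothetical addresses. For start = 0x1000000, end = 0x1400000:
 *
 *   - VMM returns TDVMCALL_STATUS_RETRY with r11 = 0x1200000: the range
 *     [0x1000000, 0x1200000) was converted, so start moves forward to
 *     0x1200000 and retry_count resets to 0.
 *   - VMM returns TDVMCALL_STATUS_RETRY with r11 == start three times in a
 *     row: no forward progress, retry_count reaches max_retries_per_page and
 *     the function gives up, returning false.
 *   - Any r11 outside [start, end) fails the sanity check and also returns
 *     false.
 */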
779
780 /*
781 * Inform the VMM of the guest's intent for this physical page: shared with
782 * the VMM or private to the guest. The VMM is expected to change its mapping
783 * of the page in response.
784 */
785 static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
786 {
787 phys_addr_t start = __pa(vaddr);
788 phys_addr_t end = __pa(vaddr + numpages * PAGE_SIZE);
789
790 if (!tdx_map_gpa(start, end, enc))
791 return false;
792
793 /* shared->private conversion requires memory to be accepted before use */
794 if (enc)
795 return tdx_accept_memory(start, end);
796
797 return true;
798 }
799
800 static bool tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
801 bool enc)
802 {
803 /*
804 * Only handle shared->private conversion here.
805 * See the comment in tdx_early_init().
806 */
807 if (enc)
808 return tdx_enc_status_changed(vaddr, numpages, enc);
809 return true;
810 }
811
812 static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
813 bool enc)
814 {
815 /*
816 * Only handle private->shared conversion here.
817 * See the comment in tdx_early_init().
818 */
819 if (!enc)
820 return tdx_enc_status_changed(vaddr, numpages, enc);
821 return true;
822 }
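
/*
 * Editor's note (illustrative): the intended ordering, assuming the generic
 * set_memory_decrypted()/set_memory_encrypted() flow that invokes these
 * callbacks:
 *
 *   private -> shared:  the kernel mapping is changed to shared first, then
 *                       tdx_enc_status_change_finish() issues MapGPA.
 *   shared -> private:  tdx_enc_status_change_prepare() issues MapGPA and
 *                       accepts the memory, then the mapping becomes private.
 *
 * See the larger comment in tdx_early_init() below for why the work is split
 * between the two halves this way.
 */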
823
824 void __init tdx_early_init(void)
825 {
826 struct tdx_module_args args = {
827 .rdx = TDCS_NOTIFY_ENABLES,
828 .r9 = -1ULL,
829 };
830 u64 cc_mask;
831 u32 eax, sig[3];
832
833 cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]);
834
835 if (memcmp(TDX_IDENT, sig, sizeof(sig)))
836 return;
837
838 setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
839
840         /* TSC is the only reliable clock in a TDX guest */
841 setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
842
843 cc_vendor = CC_VENDOR_INTEL;
844 tdx_parse_tdinfo(&cc_mask);
845 cc_set_mask(cc_mask);
846
847 /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
848 tdcall(TDG_VM_WR, &args);
849
850 /*
851          * All bits above the GPA width are reserved and the kernel treats the
852          * shared bit as a flag, not as part of the physical address.
853 *
854 * Adjust physical mask to only cover valid GPA bits.
855 */
856 physical_mask &= cc_mask - 1;
857
858 /*
859 * The kernel mapping should match the TDX metadata for the page.
860 * load_unaligned_zeropad() can touch memory *adjacent* to that which is
861 * owned by the caller and can catch even _momentary_ mismatches. Bad
862 * things happen on mismatch:
863 *
864 * - Private mapping => Shared Page == Guest shutdown
865 * - Shared mapping => Private Page == Recoverable #VE
866 *
867 * guest.enc_status_change_prepare() converts the page from
868 * shared=>private before the mapping becomes private.
869 *
870 * guest.enc_status_change_finish() converts the page from
871          * private=>shared after the mapping becomes shared.
872 *
873 * In both cases there is a temporary shared mapping to a private page,
874 * which can result in a #VE. But, there is never a private mapping to
875 * a shared page.
876 */
877 x86_platform.guest.enc_status_change_prepare = tdx_enc_status_change_prepare;
878 x86_platform.guest.enc_status_change_finish = tdx_enc_status_change_finish;
879
880 x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
881 x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;
882
883 /*
884          * TDX intercepts the RDMSR used to read the X2APIC ID in the low-level
885          * parallel bringup code. That raises a #VE which cannot be handled
886          * there.
887 *
888 * Intel-TDX has a secure RDMSR hypercall, but that needs to be
889          * implemented separately in the low-level startup ASM code.
890 * Until that is in place, disable parallel bringup for TDX.
891 */
892 x86_cpuinit.parallel_bringup = false;
893
894 pr_info("Guest detected\n");
895 }