arch/x86/coco/tdx/tdx.c
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (C) 2021-2022 Intel Corporation */
3
4 #undef pr_fmt
5 #define pr_fmt(fmt) "tdx: " fmt
6
7 #include <linux/cpufeature.h>
8 #include <linux/export.h>
9 #include <linux/io.h>
10 #include <asm/coco.h>
11 #include <asm/tdx.h>
12 #include <asm/vmx.h>
13 #include <asm/insn.h>
14 #include <asm/insn-eval.h>
15 #include <asm/pgtable.h>
16
17 /* MMIO direction */
18 #define EPT_READ 0
19 #define EPT_WRITE 1
20
21 /* Port I/O direction */
22 #define PORT_READ 0
23 #define PORT_WRITE 1
24
25 /* See Exit Qualification for I/O Instructions in VMX documentation */
26 #define VE_IS_IO_IN(e) ((e) & BIT(3))
27 #define VE_GET_IO_SIZE(e) (((e) & GENMASK(2, 0)) + 1)
28 #define VE_GET_PORT_NUM(e) ((e) >> 16)
29 #define VE_IS_IO_STRING(e) ((e) & BIT(4))
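
/*
 * Editor's note: a worked decode example for the helpers above (illustrative
 * only; 0x03f80008 is a hypothetical exit qualification for a 1-byte IN from
 * port 0x3f8):
 *
 *	VE_GET_IO_SIZE(0x03f80008)  == 1      (bits 2:0 are 0, so 1 byte)
 *	VE_IS_IO_IN(0x03f80008)     != 0      (bit 3 set, IN direction)
 *	VE_IS_IO_STRING(0x03f80008) == 0      (not INS/OUTS)
 *	VE_GET_PORT_NUM(0x03f80008) == 0x3f8  (bits 31:16)
 */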
30
31 #define ATTR_DEBUG BIT(0)
32 #define ATTR_SEPT_VE_DISABLE BIT(28)
33
34 /* TDX Module call error codes */
35 #define TDCALL_RETURN_CODE(a) ((a) >> 32)
36 #define TDCALL_INVALID_OPERAND 0xc0000100
37
38 #define TDREPORT_SUBTYPE_0 0
39
40 /* Called from __tdx_hypercall() for unrecoverable failure */
41 noinstr void __noreturn __tdx_hypercall_failed(void)
42 {
43 instrumentation_begin();
44 panic("TDVMCALL failed. TDX module bug?");
45 }
46
47 #ifdef CONFIG_KVM_GUEST
48 long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
49 unsigned long p3, unsigned long p4)
50 {
51 struct tdx_module_args args = {
52 .r10 = nr,
53 .r11 = p1,
54 .r12 = p2,
55 .r13 = p3,
56 .r14 = p4,
57 };
58
59 return __tdx_hypercall(&args);
60 }
61 EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
62 #endif
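
/*
 * Editor's illustration (not part of the file): a minimal sketch of how a
 * guest-side caller reaches tdx_kvm_hypercall(). The kvm_hypercall*()
 * helpers in arch/x86/include/asm/kvm_para.h dispatch here for TDX guests,
 * roughly like:
 *
 *	static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
 *					  unsigned long p2, unsigned long p3,
 *					  unsigned long p4)
 *	{
 *		if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
 *			return tdx_kvm_hypercall(nr, p1, p2, p3, p4);
 *		// ... otherwise fall back to VMCALL/VMMCALL ...
 *	}
 */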
63
64 /*
65 * Used for TDX guests to make calls directly to the TD module. This
66 * should only be used for calls that have no legitimate reason to fail
67 * or where the kernel can not survive the call failing.
68 */
69 static inline void tdcall(u64 fn, struct tdx_module_args *args)
70 {
71 if (__tdcall_ret(fn, args))
72 panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
73 }
74
75 /**
76 * tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT
77 * subtype 0) using TDG.MR.REPORT TDCALL.
78 * @reportdata: Address of the input buffer which contains user-defined
79 * REPORTDATA to be included into TDREPORT.
80 * @tdreport: Address of the output buffer to store TDREPORT.
81 *
82 * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module
83 * v1.0 specification for more information on TDG.MR.REPORT TDCALL.
84 * It is used in the TDX guest driver module to get the TDREPORT0.
85 *
86 * Return 0 on success, -EINVAL for invalid operands, or -EIO on
87 * other TDCALL failures.
88 */
89 int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
90 {
91 struct tdx_module_args args = {
92 .rcx = virt_to_phys(tdreport),
93 .rdx = virt_to_phys(reportdata),
94 .r8 = TDREPORT_SUBTYPE_0,
95 };
96 u64 ret;
97
98 ret = __tdcall(TDG_MR_REPORT, &args);
99 if (ret) {
100 if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
101 return -EINVAL;
102 return -EIO;
103 }
104
105 return 0;
106 }
107 EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);
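
/*
 * Editor's illustration (not part of the file): a minimal caller sketch,
 * assuming the 64-byte REPORTDATA and 1024-byte TDREPORT buffer sizes used
 * by the TDX guest driver (TDX_REPORTDATA_LEN/TDX_REPORT_LEN); "nonce" is a
 * hypothetical caller-supplied buffer:
 *
 *	u8 *reportdata = kzalloc(64, GFP_KERNEL);
 *	u8 *tdreport = kzalloc(1024, GFP_KERNEL);
 *	int ret;
 *
 *	memcpy(reportdata, nonce, 64);
 *	ret = tdx_mcall_get_report0(reportdata, tdreport);
 *	if (ret)
 *		return ret;	// -EINVAL or -EIO per the kernel-doc above
 */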
108
109 /**
110 * tdx_hcall_get_quote() - Wrapper to request TD Quote using GetQuote
111 * hypercall.
112 * @buf: Address of the directly mapped shared kernel buffer which
113 * contains TDREPORT. The same buffer will be used by VMM to
114 * store the generated TD Quote output.
115 * @size: size of the tdquote buffer (4KB-aligned).
116 *
117 * Refer to section titled "TDG.VP.VMCALL<GetQuote>" in the TDX GHCI
118 * v1.0 specification for more information on GetQuote hypercall.
119 * It is used in the TDX guest driver module to get the TD Quote.
120 *
121 * Return 0 on success or error code on failure.
122 */
123 u64 tdx_hcall_get_quote(u8 *buf, size_t size)
124 {
125         /* Since buf is shared memory, set the shared (decrypted) bits */
126 return _tdx_hypercall(TDVMCALL_GET_QUOTE, cc_mkdec(virt_to_phys(buf)), size, 0, 0);
127 }
128 EXPORT_SYMBOL_GPL(tdx_hcall_get_quote);
129
130 static void __noreturn tdx_panic(const char *msg)
131 {
132 struct tdx_module_args args = {
133 .r10 = TDX_HYPERCALL_STANDARD,
134 .r11 = TDVMCALL_REPORT_FATAL_ERROR,
135 .r12 = 0, /* Error code: 0 is Panic */
136 };
137 union {
138 /* Define register order according to the GHCI */
139 struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; };
140
141 char str[64];
142 } message;
143
144 /* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
145 strtomem_pad(message.str, msg, '\0');
146
147 args.r8 = message.r8;
148 args.r9 = message.r9;
149 args.r14 = message.r14;
150 args.r15 = message.r15;
151 args.rdi = message.rdi;
152 args.rsi = message.rsi;
153 args.rbx = message.rbx;
154 args.rdx = message.rdx;
155
156 /*
157 * This hypercall should never return and it is not safe
158 * to keep the guest running. Call it forever if it
159 * happens to return.
160 */
161 while (1)
162 __tdx_hypercall(&args);
163 }
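
/*
 * Editor's note (illustrative, derived from the union layout above): the
 * 64-byte message is handed to the VMM in eight GPRs in the GHCI-defined
 * order, i.e. message bytes map to registers as:
 *
 *	str[0..7]   -> R14      str[32..39] -> RSI
 *	str[8..15]  -> R15      str[40..47] -> R8
 *	str[16..23] -> RBX      str[48..55] -> R9
 *	str[24..31] -> RDI      str[56..63] -> RDX
 */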
164
165 static void tdx_parse_tdinfo(u64 *cc_mask)
166 {
167 struct tdx_module_args args = {};
168 unsigned int gpa_width;
169 u64 td_attr;
170
171 /*
172 * TDINFO TDX module call is used to get the TD execution environment
173 * information like GPA width, number of available vcpus, debug mode
174 * information, etc. More details about the ABI can be found in TDX
175 * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
176 * [TDG.VP.INFO].
177 */
178 tdcall(TDG_VP_INFO, &args);
179
180 /*
181 * The highest bit of a guest physical address is the "sharing" bit.
182 * Set it for shared pages and clear it for private pages.
183 *
184 * The GPA width that comes out of this call is critical. TDX guests
185 * can not meaningfully run without it.
186 */
187 gpa_width = args.rcx & GENMASK(5, 0);
188 *cc_mask = BIT_ULL(gpa_width - 1);
189
190 /*
191 * The kernel can not handle #VE's when accessing normal kernel
192 * memory. Ensure that no #VE will be delivered for accesses to
193 * TD-private memory. Only VMM-shared memory (MMIO) will #VE.
194 */
195 td_attr = args.rdx;
196 if (!(td_attr & ATTR_SEPT_VE_DISABLE)) {
197 const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set.";
198
199 /* Relax SEPT_VE_DISABLE check for debug TD. */
200 if (td_attr & ATTR_DEBUG)
201 pr_warn("%s\n", msg);
202 else
203 tdx_panic(msg);
204 }
205 }
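
/*
 * Editor's note: a worked example of the mask math above (illustrative).
 * With a hypothetical GPA width of 52 bits:
 *
 *	gpa_width = 52;
 *	cc_mask   = BIT_ULL(52 - 1);	// 0x0008000000000000, i.e. bit 51
 *
 * Setting bit 51 in a GPA marks the page shared; clearing it marks the page
 * private, as described in the comment above.
 */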
206
207 /*
208 * The TDX module spec states that #VE may be injected for a limited set of
209 * reasons:
210 *
211 * - Emulation of the architectural #VE injection on EPT violation;
212 *
213 * - As a result of guest TD execution of a disallowed instruction,
214 * a disallowed MSR access, or CPUID virtualization;
215 *
216 * - A notification to the guest TD about anomalous behavior;
217 *
218 * The last one is opt-in and is not used by the kernel.
219 *
220 * The Intel Software Developer's Manual describes cases when instruction
221 * length field can be used in section "Information for VM Exits Due to
222 * Instruction Execution".
223 *
224 * For TDX, it ultimately means GET_VEINFO provides reliable instruction length
225 * information if #VE occurred due to instruction execution, but not for EPT
226 * violations.
227 */
228 static int ve_instr_len(struct ve_info *ve)
229 {
230 switch (ve->exit_reason) {
231 case EXIT_REASON_HLT:
232 case EXIT_REASON_MSR_READ:
233 case EXIT_REASON_MSR_WRITE:
234 case EXIT_REASON_CPUID:
235 case EXIT_REASON_IO_INSTRUCTION:
236                 /* It is safe to use ve->instr_len for #VE due to instructions */
237 return ve->instr_len;
238 case EXIT_REASON_EPT_VIOLATION:
239 /*
240                  * For EPT violations, ve->instr_len is not defined. For those,
241 * the kernel must decode instructions manually and should not
242 * be using this function.
243 */
244 WARN_ONCE(1, "ve->instr_len is not defined for EPT violations");
245 return 0;
246 default:
247 WARN_ONCE(1, "Unexpected #VE-type: %lld\n", ve->exit_reason);
248 return ve->instr_len;
249 }
250 }
251
252 static u64 __cpuidle __halt(const bool irq_disabled)
253 {
254 struct tdx_module_args args = {
255 .r10 = TDX_HYPERCALL_STANDARD,
256 .r11 = hcall_func(EXIT_REASON_HLT),
257 .r12 = irq_disabled,
258 };
259
260 /*
261 * Emulate HLT operation via hypercall. More info about ABI
262 * can be found in TDX Guest-Host-Communication Interface
263 * (GHCI), section 3.8 TDG.VP.VMCALL<Instruction.HLT>.
264 *
265 * The VMM uses the "IRQ disabled" param to understand IRQ
266 * enabled status (RFLAGS.IF) of the TD guest and to determine
267 * whether or not it should schedule the halted vCPU if an
268 * IRQ becomes pending. E.g. if IRQs are disabled, the VMM
269 * can keep the vCPU in virtual HLT, even if an IRQ is
270 * pending, without hanging/breaking the guest.
271 */
272 return __tdx_hypercall(&args);
273 }
274
275 static int handle_halt(struct ve_info *ve)
276 {
277 const bool irq_disabled = irqs_disabled();
278
279 if (__halt(irq_disabled))
280 return -EIO;
281
282 return ve_instr_len(ve);
283 }
284
285 void __cpuidle tdx_safe_halt(void)
286 {
287 const bool irq_disabled = false;
288
289 /*
290 * Use WARN_ONCE() to report the failure.
291 */
292 if (__halt(irq_disabled))
293 WARN_ONCE(1, "HLT instruction emulation failed\n");
294 }
295
296 static int read_msr(struct pt_regs *regs, struct ve_info *ve)
297 {
298 struct tdx_module_args args = {
299 .r10 = TDX_HYPERCALL_STANDARD,
300 .r11 = hcall_func(EXIT_REASON_MSR_READ),
301 .r12 = regs->cx,
302 };
303
304 /*
305 * Emulate the MSR read via hypercall. More info about ABI
306 * can be found in TDX Guest-Host-Communication Interface
307 * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
308 */
309 if (__tdx_hypercall(&args))
310 return -EIO;
311
312 regs->ax = lower_32_bits(args.r11);
313 regs->dx = upper_32_bits(args.r11);
314 return ve_instr_len(ve);
315 }
316
317 static int write_msr(struct pt_regs *regs, struct ve_info *ve)
318 {
319 struct tdx_module_args args = {
320 .r10 = TDX_HYPERCALL_STANDARD,
321 .r11 = hcall_func(EXIT_REASON_MSR_WRITE),
322 .r12 = regs->cx,
323 .r13 = (u64)regs->dx << 32 | regs->ax,
324 };
325
326 /*
327 * Emulate the MSR write via hypercall. More info about ABI
328 * can be found in TDX Guest-Host-Communication Interface
329 * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
330 */
331 if (__tdx_hypercall(&args))
332 return -EIO;
333
334 return ve_instr_len(ve);
335 }
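
/*
 * Editor's note: an illustrative example of the r13 packing above. A WRMSR
 * executed with EDX=0x00000001 and EAX=0x89abcdef is forwarded with
 * r13 = 0x0000000189abcdef, i.e. (u64)regs->dx << 32 | regs->ax.
 */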
336
337 static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
338 {
339 struct tdx_module_args args = {
340 .r10 = TDX_HYPERCALL_STANDARD,
341 .r11 = hcall_func(EXIT_REASON_CPUID),
342 .r12 = regs->ax,
343 .r13 = regs->cx,
344 };
345
346 /*
347 * Only allow VMM to control range reserved for hypervisor
348 * communication.
349 *
350 * Return all-zeros for any CPUID outside the range. It matches CPU
351          * behaviour for an unsupported leaf.
352 */
353 if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) {
354 regs->ax = regs->bx = regs->cx = regs->dx = 0;
355 return ve_instr_len(ve);
356 }
357
358 /*
359 * Emulate the CPUID instruction via a hypercall. More info about
360 * ABI can be found in TDX Guest-Host-Communication Interface
361 * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
362 */
363 if (__tdx_hypercall(&args))
364 return -EIO;
365
366 /*
367 * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
368 * EAX, EBX, ECX, EDX registers after the CPUID instruction execution.
369 * So copy the register contents back to pt_regs.
370 */
371 regs->ax = args.r12;
372 regs->bx = args.r13;
373 regs->cx = args.r14;
374 regs->dx = args.r15;
375
376 return ve_instr_len(ve);
377 }
378
379 static bool mmio_read(int size, unsigned long addr, unsigned long *val)
380 {
381 struct tdx_module_args args = {
382 .r10 = TDX_HYPERCALL_STANDARD,
383 .r11 = hcall_func(EXIT_REASON_EPT_VIOLATION),
384 .r12 = size,
385 .r13 = EPT_READ,
386 .r14 = addr,
387 .r15 = *val,
388 };
389
390 if (__tdx_hypercall(&args))
391 return false;
392
393 *val = args.r11;
394 return true;
395 }
396
397 static bool mmio_write(int size, unsigned long addr, unsigned long val)
398 {
399 return !_tdx_hypercall(hcall_func(EXIT_REASON_EPT_VIOLATION), size,
400 EPT_WRITE, addr, val);
401 }
402
403 static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
404 {
405 unsigned long *reg, val, vaddr;
406 char buffer[MAX_INSN_SIZE];
407 enum insn_mmio_type mmio;
408 struct insn insn = {};
409 int size, extend_size;
410 u8 extend_val = 0;
411
412 /* Only in-kernel MMIO is supported */
413 if (WARN_ON_ONCE(user_mode(regs)))
414 return -EFAULT;
415
416 if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE))
417 return -EFAULT;
418
419 if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64))
420 return -EINVAL;
421
422 mmio = insn_decode_mmio(&insn, &size);
423 if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED))
424 return -EINVAL;
425
426 if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
427 reg = insn_get_modrm_reg_ptr(&insn, regs);
428 if (!reg)
429 return -EINVAL;
430 }
431
432 /*
433 * Reject EPT violation #VEs that split pages.
434 *
435 * MMIO accesses are supposed to be naturally aligned and therefore
436 * never cross page boundaries. Seeing split page accesses indicates
437 * a bug or a load_unaligned_zeropad() that stepped into an MMIO page.
438 *
439 * load_unaligned_zeropad() will recover using exception fixups.
440 */
441 vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
442 if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
443 return -EFAULT;
444
445 /* Handle writes first */
446 switch (mmio) {
447 case INSN_MMIO_WRITE:
448 memcpy(&val, reg, size);
449 if (!mmio_write(size, ve->gpa, val))
450 return -EIO;
451 return insn.length;
452 case INSN_MMIO_WRITE_IMM:
453 val = insn.immediate.value;
454 if (!mmio_write(size, ve->gpa, val))
455 return -EIO;
456 return insn.length;
457 case INSN_MMIO_READ:
458 case INSN_MMIO_READ_ZERO_EXTEND:
459 case INSN_MMIO_READ_SIGN_EXTEND:
460 /* Reads are handled below */
461 break;
462 case INSN_MMIO_MOVS:
463 case INSN_MMIO_DECODE_FAILED:
464 /*
465 * MMIO was accessed with an instruction that could not be
466 * decoded or handled properly. It was likely not using io.h
467 * helpers or accessed MMIO accidentally.
468 */
469 return -EINVAL;
470 default:
471 WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
472 return -EINVAL;
473 }
474
475 /* Handle reads */
476 if (!mmio_read(size, ve->gpa, &val))
477 return -EIO;
478
479 switch (mmio) {
480 case INSN_MMIO_READ:
481 /* Zero-extend for 32-bit operation */
482 extend_size = size == 4 ? sizeof(*reg) : 0;
483 break;
484 case INSN_MMIO_READ_ZERO_EXTEND:
485 /* Zero extend based on operand size */
486 extend_size = insn.opnd_bytes;
487 break;
488 case INSN_MMIO_READ_SIGN_EXTEND:
489 /* Sign extend based on operand size */
490 extend_size = insn.opnd_bytes;
491 if (size == 1 && val & BIT(7))
492 extend_val = 0xFF;
493 else if (size > 1 && val & BIT(15))
494 extend_val = 0xFF;
495 break;
496 default:
497                 /* All other cases have to be covered by the first switch() */
498 WARN_ON_ONCE(1);
499 return -EINVAL;
500 }
501
502 if (extend_size)
503 memset(reg, extend_val, extend_size);
504 memcpy(reg, &val, size);
505 return insn.length;
506 }
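
/*
 * Editor's note: a worked example of the read path above (illustrative).
 * For a 32-bit MMIO load such as "mov (%rbx), %eax" (INSN_MMIO_READ, size 4):
 *
 *	extend_size = sizeof(*reg);	// 8: zero the whole register
 *	memset(reg, 0, 8);
 *	memcpy(reg, &val, 4);		// low 32 bits come from the VMM
 *
 * which matches the architectural behaviour of a 32-bit write zero-extending
 * into the full 64-bit register.
 */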
507
508 static bool handle_in(struct pt_regs *regs, int size, int port)
509 {
510 struct tdx_module_args args = {
511 .r10 = TDX_HYPERCALL_STANDARD,
512 .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
513 .r12 = size,
514 .r13 = PORT_READ,
515 .r14 = port,
516 };
517 u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
518 bool success;
519
520 /*
521 * Emulate the I/O read via hypercall. More info about ABI can be found
522 * in TDX Guest-Host-Communication Interface (GHCI) section titled
523 * "TDG.VP.VMCALL<Instruction.IO>".
524 */
525 success = !__tdx_hypercall(&args);
526
527 /* Update part of the register affected by the emulated instruction */
528 regs->ax &= ~mask;
529 if (success)
530 regs->ax |= args.r11 & mask;
531
532 return success;
533 }
534
535 static bool handle_out(struct pt_regs *regs, int size, int port)
536 {
537 u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
538
539 /*
540 * Emulate the I/O write via hypercall. More info about ABI can be found
541 * in TDX Guest-Host-Communication Interface (GHCI) section titled
542 * "TDG.VP.VMCALL<Instruction.IO>".
543 */
544 return !_tdx_hypercall(hcall_func(EXIT_REASON_IO_INSTRUCTION), size,
545 PORT_WRITE, port, regs->ax & mask);
546 }
547
548 /*
549 * Emulate I/O using hypercall.
550 *
551 * Assumes the IO instruction was using ax, which is enforced
552 * by the standard io.h macros.
553 *
554  * Return the instruction length on success or -errno on failure.
555 */
556 static int handle_io(struct pt_regs *regs, struct ve_info *ve)
557 {
558 u32 exit_qual = ve->exit_qual;
559 int size, port;
560 bool in, ret;
561
562 if (VE_IS_IO_STRING(exit_qual))
563 return -EIO;
564
565 in = VE_IS_IO_IN(exit_qual);
566 size = VE_GET_IO_SIZE(exit_qual);
567 port = VE_GET_PORT_NUM(exit_qual);
568
570 if (in)
571 ret = handle_in(regs, size, port);
572 else
573 ret = handle_out(regs, size, port);
574 if (!ret)
575 return -EIO;
576
577 return ve_instr_len(ve);
578 }
579
580 /*
581 * Early #VE exception handler. Only handles a subset of port I/O.
582  * Intended only for earlyprintk. Returns false on failure.
583 */
584 __init bool tdx_early_handle_ve(struct pt_regs *regs)
585 {
586 struct ve_info ve;
587 int insn_len;
588
589 tdx_get_ve_info(&ve);
590
591 if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION)
592 return false;
593
594 insn_len = handle_io(regs, &ve);
595 if (insn_len < 0)
596 return false;
597
598 regs->ip += insn_len;
599 return true;
600 }
601
602 void tdx_get_ve_info(struct ve_info *ve)
603 {
604 struct tdx_module_args args = {};
605
606 /*
607 * Called during #VE handling to retrieve the #VE info from the
608 * TDX module.
609 *
610 * This has to be called early in #VE handling. A "nested" #VE which
611 * occurs before this will raise a #DF and is not recoverable.
612 *
613 * The call retrieves the #VE info from the TDX module, which also
614 * clears the "#VE valid" flag. This must be done before anything else
615 * because any #VE that occurs while the valid flag is set will lead to
616 * #DF.
617 *
618 * Note, the TDX module treats virtual NMIs as inhibited if the #VE
619 * valid flag is set. It means that NMI=>#VE will not result in a #DF.
620 */
621 tdcall(TDG_VP_VEINFO_GET, &args);
622
623 /* Transfer the output parameters */
624 ve->exit_reason = args.rcx;
625 ve->exit_qual = args.rdx;
626 ve->gla = args.r8;
627 ve->gpa = args.r9;
628 ve->instr_len = lower_32_bits(args.r10);
629 ve->instr_info = upper_32_bits(args.r10);
630 }
631
632 /*
633 * Handle the user initiated #VE.
634 *
635 * On success, returns the number of bytes RIP should be incremented (>=0)
636 * or -errno on error.
637 */
638 static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
639 {
640 switch (ve->exit_reason) {
641 case EXIT_REASON_CPUID:
642 return handle_cpuid(regs, ve);
643 default:
644 pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
645 return -EIO;
646 }
647 }
648
649 static inline bool is_private_gpa(u64 gpa)
650 {
651 return gpa == cc_mkenc(gpa);
652 }
653
654 /*
655 * Handle the kernel #VE.
656 *
657 * On success, returns the number of bytes RIP should be incremented (>=0)
658 * or -errno on error.
659 */
660 static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
661 {
662 switch (ve->exit_reason) {
663 case EXIT_REASON_HLT:
664 return handle_halt(ve);
665 case EXIT_REASON_MSR_READ:
666 return read_msr(regs, ve);
667 case EXIT_REASON_MSR_WRITE:
668 return write_msr(regs, ve);
669 case EXIT_REASON_CPUID:
670 return handle_cpuid(regs, ve);
671 case EXIT_REASON_EPT_VIOLATION:
672 if (is_private_gpa(ve->gpa))
673 panic("Unexpected EPT-violation on private memory.");
674 return handle_mmio(regs, ve);
675 case EXIT_REASON_IO_INSTRUCTION:
676 return handle_io(regs, ve);
677 default:
678 pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
679 return -EIO;
680 }
681 }
682
683 bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
684 {
685 int insn_len;
686
687 if (user_mode(regs))
688 insn_len = virt_exception_user(regs, ve);
689 else
690 insn_len = virt_exception_kernel(regs, ve);
691 if (insn_len < 0)
692 return false;
693
694 /* After successful #VE handling, move the IP */
695 regs->ip += insn_len;
696
697 return true;
698 }
699
700 static bool tdx_tlb_flush_required(bool private)
701 {
702 /*
703 * TDX guest is responsible for flushing TLB on private->shared
704 * transition. VMM is responsible for flushing on shared->private.
705 *
706 * The VMM _can't_ flush private addresses as it can't generate PAs
707 * with the guest's HKID. Shared memory isn't subject to integrity
708 * checking, i.e. the VMM doesn't need to flush for its own protection.
709 *
710 * There's no need to flush when converting from shared to private,
711 * as flushing is the VMM's responsibility in this case, e.g. it must
712 * flush to avoid integrity failures in the face of a buggy or
713 * malicious guest.
714 */
715 return !private;
716 }
717
718 static bool tdx_cache_flush_required(void)
719 {
720 /*
721 * AMD SME/SEV can avoid cache flushing if HW enforces cache coherence.
722 * TDX doesn't have such capability.
723 *
724 * Flush cache unconditionally.
725 */
726 return true;
727 }
728
729 /*
730 * Notify the VMM about page mapping conversion. More info about ABI
731 * can be found in TDX Guest-Host-Communication Interface (GHCI),
732 * section "TDG.VP.VMCALL<MapGPA>".
733 */
734 static bool tdx_map_gpa(phys_addr_t start, phys_addr_t end, bool enc)
735 {
736 /* Retrying the hypercall a second time should succeed; use 3 just in case */
737 const int max_retries_per_page = 3;
738 int retry_count = 0;
739
740 if (!enc) {
741 /* Set the shared (decrypted) bits: */
742 start |= cc_mkdec(0);
743 end |= cc_mkdec(0);
744 }
745
746 while (retry_count < max_retries_per_page) {
747 struct tdx_module_args args = {
748 .r10 = TDX_HYPERCALL_STANDARD,
749 .r11 = TDVMCALL_MAP_GPA,
750 .r12 = start,
751 .r13 = end - start };
752
753 u64 map_fail_paddr;
754 u64 ret = __tdx_hypercall(&args);
755
756 if (ret != TDVMCALL_STATUS_RETRY)
757 return !ret;
758 /*
759 * The guest must retry the operation for the pages in the
760 * region starting at the GPA specified in R11. R11 comes
761 * from the untrusted VMM. Sanity check it.
762 */
763 map_fail_paddr = args.r11;
764 if (map_fail_paddr < start || map_fail_paddr >= end)
765 return false;
766
767 /* "Consume" a retry without forward progress */
768 if (map_fail_paddr == start) {
769 retry_count++;
770 continue;
771 }
772
773 start = map_fail_paddr;
774 retry_count = 0;
775 }
776
777 return false;
778 }
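
/*
 * Editor's note: an illustrative walk-through of the retry loop above, with
 * hypothetical addresses. For start = 0x1000000, end = 0x1400000:
 *
 *   - VMM returns TDVMCALL_STATUS_RETRY with r11 = 0x1200000: the range
 *     [0x1000000, 0x1200000) was converted, so start moves forward to
 *     0x1200000 and retry_count resets to 0.
 *   - VMM returns TDVMCALL_STATUS_RETRY with r11 == start three times in a
 *     row: no forward progress, retry_count reaches max_retries_per_page and
 *     the function gives up, returning false.
 *   - Any r11 outside [start, end) fails the sanity check and also returns
 *     false.
 */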
779
780 /*
781 * Inform the VMM of the guest's intent for this physical page: shared with
782 * the VMM or private to the guest. The VMM is expected to change its mapping
783 * of the page in response.
784 */
785 static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
786 {
787 phys_addr_t start = __pa(vaddr);
788 phys_addr_t end = __pa(vaddr + numpages * PAGE_SIZE);
789
790 if (!tdx_map_gpa(start, end, enc))
791 return false;
792
793 /* shared->private conversion requires memory to be accepted before use */
794 if (enc)
795 return tdx_accept_memory(start, end);
796
797 return true;
798 }
799
800 static bool tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
801 bool enc)
802 {
803 /*
804 * Only handle shared->private conversion here.
805 * See the comment in tdx_early_init().
806 */
807 if (enc)
808 return tdx_enc_status_changed(vaddr, numpages, enc);
809 return true;
810 }
811
812 static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
813 bool enc)
814 {
815 /*
816 * Only handle private->shared conversion here.
817 * See the comment in tdx_early_init().
818 */
819 if (!enc)
820 return tdx_enc_status_changed(vaddr, numpages, enc);
821 return true;
822 }
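
/*
 * Editor's note (illustrative): the intended ordering, assuming the generic
 * set_memory_decrypted()/set_memory_encrypted() flow that invokes these
 * callbacks:
 *
 *   private -> shared:  the kernel mapping is changed to shared first, then
 *                       tdx_enc_status_change_finish() issues MapGPA.
 *   shared -> private:  tdx_enc_status_change_prepare() issues MapGPA and
 *                       accepts the memory, then the mapping becomes private.
 *
 * See the larger comment in tdx_early_init() below for why the work is split
 * between the two halves this way.
 */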
823
824 void __init tdx_early_init(void)
825 {
826 struct tdx_module_args args = {
827 .rdx = TDCS_NOTIFY_ENABLES,
828 .r9 = -1ULL,
829 };
830 u64 cc_mask;
831 u32 eax, sig[3];
832
833 cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]);
834
835 if (memcmp(TDX_IDENT, sig, sizeof(sig)))
836 return;
837
838 setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
839
840         /* TSC is the only reliable clock in a TDX guest */
841 setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
842
843 cc_vendor = CC_VENDOR_INTEL;
844 tdx_parse_tdinfo(&cc_mask);
845 cc_set_mask(cc_mask);
846
847 /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
848 tdcall(TDG_VM_WR, &args);
849
850 /*
851          * All bits above the GPA width are reserved and the kernel treats the
852          * shared bit as a flag, not as part of the physical address.
853 *
854 * Adjust physical mask to only cover valid GPA bits.
855 */
856 physical_mask &= cc_mask - 1;
857
858 /*
859 * The kernel mapping should match the TDX metadata for the page.
860 * load_unaligned_zeropad() can touch memory *adjacent* to that which is
861 * owned by the caller and can catch even _momentary_ mismatches. Bad
862 * things happen on mismatch:
863 *
864 * - Private mapping => Shared Page == Guest shutdown
865 * - Shared mapping => Private Page == Recoverable #VE
866 *
867 * guest.enc_status_change_prepare() converts the page from
868 * shared=>private before the mapping becomes private.
869 *
870 * guest.enc_status_change_finish() converts the page from
871          * private=>shared after the mapping becomes shared.
872 *
873 * In both cases there is a temporary shared mapping to a private page,
874 * which can result in a #VE. But, there is never a private mapping to
875 * a shared page.
876 */
877 x86_platform.guest.enc_status_change_prepare = tdx_enc_status_change_prepare;
878 x86_platform.guest.enc_status_change_finish = tdx_enc_status_change_finish;
879
880 x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
881 x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;
882
883 /*
884          * TDX intercepts the RDMSR used to read the X2APIC ID in the low-level
885          * parallel bringup code. That raises a #VE which cannot be handled
886          * there.
887 *
888 * Intel-TDX has a secure RDMSR hypercall, but that needs to be
889          * implemented separately in the low-level startup ASM code.
890 * Until that is in place, disable parallel bringup for TDX.
891 */
892 x86_cpuinit.parallel_bringup = false;
893
894 pr_info("Guest detected\n");
895 }